Example #1
def create_resnet():
    # Build network
    import keras_resnet_single as networks
    resnet = networks.ResNet.build(
        len(channels), resblocks, [16, 32],
        (125 * granularity, 125 * granularity, len(channels)), granularity)
    # Load saved weights, if indicated
    if args.load_epoch != 0:
        directory = args.save_dir
        if args.save_dir == '':
            directory = expt_name
        model_name = glob.glob('../MODELS/%s/epoch%02d-*.hdf5' %
                               (directory, args.load_epoch))[0]
        #assert len(model_name) == 2
        #model_name = model_name[0].split('.hdf5')[0]+'.hdf5'
        print('Loading weights from file:', model_name)
        resnet.load_weights(model_name)
    #opt = keras.optimizers.Adam(lr=lr_init, epsilon=1.e-5) # changed eps to match pytorch value
    #opt = keras.optimizers.SGD(lr=lr_init * hvd.size())
    opt = NovoGrad(learning_rate=lr_init * hvd.size())
    #Wrap the optimizer in a Horovod distributed optimizer -> uses hvd.DistributedOptimizer() to compute gradients.
    opt = hvd.DistributedOptimizer(opt)

    # For Horovod: We specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    #resnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'], experimental_run_tf_function = False)
    #resnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    resnet.summary()
    return resnet
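
These Keras snippets call hvd.size() and hvd.DistributedOptimizer() and later pass a callbacks_list to fit(), so they assume the usual Horovod boilerplate has already run earlier in the script. A minimal sketch of that setup, assuming TensorFlow 2.x and horovod.tensorflow.keras; the checkpoint path and the exact callback list are illustrative, not taken from the original source:

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()  # one process per GPU, launched e.g. via horovodrun/mpirun

# Pin each worker process to a single local GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

callbacks_list = [
    # Broadcast the initial weights from rank 0 so every worker starts identically.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    # Average metrics across workers at the end of each epoch.
    hvd.callbacks.MetricAverageCallback(),
]
# Checkpoint from rank 0 only, so workers do not overwrite each other's files.
if hvd.rank() == 0:
    callbacks_list.append(
        tf.keras.callbacks.ModelCheckpoint('checkpoints/epoch{epoch:02d}.hdf5'))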
Example #2
def create_model(resume_from_epoch):

    if resume_from_epoch > 0:
        # Restore from a previous checkpoint, if initial_epoch is specified.
        model = keras.models.load_model(args.checkpoint_format.format(epoch=resume_from_epoch))
    else:
        # Set up standard WideResNet-16-10 model.
        model = WideResidualNetwork(depth=16, width=10, input_shape=input_shape,
                                    classes=num_classes, dropout_rate=0.01)

        # The WideResNet model included with Keras is optimized for inference.
        # Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5

        model = keras.models.Model.from_config(model_config)

        # TODO: Step 8: Scale the learning rate by the number of workers.
        # opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(), momentum=args.momentum)
        
        # TODO: Step 10: use the NovoGrad optimizer instead of SGD
        opt = NovoGrad(learning_rate=args.base_lr * hvd.size())
        
        # TODO: Step 3: Wrap the optimizer in a Horovod distributed optimizer
        opt = hvd.DistributedOptimizer(opt)

        # For Horovod: We specify `experimental_run_tf_function=False` to ensure TensorFlow
        # uses hvd.DistributedOptimizer() to compute gradients.        
        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=opt,
                      metrics=['accuracy'],
                      experimental_run_tf_function=False)
        
    return model
Example #3
def create_model():

    # Set up standard WideResNet-16-10 model.
    model = WideResidualNetwork(depth=16,
                                width=10,
                                weights=None,
                                input_shape=input_shape,
                                classes=num_classes,
                                dropout_rate=0.01)

    # The WideResNet model included with Keras is optimized for inference.
    # Add L2 weight decay & adjust BN settings.
    model_config = model.get_config()
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            regularizer = keras.regularizers.l2(args.wd)
            layer_config['config']['kernel_regularizer'] = \
                {'class_name': regularizer.__class__.__name__,
                 'config': regularizer.get_config()}
        if type(layer) == keras.layers.BatchNormalization:
            layer_config['config']['momentum'] = 0.9
            layer_config['config']['epsilon'] = 1e-5

    model = keras.models.Model.from_config(model_config)

    if args.novo_grad:
        opt = NovoGrad(lr=args.base_lr)
    else:
        opt = keras.optimizers.SGD(lr=args.base_lr, momentum=args.momentum)

    # Wrap the optimizer in a Horovod distributed optimizer
    opt = hvd.DistributedOptimizer(opt)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy'])

    return model
Example #4
elif args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'lars':  #no tensorboardX
    optimizer = LARS(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=args.lr, weight_decay=0.0001)
else:
    optimizer = optim.SGD(model.parameters(), lr=0.01)

optname = args.optimizer if len(sys.argv) >= 2 else 'sgd'

# log = open(optname+'log.txt','w+')

log = None

criterion = nn.CrossEntropyLoss()

model, optimizer, _ = training_loop(model, criterion, optimizer, train_loader,
                                    valid_loader, N_EPOCHS, DEVICE, log)

with open('lbloss/' + optname + str(args.lr) + '_loss.txt', 'w+') as myfile:
Example #5
    optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':#no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr,momentum=args.momentum,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer  = Lamb(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'dyna':
    from dyna import Dyna
    optimizer = Dyna(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer,lrs)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)

batch_accumulate = args.batch_size // 256
batch_per_step = len(trainloader) // batch_accumulate + int(len(trainloader) % batch_accumulate > 0)

lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, args.max_lr,
                                                   steps_per_epoch=batch_per_step,
                                                   epochs=args.num_epoch,
                                                   div_factor=args.div_factor,
                                                   final_div_factor=args.final_div,
                                                   pct_start=args.pct_start)
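
The loop that consumes batch_accumulate is not shown in this snippet. A hypothetical sketch (not from the original source) of how the accumulation factor and the OneCycleLR schedule above would typically interact, assuming net, criterion, and trainloader from the surrounding script:

for epoch in range(args.num_epoch):
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(trainloader):
        # Average the loss so the accumulated gradient matches one large batch.
        loss = criterion(net(inputs), targets) / batch_accumulate
        loss.backward()
        if (i + 1) % batch_accumulate == 0 or (i + 1) == len(trainloader):
            optimizer.step()       # one step per accumulated "large" batch
            optimizer.zero_grad()
            lr_scheduler.step()    # matches steps_per_epoch=batch_per_step above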
Example #6
def main(lr=0.1):
    global best_acc
    args.lr = lr
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)

    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    # Model
    print('==> Building model..')
    # net = VGG('VGG19')
    # net = ResNet18()
    # net = PreActResNet18()
    # net = GoogLeNet()
    # net = DenseNet121()
    # net = ResNeXt29_2x64d()
    # net = MobileNet()
    # net = MobileNetV2()
    # net = DPN92()
    # net = ShuffleNetG2()
    # net = SENet18()
    # net = ShuffleNetV2(1)
    # net = EfficientNetB0()
    # net = RegNetX_200MF()
    net = ResNet50()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    ckpt = './checkpoint/' + args.optimizer + str(lr) + '_ckpt.pth'

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']

    criterion = nn.CrossEntropyLoss()
    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'sgdwm':
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adam':
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(),
                                  lr=args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(net.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'radam':
        from radam import RAdam
        optimizer = RAdam(net.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lars':  #no tensorboardX
        from lars import LARS
        optimizer = LARS(net.parameters(),
                         lr=args.lr,
                         momentum=args.momentum,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lamb':
        from lamb import Lamb
        optimizer = Lamb(net.parameters(),
                         lr=args.lr,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'novograd':
        from novograd import NovoGrad
        optimizer = NovoGrad(net.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    else:
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    # lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
    # lr_scheduler = LambdaLR(optimizer,lrs)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)
    train_acc = []
    valid_acc = []

    # Training
    def train(epoch):
        print('\nEpoch: %d' % epoch)
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print(batch_idx)
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print(100. * correct / total)
        train_acc.append(correct / total)

    def test(epoch):
        global best_acc
        net.eval()
        test_loss = 0
        correct = 0
        total = 0
        print('test')
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = criterion(outputs, targets)

                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

        # Save checkpoint.
        acc = 100. * correct / total
        print(acc)
        valid_acc.append(correct / total)

        if acc > best_acc:
            print('Saving..')
            state = {
                'net': net.state_dict(),
                'acc': acc,
                'epoch': epoch,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, ckpt)
            best_acc = acc

    for epoch in range(200):
        if epoch in args.lr_decay:
            checkpoint = torch.load(ckpt)
            net.load_state_dict(checkpoint['net'])
            best_acc = checkpoint['acc']
            args.lr *= 0.1
            if args.optimizer.lower() == 'sgd':
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'sgdwm':
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adam':
                optimizer = optim.Adam(net.parameters(),
                                       lr=args.lr,
                                       weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'rmsprop':
                optimizer = optim.RMSprop(net.parameters(),
                                          lr=args.lr,
                                          momentum=args.momentum,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adagrad':
                optimizer = optim.Adagrad(net.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'radam':
                from radam import RAdam

                optimizer = RAdam(net.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'lars':  # no tensorboardX
                optimizer = LARS(net.parameters(),
                                 lr=args.lr,
                                 momentum=args.momentum,
                                 weight_decay=args.weight_decay,
                                 dampening=args.damping)
            elif args.optimizer.lower() == 'lamb':
                optimizer = Lamb(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'novograd':
                optimizer = NovoGrad(net.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
            else:
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay)
        train(epoch)
        test(epoch)
    with open(args.optimizer + str(lr) + 'log.json', 'w+') as log_file:
        json.dump([train_acc, valid_acc], log_file)
    return best_acc
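
Note that re-creating the optimizer at each decay epoch, as in the loop above, also discards its internal state (momentum buffers, NovoGrad moments). If resetting that state is not intended, the learning rate can instead be scaled in place; a minimal sketch, not from the original source:

if epoch in args.lr_decay:
    # Decay in place instead of rebuilding the optimizer,
    # preserving momentum / moment buffers across the decay boundary.
    for group in optimizer.param_groups:
        group['lr'] *= 0.1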
Example #7
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  #no tensorboardX
    from lars import LARS
    optimizer = LARS(model.parameters(),
                     lr=args.base_lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(model.parameters(),
                     lr=args.base_lr,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(model.parameters(),
                         lr=args.base_lr,
                         weight_decay=args.weight_decay)
    lr_scheduler = [
        optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader),
                                             1e-4)
    ]
else:
    optimizer = optim.SGD(model.parameters(),
                          lr=args.base_lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

if use_kfac:
    preconditioner = kfac.KFAC(
        model,
        lr=args.base_lr,
Example #8
    #test_data = test_dataset(dataset.skip(train_sz+valid_sz).take(test_sz), start=train_sz+valid_sz, end=train_sz+valid_sz+test_sz)
    '''
    print("\n Timestamp: " + str(tf.cast(tf.timestamp(), tf.float64)))

    with tf.device('/gpu:0'):
        daliop = dali_tf.DALIIterator()
        shapes = [(BATCH_SZ, 125, 125, 8), (BATCH_SZ, 2)]
        dtypes = [tf.float32, tf.int32]

        # Create TF dataset
        out_dataset = dali_tf.DALIDataset(pipeline=pipe,
                                          batch_size=BATCH_SZ,
                                          shapes=shapes,
                                          dtypes=dtypes,
                                          device_id=0)
        opt = NovoGrad(learning_rate=lr_init * hvd.size())
        #Wrap the optimizer in a Horovod distributed optimizer -> uses hvd.DistributedOptimizer() to compute gradients.
        opt = hvd.DistributedOptimizer(opt)

        resnet.compile(optimizer=opt,
                       loss='binary_crossentropy',
                       metrics=['accuracy'],
                       experimental_run_tf_function=False)

        # Train using DALI dataset
        history = resnet.fit(
            out_dataset,
            steps_per_epoch=train_sz // (BATCH_SZ * hvd.size()),
            epochs=epochs,
            callbacks=callbacks_list,
            verbose=verbose,
Example #9
        x = self.pool(F.relu(self.conv2(x)))
        x = self.dropout1(x)
        x = x.view(-1, 12 * 12 * 64)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x


model = Net()
criterion = nn.NLLLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.001)
optimizer = NovoGrad(model.parameters(), lr=0.01, weight_decay=0.001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                 3 * len(trainloader), 1e-4)

epochs = 3

for epoch in range(epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader, 0):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
Example #10
        x = self.dropout1(x)
        x = x.view(batch_size, -1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x


model = Net()
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.001)
optimizer = NovoGrad(model.parameters(),
                     lr=0.01,
                     grad_averaging=True,
                     weight_decay=0.001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                 3 * len(trainloader), 1e-4)
epochs = 3

for epoch in range(epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader, 0):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
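
For context on what the NovoGrad classes imported across these examples do: unlike Adam, NovoGrad keeps a single scalar second moment per layer (the squared norm of that layer's gradient) and applies weight decay in decoupled form. A simplified sketch of the update rule based on the published NovoGrad algorithm; the actual implementations used above may differ in details such as bias correction and the exact grad_averaging behaviour:

import torch

def novograd_step(params, state, lr=0.01, beta1=0.95, beta2=0.98,
                  weight_decay=0.001, eps=1e-8):
    """One simplified NovoGrad-style update over a list of parameters.

    `state` is a dict that persists between calls (one entry per layer).
    """
    for i, p in enumerate(params):
        if p.grad is None:
            continue
        g = p.grad
        g_norm_sq = (g * g).sum()  # layer-wise squared gradient norm (a scalar)
        st = state.setdefault(i, {})
        if not st:
            st['v'] = g_norm_sq
            st['m'] = g / (g_norm_sq.sqrt() + eps) + weight_decay * p.detach()
        else:
            st['v'] = beta2 * st['v'] + (1 - beta2) * g_norm_sq
            update = g / (st['v'].sqrt() + eps) + weight_decay * p.detach()
            # With grad_averaging (as in Example #10), common implementations scale
            # `update` by (1 - beta1) here, making the moving average Adam-like.
            st['m'] = beta1 * st['m'] + update
        p.data.add_(st['m'], alpha=-lr)

With state = {} held across iterations, calling novograd_step(list(model.parameters()), state) after loss.backward() plays the role of optimizer.step() in the loops above.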