Example no. 1
import os

from mxnet import gluon
from mxnet.gluon import data
from mxnet.gluon.data.vision import transforms
from mxnet.gluon.data.vision.datasets import ImageFolderDataset


def get_folder_data(train_path, val_path, data_shape, batch_size, num_workers=os.cpu_count()):
    train_dataset = ImageFolderDataset(train_path)
    val_dataset = ImageFolderDataset(val_path)

    # Training augmentations; RandomResizedCrop already emits output of size
    # data_shape, so a separate Resize step is unnecessary.
    train_transformer = transforms.Compose([
        transforms.RandomFlipLeftRight(),
        transforms.RandomResizedCrop(data_shape, scale=(0.5, 1.0)),
        transforms.RandomBrightness(0.5),
        transforms.RandomHue(0.1),
        transforms.ToTensor()
    ])
    val_transformer = transforms.Compose([
        transforms.Resize(data_shape),
        transforms.ToTensor()
    ])

    train_dataloader = data.DataLoader(train_dataset.transform_first(train_transformer),
                                       batch_size=batch_size, shuffle=True,
                                       last_batch='rollover', num_workers=num_workers)
    # Validation data should not be shuffled, and the final partial batch
    # must be kept so that every sample is evaluated.
    val_dataloader = data.DataLoader(val_dataset.transform_first(val_transformer),
                                     batch_size=batch_size, shuffle=False,
                                     last_batch='keep', num_workers=num_workers)

    return train_dataloader, val_dataloader
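
A minimal usage sketch for the loader above; the directory paths, the 224-pixel input size, and the batch size are illustrative assumptions, not part of the original example:

# Hypothetical paths and hyperparameters, purely for illustration.
train_loader, val_loader = get_folder_data('data/train', 'data/val',
                                           data_shape=224, batch_size=32)
for images, labels in train_loader:
    print(images.shape, labels.shape)  # e.g. (32, 3, 224, 224) and (32,)
    break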
Example no. 2
import logging
import os

import mxnet as mx
from mxnet import autograd as ag, gluon
from mxnet.gluon.data.vision import transforms
from mxnet.gluon.data.vision.datasets import ImageFolderDataset
from mxnet.gluon.model_zoo import vision as models


def train(current_host, hosts, num_gpus, log_interval, channel_input_dirs,
          batch_size, epochs, learning_rate, momentum, wd, resnet_size):

    print("Using Resnet {} model".format(resnet_size))
    model_options = {
        '18': models.resnet18_v2,
        '34': models.resnet34_v2,
        '50': models.resnet50_v2,
        '101': models.resnet101_v2,
        '152': models.resnet152_v2
    }

    if resnet_size not in model_options:
        raise ValueError('resnet_size must be one of 18, 34, 50, 101, or 152')

    # Single host: synchronize across local devices; multiple hosts:
    # synchronous distributed training.
    if len(hosts) == 1:
        kvstore = 'device' if num_gpus > 0 else 'local'
    else:
        kvstore = 'dist_device_sync'

    # Only a single device is used here, even if more GPUs are available.
    ctx = mx.gpu() if num_gpus > 0 else mx.cpu()
    print("Using context: {}".format(ctx))
    selected_model = model_options[resnet_size]
    # Transfer learning: reuse the pretrained feature extractor, but attach a
    # fresh 2-class output layer (e.g. hotdog vs. not hotdog).
    pretrained_net = selected_model(ctx=ctx, pretrained=True)
    net = selected_model(ctx=ctx, pretrained=False, classes=2)
    net.features = pretrained_net.features

    # Index of this host within the cluster (kept for potential data sharding).
    part_index = hosts.index(current_host) if current_host in hosts else 0

    data_dir = channel_input_dirs
    CHECKPOINTS_DIR = '/opt/ml/checkpoints'
    # makedirs with exist_ok avoids crashing if the directory already exists.
    os.makedirs(CHECKPOINTS_DIR, exist_ok=True)
    checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR)

    # Avoid shadowing the enclosing train() function with a local `train`.
    train_dataset = ImageFolderDataset('/opt/ml/input/data/training/train')
    test_dataset = ImageFolderDataset('/opt/ml/input/data/training/test')

    transform_func = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        # Note: these are CIFAR-10 channel statistics; ImageNet-pretrained
        # weights are more commonly paired with mean=[0.485, 0.456, 0.406]
        # and std=[0.229, 0.224, 0.225].
        transforms.Normalize(mean=[0.49139969, 0.48215842, 0.44653093],
                             std=[0.20220212, 0.19931542, 0.20086347])
    ])

    train_transformed = train_dataset.transform_first(transform_func)
    test_transformed = test_dataset.transform_first(transform_func)

    print("Transformed Training and Test Files")

    # Use the batch_size argument rather than a hard-coded value.
    train_data = gluon.data.DataLoader(train_transformed,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=1)
    test_data = gluon.data.DataLoader(test_transformed,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      num_workers=1)

    print("Initialized Batching Operation")

    # force_reinit defaults to False, so initialize() only touches the
    # uninitialized 2-class output layer; the pretrained features are kept.
    net.initialize(mx.init.Xavier(), ctx=ctx)

    # Trainer is for updating parameters with gradient.
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(),
                            'sgd',
                            optimizer_params={
                                'learning_rate': learning_rate,
                                'momentum': momentum,
                                'wd': wd
                            },
                            kvstore=kvstore)
    metric = mx.metric.Accuracy()
    net.hybridize()

    # Initial "best" validation loss; any sufficiently large value works.
    best_loss = 5.0
    for epoch in range(epochs):
        # training loop (with autograd and trainer steps, etc.)
        cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
        training_samples = 0
        metric.reset()
        for batch_idx, (data, label) in enumerate(train_data):
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            with ag.record():
                output = net(data)
                loss = criterion(output, label)
            loss.backward()
            # step() normalizes the gradient by the batch size.
            trainer.step(data.shape[0])
            metric.update(label, output)
            cumulative_train_loss += loss.sum()
            training_samples += data.shape[0]
        train_loss = cumulative_train_loss.asscalar() / training_samples
        name, train_acc = metric.get()

        # validation loop
        cumulative_valid_loss = mx.nd.zeros(1, ctx)
        valid_samples = 0
        metric.reset()
        for batch_idx, (data, label) in enumerate(test_data):
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            output = net(data)
            loss = criterion(output, label)
            cumulative_valid_loss += loss.sum()
            valid_samples += data.shape[0]
            metric.update(label, output)
        valid_loss = cumulative_valid_loss.asscalar() / valid_samples
        name, val_acc = metric.get()

        print(
            "Epoch {}, training loss: {:.2f}, validation loss: {:.2f}, train accuracy: {:.2f}, validation accuracy: {:.2f}"
            .format(epoch, train_loss, valid_loss, train_acc, val_acc))

        # only save params on primary host
        if checkpoints_enabled and current_host == hosts[0]:
            if valid_loss < best_loss:
                best_loss = valid_loss
                logging.info('Saving the model, params and optimizer state')
                net.export(CHECKPOINTS_DIR + "/%.4f-hotdog" % best_loss,
                           epoch)
                # `save` is a helper assumed to be defined elsewhere in the
                # original script.
                save(net, CHECKPOINTS_DIR)
                trainer.save_states(CHECKPOINTS_DIR +
                                    '/%.4f-hotdog-%d.states' %
                                    (best_loss, epoch))

    return net
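
A single-host invocation sketch for the function above; every argument value below is an illustrative assumption (in a SageMaker-style setup these would come from the training job configuration). Note that resnet_size must be passed as a string key:

# Illustrative values only; a real training job supplies these.
net = train(current_host='algo-1',
            hosts=['algo-1'],
            num_gpus=1,
            log_interval=100,
            channel_input_dirs='/opt/ml/input/data/training',
            batch_size=32,
            epochs=10,
            learning_rate=0.01,
            momentum=0.9,
            wd=0.0001,
            resnet_size='18')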