Example No. 1
def run_training(
        net, train, valid, result_dir, batchsize=64, devices=-1,
        training_epoch=300, initial_lr=0.05, lr_decay_rate=0.5,
        lr_decay_epoch=30, weight_decay=0.0005):
    # Iterator
    train_iter = iterators.MultiprocessIterator(train, batchsize)
    test_iter = iterators.MultiprocessIterator(valid, batchsize, False, False)

    # Model
    net = L.Classifier(net)

    # Optimizer
    optimizer = optimizers.MomentumSGD(lr=initial_lr)
    optimizer.setup(net)
    if weight_decay > 0:
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))

    # Updater
    if isinstance(devices, int):
        # Normalize a single GPU ID into the dict form used by the Evaluator below.
        devices = {'main': devices}
        updater = training.StandardUpdater(
            train_iter, optimizer, device=devices['main'])
    elif isinstance(devices, dict):
        updater = training.ParallelUpdater(
            train_iter, optimizer, devices=devices)

    # Trainer
    trainer = training.Trainer(
        updater, (training_epoch, 'epoch'), out=result_dir)

    # Trainer extensions
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.observe_lr())
    trainer.extend(extensions.Evaluator(
        test_iter, net, device=devices['main']), name='val')
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'main/accuracy', 'val/main/loss',
         'val/main/accuracy', 'elapsed_time', 'lr']))
    trainer.extend(extensions.PlotReport(
        ['main/loss', 'val/main/loss'], x_key='epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(
        ['main/accuracy', 'val/main/accuracy'], x_key='epoch',
        file_name='accuracy.png'))
    trainer.extend(extensions.ExponentialShift(
        'lr', lr_decay_rate), trigger=(lr_decay_epoch, 'epoch'))
    trainer.run()

    return net
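A minimal sketch of how run_training might be called, assuming the usual module-level imports that the snippet omits (chainer, iterators, optimizers, training, extensions, and chainer.links as L) and using CIFAR-10 with a tiny placeholder network; every name defined below is illustrative and not part of the original:

import chainer
import chainer.functions as F
import chainer.links as L
from chainer.datasets import get_cifar10, split_dataset_random


class SmallCNN(chainer.Chain):
    """Tiny stand-in network; any Link mapping images to class scores works here."""

    def __init__(self, n_class=10):
        super(SmallCNN, self).__init__()
        with self.init_scope():
            self.conv = L.Convolution2D(None, 32, ksize=3, pad=1)
            self.fc = L.Linear(None, n_class)

    def __call__(self, x):
        h = F.relu(self.conv(x))
        h = F.max_pooling_2d(h, 2)
        return self.fc(h)


train_val, _ = get_cifar10()
train, valid = split_dataset_random(train_val, 45000, seed=0)

# devices=0 selects StandardUpdater on GPU 0 (use -1 for CPU);
# a dict such as {'main': 0, 'gpu1': 1} selects ParallelUpdater instead.
trained_net = run_training(SmallCNN(), train, valid, 'result', devices=0)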
Example No. 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        choices=('ssd300', 'ssd512'),
                        default='ssd300')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    if args.model == 'ssd300':
        model = SSD300(n_fg_class=len(voc_bbox_label_names),
                       pretrained_model='voc0712')
    elif args.model == 'ssd512':
        model = SSD512(n_fg_class=len(via_bbox_label_names),
                       pretrained_model='imagenet')

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    train = TransformDataset(DatasetFromDat(file_path='Rack.dat'),
                             Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)

    test = DatasetFromDat('Rack_val.dat')
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    # 120000->8000
    trainer = training.Trainer(updater, (500, 'iteration'), args.out)
    # 80000->5000,100000->7000
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=1e-3),
                   trigger=triggers.ManualScheduleTrigger([300, 400],
                                                          'iteration'))
    # 10000->700
    trainer.extend(DetectionEvaluator(test_iter,
                                      model,
                                      use_07_metric=True,
                                      label_names=via_bbox_label_names),
                   trigger=(7, 'iteration'))

    log_interval = 10, 'iteration'
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
        'main/loss/conf', 'validation/main/map'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    # 10000->700
    trainer.extend(extensions.snapshot(), trigger=(50, 'iteration'))
    # 120000->8000
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=(500, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()

    serializers.save_npz('via_model', model)
    serializers.save_npz('via_state', optimizer)
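DatasetFromDat is project-specific and not shown. For Transform and MultiboxTrainChain to consume it, each example it returns presumably follows the ChainerCV detection convention: a CHW float32 image, an (R, 4) float32 array of (y_min, x_min, y_max, x_max) boxes, and an (R,) int32 label array. A minimal stand-in illustrating that interface (purely hypothetical, not the real loader):

import numpy as np
from chainer.dataset import DatasetMixin


class MinimalBboxDataset(DatasetMixin):
    """Illustrative replacement for DatasetFromDat returning (img, bbox, label)."""

    def __init__(self, images, bboxes, labels):
        self.images = images  # list of CHW float32 arrays
        self.bboxes = bboxes  # list of (R, 4) float32 arrays: (y_min, x_min, y_max, x_max)
        self.labels = labels  # list of (R,) int32 arrays of foreground class indices

    def __len__(self):
        return len(self.images)

    def get_example(self, i):
        return self.images[i], self.bboxes[i], self.labels[i]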
Example No. 3
def TrainUNet(X,
              Y,
              model_=None,
              optimizer_=None,
              epoch=40,
              alpha=0.001,
              gpu_id=0,
              loop=1,
              earlystop=True):
    assert (len(X) == len(Y))
    d_time = datetime.datetime.now().strftime("%m-%d-%H-%M-%S")

    # 1. Model load.

    # print(sum(p.data.size for p in model.unet.params()))
    if model_ is not None:
        model = Regressor(model_)
        print("## model loaded.")
    else:
        model = Regressor(UNet())

    model.compute_accuracy = False

    if gpu_id >= 0:
        model.to_gpu(gpu_id)

    # 2. optimizer load.

    if optimizer_ is not None:
        opt = optimizer_
        print("## optimizer loaded.")
    else:
        opt = optimizers.Adam(alpha=alpha)
        opt.setup(model)

    # 3. Data Split.
    dataset = Unet_DataSet(X, Y)
    print("# number of patterns", len(dataset))

    train, valid = \
        split_dataset_random(dataset, int(len(dataset) * 0.8), seed=0)

    # 4. Iterator
    train_iter = SerialIterator(train, batch_size=C.BATCH_SIZE)
    test_iter = SerialIterator(valid,
                               batch_size=C.BATCH_SIZE,
                               repeat=False,
                               shuffle=False)

    # 5. config train, enable backprop
    chainer.config.train = True
    chainer.config.enable_backprop = True

    # 6. UnetUpdater
    updater = UnetUpdater(train_iter, opt, model, device=gpu_id)

    # 7. EarlyStopping
    if earlystop:
        stop_trigger = triggers.EarlyStoppingTrigger(
            monitor='validation/main/loss',
            max_trigger=(epoch, 'epoch'),
            patients=5)
    else:
        stop_trigger = (epoch, 'epoch')

    # 8. Trainer
    trainer = training.Trainer(updater, stop_trigger, out=C.PATH_TRAINRESULT)

    # 8.1. UnetEvaluator
    trainer.extend(UnetEvaluator(test_iter, model, device=gpu_id))

    trainer.extend(SaveRestore(),
                   trigger=triggers.MinValueTrigger('validation/main/loss'))

    # 8.2. Extensions LogReport
    trainer.extend(extensions.LogReport())

    # 8.3. Extension Snapshot
    # trainer.extend(extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
    # trainer.extend(extensions.snapshot_object(model.unet, filename='loop' + str(loop) + '.model'))

    # 8.4. Print Report
    trainer.extend(extensions.observe_lr())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'elapsed_time', 'lr'
        ]))

    # 8.5. Extension Graph
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              x_key='epoch',
                              file_name='loop-' + str(loop) + '-loss' +
                              d_time + '.png'))
    # trainer.extend(extensions.dump_graph('main/loss'))

    # 8.6. Progress Bar
    trainer.extend(extensions.ProgressBar())

    # 9. Trainer run
    trainer.run()

    chainer.serializers.save_npz(C.PATH_TRAINRESULT / ('loop' + str(loop)),
                                 model.unet)
    return model.unet, opt
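A minimal call sketch for TrainUNet, assuming X and Y are matching stacks of single-channel patches that Unet_DataSet (defined elsewhere) accepts; the shapes below are placeholders:

import numpy as np

X = np.random.rand(100, 1, 128, 128).astype(np.float32)  # hypothetical input patches
Y = np.random.rand(100, 1, 128, 128).astype(np.float32)  # hypothetical target patches

unet, opt = TrainUNet(X, Y, epoch=5, gpu_id=-1, loop=1, earlystop=False)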
Example No. 4
def main(argv=sys.argv[1:]):

    if type(argv) == str:
        argv = argv.split()

    parser = ArgumentParserWithEpilog(
        description='Chainer CIFAR with recall error:')

    # Command line arguments
    add_base_args(parser)
    parser.add_argument(
        '--dynamic_rescale',
        '-R',
        default=False,
        type=float,
        help=
        'Rescale activations to this range [-R,+R] on a per-channel basis, before compressing'
    )
    add_ae_args(parser)
    args = parser.parse_args(argv)

    # Other settings and derived arguments
    end_trigger = (args.epoch, 'epoch')
    report_file = os.path.join(args.out, 'report.txt')
    report_entries = [
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'lr', 'elapsed_time'
    ]

    # Header, and output directory
    if not os.path.exists(args.out):
        os.mkdir(args.out)
    open(report_file, 'w').close()  # Clears report
    report = open(report_file, 'a')
    print_log = print_header(args,
                             argv,
                             log=report,
                             preamble='CIFAR10/100 (%s)' % __file__)

    ##
    # Set up model and dataset iterators
    rng, fixed_seeds = seed_rng(args.seed, args.gpu)
    train_iter, val_iter, class_labels = load_dataset(args.batchsize,
                                                      args.dataset,
                                                      args.augment, args.fast,
                                                      args.old_test_method)
    model = init_model(models[args.model],
                       class_labels=class_labels,
                       gpu=args.gpu,
                       fast=args.fast)

    ##
    # Get the recall error helper map
    all_layers = model.predictor.act_names
    helper_map, filterspec_map = parse_ae_args(parser,
                                               args,
                                               rng,
                                               all_layers=all_layers)
    print_helper_summary(helper_map, filterspec_map, print_log)
    print_helper_map(all_layers, helper_map, print_log)

    # Set up an optimizer
    lr, lr_ext, lr_trigger = get_lr_schedule(args, train_iter, fast=args.fast)

    optimizer = MomentumSGDScrambler(helper_map,
                                     compress_x_hat=False,
                                     dynamic_rescale=args.dynamic_rescale,
                                     lr=lr,
                                     momentum=args.momentum)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, end_trigger, out=args.out)

    # Decay
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    # Learning rate schedule
    trainer.extend(lr_ext, trigger=lr_trigger)

    # Extensions - Measurements
    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu))
    trainer.extend(extensions.observe_lr())

    # Extensions - Logging
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(report_entries))
    trainer.extend(PrintReportNoSpecial(report_entries, out=report))
    trainer.extend(
        extensions.ProgressBar(update_interval=args.update_interval))

    # Extensions - Snapshots
    trainer.extend(extensions.snapshot(), trigger=end_trigger)
    if args.snapshot_every:
        trainer.extend(extensions.snapshot(
            filename='snapshot_{0.updater.epoch}_iter_{0.updater.iteration}'),
                       trigger=(args.snapshot_every, 'epoch'))

    ##
    # Resume Training
    if args.resume:
        #chainer.serializers.load_npz(args.resume, trainer)
        from train_cifar import model_from_snapshot
        model_from_snapshot(model, args.resume)

    ##
    # Run the training
    trainer.run()

    report.close()

    return trainer, None, helper_map
Example No. 5
def main():

    args = parser()
    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    save_dir = Path('result') / now
    log_dir = save_dir / 'log'
    model_dir = save_dir / 'model'
    snap_dir = save_dir / 'snap'
    matrix_dir = save_dir / 'matrix'

    save_dir.mkdir(exist_ok=True, parents=True)
    log_dir.mkdir(exist_ok=True, parents=True)
    model_dir.mkdir(exist_ok=True, parents=True)
    snap_dir.mkdir(exist_ok=True, parents=True)
    matrix_dir.mkdir(exist_ok=True, parents=True)


    root = args.dataset

    dir_list = os.listdir(root)
    dir_list.sort()

    if 'mean.npy' in dir_list:
        dir_list.remove('mean.npy')

    print('dataset loading ...')
    datasets = DirectoryParsingLabelDataset(root)
    print('finish!')

    class_num = len(set(datasets.labels))
    print('class number : {}'.format(class_num))


    k_fold = args.kfold
    print('k_fold : {}'.format(k_fold))

    X = np.array([image_paths for image_paths in datasets.img_paths])
    y = np.array([label for label in datasets.labels])

    kfold = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=402).split(X, y)
    for k, (train_idx, val_idx) in enumerate(kfold):

        print("============= {} fold training =============".format(k + 1))
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        train = LabeledImageDataset([(x, y) for x, y in zip(X_train, y_train)])
        validation = LabeledImageDataset([(x, y) for x, y in zip(X_val, y_val)])

        train, validation, mean = get_dataset(train, validation, root, datasets, use_mean=False)


        model = L.Classifier(archs[args.arch](output=class_num))
        lr = args.lr
        optimizer = chainer.optimizers.MomentumSGD(lr)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0001))

        if args.gpu >= 0:
            chainer.cuda.get_device_from_id(args.gpu).use()
            model.to_gpu()

        train_iter = chainer.iterators.MultithreadIterator(train, args.batchsize, n_threads=8)
        validation_iter = chainer.iterators.MultithreadIterator(validation, args.batchsize,
                                                                repeat=False, shuffle=False, n_threads=8)
        updater = training.StandardUpdater(
            train_iter, optimizer, device=args.gpu)
        trainer = training.Trainer(
            updater, (args.epoch, 'epoch'), out=save_dir)

        log_trigger = (1, 'epoch')
        target = 'lr'
        trainer.extend(CosineShift(target, args.epoch, 1),
                       trigger=(1, "epoch"))

        trainer.extend(extensions.Evaluator(validation_iter, model, device=args.gpu),
                       trigger=log_trigger)

        snap_name = '{}-{}_fold_model.npz'.format(k_fold, k+1)
        trainer.extend(extensions.snapshot_object(model, str(snap_name)),
                       trigger=chainer.training.triggers.MaxValueTrigger(
                       key='validation/main/accuracy', trigger=(1, 'epoch')))

        log_name = '{}-{}_fold_log.json'.format(k_fold, k+1)
        trainer.extend(extensions.LogReport(
            log_name=str(log_name), trigger=log_trigger))

        trainer.extend(extensions.observe_lr(), trigger=log_trigger)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration',
            'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy',
            'elapsed_time', 'lr'
        ]), trigger=(1, 'epoch'))

        trainer.extend(extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                             'epoch', file_name='loss{}.png'.format(k+1)))
        trainer.extend(extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'],
                                             'epoch', file_name='accuracy{}.png'.format(k+1)))
        trainer.extend(extensions.ProgressBar(update_interval=10))
 
        trainer.run()


        snap_file = save_dir / snap_name
        shutil.move(str(snap_file), str(snap_dir))

        log_file = save_dir / log_name
        shutil.move(str(log_file), str(log_dir))

        save_model = model_dir / "{}_{}-{}_fold.npz".format(now, k_fold, k + 1)
        chainer.serializers.save_npz(str(save_model), model)

        print("============= {} fold Evaluation =============".format(k + 1))

        dnames = glob.glob('{}/*'.format(root))
        labels_list = []
        for d in dnames:
            p_dir = Path(d)
            labels_list.append(p_dir.name)

        if 'mean.npy' in labels_list:
            labels_list.remove('mean.npy')

        confusion_matrix_cocoa(validation, args.gpu, 7,
                               model, matrix_dir, k, labels_list)
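The fold generation above comes from scikit-learn; a self-contained sketch of just that step, showing that StratifiedKFold yields index arrays that preserve the class balance of y:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array(['img_{:03d}.png'.format(i) for i in range(20)])  # image paths
y = np.array([i % 4 for i in range(20)])                       # 4 balanced classes

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=402).split(X, y)
for k, (train_idx, val_idx) in enumerate(kfold):
    # Each fold keeps roughly the same class distribution in both splits.
    print(k + 1, np.bincount(y[train_idx]), np.bincount(y[val_idx]))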
Example No. 6
    sum(depth) * 2 + 1, args.valid)
trainer = training.Trainer(updater, (epoch_size * max_epoch, 'iteration'),
                           out=result_dir)

from chainer.training import extensions

trainer.extend(extensions.LogReport(trigger=(epoch_size, 'iteration')))
trainer.extend(
    extensions.snapshot(filename='snapshot_iteration-{.updater.iteration}'),
    trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.snapshot_object(
    model.predictor, filename='model_iteration-{.updater.iteration}'),
               trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.Evaluator(test_iter, model, device=gpu_id),
               trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.observe_lr(), trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.PrintReport([
    'iteration', 'lr', 'main/accuracy', 'validation/main/accuracy',
    'elapsed_time'
]),
               trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.dump_graph('main/loss'))
trainer.extend(extensions.ExponentialShift('lr', 0.5),
               trigger=(epoch_size * 3, 'iteration'))
trainer.extend(extensions.ProgressBar(update_interval=30))

print('running')
print('result_dir:{}'.format(result_dir))

trainer.run()
Example No. 7
def main():
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--dataset', '-d', default='cifar100',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--model', '-m', default='VGG16',
                        help='The model to use: VGG16 or PreResNet110'
                             ' or WideResNet28x10')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--lr_init', '-l', type=float, default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=200,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--wd', type=float, default=1e-4,
                        help='weight decay')
    parser.add_argument('--swa', action='store_true',
                        help='swa usage flag')
    parser.add_argument('--swa_start', type=float, default=161,
                        help='SWA start epoch number')
    parser.add_argument('--swa_lr', type=float, default=0.05,
                        help='SWA LR')
    parser.add_argument('--swa_c_epochs', type=int, default=1,
                        help='SWA model collection frequency length in epochs')

    args = parser.parse_args()

    if args.dataset.lower() == 'cifar10':
        print('Using CIFAR10 dataset')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset.lower() == 'cifar100':
        print('Using CIFAR100 dataset')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    print('Using %s model' % args.model)
    if args.model == 'VGG16':
        model_cls = VGG16
    elif args.model == 'PreResNet110':
        model_cls = PreResNet110
    elif args.model == 'WideResNet28x10':
        model_cls = WideResNet28x10
    else:
        raise RuntimeError('Invalid model choice.')

    model = L.Classifier(model_cls(class_labels))

    if args.swa:
        swa_model = L.Classifier(model_cls(class_labels))
        swa_n = 0

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
        if args.swa:
            swa_model.to_gpu()

    # Data augmentation / preprocess
    train = TransformDataset(train, partial(transform, train=True))
    test = TransformDataset(test, partial(transform, train=False))

    optimizer = chainer.optimizers.MomentumSGD(args.lr_init, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(args.wd))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    swa_train_iter = chainer.iterators.SerialIterator(
        train, args.batchsize, repeat=False, shuffle=False)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)
    stop_trigger = (args.epoch, 'epoch')

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Learning rate adjustment (this function is called every epoch)
    def lr_schedule(trainer):
        epoch = trainer.updater.epoch
        t = epoch / (args.swa_start if args.swa else args.epoch)
        lr_ratio = args.swa_lr / args.lr_init if args.swa else 0.01

        if t <= 0.5:
            factor = 1.0
        elif t <= 0.9:
            factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
        else:
            factor = lr_ratio
        trainer.updater.get_optimizer('main').lr = factor * args.lr_init

    # The main function for SWA (this function is called every epoch)
    def avg_weight(trainer):
        epoch = trainer.updater.epoch
        if args.swa and (epoch + 1) >= args.swa_start and \
                (epoch + 1 - args.swa_start) % args.swa_c_epochs == 0:
            nonlocal swa_n
            # moving average
            alpha = 1.0 / (swa_n + 1)
            for param1, param2 in zip(swa_model.params(), model.params()):
                param1.data *= (1.0 - alpha)
                param1.data += param2.data * alpha
            swa_n += 1

    # This function is called before evaluating the SWA model
    # to fix batchnorm's running mean and variance
    def fix_swa_batchnorm(evaluator):
        # Check batchnorm layer
        bn_flg = False
        for l in swa_model.links():
            if type(l) == L.normalization.batch_normalization.BatchNormalization:
                bn_flg = True
                break

        # Fix batchnorm's running mean and variance
        if bn_flg:
            swa_train_iter.reset()
            with chainer.using_config('train', True):
                for batch in swa_train_iter:
                    in_arrays = evaluator.converter(batch, evaluator.device)
                    with function.no_backprop_mode():
                        swa_model(*in_arrays)

    # Set up extensions
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu),
                   trigger=(5, 'epoch'))
    if args.swa:
        eval_points = [x for x in range(args.epoch + 1)
                       if x > args.swa_start and x % 5 == 0]
        trainer.extend(SwaEvaluator(test_iter, swa_model, device=args.gpu, eval_hook=fix_swa_batchnorm),
                       trigger=triggers.ManualScheduleTrigger(eval_points, 'epoch'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(lr_schedule, trigger=triggers.IntervalTrigger(1, 'epoch'))
    trainer.extend(avg_weight, trigger=triggers.IntervalTrigger(1, 'epoch'))
    trainer.extend(extensions.observe_lr())
    trainer.extend(extensions.LogReport())
    cols = ['epoch', 'lr', 'main/loss', 'main/accuracy',
            'validation/main/loss', 'validation/main/accuracy', 'elapsed_time']
    if args.swa:
        cols = cols[:-1] + ['swa/main/loss', 'swa/main/accuracy'] + cols[-1:]
    trainer.extend(extensions.PrintReport(cols))
    trainer.extend(extensions.ProgressBar())

    trainer.run()
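The avg_weight extension above keeps a running arithmetic mean of the weights: with alpha = 1/(swa_n + 1), the update p_swa = (1 - alpha) * p_swa + alpha * p equals the mean of all collected snapshots. A small numpy check of that identity:

import numpy as np

snapshots = [np.random.rand(3) for _ in range(5)]  # stand-ins for per-epoch weight vectors

swa, swa_n = np.random.rand(3), 0  # the initial value is overwritten on the first update
for p in snapshots:
    alpha = 1.0 / (swa_n + 1)
    swa = (1.0 - alpha) * swa + alpha * p
    swa_n += 1

assert np.allclose(swa, np.mean(snapshots, axis=0))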
Example No. 8
def main():
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Start method of multiprocessing module need to be changed if we
    # are using InfiniBand and MultiprocessIterator. This is because
    # processes often crash when calling fork if they are using
    # Infiniband.  (c.f.,
    # https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    # Also, just setting the start method does not seem to be
    # sufficient to actually launch the forkserver processes, so also
    # start a dummy process.
    # See also our document:
    # https://chainermn.readthedocs.io/en/stable/tutorial/tips_faqs.html#using-multiprocessiterator
    # This must be done *before* ``chainermn.create_communicator``!!!
    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process(target=lambda *x: x, args=())
    p.start()
    p.join()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # A workaround for processes crash should be done before making
    # communicator above, when using fork (e.g. MultiProcessIterator)
    # along with Infiniband.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
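Because the script creates a ChainerMN communicator, it is meant to be launched as one MPI process per GPU, typically with something like mpiexec -n 4 python this_script.py <train list> <val list>; the exact launch command depends on the MPI installation and the cluster scheduler.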
Example No. 9
def train(args):
    config = yaml.load(open(args.config))

    print('==========================================')

    # Set workspace size
    if 'max_workspace_size' in config:
        chainer.cuda.set_max_workspace_size(config['max_workspace_size'])

    # Output version info
    print('chainer version: {}'.format(chainer.__version__))
    print('cuda: {}, cudnn: {}, nccl: {}'.format(chainer.cuda.available,
                                                 chainer.cuda.cudnn_enabled,
                                                 HAVE_NCCL))

    # Create result_dir
    if args.result_dir is not None:
        config['result_dir'] = args.result_dir
    else:
        config['result_dir'] = create_result_dir_from_config_path(args.config)
    log_fn = save_config_get_log_fn(config['result_dir'], args.config)
    print('result_dir:', config['result_dir'])

    # Instantiate model
    model = get_model_from_config(config)
    print('model:', model.__class__.__name__)

    # Initialize optimizer
    optimizer = get_optimizer_from_config(model, config)
    print('optimizer:', optimizer.__class__.__name__)

    # Setting up datasets
    train_dataset, valid_dataset = get_dataset_from_config(config)
    print('train_dataset: {}'.format(len(train_dataset)),
          train_dataset.__class__.__name__)
    print('valid_dataset: {}'.format(len(valid_dataset)),
          valid_dataset.__class__.__name__)

    # Prepare devices
    devices = {'main': args.gpus[0]}
    for gid in args.gpus[1:]:
        devices['gpu{}'.format(gid)] = gid

    # Create iterators
    train_iter, valid_iter = create_iterators(
        train_dataset, config['dataset']['train']['batchsize'], valid_dataset,
        config['dataset']['valid']['batchsize'], devices)
    print('train_iter:', train_iter.__class__.__name__)
    print('valid_iter:', valid_iter.__class__.__name__)

    # Create updater
    updater_creator = get_updater_creator_from_config(config)
    updater = updater_creator(train_iter, optimizer, devices)
    print('updater:', updater.__class__.__name__)

    # Create trainer
    trainer = training.Trainer(updater,
                               config['stop_trigger'],
                               out=config['result_dir'])
    print('Trainer stops:', config['stop_trigger'])

    # Trainer extensions
    for ext in config['trainer_extension']:
        ext, values = ext.popitem()
        if ext == 'LogReport':
            trigger = values['trigger']
            trainer.extend(
                extensions.LogReport(trigger=trigger, log_name=log_fn))
        elif ext == 'observe_lr':
            trainer.extend(extensions.observe_lr(), trigger=values['trigger'])
        elif ext == 'dump_graph':
            trainer.extend(extensions.dump_graph(**values))
        elif ext == 'Evaluator':
            evaluator_creator = get_evaluator_creator_from_config(values)
            evaluator = evaluator_creator(valid_iter, model, devices)
            trainer.extend(evaluator,
                           trigger=values['trigger'],
                           name=values['prefix'])
        elif ext == 'PlotReport':
            trainer.extend(extensions.PlotReport(**values))
        elif ext == 'PrintReport':
            trigger = values.pop('trigger')
            trainer.extend(extensions.PrintReport(**values), trigger=trigger)
        elif ext == 'ProgressBar':
            upd_int = values['update_interval']
            trigger = values['trigger']
            trainer.extend(extensions.ProgressBar(update_interval=upd_int),
                           trigger=trigger)
        elif ext == 'snapshot':
            filename = values['filename']
            trigger = values['trigger']
            trainer.extend(extensions.snapshot(filename=filename),
                           trigger=trigger)
        elif ext == 'ParameterStatistics':
            links = []
            for link_name in values.pop('links'):
                lns = [ln.strip() for ln in link_name.split('.') if ln.strip()]
                target = model.predictor
                for ln in lns:
                    target = getattr(target, ln)
                links.append(target)
            trainer.extend(extensions.ParameterStatistics(links, **values))
        elif ext == 'custom':
            custom_extension = get_custum_extension_from_config(values)
            trainer.extend(custom_extension, trigger=values['trigger'])

    # LR decay
    if 'lr_drop_ratio' in config['optimizer'] \
            and 'lr_drop_triggers' in config['optimizer']:
        ratio = config['optimizer']['lr_drop_ratio']
        points = config['optimizer']['lr_drop_triggers']['points']
        unit = config['optimizer']['lr_drop_triggers']['unit']
        drop_trigger = triggers.ManualScheduleTrigger(points, unit)

        def lr_drop(trainer):
            trainer.updater.get_optimizer('main').lr *= ratio

        trainer.extend(lr_drop, trigger=drop_trigger)

    # Resume
    if args.resume is not None:
        fn = '{}.bak'.format(args.resume)
        shutil.copy(args.resume, fn)
        serializers.load_npz(args.resume, trainer)
        print('Resumed from:', args.resume)

    print('==========================================')

    trainer.run()
    return 0
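The YAML file behind args.config is not shown; judging only from the keys accessed in this snippet, the parsed structure looks roughly like the dict below (a hedged sketch with placeholder values; the model, optimizer, and dataset sections additionally need whatever their get_*_from_config helpers read):

config = {
    'result_dir': 'results/experiment',          # optional; otherwise derived from the config path
    'max_workspace_size': 512 * 1024 * 1024,     # optional
    'stop_trigger': [100, 'epoch'],
    'dataset': {
        'train': {'batchsize': 32},
        'valid': {'batchsize': 16},
    },
    'optimizer': {
        'lr_drop_ratio': 0.1,                    # optional LR decay
        'lr_drop_triggers': {'points': [50, 75], 'unit': 'epoch'},
    },
    'trainer_extension': [
        {'LogReport': {'trigger': [1, 'epoch']}},
        {'observe_lr': {'trigger': [1, 'epoch']}},
        {'PrintReport': {'entries': ['epoch', 'main/loss', 'valid/main/loss'],
                         'trigger': [1, 'epoch']}},
    ],
}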
Example No. 10
def main():
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(),
                        default='nin', help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--gpus', '-g', type=int, nargs="*",
                        default=[0, 1, 2, 3])
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    # Load the datasets and mean file
    mean = np.load(args.mean)
    train = train_imagenet.PreprocessedDataset(
        args.train, args.root, mean, model.insize)
    val = train_imagenet.PreprocessedDataset(
        args.val, args.root, mean, model.insize, False)
    # These iterators load the images with subprocesses running in parallel to
    # the training/validation.
    devices = tuple(args.gpus)

    train_iters = [
        chainer.iterators.MultiprocessIterator(i,
                                               args.batchsize,
                                               n_processes=args.loaderjob)
        for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = updaters.MultiprocessParallelUpdater(train_iters, optimizer,
                                                   devices=devices)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'

    trainer.extend(train_imagenet.TestModeEvaluator(val_iter, model,
                                                    device=args.gpus[0]),
                   trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=2))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example No. 11
    updater = training.StandardUpdater(
        train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(
        updater, (args.iteration, "iteration"), args.out)

    val_interval = args.val_iter, "iteration"
    trainer.extend(
        DetectionVOCEvaluator(
            test_iter, model, use_07_metric=True,
            label_names=label_names),
        trigger=val_interval)

    log_interval = args.log_iter, "iteration"
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'lr',
         'main/loss', 'main/loss/loc', 'main/loss/conf',
         'validation/main/map']),
        trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=(args.model_iter, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)
Example No. 12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=12)
    parser.add_argument('--class_weight', type=str, default='class_weight.npy')
    parser.add_argument('--out', type=str, default='result')
    args = parser.parse_args()

    # Triggers
    log_trigger = (50, 'iteration')
    validation_trigger = (2000, 'iteration')
    end_trigger = (16000, 'iteration')

    # Dataset
    train = CamVidDataset(split='train')
    train = TransformDataset(train, transform)
    val = CamVidDataset(split='val')

    # Iterator
    train_iter = iterators.MultiprocessIterator(train, args.batchsize)
    val_iter = iterators.MultiprocessIterator(
        val, args.batchsize, shuffle=False, repeat=False)

    # Model
    class_weight = np.load(args.class_weight)
    model = SegNetBasic(n_class=11)
    model = PixelwiseSoftmaxClassifier(
        model, class_weight=class_weight)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Optimizer
    optimizer = optimizers.MomentumSGD(lr=0.1, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))

    # Updater
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)

    # Trainer
    trainer = training.Trainer(updater, end_trigger, out=args.out)

    trainer.extend(extensions.LogReport(trigger=log_trigger))
    trainer.extend(extensions.observe_lr(), trigger=log_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))

    if extensions.PlotReport.available():
        trainer.extend(extensions.PlotReport(
            ['main/loss'], x_key='iteration',
            file_name='loss.png'))
        trainer.extend(extensions.PlotReport(
            ['validation/main/miou'], x_key='iteration',
            file_name='miou.png'))

    trainer.extend(extensions.snapshot_object(
        model.predictor, filename='model_iteration-{.updater.iteration}'),
        trigger=end_trigger)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'elapsed_time', 'lr',
         'main/loss', 'validation/main/miou',
         'validation/main/mean_class_accuracy',
         'validation/main/pixel_accuracy']),
        trigger=log_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.extend(
        SemanticSegmentationEvaluator(
            val_iter, model.predictor,
            camvid_label_names),
        trigger=validation_trigger)

    trainer.run()
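transform is defined outside this snippet; a minimal sketch, assuming it only mirrors the image and label map together with probability 0.5 (the actual augmentation may differ):

import numpy as np

def transform(in_data):
    # in_data is (img, label): a CHW float32 image and an HW int32 label map.
    img, label = in_data
    if np.random.rand() > 0.5:
        img = img[:, :, ::-1]    # horizontal flip of the image
        label = label[:, ::-1]   # matching flip of the label map
    return img, label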
Example No. 13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model', choices=('ssd300', 'ssd512'), default='ssd300')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    if args.model == 'ssd300':
        model = SSD300(
            n_fg_class=len(voc_bbox_label_names),
            pretrained_model='imagenet')
    elif args.model == 'ssd512':
        model = SSD512(
            n_fg_class=len(voc_bbox_label_names),
            pretrained_model='imagenet')

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    train = TransformDataset(
        ConcatenatedDataset(
            VOCBboxDataset(year='2007', split='trainval'),
            VOCBboxDataset(year='2012', split='trainval')
        ),
        Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)

    test = VOCBboxDataset(
        year='2007', split='test',
        use_difficult=True, return_difficult=True)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (120000, 'iteration'), args.out)
    trainer.extend(
        extensions.ExponentialShift('lr', 0.1, init=1e-3),
        trigger=triggers.ManualScheduleTrigger([80000, 100000], 'iteration'))

    trainer.extend(
        DetectionVOCEvaluator(
            test_iter, model, use_07_metric=True,
            label_names=voc_bbox_label_names),
        trigger=(10000, 'iteration'))

    log_interval = 10, 'iteration'
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'lr',
         'main/loss', 'main/loss/loc', 'main/loss/conf',
         'validation/main/map']),
        trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.extend(extensions.snapshot(), trigger=(10000, 'iteration'))
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=(120000, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
Example No. 14
    def setup_trainer(self):
        converter = functools.partial(
            cmr.datasets.concat_examples,
            padding=0,
            # img, bboxes, labels, masks, scales
            indices_concat=[0, 2, 3, 4],  # img, _, labels, masks, scales
            indices_to_device=[0, 1],  # img, bbox
        )
        self.updater = chainer.training.updater.StandardUpdater(
            self.train_iterator, self.optimizer, device=self.gpu,
            converter=converter)
        self.trainer = chainer.training.Trainer(
            self.updater, (self.max_epoch, 'epoch'), out=self.out_dir)

        step_size = [
            (120e3 / 180e3) * self.max_epoch,
            (160e3 / 180e3) * self.max_epoch,
        ]
        self.trainer.extend(
            extensions.ExponentialShift('lr', 0.1),
            trigger=chainer.training.triggers.ManualScheduleTrigger(
                step_size, 'epoch'))

        evaluator = cmr.extensions.InstanceSegmentationVOCEvaluator(
            self.val_iterator, self.model.mask_rcnn, device=self.gpu,
            use_07_metric=True, label_names=self.train_dataset.fg_class_names)
        self.trainer.extend(
            evaluator, trigger=(self.eval_interval, self.eval_interval_type))

        # Save snapshot
        self.trainer.extend(
            extensions.snapshot_object(
                self.model.mask_rcnn, 'snapshot_model.npz'),
            trigger=chainer.training.triggers.MaxValueTrigger(
                'validation/main/map',
                (self.save_interval, self.save_interval_type)))

        # Dump network architecture
        self.trainer.extend(
            extensions.dump_graph(
                root_name='main/loss',
                out_name='network_architecture.dot'))

        # Logging
        self.trainer.extend(
            extensions.ProgressBar(
                update_interval=self.progressbar_update_interval))
        self.trainer.extend(
            extensions.observe_lr(),
            trigger=(self.log_interval, self.log_interval_type))
        self.trainer.extend(
            extensions.LogReport(
                log_name='log.json',
                trigger=(self.log_interval, self.log_interval_type)))
        self.trainer.extend(
            extensions.PrintReport([
                'iteration',
                'epoch',
                'elapsed_time',
                'lr',
                'main/loss',
                'main/roi_loc_loss',
                'main/roi_cls_loss',
                'main/roi_mask_loss',
                'main/rpn_loc_loss',
                'main/rpn_cls_loss',
                'validation/main/map',
            ]), trigger=(self.print_interval, self.print_interval_type))

        # Plot
        self.trainer.extend(
            extensions.PlotReport([
                'main/loss',
                'main/roi_loc_loss',
                'main/roi_cls_loss',
                'main/roi_mask_loss',
                'main/rpn_loc_loss',
                'main/rpn_cls_loss',
            ],
                file_name='loss_plot.png',
                x_key=self.plot_interval_type,
                trigger=(self.plot_interval, self.plot_interval_type)),
            trigger=(self.plot_interval, self.plot_interval_type))
        self.trainer.extend(
            extensions.PlotReport(
                ['validation/main/map'],
                file_name='accuracy_plot.png',
                x_key=self.plot_interval_type,
                trigger=(self.plot_interval, self.plot_interval_type)),
            trigger=(self.eval_interval, self.eval_interval_type))

        # Dump params
        params = dict()
        params['model_name'] = self.model_name
        params['train_dataset_dir'] = self.train_dataset_dir
        params['val_dataset_dir'] = self.val_dataset_dir
        params['fg_class_names'] = self.train_dataset.fg_class_names
        params['timestamp'] = self.timestamp_iso
        params['out_dir'] = self.out_dir
        params['gpu'] = self.gpu
        params['batch_size'] = self.batch_size
        params['max_epoch'] = self.max_epoch
        params['lr'] = self.lr
        params['weight_decay'] = self.weight_decay
        self.trainer.extend(
            fcn.extensions.ParamsReport(params, file_name='params.yaml'))

        # Dump param for mask_rcnn_instance_segmentation.py
        target_names = dict()
        target_names['fg_class_names'] = self.train_dataset.fg_class_names
        self.trainer.extend(
            fcn.extensions.ParamsReport(
                target_names, file_name='fg_class_names.yaml'))
Example No. 15
def train_mode(updater, mode, lr_drop_iter, snapshot_iter, report_iter,
               stop_iter):
    trainer = training.Trainer(updater, (stop_iter, 'iteration'),
                               out='results')
    trainer.extend(
        extensions.LogReport(trigger=(report_iter, 'iteration')))
    trainer.extend(extensions.observe_lr(),
                   trigger=(report_iter, 'iteration'))
    trainer.extend(create_lrdrop_ext(args.gamma),
                   trigger=(lr_drop_iter, 'iteration'))
    if mode == 'rpn':
        updater.get_optimizer('main').target.rpn_train = True
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration',
            'main/RPN/rpn_loss',
            'main/RPN/rpn_loss_cls',
            'main/RPN/rpn_cls_accuracy',
            'main/RPN/rpn_loss_bbox',
            'elapsed_time',
            'lr',
        ]), trigger=(report_iter, 'iteration'))
        trainer.extend(extensions.ProgressBar(),
                       trigger=(report_iter, 'iteration'))
        trainer.extend(extensions.PlotReport(
            ['main/RPN/rpn_loss'],
            trigger=(report_iter, 'iteration')))
        trainer.extend(
            extensions.dump_graph('main/RPN/rpn_loss',
                                  out_name='rpn_loss.dot'))

        # Add snapshot extensions
        trainer.extend(
            extensions.snapshot(
                filename='rpn_trainer_snapshot_{.updater.iteration}'),
            trigger=(snapshot_iter, 'iteration'))
        trainer.extend(
            extensions.snapshot_object(
                model, 'rpn_model_snapshot_{.updater.iteration}'),
            trigger=(snapshot_iter, 'iteration'))
    elif mode == 'rcnn':
        updater.get_optimizer('main').target.rcnn_train = True
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration',
            'main/loss_cls',
            'main/cls_accuracy',
            'main/loss_bbox',
            'main/loss_rcnn',
            'elapsed_time',
            'lr',
        ]), trigger=(report_iter, 'iteration'))
        trainer.extend(extensions.ProgressBar(),
                       trigger=(report_iter, 'iteration'))
        trainer.extend(extensions.PlotReport(
            ['main/RPN/rpn_loss'],
            trigger=(report_iter, 'iteration')))
        trainer.extend(
            extensions.dump_graph('main/RPN/rpn_loss',
                                  out_name='rpn_loss.dot'))

        # Add snapshot extensions
        trainer.extend(
            extensions.snapshot(
                filename='rpn_trainer_snapshot_{.updater.iteration}'),
            trigger=(snapshot_iter, 'iteration'))
        trainer.extend(
            extensions.snapshot_object(
                model, 'rpn_model_snapshot_{.updater.iteration}'),
            trigger=(snapshot_iter, 'iteration'))

    trainer.run()
    del trainer
Example No. 16
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    dataset = Dataset(2048, args.boardnorm)
    iter_ = chainer.iterators.SerialIterator(dataset, args.batchsize)

    print(
        'chance rate: ',
        sum(dataset[i][1].mean() for i in range(len(dataset))) / len(dataset))

    updater = chainer.training.StandardUpdater(iter_,
                                               optimizer,
                                               device=args.gpu)
    trainer = chainer.training.Trainer(updater, (15000, 'iteration'),
                                       out=args.out)
    trainer.extend(extensions.snapshot_object(
        model.model, filename='model_iter_{.updater.iteration}'),
                   trigger=(15000, 'iteration'))

    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=0.1),
                   trigger=(10000, 'iteration'))
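    # ExponentialShift('lr', 0.1, init=0.1) sets lr to 0.1 at the start and
    # multiplies it by 0.1 whenever the trigger fires, so lr drops to 0.01 at
    # iteration 10000; training stops at 15000 before a second drop.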

    log_interval = (10, 'iteration')
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(
        extensions.PrintReport(
            ['iteration', 'lr', 'main/loss', 'main/accuracy']))
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
Exemplo n.º 17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    model = SSD300(n_fg_class=len(voc_detection_label_names),
                   pretrained_model='imagenet')
    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    train = TransformDataset(
        ConcatenatedDataset(VOCDetectionDataset(year='2007', split='trainval'),
                            VOCDetectionDataset(year='2012',
                                                split='trainval')),
        Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)

    test = VOCDetectionDataset(year='2007',
                               split='test',
                               use_difficult=True,
                               return_difficult=True)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
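    # Per-parameter hooks (the usual SSD recipe): bias parameters ('b') get
    # their gradients scaled by 2 and skip weight decay, while every other
    # parameter gets a weight decay of 5e-4.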
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (120000, 'iteration'), args.out)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=1e-3),
                   trigger=triggers.ManualScheduleTrigger([80000, 100000],
                                                          'iteration'))
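    # Standard SSD schedule: lr starts at 1e-3 and is multiplied by 0.1 at
    # iterations 80000 and 100000 out of 120000 in total.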

    trainer.extend(DetectionVOCEvaluator(
        test_iter,
        model,
        use_07_metric=True,
        label_names=voc_detection_label_names),
                   trigger=(10000, 'iteration'))

    log_interval = 10, 'iteration'
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
        'main/loss/conf', 'validation/main/map'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.extend(extensions.snapshot(), trigger=(10000, 'iteration'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=(120000, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
Exemplo n.º 18
def main():
    '''
    Main function (entry point).
    '''
    # argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate', '-l', type=float, default=0.001,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--iter_parallel', '-p', action='store_true', default=False,
                        help='load the dataset from disk with parallel iterators')
    parser.add_argument('--test', action='store_true', default=False,
                        help='test mode (use a small dataset)')
    parser.add_argument('--opt', '-o', type=str, choices=('adam', 'sgd'), default='adam')
    parser.add_argument('--fsize', '-f', type=int, default=5)
    parser.add_argument('--ch', '-c', type=int, default=4)
    parser.add_argument('--decay', '-d', type=str, default='exp', choices=('exp', 'lin'))
    parser.add_argument('--weight', '-w', type=float, default=1.0)
    args = parser.parse_args()

    # print parameters
    print("-=Learning Parameter=-")
    print("# Max Epochs: {}".format(args.epoch))
    print("# Batch Size: {}".format(args.batchsize))
    print("# Learning Rate: {}".format(args.learnrate))
    print("# Optimizer Method: {}".format(args.opt))
    print("# Filter Size: {}".format(args.fsize))
    print("# Channel Scale: {}".format(args.ch))
    print("# coef. decay : {}".format(args.decay))
    print("# contloss' weight : {}".format(args.weight))
    print('# Train Dataet: General 100')
    if args.iter_parallel:
        print("# Data Iters that loads in Parallel")
    print("\n")

    # save directory
    model_dir_name = 'CAEFINet_opt_{}_ch_{}_fsize_{}_decay_{}_weight_{}'.format(args.opt, args.ch, args.fsize, args.decay, args.weight)
    outdir = path.join(ROOT_PATH, 'results', 'FI', 'CAEFINet', model_dir_name)
    if not path.exists(outdir):
        os.makedirs(outdir)
    with open(path.join(outdir, 'arg_param.txt'), 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}:{}\n'.format(k, v))

    # load dataset
    if args.test:
        print('# loading test dataset (UCF101_minimam_test_size64_frame3_group2_max4_p) ...')
        train_dataset = 'UCF101_minimam_test_size64_frame3_group2_max4_p'
        test_dataset = 'UCF101_minimam_test_size64_frame3_group2_max4_p'
    else:
        print('# loading datasets (UCF101_train_size64_frame3_group10_max100_p, UCF101_test_size64_frame3_group25_max5_p) ...')
        train_dataset = 'UCF101_train_size64_frame3_group10_max100_p'
        test_dataset = 'UCF101_test_size64_frame3_group25_max5_p'

    if args.iter_parallel:
        train = ds.SequenceDataset(dataset=train_dataset)
        test = ds.SequenceDataset(dataset=test_dataset)
    else:
        train = ds.SequenceDatasetOnMem(dataset=train_dataset)
        test = ds.SequenceDatasetOnMem(dataset=test_dataset)

    # prepare model
    model = N.CAEFINet(vgg_path=path.join(ROOT_PATH, 'models', 'VGG16.npz'), f_size=args.fsize, n_ch=args.ch, size=64)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # setup optimizer
    if args.opt == 'adam':
        optimizer = chainer.optimizers.Adam(alpha=args.learnrate)
    elif args.opt == 'sgd':
        optimizer = chainer.optimizers.MomentumSGD(lr=args.learnrate, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    # setup iter
    if args.iter_parallel:
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=8)
        test_iter = chainer.iterators.MultiprocessIterator(
            test, args.batchsize, repeat=False, shuffle=False, n_processes=8)
    else:
        train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
        test_iter = chainer.iterators.SerialIterator(
            test, args.batchsize, repeat=False, shuffle=False)

    # setup trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu, loss_func=model.get_loss_func(weight=args.weight, coef_decay=args.decay))
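    # Note: get_loss_func presumably returns a combined objective (the reports
    # below track mse_loss, cont_loss and psnr) whose content term is weighted
    # by args.weight with the coefficient decay selected by args.decay; the
    # Evaluator below reuses the same function.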
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=outdir)

    # evaluate test data
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu, eval_func=model.get_loss_func(weight=args.weight, coef_decay=args.decay)))
    # dump loss graph
    trainer.extend(extensions.dump_graph('main/loss'))
    # lr shift
    if args.opt == 'sgd':
        trainer.extend(extensions.ExponentialShift("lr", 0.1), trigger=(50, 'epoch'))
    elif args.opt == 'adam':
        trainer.extend(extensions.ExponentialShift("alpha", 0.1), trigger=(50, 'epoch'))
    # save snapshot
    trainer.extend(extensions.snapshot(), trigger=(10, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_snapshot_{.updater.epoch}'), trigger=(10, 'epoch'))
    # log report
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))
    #  plot loss graph
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                            'epoch', file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                            'epoch', file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/cont_loss', 'validation/main/cont_loss'],
                            'epoch', file_name='cont_loss.png'))
    # plot acc graph
    trainer.extend(extensions.PlotReport(['main/psnr', 'validation/main/psnr'],
                            'epoch', file_name='PSNR.png'))
    # print info
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss','main/mse_loss', 'validation/main/mse_loss',
        'main/cont_loss', 'validation/main/cont_loss', 'main/psnr', 'validation/main/psnr', 'lr', 'elapsed_time']))
    # print progbar
    trainer.extend(extensions.ProgressBar())

    # [ChainerUI] enable to send commands from ChainerUI
    trainer.extend(CommandsExtension())
    # [ChainerUI] save 'args' to show experimental conditions
    save_args(args, outdir)

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    # save final model
    model_outdir = path.join(ROOT_PATH, 'models', model_dir_name)
    if not path.exists(model_outdir):
        os.makedirs(model_outdir)
    model_name = 'CAEFINet_{}_ch_{}_fsize_{}_decay_{}_weight_{}.npz'.format(args.opt, args.ch, args.fsize, args.decay, args.weight)
    chainer.serializers.save_npz(path.join(model_outdir, model_name), model)

    model_parameter = {
        'name': 'CAEFINetConcat',
        'parameter': {'f_size':args.fsize, 'ch':args.ch}
    }
    with open(path.join(model_outdir, 'model_parameter.json'), 'w') as f:
        json.dump(model_parameter, f)
Exemplo n.º 19
def main():
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnet50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(
            args.gpu).use()  # Make the GPU current
        model.to_gpu()

    # Load the datasets and mean file
    mean = np.load(args.mean)
    train = PreprocessedDataset(args.train, args.root, mean, model.insize)
    val = PreprocessedDataset(args.val, args.root, mean, model.insize, False)
    # These iterators load the images with subprocesses running in parallel to
    # the training/validation.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = (1 if args.test else 1000), 'iteration'
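    # In --test mode validation and logging fire every iteration as a quick
    # smoke test; otherwise validation runs every 100k and logging every 1k
    # iterations.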

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Exemplo n.º 20
def get_trainer(args):
    with open(args.config) as f:
        config = yaml.safe_load(f)

    # Set workspace size
    if 'max_workspace_size' in config:
        chainer.cuda.set_max_workspace_size(config['max_workspace_size'])

    # Prepare ChainerMN communicator
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    # Show the setup information
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        if args.gpu:
            print('Using GPUs - max workspace size:',
                  chainer.cuda.get_max_workspace_size())
        print('Using {} communicator'.format(args.communicator))

    # Output version info
    if comm.rank == 0:
        print('Chainer version: {}'.format(chainer.__version__))
        print('ChainerMN version: {}'.format(chainermn.__version__))
        print('cuda: {}, cudnn: {}'.format(chainer.cuda.available,
                                           chainer.cuda.cudnn_enabled))

    # Create result_dir
    if args.result_dir is not None:
        config['result_dir'] = args.result_dir
        model_fn = config['model']['module'].split('.')[-1]
        sys.path.insert(0, args.result_dir)
        config['model']['module'] = model_fn
    else:
        config['result_dir'] = create_result_dir_from_config_path(args.config)
    log_fn = save_config_get_log_fn(config['result_dir'], args.config)
    if comm.rank == 0:
        print('result_dir:', config['result_dir'])

    # Instantiate model
    model = get_model_from_config(config, comm)
    if args.gpu:
        chainer.cuda.get_device(device).use()
        model.to_gpu()
    if comm.rank == 0:
        print('model:', model.__class__.__name__)

    # Initialize optimizer
    optimizer = get_optimizer_from_config(model, config)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    if comm.rank == 0:
        print('optimizer:', optimizer.__class__.__name__)

    # Setting up datasets
    if comm.rank == 0:
        train_dataset, valid_dataset = get_dataset_from_config(config)
        print('train_dataset: {}'.format(len(train_dataset)),
              train_dataset.__class__.__name__)
        print('valid_dataset: {}'.format(len(valid_dataset)),
              valid_dataset.__class__.__name__)
    else:
        train_dataset, valid_dataset = [], []
    train_dataset = chainermn.scatter_dataset(train_dataset, comm)
    valid_dataset = chainermn.scatter_dataset(valid_dataset, comm)

    # Create iterators
    # multiprocessing.set_start_method('forkserver')
    train_iter, valid_iter = create_iterators(train_dataset, valid_dataset,
                                              config)
    if comm.rank == 0:
        print('train_iter:', train_iter.__class__.__name__)
        print('valid_iter:', valid_iter.__class__.__name__)

    # Create updater and trainer
    if 'updater_creator' in config:
        updater_creator = get_updater_creator_from_config(config)
        updater = updater_creator(train_iter, optimizer, device=device)
    else:
        updater = create_updater(train_iter, optimizer, device=device)
    if comm.rank == 0:
        print('updater:', updater.__class__.__name__)

    # Create Trainer
    trainer = training.Trainer(updater,
                               config['stop_trigger'],
                               out=config['result_dir'])
    if comm.rank == 0:
        print('Trainer stops:', config['stop_trigger'])

    # Trainer extensions
    for ext in config['trainer_extension']:
        ext, values = ext.popitem()
        if ext == 'LogReport' and comm.rank == 0:
            trigger = values['trigger']
            trainer.extend(
                extensions.LogReport(trigger=trigger, log_name=log_fn))
        elif ext == 'observe_lr' and comm.rank == 0:
            trainer.extend(extensions.observe_lr(), trigger=values['trigger'])
        elif ext == 'dump_graph' and comm.rank == 0:
            trainer.extend(extensions.dump_graph(**values))
        elif ext == 'Evaluator':
            assert 'module' in values
            mod = import_module(values['module'])
            evaluator = getattr(mod, values['name'])
            if evaluator is extensions.Evaluator:
                evaluator = evaluator(valid_iter, model, device=device)
            else:
                evaluator = evaluator(valid_iter, model.predictor)
            evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
            trainer.extend(evaluator,
                           trigger=values['trigger'],
                           name=values['prefix'])
        elif ext == 'PlotReport' and comm.rank == 0:
            trainer.extend(extensions.PlotReport(**values))
        elif ext == 'PrintReport' and comm.rank == 0:
            trigger = values.pop('trigger')
            trainer.extend(extensions.PrintReport(**values), trigger=trigger)
        elif ext == 'ProgressBar' and comm.rank == 0:
            upd_int = values['update_interval']
            trigger = values['trigger']
            trainer.extend(extensions.ProgressBar(update_interval=upd_int),
                           trigger=trigger)
        elif ext == 'snapshot' and comm.rank == 0:
            filename = values['filename']
            trigger = values['trigger']
            trainer.extend(extensions.snapshot(filename=filename),
                           trigger=trigger)

    # LR decay
    if 'lr_drop_ratio' in config['optimizer'] \
            and 'lr_drop_triggers' in config['optimizer']:
        ratio = config['optimizer']['lr_drop_ratio']
        points = config['optimizer']['lr_drop_triggers']['points']
        unit = config['optimizer']['lr_drop_triggers']['unit']
        drop_trigger = triggers.ManualScheduleTrigger(points, unit)

        def lr_drop(trainer):
            trainer.updater.get_optimizer('main').lr *= ratio

        trainer.extend(lr_drop, trigger=drop_trigger)
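        # The manual drop multiplies the current lr by lr_drop_ratio at each
        # scheduled point (roughly equivalent to an ExponentialShift driven by
        # a ManualScheduleTrigger).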

    if 'lr_drop_poly_power' in config['optimizer']:
        power = config['optimizer']['lr_drop_poly_power']
        stop_trigger = config['stop_trigger']
        batchsize = train_iter.batch_size
        len_dataset = len(train_dataset)
        trainer.extend(PolynomialShift('lr', power, stop_trigger, batchsize,
                                       len_dataset),
                       trigger=(1, 'iteration'))
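        # PolynomialShift is a custom extension; it presumably anneals the lr
        # as lr_0 * (1 - t / t_max) ** power, with t_max derived from the stop
        # trigger, batch size and dataset length.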

    # Resume
    if args.resume is not None:
        # fn = '{}.bak'.format(args.resume)
        # shutil.copy(args.resume, fn)
        serializers.load_npz(args.resume, trainer)
        if comm.rank == 0:
            print('Resumed from:', args.resume)

    if comm.rank == 0:
        print('==========================================')

    return trainer
Exemplo n.º 21
def main():
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--seed',
                        '-s',
                        type=int,
                        default=0,
                        help='seed for random values')
    parser.add_argument('--dataset',
                        '-d',
                        default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate',
                        '-l',
                        type=float,
                        default=0.1,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--aug_method',
                        '-a',
                        default='both',
                        choices=['none', 'mixup', 'random_erasing', 'both'],
                        help='data augmentation strategy')
    parser.add_argument('--model',
                        '-m',
                        default='pyramid',
                        choices=['resnet50', 'pyramid'],
                        help='model architecture')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print(args)
    print('')

    set_random_seed(args.seed)

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
        # for mean-teacher experiment
        #train = train[:-10000]
        #print(len(train))
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    if args.model == 'resnet50':
        predictor = ResNet(None)
        predictor.fc6 = L.Linear(2048, class_labels)
    elif args.model == 'pyramid':
        predictor = shaked_pyramid_net.PyramidNet(skip=True)

    # Note: this overlaps with the train-data transform branching further below.
    if args.aug_method in ('both', 'mixup'):
        lossfun = soft_label_classification_loss
        accfun = soft_label_classification_acc
    else:
        lossfun = F.softmax_cross_entropy
        accfun = F.accuracy

    model = L.Classifier(predictor, lossfun=lossfun, accfun=accfun)

    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # augment train data
    if args.aug_method == 'none':
        print('no data augmentation')
        train = dataset.SingleCifar10((train, None))
    elif args.aug_method in ('both', 'mixup'):
        use_random_erasing = args.aug_method == 'both'
        train = dataset.PairwiseCifar10((train, None))
        train = chainer.datasets.transform_dataset.TransformDataset(
            train,
            transformer.MixupTransform(use_random_erasing=use_random_erasing))
    elif args.aug_method == 'random_erasing':
        train = dataset.SingleCifar10((train, None))
        train = chainer.datasets.transform_dataset.TransformDataset(
            train, transformer.RandomErasingTransform())
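    # Mixup blends pairs of images and labels, hence the pairwise dataset and
    # the soft-label loss/accuracy chosen above; random erasing operates on
    # single samples, so the plain dataset and losses suffice.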

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)
    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    eval_trigger = (1, 'epoch')
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu),
                   trigger=eval_trigger)

    # Drop the learning rate by a factor of 10 at 50% and 75% of the total epochs.
    lr_drop_epoch = [int(args.epoch * 0.5), int(args.epoch * 0.75)]
    lr_drop_ratio = 0.1
    print(f'lr schedule: {lr_drop_ratio}, timing: {lr_drop_epoch}')

    def lr_drop(trainer):
        trainer.updater.get_optimizer('main').lr *= lr_drop_ratio

    trainer.extend(lr_drop,
                   trigger=chainer.training.triggers.ManualScheduleTrigger(
                       lr_drop_epoch, 'epoch'))
    trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Take a snapshot at each epoch
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'lr', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())
    # interact with chainerui
    trainer.extend(CommandsExtension(), trigger=(100, 'iteration'))
    # save args
    save_args(args, args.out)

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
Exemplo n.º 22
def main():

    args = parse_args()

    model = archs[args.arch]()
    ema_model = archs[args.arch]()

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(
            args.gpu).use()  # Make the GPU current
        model.to_gpu()
        ema_model.to_gpu()

    train, val = chainer.datasets.get_cifar10()
    _, test = chainer.datasets.get_cifar10(withlabel=False)

    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.batchsize, repeat=False, n_processes=args.loaderjob)
    ema_iter = chainer.iterators.MultiprocessIterator(
        test, args.batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    if args.consistency_type == 'mse':
        consistency_lossfun = softmax_mse_loss
    elif args.consistency_type == 'kl':
        consistency_lossfun = softmax_kl_loss
    updater = MeanTeacherUpdater(train_iter,
                                 ema_iter,
                                 optimizer,
                                 ema_model,
                                 ema_decay=args.ema_decay,
                                 distance_cost=args.distance_cost,
                                 consistency=args.consistency,
                                 consistency_lossfun=consistency_lossfun,
                                 device=args.gpu)
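    # MeanTeacherUpdater (defined elsewhere) presumably maintains ema_model as
    # an exponential moving average of the student's weights (ema_decay) and
    # adds a consistency loss between student and teacher predictions,
    # weighted via the consistency / distance_cost arguments.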
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = (1 if args.test else 1000), 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Exemplo n.º 23
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-g', '--gpu', default=0, type=int, help='GPU id')
    parser.add_argument('-d',
                        '--dataset',
                        type=str,
                        required=True,
                        help='Dataset class name')
    parser.add_argument('-m',
                        '--model',
                        type=str,
                        required=True,
                        help='Model class name')
    parser.add_argument('-b',
                        '--batch_size',
                        type=int,
                        required=True,
                        help='Batch size')
    args = parser.parse_args()

    gpu = args.gpu

    # 0. config

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    out = timestamp
    out = osp.join(osp.dirname(here), 'logs', out)

    max_iter_epoch = 100, 'epoch'
    progress_bar_update_interval = 10  # iteration
    print_interval = 100, 'iteration'
    log_interval = 100, 'iteration'
    test_interval = 5, 'epoch'
    save_interval = 5, 'epoch'

    # 1. dataset

    if args.dataset == 'Mirror3DAnnotatedDataset':
        dataset_train = Mirror3DAnnotatedDataset(split='train', aug=True)
        dataset_valid = Mirror3DAnnotatedDataset(split='test', aug=False)
    else:
        print('Invalid dataset class.')
        exit(1)

    dataset_train_transformed = TransformDataset(dataset_train, transform)
    dataset_valid_transformed = TransformDataset(dataset_valid, transform)

    iter_train = chainer.iterators.MultiprocessIterator(
        dataset_train_transformed,
        batch_size=args.batch_size,
        shared_mem=10**8)
    iter_valid = chainer.iterators.MultiprocessIterator(
        dataset_valid_transformed,
        batch_size=1,
        shared_mem=10**8,
        repeat=False,
        shuffle=False)

    # 2. model

    vgg = fcn.models.VGG16()
    vgg_path = vgg.download()
    chainer.serializers.load_npz(vgg_path, vgg)

    n_class = len(dataset_train.class_names)
    assert n_class == 2

    if args.model == 'FCN8sMirrorSegmentationDepthEstimation':
        model = FCN8sMirrorSegmentationDepthEstimation(n_class=n_class)
    else:
        print('Invalid model class.')
        exit(1)

    model.init_from_vgg16(vgg)

    if gpu >= 0:
        cuda.get_device_from_id(gpu).use()
        model.to_gpu()

    # 3. optimizer

    optimizer = chainer.optimizers.Adam(alpha=1.0e-5)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))

    updater = chainer.training.updater.StandardUpdater(iter_train,
                                                       optimizer,
                                                       device=gpu)

    trainer = chainer.training.Trainer(updater, max_iter_epoch, out=out)

    trainer.extend(extensions.ExponentialShift("alpha", 0.99995))

    if not osp.isdir(out):
        os.makedirs(out)

    with open(osp.join(out, 'dataset.txt'), 'w') as f:
        f.write(dataset_train.__class__.__name__)

    with open(osp.join(out, 'model.txt'), 'w') as f:
        f.write(model.__class__.__name__)

    with open(osp.join(out, 'n_class.txt'), 'w') as f:
        f.write(str(n_class))

    with open(osp.join(out, 'batch_size.txt'), 'w') as f:
        f.write(str(args.batch_size))

    trainer.extend(extensions.snapshot_object(
        model, savefun=chainer.serializers.save_npz, filename='max_miou.npz'),
                   trigger=chainer.training.triggers.MaxValueTrigger(
                       'validation/main/miou', save_interval))
    trainer.extend(extensions.snapshot_object(
        model,
        savefun=chainer.serializers.save_npz,
        filename='max_depth_acc.npz'),
                   trigger=chainer.training.triggers.MaxValueTrigger(
                       'validation/main/depth_acc<0.10', save_interval))

    trainer.extend(
        extensions.dump_graph(root_name='main/loss', out_name='graph.dot'))

    trainer.extend(
        extensions.LogReport(log_name='log.json', trigger=log_interval))

    trainer.extend(chainer.training.extensions.PrintReport([
        'iteration',
        'epoch',
        'elapsed_time',
        'lr',
        'main/loss',
        'main/seg_loss',
        'main/reg_loss',
        'main/miou',
        'main/depth_acc<0.03',
        'main/depth_acc<0.10',
        'main/depth_acc<0.30',
        'validation/main/miou',
        'validation/main/depth_acc<0.03',
        'validation/main/depth_acc<0.10',
        'validation/main/depth_acc<0.30',
    ]),
                   trigger=print_interval)

    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(
        extensions.ProgressBar(update_interval=progress_bar_update_interval))
    trainer.extend(extensions.Evaluator(iter_valid, model, device=gpu),
                   trigger=test_interval)

    trainer.run()
Exemplo n.º 24
def main():
    parser = argparse.ArgumentParser(description='ColumnNet')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=200,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train
    Model = ColumnNet()
    model = L.Classifier(Model)
   
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    
    # Load the ColumnNet dataset
    f = open('train_list.txt')
    train_lines = f.readlines()
    f.close()

    f = open('val_list.txt')
    val_lines = f.readlines()
    f.close()

    #dataset = LabeledImageDataset(list(zip(fnames, labels)))
    #transform_dataset = TransformDataset(dataset, transform)

    #train, val = datasets.split_dataset_random(transform_dataset, int(len(dataset) * 0.8), seed=0)

    train = load_dataset(train_lines)
    val = load_dataset(val_lines)

    train_iter = iterators.MultiprocessIterator(train, args.batchsize)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, shuffle=False)


    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'


    # Set up the updater and trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out='result')
 
    # Trainer extensions
    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.snapshot(), trigger=(1, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'validation/main/map', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.PlotReport(['main/loss', 'validation/main/map'], x_key='epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(['main/accuracy', 'validation/main/map'], x_key='epoch', file_name='accuracy.png'))
    trainer.extend(extensions.dump_graph('main/loss'))
    # Run the training
    trainer.run()
    chainer.serializers.save_npz('result/columnnet.model', Model)
Exemplo n.º 25
def main():
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--gpus',
                        '-g',
                        type=int,
                        nargs="*",
                        default=[0, 1, 2, 3])
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)

    # Load the datasets and mean file
    mean = np.load(args.mean)
    train = train_imagenet.PreprocessedDataset(args.train, args.root, mean,
                                               model.insize)
    val = train_imagenet.PreprocessedDataset(args.val, args.root, mean,
                                             model.insize, False)
    # These iterators load the images with subprocesses running in parallel to
    # the training/validation.
    devices = tuple(args.gpus)

    train_iters = [
        chainer.iterators.MultiprocessIterator(i,
                                               args.batchsize,
                                               n_processes=args.loaderjob)
        for i in chainer.datasets.split_dataset_n_random(train, len(devices))
    ]
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = updaters.MultiprocessParallelUpdater(train_iters,
                                                   optimizer,
                                                   devices=devices)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpus[0]),
                   trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=2))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Exemplo n.º 26
def main():
    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    dtypes = {
        'float16': np.float16,
        'float32': np.float32,
        'float64': np.float64,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--dtype',
                        choices=dtypes,
                        help='Specify the dtype '
                        'used. If not supplied, the default dtype is used')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--device',
                        '-d',
                        type=str,
                        default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu',
                       '-g',
                       dest='device',
                       type=int,
                       nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = chainer.get_device(args.device)

    # Set the dtype if supplied.
    if args.dtype is not None:
        chainer.config.dtype = args.dtype

    print('Device: {}'.format(device))
    print('Dtype: {}'.format(chainer.config.dtype))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)
    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI seems not available on your system.')
        if device.xp is not chainer.backend.cuda.cupy:
            raise RuntimeError('Using DALI requires GPU device. Please '
                               'specify it with --device option.')
        n_threads = args.loaderjob
        if n_threads is None or n_threads <= 0:
            n_threads = 1
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
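        # ch_mean is the per-channel average of the mean image; with a std of
        # 255 the DALI pipeline presumably normalizes pixels as
        # (x - ch_mean) / 255.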
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(args.train,
                                                 args.root,
                                                 model.insize,
                                                 args.batchsize,
                                                 n_threads,
                                                 device.device.id,
                                                 True,
                                                 mean=ch_mean,
                                                 std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(args.val,
                                             args.root,
                                             model.insize,
                                             args.val_batchsize,
                                             n_threads,
                                             device.device.id,
                                             False,
                                             mean=ch_mean,
                                             std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        # converter = dali_converter
        converter = dali_util.DaliConverter(mean=mean, crop_size=model.insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
        # These iterators load the images with subprocesses running in parallel
        # to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=converter,
                                                device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    val_interval = (100000, 'iteration')
    log_interval = (1000, 'iteration')
    if args.test:
        val_interval = (1, 'iteration')
        log_interval = (1, 'iteration')

    trainer.extend(extensions.Evaluator(val_iter,
                                        model,
                                        converter=converter,
                                        device=device),
                   trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Exemplo n.º 27
def main(args):

    assert (args.depth - args.block - 1) % args.block == 0
    n_layer = (args.depth - args.block - 1) // args.block
    if args.dataset == 'cifar10':
        train, test = cifar.get_cifar10()
        n_class = 10
    elif args.dataset == 'cifar100':
        train, test = cifar.get_cifar100()
        n_class = 100
    elif args.dataset == 'SVHN':
        raise NotImplementedError()

    mean = numpy.zeros((3, 32, 32), dtype=numpy.float32)
    for image, _ in train:
        mean += image / len(train)

    train = PreprocessedDataset(train, mean, random=True)
    test = PreprocessedDataset(test, mean)

    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)
    test_iter = chainer.iterators.MultiprocessIterator(test,
                                                       args.batchsize,
                                                       repeat=False,
                                                       shuffle=False)

    model = chainer.links.Classifier(
        DenseNet(n_layer, args.growth_rate, n_class, args.drop_ratio, 16,
                 args.block))
    if args.init_model:
        serializers.load_npz(args.init_model, model)

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr / len(args.gpus),
                                               momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    devices = {'main': args.gpus[0]}
    if len(args.gpus) > 1:  # register every additional GPU for ParallelUpdater
        for gid in args.gpus[1:]:
            devices['gpu%d' % gid] = gid
    updater = training.ParallelUpdater(train_iter, optimizer, devices=devices)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.dir)

    val_interval = (1, 'epoch')
    log_interval = (1, 'epoch')

    eval_model = model.copy()
    eval_model.train = False

    trainer.extend(extensions.Evaluator(test_iter,
                                        eval_model,
                                        device=args.gpus[0]),
                   trigger=val_interval)
    trainer.extend(extensions.ExponentialShift('lr', args.lr_decay_ratio),
                   trigger=(args.lr_decay_freq, 'epoch'))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot_object(model,
                                              'epoch_{.updater.epoch}.model'),
                   trigger=val_interval)
    trainer.extend(extensions.snapshot_object(optimizer,
                                              'epoch_{.updater.epoch}.state'),
                   trigger=val_interval)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    start_time = time.time()
    trainer.extend(extensions.observe_value(
        'time', lambda _: time.time() - start_time),
                   trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'time',
        'epoch',
        'iteration',
        'main/loss',
        'validation/main/loss',
        'main/accuracy',
        'validation/main/accuracy',
        'lr',
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.observe_value('graph',
                                            lambda _: create_fig(args.dir)),
                   trigger=(2, 'epoch'))
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
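
A minimal sketch (with a hypothetical `gpus` list standing in for args.gpus) of the mapping ParallelUpdater expects: a mandatory 'main' key for the primary device plus arbitrarily named keys for every remaining GPU.

gpus = [0, 1]                     # e.g. --gpus 0 1
devices = {'main': gpus[0]}       # primary GPU; the 'main' key is required
for gid in gpus[1:]:
    devices['gpu%d' % gid] = gid  # one named entry per additional GPU
# devices == {'main': 0, 'gpu1': 1}
# training.ParallelUpdater(train_iter, optimizer, devices=devices)
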
Example No. 28
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=12)
    parser.add_argument('--class-weight', type=str, default='class_weight.npy')
    parser.add_argument('--out', type=str, default='result')
    args = parser.parse_args()

    # Triggers
    log_trigger = (50, 'iteration')
    validation_trigger = (2000, 'iteration')
    end_trigger = (16000, 'iteration')

    # Dataset
    train = CamVidDataset(split='train')
    train = TransformDataset(train, transform)
    val = CamVidDataset(split='val')

    # Iterator
    train_iter = iterators.MultiprocessIterator(train, args.batchsize)
    val_iter = iterators.MultiprocessIterator(
        val, args.batchsize, shuffle=False, repeat=False)

    # Model
    class_weight = np.load(args.class_weight)
    model = SegNetBasic(n_class=len(camvid_label_names))
    model = PixelwiseSoftmaxClassifier(
        model, class_weight=class_weight)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Optimizer
    optimizer = optimizers.MomentumSGD(lr=0.1, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0005))

    # Updater
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=args.gpu)

    # Trainer
    trainer = training.Trainer(updater, end_trigger, out=args.out)

    trainer.extend(extensions.LogReport(trigger=log_trigger))
    trainer.extend(extensions.observe_lr(), trigger=log_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))

    if extensions.PlotReport.available():
        trainer.extend(extensions.PlotReport(
            ['main/loss'], x_key='iteration',
            file_name='loss.png'))
        trainer.extend(extensions.PlotReport(
            ['validation/main/miou'], x_key='iteration',
            file_name='miou.png'))

    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'elapsed_time', 'lr',
         'main/loss', 'validation/main/miou',
         'validation/main/mean_class_accuracy',
         'validation/main/pixel_accuracy']),
        trigger=log_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.extend(
        SemanticSegmentationEvaluator(
            val_iter, model.predictor,
            camvid_label_names),
        trigger=validation_trigger)

    trainer.run()

    chainer.serializers.save_npz(
        os.path.join(args.out, 'snapshot_model.npz'),
        recalculate_bn_statistics(model.predictor, 24))
Example No. 29
def run_training(config: str, device: int, seed: int):
    configs = ConfigParser.parse(config)
    params = yaml.safe_load(open(config, encoding="utf-8"))

    if device >= 0:
        cuda.get_device(device).use()

    set_seed(seed, device)

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = max(vocab.dictionaries["word2idx"].values()) + 1
    num_char_vocab = max(vocab.dictionaries["char2idx"].values()) + 1
    num_tag_vocab = max(vocab.dictionaries["tag2idx"].values()) + 1

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform

    external_configs = configs["external"]
    if "word_vector" in external_configs:
        syn0 = model.embed_word.W.data
        _, word_dim = syn0.shape
        pre_word_dim = vocab.gensim_model.vector_size
        if word_dim != pre_word_dim:
            msg = "Mismatch vector size between model and pre-trained word vectors"  # NOQA
            msg += f"(model: \x1b[31m{word_dim}\x1b[0m"
            msg += f", pre-trained word vector: \x1b[31m{pre_word_dim}\x1b[0m"
            raise Exception(msg)

        word2idx = vocab.dictionaries["word2idx"]
        syn0 = prepare_pretrained_word_vector(word2idx, vocab.gensim_model,
                                              syn0, num_word_vocab)
        model.set_pretrained_word_vectors(syn0)

    train_iterator = create_iterator(vocab, configs, "train", transform)
    valid_iterator = create_iterator(vocab, configs, "valid", transform)
    test_iterator = create_iterator(vocab, configs, "test", transform)

    if device >= 0:
        model.to_gpu(device)

    optimizer = create_optimizer(configs)
    optimizer.setup(model)
    optimizer = add_hooks(optimizer, configs)

    updater = T.StandardUpdater(train_iterator,
                                optimizer,
                                converter=converter,
                                device=device)

    params = configs.export()
    params["num_word_vocab"] = num_word_vocab
    params["num_char_vocab"] = num_char_vocab
    params["num_tag_vocab"] = num_tag_vocab

    epoch = configs["iteration"]["epoch"]
    trigger = (epoch, "epoch")

    model_path = configs["output"]
    timestamp = datetime.datetime.now()
    timestamp_str = timestamp.isoformat()
    output_path = Path(f"{model_path}.{timestamp_str}")

    trainer = T.Trainer(updater, trigger, out=output_path)
    save_args(params, output_path)
    msg = f"Create \x1b[31m{output_path}\x1b[0m for saving model snapshots"
    logging.debug(msg)

    entries = ["epoch", "iteration", "elapsed_time", "lr", "main/loss"]
    entries += ["validation/main/loss", "validation/main/fscore"]
    entries += ["validation_1/main/loss", "validation_1/main/fscore"]

    valid_evaluator = NamedEntityEvaluator(valid_iterator,
                                           model,
                                           transformer.itransform,
                                           converter,
                                           device=device)

    test_evaluator = NamedEntityEvaluator(test_iterator,
                                          model,
                                          transformer.itransform,
                                          converter,
                                          device=device)

    epoch_trigger = (1, "epoch")
    snapshot_filename = "snapshot_epoch_{.updater.epoch:04d}"
    trainer.extend(valid_evaluator, trigger=epoch_trigger)
    trainer.extend(test_evaluator, trigger=epoch_trigger)
    trainer.extend(E.observe_lr(), trigger=epoch_trigger)
    trainer.extend(E.LogReport(trigger=epoch_trigger))
    trainer.extend(E.PrintReport(entries=entries), trigger=epoch_trigger)
    trainer.extend(E.ProgressBar(update_interval=20))
    trainer.extend(E.snapshot_object(model, filename=snapshot_filename),
                   trigger=(1, "epoch"))

    if "learning_rate_decay" in params:
        logger.debug("Enable Learning Rate decay")
        trainer.extend(
            LearningRateDecay("lr", params["learning_rate"],
                              params["learning_rate_decay"]),
            trigger=epoch_trigger,
        )

    trainer.run()
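
The 'validation_1/...' keys in `entries` appear because both NamedEntityEvaluator extensions are registered under the Evaluator default name 'validation', and the trainer suffixes the duplicate with '_1'. A sketch of an alternative (assuming NamedEntityEvaluator follows the standard Evaluator naming) that names the extensions explicitly so the report keys are self-describing:

trainer.extend(valid_evaluator, name='validation', trigger=epoch_trigger)
trainer.extend(test_evaluator, name='test', trigger=epoch_trigger)
# ...and report e.g. 'test/main/loss' and 'test/main/fscore' in PrintReport.
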
Example No. 30
def main():
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('train')
    parser.add_argument('val')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin')
    parser.add_argument('--batchsize', '-B', type=int, default=32)
    parser.add_argument('--epoch', '-E', type=int, default=2)
    parser.add_argument('--initmodel')
    parser.add_argument('--loaderjob', '-j', type=int)
    parser.add_argument('--mean', '-m', default='mean.npy')
    parser.add_argument('--resume', '-r', default='')
    parser.add_argument('--out', '-o', default='result')
    parser.add_argument('--train-root', default='.')
    parser.add_argument('--val-root', default='.')
    parser.add_argument('--val-batchsize', '-b', type=int, default=250)
    parser.add_argument('--communicator', default='hierarchical')
    parser.add_argument('--loadtype', default='original')
    parser.add_argument('--iterator', default='process')
    parser.add_argument('--optimizer', default='rmsprop_warmup')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--cov_ema_decay', type=float, default=0.99)
    parser.add_argument('--inv_freq', type=int, default=20)
    parser.add_argument('--damping', type=float, default=0.001)
    parser.add_argument('--cov-batchsize', type=int, default=16)
    parser.add_argument('--n-cov-workers', type=int, default=1)
    parser.add_argument('--n-inv-workers', type=int, default=1)
    parser.add_argument('--join-cov', action='store_true')
    parser.add_argument('--npergroup', type=int, default=1)
    parser.add_argument('--weight-decay', type=float, default=0.00022)
    parser.set_defaults(test=False)
    args = parser.parse_args()

    comm = dlframeworks.chainer.communicators.create_communicator(debug=True)
    device = comm.intra_rank  # the GPU id corresponds to the intra-node rank
    chainer.cuda.get_device_from_id(device).use()
    model = archs[args.arch]()

    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    # Initialize weights
    x = np.zeros((1, 3, model.insize, model.insize), dtype=np.float32)
    t = np.zeros((1, ), dtype=np.int32)
    model(x, t)

    try:
        model.to_gpu()
    except chainer.cuda.cupy.cuda.runtime.CUDARuntimeError as e:
        print('Error occurred in {}'.format(comm.rank), file=sys.stderr)
        raise e

    if comm.mpi_comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.mpi_comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    comm.mpi_comm.Barrier()

    # ======== Create optimizer ========
    optimizer = dlframeworks.chainer.optimizers.KFAC(
        comm,
        lr=args.lr,
        momentum=args.momentum,
        cov_ema_decay=args.cov_ema_decay,
        inv_freq=args.inv_freq,
        damping=args.damping,
    )
    # damping ~ 0.035 is good
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    batchsize = args.batchsize
    # Load all dataset in memory
    dataset_class = dlframeworks.chainer.datasets.CroppingDatasetIO
    mean = np.load(args.mean)
    # ======== Create dataset ========
    if comm.rank == 0:
        train = dlframeworks.chainer.datasets.read_pairs(args.train)
        val = dlframeworks.chainer.datasets.read_pairs(args.val)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)
    train_dataset = dataset_class(train, args.train_root, mean, model.insize,
                                  model.insize)
    val_dataset = dataset_class(val, args.val_root, mean, model.insize,
                                model.insize)

    # ======== Create iterator ========
    if args.iterator == 'process':
        multiprocessing.set_start_method('forkserver')
        train_iterator = chainer.iterators.MultiprocessIterator(
            train_dataset, batchsize, n_processes=args.loaderjob)
        val_iterator = chainer.iterators.MultiprocessIterator(
            val_dataset,
            args.val_batchsize,
            n_processes=args.loaderjob,
            repeat=False)
    elif args.iterator == 'thread':
        train_iterator = chainer.iterators.MultithreadIterator(
            train_dataset, batchsize, n_threads=args.loaderjob)
        val_iterator = chainer.iterators.MultithreadIterator(
            val_dataset,
            args.val_batchsize,
            n_threads=args.loaderjob,
            repeat=False)
    else:
        train_iterator = chainer.iterators.SerialIterator(
            train_dataset, batchsize)
        val_iterator = chainer.iterators.SerialIterator(val_dataset,
                                                        args.val_batchsize,
                                                        repeat=False,
                                                        shuffle=False)

    # ======== Create updater ========
    updater = training.StandardUpdater(train_iterator,
                                       optimizer,
                                       device=device)

    # ======== Create trainer ========
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # ======== Extend trainer ========
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(observe_hyperparam('momentum'), trigger=log_interval)
        trainer.extend(observe_hyperparam('cov_ema_decay'),
                       trigger=log_interval)
        trainer.extend(observe_hyperparam('inv_freq'), trigger=log_interval)
        trainer.extend(observe_hyperparam('damping'), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
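
`observe_hyperparam` is not defined in this snippet. A minimal sketch of how such a helper could be written, modelled on extensions.observe_lr and assuming the optimizer exposes each hyperparameter as a plain attribute (as Chainer's built-in gradient-method optimizers do):

from chainer.training import extensions


def observe_hyperparam(name, optimizer_name='main'):
    """Return an extension recording optimizer.<name> under the key <name>."""
    return extensions.observe_value(
        name,
        lambda trainer: getattr(
            trainer.updater.get_optimizer(optimizer_name), name))
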
Example No. 31
trainer = training.Trainer(updater,
                           stop_trigger=(80, 'epoch'),
                           out='cifar10_result')

trainer.extend(
    extensions.LogReport(keys=["main/loss", "validation/main/accuracy", "lr"],
                         trigger=training.triggers.IntervalTrigger(
                             100, 'iteration')))
# Log training statistics once every 100 iterations.

trainer.extend(extensions.ExponentialShift('alpha', 1 / 3),
               trigger=training.triggers.IntervalTrigger(20, 'epoch'))
# Reduce the learning rate to 1/3 once every 20 epochs.
# Since Adam's learning rate is called "alpha" in the original paper,
# Chainer's Adam exposes an "alpha" attribute instead of "lr".

trainer.extend(extensions.Evaluator(test_iter, model, device=device_id),
               trigger=training.triggers.IntervalTrigger(3, 'epoch'))
# run evaluation once every 3 epochs
trainer.extend(extensions.observe_lr(),
               trigger=training.triggers.IntervalTrigger(
                   100, 'iteration'))  # log the learning rate

trainer.extend(extensions.PrintReport([
    'epoch', 'main/loss', 'main/accuracy', 'validation/main/accuracy',
    'elapsed_time', 'lr'
]),
               trigger=training.triggers.IntervalTrigger(
                   100, 'iteration'))  # print statistics once every 100 iterations
trainer.run()  # Train the model
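
A quick illustration of the attribute naming mentioned in the comments above; ExponentialShift must be given whatever attribute the optimizer actually uses for its learning rate.

import chainer

sgd = chainer.optimizers.MomentumSGD(lr=0.01)
adam = chainer.optimizers.Adam(alpha=0.001)

print(sgd.lr)      # 0.01  -> ExponentialShift('lr', ...) works for MomentumSGD
print(adam.alpha)  # 0.001 -> for Adam, shift 'alpha' rather than 'lr'
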
Example No. 32
def handler(context):
    # Triggers
    log_trigger = (50, 'iteration')
    validation_trigger = (2000, 'iteration')
    end_trigger = (nb_iterations, 'iteration')

    # Dataset
    dataset_alias = context.datasets
    train_dataset_id = dataset_alias['train']
    val_dataset_id = dataset_alias['val']
    train = SegmentationDatasetFromAPI(train_dataset_id)
    val = SegmentationDatasetFromAPI(val_dataset_id)
    class_weight = calc_weight(train)

    print(class_weight)

    train = TransformDataset(train, transform)

    # Iterator
    train_iter = iterators.SerialIterator(train, BATCHSIZE)
    val_iter = iterators.SerialIterator(val,
                                        BATCHSIZE,
                                        shuffle=False,
                                        repeat=False)

    # Model
    model = SegNetBasic(n_class=len(camvid_label_names))
    model = PixelwiseSoftmaxClassifier(model, class_weight=class_weight)

    if USE_GPU >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(USE_GPU).use()
        model.to_gpu()  # Copy the model to the GPU

    # Optimizer
    optimizer = optimizers.MomentumSGD(lr=0.1, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0005))

    # Updater
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=USE_GPU)

    # Trainer
    trainer = training.Trainer(updater,
                               end_trigger,
                               out=ABEJA_TRAINING_RESULT_DIR)

    trainer.extend(extensions.LogReport(trigger=log_trigger))
    trainer.extend(extensions.observe_lr(), trigger=log_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))

    trainer.extend(extensions.snapshot_object(
        model.predictor, filename='model_iteration-{.updater.iteration}'),
                   trigger=end_trigger)

    print_entries = [
        'iteration', 'main/loss', 'validation/main/miou',
        'validation/main/mean_class_accuracy', 'validation/main/pixel_accuracy'
    ]

    report_entries = [
        'epoch', 'iteration', 'lr', 'main/loss', 'validation/main/miou',
        'validation/main/mean_class_accuracy', 'validation/main/pixel_accuracy'
    ]

    trainer.extend(Statistics(report_entries,
                              nb_iterations,
                              obs_key='iteration'),
                   trigger=log_trigger)
    trainer.extend(Tensorboard(report_entries, out_dir=log_path))
    trainer.extend(extensions.PrintReport(print_entries), trigger=log_trigger)

    trainer.extend(SemanticSegmentationEvaluator(val_iter, model.predictor,
                                                 camvid_label_names),
                   trigger=validation_trigger)

    trainer.run()
Example No. 33
def train_one_epoch(model, train_data, lr, gpu, batchsize, out):
    train_model = PixelwiseSoftmaxClassifier(model)
    if gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(gpu).use()
        train_model.to_gpu()  # Copy the model to the GPU
    log_trigger = (0.1, 'epoch')
    validation_trigger = (1, 'epoch')
    end_trigger = (1, 'epoch')

    train_data = TransformDataset(train_data, ('img', 'label_map'),
                                  SimpleDoesItTransform(model.mean))
    val = VOCSemanticSegmentationWithBboxDataset(
        split='val').slice[:, ['img', 'label_map']]

    # Iterator
    train_iter = iterators.MultiprocessIterator(train_data, batchsize)
    val_iter = iterators.MultiprocessIterator(val,
                                              1,
                                              shuffle=False,
                                              repeat=False,
                                              shared_mem=100000000)

    # Optimizer
    optimizer = optimizers.MomentumSGD(lr=lr, momentum=0.9)
    optimizer.setup(train_model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0001))

    # Updater
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=gpu)

    # Trainer
    trainer = training.Trainer(updater, end_trigger, out=out)

    trainer.extend(extensions.LogReport(trigger=log_trigger))
    trainer.extend(extensions.observe_lr(), trigger=log_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))

    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss'],
                                  x_key='iteration',
                                  file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(['validation/main/miou'],
                                  x_key='iteration',
                                  file_name='miou.png'))

    trainer.extend(extensions.snapshot_object(model, filename='snapshot.npy'),
                   trigger=end_trigger)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'elapsed_time', 'lr', 'main/loss',
        'validation/main/miou', 'validation/main/mean_class_accuracy',
        'validation/main/pixel_accuracy'
    ]),
                   trigger=log_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.extend(SemanticSegmentationEvaluator(
        val_iter, model, voc_semantic_segmentation_label_names),
                   trigger=validation_trigger)
    trainer.run()
Example No. 34
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', help="path to train json file")
    parser.add_argument('test_dataset', help="path to test dataset json file")
    parser.add_argument(
        '--dataset-root',
        help=
        "path to dataset root if dataset file is not already in root folder of dataset"
    )
    parser.add_argument('--model',
                        choices=('ssd300', 'ssd512'),
                        default='ssd512')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--gpu', type=int, nargs='*', default=[])
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help="default learning rate")
    parser.add_argument('--port',
                        type=int,
                        default=1337,
                        help="port for bbox sending")
    parser.add_argument('--ip',
                        default='127.0.0.1',
                        help="destination ip for bbox sending")
    parser.add_argument(
        '--test-image',
        help="path to test image that shall be displayed in bbox vis")
    args = parser.parse_args()

    if args.dataset_root is None:
        args.dataset_root = os.path.dirname(args.dataset)

    if args.model == 'ssd300':
        model = SSD300(n_fg_class=1, pretrained_model='imagenet')
        image_size = (300, 300)
    elif args.model == 'ssd512':
        model = SSD512(n_fg_class=1, pretrained_model='imagenet')
        image_size = (512, 512)
    else:
        raise NotImplementedError("The model you want to train does not exist")

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)

    train = TransformDataset(
        SheepDataset(args.dataset_root, args.dataset, image_size=image_size),
        Transform(model.coder, model.insize, model.mean))

    if len(args.gpu) > 1:
        gpu_datasets = split_dataset_n_random(train, len(args.gpu))
        if not len(gpu_datasets[0]) == len(gpu_datasets[-1]):
            adapted_second_split = split_dataset(gpu_datasets[-1],
                                                 len(gpu_datasets[0]))[0]
            gpu_datasets[-1] = adapted_second_split
    else:
        gpu_datasets = [train]

    train_iter = [
        ThreadIterator(gpu_dataset, args.batchsize)
        for gpu_dataset in gpu_datasets
    ]

    test = SheepDataset(args.dataset_root,
                        args.test_dataset,
                        image_size=image_size)
    test_iter = chainer.iterators.MultithreadIterator(test,
                                                      args.batchsize,
                                                      repeat=False,
                                                      shuffle=False,
                                                      n_threads=2)

    # Adam is used here with the initial learning rate given by --lr (1e-3 by default)
    optimizer = chainer.optimizers.Adam(alpha=args.lr)
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    if len(args.gpu) <= 1:
        updater = training.updaters.StandardUpdater(
            train_iter[0],
            optimizer,
            device=args.gpu[0] if len(args.gpu) > 0 else -1,
        )
    else:
        updater = training.updaters.MultiprocessParallelUpdater(
            train_iter, optimizer, devices=args.gpu)
        updater.setup_workers()

    if len(args.gpu) > 0 and args.gpu[0] >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu[0]).use()
        model.to_gpu()

    trainer = training.Trainer(updater, (200, 'epoch'), args.out)

    trainer.extend(DetectionVOCEvaluator(test_iter,
                                         model,
                                         use_07_metric=True,
                                         label_names=voc_bbox_label_names),
                   trigger=(1000, 'iteration'))

    # build logger
    # make sure to log all data necessary for prediction
    log_interval = 100, 'iteration'
    data_to_log = {
        'image_size': image_size,
        'model_type': args.model,
    }

    # add all command line arguments
    for argument in filter(lambda x: not x.startswith('_'), dir(args)):
        data_to_log[argument] = getattr(args, argument)

    # create callback that logs all auxiliary data the first time things get logged
    def backup_train_config(stats_cpu):
        if stats_cpu['iteration'] == log_interval[0]:
            stats_cpu.update(data_to_log)

    trainer.extend(
        extensions.LogReport(trigger=log_interval,
                             postprocess=backup_train_config))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
        'main/loss/conf', 'validation/main/map'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=(5000, 'iteration'))

    if args.test_image is not None:
        plot_image = train._dataset.load_image(args.test_image,
                                               resize_to=image_size)
    else:
        plot_image, _, _ = train.get_example(0)
        plot_image += train._transform.mean

    bbox_plotter = BBOXPlotter(
        plot_image,
        os.path.join(args.out, 'bboxes'),
        send_bboxes=True,
        upstream_port=args.port,
        upstream_ip=args.ip,
    )
    trainer.extend(bbox_plotter, trigger=(10, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
Example No. 35
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', type=int, default=1)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    comm = chainermn.create_communicator()
    device = comm.intra_rank

    faster_rcnn = FasterRCNNVGG16(
        n_fg_class=len(epic_kitchens_bbox_label_names),
        pretrained_model='imagenet')

    faster_rcnn.use_preset('evaluate')
    model = FasterRCNNTrainChain(faster_rcnn)
    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu()

    train = EpicKitchensBboxDataset(year='2018', split='train')
    if comm.rank == 0:
        indices = np.arange(len(train))
    else:
        indices = None
    train = TransformDataset(train, ('img', 'bbox', 'label', 'scale'),
                             Transform(faster_rcnn))

    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]

    train_iter = chainer.iterators.SerialIterator(train,
                                                  batch_size=args.batchsize)

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0005))

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, (18, 'epoch'), args.out)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=args.lr),
                   trigger=triggers.ManualScheduleTrigger([12, 15], 'epoch'))

    if comm.rank == 0:
        log_interval = 10, 'iteration'
        trainer.extend(
            extensions.LogReport(log_name='log.json', trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'main/roi_loc_loss', 'main/roi_cls_loss', 'main/rpn_loc_loss',
            'main/rpn_cls_loss'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=1))

        trainer.extend(extensions.snapshot_object(
            model.faster_rcnn, 'model_iter_{.updater.iteration}.npz'),
                       trigger=(1, 'epoch'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
Example No. 36
def main():
    args = parse_args()
    dump_args(args)

    # prepare dataset
    train, val, val_raw = prepare_dataset(full_data=args.full_data)
    train_iter = chainer.iterators.MultiprocessIterator(train,
                                                        args.batchsize,
                                                        shared_mem=4000000)
    val_iter = chainer.iterators.MultiprocessIterator(val,
                                                      args.batchsize,
                                                      repeat=False,
                                                      shuffle=False,
                                                      shared_mem=4000000)
    eval_iter = chainer.iterators.MultiprocessIterator(val_raw,
                                                       4,
                                                       repeat=False,
                                                       shuffle=False,
                                                       shared_mem=4000000)

    # setup model
    if args.model == 'unet':
        model = UnetCenterNet()
    elif args.model == 'res18unet':
        model = Res18UnetCenterNet()

    training_model = TrainingModel(model)
    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        training_model.to_gpu()

    # setup optimizer
    optimizer = chainer.optimizers.NesterovAG(lr=1e-3)
    optimizer.setup(training_model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-5))
    optimizer.add_hook(chainer.optimizer.GradientClipping(100.))

    # setup trainer
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=converter)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # set trainer extensions
    if not args.full_data:
        trainer.extend(
            extensions.Evaluator(val_iter,
                                 training_model,
                                 device=args.gpu,
                                 converter=converter))
        trainer.extend(DetectionMapEvaluator(eval_iter, model))

    trainer.extend(extensions.snapshot_object(model,
                                              'model_{.updater.epoch}.npz'),
                   trigger=(10, 'epoch'))
    trainer.extend(extensions.snapshot(), trigger=(10, 'epoch'))
    trainer.extend(extensions.LogReport())
    if args.full_data:
        trainer.extend(extensions.PrintReport(['epoch', 'main/loss']))
    else:
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'eval/main/map'
            ]))
    trainer.extend(extensions.ProgressBar(update_interval=10))

    # learning rate scheduling
    lr_drop_epochs = [int(args.epoch * 0.5), int(args.epoch * 0.75)]
    lr_drop_trigger = triggers.ManualScheduleTrigger(lr_drop_epochs, 'epoch')
    trainer.extend(LearningRateDrop(0.1), trigger=lr_drop_trigger)
    trainer.extend(extensions.observe_lr())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # start training
    trainer.run()
Example No. 37
def main():
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--iterations',
                        '-I',
                        type=int,
                        default=0,
                        help='Number of iterations to train')
    parser.add_argument('--device',
                        '-d',
                        type=str,
                        default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu',
                       '-g',
                       dest='device',
                       type=int,
                       nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--compile',
                        action='store_true',
                        help='Compile the model')
    parser.add_argument('--dump_onnx',
                        action='store_true',
                        help='Dump ONNX model after optimization')
    args = parser.parse_args()

    chainer.config.autotune = True
    chainer.config.cudnn_fast_batch_normalization = True

    device = chainer.get_device(args.device)

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(args.batchsize))
    if args.iterations:
        print('# iterations: {}'.format(args.iterations))
    else:
        print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    insize = model.insize
    if args.compile:
        model = chainer_compiler.compile(model, dump_onnx=args.dump_onnx)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)
    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI seems not available on your system.')
        num_threads = args.loaderjob
        if num_threads is None or num_threads <= 0:
            num_threads = 1
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(args.train,
                                                 args.root,
                                                 insize,
                                                 args.batchsize,
                                                 num_threads,
                                                 device.device.id,  # '--gpu' is stored as args.device
                                                 True,
                                                 mean=ch_mean,
                                                 std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(args.val,
                                             args.root,
                                             insize,
                                             args.val_batchsize,
                                             num_threads,
                                             device.device.id,
                                             False,
                                             mean=ch_mean,
                                             std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        # converter = dali_converter
        converter = dali_util.DaliConverter(mean=mean, crop_size=insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, insize)
        val = PreprocessedDataset(args.val, args.root, mean, insize, False)
        # These iterators load the images with subprocesses running in parallel
        # to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=converter,
                                                device=device)
    if args.iterations:
        stop_trigger = (args.iterations, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = ((1 if args.test else 10 if args.iterations else 1000),
                    'iteration')

    trainer.extend(extensions.Evaluator(val_iter,
                                        model,
                                        converter=converter,
                                        device=device),
                   trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    cuda_hook = function_hooks.CUDAProfileHook()
    with cuda_hook:
        trainer.run()

    with open('%s/log' % args.out) as f:
        logs = json.load(f)
    elapsed_times = []
    for prev, cur in zip(logs, logs[1:]):
        iters = cur['iteration'] - prev['iteration']
        elapsed = cur['elapsed_time'] - prev['elapsed_time']
        elapsed_times.append(elapsed / iters)
    sec_per_iter = sum(elapsed_times) / len(elapsed_times)
    print(sec_per_iter * 1000, 'msec/iter')
    print(args.batchsize / sec_per_iter, 'images/sec')
Example No. 38
def main():
    # These two lines help with memory; without them training runs out of memory.
    # Keep them until the real reason for the memory exhaustion is found.

    pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
    cp.cuda.set_allocator(pool.malloc)
    chainer.disable_experimental_feature_warning = True

    parser = argparse.ArgumentParser(
        description='CosmoFlow Multi-Node Training')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epochs',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--out',
                        '-o',
                        default='results',
                        help='Output directory')
    args = parser.parse_args()

    batch_size = args.batchsize
    epochs = args.epochs
    out = args.out

    # Prepare communicators.
    comm = chainermnx.create_communicator("spatial_hybrid_nccl")
    local_comm = create_local_comm(comm)

    data_comm = create_data_comm(comm)
    device = comm.intra_rank

    if local_comm.rank == 0:
        if data_comm.rank == 0:
            train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
            # train, val = datasets.split_dataset_random(training_data, first_size=(int(training_data.__len__() * 0.80)))

        else:
            train = None
            #val = None
        train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
        # val = chainermn.scatter_dataset(val, data_comm, shuffle=True)
    else:
        train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
        train = chainermn.datasets.create_empty_dataset(train)
        # val = chainermn.datasets.create_empty_dataset(val)

    train_iterator = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(train,
                                              batch_size,
                                              n_threads=20,
                                              shuffle=True), local_comm)
    # vali_iterator = chainermn.iterators.create_multi_node_iterator(
    #     chainer.iterators.MultithreadIterator(val, batch_size, repeat=False, shuffle=False, n_threads=20),
    #     local_comm)

    model = CosmoFlow(local_comm)
    # model = L.Classifier(model, lossfun=F.mean_squared_error, accfun=F.mean_squared_error)

    # print("Model Created successfully")
    chainer.backends.cuda.get_device_from_id(device).use()
    model.to_gpu()  # Copy the model to the GPU

    optimizer = chainermnx.create_hybrid_multi_node_optimizer_alpha(
        chainer.optimizers.Adam(), data_comm, local_comm)

    optimizer.setup(model)
    # Create the updater, using the optimizer
    updater = training.StandardUpdater(train_iterator,
                                       optimizer,
                                       device=device)

    # Set up a trainer
    trainer = training.Trainer(updater, (epochs, 'epoch'), out=out)
    # trainer.extend(extensions.Evaluator(vali_iterator, model, device=device))

    filename = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".log"
    log_interval = (1, 'epoch')
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(
            extensions.LogReport(trigger=log_interval, filename=filename))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch',
                                  filename='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch',
                filename='accuracy.png'))

        trainer.extend(extensions.ProgressBar(update_interval=1))
        print("Starting Training ")

    trainer.run()
Example No. 39
def main():
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(
            args.val, args.root, mean, model.insize, False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # We need to change the start method of multiprocessing module if we are
    # using InfiniBand and MultiprocessIterator. This is because processes
    # often crash when calling fork if they are using Infiniband.
    # (c.f., https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    multiprocessing.set_start_method('forkserver')
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example No. 40
def main():
    # Parse the arguments.
    args = parse_arguments()
    augment = False if args.augment == 'False' else True
    multi_gpu = False if args.multi_gpu == 'False' else True
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing casts the labels for the classification task.
    def postprocess_label(label_list):
        label_arr = np.asarray(label_list, dtype=np.int32)
        return label_arr

    # Apply a preprocessor to the dataset.
    logging.info('Preprocess train dataset and valid dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParserForPair(preprocessor,
                                  postprocess_label=postprocess_label,
                                  labels=labels,
                                  smiles_cols=['smiles_1', 'smiles_2'])
    train_dict = parser.parse(args.train_datafile,
                              return_smiles_pair_original=True)
    train = train_dict['dataset']
    train_smiles_pairs = train_dict['smiles_pair_original']
    valid_dict = parser.parse(args.valid_datafile,
                              return_smiles_pair_original=True)
    valid = valid_dict['dataset']
    valid_smiles_pairs = valid_dict['smiles_pair_original']

    if augment:
        logging.info('Utilizing data augmentation in train set')
        train = augment_dataset(train)
        if train_smiles_pairs is not None:
            train_smiles_pairs = augment_smiles_pairs(train_smiles_pairs)

    train = add_super_nodes(train, train_smiles_pairs)
    valid = add_super_nodes(valid, valid_smiles_pairs)

    num_train = train.get_datasets()[0].shape[0]
    num_valid = valid.get_datasets()[0].shape[0]
    logging.info('Train/valid split: {}/{}'.format(num_train, num_valid))

    # Set up the predictor.
    if len(args.net_hidden_dims):
        net_hidden_dims = tuple([
            int(net_hidden_dim)
            for net_hidden_dim in args.net_hidden_dims.split(',')
        ])
    else:
        net_hidden_dims = ()
    fp_attention = bool(args.fp_attention)
    update_attention = bool(args.update_attention)
    weight_tying = args.weight_tying != 'False'
    attention_tying = args.attention_tying != 'False'
    fp_batch_normalization = args.fp_bn == 'True'
    layer_aggregator = None if args.layer_aggregator == '' else args.layer_aggregator
    context = args.context != 'False'
    output_activation = functions.relu if args.output_activation == 'relu' else None
    n_heads = args.n_heads
    dropout_ratio = args.dropout_ratio
    predictor = set_up_predictor(
        method=args.method,
        fp_hidden_dim=args.fp_hidden_dim,
        fp_out_dim=args.fp_out_dim,
        conv_layers=args.conv_layers,
        concat_hidden=args.concat_hidden,
        layer_aggregator=layer_aggregator,
        fp_dropout_rate=args.fp_dropout_rate,
        fp_batch_normalization=fp_batch_normalization,
        net_hidden_dims=net_hidden_dims,
        class_num=class_num,
        sim_method=args.sim_method,
        fp_attention=fp_attention,
        weight_typing=weight_tying,
        attention_tying=attention_tying,
        update_attention=update_attention,
        context=context,
        context_layers=args.context_layers,
        context_dropout=args.context_dropout,
        message_function=args.message_function,
        readout_function=args.readout_function,
        num_timesteps=args.num_timesteps,
        num_output_hidden_layers=args.num_output_hidden_layers,
        output_hidden_dim=args.output_hidden_dim,
        output_activation=output_activation,
        symmetric=args.symmetric,
        n_heads=n_heads,
        dropout_ratio=dropout_ratio)
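    # `weight_typing` is the keyword name this helper expects; it receives the
    # weight-tying flag parsed above.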

    train_iter = SerialIterator(train, args.batchsize)
    valid_iter = SerialIterator(valid,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)

    metrics_fun = {'accuracy': F.binary_accuracy}
    loss_func = F.sigmoid_cross_entropy
    classifier = Classifier(predictor,
                            lossfun=loss_func,
                            metrics_fun=metrics_fun,
                            device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam(alpha=args.learning_rate,
                                weight_decay_rate=args.weight_decay_rate)
    # optimizer = optimizers.Adam()
    # optimizer = optimizers.SGD(lr=args.learning_rate)
    optimizer.setup(classifier)
    # add regularization
    if args.max_norm > 0:
        optimizer.add_hook(
            chainer.optimizer.GradientClipping(threshold=args.max_norm))
    if args.l2_rate > 0:
        optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.l2_rate))
    if args.l1_rate > 0:
        optimizer.add_hook(chainer.optimizer.Lasso(rate=args.l1_rate))
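    # GradientClipping rescales gradients whose global L2 norm exceeds
    # `max_norm`; WeightDecay and Lasso add L2 and L1 penalties, respectively.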

    # Set up the updater.
    if multi_gpu:
        logging.info('Using multiple GPUs')
        updater = training.ParallelUpdater(train_iter,
                                           optimizer,
                                           devices={
                                               'main': 0,
                                               'second': 1
                                           },
                                           converter=concat_mols)
    else:
        logging.info('Using single GPU')
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           device=args.gpu,
                                           converter=concat_mols)

    # Set up the trainer.
    logging.info('Training...')
    # Use early stopping on the validation loss as the stop trigger.
    early_stop = triggers.EarlyStoppingTrigger(monitor='validation/main/loss',
                                               patients=10,
                                               max_trigger=(500, 'epoch'))
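    # Training stops once 'validation/main/loss' has shown no improvement for
    # 10 consecutive checks (`patients`), or after 500 epochs at the latest.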
    out = os.path.join('output', args.out)
    trainer = training.Trainer(updater, stop_trigger=early_stop, out=out)

    # trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(valid_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))

    train_eval_iter = SerialIterator(train,
                                     args.batchsize,
                                     repeat=False,
                                     shuffle=False)

    trainer.extend(
        AccuracyEvaluator(train_eval_iter,
                          classifier,
                          eval_func=predictor,
                          device=args.gpu,
                          converter=concat_mols,
                          name='train_acc',
                          pos_labels=1,
                          ignore_labels=-1,
                          raise_value_error=False))
    # The extension name 'validation' is already taken by `Evaluator` above,
    # so the name 'val_acc' is used for this evaluator instead.
    trainer.extend(
        AccuracyEvaluator(valid_iter,
                          classifier,
                          eval_func=predictor,
                          device=args.gpu,
                          converter=concat_mols,
                          name='val_acc',
                          pos_labels=1,
                          ignore_labels=-1))

    trainer.extend(
        ROCAUCEvaluator(train_eval_iter,
                        classifier,
                        eval_func=predictor,
                        device=args.gpu,
                        converter=concat_mols,
                        name='train_roc',
                        pos_labels=1,
                        ignore_labels=-1,
                        raise_value_error=False))
    # The extension name 'validation' is already taken by `Evaluator` above,
    # so the name 'val_roc' is used for this evaluator instead.
    trainer.extend(
        ROCAUCEvaluator(valid_iter,
                        classifier,
                        eval_func=predictor,
                        device=args.gpu,
                        converter=concat_mols,
                        name='val_roc',
                        pos_labels=1,
                        ignore_labels=-1))

    trainer.extend(
        PRCAUCEvaluator(train_eval_iter,
                        classifier,
                        eval_func=predictor,
                        device=args.gpu,
                        converter=concat_mols,
                        name='train_prc',
                        pos_labels=1,
                        ignore_labels=-1,
                        raise_value_error=False))
    # The extension name 'validation' is already taken by `Evaluator` above,
    # so the name 'val_prc' is used for this evaluator instead.
    trainer.extend(
        PRCAUCEvaluator(valid_iter,
                        classifier,
                        eval_func=predictor,
                        device=args.gpu,
                        converter=concat_mols,
                        name='val_prc',
                        pos_labels=1,
                        ignore_labels=-1))

    trainer.extend(
        F1Evaluator(train_eval_iter,
                    classifier,
                    eval_func=predictor,
                    device=args.gpu,
                    converter=concat_mols,
                    name='train_f',
                    pos_labels=1,
                    ignore_labels=-1,
                    raise_value_error=False))
    # The extension name 'validation' is already taken by `Evaluator` above,
    # so the name 'val_f' is used for this evaluator instead.
    trainer.extend(
        F1Evaluator(valid_iter,
                    classifier,
                    eval_func=predictor,
                    device=args.gpu,
                    converter=concat_mols,
                    name='val_f',
                    pos_labels=1,
                    ignore_labels=-1))

    # Decay the learning rate (Adam's alpha) according to the selected schedule.
    # trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate), trigger=(10, 'epoch'))
    if args.exp_shift_strategy == 1:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [10, 20, 30, 40, 50, 60], 'epoch'))
    elif args.exp_shift_strategy == 2:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [5, 10, 15, 20, 25, 30], 'epoch'))
    elif args.exp_shift_strategy == 3:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [5, 10, 15, 20, 25, 30, 40, 50, 60, 70], 'epoch'))
    else:
        raise ValueError('No such strategy to adapt learning rate')
    # Observe the learning rate every iteration.
    trainer.extend(E.observe_lr(), trigger=(1, 'iteration'))

    entries = [
        'epoch',
        'main/loss',
        'train_acc/main/accuracy',
        'train_roc/main/roc_auc',
        'train_prc/main/prc_auc',
        # 'train_p/main/precision', 'train_r/main/recall',
        'train_f/main/f1',
        'validation/main/loss',
        'val_acc/main/accuracy',
        'val_roc/main/roc_auc',
        'val_prc/main/prc_auc',
        # 'val_p/main/precision', 'val_r/main/recall',
        'val_f/main/f1',
        'lr',
        'elapsed_time'
    ]
    trainer.extend(E.PrintReport(entries=entries))
    trainer.extend(E.snapshot(), trigger=(10, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.ProgressBar())
    trainer.extend(
        E.PlotReport(['main/loss', 'validation/main/loss'],
                     'epoch',
                     file_name='loss.png'))
    trainer.extend(
        E.PlotReport(['train_acc/main/accuracy', 'val_acc/main/accuracy'],
                     'epoch',
                     file_name='accuracy.png'))

    if args.resume:
        resume_path = os.path.join(out, args.resume)
        logging.info(
            'Resume training according to snapshot in {}'.format(resume_path))
        chainer.serializers.load_npz(resume_path, trainer)

    trainer.run()

    # Save the classifier's parameters.
    model_path = os.path.join(out, args.model_filename)
    logging.info('Saving the trained models to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args.protocol)
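    # A minimal sketch (assumption) of how the pickled model could later be
    # restored for inference, assuming chainer_chemistry's Classifier exposes
    # the matching `load_pickle`/`predict` counterparts and a hypothetical
    # `test` dataset:
    #
    #     clf = Classifier.load_pickle(model_path, device=args.gpu)
    #     y_pred = clf.predict(test, converter=concat_mols)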
Exemplo n.º 41
0
def main():
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = parse_device(args)

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)
    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI is not available on your system.')
        num_threads = args.loaderjob
        if num_threads is None or num_threads <= 0:
            num_threads = 1
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(
            args.train, args.root, model.insize, args.batchsize,
            num_threads, args.gpu, True, mean=ch_mean, std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(
            args.val, args.root, model.insize, args.val_batchsize,
            num_threads, args.gpu, False, mean=ch_mean, std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        # converter = dali_converter
        converter = dali_util.DaliConverter(mean=mean, crop_size=model.insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
        # These iterators load the images with subprocesses running in parallel
        # to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)
        converter = dataset.concat_examples
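        # concat_examples stacks the (image, label) pairs of a minibatch into
        # a single pair of arrays and, when a device is given by the updater,
        # transfers them to that device.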

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = (1 if args.test else 1000), 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, model, converter=converter,
                                        device=device), trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if not (chainerx.is_available() and isinstance(device, chainerx.Device)):
        trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
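    # A minimal sketch (assumption) of what the `parse_device` helper used
    # above might look like, preferring the deprecated --gpu flag over
    # --device; it assumes `re` and `numpy` (as np) are imported:
    #
    #     def parse_device(args):
    #         gpu = args.gpu
    #         if gpu is None and re.match(r'[-+]?[0-9]+$', args.device):
    #             gpu = int(args.device)
    #         if gpu is not None:
    #             if gpu < 0:
    #                 return chainer.get_device(np)
    #             return chainer.get_device((chainer.backends.cuda.cupy, gpu))
    #         return chainer.get_device(args.device)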
Exemplo n.º 42
0
    def setup_trainer(self):
        self.updater = chainer.training.updater.StandardUpdater(
            self.train_iterator, self.optimizer, device=self.gpu)
        self.trainer = chainer.training.Trainer(
            self.updater, (self.max_epoch, 'epoch'), out=self.out_dir)

        self.trainer.extend(
            extensions.Evaluator(
                self.val_iterator, self.model, device=self.gpu),
            trigger=(self.eval_interval, self.eval_interval_type))

        # Save snapshot
        self.trainer.extend(
            extensions.snapshot_object(
                self.model,
                savefun=S.save_npz,
                filename='model_snapshot.npz'),
            trigger=chainer.training.triggers.MinValueTrigger(
                'validation/main/loss',
                (self.save_interval, self.save_interval_type)))
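        # MinValueTrigger fires only when 'validation/main/loss' reaches a new
        # minimum (checked every save_interval), so only the best model so far
        # is written to model_snapshot.npz.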

        # Dump network architecture
        self.trainer.extend(
            extensions.dump_graph(
                root_name='main/loss',
                out_name='network_architecture.dot'))

        # Logging
        self.trainer.extend(
            extensions.ProgressBar(
                update_interval=self.progressbar_update_interval))
        self.trainer.extend(
            extensions.observe_lr(),
            trigger=(self.log_interval, self.log_interval_type))
        self.trainer.extend(
            extensions.LogReport(
                log_name='log.json',
                trigger=(self.log_interval, self.log_interval_type)))
        self.trainer.extend(
            extensions.PrintReport([
                'iteration',
                'epoch',
                'elapsed_time',
                'lr',
                'main/loss',
                'validation/main/loss',
            ]), trigger=(self.print_interval, self.print_interval_type))

        # Plot
        self.trainer.extend(
            extensions.PlotReport([
                'main/loss',
                'validation/main/loss',
            ],
                file_name='loss_plot.png',
                x_key=self.plot_interval_type,
                trigger=(self.plot_interval, self.plot_interval_type)),
            trigger=(self.plot_interval, self.plot_interval_type))

        # Dump params
        params = dict()
        params['model_name'] = self.model_name
        params['train_dataset_dir'] = self.train_dataset_dir
        params['val_dataset_dir'] = self.val_dataset_dir
        params['class_names'] = self.train_dataset.class_names
        params['timestamp'] = self.timestamp_iso
        params['out_dir'] = self.out_dir
        params['gpu'] = self.gpu
        params['batch_size'] = self.batch_size
        params['max_epoch'] = self.max_epoch
        params['lr'] = self.lr
        params['weight_decay'] = self.weight_decay
        self.trainer.extend(
            fcn.extensions.ParamsReport(params, file_name='params.yaml'))

        # Dump param for fcn_object_segmentation.py
        model_name = dict()
        model_name['model_name'] = self.model_name
        self.trainer.extend(
            fcn.extensions.ParamsReport(
                model_name, file_name='model_name.yaml'))
        target_names = dict()
        target_names['target_names'] = self.train_dataset.class_names
        self.trainer.extend(
            fcn.extensions.ParamsReport(
                target_names, file_name='target_names.yaml'))
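    # A minimal sketch (assumption) of how this setup would typically be
    # driven elsewhere in the class; the `train` method name is hypothetical:
    #
    #     def train(self):
    #         self.setup_trainer()
    #         self.trainer.run()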