Example #1
def main():
    mushroomsfile = "mushrooms.csv"

    data_array = np.genfromtxt(mushroomsfile,
                               delimiter=',',
                               dtype=str,
                               skip_header=1)
    n_data, n_feature = data_array.shape
    for col in range(n_feature):
        data_array[:, col] = np.unique(data_array[:, col],
                                       return_inverse=True)[1]
    X = data_array[:, 1:].astype(np.float32)
    Y = data_array[:, 0].astype(np.int32)[:, None]
    train, test = datasets.split_dataset_random(datasets.TupleDataset(X, Y),
                                                int(n_data * 0.7))
    train_iter = ch.iterators.SerialIterator(train, 100)
    test_iter = ch.iterators.SerialIterator(test,
                                            100,
                                            repeat=False,
                                            shuffle=False)
    model = L.Classifier(MLP(44, 1),
                         lossfun=F.sigmoid_cross_entropy,
                         accfun=F.binary_accuracy)
    optimizer = ch.optimizers.SGD().setup(model)
    updater = training.StandardUpdater(train_iter, optimizer, device=-1)
    trainer = training.Trainer(updater, (50, 'epoch'), out='result')
    trainer.extend(extensions.Evaluator(test_iter, model, device=-1))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(
        extensions.snapshot(filename='trainer_epoch_{.updater.epoch}'),
        trigger=(10, 'epoch'))

    trainer.extend(extensions.LogReport())
    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch',
                                  file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch',
                file_name='accuracy.png'))

    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.run()

    x, t = test[np.random.randint(len(test))]

    predict = model.predictor(x[None]).array
    predict = predict[0][0]

    if predict >= 0:
        print('Predicted Poisonous, Actual ' + ['Edible', 'Poisonous'][t[0]])
    else:
        print('Predicted Edible, Actual ' + ['Edible', 'Poisonous'][t[0]])
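
The snippet assumes NumPy and Chainer aliases (np, ch, L, F, datasets, training, extensions) plus an MLP link defined elsewhere. A minimal sketch compatible with MLP(44, 1), i.e. 44 hidden units and a single raw logit for sigmoid_cross_entropy, could look like this (the number of layers is an assumption):

import chainer
import chainer.functions as F
import chainer.links as L

class MLP(chainer.Chain):
    # Hypothetical predictor assumed by the example above:
    # hidden layers of n_units and a single raw logit output.
    def __init__(self, n_units, n_out):
        super(MLP, self).__init__()
        with self.init_scope():
            # Input size is inferred from the first batch (None).
            self.l1 = L.Linear(None, n_units)
            self.l2 = L.Linear(None, n_units)
            self.l3 = L.Linear(None, n_out)

    def forward(self, x):
        h = F.relu(self.l1(x))
        h = F.relu(self.l2(h))
        return self.l3(h)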
Example #2
def main():
    train, test = datasets.mnist.get_mnist()

    batchsize = 128
    train_iter = iterators.SerialIterator(train, batchsize)
    test_iter = iterators.SerialIterator(test, batchsize, False, False)

    gpu_id = 0
    model = MLP()

    if gpu_id != -1:
        model.to_gpu(gpu_id)

    max_epoch = 10
    model = L.Classifier(model)
    optimizer = optimizers.MomentumSGD()
    optimizer.setup(model)
    updater = training.updaters.StandardUpdater(train_iter, optimizer, device=gpu_id)
    trainer = training.Trainer(updater, (max_epoch, 'epoch'), out='mnist_out')

    trainer.extend(ext.LogReport())
    trainer.extend(ext.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
    trainer.extend(ext.snapshot_object(model.predictor, filename='model_epoch-{.updater.epoch}'))
    trainer.extend(ext.Evaluator(test_iter, model, device=gpu_id))
    trainer.extend(ext.PrintReport(['epoch', 'main/loss', 'main/accuracy', 'validation/main/loss', 'validation/main/accuracy', 'elapsed_time']))
    trainer.extend(ext.PlotReport(['main/loss', 'validation/main/loss'], x_key='epoch', file_name='loss.png'))
    trainer.extend(ext.PlotReport(['main/accuracy', 'validation/main/accuracy'], x_key='epoch', file_name='accuracy.png'))
    trainer.extend(ext.DumpGraph('main/loss'))

    trainer.run()
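
This example relies on module aliases (iterators, optimizers, training, ext) and an argument-free MLP. A plausible set of imports and a fixed-size MNIST classifier, given here purely as an assumption, would be:

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import datasets, iterators, optimizers, training
from chainer.training import extensions as ext

class MLP(chainer.Chain):
    # Hypothetical argument-free MNIST classifier assumed above.
    def __init__(self, n_mid_units=100, n_out=10):
        super(MLP, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, n_mid_units)
            self.l2 = L.Linear(n_mid_units, n_mid_units)
            self.l3 = L.Linear(n_mid_units, n_out)

    def forward(self, x):
        h = F.relu(self.l1(x))
        h = F.relu(self.l2(h))
        return self.l3(h)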
Example #3
def train(X, y, batch_size=256, max_epoch=20, gpu_id=0):
    n_out = y.shape[1]
    train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.75)
    train = first_dataset(train_X, train_y)
    test = first_dataset(test_X, test_y)
    train_iter = chainer.iterators.SerialIterator(train, batch_size)
    test_iter = chainer.iterators.SerialIterator(test, batch_size, False,
                                                 False)

    model = NN(n_out)
    if gpu_id >= 0:
        model.to_gpu(gpu_id)
    model = L.Classifier(model,
                         lossfun=F.mean_squared_error,
                         accfun=F.r2_score)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    updater = chainer.training.updaters.StandardUpdater(train_iter,
                                                        optimizer,
                                                        device=gpu_id)
    trainer = chainer.training.Trainer(updater, (max_epoch, 'epoch'),
                                       out='first_result')

    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.Evaluator(test_iter, model, device=gpu_id))
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
    trainer.extend(
        extensions.snapshot_object(
            model.predictor, filename='first_model_epoch-{.updater.epoch}'))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              x_key='epoch',
                              file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'],
                              x_key='epoch',
                              file_name='accuracy.png'))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.run()
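
first_dataset and NN are defined outside this snippet. As an assumption, first_dataset is likely just a float32 TupleDataset wrapper so that mean_squared_error and r2_score can be computed on the pairs; a hypothetical sketch:

import numpy as np
from chainer.datasets import TupleDataset

def first_dataset(X, y):
    # Hypothetical helper: cast features and targets to float32 and pair them
    # so the Classifier can compute mean_squared_error / r2_score on them.
    return TupleDataset(X.astype(np.float32), y.astype(np.float32))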
Example #4
def train():
    train, test = mnist.get_mnist()
    batchsize = 128
    train_iter = iterators.SerialIterator(train, batchsize)
    test_iter = iterators.SerialIterator(test,
                                         batchsize,
                                         shuffle=False,
                                         repeat=False)
    model = L.Classifier(MLP())

    device = -1
    max_epoch = 10
    if chainer.backends.cuda.available:
        device = 0
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, (max_epoch, 'epoch'),
                               out="mnist_result")
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.snapshot(filename="snapshot_epoch-{.updater.epoch}"))
    trainer.extend(
        extensions.snapshot_object(model,
                                   filename="model_epoch-{.updater.epoch}"))
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))
    trainer.extend(
        extensions.PrintReport([
            "epoch", "main/loss", "main/accuracy", "validation/main/loss",
            "validation/main/accuracy", "elapsed_time"
        ]))
    trainer.extend(
        extensions.PlotReport(["main/loss", "validation/main/loss"],
                              x_key="epoch",
                              file_name="loss.png"))
    trainer.extend(
        extensions.PlotReport(["main/accuracy", "validation/main/accuracy"],
                              x_key="epoch",
                              file_name="accuracy"))
    trainer.extend(extensions.DumpGraph("main/loss"))
    trainer.run()
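
Because this example saves the whole Classifier with snapshot_object, the weights can later be restored into a freshly built model of the same shape. A usage sketch (the epoch number in the file name is only an example):

import chainer
import chainer.links as L

# Rebuild the same architecture, then load the weights saved by
# snapshot_object(model, "model_epoch-{.updater.epoch}") under mnist_result/.
model = L.Classifier(MLP())
chainer.serializers.load_npz("mnist_result/model_epoch-10", model)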
Example #5
def main(args):
    mnist_train = chainer.datasets.get_mnist(ndim=3, withlabel=False)[0]
    itr = iterators.SerialIterator(mnist_train,
                                   args.b,
                                   shuffle=True,
                                   repeat=True)

    model = GAN(args.z)
    if chainer.config.use_ideep != "never":
        model.to_intel64()
    opt = optimizers.Adam(alpha=0.0002, beta1=0.5, beta2=0.9)
    opt.setup(model)

    updater = training.StandardUpdater(itr, opt)
    trainer = training.Trainer(updater,
                               stop_trigger=(args.e, "epoch"),
                               out=args.r)

    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(["main/loss_real", "main/loss_fake", "epoch"]))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(
        extensions.PlotReport(["main/loss_real", "main/loss_fake"],
                              filename="loss.pdf"))
    trainer.extend(ext_save_img(model.gen, args.r, args.z))
    trainer.extend(extensions.DumpGraph("main/loss_fake"))
    if args.save_model:
        trainer.extend(extensions.snapshot_object(
            model.gen, "gen_epoch_{.updater.epoch:04d}.npz"),
                       trigger=(10, "epoch"))
        trainer.extend(extensions.snapshot_object(
            model.dis, "dis_epoch_{.updater.epoch:04d}.npz"),
                       trigger=(10, "epoch"))

    trainer.run()
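
GAN and ext_save_img are project-specific. A hypothetical re-creation of the image-saving extension using chainer.training.make_extension might look as follows; the generator call signature and the .npy output format are assumptions:

import os
import numpy as np
import chainer
from chainer import training

def ext_save_img(gen, out_dir, n_z, n_samples=25):
    # Hypothetical sketch: sample latent vectors once per epoch and dump the
    # generated batch to the output directory for visual inspection.
    @training.make_extension(trigger=(1, 'epoch'))
    def _save(trainer):
        z = np.random.uniform(-1, 1, (n_samples, n_z)).astype(np.float32)
        with chainer.using_config('train', False), \
                chainer.using_config('enable_backprop', False):
            x = gen(z).array
        path = os.path.join(out_dir,
                            'sample_epoch_{}.npy'.format(trainer.updater.epoch))
        np.save(path, x)
    return _save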
Example #6
def main():
    parser = argparse.ArgumentParser(description='''\
ChainerMN example: MNIST with automatic checkpoints enabled''')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator',
                        type=str,
                        default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--run-id',
                        type=str,
                        default='train-mnist-example',
                        help='ID of the task name')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.

    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Enable checkpointer and recover from checkpoint if any checkpoint exists
    checkpointer = create_multi_node_checkpointer(name=args.run_id, comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    print("Rank", comm.rank, ": (Re)Starting from (epoch, iter) =",
          (trainer.updater.epoch, trainer.updater.iteration))
    trainer.extend(checkpointer, trigger=(1000, 'iteration'))

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
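
The bare create_multi_node_checkpointer call implies it was imported directly from chainermn (the MLP link itself is defined elsewhere). The imports this script presumably relies on, inferred from usage, are roughly:

import argparse

import chainer
import chainer.links as L
from chainer import training
from chainer.training import extensions

import chainermn
from chainermn import create_multi_node_checkpointer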
Example #7
def main():
    # This script is almost identical to train_mnist.py. The only difference is
    # that this script uses data-parallel computation on two GPUs.
    # See train_mnist.py for more details.
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=400,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--out',
                        '-o',
                        default='result_data_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--device0',
                        '-d',
                        type=str,
                        default='0',
                        help='Device specifier of the first device. '
                        'Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--device1',
                        '-D',
                        type=str,
                        default='1',
                        help='Device specifier of the second device. '
                        'Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu0',
                       '-g',
                       dest='device0',
                       type=int,
                       nargs='?',
                       const=0,
                       help='First GPU ID')
    group.add_argument('--gpu1',
                       '-G',
                       dest='device1',
                       type=int,
                       nargs='?',
                       const=1,
                       help='Second GPU ID')
    args = parser.parse_args()
    device0 = chainer.get_device(args.device0)
    device1 = chainer.get_device(args.device1)

    print('Devices: {}, {}'.format(device0, device1))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    device0.use()

    model = L.Classifier(train_mnist.MLP(args.unit, 10))
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # ParallelUpdater implements the data-parallel gradient computation on
    # multiple devices. It accepts "devices" argument that specifies which
    # device to use.
    updater = training.updaters.ParallelUpdater(
        train_iter,
        optimizer,
        # The device named 'main' is used as the "master", while the others
        # are used as slaves. Names other than 'main' are arbitrary.
        devices={
            'main': device0,
            'second': device1
        },
    )
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=device0))
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device0.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example #8
def main():
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(),
                        default='nin', help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--gpus', '-g', type=int, nargs="*",
                        default=[0, 1, 2, 3])
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)

    # Load the datasets and mean file
    mean = np.load(args.mean)
    train = train_imagenet.PreprocessedDataset(
        args.train, args.root, mean, model.insize)
    val = train_imagenet.PreprocessedDataset(
        args.val, args.root, mean, model.insize, False)
    # These iterators load the images with subprocesses running in parallel to
    # the training/validation.
    devices = tuple(args.gpus)

    train_iters = [
        chainer.iterators.MultiprocessIterator(i,
                                               args.batchsize,
                                               n_processes=args.loaderjob)
        for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = updaters.MultiprocessParallelUpdater(train_iters, optimizer,
                                                   devices=devices)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpus[0]),
                   trigger=val_interval)
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=2))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example #9
def main():
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError('ImageNet requires GPU support.')

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # The start method of the multiprocessing module needs to be changed if we
    # are using InfiniBand and MultiprocessIterator. This is because
    # processes often crash when calling fork if they are using
    # Infiniband.  (c.f.,
    # https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    # Also, just setting the start method does not seem to be
    # sufficient to actually launch the forkserver processes, so also
    # start a dummy process.
    # See also our document:
    # https://chainermn.readthedocs.io/en/stable/tutorial/tips_faqs.html#using-multiprocessiterator
    # This must be done *before* ``chainermn.create_communicator``!!!
    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process(target=lambda *x: x, args=())
    p.start()
    p.join()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # The workaround for the process-crash issue must be applied before
    # creating the communicator above when using fork
    # (e.g. MultiprocessIterator) together with InfiniBand.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
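
PreprocessedDataset is defined outside this snippet. A sketch in the spirit of the official Chainer ImageNet example, with random crop for training, center crop for validation, mean subtraction and scaling to [0, 1]; the exact preprocessing is an assumption:

import random

import chainer
import numpy as np

class PreprocessedDataset(chainer.dataset.DatasetMixin):
    # Hypothetical sketch: images are assumed to be larger than the crop size.
    def __init__(self, path, root, mean, crop_size, random=True):
        self.base = chainer.datasets.LabeledImageDataset(path, root)
        self.mean = mean.astype(np.float32)
        self.crop_size = crop_size
        self.random = random

    def __len__(self):
        return len(self.base)

    def get_example(self, i):
        crop_size = self.crop_size
        image, label = self.base[i]
        _, h, w = image.shape
        if self.random:
            # Random crop for training.
            top = random.randint(0, h - crop_size - 1)
            left = random.randint(0, w - crop_size - 1)
        else:
            # Center crop for validation.
            top = (h - crop_size) // 2
            left = (w - crop_size) // 2
        image = image[:, top:top + crop_size, left:left + crop_size]
        image -= self.mean[:, top:top + crop_size, left:left + crop_size]
        image *= (1.0 / 255.0)
        return image, label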
Example #10
def main():
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--model', '-m', default='MLP',
                        help='Choose the model: MLP or MLPSideEffect')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device',
                       type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = chainer.get_device(args.device)
    if device.xp is chainerx:
        sys.stderr.write('This example does not support ChainerX devices.\n')
        sys.exit(1)

    print('Device: {}'.format(device))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    device.use()

    # Set up a neural network to train
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.model == 'MLP':
        model = L.Classifier(MLP(args.unit, 10))
    elif args.model == 'MLPSideEffect':
        model = L.Classifier(MLPSideEffect(args.unit, 10))

    model.to_device(device)

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist()

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot for each specified epoch
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Save two plot images to the result dir
    if args.plot and extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
Example #11
def main():
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--device',
                        '-d',
                        type=str,
                        default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        help='Resume the training from snapshot')
    parser.add_argument('--autoload',
                        action='store_true',
                        help='Automatically load trainer snapshots in case'
                        ' of preemption or other temporary system failure')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu',
                       '-g',
                       dest='device',
                       type=int,
                       nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = chainer.get_device(args.device)

    print('Device: {}'.format(device))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(MLP(args.unit, 10))
    model.to_device(device)
    device.use()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist()

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device),
                   call_before_training=True)

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot for each specified epoch
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    # Take a snapshot every ``frequency`` epochs, delete old stale
    # snapshots, and automatically load from snapshot files if any
    # are already present in the result directory.
    trainer.extend(extensions.snapshot(num_retain=1, autoload=args.autoload),
                   trigger=(frequency, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(), call_before_training=True)

    # Save two plot images to the result dir
    trainer.extend(extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                         'epoch',
                                         file_name='loss.png'),
                   call_before_training=True)
    trainer.extend(extensions.PlotReport(
        ['main/accuracy', 'validation/main/accuracy'],
        'epoch',
        file_name='accuracy.png'),
                   call_before_training=True)

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'elapsed_time'
    ]),
                   call_before_training=True)

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume is not None:
        # Resume from a snapshot (note: the state loaded here may be
        # overwritten by the --autoload option if any snapshots exist
        # in the output directory)
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
Example #12
def main():
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        default=20,
                        type=int,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu0',
                        '-g',
                        default=0,
                        type=int,
                        help='First GPU ID')
    parser.add_argument('--gpu1',
                        '-G',
                        default=1,
                        type=int,
                        help='Second GPU ID')
    parser.add_argument('--out',
                        '-o',
                        default='result_model_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        default=1000,
                        type=int,
                        help='Number of units')
    args = parser.parse_args()

    print('GPU: {}, {}'.format(args.gpu0, args.gpu1))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # See train_mnist.py for the meaning of these lines

    model = L.Classifier(ParallelMLP(args.unit, 10, args.gpu0, args.gpu1))
    chainer.backends.cuda.get_device_from_id(args.gpu0).use()

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train, test = chainer.datasets.get_mnist()

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=args.gpu0)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu0))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
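
ParallelMLP is assumed to be defined elsewhere. A self-contained sketch of one possible model-parallel split, where each GPU holds half of the hidden units and the partial outputs are summed on the first device via F.copy (the exact split is an assumption):

import chainer
import chainer.functions as F
import chainer.links as L

class ParallelMLP(chainer.Chain):
    # Hypothetical sketch in the spirit of Chainer's model-parallel MNIST
    # example: the hidden layer is split across two GPUs.
    def __init__(self, n_units, n_out, gpu0, gpu1):
        super(ParallelMLP, self).__init__()
        self.gpu0 = gpu0
        self.gpu1 = gpu1
        with self.init_scope():
            self.first0 = L.Linear(None, n_units // 2).to_gpu(gpu0)
            self.first1 = L.Linear(None, n_units // 2).to_gpu(gpu1)
            self.second0 = L.Linear(None, n_out).to_gpu(gpu0)
            self.second1 = L.Linear(None, n_out).to_gpu(gpu1)

    def forward(self, x):
        # x is assumed to live on gpu0 (the updater's device).
        h0 = F.relu(self.first0(x))
        h1 = F.relu(self.first1(F.copy(x, self.gpu1)))
        # Each device computes a partial output; sum them on gpu0.
        y0 = self.second0(h0)
        y1 = self.second1(h1)
        return y0 + F.copy(y1, self.gpu0)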
Example #13
def main():
    parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='pure_nccl', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--chainerx', '-x', action='store_true',
                        default=False, help='Use ChainerX')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')

    parser.add_argument('--benchmark', action='store_true',
                        help='benchmark mode')
    
    parser.add_argument('--benchmark-iteration', type=int, default=500,
                        help='the number of iterations when using benchmark mode')

    args = parser.parse_args()

    # Prepare ChainerMN communicator.

    if args.gpu:
        if args.communicator == 'naive':
            print('Error: \'naive\' communicator does not support GPU.\n')
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        if args.chainerx:
            device = chainer.get_device('cuda:{}'.format(comm.intra_rank))
        else:
            device = chainer.get_device(comm.intra_rank)
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        if args.chainerx:
            device = chainer.get_device('native')
        else:
            device = chainer.get_device(-1)

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))

    model.to_device(device)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    if args.benchmark:
        stop_trigger = (args.benchmark_iteration, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        if device.xp is not chainerx:
            # Disabled for ChainerX.
            # This is because ChainerX doesn't have a public API set
            # to traverse computational graphs.
            # See examples/mnist/train_mnist.py
            trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example #14
def main():
    # These two lines help with memory. If they are not included, training runs out of memory.
    # Use them until you find the real reason why it is running out of memory.

    pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
    cp.cuda.set_allocator(pool.malloc)
    chainer.disable_experimental_feature_warning = True

    parser = argparse.ArgumentParser(description='CosmoFlow Multi-Node Training')
    parser.add_argument('--batchsize', '-B', type=int, default=32, help='Learning minibatch size')
    parser.add_argument('--epochs', '-E', type=int, default=10, help='Number of epochs to train')
    parser.add_argument('--out', '-o', default='results', help='Output directory')
    args = parser.parse_args()

    batch_size = args.batchsize
    epochs = args.epochs
    out = args.out
    #  Create ChainerMN communicator.
    comm = chainermnx.create_communicator("spatial_nccl")
    device = comm.intra_rank

    # Input data and label
    train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")

    if comm.rank != 0:
        train = chainermn.datasets.create_empty_dataset(train)
        # test = chainermn.datasets.create_empty_dataset(test)

    train_iterator = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(train, batch_size, n_threads=20, shuffle=True), comm)
    # vali_iterator = chainermn.iterators.create_multi_node_iterator(
    #     chainer.iterators.MultithreadIterator(test, batch_size, repeat=False, shuffle=False, n_threads=20),
    #     comm)
    # train_iterator = ch.iterators.SerialIterator(train, batch_size, shuffle=True)
    # vali_iterator = ch.iterators.SerialIterator(test, batch_size, repeat=False, shuffle=False)
    model = CosmoFlow(comm)

    # print("Model Created successfully")
    ch.backends.cuda.get_device_from_id(device).use()
    model.to_gpu()  # Copy the model to the GPU

    optimizer = ch.optimizers.Adam()

    optimizer.setup(model)
    # Create the updater, using the optimizer
    updater = training.StandardUpdater(train_iterator, optimizer, device=device)

    # Set up a trainer
    trainer = training.Trainer(updater, (epochs, 'epoch'), out=out)
    # trainer.extend(extensions.Evaluator(vali_iterator, model, device=device))

    log_interval = (1, 'epoch')
    filename = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".log"

    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval, filename=filename))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport(['epoch', 'Validation loss', 'lr']), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=1))
        print("Starting Training ")
    trainer.run()
Example #15
def main():
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--dataset',
                        '-d',
                        default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate',
                        '-l',
                        type=float,
                        default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--early-stopping',
                        type=str,
                        help='Metric to watch for early stopping')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')
    model = L.Classifier(models.VGG.VGG(class_labels))
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    stop_trigger = (args.epoch, 'epoch')
    # Early stopping option
    if args.early_stopping:
        stop_trigger = triggers.EarlyStoppingTrigger(
            monitor=args.early_stopping,
            verbose=True,
            max_trigger=(args.epoch, 'epoch'))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=args.gpu)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot at each epoch
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
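# A minimal, self-contained sketch (not part of the example above) of how the
# optional --early-stopping flag maps to a stop trigger. The helper name
# make_stop_trigger and the metric string are illustrative assumptions.
from chainer.training import triggers


def make_stop_trigger(max_epoch, watched_metric=None):
    # With no metric to watch, training simply runs for max_epoch epochs.
    if not watched_metric:
        return (max_epoch, 'epoch')
    # Otherwise stop as soon as the monitored value stops improving, but
    # never run longer than max_epoch epochs.
    return triggers.EarlyStoppingTrigger(
        monitor=watched_metric,
        verbose=True,
        max_trigger=(max_epoch, 'epoch'))


# Usage: training.Trainer(updater, make_stop_trigger(300, 'validation/main/loss'))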
Example #16
def main():
    parser = argparse.ArgumentParser(
        description='ChainerMN example: pipelined neural network')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    if args.gpu:
        comm = chainermn.create_communicator('pure_nccl')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = -1

    if model_comm.size != 2:
        raise ValueError('This example can only be executed on an even '
                         'number of processes.')

    if comm.rank == 0:
        print('==========================================')
        if args.gpu:
            print('Using GPUs')
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    if data_axis == 0:
        model = L.Classifier(MLP0(model_comm, args.unit))
    elif data_axis == 1:
        model = MLP1(model_comm, args.unit, 10)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), data_comm)
    optimizer.setup(model)

    # The original dataset is loaded on workers 0 and 1.
    # The datasets of workers 0 and 1 are split and distributed to all workers.
    if model_axis == 0:
        train, test = chainer.datasets.get_mnist()
        if data_axis == 1:
            train = chainermn.datasets.create_empty_dataset(train)
            test = chainermn.datasets.create_empty_dataset(test)
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
    test = chainermn.scatter_dataset(test, data_comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  args.batchsize,
                                                  shuffle=False)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, data_comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for worker 0.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
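# A minimal sketch of the communicator splitting used in the example above:
# the world communicator is divided into a data-parallel axis and a
# model-parallel axis by rank parity. It assumes the script is launched with
# mpiexec; the 'naive' communicator is chosen so the sketch also runs on CPUs.
import chainermn

comm = chainermn.create_communicator('naive')
data_axis, model_axis = comm.rank % 2, comm.rank // 2
# Ranks that pass the same color end up in the same sub-communicator.
data_comm = comm.split(data_axis, comm.rank)    # peers holding the same model half
model_comm = comm.split(model_axis, comm.rank)  # the two halves of one pipelined model
print('world rank', comm.rank, '-> data rank', data_comm.rank,
      ', model rank', model_comm.rank)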
Example #17
def main():
    models = {
        'alexnet': AlexNet,
        'resnet': ResNet50,
        'vgg': VGG,
    }

    parser = argparse.ArgumentParser(description='Train ImageNet From Scratch')
    parser.add_argument('--model',
                        '-M',
                        choices=models.keys(),
                        default='alexnet',
                        help='Convnet model')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epochs',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--out',
                        '-o',
                        default='results',
                        help='Output directory')
    args = parser.parse_args()

    batch_size = args.batchsize
    epochs = args.epochs
    out = args.out

    # The start method of the multiprocessing module needs to be changed if
    # we are using InfiniBand and MultiprocessIterator.
    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process()
    p.start()
    p.join()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator("pure_nccl")
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num of GPUs : {}'.format(comm.size))
        print('Model :  {}'.format(args.model))
        print('Minibatch-size: {}'.format(batch_size))
        print('Epochs: {}'.format(args.epochs))
        print('==========================================')

    model = models[args.model](comm)

    chainer.backends.cuda.get_device_from_id(
        device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(MEAN_FILE)

    # All ranks load the data
    train = PreprocessedDataset(TRAIN, TRAINING_ROOT, mean, 226)
    val = PreprocessedDataset(VAL, VALIDATION_ROOT, mean, 226, False)

    # Create a multinode iterator such that each rank gets the same batch
    if comm.rank != 0:
        train = chainermn.datasets.create_empty_dataset(train)
        val = chainermn.datasets.create_empty_dataset(val)
    # Same dataset in all nodes
    train_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(train,
                                              args.batchsize,
                                              n_threads=40), comm)
    val_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(val,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_threads=40), comm)

    # We don't use a multi-node optimizer here as we don't do all-reduce on the final weights
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (epochs, 'iteration'), out)

    val_interval = (1, 'epoch')
    log_interval = (1, 'epoch')

    # Create an evaluator
    evaluator = extensions.Evaluator(val_iter, model, device=device)
    # Since I need to measure time per epoch, I avoid evaluation and just train the model.
    # By setting the evaluation interval high, this will not be triggered when I am running only a few epochs.
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))
        trainer.extend(extensions.ProgressBar(update_interval=10))
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch',
                                  filename='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch',
                filename='accuracy.png'))

    # TODO : Figure out how to send this report to a file

    if comm.rank == 0:
        print("Starting training .....")

    hook = CupyMemoryProfileHook()
    with hook:
        trainer.run()

    if comm.rank == 0:
        hook.print_report()
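# A minimal sketch of the CupyMemoryProfileHook pattern used above, applied to
# a single forward/backward pass instead of a full trainer run. It assumes a
# CUDA GPU with CuPy; the tiny classifier exists only for illustration.
import numpy as np
import chainer
import chainer.links as L
from chainer.function_hooks import CupyMemoryProfileHook

model = L.Classifier(L.Linear(None, 10))
model.to_gpu(0)
x = chainer.backends.cuda.to_gpu(np.random.rand(8, 100).astype(np.float32), 0)
t = chainer.backends.cuda.to_gpu(np.zeros(8, dtype=np.int32), 0)

hook = CupyMemoryProfileHook()
with hook:
    loss = model(x, t)   # forward pass (softmax cross entropy + accuracy)
    loss.backward()      # backward pass
hook.print_report()      # per-function GPU memory usage table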
Example #18
def main():
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError('ImageNet requires GPU support.')

    archs = [f'b{i}' for i in range(8)] + ['se']
    patchsizes = {
        'b0': 224,
        'b1': 240,
        'b2': 260,
        'b3': 300,
        'b4': 380,
        'b5': 456,
        'b6': 528,
        'b7': 600,
        'se': 224
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('--arch', '-a', choices=archs, default='b0')
    parser.add_argument('--patchsize',
                        default=None,
                        type=int,
                        help='The input size of images. If not specified, '
                             'architecture-wise default values will be used.')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--optimizer', default='RMSProp')
    parser.add_argument('--lr', default=0.256, type=float)
    parser.add_argument('--cosine_annealing', action='store_true')
    parser.add_argument('--exponent', type=float, default=0.97)
    parser.add_argument('--exponent_trigger', type=float, default=2.6)
    parser.add_argument('--soft_label', action='store_true')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=350,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        default=3,
                        help='Number of parallel data loading processes')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='../ssd/imagenet',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='Validation minibatch size')
    parser.add_argument('--workerwisebn', action='store_true')
    parser.add_argument('--no_dropconnect', action='store_true')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='pure_nccl')
    parser.add_argument('--no_autoaugment', action='store_true')
    parser.add_argument('--dtype',
                        default='float32',
                        choices=['mixed16', 'float32'],
                        help='For now do not use mixed16')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    chainer.global_config.dtype = args.dtype
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        mode = 'workerwise' if args.workerwisebn else 'synchronized'
        print(f'BatchNorm is {mode}')
        print('==========================================')

    if args.soft_label:
        accfun = soft_accuracy
        lossfun = soft_softmax_cross_entropy
    else:
        accfun = F.accuracy
        lossfun = F.softmax_cross_entropy

    if args.arch != 'se':
        model = EfficientNet(args.arch,
                             workerwisebn=args.workerwisebn,
                             no_dropconnect=args.no_dropconnect)
    else:
        model = SEResNeXt50()
    model = L.Classifier(model, lossfun=lossfun, accfun=accfun)
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.

    patchsize = patchsizes[
        args.arch] if args.patchsize is None else args.patchsize
    patchsize = (patchsize, patchsize)
    train_transform, val_transform, _ = get_transforms(
        patchsize, no_autoaugment=args.no_autoaugment, soft=args.soft_label)
    if comm.rank == 0:
        train = ImageNetDataset(args.root, 'train')
        val = ImageNetDataset(args.root, 'val')
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)
    train = chainer.datasets.TransformDataset(train, train_transform)
    val = chainer.datasets.TransformDataset(val, val_transform)

    # A workaround for process crashes should be applied before creating the
    # communicator above when using fork (e.g. MultiprocessIterator)
    # along with InfiniBand.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    symbol = 'lr'
    if args.optimizer.lower() == 'rmsprop':
        optimizer = chainer.optimizers.RMSprop(lr=args.lr, alpha=0.9)
    elif args.optimizer.lower() == 'momentumsgd':
        optimizer = chainer.optimizers.MomentumSGD(lr=args.lr)
    elif args.optimizer.lower() == 'corrected':
        optimizer = chainer.optimizers.CorrectedMomentumSGD(lr=args.lr)
    elif args.optimizer.lower() == 'adabound':
        optimizer = chainer.optimizers.AdaBound(alpha=args.lr, final_lr=0.5)
        symbol = 'alpha'
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-5))

    args.out = f'experiments/{args.arch}/{args.out}'
    save_args(args, args.out)
    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (2, 'epoch')
    log_interval = (10, 'iteration') if args.test else (2, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    if args.cosine_annealing:
        schedule = lr_schedules.CosineLRSchedule(args.lr)
        if args.optimizer.lower() in ['momentumsgd', 'corrected']:
            trainer.extend(lr_schedules.LearningRateScheduler(schedule))
    else:
        trainer.extend(extensions.ExponentialShift(symbol, args.exponent),
                       trigger=(args.exponent_trigger, 'epoch'))

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.snapshot_object(
            model, 'model_iter_{.updater.iteration}.npz'),
                       trigger=val_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=100))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
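# A minimal sketch of the learning-rate decay branch above: ExponentialShift
# multiplies an optimizer hyperparameter in place at every trigger, and for
# AdaBound the attribute is 'alpha' rather than 'lr'. The helper name and the
# decay values are illustrative assumptions.
from chainer.training import extensions


def add_exponential_decay(trainer, optimizer_name, rate=0.97, every_epochs=2):
    symbol = 'alpha' if optimizer_name.lower() == 'adabound' else 'lr'
    trainer.extend(extensions.ExponentialShift(symbol, rate),
                   trigger=(every_epochs, 'epoch'))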
Example #19
def main():
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--device',
                        '-d',
                        type=str,
                        default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--noplot',
                        dest='plot',
                        action='store_false',
                        help='Disable PlotReport extension')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu',
                       '-g',
                       dest='device',
                       type=int,
                       nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--compile',
                        action='store_true',
                        help='Compile the model')
    parser.add_argument('--dump_onnx',
                        action='store_true',
                        help='Dump ONNX model after optimization')
    parser.add_argument('--iterations',
                        '-I',
                        type=int,
                        default=None,
                        help='Number of iterations to train')
    parser.add_argument('--use-fake-data',
                        action='store_true',
                        help='Use fake data')
    parser.add_argument('--computation_order',
                        type=str,
                        default=None,
                        help='Computation order in backpropagation')
    parser.add_argument('--use_unified_memory',
                        dest='use_unified_memory',
                        action='store_true',
                        help='Use unified memory for large model')
    args = parser.parse_args()

    device = chainer.get_device(args.device)

    print('Device: {}'.format(device))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    mlp = MLP(args.unit, 10)
    if args.compile:
        if args.computation_order is None:
            translator = 'ch2o'
        else:
            translator = 'onnx_chainer'
        export_allocator = None
        runtime_allocator = None
        if args.use_unified_memory:
            import cupy
            # unified memory
            export_allocator = cupy.cuda.memory.malloc_managed
            runtime_allocator = cupy.get_default_memory_pool().malloc

        mlp = chainer_compiler.compile(
            mlp,
            dump_onnx=args.dump_onnx,
            translator=translator,
            computation_order=args.computation_order,
            export_allocator=export_allocator,
            runtime_allocator=runtime_allocator)
    model = L.Classifier(mlp)
    model.to_device(device)
    device.use()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    if args.use_fake_data:
        train, test = fake_dataset()
    else:
        train, test = chainer.datasets.get_mnist()

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    if args.iterations:
        stop_trigger = (args.iterations, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot for each specified epoch
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Save two plot images to the result dir
    if args.plot and extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch',
                                  file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch',
                file_name='accuracy.png'))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
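# A minimal sketch of the --device handling above: chainer.get_device accepts
# either an integer-like string (CuPy device id, or -1 for NumPy/CPU) or a
# ChainerX device specifier such as 'native:0' or 'cuda:0'. Specifiers whose
# backend is unavailable simply raise, which is caught here for illustration.
import chainer

for spec in ['-1', '0', 'native:0', 'cuda:0']:
    try:
        device = chainer.get_device(spec)
        print(spec, '->', device, '(array module:', device.xp.__name__ + ')')
    except Exception as exc:
        print(spec, '-> unavailable on this machine:', exc)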
Example #20
def main():
    parser = argparse.ArgumentParser(description='Chainer example: VAE')
    parser.add_argument('--initmodel',
                        '-m',
                        type=str,
                        help='Initialize the model from given file')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        help='Resume the optimization from snapshot')
    parser.add_argument('--device',
                        '-d',
                        type=str,
                        default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--out',
                        '-o',
                        default='results',
                        help='Directory to output the result')
    parser.add_argument('--epoch',
                        '-e',
                        default=100,
                        type=int,
                        help='number of epochs to learn')
    parser.add_argument('--dim-z',
                        '-z',
                        default=20,
                        type=int,
                        help='dimension of encoded vector')
    parser.add_argument('--dim-h',
                        default=500,
                        type=int,
                        help='dimension of hidden layer')
    parser.add_argument('--beta',
                        default=1.0,
                        type=float,
                        help='Regularization coefficient for '
                        'the second term of ELBO bound')
    parser.add_argument('--k',
                        '-k',
                        default=1,
                        type=int,
                        help='Number of Monte Carlo samples used in '
                        'encoded vector')
    parser.add_argument('--binary',
                        action='store_true',
                        help='Use binarized MNIST')
    parser.add_argument('--batch-size',
                        '-b',
                        type=int,
                        default=100,
                        help='learning minibatch size')
    parser.add_argument('--test',
                        action='store_true',
                        help='Use tiny datasets for quick tests')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu',
                       '-g',
                       dest='device',
                       type=int,
                       nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    if chainer.get_dtype() == np.float16:
        warnings.warn('This example may cause NaN in FP16 mode.',
                      RuntimeWarning)

    device = chainer.get_device(args.device)
    device.use()

    print('Device: {}'.format(device))
    print('# dim z: {}'.format(args.dim_z))
    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Prepare VAE model, defined in net.py
    encoder = net.make_encoder(784, args.dim_z, args.dim_h)
    decoder = net.make_decoder(784,
                               args.dim_z,
                               args.dim_h,
                               binary_check=args.binary)
    prior = net.make_prior(args.dim_z)
    avg_elbo_loss = net.AvgELBOLoss(encoder,
                                    decoder,
                                    prior,
                                    beta=args.beta,
                                    k=args.k)
    avg_elbo_loss.to_device(device)

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(avg_elbo_loss)

    # If initial parameters are given, initialize the model with them.
    if args.initmodel is not None:
        chainer.serializers.load_npz(args.initmodel, avg_elbo_loss)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist(withlabel=False)

    if args.binary:
        # Binarize dataset
        train = (train >= 0.5).astype(np.float32)
        test = (test >= 0.5).astype(np.float32)

    if args.test:
        train, _ = chainer.datasets.split_dataset(train, 100)
        test, _ = chainer.datasets.split_dataset(test, 100)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batch_size,
                                                 repeat=False,
                                                 shuffle=False)

    # Set up an updater. StandardUpdater can explicitly specify a loss function
    # used in the training with 'loss_func' option
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device,
                                                loss_func=avg_elbo_loss)

    # Set up the trainer and extensions.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        extensions.Evaluator(test_iter, avg_elbo_loss, device=device))
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/reconstr',
            'main/kl_penalty', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    # If snapshot file is given, resume the training.
    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()

    # Save images for demonstration
    save_images(device, encoder, decoder, train, test, prior, args.out)
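# A minimal sketch of the `loss_func` option used above: StandardUpdater will
# call any callable with the arrays of one mini-batch and treat its return
# value as the loss, which is how the wrapped ELBO link is driven. The toy
# regression model and data below are illustrative assumptions.
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training

model = L.Linear(3, 3)
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)


def mean_squared_loss(x, t):
    # Called by the updater as loss_func(x_batch, t_batch).
    return F.mean_squared_error(model(x), t)


data = [(np.random.rand(3).astype(np.float32),
         np.random.rand(3).astype(np.float32)) for _ in range(100)]
train_iter = chainer.iterators.SerialIterator(data, 10)
updater = training.updaters.StandardUpdater(
    train_iter, optimizer, loss_func=mean_squared_loss)
training.Trainer(updater, (1, 'epoch')).run()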
Example #21
def main():
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--dataset',
                        default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate',
                        '-l',
                        type=float,
                        default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--device',
                        '-d',
                        type=str,
                        default='0',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--early-stopping',
                        type=str,
                        help='Metric to watch for early stopping')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu',
                       '-g',
                       dest='device',
                       type=int,
                       nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    if chainer.get_dtype() == numpy.float16:
        warnings.warn('This example may cause NaN in FP16 mode.',
                      RuntimeWarning)

    device = chainer.get_device(args.device)
    if device.xp is chainerx:
        sys.stderr.write('This example does not support ChainerX devices.\n')
        sys.exit(1)

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    device.use()

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')
    model = L.Classifier(models.VGG.VGG(class_labels))
    model.to_device(device)

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    stop_trigger = (args.epoch, 'epoch')
    # Early stopping option
    if args.early_stopping:
        stop_trigger = triggers.EarlyStoppingTrigger(
            monitor=args.early_stopping,
            verbose=True,
            max_trigger=(args.epoch, 'epoch'))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    # TODO(hvy): Support ChainerX
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot at each epoch
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
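# A minimal sketch of the snapshot/--resume pair used above: a snapshot taken
# by extensions.snapshot() serializes the whole trainer state (updater,
# optimizer, extensions), and load_npz restores it so training continues where
# it stopped. The helper name and the snapshot file name are hypothetical.
import os
import chainer
from chainer.training import extensions


def extend_with_resume(trainer, out_dir, snapshot_file='snapshot_iter_1000'):
    trainer.extend(extensions.snapshot(), trigger=(1, 'epoch'))
    path = os.path.join(out_dir, snapshot_file)
    if os.path.exists(path):
        # Resuming restores the iteration count, optimizer state and weights.
        chainer.serializers.load_npz(path, trainer)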
Example #22
def main():
    parser = argparse.ArgumentParser(description='ChainerMN example: VGG16')
    parser.add_argument('--dataset',
                        '-d',
                        default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--learnrate',
                        '-l',
                        type=float,
                        default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu',
                        '-g',
                        action='store_true',
                        default=False,
                        help='use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--noplot',
                        dest='plot',
                        action='store_false',
                        help='Disable PlotReport extension')
    args = parser.parse_args()

    # Create ChainerMN communicator.
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        device = comm.rank
    else:
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('GPU: {}'.format(args.gpu))
        print('# Minibatch-size: {}'.format(args.batchsize))
        print('# epoch: {}'.format(args.epoch))
        print('')

    # Load the CIFAR10 dataset
    if args.dataset == 'cifar10':
        class_labels = 10
        train, test = chainer.datasets.get_cifar10()
    elif args.dataset == 'cifar100':
        class_labels = 100
        train, test = chainer.datasets.get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    model = L.Classifier(VGG.VGG(comm, class_labels))

    if args.gpu:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    if comm.rank != 0:
        train = chainermn.datasets.create_empty_dataset(train)
        test = chainermn.datasets.create_empty_dataset(test)

    train_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.SerialIterator(train, args.batchsize), comm)
    test_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.SerialIterator(test,
                                         args.batchsize,
                                         repeat=False,
                                         shuffle=False), comm)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    if comm.rank == 0:
        # Dump a computational graph from 'loss' variable
        # The "main" refers to the target link of the "main" optimizer.
        trainer.extend(extensions.DumpGraph('main/loss'))

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())

        # Save two plot images to the result dir
        if args.plot and extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch',
                                      file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(
                    ['main/accuracy', 'validation/main/accuracy'],
                    'epoch',
                    file_name='accuracy.png'))

        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))

        trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()
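# A minimal sketch of the dataset/iterator pattern above for model-parallel
# training: only rank 0 holds real data, the other ranks hold an empty
# placeholder of the same length, and create_multi_node_iterator broadcasts
# rank 0's batches so every rank sees identical mini-batches. Assumes the
# script is launched with mpiexec.
import chainer
import chainermn

comm = chainermn.create_communicator('naive')
train, _ = chainer.datasets.get_mnist()
if comm.rank != 0:
    train = chainermn.datasets.create_empty_dataset(train)
train_iter = chainermn.iterators.create_multi_node_iterator(
    chainer.iterators.SerialIterator(train, 64), comm)
batch = next(train_iter)  # the same 64 examples on every rank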
Example #23
def main():
    parser = argparse.ArgumentParser(description='Chainer example: VAE')
    parser.add_argument('--initmodel',
                        '-m',
                        type=str,
                        help='Initialize the model from given file')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        help='Resume the optimization from snapshot')
    parser.add_argument('--gpu',
                        '-g',
                        default=-1,
                        type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='results',
                        help='Directory to output the result')
    parser.add_argument('--epoch',
                        '-e',
                        default=100,
                        type=int,
                        help='number of epochs to learn')
    parser.add_argument('--dim-z',
                        '-z',
                        default=20,
                        type=int,
                        help='dimension of encoded vector')
    parser.add_argument('--dim-h',
                        default=500,
                        type=int,
                        help='dimension of hidden layer')
    parser.add_argument('--beta',
                        default=1.0,
                        type=float,
                        help='Regularization coefficient for '
                        'the second term of ELBO bound')
    parser.add_argument('--k',
                        '-k',
                        default=1,
                        type=int,
                        help='Number of Monte Carlo samples used in '
                        'encoded vector')
    parser.add_argument('--binary',
                        action='store_true',
                        help='Use binarized MNIST')
    parser.add_argument('--batch-size',
                        '-b',
                        type=int,
                        default=100,
                        help='learning minibatch size')
    parser.add_argument('--test',
                        action='store_true',
                        help='Use tiny datasets for quick tests')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# dim z: {}'.format(args.dim_z))
    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Prepare VAE model, defined in net.py
    encoder = net.make_encoder(784, args.dim_z, args.dim_h)
    decoder = net.make_decoder(784,
                               args.dim_z,
                               args.dim_h,
                               binary_check=args.binary)
    prior = net.make_prior(args.dim_z)
    avg_elbo_loss = net.AvgELBOLoss(encoder,
                                    decoder,
                                    prior,
                                    beta=args.beta,
                                    k=args.k)
    if args.gpu >= 0:
        avg_elbo_loss.to_gpu(args.gpu)

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(avg_elbo_loss)

    # Initialize
    if args.initmodel is not None:
        chainer.serializers.load_npz(args.initmodel, avg_elbo_loss)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist(withlabel=False)

    if args.binary:
        # Binarize dataset
        train = (train >= 0.5).astype(np.float32)
        test = (test >= 0.5).astype(np.float32)

    if args.test:
        train, _ = chainer.datasets.split_dataset(train, 100)
        test, _ = chainer.datasets.split_dataset(test, 100)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batch_size,
                                                 repeat=False,
                                                 shuffle=False)

    # Set up an updater. StandardUpdater can explicitly specify a loss function
    # used in the training with 'loss_func' option
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=args.gpu,
                                                loss_func=avg_elbo_loss)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        extensions.Evaluator(test_iter, avg_elbo_loss, device=args.gpu))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/reconstr',
            'main/kl_penalty', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()

    # Visualize the results
    def save_images(x, filename):
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(3, 3, figsize=(9, 9), dpi=100)
        for ai, xi in zip(ax.flatten(), x):
            ai.imshow(xi.reshape(28, 28))
        fig.savefig(filename)

    avg_elbo_loss.to_cpu()
    train_ind = [1, 3, 5, 10, 2, 0, 13, 15, 17]
    x = chainer.Variable(np.asarray(train[train_ind]))
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        x1 = decoder(encoder(x).mean, inference=True).mean
    save_images(x.array, os.path.join(args.out, 'train'))
    save_images(x1.array, os.path.join(args.out, 'train_reconstructed'))

    test_ind = [3, 2, 1, 18, 4, 8, 11, 17, 61]
    x = chainer.Variable(np.asarray(test[test_ind]))
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        x1 = decoder(encoder(x).mean, inference=True).mean
    save_images(x.array, os.path.join(args.out, 'test'))
    save_images(x1.array, os.path.join(args.out, 'test_reconstructed'))

    # draw images from randomly sampled z
    z = prior().sample(9)
    x = decoder(z, inference=True).mean
    save_images(x.array, os.path.join(args.out, 'sampled'))
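# A minimal sketch of the inference pattern used when reconstructing images
# above: wrapping evaluation code in chainer.using_config('train', False)
# switches links such as dropout and batch normalization to test mode, and
# chainer.no_backprop_mode() skips building the backward graph. The small
# linear model is only for illustration.
import numpy as np
import chainer
import chainer.links as L

model = L.Linear(784, 10)
x = np.random.rand(1, 784).astype(np.float32)
with chainer.using_config('train', False), chainer.no_backprop_mode():
    y = model(x)
print(y.array.shape)  # (1, 10); y carries no gradient history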
Example #24
def main():
    # This script is almost identical to train_mnist.py. The only difference is
    # that this script uses data-parallel computation on two GPUs.
    # See train_mnist.py for more details.
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=400,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu0',
                        '-g',
                        type=int,
                        default=0,
                        help='First GPU ID')
    parser.add_argument('--gpu1',
                        '-G',
                        type=int,
                        default=1,
                        help='Second GPU ID')
    parser.add_argument('--out',
                        '-o',
                        default='result_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    args = parser.parse_args()

    print('GPU: {}, {}'.format(args.gpu0, args.gpu1))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    chainer.backends.cuda.get_device_from_id(args.gpu0).use()

    model = L.Classifier(train_mnist.MLP(args.unit, 10))
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # ParallelUpdater implements the data-parallel gradient computation on
    # multiple GPUs. It accepts "devices" argument that specifies which GPU to
    # use.
    updater = training.updaters.ParallelUpdater(
        train_iter,
        optimizer,
        # The device named 'main' is used as a "master", while the others are
        # used as slaves. Names other than 'main' are arbitrary.
        devices={
            'main': args.gpu0,
            'second': args.gpu1
        },
    )
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu0))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example #25
    # Read the documentation carefully

    # Create the optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Create the updater
    updater = training.updaters.StandardUpdater(train_itr,
                                                optimizer,
                                                device=-1)

    # Create the trainer
    trainer = training.Trainer(updater, (20, 'epoch'), out='results')
    trainer.extend(extensions.Evaluator(test_itr, model,
                                        device=-1))  # Evaluate on the test dataset
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(20, 'epoch'))
    trainer.extend(extensions.LogReport())
    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch',
                                  filename='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch',
                filename='accuracy.png'))

    # Run the training
    trainer.run()
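# The fragment above begins mid-function: `model`, `train_itr` and `test_itr`
# are never defined in it. A minimal sketch of the setup it appears to assume
# (an MNIST classifier) could look like this; the network shape and batch size
# are guesses, not part of the original example.
import chainer
import chainer.functions as F
import chainer.links as L


class MLP(chainer.Chain):

    def __init__(self, n_units=100, n_out=10):
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, n_units)
            self.l2 = L.Linear(None, n_out)

    def forward(self, x):
        return self.l2(F.relu(self.l1(x)))


model = L.Classifier(MLP())
train, test = chainer.datasets.get_mnist()
train_itr = chainer.iterators.SerialIterator(train, 100)
test_itr = chainer.iterators.SerialIterator(test, 100, repeat=False, shuffle=False)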
Example #26
def main():
    # These two lines help with memory. If they are not included, training runs out of memory.
    # Use them until you find the real reason why it is running out of memory.

    pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
    cp.cuda.set_allocator(pool.malloc)
    chainer.disable_experimental_feature_warning = True

    parser = argparse.ArgumentParser(
        description='CosmoFlow Multi-Node Training')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epochs',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--out',
                        '-o',
                        default='results',
                        help='Output directory')
    args = parser.parse_args()

    batch_size = args.batchsize
    epochs = args.epochs
    out = args.out

    # Prepare the communicators.
    comm = chainermnx.create_communicator("spatial_hybrid_nccl")
    local_comm = create_local_comm(comm)

    data_comm = create_data_comm(comm)
    device = comm.intra_rank

    if local_comm.rank == 0:
        if data_comm.rank == 0:
            train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
            # train, val = datasets.split_dataset_random(training_data, first_size=(int(training_data.__len__() * 0.80)))

        else:
            train = None
            #val = None
        train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
        # val = chainermn.scatter_dataset(val, data_comm, shuffle=True)
    else:
        train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
        train = chainermn.datasets.create_empty_dataset(train)
        # val = chainermn.datasets.create_empty_dataset(val)

    train_iterator = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(train,
                                              batch_size,
                                              n_threads=20,
                                              shuffle=True), local_comm)
    # vali_iterator = chainermn.iterators.create_multi_node_iterator(
    #     chainer.iterators.MultithreadIterator(val, batch_size, repeat=False, shuffle=False, n_threads=20),
    #     local_comm)

    model = CosmoFlow(local_comm)
    # model = L.Classifier(model, lossfun=F.mean_squared_error, accfun=F.mean_squared_error)

    # print("Model Created successfully")
    ch.backends.cuda.get_device_from_id(device).use()
    model.to_gpu()  # Copy the model to the GPU

    optimizer = chainermnx.create_hybrid_multi_node_optimizer_alpha(
        chainer.optimizers.Adam(), data_comm, local_comm)

    optimizer.setup(model)
    # Create the updater, using the optimizer
    updater = training.StandardUpdater(train_iterator,
                                       optimizer,
                                       device=device)

    # Set up a trainer
    trainer = training.Trainer(updater, (epochs, 'epoch'), out=out)
    # trainer.extend(extensions.Evaluator(vali_iterator, model, device=device))

    filename = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".log"
    log_interval = (1, 'epoch')
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(
            extensions.LogReport(trigger=log_interval, filename=filename))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch',
                                  filename='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch',
                filename='accuracy.png'))

        trainer.extend(extensions.ProgressBar(update_interval=1))
        print("Starting Training ")

    trainer.run()
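
For reference, the chainermnx hybrid communicator and optimizer used above follow the same overall shape as the plain ChainerMN data-parallel recipe. Below is a minimal sketch of that standard pattern, assuming a working MPI/NCCL setup; the dataset, model, and hyperparameters are placeholders, and none of the chainermnx-specific hybrid/spatial logic is reproduced here.

import chainer
import chainer.functions as F
import chainer.links as L
import chainermn
from chainer import training
from chainer.training import extensions


class TinyMLP(chainer.Chain):
    # Placeholder model; the real example above uses the CosmoFlow network.
    def __init__(self):
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, 100)
            self.l2 = L.Linear(None, 10)

    def forward(self, x):
        return self.l2(F.relu(self.l1(x)))


def main():
    # One MPI process per GPU; intra_rank selects the GPU on each node.
    comm = chainermn.create_communicator('pure_nccl')
    device = comm.intra_rank

    # Rank 0 loads the dataset once; it is then scattered to all workers.
    if comm.rank == 0:
        train, _ = chainer.datasets.get_mnist()
    else:
        train = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)

    model = L.Classifier(TinyMLP())
    chainer.backends.cuda.get_device_from_id(device).use()
    model.to_gpu()

    # The multi-node optimizer all-reduces gradients across workers.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(train, 32)
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (10, 'epoch'), out='results')
    if comm.rank == 0:
        trainer.extend(extensions.LogReport())
    trainer.run()
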
Example #27
0
def main():
    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    dtypes = {
        'float16': np.float16,
        'float32': np.float32,
        'float64': np.float64,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--dtype',
                        choices=dtypes,
                        help='Specify the dtype '
                        'used. If not supplied, the default dtype is used')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--device',
                        '-d',
                        type=str,
                        default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu',
                       '-g',
                       type=int,
                       nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = parse_device(args)

    # Set the dtype if supplied.
    if args.dtype is not None:
        chainer.config.dtype = args.dtype

    print('Device: {}'.format(device))
    print('Dtype: {}'.format(chainer.config.dtype))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)
    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI does not seem to be available on your system.')
        if not isinstance(device, chainer.backend.cuda.GpuDevice):
            raise RuntimeError('Using DALI requires a GPU device. Please '
                               'specify one with the --device option.')
        num_threads = args.loaderjob
        if num_threads is None or num_threads <= 0:
            num_threads = 1
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
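        # Per-channel means taken from the (3, H, W) mean image; the DALI
        # pipelines below normalize with these means and a per-channel std
        # of 255.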
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(args.train,
                                                 args.root,
                                                 model.insize,
                                                 args.batchsize,
                                                 num_threads,
                                                 device.device.id,
                                                 True,
                                                 mean=ch_mean,
                                                 std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(args.val,
                                             args.root,
                                             model.insize,
                                             args.val_batchsize,
                                             num_threads,
                                             device.device.id,
                                             False,
                                             mean=ch_mean,
                                             std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        # converter = dali_converter
        converter = dali_util.DaliConverter(mean=mean, crop_size=model.insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
        # These iterators load the images with subprocesses running in parallel
        # to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=converter,
                                                device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = (1 if args.test else 1000), 'iteration'

    trainer.extend(extensions.Evaluator(val_iter,
                                        model,
                                        converter=converter,
                                        device=device),
                   trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
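
The parse_device helper called near the top of the example above is defined elsewhere in the source file and is not part of this listing. A rough, hypothetical sketch of its job, assuming it only bridges the deprecated --gpu flag to chainer.get_device:

import chainer


def parse_device(args):
    # Hypothetical sketch; the real helper lives elsewhere in the example file.
    # The deprecated --gpu flag, when given, takes precedence.
    if args.gpu is not None:
        return chainer.get_device(args.gpu)
    # Otherwise --device holds an integer or a ChainerX device specifier
    # such as 'native:0' or 'cuda:0'.
    return chainer.get_device(args.device)
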
Example #28
0
def main():
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        default=20,
                        type=int,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--out',
                        '-o',
                        default='result_model_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        default=1000,
                        type=int,
                        help='Number of units')
    parser.add_argument('--device0',
                        '-d',
                        type=str,
                        default='0',
                        help='Device specifier of the first device. '
                        'Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--device1',
                        '-D',
                        type=str,
                        default='1',
                        help='Device specifier of the second device. '
                        'Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu0',
                       '-g',
                       dest='device0',
                       type=int,
                       nargs='?',
                       const=0,
                       help='First GPU ID')
    group.add_argument('--gpu1',
                       '-G',
                       dest='device1',
                       type=int,
                       nargs='?',
                       const=1,
                       help='Second GPU ID')
    args = parser.parse_args()
    device0 = chainer.get_device(args.device0)
    device1 = chainer.get_device(args.device1)

    print('Devices: {}, {}'.format(device0, device1))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # See train_mnist.py for the meaning of these lines

    model = L.Classifier(ParallelMLP(args.unit, 10, device0, device1))
    device0.use()

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train, test = chainer.datasets.get_mnist()

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                input_device=device0)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=device0))
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device0.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
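
The ParallelMLP link used above is imported from elsewhere and is not shown in this listing. Below is a minimal, hypothetical sketch of the model-parallel idea behind it, assuming a simple split in which one layer lives on each device and activations cross the boundary with F.copy; it is not the actual implementation.

import chainer
import chainer.functions as F
import chainer.links as L


class TwoDeviceMLP(chainer.Chain):
    # Hypothetical two-device model-parallel chain in the spirit of the
    # ParallelMLP used above.

    def __init__(self, n_units, n_out, device0, device1):
        super().__init__()
        self.device0 = device0
        self.device1 = device1
        with self.init_scope():
            self.l1 = L.Linear(None, n_units)  # runs on device0
            self.l2 = L.Linear(None, n_out)    # runs on device1
        self.l1.to_device(device0)
        self.l2.to_device(device1)

    def forward(self, x):
        # First layer on device0, where the input mini-batch arrives.
        h = F.relu(self.l1(x))
        # Move activations to device1; F.copy is differentiable, so the
        # backward pass crosses the device boundary as well.
        h = F.copy(h, self.device1)
        y = self.l2(h)
        # Bring the output back to device0, where the loss is computed.
        return F.copy(y, self.device0)
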
Example #29
0
def train():
    device = chainer.get_device(device_id)
    device.use()

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(batchsize))
    print('# epoch: {}'.format(epoch))
    print('')

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')
    model = L.Classifier(models.VGG.VGG(class_labels))
    model.to_device(device)

    optimizer = chainer.optimizers.MomentumSGD(learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    stop_trigger = (epoch, 'epoch')

    # Set up a trainer
    out = './result'
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, stop_trigger, out=out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    # TODO(imanishi): Support for ChainerX
    if not isinstance(device, backend.ChainerxDevice):
        trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot at each epoch
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Run the training
    trainer.run()
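
The ExponentialShift extension above halves the learning rate each time its (25, 'epoch') trigger fires, so the effective rate follows a simple step schedule. A small sketch of the resulting value (approximate right at the epoch boundaries, and using an illustrative initial learnrate):

# Step schedule implied by ExponentialShift('lr', 0.5) with a (25, 'epoch') trigger.
def lr_at_epoch(epoch, learnrate):
    return learnrate * 0.5 ** (epoch // 25)

# e.g. lr_at_epoch(10, 0.05) == 0.05, lr_at_epoch(30, 0.05) == 0.025,
# lr_at_epoch(60, 0.05) == 0.0125
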
Example #30
0
def main():
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--iterations',
                        '-I',
                        type=int,
                        default=0,
                        help='Number of iterations to train')
    parser.add_argument('--device',
                        '-d',
                        type=str,
                        default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu',
                       '-g',
                       dest='device',
                       type=int,
                       nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--compile',
                        action='store_true',
                        help='Compile the model')
    parser.add_argument('--dump_onnx',
                        action='store_true',
                        help='Dump ONNX model after optimization')
    args = parser.parse_args()

    chainer.config.autotune = True
    chainer.config.cudnn_fast_batch_normalization = True

    device = chainer.get_device(args.device)

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(args.batchsize))
    if args.iterations:
        print('# iterations: {}'.format(args.iterations))
    else:
        print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    insize = model.insize
    if args.compile:
        model = chainer_compiler.compile(model, dump_onnx=args.dump_onnx)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)
    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI does not seem to be available on your system.')
        num_threads = args.loaderjob
        if num_threads is None or num_threads <= 0:
            num_threads = 1
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(args.train,
                                                 args.root,
                                                 insize,
                                                 args.batchsize,
                                                 num_threads,
                                                 args.gpu,
                                                 True,
                                                 mean=ch_mean,
                                                 std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(args.val,
                                             args.root,
                                             insize,
                                             args.val_batchsize,
                                             num_threads,
                                             args.gpu,
                                             False,
                                             mean=ch_mean,
                                             std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        # converter = dali_converter
        converter = dali_util.DaliConverter(mean=mean, crop_size=insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, insize)
        val = PreprocessedDataset(args.val, args.root, mean, insize, False)
        # These iterators load the images with subprocesses running in parallel
        # to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=converter,
                                                device=device)
    if args.iterations:
        stop_trigger = (args.iterations, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = ((1 if args.test else 10 if args.iterations else 1000),
                    'iteration')

    trainer.extend(extensions.Evaluator(val_iter,
                                        model,
                                        converter=converter,
                                        device=device),
                   trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    cuda_hook = function_hooks.CUDAProfileHook()
    with cuda_hook:
        trainer.run()

    with open('%s/log' % args.out) as f:
        logs = json.load(f)
    elapsed_times = []
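    # Average per-iteration time between consecutive LogReport entries; taking
    # differences keeps the one-off startup cost in the first entry out of the
    # average.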
    for prev, cur in zip(logs, logs[1:]):
        iters = cur['iteration'] - prev['iteration']
        elapsed = cur['elapsed_time'] - prev['elapsed_time']
        elapsed_times.append(elapsed / iters)
    sec_per_iter = sum(elapsed_times) / len(elapsed_times)
    print(sec_per_iter * 1000, 'msec/iter')
    print(args.batchsize / sec_per_iter, 'images/sec')