def main():
    """Train a binary classifier on the mushroom CSV and print one sample prediction.

    Reads ``mushrooms.csv`` (header row skipped), integer-encodes every
    categorical column, trains an MLP with sigmoid cross entropy for 50
    epochs on CPU, then classifies one random held-out sample.
    """
    # Load the whole CSV as strings; the first row is a header.
    raw = np.genfromtxt("mushrooms.csv", delimiter=',', dtype=str, skip_header=1)
    n_samples, n_columns = raw.shape

    # Label-encode each categorical column in place (values become "0", "1", ...).
    for col in range(n_columns):
        raw[:, col] = np.unique(raw[:, col], return_inverse=True)[1]

    # Column 0 is the edible/poisonous label; the rest are features.
    features = raw[:, 1:].astype(np.float32)
    labels = raw[:, 0].astype(np.int32)[:, None]

    # 70/30 random train/test split.
    train, test = datasets.split_dataset_random(
        datasets.TupleDataset(features, labels), int(n_samples * 0.7))
    train_iter = ch.iterators.SerialIterator(train, 100)
    test_iter = ch.iterators.SerialIterator(
        test, 100, repeat=False, shuffle=False)

    # Single-logit binary classification head.
    model = L.Classifier(
        MLP(44, 1), lossfun=F.sigmoid_cross_entropy, accfun=F.binary_accuracy)
    optimizer = ch.optimizers.SGD().setup(model)

    updater = training.StandardUpdater(train_iter, optimizer, device=-1)
    trainer = training.Trainer(updater, (50, 'epoch'), out='result')

    trainer.extend(extensions.Evaluator(test_iter, model, device=-1))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(
        extensions.snapshot(filename='trainer_epoch_{.updater.epoch}'),
        trigger=(10, 'epoch'))
    trainer.extend(extensions.LogReport())
    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.run()

    # Classify one random held-out sample; a non-negative logit means the
    # sigmoid output is >= 0.5, i.e. the "Poisonous" class.
    x, t = test[np.random.randint(len(test))]
    logit = model.predictor(x[None]).array[0][0]
    actual = ['Edible', 'Poisonous'][t[0]]
    if logit >= 0:
        print('Predicted Poisonous, Actual ' + actual)
    else:
        print('Predicted Edible, Actual ' + actual)
def main():
    """Train an MLP classifier on MNIST for 10 epochs using the Trainer loop."""
    train, test = datasets.mnist.get_mnist()

    batchsize = 128
    train_iter = iterators.SerialIterator(train, batchsize)
    test_iter = iterators.SerialIterator(test, batchsize, False, False)

    # gpu_id 0 selects the first GPU; -1 would keep everything on CPU.
    gpu_id = 0
    net = MLP()
    if gpu_id != -1:
        net.to_gpu(gpu_id)

    max_epoch = 10
    model = L.Classifier(net)

    optimizer = optimizers.MomentumSGD()
    optimizer.setup(model)

    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=gpu_id)
    trainer = training.Trainer(updater, (max_epoch, 'epoch'), out='mnist_out')

    # Logging, periodic snapshots of both trainer state and bare model,
    # validation, console report, loss/accuracy plots and the graph dump.
    trainer.extend(ext.LogReport())
    trainer.extend(ext.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
    trainer.extend(ext.snapshot_object(
        model.predictor, filename='model_epoch-{.updater.epoch}'))
    trainer.extend(ext.Evaluator(test_iter, model, device=gpu_id))
    trainer.extend(ext.PrintReport(
        ['epoch', 'main/loss', 'main/accuracy',
         'validation/main/loss', 'validation/main/accuracy', 'elapsed_time']))
    trainer.extend(ext.PlotReport(
        ['main/loss', 'validation/main/loss'],
        x_key='epoch', file_name='loss.png'))
    trainer.extend(ext.PlotReport(
        ['main/accuracy', 'validation/main/accuracy'],
        x_key='epoch', file_name='accuracy.png'))
    trainer.extend(ext.DumpGraph('main/loss'))

    trainer.run()
def train(X, y, batch_size=256, max_epoch=20, gpu_id=0):
    """Train the regression NN on (X, y) with MSE loss and R^2 "accuracy".

    Args:
        X: Feature matrix.
        y: Target matrix; the number of output units is taken from its
            second dimension.
        batch_size: Mini-batch size for both iterators.
        max_epoch: Number of training epochs.
        gpu_id: GPU device id; a negative value keeps the model on CPU.
    """
    n_out = y.shape[1]

    # 75/25 random split, wrapped into the project's dataset type.
    train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.75)
    dataset_train = first_dataset(train_X, train_y)
    dataset_test = first_dataset(test_X, test_y)

    train_iter = chainer.iterators.SerialIterator(dataset_train, batch_size)
    test_iter = chainer.iterators.SerialIterator(
        dataset_test, batch_size, False, False)

    net = NN(n_out)
    if gpu_id >= 0:
        net.to_gpu(gpu_id)

    # Regression: mean squared error loss, R^2 score reported as accuracy.
    model = L.Classifier(net, lossfun=F.mean_squared_error, accfun=F.r2_score)

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    updater = chainer.training.updaters.StandardUpdater(
        train_iter, optimizer, device=gpu_id)
    trainer = chainer.training.Trainer(
        updater, (max_epoch, 'epoch'), out='first_result')

    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.Evaluator(test_iter, model, device=gpu_id))
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
    trainer.extend(
        extensions.snapshot_object(
            model.predictor, filename='first_model_epoch-{.updater.epoch}'))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              x_key='epoch', file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'],
                              x_key='epoch', file_name='accuracy.png'))
    trainer.extend(extensions.DumpGraph('main/loss'))

    trainer.run()
def train():
    """Train an MLP classifier on MNIST, using a GPU when CUDA is available.

    Runs 10 epochs, logging loss/accuracy, snapshotting the trainer and the
    model each epoch, and plotting loss and accuracy curves to
    ``mnist_result/``.
    """
    train, test = mnist.get_mnist()

    batchsize = 128
    train_iter = iterators.SerialIterator(train, batchsize)
    test_iter = iterators.SerialIterator(
        test, batchsize, shuffle=False, repeat=False)

    model = L.Classifier(MLP())

    # CPU by default; move to GPU 0 when CUDA is present.
    device = -1
    max_epoch = 10
    if chainer.backends.cuda.available:
        device = 0
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)

    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (max_epoch, 'epoch'),
                               out="mnist_result")

    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.snapshot(filename="snapshot_epoch-{.updater.epoch}"))
    trainer.extend(
        extensions.snapshot_object(model,
                                   filename="model_epoch-{.updater.epoch}"))
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))
    trainer.extend(
        extensions.PrintReport([
            "epoch", "main/loss", "main/accuracy", "validation/main/loss",
            "validation/main/accuracy", "elapsed_time"
        ]))
    trainer.extend(
        extensions.PlotReport(["main/loss", "validation/main/loss"],
                              x_key="epoch", file_name="loss.png"))
    # BUGFIX: the accuracy plot filename was missing its ".png" extension,
    # inconsistent with the loss plot above.
    trainer.extend(
        extensions.PlotReport(["main/accuracy", "validation/main/accuracy"],
                              x_key="epoch", file_name="accuracy.png"))
    trainer.extend(extensions.DumpGraph("main/loss"))

    trainer.run()
def main(args):
    """Train a GAN on unlabeled MNIST images.

    Args:
        args: Parsed CLI options providing ``b`` (batch size), ``z``
            (latent dimension), ``e`` (epochs), ``r`` (result directory)
            and ``save_model`` (whether to snapshot generator/discriminator).
    """
    # Only the training images, no labels, as 3-d arrays (1, 28, 28).
    mnist_train = chainer.datasets.get_mnist(ndim=3, withlabel=False)[0]
    train_iter = iterators.SerialIterator(
        mnist_train, args.b, shuffle=True, repeat=True)

    model = GAN(args.z)
    # Use iDeep acceleration unless it is explicitly disabled.
    if chainer.config.use_ideep != "never":
        model.to_intel64()

    optimizer = optimizers.Adam(alpha=0.0002, beta1=0.5, beta2=0.9)
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter, optimizer)
    trainer = training.Trainer(
        updater, stop_trigger=(args.e, "epoch"), out=args.r)

    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(["main/loss_real", "main/loss_fake", "epoch"]))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(
        extensions.PlotReport(["main/loss_real", "main/loss_fake"],
                              filename="loss.pdf"))
    # Periodically render generator samples to the result directory.
    trainer.extend(ext_save_img(model.gen, args.r, args.z))
    trainer.extend(extensions.DumpGraph("main/loss_fake"))

    if args.save_model:
        # Snapshot both networks every 10 epochs.
        trainer.extend(extensions.snapshot_object(
            model.gen, "gen_epoch_{.updater.epoch:04d}.npz"),
            trigger=(10, "epoch"))
        trainer.extend(extensions.snapshot_object(
            model.dis, "dis_epoch_{.updater.epoch:04d}.npz"),
            trigger=(10, "epoch"))

    trainer.run()
def main():
    """ChainerMN MNIST training with automatic multi-node checkpointing."""
    parser = argparse.ArgumentParser(description='''\
ChainerMN example: MNIST with automatic checkpoints enabled''')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--run-id', type=str, default='train-mnist-example',
                        help='ID of the task name')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    if args.gpu:
        # 'naive' is CPU-only; any other communicator requires a GPU.
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    # Only rank 0 prints the run configuration.
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole
    # dataset; its datasets are evenly split and scattered to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Enable the checkpointer and recover from a checkpoint if one exists,
    # so interrupted runs resume instead of restarting from scratch.
    checkpointer = create_multi_node_checkpointer(name=args.run_id, comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    print("Rank", comm.rank, ": (Re)Starting from (epoch, iter) =",
          (trainer.updater.epoch, trainer.updater.iteration))
    trainer.extend(checkpointer, trigger=(1000, 'iteration'))

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Display/output extensions only run on one worker to avoid repeated
    # output from every rank.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss',
                'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
def main():
    """Data-parallel MNIST training on two devices via ParallelUpdater.

    This script mirrors train_mnist.py except that gradients are computed
    in parallel on two devices; see train_mnist.py for details.
    """
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=400,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--out', '-o', default='result_data_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--device0', '-d', type=str, default='0',
                        help='Device specifier of the first device. '
                             'Either ChainerX device '
                             'specifier or an integer. If non-negative integer, '
                             'CuPy arrays with specified device id are used. If '
                             'negative integer, NumPy arrays are used')
    parser.add_argument('--device1', '-D', type=str, default='1',
                        help='Device specifier of the second device. '
                             'Either ChainerX device '
                             'specifier or an integer. If non-negative integer, '
                             'CuPy arrays with specified device id are used. If '
                             'negative integer, NumPy arrays are used')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu0', '-g', dest='device0', type=int, nargs='?',
                       const=0, help='First GPU ID')
    group.add_argument('--gpu1', '-G', dest='device1', type=int, nargs='?',
                       const=1, help='Second GPU ID')
    args = parser.parse_args()

    device0 = chainer.get_device(args.device0)
    device1 = chainer.get_device(args.device1)

    print('Devices: {}, {}'.format(device0, device1))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    device0.use()

    model = L.Classifier(train_mnist.MLP(args.unit, 10))

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    # ParallelUpdater implements data-parallel gradient computation across
    # the given devices. 'main' acts as the master; other names ('second')
    # are arbitrary labels for slave devices.
    updater = training.updaters.ParallelUpdater(
        train_iter,
        optimizer,
        devices={
            'main': device0,
            'second': device1
        },
    )
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=device0))
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device0.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Multi-GPU ImageNet (ILSVRC2012) training with MultiprocessParallelUpdater."""
    # Supported architectures, selectable with --arch.
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--gpus', '-g', type=int, nargs="*",
                        default=[0, 1, 2, 3])
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Initialize the model to train, optionally from a saved file.
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)

    # Load the datasets and the precomputed channel mean.
    mean = np.load(args.mean)
    train = train_imagenet.PreprocessedDataset(
        args.train, args.root, mean, model.insize)
    val = train_imagenet.PreprocessedDataset(
        args.val, args.root, mean, model.insize, False)
    # These iterators load images in subprocesses running in parallel with
    # training/validation; the training set is split evenly per device.
    devices = tuple(args.gpus)
    train_iters = [
        chainer.iterators.MultiprocessIterator(
            i, args.batchsize, n_processes=args.loaderjob)
        for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = updaters.MultiprocessParallelUpdater(
        train_iters, optimizer, devices=devices)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # --test shrinks the intervals for a quick smoke run.
    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpus[0]),
                   trigger=val_interval)
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=2))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Distributed ImageNet training with ChainerMN across multiple nodes."""
    # The ImageNet example does not support CPU execution.
    if not chainer.cuda.available:
        raise RuntimeError('ImageNet requires GPU support.')

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # The multiprocessing start method must be changed when combining
    # InfiniBand with MultiprocessIterator: forked processes often crash
    # when using InfiniBand (c.f.
    # https://www.open-mpi.org/faq/?category=tuning#fork-warning ).
    # Setting the start method alone does not launch the forkserver, so a
    # dummy process is started too. See also:
    # https://chainermn.readthedocs.io/en/stable/tutorial/tips_faqs.html#using-multiprocessiterator
    # This must be done *before* ``chainermn.create_communicator``!!!
    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process(target=lambda *x: x, args=())
    p.start()
    p.join()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole
    # dataset; it is then evenly scattered to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(
            args.val, args.root, mean, model.insize, False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # (The forkserver workaround above had to happen before creating the
    # communicator, since these iterators spawn worker processes.)
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    # Multi-node checkpointing: resume from a previous checkpoint if any.
    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Display/output extensions only run on one worker to avoid repeated
    # output from every rank.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """MNIST training example with a generic device specifier CLI."""
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                             'specifier or an integer. If non-negative integer, '
                             'CuPy arrays with specified device id are used. If '
                             'negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--model', '-m', default='MLP',
                        help='Choose the model: MLP or MLPSideEffect')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device',
                       type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = chainer.get_device(args.device)
    if device.xp is chainerx:
        sys.stderr.write('This example does not support ChainerX devices.\n')
        sys.exit(1)

    print('Device: {}'.format(device))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    device.use()

    # Set up a neural network to train. Classifier reports softmax cross
    # entropy loss and accuracy at every iteration, which the PrintReport
    # extension below picks up.
    if args.model == 'MLP':
        model = L.Classifier(MLP(args.unit, 10))
    elif args.model == 'MLPSideEffect':
        model = L.Classifier(MLPSideEffect(args.unit, 10))
    model.to_device(device)

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Dump a computational graph from 'loss' variable at the first iteration
    # ("main" refers to the target link of the "main" optimizer).
    trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot at the requested frequency (default: once, at the end).
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Save loss and accuracy plots to the result dir unless disabled.
    if args.plot and extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected log entries to stdout. "validation" is the default
    # name of the Evaluator extension; the non-epoch entries come from the
    # Classifier link via updater/evaluator.
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
def main():
    """MNIST training with auto-loading snapshots for preemption recovery."""
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                             'specifier or an integer. If non-negative integer, '
                             'CuPy arrays with specified device id are used. If '
                             'negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', type=str,
                        help='Resume the training from snapshot')
    parser.add_argument('--autoload', action='store_true',
                        help='Automatically load trainer snapshots in case'
                             ' of preemption or other temporary system failure')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device',
                       type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = chainer.get_device(args.device)

    print('Device: {}'.format(device))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train. Classifier reports softmax cross
    # entropy loss and accuracy at every iteration, which the PrintReport
    # extension below picks up.
    model = L.Classifier(MLP(args.unit, 10))
    model.to_device(device)
    device.use()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device),
                   call_before_training=True)

    # Dump a computational graph from 'loss' variable at the first iteration
    # ("main" refers to the target link of the "main" optimizer).
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot each ``frequency`` epoch, delete old stale snapshots,
    # and automatically load from a snapshot file if any is already resident
    # in the result directory.
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(extensions.snapshot(num_retain=1, autoload=args.autoload),
                   trigger=(frequency, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(), call_before_training=True)

    # Save two plot images to the result dir
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              'epoch', file_name='loss.png'),
        call_before_training=True)
    trainer.extend(
        extensions.PlotReport(
            ['main/accuracy', 'validation/main/accuracy'],
            'epoch', file_name='accuracy.png'),
        call_before_training=True)

    # Print selected log entries to stdout. "validation" is the default
    # name of the Evaluator extension; the non-epoch entries come from the
    # Classifier link via updater/evaluator.
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'elapsed_time'
    ]), call_before_training=True)

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume is not None:
        # Resume from an explicit snapshot. (Note: this loaded state may be
        # overwritten by --autoload if snapshots exist in the output dir.)
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
def main():
    """Train an MNIST classifier whose layers are split across two GPUs."""
    arg_parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    arg_parser.add_argument('--batchsize', '-b', type=int, default=100,
                            help='Number of images in each mini-batch')
    arg_parser.add_argument('--epoch', '-e', default=20, type=int,
                            help='Number of sweeps over the dataset to train')
    arg_parser.add_argument('--gpu0', '-g', default=0, type=int,
                            help='First GPU ID')
    arg_parser.add_argument('--gpu1', '-G', default=1, type=int,
                            help='Second GPU ID')
    arg_parser.add_argument('--out', '-o', default='result_model_parallel',
                            help='Directory to output the result')
    arg_parser.add_argument('--resume', '-r', default='',
                            help='Resume the training from snapshot')
    arg_parser.add_argument('--unit', '-u', default=1000, type=int,
                            help='Number of units')
    opts = arg_parser.parse_args()

    # Report the run configuration before training starts.
    print(f'GPU: {opts.gpu0}, {opts.gpu1}')
    print(f'# unit: {opts.unit}')
    print(f'# Minibatch-size: {opts.batchsize}')
    print(f'# epoch: {opts.epoch}')
    print('')

    # See train_mnist.py for the meaning of these lines.
    classifier = L.Classifier(ParallelMLP(opts.unit, 10, opts.gpu0, opts.gpu1))
    chainer.backends.cuda.get_device_from_id(opts.gpu0).use()

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(classifier)

    train_set, test_set = chainer.datasets.get_mnist()
    train_iterator = chainer.iterators.SerialIterator(train_set, opts.batchsize)
    test_iterator = chainer.iterators.SerialIterator(
        test_set, opts.batchsize, repeat=False, shuffle=False)

    # The updater drives iteration on the first GPU; the model itself
    # moves activations between gpu0 and gpu1 internally.
    updater = training.updaters.StandardUpdater(
        train_iterator, optimizer, device=opts.gpu0)
    trainer = training.Trainer(updater, (opts.epoch, 'epoch'), out=opts.out)

    trainer.extend(extensions.Evaluator(test_iterator, classifier,
                                        device=opts.gpu0))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(opts.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if opts.resume:
        # Continue from a previously saved trainer snapshot.
        chainer.serializers.load_npz(opts.resume, trainer)

    trainer.run()
def main():
    """Entry point for the ChainerMN (multi-node) MNIST example.

    Sets up an MPI communicator, scatters MNIST across all workers,
    and trains an MLP with a multi-node optimizer and evaluator.
    Only rank 0 loads the dataset and emits logs/reports.
    """
    parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='pure_nccl', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--chainerx', '-x', action='store_true',
                        default=False, help='Use ChainerX')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--benchmark', action='store_true',
                        help='benchmark mode')
    parser.add_argument('--benchmark-iteration', type=int, default=500,
                        help='the number of iterations when using benchmark mode')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    # GPU mode: one CUDA device per intra-node rank; CPU mode always
    # falls back to the 'naive' communicator.
    if args.gpu:
        if args.communicator == 'naive':
            print('Error: \'naive\' communicator does not support GPU.\n')
            exit(-1)  # NOTE(review): sys.exit would be cleaner; unchanged here.
        comm = chainermn.create_communicator(args.communicator)
        if args.chainerx:
            device = chainer.get_device('cuda:{}'.format(comm.intra_rank))
        else:
            device = chainer.get_device(comm.intra_rank)
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        if args.chainerx:
            device = chainer.get_device('native')
        else:
            device = chainer.get_device(-1)

    # Banner printed once, by rank 0 only.
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    model.to_device(device)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Benchmark mode stops after a fixed iteration count instead of epochs.
    if args.benchmark:
        stop_trigger = (args.benchmark_iteration, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        if device.xp is not chainerx:
            # Disabled for ChainerX.
            # This is because ChainerX doesn't have a public API set
            # to traverse computational graphs.
            # See examples/mnist/train_mnist.py
            trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
             'validation/main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume the whole trainer state (model, optimizer, extensions).
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Entry point for CosmoFlow multi-node (spatial-parallel) training.

    Trains the CosmoFlow model with a spatial NCCL communicator; every
    rank reads batches through a multi-node iterator fed from rank 0.
    """
    # Managed (unified) GPU memory keeps training from running out of
    # memory. TODO: find the real cause of the OOM and remove this.
    pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
    cp.cuda.set_allocator(pool.malloc)
    chainer.disable_experimental_feature_warning = True

    parser = argparse.ArgumentParser(description='CosmoFlow Multi-Node Training')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epochs', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--out', '-o', default='results',
                        help='Output directory')
    args = parser.parse_args()

    batch_size = args.batchsize
    epochs = args.epochs
    out = args.out

    # Create ChainerMN communicator; one GPU per intra-node rank.
    comm = chainermnx.create_communicator("spatial_nccl")
    device = comm.intra_rank

    # Input data and label.
    # NOTE(review): the dataset path is hard-coded; consider a CLI flag.
    train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
    if comm.rank != 0:
        # Non-zero ranks receive their data via the multi-node iterator.
        train = chainermn.datasets.create_empty_dataset(train)

    train_iterator = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(train, batch_size,
                                              n_threads=20, shuffle=True),
        comm)

    model = CosmoFlow(comm)
    ch.backends.cuda.get_device_from_id(device).use()
    model.to_gpu()  # Copy the model to the GPU

    optimizer = ch.optimizers.Adam()
    optimizer.setup(model)

    # Create the updater, using the optimizer.
    updater = training.StandardUpdater(train_iterator, optimizer, device=device)

    # Set up a trainer.
    trainer = training.Trainer(updater, (epochs, 'epoch'), out=out)

    # Evaluation is currently disabled: no validation iterator is built.
    # trainer.extend(extensions.Evaluator(vali_iterator, model, device=device))

    log_interval = (1, 'epoch')
    filename = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".log"
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval,
                                            filename=filename))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        # BUG FIX: the report list was ['epoch' 'Validation loss', 'lr'];
        # the missing comma concatenated the first two strings into the
        # nonexistent key 'epochValidation loss', so neither the epoch nor
        # any loss was ever printed. Report the training loss instead
        # (validation is disabled above).
        trainer.extend(extensions.PrintReport(['epoch', 'main/loss', 'lr']),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=1))

    print("Starting Training ")
    trainer.run()
def main():
    """Train a VGG-style convnet on CIFAR-10 or CIFAR-100.

    Command-line flags select the dataset, optimizer hyper-parameters,
    the device, and an optional early-stopping metric.
    """
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--dataset', '-d', default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate', '-l', type=float, default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--early-stopping', type=str,
                        help='Metric to watch for early stopping')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')
    model = L.Classifier(models.VGG.VGG(class_labels))
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    # Standard CIFAR regularization: 5e-4 L2 weight decay.
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    stop_trigger = (args.epoch, 'epoch')
    # Early stopping option
    if args.early_stopping:
        stop_trigger = triggers.EarlyStoppingTrigger(
            monitor=args.early_stopping, verbose=True,
            max_trigger=(args.epoch, 'epoch'))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot at each epoch
    # NOTE(review): 'snaphot' is a typo in the output filename; left
    # unchanged because existing tooling/resume paths may depend on it.
    trainer.extend(
        extensions.snapshot(filename='snaphot_epoch_{.updater.epoch}'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
def main():
    """Entry point for the ChainerMN pipelined (model-parallel) MNIST example.

    Processes are arranged on a 2-D grid: even/odd ranks hold the two
    halves of the pipelined model (data_axis), and pairs of processes
    are replicated for data parallelism (model_axis groups one replica).
    """
    parser = argparse.ArgumentParser(
        description='ChainerMN example: pipelined neural network')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator. The axis computation and the two
    # splits are identical for CPU and GPU, so only the communicator type
    # and device differ between the branches (was duplicated in both).
    if args.gpu:
        comm = chainermn.create_communicator('pure_nccl')
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        device = -1
    # data_axis selects which half of the model this rank holds;
    # model_axis identifies the pipeline replica this rank belongs to.
    data_axis, model_axis = comm.rank % 2, comm.rank // 2
    data_comm = comm.split(data_axis, comm.rank)
    model_comm = comm.split(model_axis, comm.rank)

    if model_comm.size != 2:
        # BUG FIX: the two implicitly-concatenated literals were missing a
        # separating space and rendered as "...even numberof processes.".
        raise ValueError('This example can only be executed on the even '
                         'number of processes.')

    if comm.rank == 0:
        print('==========================================')
        if args.gpu:
            print('Using GPUs')
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    if data_axis == 0:
        # First pipeline stage owns the loss/accuracy reporting.
        model = L.Classifier(MLP0(model_comm, args.unit))
    elif data_axis == 1:
        model = MLP1(model_comm, args.unit, 10)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), data_comm)
    optimizer.setup(model)

    # Original dataset on worker 0 and 1.
    # Datasets of worker 0 and 1 are split and distributed to all workers.
    if model_axis == 0:
        train, test = chainer.datasets.get_mnist()
        if data_axis == 1:
            # Second stage gets its inputs through the pipeline, not
            # from the dataset, so it only needs matching lengths.
            train = chainermn.datasets.create_empty_dataset(train)
            test = chainermn.datasets.create_empty_dataset(test)
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
    test = chainermn.scatter_dataset(test, data_comm, shuffle=True)

    # shuffle=False keeps both pipeline stages in lock-step on the
    # same sample order.
    train_iter = chainer.iterators.SerialIterator(
        train, args.batchsize, shuffle=False)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, data_comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for worker 0.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
def main():
    """Train an ImageNet model from scratch across multiple GPUs.

    Every rank receives the same batch through a multi-node iterator;
    no all-reduce is performed on gradients or weights (plain Adam).
    """
    models = {
        'alexnet': AlexNet,
        'resnet': ResNet50,
        'vgg': VGG,
    }

    parser = argparse.ArgumentParser(description='Train ImageNet From Scratch')
    # BUG FIX: the default was 'AlexNet', which is not one of the keys of
    # ``models``. argparse does not validate defaults against ``choices``,
    # so running without --model crashed with KeyError at models[args.model].
    parser.add_argument('--model', '-M', choices=models.keys(),
                        default='alexnet', help='Convnet model')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epochs', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--out', '-o', default='results',
                        help='Output directory')
    args = parser.parse_args()

    batch_size = args.batchsize
    epochs = args.epochs
    out = args.out

    # Start method of multiprocessing module need to be changed if we are
    # using InfiniBand and MultiprocessIterator.
    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process()
    p.start()
    p.join()

    # Prepare ChainerMN communicator; one GPU per intra-node rank.
    comm = chainermn.create_communicator("pure_nccl")
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num of GPUs : {}'.format(comm.size))
        print('Model : {}'.format(args.model))
        print('Minibatch-size: {}'.format(batch_size))
        print('Epochs: {}'.format(args.epochs))
        print('==========================================')

    model = models[args.model](comm)
    chainer.backends.cuda.get_device_from_id(
        device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole
    # dataset; datasets of worker 0 are distributed to all workers.
    mean = np.load(MEAN_FILE)  # All ranks load the data
    train = PreprocessedDataset(TRAIN, TRAINING_ROOT, mean, 226)
    val = PreprocessedDataset(VAL, VALIDATION_ROOT, mean, 226, False)

    # Create a multinode iterator such that each rank gets the same batch
    if comm.rank != 0:
        train = chainermn.datasets.create_empty_dataset(train)
        val = chainermn.datasets.create_empty_dataset(val)

    # Same dataset in all nodes
    train_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(train, args.batchsize,
                                              n_threads=40),
        comm)
    val_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(val, args.batchsize,
                                              repeat=False, shuffle=False,
                                              n_threads=40),
        comm)

    # We dont use a multinode optimizer here as we dont do all reduce on
    # final weights
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    # NOTE(review): the stop trigger pairs the epoch count with
    # 'iteration'; confirm whether (epochs, 'epoch') was intended.
    trainer = training.Trainer(updater, (epochs, 'iteration'), out)

    val_interval = (1, 'epoch')
    log_interval = (1, 'epoch')

    # Create an evaluator. To effectively skip evaluation during timing
    # runs, raise val_interval above the number of epochs being run.
    evaluator = extensions.Evaluator(val_iter, model, device=device)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))
        # BUG FIX: a second, plain ProgressBar() was also registered,
        # producing duplicated progress output; only one bar is kept.
        trainer.extend(extensions.ProgressBar(update_interval=10))
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss',
                'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', filename='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', filename='accuracy.png'))

    # TODO : Figure out how to send this report to a file
    if comm.rank == 0:
        print("Starting training .....")
    # Profile GPU memory for the whole run; report from rank 0 only.
    hook = CupyMemoryProfileHook()
    with hook:
        trainer.run()
    if comm.rank == 0:
        hook.print_report()
def main():
    """Train EfficientNet (or SE-ResNeXt50) on ImageNet with ChainerMN."""
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError('ImageNet requires GPU support.')

    archs = [f'b{i}' for i in range(8)] + ['se']
    # Architecture-wise default input (patch) sizes.
    patchsizes = {
        'b0': 224, 'b1': 240, 'b2': 260, 'b3': 300, 'b4': 380,
        'b5': 456, 'b6': 528, 'b7': 600, 'se': 224
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('--arch', '-a', choices=archs, default='b0')
    # BUG FIX: the help string used a backslash line-continuation inside
    # the literal, embedding a run of indentation spaces (and typos) into
    # the displayed help text.
    parser.add_argument('--patchsize', default=None, type=int,
                        help='The input size of images. If not specified, '
                             'architecture-wise default values will be used.')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--optimizer', default='RMSProp')
    parser.add_argument('--lr', default=0.256, type=float)
    parser.add_argument('--cosine_annealing', action='store_true')
    parser.add_argument('--exponent', type=float, default=0.97)
    parser.add_argument('--exponent_trigger', type=float, default=2.6)
    parser.add_argument('--soft_label', action='store_true')
    parser.add_argument('--epoch', '-E', type=int, default=350,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int, default=3,
                        help='Number of parallel data loading processes')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='../ssd/imagenet',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=32,
                        help='Validation minibatch size')
    parser.add_argument('--workerwisebn', action='store_true')
    parser.add_argument('--no_dropconnect', action='store_true')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='pure_nccl')
    parser.add_argument('--no_autoaugment', action='store_true')
    parser.add_argument('--dtype', default='float32',
                        choices=['mixed16', 'float32'],
                        help='For now do not use mixed16')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    chainer.global_config.dtype = args.dtype

    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        mode = 'workerwise' if args.workerwisebn else 'synchronized'
        print(f'BatchNorm is {mode}')
        print('==========================================')

    # Soft labels swap in label-smoothing-aware loss/accuracy functions.
    if args.soft_label:
        accfun = soft_accuracy
        lossfun = soft_softmax_cross_entropy
    else:
        accfun = F.accuracy
        lossfun = F.softmax_cross_entropy

    if args.arch != 'se':
        model = EfficientNet(args.arch, workerwisebn=args.workerwisebn,
                             no_dropconnect=args.no_dropconnect)
    else:
        model = SEResNeXt50()
    model = L.Classifier(model, lossfun=lossfun, accfun=accfun)
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole
    # dataset; datasets of worker 0 are distributed to all workers.
    patchsize = (patchsizes[args.arch] if args.patchsize is None
                 else args.patchsize)
    patchsize = (patchsize, patchsize)
    train_transform, val_transform, _ = get_transforms(
        patchsize, no_autoaugment=args.no_autoaugment, soft=args.soft_label)
    if comm.rank == 0:
        train = ImageNetDataset(args.root, 'train')
        val = ImageNetDataset(args.root, 'val')
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)
    train = chainer.datasets.TransformDataset(train, train_transform)
    val = chainer.datasets.TransformDataset(val, val_transform)

    # A workaround for processes crash should be done before making
    # communicator above, when using fork (e.g. MultiProcessIterator)
    # along with Infiniband.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    # ``symbol`` names the hyper-parameter the LR schedules manipulate.
    symbol = 'lr'
    opt_name = args.optimizer.lower()
    if opt_name == 'rmsprop':
        optimizer = chainer.optimizers.RMSprop(lr=args.lr, alpha=0.9)
    elif opt_name == 'momentumsgd':
        optimizer = chainer.optimizers.MomentumSGD(lr=args.lr)
    elif opt_name == 'corrected':
        optimizer = chainer.optimizers.CorrectedMomentumSGD(lr=args.lr)
    elif opt_name == 'adabound':
        optimizer = chainer.optimizers.AdaBound(alpha=args.lr, final_lr=0.5)
        symbol = 'alpha'
    else:
        # BUG FIX: an unknown --optimizer previously fell through and
        # crashed later with NameError; fail fast with a clear message.
        raise ValueError('Unknown optimizer: {}'.format(args.optimizer))
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-5))

    args.out = f'experiments/{args.arch}' + args.out
    save_args(args, args.out)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # --test shortens all intervals for a quick smoke run.
    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (2, 'epoch')
    log_interval = (10, 'iteration') if args.test else (2, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    if args.cosine_annealing:
        schedule = lr_schedules.CosineLRSchedule(args.lr)
        # BUG FIX: this test compared args.optimizer case-sensitively
        # against ['MomentumSGD', 'Corrected'] although the optimizer
        # selection above is case-insensitive, so e.g.
        # "--optimizer momentumsgd --cosine_annealing" silently skipped
        # the schedule.
        if opt_name in ('momentumsgd', 'corrected'):
            trainer.extend(lr_schedules.LearningRateScheduler(schedule))
    else:
        trainer.extend(extensions.ExponentialShift(symbol, args.exponent),
                       trigger=(args.exponent_trigger, 'epoch'))

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.snapshot_object(
            model, 'model_iter_{.updater.iteration}.npz'),
            trigger=val_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=100))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Train an MNIST MLP, optionally compiled through chainer-compiler.

    Supports plain Chainer execution on any device (NumPy/CuPy/ChainerX)
    and, with --compile, ahead-of-time compilation of the model via
    ch2o or onnx-chainer, optionally with CUDA unified memory.
    """
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    group = parser.add_argument_group('deprecated arguments')
    # --gpu writes into the same 'device' destination as --device.
    group.add_argument('--gpu', '-g', dest='device',
                       type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--compile', action='store_true',
                        help='Compile the model')
    parser.add_argument('--dump_onnx', action='store_true',
                        help='Dump ONNX model after optimization')
    parser.add_argument('--iterations', '-I', type=int, default=None,
                        help='Number of iterations to train')
    parser.add_argument('--use-fake-data', action='store_true',
                        help='Use fake data')
    parser.add_argument('--computation_order', type=str, default=None,
                        help='Computation order in backpropagation')
    parser.add_argument('--use_unified_memory', dest='use_unified_memory',
                        action='store_true',
                        help='Use unified memory for large model')
    args = parser.parse_args()

    device = chainer.get_device(args.device)

    print('Device: {}'.format(device))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    mlp = MLP(args.unit, 10)
    if args.compile:
        # ch2o is the default translator; specifying a computation order
        # requires the onnx-chainer path.
        if args.computation_order is None:
            translator = 'ch2o'
        else:
            translator = 'onnx_chainer'
        export_allocator = None
        runtime_allocator = None
        if args.use_unified_memory:
            import cupy
            # unified memory
            export_allocator = cupy.cuda.memory.malloc_managed
            runtime_allocator = cupy.get_default_memory_pool().malloc
        mlp = chainer_compiler.compile(
            mlp, dump_onnx=args.dump_onnx, translator=translator,
            computation_order=args.computation_order,
            export_allocator=export_allocator,
            runtime_allocator=runtime_allocator)
    model = L.Classifier(mlp)
    model.to_device(device)
    device.use()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    if args.use_fake_data:
        train, test = fake_dataset()
    else:
        train, test = chainer.datasets.get_mnist()

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter, optimizer,
                                                device=device)
    # An explicit iteration budget overrides the epoch-based stop trigger.
    if args.iterations:
        stop_trigger = (args.iterations, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot for each specified epoch
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Save two plot images to the result dir
    if args.plot and extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
def main():
    """Train a VAE on MNIST and save demonstration images.

    Command-line entry point: parses arguments, builds the
    encoder/decoder/prior from ``net``, optimizes the averaged ELBO loss
    with the Trainer framework, and finally calls ``save_images``
    (defined elsewhere in this file) on the trained model.
    """
    parser = argparse.ArgumentParser(description='Chainer example: VAE')
    parser.add_argument('--initmodel', '-m', type=str,
                        help='Initialize the model from given file')
    parser.add_argument('--resume', '-r', type=str,
                        help='Resume the optimization from snapshot')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', default='results',
                        help='Directory to output the result')
    parser.add_argument('--epoch', '-e', default=100, type=int,
                        help='number of epochs to learn')
    parser.add_argument('--dim-z', '-z', default=20, type=int,
                        help='dimension of encoded vector')
    parser.add_argument('--dim-h', default=500, type=int,
                        help='dimension of hidden layer')
    parser.add_argument('--beta', default=1.0, type=float,
                        help='Regularization coefficient for '
                        'the second term of ELBO bound')
    parser.add_argument('--k', '-k', default=1, type=int,
                        help='Number of Monte Carlo samples used in '
                        'encoded vector')
    parser.add_argument('--binary', action='store_true',
                        help='Use binarized MNIST')
    parser.add_argument('--batch-size', '-b', type=int, default=100,
                        help='learning minibatch size')
    parser.add_argument('--test', action='store_true',
                        help='Use tiny datasets for quick tests')
    # --gpu writes into the same 'device' destination as --device, so the
    # old flag keeps working while being reported as deprecated in --help.
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device',
                       type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    if chainer.get_dtype() == np.float16:
        warnings.warn('This example may cause NaN in FP16 mode.',
                      RuntimeWarning)

    device = chainer.get_device(args.device)
    device.use()

    print('Device: {}'.format(device))
    print('# dim z: {}'.format(args.dim_z))
    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Prepare VAE model, defined in net.py
    encoder = net.make_encoder(784, args.dim_z, args.dim_h)
    decoder = net.make_decoder(784, args.dim_z, args.dim_h,
                               binary_check=args.binary)
    prior = net.make_prior(args.dim_z)
    avg_elbo_loss = net.AvgELBOLoss(encoder, decoder, prior,
                                    beta=args.beta, k=args.k)
    avg_elbo_loss.to_device(device)

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(avg_elbo_loss)

    # If initial parameters are given, initialize the model with them.
    if args.initmodel is not None:
        chainer.serializers.load_npz(args.initmodel, avg_elbo_loss)

    # Load the MNIST dataset (images only; labels are not needed for a VAE).
    train, test = chainer.datasets.get_mnist(withlabel=False)

    if args.binary:
        # Binarize dataset
        train = (train >= 0.5).astype(np.float32)
        test = (test >= 0.5).astype(np.float32)

    if args.test:
        # Keep only 100 examples each so the whole script runs quickly.
        train, _ = chainer.datasets.split_dataset(train, 100)
        test, _ = chainer.datasets.split_dataset(test, 100)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    test_iter = chainer.iterators.SerialIterator(test, args.batch_size,
                                                 repeat=False, shuffle=False)

    # Set up an updater. StandardUpdater can explicitly specify a loss
    # function used in the training with 'loss_func' option
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device, loss_func=avg_elbo_loss)

    # Set up the trainer and extensions.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        extensions.Evaluator(test_iter, avg_elbo_loss, device=device))
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/reconstr',
            'main/kl_penalty', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    # If snapshot file is given, resume the training.
    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()

    # Save images for demonstration
    save_images(device, encoder, decoder, train, test, prior, args.out)
def main():
    """Train a VGG-style classifier on CIFAR-10 or CIFAR-100.

    Parses command-line options, builds the model, and runs training
    with optional early stopping and snapshot resume. ChainerX devices
    are rejected because this example does not support them.
    """
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--dataset', default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate', '-l', type=float, default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--device', '-d', type=str, default='0',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--early-stopping', type=str,
                        help='Metric to watch for early stopping')
    # --gpu is kept for backward compatibility; it writes into the same
    # 'device' destination as --device.
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device',
                       type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    if chainer.get_dtype() == numpy.float16:
        warnings.warn('This example may cause NaN in FP16 mode.',
                      RuntimeWarning)

    device = chainer.get_device(args.device)
    if device.xp is chainerx:
        sys.stderr.write('This example does not support ChainerX devices.\n')
        sys.exit(1)

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    device.use()

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    model = L.Classifier(models.VGG.VGG(class_labels))
    model.to_device(device)

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    stop_trigger = (args.epoch, 'epoch')
    # Early stopping option: replace the fixed-epoch trigger with one that
    # monitors the given metric, capped at args.epoch epochs.
    if args.early_stopping:
        stop_trigger = triggers.EarlyStoppingTrigger(
            monitor=args.early_stopping, verbose=True,
            max_trigger=(args.epoch, 'epoch'))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter, optimizer,
                                                device=device)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    # TODO(hvy): Support ChainerX
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot at each epoch
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
def main():
    """Distributed VGG16 training on CIFAR with ChainerMN.

    Every MPI process runs this entry point. Rank 0 loads the dataset and
    registers the reporting extensions; the other ranks receive data
    through multi-node iterators backed by empty datasets.
    """
    parser = argparse.ArgumentParser(description='ChainerMN example: VGG16')
    parser.add_argument('--dataset', '-d', default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--learnrate', '-l', type=float, default=0.05,
                        help='Learning rate for SGD')
    # NOTE(review): --frequency is parsed but never used below; no snapshot
    # extension is registered in this script.
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', '-g', action='store_true', default=False,
                        help='use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    args = parser.parse_args()

    # Create ChainerMN communicator.
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        # Use the intra-node rank as the GPU id so that each process picks
        # the GPU local to its own node. Using comm.rank here would exceed
        # the per-node GPU count as soon as more than one node is used.
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        device = -1

    # Only the master process prints the run configuration.
    if comm.rank == 0:
        print('GPU: {}'.format(args.gpu))
        print('# Minibatch-size: {}'.format(args.batchsize))
        print('# epoch: {}'.format(args.epoch))
        print('')

    # Load the CIFAR10 dataset
    if args.dataset == 'cifar10':
        class_labels = 10
        train, test = chainer.datasets.get_cifar10()
    elif args.dataset == 'cifar100':
        class_labels = 100
        train, test = chainer.datasets.get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    model = L.Classifier(VGG.VGG(comm, class_labels))
    if args.gpu:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Non-master ranks get empty datasets; the multi-node iterators below
    # broadcast the actual batches from rank 0.
    if comm.rank != 0:
        train = chainermn.datasets.create_empty_dataset(train)
        test = chainermn.datasets.create_empty_dataset(test)

    train_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.SerialIterator(train, args.batchsize), comm)
    test_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.SerialIterator(test, args.batchsize,
                                         repeat=False, shuffle=False),
        comm)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reporting extensions only make sense on the master rank.
    if comm.rank == 0:
        # Dump a computational graph from 'loss' variable
        # The "main" refers to the target link of the "main" optimizer.
        trainer.extend(extensions.DumpGraph('main/loss'))
        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())
        # Save two plot images to the result dir
        if args.plot and extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch', file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(
                    ['main/accuracy', 'validation/main/accuracy'],
                    'epoch', file_name='accuracy.png'))
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss',
                'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()
def main():
    """Train a VAE on MNIST (GPU-id style API) and visualize results.

    After training, reconstructs a few fixed train/test digits and draws
    samples from the prior, saving all images under ``args.out``.
    """
    parser = argparse.ArgumentParser(description='Chainer example: VAE')
    parser.add_argument('--initmodel', '-m', type=str,
                        help='Initialize the model from given file')
    parser.add_argument('--resume', '-r', type=str,
                        help='Resume the optimization from snapshot')
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='results',
                        help='Directory to output the result')
    parser.add_argument('--epoch', '-e', default=100, type=int,
                        help='number of epochs to learn')
    # Fixed help-text typos: 'dimention' -> 'dimension'.
    parser.add_argument('--dim-z', '-z', default=20, type=int,
                        help='dimension of encoded vector')
    parser.add_argument('--dim-h', default=500, type=int,
                        help='dimension of hidden layer')
    parser.add_argument('--beta', default=1.0, type=float,
                        help='Regularization coefficient for '
                        'the second term of ELBO bound')
    parser.add_argument('--k', '-k', default=1, type=int,
                        help='Number of Monte Carlo samples used in '
                        'encoded vector')
    parser.add_argument('--binary', action='store_true',
                        help='Use binarized MNIST')
    parser.add_argument('--batch-size', '-b', type=int, default=100,
                        help='learning minibatch size')
    parser.add_argument('--test', action='store_true',
                        help='Use tiny datasets for quick tests')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# dim z: {}'.format(args.dim_z))
    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Prepare VAE model, defined in net.py
    encoder = net.make_encoder(784, args.dim_z, args.dim_h)
    decoder = net.make_decoder(784, args.dim_z, args.dim_h,
                               binary_check=args.binary)
    prior = net.make_prior(args.dim_z)
    avg_elbo_loss = net.AvgELBOLoss(encoder, decoder, prior,
                                    beta=args.beta, k=args.k)
    if args.gpu >= 0:
        avg_elbo_loss.to_gpu(args.gpu)

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(avg_elbo_loss)

    # Initialize the model from a file if one was given.
    if args.initmodel is not None:
        chainer.serializers.load_npz(args.initmodel, avg_elbo_loss)

    # Load the MNIST dataset (images only; labels are not used).
    train, test = chainer.datasets.get_mnist(withlabel=False)

    if args.binary:
        # Binarize dataset
        train = (train >= 0.5).astype(np.float32)
        test = (test >= 0.5).astype(np.float32)

    if args.test:
        # Tiny 100-example splits for quick smoke tests.
        train, _ = chainer.datasets.split_dataset(train, 100)
        test, _ = chainer.datasets.split_dataset(test, 100)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    test_iter = chainer.iterators.SerialIterator(test, args.batch_size,
                                                 repeat=False, shuffle=False)

    # Set up an updater. StandardUpdater can explicitly specify a loss function
    # used in the training with 'loss_func' option
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=args.gpu, loss_func=avg_elbo_loss)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        extensions.Evaluator(test_iter, avg_elbo_loss, device=args.gpu))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/reconstr',
            'main/kl_penalty', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()

    # Visualize the results
    def save_images(x, filename):
        # Local helper: tile 9 images (28x28 each) into a 3x3 figure.
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(3, 3, figsize=(9, 9), dpi=100)
        for ai, xi in zip(ax.flatten(), x):
            ai.imshow(xi.reshape(28, 28))
        fig.savefig(filename)

    # Visualization runs on the CPU regardless of the training device.
    avg_elbo_loss.to_cpu()
    train_ind = [1, 3, 5, 10, 2, 0, 13, 15, 17]
    x = chainer.Variable(np.asarray(train[train_ind]))
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        x1 = decoder(encoder(x).mean, inference=True).mean
    save_images(x.array, os.path.join(args.out, 'train'))
    save_images(x1.array, os.path.join(args.out, 'train_reconstructed'))

    test_ind = [3, 2, 1, 18, 4, 8, 11, 17, 61]
    x = chainer.Variable(np.asarray(test[test_ind]))
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        x1 = decoder(encoder(x).mean, inference=True).mean
    save_images(x.array, os.path.join(args.out, 'test'))
    save_images(x1.array, os.path.join(args.out, 'test_reconstructed'))

    # draw images from randomly sampled z
    z = prior().sample(9)
    x = decoder(z, inference=True).mean
    save_images(x.array, os.path.join(args.out, 'sampled'))
def main():
    """Data-parallel MNIST training across two GPUs.

    Behaves exactly like train_mnist.py; the only difference is that
    gradients are computed by ``ParallelUpdater`` on two devices.
    See train_mnist.py for the meaning of the individual steps.
    """
    arg_parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    arg_parser.add_argument('--batchsize', '-b', type=int, default=400,
                            help='Number of images in each mini-batch')
    arg_parser.add_argument('--epoch', '-e', type=int, default=20,
                            help='Number of sweeps over the dataset to train')
    arg_parser.add_argument('--gpu0', '-g', type=int, default=0,
                            help='First GPU ID')
    arg_parser.add_argument('--gpu1', '-G', type=int, default=1,
                            help='Second GPU ID')
    arg_parser.add_argument('--out', '-o', default='result_parallel',
                            help='Directory to output the result')
    arg_parser.add_argument('--resume', '-r', default='',
                            help='Resume the training from snapshot')
    arg_parser.add_argument('--unit', '-u', type=int, default=1000,
                            help='Number of units')
    opts = arg_parser.parse_args()

    # Echo the run configuration, one line at a time.
    for line in ('GPU: {}, {}'.format(opts.gpu0, opts.gpu1),
                 '# unit: {}'.format(opts.unit),
                 '# Minibatch-size: {}'.format(opts.batchsize),
                 '# epoch: {}'.format(opts.epoch),
                 ''):
        print(line)

    # Make the first GPU current before constructing the model.
    chainer.backends.cuda.get_device_from_id(opts.gpu0).use()

    classifier = L.Classifier(train_mnist.MLP(opts.unit, 10))
    adam = chainer.optimizers.Adam()
    adam.setup(classifier)

    train_set, test_set = chainer.datasets.get_mnist()
    train_iterator = chainer.iterators.SerialIterator(train_set,
                                                      opts.batchsize)
    test_iterator = chainer.iterators.SerialIterator(
        test_set, opts.batchsize, repeat=False, shuffle=False)

    # ParallelUpdater implements the data-parallel gradient computation on
    # multiple GPUs. The device named 'main' acts as the master; any other
    # name (here 'second') marks a slave device.
    gpu_map = {'main': opts.gpu0, 'second': opts.gpu1}
    updater = training.updaters.ParallelUpdater(
        train_iterator, adam, devices=gpu_map)

    trainer = training.Trainer(updater, (opts.epoch, 'epoch'), out=opts.out)

    trainer.extend(extensions.Evaluator(test_iterator, classifier,
                                        device=opts.gpu0))
    trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(opts.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    report_keys = ['epoch', 'main/loss', 'validation/main/loss',
                   'main/accuracy', 'validation/main/accuracy',
                   'elapsed_time']
    trainer.extend(extensions.PrintReport(report_keys))
    trainer.extend(extensions.ProgressBar())

    if opts.resume:
        chainer.serializers.load_npz(opts.resume, trainer)

    trainer.run()
# Read the documentation carefully before changing this setup.

# Create the optimizer and bind it to the model.
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

# Create the updater (CPU: device=-1).
updater = training.updaters.StandardUpdater(train_itr, optimizer, device=-1)

# Create the trainer: 20 epochs, results written under 'results'.
trainer = training.Trainer(updater, (20, 'epoch'), out='results')
# Evaluate on the test dataset every epoch.
trainer.extend(extensions.Evaluator(test_itr, model, device=-1))
trainer.extend(extensions.DumpGraph('main/loss'))
trainer.extend(extensions.snapshot(), trigger=(20, 'epoch'))
trainer.extend(extensions.LogReport())
if extensions.PlotReport.available():
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              'epoch', filename='loss.png'))
    # Fixed report key: was 'validation/main/accuraccy' (typo), which never
    # matched anything, so validation accuracy was silently missing from
    # the plot.
    trainer.extend(
        extensions.PlotReport(
            ['main/accuracy', 'validation/main/accuracy'],
            'epoch', filename='accuracy.png'))

# Run the training.
trainer.run()
def main():
    """Multi-node CosmoFlow training driver (ChainerMN/chainermnx).

    Sets up hybrid spatial/data-parallel communicators, scatters the
    dataset across the data-parallel group, and trains with a hybrid
    multi-node optimizer. Validation is currently commented out.
    """
    # These two lines help with memory. If they are not included, training
    # runs out of memory. Keep them until the real reason for the
    # out-of-memory behavior is found.
    pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
    cp.cuda.set_allocator(pool.malloc)
    chainer.disable_experimental_feature_warning = True

    parser = argparse.ArgumentParser(
        description='CosmoFlow Multi-Node Training')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epochs', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--out', '-o', default='results',
                        help='Output directory')
    args = parser.parse_args()
    batch_size = args.batchsize
    epochs = args.epochs
    out = args.out

    # Prepare the communicators: a global hybrid communicator plus derived
    # local (spatial) and data-parallel groups.
    comm = chainermnx.create_communicator("spatial_hybrid_nccl")
    local_comm = create_local_comm(comm)
    data_comm = create_data_comm(comm)
    # One GPU per process, selected by the intra-node rank.
    device = comm.intra_rank

    if local_comm.rank == 0:
        if data_comm.rank == 0:
            # NOTE(review): dataset path is hardcoded to a specific cluster
            # filesystem — parameterize before running elsewhere.
            train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
            # train, val = datasets.split_dataset_random(training_data, first_size=(int(training_data.__len__() * 0.80)))
        else:
            train = None
            #val = None
        # Scatter the dataset from the data-parallel master to its group.
        train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
        # val = chainermn.scatter_dataset(val, data_comm, shuffle=True)
    else:
        train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
        train = chainermn.datasets.create_empty_dataset(train)
        # val = chainermn.datasets.create_empty_dataset(val)

    train_iterator = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(train, batch_size,
                                              n_threads=20, shuffle=True),
        local_comm)
    # vali_iterator = chainermn.iterators.create_multi_node_iterator(
    #     chainer.iterators.MultithreadIterator(val, batch_size, repeat=False, shuffle=False, n_threads=20),
    #     local_comm)

    model = CosmoFlow(local_comm)
    # model = L.Classifier(model, lossfun=F.mean_squared_error, accfun=F.mean_squared_error)
    # print("Model Created successfully")

    # NOTE(review): 'ch' alias used here while 'chainer' is used above —
    # confirm the file imports chainer under both names.
    ch.backends.cuda.get_device_from_id(device).use()
    model.to_gpu()  # Copy the model to the GPU

    optimizer = chainermnx.create_hybrid_multi_node_optimizer_alpha(
        chainer.optimizers.Adam(), data_comm, local_comm)
    optimizer.setup(model)

    # Create the updater, using the optimizer
    updater = training.StandardUpdater(train_iterator, optimizer,
                                       device=device)
    # Set up a trainer
    trainer = training.Trainer(updater, (epochs, 'epoch'), out=out)
    # trainer.extend(extensions.Evaluator(vali_iterator, model, device=device))

    # Timestamped log file name so repeated runs do not overwrite each other.
    filename = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".log"
    log_interval = (1, 'epoch')

    # Reporting extensions only on the global master rank.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(
            extensions.LogReport(trigger=log_interval, filename=filename))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        # NOTE(review): 'validation/...' and accuracy entries are reported
        # nowhere while the Evaluator is commented out and the model is not
        # wrapped in L.Classifier — these columns will stay empty.
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss',
                'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', filename='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', filename='accuracy.png'))
        trainer.extend(extensions.ProgressBar(update_interval=1))

    print("Starting Training ")
    trainer.run()
def main():
    """Train an ILSVRC2012 (ImageNet) convnet chosen by ``--arch``.

    Supports an optional NVIDIA DALI input pipeline (``--dali``) and a
    plain ``MultiprocessIterator`` pipeline otherwise. Snapshots and a
    per-iteration model dump are taken at ``val_interval``.
    """
    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }
    dtypes = {
        'float16': np.float16,
        'float32': np.float32,
        'float64': np.float64,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--dtype', choices=dtypes, help='Specify the dtype '
                        'used. If not supplied, the default dtype is used')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = parse_device(args)

    # Set the dtype if supplied.
    if args.dtype is not None:
        # Fixed: look the string choice up in the dtypes table. Assigning
        # the raw string (the previous behavior) put a str into
        # chainer.config.dtype, so comparisons like
        # `chainer.get_dtype() == np.float16` could never match.
        chainer.config.dtype = dtypes[args.dtype]

    print('Device: {}'.format(device))
    print('Dtype: {}'.format(chainer.config.dtype))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)

    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI seems not available on your system.')
        if not isinstance(device, chainer.backend.cuda.GpuDevice):
            raise RuntimeError('Using DALI requires GPU device. Please '
                               'specify it with --device option.')
        num_threads = args.loaderjob
        if num_threads is None or num_threads <= 0:
            num_threads = 1
        # Per-channel mean for DALI; std of 255 rescales pixels to [0, 1].
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(
            args.train, args.root, model.insize, args.batchsize,
            num_threads, device.device.id, True,
            mean=ch_mean, std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(
            args.val, args.root, model.insize, args.val_batchsize,
            num_threads, device.device.id, False,
            mean=ch_mean, std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        # converter = dali_converter
        converter = dali_util.DaliConverter(mean=mean, crop_size=model.insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
        # These iterators load the images with subprocesses running in
        # parallel to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # --test shrinks the intervals so a full reporting cycle runs quickly.
    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = (1 if args.test else 1000), 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, model, converter=converter,
                                        device=device), trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Model-parallel MNIST training: one MLP split across two devices.

    The ``ParallelMLP`` model (defined elsewhere) places its halves on
    ``device0`` and ``device1``; the updater only feeds inputs to
    ``device0`` via ``input_device``.
    """
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', default=20, type=int,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--out', '-o', default='result_model_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', default=1000, type=int,
                        help='Number of units')
    parser.add_argument('--device0', '-d', type=str, default='0',
                        help='Device specifier of the first device. '
                        'Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--device1', '-D', type=str, default='1',
                        help='Device specifier of the second device. '
                        'Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    # The deprecated GPU-id flags write into the same destinations as the
    # --device0/--device1 options above.
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu0', '-g', dest='device0',
                       type=int, nargs='?', const=0,
                       help='First GPU ID')
    group.add_argument('--gpu1', '-G', dest='device1',
                       type=int, nargs='?', const=1,
                       help='Second GPU ID')
    args = parser.parse_args()

    device0 = chainer.get_device(args.device0)
    device1 = chainer.get_device(args.device1)

    print('Devices: {}, {}'.format(device0, device1))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # See train_mnist.py for the meaning of these lines
    model = L.Classifier(ParallelMLP(args.unit, 10, device0, device1))
    device0.use()

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # input_device: batches are sent to device0 only; the model itself
    # moves intermediate data to device1 as needed.
    updater = training.updaters.StandardUpdater(train_iter, optimizer,
                                                input_device=device0)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=device0))
    # TODO(niboshi): Temporarily disabled for chainerx. Fix it.
    if device0.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def train():
    """Train a VGG classifier on CIFAR-10 or CIFAR-100.

    Reads module-level configuration (``device_id``, ``batchsize``,
    ``epoch``, ``dataset``, ``learnrate``) — these are defined elsewhere
    in this file; verify their values before calling.

    Raises:
        RuntimeError: if ``dataset`` is neither 'cifar10' nor 'cifar100'.
    """
    device = chainer.get_device(device_id)
    device.use()

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(batchsize))
    print('# epoch: {}'.format(epoch))
    print('')

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    model = L.Classifier(models.VGG.VGG(class_labels))
    model.to_device(device)

    optimizer = chainer.optimizers.MomentumSGD(learnrate)
    optimizer.setup(model)
    # Standard VGG/CIFAR regularization.
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False, shuffle=False)

    stop_trigger = (epoch, 'epoch')

    # Set up a trainer
    out = './result'
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, stop_trigger, out=out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    # TODO(imanishi): Support for ChainerX
    if not isinstance(device, backend.ChainerxDevice):
        trainer.extend(extensions.DumpGraph('main/loss'))

    # Take a snapshot at each epoch.
    # (Fixed typo: filename previously read 'snaphot_epoch_...'.)
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Run the training
    trainer.run()
def main():
    """Train an ImageNet (ILSVRC2012) convnet with an optional DALI input
    pipeline, then print per-iteration timing statistics from the log.

    Bug fixes vs. the previous revision:
    * The deprecated ``--gpu`` flag is declared with ``dest='device'``, so
      the parsed namespace never has a ``gpu`` attribute — the DALI branch
      used to crash with AttributeError on ``args.gpu``. The integer GPU id
      is now derived from the resolved device.
    * The timing summary no longer divides by zero when the log contains
      fewer than two entries.
    """
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--iterations', '-I', type=int, default=0,
                        help='Number of iterations to train')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                             'specifier or an integer. If non-negative integer, '
                             'CuPy arrays with specified device id are used. If '
                             'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    # NOTE: dest='device' means there is no `args.gpu` attribute.
    group.add_argument('--gpu', '-g', dest='device',
                       type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--compile', action='store_true',
                        help='Compile the model')
    parser.add_argument('--dump_onnx', action='store_true',
                        help='Dump ONNX model after optimization')
    args = parser.parse_args()

    chainer.config.autotune = True
    chainer.config.cudnn_fast_batch_normalization = True

    device = chainer.get_device(args.device)

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(args.batchsize))
    if args.iterations:
        print('# iterations: {}'.format(args.iterations))
    else:
        print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    insize = model.insize
    if args.compile:
        model = chainer_compiler.compile(model, dump_onnx=args.dump_onnx)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)
    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI seems not available on your system.')
        num_threads = args.loaderjob
        if num_threads is None or num_threads <= 0:
            num_threads = 1
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
        # Fix: derive the integer GPU id from the resolved device instead of
        # the non-existent args.gpu (see --gpu's dest= above).
        # NOTE(review): assumes a CuPy-backed device when --dali is given;
        # DALI requires a GPU anyway — confirm against dali_util.
        gpu_id = device.device.id
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(
            args.train, args.root, insize, args.batchsize, num_threads,
            gpu_id, True, mean=ch_mean, std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(
            args.val, args.root, insize, args.val_batchsize, num_threads,
            gpu_id, False, mean=ch_mean, std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        # converter = dali_converter
        converter = dali_util.DaliConverter(mean=mean, crop_size=insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, insize)
        val = PreprocessedDataset(args.val, args.root, mean, insize, False)
        # These iterators load the images with subprocesses running in
        # parallel to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    if args.iterations:
        stop_trigger = (args.iterations, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = ((1 if args.test else 10 if args.iterations else 1000),
                    'iteration')

    trainer.extend(extensions.Evaluator(val_iter, model, converter=converter,
                                        device=device), trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    cuda_hook = function_hooks.CUDAProfileHook()
    with cuda_hook:
        trainer.run()

    # Summarize per-iteration wall time from consecutive log entries.
    with open('%s/log' % args.out) as f:
        logs = json.load(f)
    elapsed_times = []
    for prev, cur in zip(logs, logs[1:]):
        iters = cur['iteration'] - prev['iteration']
        elapsed = cur['elapsed_time'] - prev['elapsed_time']
        elapsed_times.append(elapsed / iters)
    # Guard: with fewer than two log entries there is nothing to average
    # (the old code raised ZeroDivisionError here).
    if elapsed_times:
        sec_per_iter = sum(elapsed_times) / len(elapsed_times)
        print(sec_per_iter * 1000, 'msec/iter')
        print(args.batchsize / sec_per_iter, 'images/sec')