def run_training(net, train, valid, result_dir, batchsize=64, devices=-1,
                 training_epoch=300, initial_lr=0.05, lr_decay_rate=0.5,
                 lr_decay_epoch=30, weight_decay=0.0005):
    """Train a classifier wrapped around ``net`` and return it.

    Args:
        net: Chainer link producing class scores; wrapped in ``L.Classifier``.
        train / valid: Training / validation datasets.
        result_dir: Output directory for trainer artifacts.
        batchsize: Mini-batch size for both iterators.
        devices: Either a single device id (int, ``-1`` for CPU) or a dict
            mapping updater names to device ids for data-parallel training.
        training_epoch: Total number of epochs to run.
        initial_lr: Initial learning rate for MomentumSGD.
        lr_decay_rate / lr_decay_epoch: ``lr`` is multiplied by
            ``lr_decay_rate`` every ``lr_decay_epoch`` epochs.
        weight_decay: L2 regularization strength (hook skipped when <= 0).

    Returns:
        The trained ``L.Classifier`` link.
    """
    # Iterators
    train_iter = iterators.MultiprocessIterator(train, batchsize)
    test_iter = iterators.MultiprocessIterator(valid, batchsize, False, False)

    # Model
    net = L.Classifier(net)

    # Optimizer
    optimizer = optimizers.MomentumSGD(lr=initial_lr)
    optimizer.setup(net)
    if weight_decay > 0:
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))

    # Updater.  Normalize a plain int device to a {'main': id} mapping so the
    # Evaluator below can always read devices['main'].
    # BUG FIX: the original executed ``devices['main'] = devices`` while
    # ``devices`` was still an int, which raises TypeError at runtime.
    if isinstance(devices, int):
        devices = {'main': devices}
        updater = training.StandardUpdater(
            train_iter, optimizer, device=devices['main'])
    elif isinstance(devices, dict):
        updater = training.ParallelUpdater(
            train_iter, optimizer, devices=devices)
    else:
        # Previously `updater` would be silently left unbound here.
        raise TypeError(
            'devices must be an int or a dict, got {!r}'.format(type(devices)))

    # 6. Trainer
    trainer = training.Trainer(
        updater, (training_epoch, 'epoch'), out=result_dir)

    # 7. Trainer extensions
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.observe_lr())
    trainer.extend(extensions.Evaluator(
        test_iter, net, device=devices['main']), name='val')
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'main/accuracy', 'val/main/loss',
         'val/main/accuracy', 'elapsed_time', 'lr']))
    trainer.extend(extensions.PlotReport(
        ['main/loss', 'val/main/loss'], x_key='epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(
        ['main/accuracy', 'val/main/accuracy'],
        x_key='epoch', file_name='accuracy.png'))
    trainer.extend(extensions.ExponentialShift(
        'lr', lr_decay_rate), trigger=(lr_decay_epoch, 'epoch'))

    trainer.run()
    return net
def main():
    """Train an SSD detector on the rack dataset and save the result."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', choices=('ssd300', 'ssd512'),
                        default='ssd300')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    # Backbone selection.  NOTE(review): the two branches use different label
    # sets and different pretrained weights — kept exactly as written.
    if args.model == 'ssd300':
        model = SSD300(n_fg_class=len(voc_bbox_label_names),
                       pretrained_model='voc0712')
    elif args.model == 'ssd512':
        model = SSD512(n_fg_class=len(via_bbox_label_names),
                       pretrained_model='imagenet')
    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Data pipelines.
    train_data = TransformDataset(
        DatasetFromDat(file_path='Rack.dat'),
        Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize)
    val_data = DatasetFromDat('Rack_val.dat')
    val_iter = chainer.iterators.SerialIterator(
        val_data, args.batchsize, repeat=False, shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift below.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        hook = GradientScaling(2) if param.name == 'b' else WeightDecay(0.0005)
        param.update_rule.add_hook(hook)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    # 120000->8000
    trainer = training.Trainer(updater, (500, 'iteration'), args.out)
    # 80000->5000, 100000->7000
    trainer.extend(
        extensions.ExponentialShift('lr', 0.1, init=1e-3),
        trigger=triggers.ManualScheduleTrigger([300, 400], 'iteration'))
    # 10000->700
    trainer.extend(
        DetectionEvaluator(val_iter, model, use_07_metric=True,
                           label_names=via_bbox_label_names),
        trigger=(7, 'iteration'))

    log_interval = 10, 'iteration'
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'lr',
         'main/loss', 'main/loss/loc', 'main/loss/conf',
         'validation/main/map']), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    # 10000->700
    trainer.extend(extensions.snapshot(), trigger=(50, 'iteration'))
    # 120000->8000
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=(500, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()

    serializers.save_npz('via_model', model)
    serializers.save_npz('via_state', optimizer)
def TrainUNet(X, Y, model_=None, optimizer_=None, epoch=40, alpha=0.001,
              gpu_id=0, loop=1, earlystop=True):
    """Fit a U-Net regressor on ``(X, Y)``.

    Returns:
        tuple: ``(trained unet link, optimizer)``.
    """
    assert (len(X) == len(Y))
    d_time = datetime.datetime.now().strftime("%m-%d-%H-%M-%S")

    # 1. Model: reuse the supplied link or start from a fresh UNet.
    if model_ is None:
        model = Regressor(UNet())
    else:
        model = Regressor(model_)
        print("## model loaded.")
    model.compute_accuracy = False
    if gpu_id >= 0:
        model.to_gpu(gpu_id)

    # 2. Optimizer: reuse if given (already set up), otherwise a fresh Adam.
    if optimizer_ is None:
        opt = optimizers.Adam(alpha=alpha)
        opt.setup(model)
    else:
        opt = optimizer_
        print("## optimizer loaded.")

    # 3. 80/20 train/validation split with a fixed seed for reproducibility.
    dataset = Unet_DataSet(X, Y)
    print("# number of patterns", len(dataset))
    n_train = int(len(dataset) * 0.8)
    train, valid = split_dataset_random(dataset, n_train, seed=0)

    # 4. Iterators
    train_iter = SerialIterator(train, batch_size=C.BATCH_SIZE)
    test_iter = SerialIterator(valid, batch_size=C.BATCH_SIZE,
                               repeat=False, shuffle=False)

    # 5. Globally enable training mode and backprop.
    chainer.config.train = True
    chainer.config.enable_backprop = True

    # 6. Updater
    updater = UnetUpdater(train_iter, opt, model, device=gpu_id)

    # 7. Stop trigger: early stopping on validation loss, or fixed epochs.
    if earlystop:
        stop_trigger = triggers.EarlyStoppingTrigger(
            monitor='validation/main/loss',
            max_trigger=(epoch, 'epoch'), patients=5)
    else:
        stop_trigger = (epoch, 'epoch')

    # 8. Trainer and extensions
    trainer = training.Trainer(updater, stop_trigger, out=C.PATH_TRAINRESULT)
    trainer.extend(UnetEvaluator(test_iter, model, device=gpu_id))
    # Keep the parameters from the epoch with the lowest validation loss.
    trainer.extend(SaveRestore(),
                   trigger=triggers.MinValueTrigger('validation/main/loss'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.observe_lr())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time', 'lr']))
    trainer.extend(extensions.PlotReport(
        ['main/loss', 'validation/main/loss'], x_key='epoch',
        file_name='loop-' + str(loop) + '-loss' + d_time + '.png'))
    trainer.extend(extensions.ProgressBar())

    # 9. Run and persist the trained U-Net.
    trainer.run()
    chainer.serializers.save_npz(C.PATH_TRAINRESULT / ('loop' + str(loop)),
                                 model.unet)
    return model.unet, opt
def main(argv=None):
    """Train a CIFAR model with recall-error injection.

    Args:
        argv: Command-line arguments — a list of strings, a single string
            (split on whitespace), or ``None`` to use ``sys.argv[1:]``.
            BUG FIX: the original signature was ``argv=sys.argv[1:]``,
            which evaluates ``sys.argv`` once at import time (stale if the
            caller later mutates ``sys.argv``); ``None`` defers the lookup
            to call time while keeping call-compatibility.

    Returns:
        tuple: ``(trainer, None, helper_map)``.
    """
    if argv is None:
        argv = sys.argv[1:]
    if isinstance(argv, str):  # was ``type(argv) == str``
        argv = argv.split()

    parser = ArgumentParserWithEpilog(
        description='Chainer CIFAR with recall error:')
    # Command line arguments
    add_base_args(parser)
    parser.add_argument(
        '--dynamic_rescale', '-R', default=False, type=float,
        help='Rescale activations to this range [-R,+R] on a per-channel '
             'basis, before compressing')
    add_ae_args(parser)
    args = parser.parse_args(argv)

    # Other settings and derived arguments
    end_trigger = (args.epoch, 'epoch')
    report_file = os.path.join(args.out, 'report.txt')
    report_entries = [
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'lr', 'elapsed_time'
    ]

    # Header, and output directory
    if not os.path.exists(args.out):
        os.mkdir(args.out)
    open(report_file, 'w').close()  # Clears report
    report = open(report_file, 'a')  # closed at the end of this function
    print_log = print_header(args, argv, log=report,
                             preamble='CIFAR10/100 (%s)' % __file__)

    ##
    # Set up model and dataset iterators
    rng, fixed_seeds = seed_rng(args.seed, args.gpu)
    train_iter, val_iter, class_labels = load_dataset(
        args.batchsize, args.dataset, args.augment, args.fast,
        args.old_test_method)
    model = init_model(models[args.model], class_labels=class_labels,
                       gpu=args.gpu, fast=args.fast)

    ##
    # Get the recall error helper map
    all_layers = model.predictor.act_names
    helper_map, filterspec_map = parse_ae_args(parser, args, rng,
                                               all_layers=all_layers)
    print_helper_summary(helper_map, filterspec_map, print_log)
    print_helper_map(all_layers, helper_map, print_log)

    # Set up an optimizer
    lr, lr_ext, lr_trigger = get_lr_schedule(args, train_iter, fast=args.fast)
    optimizer = MomentumSGDScrambler(helper_map,
                                     compress_x_hat=False,
                                     dynamic_rescale=args.dynamic_rescale,
                                     lr=lr,
                                     momentum=args.momentum)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, end_trigger, out=args.out)

    # Decay
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    # Learning rate schedule
    trainer.extend(lr_ext, trigger=lr_trigger)

    # Extensions - Measurements
    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu))
    trainer.extend(extensions.observe_lr())

    # Extensions - Logging
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(report_entries))
    trainer.extend(PrintReportNoSpecial(report_entries, out=report))
    trainer.extend(
        extensions.ProgressBar(update_interval=args.update_interval))

    # Extensions - Snapshots
    trainer.extend(extensions.snapshot(), trigger=end_trigger)
    if args.snapshot_every:
        trainer.extend(extensions.snapshot(
            filename='snapshot_{0.updater.epoch}_iter_{0.updater.iteration}'),
            trigger=(args.snapshot_every, 'epoch'))

    ##
    # Resume Training
    if args.resume:
        # chainer.serializers.load_npz(args.resume, trainer)
        from train_cifar import model_from_snapshot
        model_from_snapshot(model, args.resume)

    ##
    # Run the training
    trainer.run()
    report.close()

    return trainer, None, helper_map
def main():
    """K-fold training entry point.

    Trains one model per fold and collects logs, snapshots and confusion
    matrices under ``result/<timestamp>/``.
    """
    args = parser()
    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    # Result directory layout.
    save_dir = Path('result') / now
    log_dir = save_dir / 'log'
    model_dir = save_dir / 'model'
    snap_dir = save_dir / 'snap'
    matrix_dir = save_dir / 'matrix'
    for d in (save_dir, log_dir, model_dir, snap_dir, matrix_dir):
        d.mkdir(exist_ok=True, parents=True)

    root = args.dataset
    dir_list = os.listdir(root)
    dir_list.sort()
    if 'mean.npy' in dir_list:
        dir_list.remove('mean.npy')

    print('dataset loading ...')
    datasets = DirectoryParsingLabelDataset(root)
    print('finish!')
    class_num = len(set(datasets.labels))
    print(f'class number : {class_num}')

    k_fold = args.kfold
    print(f'k_fold : {k_fold}')

    X = np.array(list(datasets.img_paths))
    y = np.array(list(datasets.labels))
    folds = StratifiedKFold(n_splits=k_fold, shuffle=True,
                            random_state=402).split(X, y)

    for k, (train_idx, val_idx) in enumerate(folds):
        print(f"============= {k + 1} fold training =============")
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]
        train = LabeledImageDataset(list(zip(X_train, y_train)))
        validation = LabeledImageDataset(list(zip(X_val, y_val)))
        train, validation, mean = get_dataset(
            train, validation, root, datasets, use_mean=False)

        model = L.Classifier(archs[args.arch](output=class_num))
        optimizer = chainer.optimizers.MomentumSGD(args.lr)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0001))

        if args.gpu >= 0:
            chainer.cuda.get_device_from_id(args.gpu).use()
            model.to_gpu()

        train_iter = chainer.iterators.MultithreadIterator(
            train, args.batchsize, n_threads=8)
        validation_iter = chainer.iterators.MultithreadIterator(
            validation, args.batchsize, repeat=False, shuffle=False,
            n_threads=8)

        updater = training.StandardUpdater(train_iter, optimizer,
                                           device=args.gpu)
        trainer = training.Trainer(updater, (args.epoch, 'epoch'),
                                   out=save_dir)
        log_trigger = (1, 'epoch')

        # Cosine-annealed learning rate.
        trainer.extend(CosineShift('lr', args.epoch, 1), trigger=(1, "epoch"))
        trainer.extend(
            extensions.Evaluator(validation_iter, model, device=args.gpu),
            trigger=log_trigger)

        # Keep only the snapshot with the best validation accuracy.
        snap_name = f'{k_fold}-{k + 1}_fold_model.npz'
        trainer.extend(
            extensions.snapshot_object(model, str(snap_name)),
            trigger=chainer.training.triggers.MaxValueTrigger(
                key='validation/main/accuracy', trigger=(1, 'epoch')))

        log_name = f'{k_fold}-{k + 1}_fold_log.json'
        trainer.extend(extensions.LogReport(log_name=str(log_name),
                                            trigger=log_trigger))
        trainer.extend(extensions.observe_lr(), trigger=log_trigger)
        trainer.extend(extensions.PrintReport(
            ['epoch', 'iteration', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time',
             'lr']), trigger=(1, 'epoch'))
        trainer.extend(extensions.PlotReport(
            ['main/loss', 'validation/main/loss'], 'epoch',
            file_name=f'loss{k + 1}.png'))
        trainer.extend(extensions.PlotReport(
            ['main/accuracy', 'validation/main/accuracy'], 'epoch',
            file_name=f'accuracy{k + 1}.png'))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.run()

        # Move per-fold artifacts into their dedicated directories.
        shutil.move(str(save_dir / snap_name), str(snap_dir))
        shutil.move(str(save_dir / log_name), str(log_dir))
        save_model = model_dir / f"{now}_{k_fold}-{k + 1}_fold.npz"
        chainer.serializers.save_npz(str(save_model), model)

        print(f"============= {k + 1} fold Evaluation =============")
        labels_list = [Path(d).name for d in glob.glob(f'{root}/*')]
        if 'mean.npy' in labels_list:
            labels_list.remove('mean.npy')
        confusion_matrix_cocoa(validation, args.gpu, 7, model, matrix_dir,
                               k, labels_list)
# NOTE(review): this chunk begins mid-statement — the call these two
# arguments belong to is opened outside the visible source.  Presumably a
# model/updater constructor taking a computed layer count and a validation
# flag; confirm against the full file.
sum(depth) * 2 + 1, args.valid)
# Trainer runs for epoch_size * max_epoch iterations; all periodic
# extensions below fire once per epoch_size iterations.
trainer = training.Trainer(updater, (epoch_size * max_epoch, 'iteration'),
                           out=result_dir)
from chainer.training import extensions
trainer.extend(extensions.LogReport(trigger=(epoch_size, 'iteration')))
# Full-trainer snapshot plus a predictor-only snapshot each "epoch".
trainer.extend(
    extensions.snapshot(filename='snapshot_iteration-{.updater.iteration}'),
    trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.snapshot_object(
    model.predictor, filename='model_iteration-{.updater.iteration}'),
    trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.Evaluator(test_iter, model, device=gpu_id),
               trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.observe_lr(), trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.PrintReport([
    'iteration', 'lr', 'main/accuracy', 'validation/main/accuracy',
    'elapsed_time'
]), trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.dump_graph('main/loss'))
# Halve the learning rate every 3 "epochs" worth of iterations.
trainer.extend(extensions.ExponentialShift('lr', 0.5),
               trigger=(epoch_size * 3, 'iteration'))
trainer.extend(extensions.ProgressBar(update_interval=30))
print('running')
# NOTE(review): 'reslut_dir' is a typo in the printed label; left unchanged
# because it is runtime output.
print('reslut_dir:{}'.format(result_dir))
trainer.run()
def main():
    """Train a CIFAR-10/100 classifier, optionally keeping a Stochastic
    Weight Averaging (SWA) copy of the model that is evaluated separately.

    The SWA copy is an independent link whose parameters are the running
    average of the trained model's parameters, collected every
    ``--swa_c_epochs`` epochs once ``--swa_start`` is reached.
    """
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--dataset', '-d', default='cifar100',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--model', '-m', default='VGG16',
                        help='The model to use: VGG16 or PreResNet110'
                        ' or WideResNet28x10')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--lr_init', '-l', type=float, default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=200,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--wd', type=float, default=1e-4,
                        help='weight decay')
    parser.add_argument('--swa', action='store_true',
                        help='swa usage flag')
    parser.add_argument('--swa_start', type=float, default=161,
                        help='SWA start epoch number')
    parser.add_argument('--swa_lr', type=float, default=0.05,
                        help='SWA LR')
    parser.add_argument('--swa_c_epochs', type=int, default=1,
                        help='SWA model collection frequency length in epochs')
    args = parser.parse_args()

    # Dataset selection.
    if args.dataset.lower() == 'cifar10':
        print('Using CIFAR10 dataset')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset.lower() == 'cifar100':
        print('Using CIFAR100 dataset')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    print('Using %s model' % args.model)
    if args.model == 'VGG16':
        model_cls = VGG16
    elif args.model == 'PreResNet110':
        model_cls = PreResNet110
    elif args.model == 'WideResNet28x10':
        model_cls = WideResNet28x10
    else:
        raise RuntimeError('Invalid model choice.')

    model = L.Classifier(model_cls(class_labels))
    if args.swa:
        # Independent copy that accumulates the running parameter average.
        swa_model = L.Classifier(model_cls(class_labels))
        swa_n = 0  # number of parameter sets averaged so far

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
        if args.swa:
            swa_model.to_gpu()

    # Data augmentation / preprocess
    train = TransformDataset(train, partial(transform, train=True))
    test = TransformDataset(test, partial(transform, train=False))

    optimizer = chainer.optimizers.MomentumSGD(args.lr_init, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(args.wd))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    # Single non-repeating pass over the training set, used only to refresh
    # the SWA model's batch-norm statistics before evaluation.
    swa_train_iter = chainer.iterators.SerialIterator(
        train, args.batchsize, repeat=False, shuffle=False)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    stop_trigger = (args.epoch, 'epoch')

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Learning rate adjustment (this function is called every epoch):
    # hold lr_init for the first half of the schedule, then anneal linearly
    # down to lr_ratio * lr_init by 90%, then hold.
    def lr_schedule(trainer):
        epoch = trainer.updater.epoch
        t = epoch / (args.swa_start if args.swa else args.epoch)
        lr_ratio = args.swa_lr / args.lr_init if args.swa else 0.01
        if t <= 0.5:
            factor = 1.0
        elif t <= 0.9:
            factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
        else:
            factor = lr_ratio
        trainer.updater.get_optimizer('main').lr = factor * args.lr_init

    # The main function for SWA (this function is called every epoch)
    def avg_weight(trainer):
        epoch = trainer.updater.epoch
        if args.swa and (epoch + 1) >= args.swa_start and \
                (epoch + 1 - args.swa_start) % args.swa_c_epochs == 0:
            nonlocal swa_n
            # Incremental moving average over all collected parameter sets.
            alpha = 1.0 / (swa_n + 1)
            for param1, param2 in zip(swa_model.params(), model.params()):
                param1.data *= (1.0 - alpha)
                param1.data += param2.data * alpha
            swa_n += 1

    # This function is called before evaluating the SWA model, to fix
    # batchnorm's running mean and variance.
    def fix_swa_batchnorm(evaluator):
        # Check whether the model contains any batchnorm layer.
        # BUG FIX: the original compared ``type(l) ==
        # L.normalization.batch_normalization.BatchNormalization``, which
        # misses subclasses; isinstance is the idiomatic (and safer) check.
        bn_flg = any(isinstance(l, L.BatchNormalization)
                     for l in swa_model.links())
        # Recompute the running statistics with one pass over training data.
        if bn_flg:
            swa_train_iter.reset()
            with chainer.using_config('train', True):
                for batch in swa_train_iter:
                    in_arrays = evaluator.converter(batch, evaluator.device)
                    with function.no_backprop_mode():
                        swa_model(*in_arrays)

    # Set up extentions
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu),
                   trigger=(5, 'epoch'))
    if args.swa:
        # Evaluate the SWA model every 5 epochs once averaging has started.
        eval_points = [x for x in range(args.epoch + 1)
                       if x > args.swa_start and x % 5 == 0]
        trainer.extend(
            SwaEvaluator(test_iter, swa_model, device=args.gpu,
                         eval_hook=fix_swa_batchnorm),
            trigger=triggers.ManualScheduleTrigger(eval_points, 'epoch'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(lr_schedule, trigger=triggers.IntervalTrigger(1, 'epoch'))
    trainer.extend(avg_weight, trigger=triggers.IntervalTrigger(1, 'epoch'))
    trainer.extend(extensions.observe_lr())
    trainer.extend(extensions.LogReport())
    cols = ['epoch', 'lr', 'main/loss', 'main/accuracy',
            'validation/main/loss', 'validation/main/accuracy',
            'elapsed_time']
    if args.swa:
        cols = cols[:-1] + ['swa/main/loss', 'swa/main/accuracy'] + cols[-1:]
    trainer.extend(extensions.PrintReport(cols))
    trainer.extend(extensions.ProgressBar())
    trainer.run()
def main():
    """ChainerMN multi-node ImageNet training entry point (GPU only)."""
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Start method of multiprocessing module need to be changed if we
    # are using InfiniBand and MultiprocessIterator. This is because
    # processes often crash when calling fork if they are using
    # Infiniband.  (c.f.,
    # https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    # Also, just setting the start method does not seem to be
    # sufficient to actually launch the forkserver processes, so also
    # start a dummy process.
    # See also our document:
    # https://chainermn.readthedocs.io/en/stable/tutorial/tips_faqs.html#using-multiprocessiterator
    # This must be done *before* ``chainermn.create_communicator``!!!
    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process(target=lambda *x: x, args=())
    p.start()
    p.join()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    # One process per GPU: the device id is the rank within the node.
    device = comm.intra_rank

    # Only rank 0 prints the run configuration.
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(
            args.val, args.root, mean, model.insize, False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # A workaround for processes crash should be done before making
    # communicator above, when using fork (e.g. MultiProcessIterator)
    # along with Infiniband.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # In --test mode everything fires every 10 iterations for a quick smoke
    # run; otherwise once per epoch.
    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def train(args):
    """Build a full Chainer training run from a YAML config and execute it.

    Args:
        args: Parsed command-line namespace providing ``config`` (YAML path),
            ``result_dir`` (output dir or None), ``gpus`` (list of device
            ids, first is 'main') and ``resume`` (snapshot path or None).

    Returns:
        int: 0 on success.
    """
    # BUG FIX: the original did ``yaml.load(open(args.config))``, which leaks
    # the file handle and relies on the unsafe default loader (removed in
    # PyYAML >= 5.1 without an explicit Loader).  The config is plain data,
    # so safe_load suffices.
    with open(args.config) as f:
        config = yaml.safe_load(f)
    print('==========================================')

    # Set workspace size
    if 'max_workspace_size' in config:
        chainer.cuda.set_max_workspace_size(config['max_workspace_size'])

    # Output version info
    print('chainer version: {}'.format(chainer.__version__))
    print('cuda: {}, cudnn: {}, nccl: {}'.format(
        chainer.cuda.available, chainer.cuda.cudnn_enabled, HAVE_NCCL))

    # Create result_dir
    if args.result_dir is not None:
        config['result_dir'] = args.result_dir
    else:
        config['result_dir'] = create_result_dir_from_config_path(args.config)
    log_fn = save_config_get_log_fn(config['result_dir'], args.config)
    print('result_dir:', config['result_dir'])

    # Instantiate model
    model = get_model_from_config(config)
    print('model:', model.__class__.__name__)

    # Initialize optimizer
    optimizer = get_optimizer_from_config(model, config)
    print('optimizer:', optimizer.__class__.__name__)

    # Setting up datasets
    train_dataset, valid_dataset = get_dataset_from_config(config)
    print('train_dataset: {}'.format(len(train_dataset)),
          train_dataset.__class__.__name__)
    print('valid_dataset: {}'.format(len(valid_dataset)),
          valid_dataset.__class__.__name__)

    # Prepare devices: the first GPU is 'main', the rest are named 'gpuN'.
    devices = {'main': args.gpus[0]}
    for gid in args.gpus[1:]:
        devices['gpu{}'.format(gid)] = gid

    # Create iterators
    train_iter, valid_iter = create_iterators(
        train_dataset, config['dataset']['train']['batchsize'],
        valid_dataset, config['dataset']['valid']['batchsize'], devices)
    print('train_iter:', train_iter.__class__.__name__)
    print('valid_iter:', valid_iter.__class__.__name__)

    # Create updater
    updater_creator = get_updater_creator_from_config(config)
    updater = updater_creator(train_iter, optimizer, devices)
    print('updater:', updater.__class__.__name__)

    # Create trainer
    trainer = training.Trainer(updater, config['stop_trigger'],
                               out=config['result_dir'])
    print('Trainer stops:', config['stop_trigger'])

    # Trainer extensions, driven by the config's ``trainer_extension`` list.
    # Each element is a single-key mapping {extension_name: kwargs}.
    for ext in config['trainer_extension']:
        ext, values = ext.popitem()
        if ext == 'LogReport':
            trigger = values['trigger']
            trainer.extend(
                extensions.LogReport(trigger=trigger, log_name=log_fn))
        elif ext == 'observe_lr':
            trainer.extend(extensions.observe_lr(), trigger=values['trigger'])
        elif ext == 'dump_graph':
            trainer.extend(extensions.dump_graph(**values))
        elif ext == 'Evaluator':
            evaluator_creator = get_evaluator_creator_from_config(values)
            evaluator = evaluator_creator(valid_iter, model, devices)
            trainer.extend(evaluator, trigger=values['trigger'],
                           name=values['prefix'])
        elif ext == 'PlotReport':
            trainer.extend(extensions.PlotReport(**values))
        elif ext == 'PrintReport':
            trigger = values.pop('trigger')
            trainer.extend(extensions.PrintReport(**values), trigger=trigger)
        elif ext == 'ProgressBar':
            upd_int = values['update_interval']
            trigger = values['trigger']
            trainer.extend(extensions.ProgressBar(update_interval=upd_int),
                           trigger=trigger)
        elif ext == 'snapshot':
            filename = values['filename']
            trigger = values['trigger']
            trainer.extend(extensions.snapshot(filename=filename),
                           trigger=trigger)
        elif ext == 'ParameterStatistics':
            # Resolve dotted link names (e.g. "block1.bn") relative to
            # model.predictor.
            links = []
            for link_name in values.pop('links'):
                lns = [ln.strip() for ln in link_name.split('.') if ln.strip()]
                target = model.predictor
                for ln in lns:
                    target = getattr(target, ln)
                links.append(target)
            trainer.extend(extensions.ParameterStatistics(links, **values))
        elif ext == 'custom':
            custom_extension = get_custum_extension_from_config(values)
            trainer.extend(custom_extension, trigger=values['trigger'])

    # LR decay at manually scheduled points.
    if 'lr_drop_ratio' in config['optimizer'] \
            and 'lr_drop_triggers' in config['optimizer']:
        ratio = config['optimizer']['lr_drop_ratio']
        points = config['optimizer']['lr_drop_triggers']['points']
        unit = config['optimizer']['lr_drop_triggers']['unit']
        drop_trigger = triggers.ManualScheduleTrigger(points, unit)

        def lr_drop(trainer):
            trainer.updater.get_optimizer('main').lr *= ratio

        trainer.extend(lr_drop, trigger=drop_trigger)

    # Resume: back up the snapshot before loading it into the trainer.
    if args.resume is not None:
        fn = '{}.bak'.format(args.resume)
        shutil.copy(args.resume, fn)
        serializers.load_npz(args.resume, trainer)
        print('Resumed from:', args.resume)

    print('==========================================')
    trainer.run()
    return 0
def main():
    """Single-node multi-GPU ImageNet training with
    MultiprocessParallelUpdater."""
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--gpus', '-g', type=int, nargs="*",
                        default=[0, 1, 2, 3])
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Initialize the model to train, optionally from a snapshot.
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    # Datasets share the precomputed channel mean.
    mean = np.load(args.mean)
    train = train_imagenet.PreprocessedDataset(
        args.train, args.root, mean, model.insize)
    val = train_imagenet.PreprocessedDataset(
        args.val, args.root, mean, model.insize, False)

    # One training iterator per GPU; these iterators load the images with
    # subprocesses running in parallel to the training/validation.
    devices = tuple(args.gpus)
    shards = chainer.datasets.split_dataset_n_random(train, len(devices))
    train_iters = [
        chainer.iterators.MultiprocessIterator(
            shard, args.batchsize, n_processes=args.loaderjob)
        for shard in shards
    ]
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = updaters.MultiprocessParallelUpdater(
        train_iters, optimizer, devices=devices)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'

    trainer.extend(
        train_imagenet.TestModeEvaluator(val_iter, model,
                                         device=args.gpus[0]),
        trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=2))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
# NOTE(review): this is the interior of an enclosing function whose `def` is
# outside this chunk — `args`, `train_iter`, `test_iter`, `model`,
# `optimizer` and `label_names` are defined earlier in that function.
# `trainer.run()` is presumably called after this span; confirm in the full
# file.
updater = training.StandardUpdater(
    train_iter, optimizer, device=args.gpu)
trainer = training.Trainer(
    updater, (args.iteration, "iteration"), args.out)

# Periodic VOC-style mAP evaluation.
val_interval = args.val_iter, "iteration"
trainer.extend(
    DetectionVOCEvaluator(
        test_iter, model, use_07_metric=True, label_names=label_names),
    trigger=val_interval)

log_interval = args.log_iter, "iteration"
trainer.extend(extensions.LogReport(trigger=log_interval))
trainer.extend(extensions.observe_lr(), trigger=log_interval)
trainer.extend(extensions.PrintReport(
    ['epoch', 'iteration', 'lr',
     'main/loss', 'main/loss/loc', 'main/loss/conf',
     'validation/main/map']), trigger=log_interval)
trainer.extend(extensions.ProgressBar(update_interval=10))

# Full-trainer snapshot at the validation interval; model-only snapshot on
# its own schedule.
trainer.extend(extensions.snapshot(), trigger=val_interval)
trainer.extend(
    extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
    trigger=(args.model_iter, 'iteration'))

if args.resume:
    serializers.load_npz(args.resume, trainer)
def main():
    """Train SegNetBasic on CamVid and save the trained predictor snapshot."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--gpu', type=int, default=-1)
    arg_parser.add_argument('--batchsize', type=int, default=12)
    arg_parser.add_argument('--class_weight', type=str, default='class_weight.npy')
    arg_parser.add_argument('--out', type=str, default='result')
    opts = arg_parser.parse_args()

    # Schedule: log every 50 iterations, validate every 2000, stop at 16000.
    log_trigger = (50, 'iteration')
    validation_trigger = (2000, 'iteration')
    end_trigger = (16000, 'iteration')

    # Datasets and their iterators.
    train_data = TransformDataset(CamVidDataset(split='train'), transform)
    val_data = CamVidDataset(split='val')
    train_iter = iterators.MultiprocessIterator(train_data, opts.batchsize)
    val_iter = iterators.MultiprocessIterator(
        val_data, opts.batchsize, shuffle=False, repeat=False)

    # Model: SegNetBasic wrapped in a pixelwise softmax classifier that
    # weights classes by the precomputed class-frequency weights.
    model = PixelwiseSoftmaxClassifier(
        SegNetBasic(n_class=11), class_weight=np.load(opts.class_weight))
    if opts.gpu >= 0:
        # Make the chosen GPU current, then move the model onto it.
        chainer.cuda.get_device_from_id(opts.gpu).use()
        model.to_gpu()

    # Momentum SGD with weight decay.
    optimizer = optimizers.MomentumSGD(lr=0.1, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))

    updater = training.StandardUpdater(train_iter, optimizer, device=opts.gpu)
    trainer = training.Trainer(updater, end_trigger, out=opts.out)

    trainer.extend(extensions.LogReport(trigger=log_trigger))
    trainer.extend(extensions.observe_lr(), trigger=log_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))

    if extensions.PlotReport.available():
        trainer.extend(extensions.PlotReport(
            ['main/loss'], x_key='iteration', file_name='loss.png'))
        trainer.extend(extensions.PlotReport(
            ['validation/main/miou'], x_key='iteration', file_name='miou.png'))

    # Only the predictor (not the classifier wrapper) is snapshotted, once at
    # the end of training.
    trainer.extend(
        extensions.snapshot_object(
            model.predictor, filename='model_iteration-{.updater.iteration}'),
        trigger=end_trigger)
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'iteration', 'elapsed_time', 'lr',
             'main/loss', 'validation/main/miou',
             'validation/main/mean_class_accuracy',
             'validation/main/pixel_accuracy']),
        trigger=log_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(
        SemanticSegmentationEvaluator(
            val_iter, model.predictor, camvid_label_names),
        trigger=validation_trigger)

    trainer.run()
def main():
    """Train SSD300/SSD512 on VOC2007+2012 trainval; evaluate on VOC2007 test.

    Command-line flags select the model variant, batch size, GPU id, output
    directory, and an optional trainer snapshot to resume from.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model', choices=('ssd300', 'ssd512'), default='ssd300')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    # Build the detector from ImageNet-pretrained weights; argparse `choices`
    # guarantees one of the two branches is taken.
    if args.model == 'ssd300':
        model = SSD300(
            n_fg_class=len(voc_bbox_label_names),
            pretrained_model='imagenet')
    elif args.model == 'ssd512':
        model = SSD512(
            n_fg_class=len(voc_bbox_label_names),
            pretrained_model='imagenet')

    model.use_preset('evaluate')
    # The train chain computes the multibox (localization + confidence) loss.
    train_chain = MultiboxTrainChain(model)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # VOC2007 + VOC2012 trainval, with the SSD data-augmentation transform.
    train = TransformDataset(
        ConcatenatedDataset(
            VOCBboxDataset(year='2007', split='trainval'),
            VOCBboxDataset(year='2012', split='trainval')
        ),
        Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)

    test = VOCBboxDataset(
        year='2007', split='test',
        use_difficult=True, return_difficult=True)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            # Biases: doubled gradient instead of weight decay.
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (120000, 'iteration'), args.out)
    # Decay lr by 10x at 80k and 100k iterations (standard SSD schedule).
    trainer.extend(
        extensions.ExponentialShift('lr', 0.1, init=1e-3),
        trigger=triggers.ManualScheduleTrigger([80000, 100000], 'iteration'))

    trainer.extend(
        DetectionVOCEvaluator(
            test_iter, model, use_07_metric=True,
            label_names=voc_bbox_label_names),
        trigger=(10000, 'iteration'))

    log_interval = 10, 'iteration'
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'lr',
         'main/loss', 'main/loss/loc', 'main/loss/conf',
         'validation/main/map']),
        trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    # Full trainer snapshot every 10k iterations; model-only snapshot at the
    # final (120k) iteration.
    trainer.extend(extensions.snapshot(), trigger=(10000, 'iteration'))
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=(120000, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
def setup_trainer(self):
    """Build ``self.updater`` and ``self.trainer`` with all extensions.

    Expects ``self.train_iterator``, ``self.val_iterator``,
    ``self.optimizer``, ``self.model``, ``self.train_dataset`` and the
    various ``*_interval`` / ``*_interval_type`` attributes to be set
    beforehand (presumably by __init__ — verify against the caller).
    """
    # Batch converter: zero-pad, concatenate indices 0/2/3/4, and send only
    # the image and bbox arrays to the device.
    converter = functools.partial(
        cmr.datasets.concat_examples,
        padding=0,
        # img, bboxes, labels, masks, scales
        indices_concat=[0, 2, 3, 4],  # img, _, labels, masks, scales
        indices_to_device=[0, 1],  # img, bbox
    )

    self.updater = chainer.training.updater.StandardUpdater(
        self.train_iterator, self.optimizer, device=self.gpu,
        converter=converter)

    self.trainer = chainer.training.Trainer(
        self.updater, (self.max_epoch, 'epoch'), out=self.out_dir)

    # LR decay points: the canonical 120k/160k-of-180k iteration schedule,
    # rescaled to fractions of max_epoch.
    step_size = [
        (120e3 / 180e3) * self.max_epoch,
        (160e3 / 180e3) * self.max_epoch,
    ]
    self.trainer.extend(
        extensions.ExponentialShift('lr', 0.1),
        trigger=chainer.training.triggers.ManualScheduleTrigger(
            step_size, 'epoch'))

    # Periodic instance-segmentation evaluation on the validation iterator.
    evaluator = cmr.extensions.InstanceSegmentationVOCEvaluator(
        self.val_iterator, self.model.mask_rcnn, device=self.gpu,
        use_07_metric=True, label_names=self.train_dataset.fg_class_names)
    self.trainer.extend(
        evaluator, trigger=(self.eval_interval, self.eval_interval_type))

    # Save snapshot — model weights only, and only when validation mAP
    # reaches a new maximum.
    self.trainer.extend(
        extensions.snapshot_object(
            self.model.mask_rcnn, 'snapshot_model.npz'),
        trigger=chainer.training.triggers.MaxValueTrigger(
            'validation/main/map',
            (self.save_interval, self.save_interval_type)))

    # Dump network architecture
    self.trainer.extend(
        extensions.dump_graph(
            root_name='main/loss',
            out_name='network_architecture.dot'))

    # Logging
    self.trainer.extend(
        extensions.ProgressBar(
            update_interval=self.progressbar_update_interval))
    self.trainer.extend(
        extensions.observe_lr(),
        trigger=(self.log_interval, self.log_interval_type))
    self.trainer.extend(
        extensions.LogReport(
            log_name='log.json',
            trigger=(self.log_interval, self.log_interval_type)))
    self.trainer.extend(
        extensions.PrintReport([
            'iteration',
            'epoch',
            'elapsed_time',
            'lr',
            'main/loss',
            'main/roi_loc_loss',
            'main/roi_cls_loss',
            'main/roi_mask_loss',
            'main/rpn_loc_loss',
            'main/rpn_cls_loss',
            'validation/main/map',
        ]), trigger=(self.print_interval, self.print_interval_type))

    # Plot
    self.trainer.extend(
        extensions.PlotReport([
            'main/loss',
            'main/roi_loc_loss',
            'main/roi_cls_loss',
            'main/roi_mask_loss',
            'main/rpn_loc_loss',
            'main/rpn_cls_loss',
        ], file_name='loss_plot.png',
            x_key=self.plot_interval_type,
            trigger=(self.plot_interval, self.plot_interval_type)),
        trigger=(self.plot_interval, self.plot_interval_type))
    self.trainer.extend(
        extensions.PlotReport(
            ['validation/main/map'],
            file_name='accuracy_plot.png',
            x_key=self.plot_interval_type,
            trigger=(self.plot_interval, self.plot_interval_type)),
        trigger=(self.eval_interval, self.eval_interval_type))

    # Dump params (experiment configuration) to params.yaml for provenance.
    params = dict()
    params['model_name'] = self.model_name
    params['train_dataset_dir'] = self.train_dataset_dir
    params['val_dataset_dir'] = self.val_dataset_dir
    params['fg_class_names'] = self.train_dataset.fg_class_names
    params['timestamp'] = self.timestamp_iso
    params['out_dir'] = self.out_dir
    params['gpu'] = self.gpu
    params['batch_size'] = self.batch_size
    params['max_epoch'] = self.max_epoch
    params['lr'] = self.lr
    params['weight_decay'] = self.weight_decay
    self.trainer.extend(
        fcn.extensions.ParamsReport(params, file_name='params.yaml'))

    # Dump param for mask_rcnn_instance_segmentation.py
    target_names = dict()
    target_names['fg_class_names'] = self.train_dataset.fg_class_names
    self.trainer.extend(
        fcn.extensions.ParamsReport(
            target_names, file_name='fg_class_names.yaml'))
def train_mode(updater, mode, lr_drop_iter, snapshot_iter, report_iter,
               stop_iter):
    """Run one Faster R-CNN training stage and tear the trainer down.

    Args:
        updater: Chainer updater whose 'main' optimizer targets the model.
        mode: ``'rpn'`` or ``'rcnn'`` — which sub-network to train.
        lr_drop_iter: iteration interval for the lr-drop extension.
        snapshot_iter: iteration interval for trainer/model snapshots.
        report_iter: iteration interval for logging/printing/plotting.
        stop_iter: total number of iterations to run.

    Raises:
        ValueError: if ``mode`` is not ``'rpn'`` or ``'rcnn'``.

    NOTE(review): relies on module-level globals ``args``, ``model`` and
    ``create_lrdrop_ext`` — confirm they exist where this is called.
    """
    report_trigger = (report_iter, 'iteration')
    trainer = training.Trainer(updater, (stop_iter, 'iteration'),
                               out='results')
    trainer.extend(extensions.LogReport(trigger=report_trigger))
    trainer.extend(extensions.observe_lr(), trigger=report_trigger)
    trainer.extend(create_lrdrop_ext(args.gamma),
                   trigger=(lr_drop_iter, 'iteration'))

    # Per-mode configuration.  The original code duplicated the extension
    # setup in both branches and, in the 'rcnn' branch, kept the RPN plot /
    # graph keys and 'rpn_*' snapshot names by copy-paste; fixed here.
    if mode == 'rpn':
        updater.get_optimizer('main').target.rpn_train = True
        report_entries = [
            'epoch', 'iteration',
            'main/RPN/rpn_loss',
            'main/RPN/rpn_loss_cls',
            'main/RPN/rpn_cls_accuracy',
            'main/RPN/rpn_loss_bbox',
            'elapsed_time', 'lr',
        ]
        plot_key = 'main/RPN/rpn_loss'
        graph_out = 'rpn_loss.dot'
        snapshot_prefix = 'rpn'
    elif mode == 'rcnn':
        updater.get_optimizer('main').target.rcnn_train = True
        report_entries = [
            'epoch', 'iteration',
            'main/loss_cls',
            'main/cls_accuracy',
            'main/loss_bbox',
            'main/loss_rcnn',
            'elapsed_time', 'lr',
        ]
        plot_key = 'main/loss_rcnn'
        graph_out = 'rcnn_loss.dot'
        snapshot_prefix = 'rcnn'
    else:
        raise ValueError(
            "mode must be 'rpn' or 'rcnn', got {!r}".format(mode))

    trainer.extend(extensions.PrintReport(report_entries),
                   trigger=report_trigger)
    trainer.extend(extensions.ProgressBar(), trigger=report_trigger)
    trainer.extend(extensions.PlotReport([plot_key],
                                         trigger=report_trigger))
    trainer.extend(extensions.dump_graph(plot_key, out_name=graph_out))

    # Snapshot both the whole trainer state and the model weights.
    trainer.extend(
        extensions.snapshot(
            filename=snapshot_prefix + '_trainer_snapshot_{.updater.iteration}'),
        trigger=(snapshot_iter, 'iteration'))
    trainer.extend(
        extensions.snapshot_object(
            model,
            snapshot_prefix + '_model_snapshot_{.updater.iteration}'),
        trigger=(snapshot_iter, 'iteration'))

    trainer.run()
    # Release the trainer (and its extensions) before the next stage starts.
    del trainer
optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4)) dataset = Dataset(2048, args.boardnorm) iter_ = chainer.iterators.SerialIterator(dataset, args.batchsize) print( 'chance rate: ', sum(dataset[i][1].mean() for i in range(len(dataset))) / len(dataset)) updater = chainer.training.StandardUpdater(iter_, optimizer, device=args.gpu) trainer = chainer.training.Trainer(updater, (15000, 'iteration'), out=args.out) trainer.extend(extensions.snapshot_object( model.model, filename='model_iter_{.updater.iteration}'), trigger=(15000, 'iteration')) trainer.extend(extensions.ExponentialShift('lr', 0.1, init=0.1), trigger=(10000, 'iteration')) log_interval = (10, 'iteration') trainer.extend(extensions.observe_lr(), trigger=log_interval) trainer.extend(extensions.LogReport(trigger=log_interval)) trainer.extend( extensions.PrintReport( ['iteration', 'lr', 'main/loss', 'main/accuracy'])) trainer.extend(extensions.ProgressBar(update_interval=10)) trainer.run()
def main():
    """Train SSD300 on VOC2007+2012 trainval (older ChainerCV API names).

    Same schedule as the reference SSD training script: 120k iterations,
    lr 1e-3 decayed 10x at 80k/100k, evaluation every 10k iterations.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    model = SSD300(
        n_fg_class=len(voc_detection_label_names),
        pretrained_model='imagenet')
    model.use_preset('evaluate')
    # Wrap the detector in the multibox-loss training chain.
    train_chain = MultiboxTrainChain(model)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    # VOC2007 + VOC2012 trainval with the SSD augmentation transform.
    train = TransformDataset(
        ConcatenatedDataset(
            VOCDetectionDataset(year='2007', split='trainval'),
            VOCDetectionDataset(year='2012', split='trainval')),
        Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)

    test = VOCDetectionDataset(
        year='2007', split='test',
        use_difficult=True, return_difficult=True)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            # Biases: doubled gradient instead of weight decay.
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (120000, 'iteration'), args.out)
    trainer.extend(
        extensions.ExponentialShift('lr', 0.1, init=1e-3),
        trigger=triggers.ManualScheduleTrigger([80000, 100000], 'iteration'))

    trainer.extend(
        DetectionVOCEvaluator(
            test_iter, model, use_07_metric=True,
            label_names=voc_detection_label_names),
        trigger=(10000, 'iteration'))

    log_interval = 10, 'iteration'
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'lr',
        'main/loss', 'main/loss/loc', 'main/loss/conf',
        'validation/main/map'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    # Trainer snapshot every 10k iterations; model-only snapshot at the end.
    trainer.extend(extensions.snapshot(), trigger=(10000, 'iteration'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
        trigger=(120000, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Entry point: train CAEFINet for video frame interpolation on UCF101.

    Parses hyperparameters from the command line, trains with Adam or
    momentum SGD, logs/plots losses and PSNR, and finally saves the trained
    model plus a JSON description of its architecture parameters.
    """
    # Argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate', '-l', type=float, default=0.001,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU1 ID (negative value indicates CPU)')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--iter_parallel', '-p', action='store_true',
                        default=False, help='loading dataset from disk')
    parser.add_argument('--test', action='store_true', default=False,
                        help='Test Mode, a few dataset')
    parser.add_argument('--opt', '-o', type=str,
                        choices=('adam', 'sgd'), default='adam')
    parser.add_argument('--fsize', '-f', type=int, default=5)
    parser.add_argument('--ch', '-c', type=int, default=4)
    parser.add_argument('--decay', '-d', type=str, default='exp',
                        choices=('exp', 'lin'))
    parser.add_argument('--weight', '-w', type=float, default=1.0)
    args = parser.parse_args()

    # Print the effective training parameters.
    print("-=Learning Parameter=-")
    print("# Max Epochs: {}".format(args.epoch))
    print("# Batch Size: {}".format(args.batchsize))
    print("# Learning Rate: {}".format(args.learnrate))
    print("# Optimizer Method: {}".format(args.opt))
    print("# Filter Size: {}".format(args.fsize))
    print("# Channel Scale: {}".format(args.ch))
    print("# coef. decay : {}".format(args.decay))
    print("# contloss' weight : {}".format(args.weight))
    print('# Train Dataet: General 100')
    if args.iter_parallel:
        print("# Data Iters that loads in Parallel")
    print("\n")

    # Save directory: results/FI/CAEFINet/<run-name built from hyperparams>.
    model_dir_name = 'CAEFINet_opt_{}_ch_{}_fsize_{}_decay_{}_weight_{}'.format(
        args.opt, args.ch, args.fsize, args.decay, args.weight)
    outdir = path.join(ROOT_PATH, 'results', 'FI', 'CAEFINet', model_dir_name)
    if not path.exists(outdir):
        os.makedirs(outdir)
    # Dump all CLI arguments for reproducibility.
    with open(path.join(outdir, 'arg_param.txt'), 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}:{}\n'.format(k, v))

    # Loading dataset (small fixture set in --test mode).
    if args.test:
        print('# loading test dataet(UCF101_minimam_test_size64_frame3_group2_max4_p) ...')
        train_dataset = 'UCF101_minimam_test_size64_frame3_group2_max4_p'
        test_dataset = 'UCF101_minimam_test_size64_frame3_group2_max4_p'
    else:
        print('# loading test dataet(UCF101_train_size64_frame3_group10_max100_p, UCF101_test_size64_frame3_group25_max5_p) ...')
        train_dataset = 'UCF101_train_size64_frame3_group10_max100_p'
        test_dataset = 'UCF101_test_size64_frame3_group25_max5_p'
    # Disk-backed dataset when iterating in parallel, in-memory otherwise.
    if args.iter_parallel:
        train = ds.SequenceDataset(dataset=train_dataset)
        test = ds.SequenceDataset(dataset=test_dataset)
    else:
        train = ds.SequenceDatasetOnMem(dataset=train_dataset)
        test = ds.SequenceDatasetOnMem(dataset=test_dataset)

    # prepare model (VGG16 weights are used inside the net — see N.CAEFINet)
    model = N.CAEFINet(
        vgg_path=path.join(ROOT_PATH, 'models', 'VGG16.npz'),
        f_size=args.fsize, n_ch=args.ch, size=64)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # setup optimizer (argparse `choices` guarantees one branch is taken)
    if args.opt == 'adam':
        optimizer = chainer.optimizers.Adam(alpha=args.learnrate)
    elif args.opt == 'sgd':
        optimizer = chainer.optimizers.MomentumSGD(
            lr=args.learnrate, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    # setup iterators
    if args.iter_parallel:
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=8)
        test_iter = chainer.iterators.MultiprocessIterator(
            test, args.batchsize, repeat=False, shuffle=False, n_processes=8)
    else:
        train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
        test_iter = chainer.iterators.SerialIterator(
            test, args.batchsize, repeat=False, shuffle=False)

    # setup trainer; the model supplies its own loss function
    updater = training.StandardUpdater(
        train_iter, optimizer, device=args.gpu,
        loss_func=model.get_loss_func(weight=args.weight,
                                      coef_decay=args.decay))
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=outdir)

    # evaluate on test data with the same loss function
    trainer.extend(extensions.Evaluator(
        test_iter, model, device=args.gpu,
        eval_func=model.get_loss_func(weight=args.weight,
                                      coef_decay=args.decay)))
    # dump loss graph
    trainer.extend(extensions.dump_graph('main/loss'))
    # lr shift: 10x decay every 50 epochs (the decayed hyperparameter name
    # differs between SGD and Adam)
    if args.opt == 'sgd':
        trainer.extend(extensions.ExponentialShift("lr", 0.1),
                       trigger=(50, 'epoch'))
    elif args.opt == 'adam':
        trainer.extend(extensions.ExponentialShift("alpha", 0.1),
                       trigger=(50, 'epoch'))
    # save snapshot every 10 epochs (trainer state and model weights)
    trainer.extend(extensions.snapshot(), trigger=(10, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_snapshot_{.updater.epoch}'), trigger=(10, 'epoch'))
    # log report
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))
    # plot loss graphs
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              'epoch', file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch', file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/cont_loss', 'validation/main/cont_loss'],
                              'epoch', file_name='cont_loss.png'))
    # plot accuracy (PSNR) graph
    trainer.extend(
        extensions.PlotReport(['main/psnr', 'validation/main/psnr'],
                              'epoch', file_name='PSNR.png'))
    # print info
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/mse_loss', 'validation/main/mse_loss',
         'main/cont_loss', 'validation/main/cont_loss',
         'main/psnr', 'validation/main/psnr',
         'lr', 'elapsed_time']))
    # print progbar
    trainer.extend(extensions.ProgressBar())
    # [ChainerUI] enable to send commands from ChainerUI
    trainer.extend(CommandsExtension())
    # [ChainerUI] save 'args' to show experimental conditions
    save_args(args, outdir)

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    # save final model weights plus a JSON description of the architecture
    model_outdir = path.join(ROOT_PATH, 'models', model_dir_name)
    if not path.exists(model_outdir):
        os.makedirs(model_outdir)
    model_name = 'CAEFINet_{}_ch_{}_fsize_{}_decay_{}_weight_{}.npz'.format(
        args.opt, args.ch, args.fsize, args.decay, args.weight)
    chainer.serializers.save_npz(path.join(model_outdir, model_name), model)

    model_parameter = {
        'name': 'CAEFINetConcat',
        'parameter': {'f_size': args.fsize, 'ch': args.ch}
    }
    with open(path.join(model_outdir, 'model_parameter.json'), 'w') as f:
        json.dump(model_parameter, f)
def main():
    """Train an ImageNet (ILSVRC2012) convnet chosen from a registry of
    architectures, using single-GPU (or CPU) StandardUpdater training.
    """
    # Registry mapping CLI architecture names to model classes.
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnet50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(
            args.gpu).use()  # Make the GPU current
        model.to_gpu()

    # Load the datasets and mean file
    mean = np.load(args.mean)
    train = PreprocessedDataset(args.train, args.root, mean, model.insize)
    val = PreprocessedDataset(args.val, args.root, mean, model.insize, False)
    # These iterators load the images with subprocesses running in parallel
    # to the training/validation.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # In --test mode run the expensive extensions almost every iteration so a
    # full cycle can be exercised quickly.
    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = (1 if args.test else 1000), 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def get_trainer(args):
    """Build a ChainerMN multi-node Trainer from a YAML config file.

    Args:
        args: parsed CLI namespace; reads ``config``, ``gpu``,
            ``communicator``, ``result_dir`` and ``resume``.

    Returns:
        A fully configured ``chainer.training.Trainer``.
    """
    # Fix: close the config file deterministically (the original leaked the
    # handle via yaml.load(open(...))).
    # NOTE(review): yaml.load without an explicit Loader can execute
    # arbitrary Python on a malicious config — consider yaml.safe_load if
    # the config format allows it.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    # Set workspace size
    if 'max_workspace_size' in config:
        chainer.cuda.set_max_workspace_size(config['max_workspace_size'])

    # Prepare ChainerMN communicator
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    # Show the setup information (rank 0 only)
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        if args.gpu:
            print('Using GPUs - max workspace size:',
                  chainer.cuda.get_max_workspace_size())
        print('Using {} communicator'.format(args.communicator))

    # Output version info
    if comm.rank == 0:
        print('Chainer version: {}'.format(chainer.__version__))
        print('ChainerMN version: {}'.format(chainermn.__version__))
        print('cuda: {}, cudnn: {}'.format(
            chainer.cuda.available, chainer.cuda.cudnn_enabled))

    # Create result_dir; when resuming into an existing dir, make the model
    # module importable from there.
    if args.result_dir is not None:
        config['result_dir'] = args.result_dir
        model_fn = config['model']['module'].split('.')[-1]
        sys.path.insert(0, args.result_dir)
        config['model']['module'] = model_fn
    else:
        config['result_dir'] = create_result_dir_from_config_path(args.config)
    log_fn = save_config_get_log_fn(config['result_dir'], args.config)
    if comm.rank == 0:
        print('result_dir:', config['result_dir'])

    # Instantiate model
    model = get_model_from_config(config, comm)
    if args.gpu:
        chainer.cuda.get_device(device).use()
        model.to_gpu()
    if comm.rank == 0:
        print('model:', model.__class__.__name__)

    # Initialize optimizer (wrapped for multi-node gradient exchange)
    optimizer = get_optimizer_from_config(model, config)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    if comm.rank == 0:
        print('optimizer:', optimizer.__class__.__name__)

    # Setting up datasets: rank 0 loads them, then they are scattered.
    if comm.rank == 0:
        train_dataset, valid_dataset = get_dataset_from_config(config)
        print('train_dataset: {}'.format(len(train_dataset)),
              train_dataset.__class__.__name__)
        print('valid_dataset: {}'.format(len(valid_dataset)),
              valid_dataset.__class__.__name__)
    else:
        train_dataset, valid_dataset = [], []
    train_dataset = chainermn.scatter_dataset(train_dataset, comm)
    valid_dataset = chainermn.scatter_dataset(valid_dataset, comm)

    # Create iterators
    # multiprocessing.set_start_method('forkserver')
    train_iter, valid_iter = create_iterators(
        train_dataset, valid_dataset, config)
    if comm.rank == 0:
        print('train_iter:', train_iter.__class__.__name__)
        print('valid_iter:', valid_iter.__class__.__name__)

    # Create updater (custom creator takes precedence when configured)
    if 'updater_creator' in config:
        updater_creator = get_updater_creator_from_config(config)
        updater = updater_creator(train_iter, optimizer, device=device)
    else:
        updater = create_updater(train_iter, optimizer, device=device)
    if comm.rank == 0:
        print('updater:', updater.__class__.__name__)

    # Create Trainer
    trainer = training.Trainer(
        updater, config['stop_trigger'], out=config['result_dir'])
    if comm.rank == 0:
        print('Trainer stops:', config['stop_trigger'])

    # Trainer extensions: most run on rank 0 only; the Evaluator runs on
    # every rank and is wrapped as a multi-node evaluator.
    for ext in config['trainer_extension']:
        ext, values = ext.popitem()
        if ext == 'LogReport' and comm.rank == 0:
            trigger = values['trigger']
            trainer.extend(
                extensions.LogReport(trigger=trigger, log_name=log_fn))
        elif ext == 'observe_lr' and comm.rank == 0:
            trainer.extend(extensions.observe_lr(),
                           trigger=values['trigger'])
        elif ext == 'dump_graph' and comm.rank == 0:
            trainer.extend(extensions.dump_graph(**values))
        elif ext == 'Evaluator':
            assert 'module' in values
            mod = import_module(values['module'])
            evaluator = getattr(mod, values['name'])
            # The stock Evaluator wraps the whole (loss-reporting) model;
            # custom evaluators receive the bare predictor.
            if evaluator is extensions.Evaluator:
                evaluator = evaluator(valid_iter, model, device=device)
            else:
                evaluator = evaluator(valid_iter, model.predictor)
            evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
            trainer.extend(evaluator, trigger=values['trigger'],
                           name=values['prefix'])
        elif ext == 'PlotReport' and comm.rank == 0:
            trainer.extend(extensions.PlotReport(**values))
        elif ext == 'PrintReport' and comm.rank == 0:
            trigger = values.pop('trigger')
            trainer.extend(extensions.PrintReport(**values), trigger=trigger)
        elif ext == 'ProgressBar' and comm.rank == 0:
            upd_int = values['update_interval']
            trigger = values['trigger']
            trainer.extend(extensions.ProgressBar(update_interval=upd_int),
                           trigger=trigger)
        elif ext == 'snapshot' and comm.rank == 0:
            filename = values['filename']
            trigger = values['trigger']
            trainer.extend(extensions.snapshot(filename=filename),
                           trigger=trigger)

    # LR decay: stepwise drop at configured points...
    if 'lr_drop_ratio' in config['optimizer'] \
            and 'lr_drop_triggers' in config['optimizer']:
        ratio = config['optimizer']['lr_drop_ratio']
        points = config['optimizer']['lr_drop_triggers']['points']
        unit = config['optimizer']['lr_drop_triggers']['unit']
        drop_trigger = triggers.ManualScheduleTrigger(points, unit)

        def lr_drop(trainer):
            trainer.updater.get_optimizer('main').lr *= ratio

        trainer.extend(lr_drop, trigger=drop_trigger)

    # ...or polynomial decay applied every iteration.
    if 'lr_drop_poly_power' in config['optimizer']:
        power = config['optimizer']['lr_drop_poly_power']
        stop_trigger = config['stop_trigger']
        batchsize = train_iter.batch_size
        len_dataset = len(train_dataset)
        trainer.extend(
            PolynomialShift('lr', power, stop_trigger, batchsize,
                            len_dataset),
            trigger=(1, 'iteration'))

    # Resume
    if args.resume is not None:
        # fn = '{}.bak'.format(args.resume)
        # shutil.copy(args.resume, fn)
        serializers.load_npz(args.resume, trainer)
        if comm.rank == 0:
            print('Resumed from:', args.resume)

    if comm.rank == 0:
        print('==========================================')

    return trainer
def main():
    """Train ResNet50 or PyramidNet on CIFAR-10/100 with optional mixup
    and/or random-erasing data augmentation.
    """
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--seed', '-s', type=int, default=0,
                        help='seed for random values')
    parser.add_argument('--dataset', '-d', default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate', '-l', type=float, default=0.1,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--aug_method', '-a', default='both',
                        choices=['none', 'mixup', 'random_erasing', 'both'],
                        help='data augmentation strategy')
    parser.add_argument('--model', '-m', default='pyramid',
                        choices=['resnet50', 'pyramid'],
                        help='data augmentation strategy')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print(args)
    print('')

    set_random_seed(args.seed)

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
        # for mean-teacher experiment
        #train = train[:-10000]
        #print(len(train))
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    if args.model == 'resnet50':
        predictor = ResNet(None)
        # Replace the final layer to match the number of classes.
        predictor.fc6 = L.Linear(2048, class_labels)
    elif args.model == 'pyramid':
        predictor = shaked_pyramid_net.PyramidNet(skip=True)

    # NOTE(review): this branching duplicates the augmentation branching on
    # the training data further below — keep the two in sync.
    # Mixup produces soft labels, so it needs soft-label loss/accuracy.
    if args.aug_method in ('both', 'mixup'):
        lossfun = soft_label_classification_loss
        accfun = soft_label_classification_acc
    else:
        lossfun = F.softmax_cross_entropy
        accfun = F.accuracy

    model = L.Classifier(predictor, lossfun=lossfun, accfun=accfun)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # augment train data
    if args.aug_method == 'none':
        print('data augmentationなしです')
        train = dataset.SingleCifar10((train, None))
    elif args.aug_method in ('both', 'mixup'):
        # 'both' adds random erasing on top of mixup.
        use_random_erasing = args.aug_method == 'both'
        train = dataset.PairwiseCifar10((train, None))
        train = chainer.datasets.transform_dataset.TransformDataset(
            train,
            transformer.MixupTransform(
                use_random_erasing=use_random_erasing))
    elif args.aug_method == 'random_erasing':
        train = dataset.SingleCifar10((train, None))
        train = chainer.datasets.transform_dataset.TransformDataset(
            train, transformer.RandomErasingTransform())

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    eval_trigger = (1, 'epoch')
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu),
                   trigger=eval_trigger)

    # Drop the learning rate by 10x at 50% and 75% of the total epochs.
    lr_drop_epoch = [int(args.epoch * 0.5), int(args.epoch * 0.75)]
    lr_drop_ratio = 0.1
    print(f'lr schedule: {lr_drop_ratio}, timing: {lr_drop_epoch}')

    def lr_drop(trainer):
        trainer.updater.get_optimizer('main').lr *= lr_drop_ratio

    trainer.extend(
        lr_drop,
        trigger=chainer.training.triggers.ManualScheduleTrigger(
            lr_drop_epoch, 'epoch'))
    trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Take a snapshot once, at the final epoch.
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'lr', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # interact with chainerui
    trainer.extend(CommandsExtension(), trigger=(100, 'iteration'))
    # save args
    save_args(args, args.out)

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
def main():
    """Train a CIFAR-10 classifier with the mean-teacher method.

    A student ``model`` is optimized directly while ``ema_model`` tracks an
    exponential moving average of its weights inside ``MeanTeacherUpdater``.
    """
    args = parse_args()

    model = archs[args.arch]()
    ema_model = archs[args.arch]()
    if args.gpu >= 0:
        # Make the GPU current before copying both networks onto it.
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
        ema_model.to_gpu()

    train, val = chainer.datasets.get_cifar10()
    # Unlabeled split fed to the teacher for the consistency loss.
    _, test = chainer.datasets.get_cifar10(withlabel=False)
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.batchsize, repeat=False, n_processes=args.loaderjob)
    ema_iter = chainer.iterators.MultiprocessIterator(
        test, args.batchsize, repeat=False, n_processes=args.loaderjob)

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Select the consistency loss.  BUG FIX: fail fast on an unknown choice
    # instead of crashing later with a NameError on ``consistency_lossfun``.
    if args.consistency_type == 'mse':
        consistency_lossfun = softmax_mse_loss
    elif args.consistency_type == 'kl':
        consistency_lossfun = softmax_kl_loss
    else:
        raise ValueError(
            'unknown consistency type: {}'.format(args.consistency_type))

    updater = MeanTeacherUpdater(
        train_iter, ema_iter, optimizer, ema_model,
        ema_decay=args.ema_decay,
        distance_cost=args.distance_cost,
        consistency=args.consistency,
        consistency_lossfun=consistency_lossfun,
        device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = (1 if args.test else 1000), 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Train an FCN8s-based mirror segmentation + depth estimation model.

    Writes logs, config text files, and best-model snapshots under a
    timestamped directory in ``logs/``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-g', '--gpu', default=0, type=int, help='GPU id')
    parser.add_argument('-d', '--dataset', type=str, required=True,
                        help='Dataset class name')
    parser.add_argument('-m', '--model', type=str, required=True,
                        help='Model class name')
    parser.add_argument('-b', '--batch_size', type=int, required=True,
                        help='Batch size')
    args = parser.parse_args()
    gpu = args.gpu

    # 0. config
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    out = timestamp
    out = osp.join(osp.dirname(here), 'logs', out)

    max_iter_epoch = 100, 'epoch'
    progress_bar_update_interval = 10  # iteration
    print_interval = 100, 'iteration'
    log_interval = 100, 'iteration'
    test_interval = 5, 'epoch'
    save_interval = 5, 'epoch'

    # 1. dataset
    if args.dataset == 'Mirror3DAnnotatedDataset':
        dataset_train = Mirror3DAnnotatedDataset(split='train', aug=True)
        dataset_valid = Mirror3DAnnotatedDataset(split='test', aug=False)
    else:
        # BUG FIX: raise instead of print()+exit(1); ``exit`` is the
        # site-module helper meant for interactive use, and the rest of
        # this file raises RuntimeError for invalid dataset choices.
        raise RuntimeError('Invalid dataset class: %s' % args.dataset)

    dataset_train_transformed = TransformDataset(dataset_train, transform)
    dataset_valid_transformed = TransformDataset(dataset_valid, transform)
    iter_train = chainer.iterators.MultiprocessIterator(
        dataset_train_transformed, batch_size=args.batch_size,
        shared_mem=10 ** 8)
    iter_valid = chainer.iterators.MultiprocessIterator(
        dataset_valid_transformed, batch_size=1, shared_mem=10 ** 8,
        repeat=False, shuffle=False)

    # 2. model: start from pre-trained VGG16 weights.
    vgg = fcn.models.VGG16()
    vgg_path = vgg.download()
    chainer.serializers.load_npz(vgg_path, vgg)

    n_class = len(dataset_train.class_names)
    assert n_class == 2

    if args.model == 'FCN8sMirrorSegmentationDepthEstimation':
        model = FCN8sMirrorSegmentationDepthEstimation(n_class=n_class)
    else:
        raise RuntimeError('Invalid model class: %s' % args.model)
    model.init_from_vgg16(vgg)

    if gpu >= 0:
        cuda.get_device_from_id(gpu).use()
        model.to_gpu()

    # 3. optimizer
    optimizer = chainer.optimizers.Adam(alpha=1.0e-5)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))

    updater = chainer.training.updater.StandardUpdater(
        iter_train, optimizer, device=gpu)

    trainer = chainer.training.Trainer(updater, max_iter_epoch, out=out)
    # Adam exposes its learning rate as "alpha".
    trainer.extend(extensions.ExponentialShift("alpha", 0.99995))

    # Record the run configuration next to the logs for reproducibility.
    if not osp.isdir(out):
        os.makedirs(out)
    with open(osp.join(out, 'dataset.txt'), 'w') as f:
        f.write(dataset_train.__class__.__name__)
    with open(osp.join(out, 'model.txt'), 'w') as f:
        f.write(model.__class__.__name__)
    with open(osp.join(out, 'n_class.txt'), 'w') as f:
        f.write(str(n_class))
    with open(osp.join(out, 'batch_size.txt'), 'w') as f:
        f.write(str(args.batch_size))

    # Keep the snapshots that maximize validation mIoU / depth accuracy.
    trainer.extend(extensions.snapshot_object(
        model, savefun=chainer.serializers.save_npz,
        filename='max_miou.npz'),
        trigger=chainer.training.triggers.MaxValueTrigger(
            'validation/main/miou', save_interval))
    trainer.extend(extensions.snapshot_object(
        model, savefun=chainer.serializers.save_npz,
        filename='max_depth_acc.npz'),
        trigger=chainer.training.triggers.MaxValueTrigger(
            'validation/main/depth_acc<0.10', save_interval))
    trainer.extend(
        extensions.dump_graph(root_name='main/loss', out_name='graph.dot'))
    trainer.extend(
        extensions.LogReport(log_name='log.json', trigger=log_interval))
    trainer.extend(chainer.training.extensions.PrintReport([
        'iteration', 'epoch', 'elapsed_time', 'lr',
        'main/loss', 'main/seg_loss', 'main/reg_loss', 'main/miou',
        'main/depth_acc<0.03', 'main/depth_acc<0.10', 'main/depth_acc<0.30',
        'validation/main/miou', 'validation/main/depth_acc<0.03',
        'validation/main/depth_acc<0.10', 'validation/main/depth_acc<0.30',
    ]), trigger=print_interval)
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(
        extensions.ProgressBar(update_interval=progress_bar_update_interval))
    trainer.extend(extensions.Evaluator(iter_valid, model, device=gpu),
                   trigger=test_interval)

    trainer.run()
def main():
    """Train ColumnNet on the image lists in train_list.txt / val_list.txt."""
    import os

    parser = argparse.ArgumentParser(description='ColumnNet')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=200,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train
    Model = ColumnNet()
    model = L.Classifier(Model)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the ColumnNet dataset (``with`` closes the list files reliably).
    with open('train_list.txt') as f:
        train_lines = f.readlines()
    with open('val_list.txt') as f:
        val_lines = f.readlines()
    train = load_dataset(train_lines)
    val = load_dataset(val_lines)

    train_iter = iterators.MultiprocessIterator(train, args.batchsize)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, shuffle=False)

    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    # BUG FIX: honor --out instead of the hard-coded 'result' directory
    # (the default is still 'result', so existing invocations are unchanged).
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.snapshot(), trigger=(1, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'validation/main/map',
        'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.PlotReport(
        ['main/loss', 'validation/main/map'],
        x_key='epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(
        ['main/accuracy', 'validation/main/map'],
        x_key='epoch', file_name='accuracy.png'))
    trainer.extend(extensions.dump_graph('main/loss'))

    # Run the training
    trainer.run()

    # BUG FIX: save the trained predictor under --out, not a hard-coded path.
    chainer.serializers.save_npz(
        os.path.join(args.out, 'columnnet.model'), Model)
def main():
    """Train an ImageNet convnet on several GPUs with data parallelism."""
    arch_table = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=arch_table.keys(),
                        default='nin', help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--gpus', '-g', type=int, nargs="*",
                        default=[0, 1, 2, 3])
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Model, optionally warm-started from a saved snapshot.
    model = arch_table[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)

    # Datasets share the mean image.  The training set is split into one
    # shard per GPU; each shard gets its own multiprocess iterator.
    mean = np.load(args.mean)
    train = train_imagenet.PreprocessedDataset(
        args.train, args.root, mean, model.insize)
    val = train_imagenet.PreprocessedDataset(
        args.val, args.root, mean, model.insize, False)

    gpu_ids = tuple(args.gpus)
    train_iters = [
        chainer.iterators.MultiprocessIterator(
            shard, args.batchsize, n_processes=args.loaderjob)
        for shard in chainer.datasets.split_dataset_n_random(
            train, len(gpu_ids))
    ]
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Trainer with one updater process per GPU.
    updater = updaters.MultiprocessParallelUpdater(
        train_iters, optimizer, devices=gpu_ids)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'

    trainer.extend(
        extensions.Evaluator(val_iter, model, device=args.gpus[0]),
        trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=2))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Train an ImageNet convnet, optionally feeding data through NVIDIA DALI.

    The training device is selected with ``--device`` (ChainerX specifier or
    integer); ``--gpu`` is kept as a deprecated alias.
    """
    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }
    dtypes = {
        'float16': np.float16,
        'float32': np.float32,
        'float64': np.float64,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--dtype', choices=dtypes, help='Specify the dtype '
                        'used. If not supplied, the default dtype is used')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device',
                       type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()
    device = chainer.get_device(args.device)

    # Set the dtype if supplied.
    if args.dtype is not None:
        chainer.config.dtype = args.dtype

    print('Device: {}'.format(device))
    print('Dtype: {}'.format(chainer.config.dtype))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)
    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI seems not available on your system.')
        if device.xp is not chainer.backend.cuda.cupy:
            raise RuntimeError('Using DALI requires GPU device. Please '
                               'specify it with --device option.')
        n_threads = args.loaderjob
        if n_threads is None or n_threads <= 0:
            n_threads = 1
        # DALI normalizes with per-channel mean/std on the GPU.
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(
            args.train, args.root, model.insize, args.batchsize,
            n_threads, device.device.id, True, mean=ch_mean, std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(
            args.val, args.root, model.insize, args.val_batchsize,
            n_threads, device.device.id, False, mean=ch_mean, std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        converter = dali_util.DaliConverter(mean=mean, crop_size=model.insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
        # These iterators load the images with subprocesses running in
        # parallel to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False,
            n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # BUG FIX: removed the leftover "ADDITIONAL TEST CODE" block that
    # unconditionally forced val/log intervals to (1, 'iteration') even
    # without --test, validating and logging every single iteration.
    val_interval = (100000, 'iteration')
    log_interval = (1000, 'iteration')
    if args.test:
        val_interval = (1, 'iteration')
        log_interval = (1, 'iteration')

    trainer.extend(extensions.Evaluator(val_iter, model, converter=converter,
                                        device=device), trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main(args):
    """Train a DenseNet classifier on CIFAR-10/100 with multi-GPU support.

    Args:
        args: Parsed CLI namespace carrying architecture (depth, block,
            growth_rate, drop_ratio), dataset choice, GPU ids, learning-rate
            schedule, and the output directory ``args.dir``.
    """
    # Each dense block must contain an integral number of layers.
    assert (args.depth - args.block - 1) % args.block == 0
    # BUG FIX: use floor division; ``/`` yields a float on Python 3, and the
    # assertion above guarantees the division is exact.
    n_layer = (args.depth - args.block - 1) // args.block

    if args.dataset == 'cifar10':
        train, test = cifar.get_cifar10()
        n_class = 10
    elif args.dataset == 'cifar100':
        train, test = cifar.get_cifar100()
        n_class = 100
    elif args.dataset == 'SVHN':
        raise NotImplementedError()
    else:
        # BUG FIX: an unknown dataset used to fall through and crash later
        # with a NameError on ``train``.
        raise ValueError('unknown dataset: {}'.format(args.dataset))

    # Per-pixel mean image over the training set, subtracted in preprocessing.
    mean = numpy.zeros((3, 32, 32), dtype=numpy.float32)
    for image, _ in train:
        mean += image / len(train)
    train = PreprocessedDataset(train, mean, random=True)
    test = PreprocessedDataset(test, mean)
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)
    test_iter = chainer.iterators.MultiprocessIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    model = chainer.links.Classifier(
        DenseNet(n_layer, args.growth_rate, n_class, args.drop_ratio, 16,
                 args.block))
    if args.init_model:
        serializers.load_npz(args.init_model, model)
    # Learning rate is divided by the GPU count because ParallelUpdater
    # accumulates gradients across devices.
    optimizer = chainer.optimizers.MomentumSGD(
        lr=args.lr / len(args.gpus), momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    devices = {'main': args.gpus[0]}
    # BUG FIX: was ``> 2``, which silently dropped the second GPU whenever
    # exactly two were given.
    if len(args.gpus) >= 2:
        for gid in args.gpus[1:]:
            devices['gpu%d' % gid] = gid
    updater = training.ParallelUpdater(train_iter, optimizer, devices=devices)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.dir)

    val_interval = (1, 'epoch')
    log_interval = (1, 'epoch')

    # Evaluate on a copy with the train-mode flag cleared (disables dropout).
    eval_model = model.copy()
    eval_model.train = False
    trainer.extend(extensions.Evaluator(
        test_iter, eval_model, device=args.gpus[0]), trigger=val_interval)
    trainer.extend(extensions.ExponentialShift('lr', args.lr_decay_ratio),
                   trigger=(args.lr_decay_freq, 'epoch'))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot_object(
        model, 'epoch_{.updater.epoch}.model'), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        optimizer, 'epoch_{.updater.epoch}.state'), trigger=val_interval)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    # Report wall-clock time since training start alongside the usual stats.
    start_time = time.time()
    trainer.extend(extensions.observe_value(
        'time', lambda _: time.time() - start_time), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'time', 'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr',
    ]), trigger=log_interval)
    trainer.extend(extensions.observe_value(
        'graph', lambda _: create_fig(args.dir)), trigger=(2, 'epoch'))
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.run()
def main():
    """Train SegNetBasic on CamVid and save the final model snapshot."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=12)
    parser.add_argument('--class-weight', type=str, default='class_weight.npy')
    parser.add_argument('--out', type=str, default='result')
    args = parser.parse_args()

    # Schedule: log every 50 iters, validate every 2000, stop at 16000.
    log_trigger = (50, 'iteration')
    validation_trigger = (2000, 'iteration')
    end_trigger = (16000, 'iteration')

    # Datasets and iterators.
    train_data = CamVidDataset(split='train')
    train_data = TransformDataset(train_data, transform)
    val_data = CamVidDataset(split='val')
    train_iter = iterators.MultiprocessIterator(train_data, args.batchsize)
    val_iter = iterators.MultiprocessIterator(
        val_data, args.batchsize, shuffle=False, repeat=False)

    # Pixelwise softmax classifier with class-balancing weights.
    class_weight = np.load(args.class_weight)
    net = SegNetBasic(n_class=len(camvid_label_names))
    net = PixelwiseSoftmaxClassifier(net, class_weight=class_weight)
    if args.gpu >= 0:
        # Make the chosen GPU current and move the model onto it.
        chainer.cuda.get_device_from_id(args.gpu).use()
        net.to_gpu()

    # Optimizer with weight decay.
    sgd = optimizers.MomentumSGD(lr=0.1, momentum=0.9)
    sgd.setup(net)
    sgd.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0005))

    # Trainer.
    updater = training.updaters.StandardUpdater(
        train_iter, sgd, device=args.gpu)
    trainer = training.Trainer(updater, end_trigger, out=args.out)

    trainer.extend(extensions.LogReport(trigger=log_trigger))
    trainer.extend(extensions.observe_lr(), trigger=log_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))

    if extensions.PlotReport.available():
        trainer.extend(extensions.PlotReport(
            ['main/loss'], x_key='iteration', file_name='loss.png'))
        trainer.extend(extensions.PlotReport(
            ['validation/main/miou'], x_key='iteration',
            file_name='miou.png'))

    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'elapsed_time', 'lr',
         'main/loss', 'validation/main/miou',
         'validation/main/mean_class_accuracy',
         'validation/main/pixel_accuracy']), trigger=log_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(
        SemanticSegmentationEvaluator(
            val_iter, net.predictor, camvid_label_names),
        trigger=validation_trigger)

    trainer.run()

    # Re-estimate BatchNorm statistics before exporting the trained network.
    chainer.serializers.save_npz(
        os.path.join(args.out, 'snapshot_model.npz'),
        recalculate_bn_statistics(net.predictor, 24))
def run_training(config: str, device: int, seed: int):
    """Train a BiLSTM-CRF named-entity tagger.

    Args:
        config: Path to the YAML configuration file.
        device: GPU device id, or a negative value for CPU.
        seed: Random seed applied before any model construction.
    """
    configs = ConfigParser.parse(config)
    # BUG FIX: removed ``params = yaml.load(open(config, ...))`` here — the
    # value was never read (it was overwritten by ``configs.export()`` below),
    # the file handle leaked, and ``yaml.load`` without a Loader is unsafe.
    if device >= 0:
        cuda.get_device(device).use()
    set_seed(seed, device)

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = max(vocab.dictionaries["word2idx"].values()) + 1
    num_char_vocab = max(vocab.dictionaries["char2idx"].values()) + 1
    num_tag_vocab = max(vocab.dictionaries["tag2idx"].values()) + 1

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform

    external_configs = configs["external"]
    if "word_vector" in external_configs:
        # Swap the random word embeddings for pre-trained vectors, after
        # checking that the dimensionalities agree.
        syn0 = model.embed_word.W.data
        _, word_dim = syn0.shape
        pre_word_dim = vocab.gensim_model.vector_size
        if word_dim != pre_word_dim:
            msg = "Mismatch vector size between model and pre-trained word vectors"  # NOQA
            msg += f"(model: \x1b[31m{word_dim}\x1b[0m"
            msg += f", pre-trained word vector: \x1b[31m{pre_word_dim}\x1b[0m"
            raise Exception(msg)
        word2idx = vocab.dictionaries["word2idx"]
        syn0 = prepare_pretrained_word_vector(
            word2idx, vocab.gensim_model, syn0, num_word_vocab)
        model.set_pretrained_word_vectors(syn0)

    train_iterator = create_iterator(vocab, configs, "train", transform)
    valid_iterator = create_iterator(vocab, configs, "valid", transform)
    test_iterator = create_iterator(vocab, configs, "test", transform)

    if device >= 0:
        model.to_gpu(device)

    optimizer = create_optimizer(configs)
    optimizer.setup(model)
    optimizer = add_hooks(optimizer, configs)

    updater = T.StandardUpdater(
        train_iterator, optimizer, converter=converter, device=device)

    # Exported config plus derived vocabulary sizes is what gets persisted.
    params = configs.export()
    params["num_word_vocab"] = num_word_vocab
    params["num_char_vocab"] = num_char_vocab
    params["num_tag_vocab"] = num_tag_vocab

    epoch = configs["iteration"]["epoch"]
    trigger = (epoch, "epoch")

    model_path = configs["output"]
    timestamp = datetime.datetime.now()
    timestamp_str = timestamp.isoformat()
    output_path = Path(f"{model_path}.{timestamp_str}")

    trainer = T.Trainer(updater, trigger, out=output_path)
    save_args(params, output_path)
    msg = f"Create \x1b[31m{output_path}\x1b[0m for saving model snapshots"
    logging.debug(msg)

    entries = ["epoch", "iteration", "elapsed_time", "lr", "main/loss"]
    entries += ["validation/main/loss", "validation/main/fscore"]
    entries += ["validation_1/main/loss", "validation_1/main/fscore"]

    valid_evaluator = NamedEntityEvaluator(
        valid_iterator, model, transformer.itransform, converter,
        device=device)
    test_evaluator = NamedEntityEvaluator(
        test_iterator, model, transformer.itransform, converter,
        device=device)

    epoch_trigger = (1, "epoch")
    snapshot_filename = "snapshot_epoch_{.updater.epoch:04d}"
    trainer.extend(valid_evaluator, trigger=epoch_trigger)
    trainer.extend(test_evaluator, trigger=epoch_trigger)
    trainer.extend(E.observe_lr(), trigger=epoch_trigger)
    trainer.extend(E.LogReport(trigger=epoch_trigger))
    trainer.extend(E.PrintReport(entries=entries), trigger=epoch_trigger)
    trainer.extend(E.ProgressBar(update_interval=20))
    trainer.extend(E.snapshot_object(model, filename=snapshot_filename),
                   trigger=(1, "epoch"))

    if "learning_rate_decay" in params:
        logger.debug("Enable Learning Rate decay")
        trainer.extend(
            LearningRateDecay("lr", params["learning_rate"],
                              params["learning_rate_decay"]),
            trigger=epoch_trigger,
        )

    trainer.run()
def main():
    """Distributed K-FAC training of an ImageNet convnet via ChainerMN/MPI."""
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }
    parser = argparse.ArgumentParser()
    parser.add_argument('train')
    parser.add_argument('val')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin')
    parser.add_argument('--batchsize', '-B', type=int, default=32)
    parser.add_argument('--epoch', '-E', type=int, default=2)
    parser.add_argument('--initmodel')
    parser.add_argument('--loaderjob', '-j', type=int)
    parser.add_argument('--mean', '-m', default='mean.npy')
    parser.add_argument('--resume', '-r', default='')
    parser.add_argument('--out', '-o', default='result')
    parser.add_argument('--train-root', default='.')
    parser.add_argument('--val-root', default='.')
    parser.add_argument('--val-batchsize', '-b', type=int, default=250)
    parser.add_argument('--communicator', default='hierarchical')
    parser.add_argument('--loadtype', default='original')
    parser.add_argument('--iterator', default='process')
    parser.add_argument('--optimizer', default='rmsprop_warmup')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--cov_ema_decay', type=float, default=0.99)
    parser.add_argument('--inv_freq', type=int, default=20)
    parser.add_argument('--damping', type=float, default=0.001)
    parser.add_argument('--cov-batchsize', type=int, default=16)
    parser.add_argument('--n-cov-workers', type=int, default=1)
    parser.add_argument('--n-inv-workers', type=int, default=1)
    parser.add_argument('--join-cov', action='store_true')
    parser.add_argument('--npergroup', type=int, default=1)
    parser.add_argument('--weight-decay', type=float, default=0.00022)
    parser.set_defaults(test=False)
    args = parser.parse_args()

    comm = dlframeworks.chainer.communicators.create_communicator(debug=True)
    device = comm.intra_rank  # GPU is related with intra rank
    chainer.cuda.get_device_from_id(device).use()

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    # Initialize weights by running one dummy forward pass on the CPU.
    x = np.zeros((1, 3, model.insize, model.insize), dtype=np.float32)
    t = np.zeros((1, ), dtype=np.int32)
    model(x, t)

    try:
        model.to_gpu()
    except chainer.cuda.cupy.cuda.runtime.CUDARuntimeError:
        # BUG FIX: use bare ``raise`` so the original traceback is re-raised
        # untouched (``raise e`` appended a spurious frame).
        print('Error occured in {}'.format(comm.rank), file=sys.stderr)
        raise

    if comm.mpi_comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.mpi_comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')
    comm.mpi_comm.Barrier()

    # ======== Create optimizer ========
    optimizer = dlframeworks.chainer.optimizers.KFAC(
        comm,
        lr=args.lr,
        momentum=args.momentum,
        cov_ema_decay=args.cov_ema_decay,
        inv_freq=args.inv_freq,
        damping=args.damping,
    )  # damping ~ 0.035 is good
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    batchsize = args.batchsize

    # Load all dataset in memory
    dataset_class = dlframeworks.chainer.datasets.CroppingDatasetIO
    mean = np.load(args.mean)

    # ======== Create dataset ========
    # Rank 0 reads the pair lists; scatter distributes shards to all ranks.
    if comm.rank == 0:
        train = dlframeworks.chainer.datasets.read_pairs(args.train)
        val = dlframeworks.chainer.datasets.read_pairs(args.val)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)
    train_dataset = dataset_class(
        train, args.train_root, mean, model.insize, model.insize)
    val_dataset = dataset_class(
        val, args.val_root, mean, model.insize, model.insize)

    # ======== Create iterator ========
    if args.iterator == 'process':
        multiprocessing.set_start_method('forkserver')
        train_iterator = chainer.iterators.MultiprocessIterator(
            train_dataset, batchsize, n_processes=args.loaderjob)
        val_iterator = chainer.iterators.MultiprocessIterator(
            val_dataset, args.val_batchsize, n_processes=args.loaderjob,
            repeat=False)
    elif args.iterator == 'thread':
        train_iterator = chainer.iterators.MultithreadIterator(
            train_dataset, batchsize, n_threads=args.loaderjob)
        val_iterator = chainer.iterators.MultithreadIterator(
            val_dataset, args.val_batchsize, n_threads=args.loaderjob,
            repeat=False)
    else:
        train_iterator = chainer.iterators.SerialIterator(
            train_dataset, batchsize)
        val_iterator = chainer.iterators.SerialIterator(
            val_dataset, args.val_batchsize, repeat=False, shuffle=False)

    # ======== Create updater ========
    updater = training.StandardUpdater(
        train_iterator, optimizer, device=device)

    # ======== Create trainer ========
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # ======== Extend trainer ========
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(observe_hyperparam('momentum'), trigger=log_interval)
        trainer.extend(observe_hyperparam('cov_ema_decay'),
                       trigger=log_interval)
        trainer.extend(observe_hyperparam('inv_freq'), trigger=log_interval)
        trainer.extend(observe_hyperparam('damping'), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
# Drive the CIFAR-10 training loop for 80 epochs, writing results under
# ``cifar10_result``.
trainer = training.Trainer(
    updater, stop_trigger=(80, 'epoch'), out='cifar10_result')

# Collect loss/accuracy/lr statistics once every 100 iterations.
trainer.extend(extensions.LogReport(
    keys=["main/loss", "validation/main/accuracy", "lr"],
    trigger=training.triggers.IntervalTrigger(100, 'iteration')))

# Shrink the learning rate to one third every 20 epochs.  Adam exposes its
# learning rate as "alpha" (the symbol from the original paper), so that is
# the hyperparameter shifted here rather than "lr".
trainer.extend(
    extensions.ExponentialShift('alpha', 1 / 3),
    trigger=training.triggers.IntervalTrigger(20, 'epoch'))

# Run validation on the test iterator every 3 epochs.
trainer.extend(
    extensions.Evaluator(test_iter, model, device=device_id),
    trigger=training.triggers.IntervalTrigger(3, 'epoch'))

# Log the current learning rate on the same 100-iteration cadence.
trainer.extend(
    extensions.observe_lr(),
    trigger=training.triggers.IntervalTrigger(100, 'iteration'))

# Print the collected statistics once every 100 iterations.
trainer.extend(extensions.PrintReport([
    'epoch', 'main/loss', 'main/accuracy', 'validation/main/accuracy',
    'elapsed_time', 'lr'
]), trigger=training.triggers.IntervalTrigger(100, 'iteration'))

# Train the model.
trainer.run()
def handler(context):
    """Training entry point for SegNet semantic segmentation.

    Builds the dataset/iterator/model/optimizer/trainer pipeline and runs
    training for ``nb_iterations`` iterations (module-level constant).

    Args:
        context: Platform-provided object; only ``context.datasets`` (a
            mapping with ``'train'`` and ``'val'`` dataset ids) is read here.
    """
    # Triggers controlling logging cadence, validation cadence and length.
    log_trigger = (50, 'iteration')
    validation_trigger = (2000, 'iteration')
    end_trigger = (nb_iterations, 'iteration')

    # Datasets are fetched through the platform API by id.
    dataset_alias = context.datasets
    train_dataset_id = dataset_alias['train']
    val_dataset_id = dataset_alias['val']
    train = SegmentationDatasetFromAPI(train_dataset_id)
    val = SegmentationDatasetFromAPI(val_dataset_id)
    # Per-class weights fed to the loss (computed from the training split).
    class_weight = calc_weight(train)

    print(class_weight)

    # `transform` is a module-level callable applied to the training split
    # only — presumably preprocessing/augmentation; TODO confirm.
    train = TransformDataset(train, transform)

    # Iterator
    train_iter = iterators.SerialIterator(train, BATCHSIZE)
    val_iter = iterators.SerialIterator(
        val, BATCHSIZE, shuffle=False, repeat=False)

    # Model: SegNet wrapped so the loss is a class-weighted pixelwise softmax.
    model = SegNetBasic(n_class=len(camvid_label_names))
    model = PixelwiseSoftmaxClassifier(model, class_weight=class_weight)

    if USE_GPU >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(USE_GPU).use()
        model.to_gpu()  # Copy the model to the GPU

    # Optimizer: MomentumSGD with weight decay.
    optimizer = optimizers.MomentumSGD(lr=0.1, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0005))

    # Updater
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=USE_GPU)

    # Trainer
    trainer = training.Trainer(
        updater, end_trigger, out=ABEJA_TRAINING_RESULT_DIR)

    trainer.extend(extensions.LogReport(trigger=log_trigger))
    trainer.extend(extensions.observe_lr(), trigger=log_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))
    # Snapshot only the underlying predictor (not the loss wrapper), once at
    # the end of training.
    trainer.extend(extensions.snapshot_object(
        model.predictor, filename='model_iteration-{.updater.iteration}'),
        trigger=end_trigger)

    print_entries = [
        'iteration', 'main/loss', 'validation/main/miou',
        'validation/main/mean_class_accuracy',
        'validation/main/pixel_accuracy'
    ]

    report_entries = [
        'epoch', 'iteration', 'lr', 'main/loss', 'validation/main/miou',
        'validation/main/mean_class_accuracy',
        'validation/main/pixel_accuracy'
    ]

    # `Statistics` and `Tensorboard` are project-local extensions;
    # NOTE(review): `log_path` is not defined in this function — presumably
    # a module-level constant; confirm it exists at call time.
    trainer.extend(Statistics(report_entries, nb_iterations,
                              obs_key='iteration'), trigger=log_trigger)
    trainer.extend(Tensorboard(report_entries, out_dir=log_path))
    trainer.extend(extensions.PrintReport(print_entries), trigger=log_trigger)
    trainer.extend(SemanticSegmentationEvaluator(
        val_iter, model.predictor, camvid_label_names),
        trigger=validation_trigger)
    trainer.run()
def train_one_epoch(model, train_data, lr, gpu, batchsize, out):
    """Train `model` for exactly one epoch and snapshot it afterwards.

    Wraps the model in a pixelwise softmax classifier, optimizes it with
    MomentumSGD (weight decay 1e-4) on `train_data`, evaluates on the VOC
    validation split once per epoch, and writes results under `out`.
    """
    classifier = PixelwiseSoftmaxClassifier(model)
    if gpu >= 0:
        # Switch to the requested GPU and move the training wrapper onto it.
        chainer.cuda.get_device_from_id(gpu).use()
        classifier.to_gpu()

    # Trigger intervals: log ten times per epoch; validate and stop at one.
    log_trigger = (0.1, 'epoch')
    validation_trigger = (1, 'epoch')
    end_trigger = (1, 'epoch')

    train_data = TransformDataset(
        train_data, ('img', 'label_map'), SimpleDoesItTransform(model.mean))
    val_data = VOCSemanticSegmentationWithBboxDataset(
        split='val').slice[:, ['img', 'label_map']]

    # Iterators (validation runs sequentially with batch size 1).
    train_iterator = iterators.MultiprocessIterator(train_data, batchsize)
    val_iterator = iterators.MultiprocessIterator(
        val_data, 1, shuffle=False, repeat=False, shared_mem=100000000)

    # Optimizer
    sgd = optimizers.MomentumSGD(lr=lr, momentum=0.9)
    sgd.setup(classifier)
    sgd.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0001))

    updater = training.updaters.StandardUpdater(
        train_iterator, sgd, device=gpu)
    trainer = training.Trainer(updater, end_trigger, out=out)

    # Reporting extensions.
    trainer.extend(extensions.LogReport(trigger=log_trigger))
    trainer.extend(extensions.observe_lr(), trigger=log_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))

    if extensions.PlotReport.available():
        for keys, fname in ((['main/loss'], 'loss.png'),
                            (['validation/main/miou'], 'miou.png')):
            trainer.extend(extensions.PlotReport(
                keys, x_key='iteration', file_name=fname))

    # Snapshot the bare model (not the classifier wrapper) at epoch end.
    trainer.extend(
        extensions.snapshot_object(model, filename='snapshot.npy'),
        trigger=end_trigger)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'elapsed_time', 'lr', 'main/loss',
         'validation/main/miou', 'validation/main/mean_class_accuracy',
         'validation/main/pixel_accuracy']), trigger=log_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(SemanticSegmentationEvaluator(
        val_iterator, model, voc_semantic_segmentation_label_names),
        trigger=validation_trigger)

    trainer.run()
def main():
    """Train an SSD sheep detector, optionally across multiple GPUs.

    Builds the dataset/iterator stack, an Adam optimizer with per-parameter
    hooks, a (multi-GPU) updater and a trainer with logging, snapshot and
    bbox-visualization extensions, then runs training for 200 epochs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', help="path to train json file")
    parser.add_argument('test_dataset', help="path to test dataset json file")
    parser.add_argument(
        '--dataset-root',
        help=
        "path to dataset root if dataset file is not already in root folder of dataset"
    )
    parser.add_argument('--model', choices=('ssd300', 'ssd512'),
                        default='ssd512')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--gpu', type=int, nargs='*', default=[])
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    parser.add_argument('--lr', type=float, default=0.001,
                        help="default learning rate")
    parser.add_argument('--port', type=int, default=1337,
                        help="port for bbox sending")
    parser.add_argument('--ip', default='127.0.0.1',
                        help="destination ip for bbox sending")
    parser.add_argument(
        '--test-image',
        help="path to test image that shall be displayed in bbox vis")
    args = parser.parse_args()

    if args.dataset_root is None:
        args.dataset_root = os.path.dirname(args.dataset)

    # Pick the SSD variant; both detect a single foreground class (sheep).
    if args.model == 'ssd300':
        model = SSD300(n_fg_class=1, pretrained_model='imagenet')
        image_size = (300, 300)
    elif args.model == 'ssd512':
        model = SSD512(n_fg_class=1, pretrained_model='imagenet')
        image_size = (512, 512)
    else:
        raise NotImplementedError("The model you want to train does not exist")

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    train = TransformDataset(
        SheepDataset(args.dataset_root, args.dataset, image_size=image_size),
        Transform(model.coder, model.insize, model.mean))

    if len(args.gpu) > 1:
        # MultiprocessParallelUpdater requires equally sized sub-datasets,
        # so trim the last split down to the size of the first one.
        gpu_datasets = split_dataset_n_random(train, len(args.gpu))
        if not len(gpu_datasets[0]) == len(gpu_datasets[-1]):
            adapted_second_split = split_dataset(gpu_datasets[-1],
                                                 len(gpu_datasets[0]))[0]
            gpu_datasets[-1] = adapted_second_split
    else:
        gpu_datasets = [train]

    train_iter = [
        ThreadIterator(gpu_dataset, args.batchsize)
        for gpu_dataset in gpu_datasets
    ]

    test = SheepDataset(args.dataset_root, args.test_dataset,
                        image_size=image_size)
    test_iter = chainer.iterators.MultithreadIterator(
        test, args.batchsize, repeat=False, shuffle=False, n_threads=2)

    # Adam with the CLI-provided initial learning rate; bias parameters get
    # gradient scaling, all other parameters get weight decay.
    optimizer = chainer.optimizers.Adam(alpha=args.lr)
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    if len(args.gpu) <= 1:
        updater = training.updaters.StandardUpdater(
            train_iter[0],
            optimizer,
            device=args.gpu[0] if len(args.gpu) > 0 else -1,
        )
    else:
        updater = training.updaters.MultiprocessParallelUpdater(
            train_iter, optimizer, devices=args.gpu)
        updater.setup_workers()

    if len(args.gpu) > 0 and args.gpu[0] >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu[0]).use()
        model.to_gpu()

    trainer = training.Trainer(updater, (200, 'epoch'), args.out)
    trainer.extend(DetectionVOCEvaluator(test_iter, model, use_07_metric=True,
                                         label_names=voc_bbox_label_names),
                   trigger=(1000, 'iteration'))

    # build logger
    # make sure to log all data necessary for prediction
    log_interval = 100, 'iteration'
    data_to_log = {
        'image_size': image_size,
        'model_type': args.model,
    }
    # add all command line arguments
    for argument in filter(lambda x: not x.startswith('_'), dir(args)):
        data_to_log[argument] = getattr(args, argument)

    # create callback that logs all auxiliary data the first time things get
    # logged
    def backup_train_config(stats_cpu):
        # BUGFIX: `stats_cpu['iteration']` is an int while `log_interval` is
        # a (count, unit) tuple, so the old `== log_interval` comparison was
        # never true and the auxiliary data was never written. Compare
        # against the interval count instead.
        if stats_cpu['iteration'] == log_interval[0]:
            stats_cpu.update(data_to_log)

    trainer.extend(
        extensions.LogReport(trigger=log_interval,
                             postprocess=backup_train_config))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
        'main/loss/conf', 'validation/main/map'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
        trigger=(5000, 'iteration'))

    # Pick the image the bbox plotter displays: an explicit test image, or
    # the first training example (de-normalized by adding the mean back).
    if args.test_image is not None:
        plot_image = train._dataset.load_image(args.test_image,
                                               resize_to=image_size)
    else:
        plot_image, _, _ = train.get_example(0)
        plot_image += train._transform.mean

    bbox_plotter = BBOXPlotter(
        plot_image,
        os.path.join(args.out, 'bboxes'),
        send_bboxes=True,
        upstream_port=args.port,
        upstream_ip=args.ip,
    )
    trainer.extend(bbox_plotter, trigger=(10, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Distributed (ChainerMN) Faster R-CNN training on EPIC-Kitchens."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', type=int, default=1)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    # One MPI process per GPU; intra_rank selects the process-local GPU.
    comm = chainermn.create_communicator()
    device = comm.intra_rank

    faster_rcnn = FasterRCNNVGG16(
        n_fg_class=len(epic_kitchens_bbox_label_names),
        pretrained_model='imagenet')
    faster_rcnn.use_preset('evaluate')
    model = FasterRCNNTrainChain(faster_rcnn)
    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu()

    train = EpicKitchensBboxDataset(year='2018', split='train')
    # Only rank 0 builds the full index list; scatter_dataset shuffles it
    # and hands each worker its shard, which is then used to slice the
    # (transformed) dataset.
    if comm.rank == 0:
        indices = np.arange(len(train))
    else:
        indices = None
    train = TransformDataset(train, ('img', 'bbox', 'label', 'scale'),
                             Transform(faster_rcnn))
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]
    train_iter = chainer.iterators.SerialIterator(
        train, batch_size=args.batchsize)

    # Multi-node MomentumSGD with weight decay.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0005))

    updater = training.updaters.StandardUpdater(train_iter, optimizer,
                                                device=device)
    trainer = training.Trainer(updater, (18, 'epoch'), args.out)

    # Step-decay schedule: lr starts at args.lr, x0.1 at epochs 12 and 15.
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=args.lr),
                   trigger=triggers.ManualScheduleTrigger([12, 15], 'epoch'))

    # Reporting/snapshotting only on rank 0 to avoid duplicated output.
    if comm.rank == 0:
        log_interval = 10, 'iteration'
        trainer.extend(
            extensions.LogReport(log_name='log.json', trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'main/roi_loc_loss', 'main/roi_cls_loss', 'main/rpn_loc_loss',
            'main/rpn_cls_loss'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=1))
        # Snapshot the raw detector (not the train chain) once per epoch.
        trainer.extend(extensions.snapshot_object(
            model.faster_rcnn, 'model_iter_{.updater.iteration}.npz'),
            trigger=(1, 'epoch'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Train a CenterNet-style detector, optionally on the full dataset.

    Parses CLI args via the project helper ``parse_args``, builds iterators
    over the prepared dataset, trains with NesterovAG (weight decay +
    gradient clipping), and drops the learning rate at 50% and 75% of the
    scheduled epochs. In ``--full_data`` mode validation is skipped.
    """
    args = parse_args()
    dump_args(args)

    # prepare dataset
    train, val, val_raw = prepare_dataset(full_data=args.full_data)
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, shared_mem=4000000)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.batchsize, repeat=False, shuffle=False, shared_mem=4000000)
    eval_iter = chainer.iterators.MultiprocessIterator(
        val_raw, 4, repeat=False, shuffle=False, shared_mem=4000000)

    # setup model
    if args.model == 'unet':
        model = UnetCenterNet()
    elif args.model == 'res18unet':
        model = Res18UnetCenterNet()
    else:
        # BUGFIX: an unrecognized --model previously fell through and
        # crashed later with UnboundLocalError on `model`; fail fast with a
        # clear message instead.
        raise ValueError('unknown model: {}'.format(args.model))
    training_model = TrainingModel(model)
    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        training_model.to_gpu()

    # setup optimizer
    optimizer = chainer.optimizers.NesterovAG(lr=1e-3)
    optimizer.setup(training_model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-5))
    optimizer.add_hook(chainer.optimizer.GradientClipping(100.))

    # setup trainer
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu, converter=converter)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # set trainer extensions
    # Validation only makes sense when a held-out split exists (i.e. not in
    # full-data mode).
    if not args.full_data:
        trainer.extend(
            extensions.Evaluator(val_iter, training_model, device=args.gpu,
                                 converter=converter))
        trainer.extend(DetectionMapEvaluator(eval_iter, model))
    trainer.extend(extensions.snapshot_object(
        model, 'model_{.updater.epoch}.npz'), trigger=(10, 'epoch'))
    trainer.extend(extensions.snapshot(), trigger=(10, 'epoch'))
    trainer.extend(extensions.LogReport())
    if args.full_data:
        trainer.extend(extensions.PrintReport(['epoch', 'main/loss']))
    else:
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'eval/main/map'
            ]))
    trainer.extend(extensions.ProgressBar(update_interval=10))

    # learning rate scheduling: multiply by 0.1 at 50% and 75% of training
    lr_drop_epochs = [int(args.epoch * 0.5), int(args.epoch * 0.75)]
    lr_drop_trigger = triggers.ManualScheduleTrigger(lr_drop_epochs, 'epoch')
    trainer.extend(LearningRateDrop(0.1), trigger=lr_drop_trigger)
    trainer.extend(extensions.observe_lr())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # start training
    trainer.run()
def main():
    """Single-process ImageNet training with optional chainer-compiler and
    NVIDIA DALI input pipelines; prints msec/iter and images/sec at the end.
    """
    # Architectures selectable via --arch.
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--iterations', '-I', type=int, default=0,
                        help='Number of iterations to train')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    # NOTE: --gpu writes into args.device (dest='device'), so there is no
    # `args.gpu` attribute on the parsed namespace.
    group.add_argument('--gpu', '-g', dest='device', type=int, nargs='?',
                       const=0,
                       help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--compile', action='store_true',
                        help='Compile the model')
    parser.add_argument('--dump_onnx', action='store_true',
                        help='Dump ONNX model after optimization')
    args = parser.parse_args()

    chainer.config.autotune = True
    chainer.config.cudnn_fast_batch_normalization = True

    device = chainer.get_device(args.device)

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(args.batchsize))
    if args.iterations:
        print('# iterations: {}'.format(args.iterations))
    else:
        print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    insize = model.insize
    if args.compile:
        # Optional chainer-compiler path (ONNX-based graph compilation).
        model = chainer_compiler.compile(model, dump_onnx=args.dump_onnx)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)
    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI seems not available on your system.')
        num_threads = args.loaderjob
        if num_threads is None or num_threads <= 0:
            num_threads = 1
        # Per-channel mean/std used for DALI-side normalization.
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
        # Setup DALI pipelines
        # NOTE(review): these calls read `args.gpu`, but --gpu is declared
        # with dest='device', so `args.gpu` likely raises AttributeError —
        # confirm and fix the intended device id before using --dali.
        train_pipe = dali_util.DaliPipelineTrain(
            args.train, args.root, insize, args.batchsize, num_threads,
            args.gpu, True, mean=ch_mean, std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(
            args.val, args.root, insize, args.val_batchsize, num_threads,
            args.gpu, False, mean=ch_mean, std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        # converter = dali_converter
        converter = dali_util.DaliConverter(mean=mean, crop_size=insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, insize)
        val = PreprocessedDataset(args.val, args.root, mean, insize, False)
        # These iterators load the images with subprocesses running in
        # parallel to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False,
            n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    if args.iterations:
        stop_trigger = (args.iterations, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, args.out)

    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = ((1 if args.test else 10 if args.iterations else 1000),
                    'iteration')

    trainer.extend(extensions.Evaluator(val_iter, model, converter=converter,
                                        device=device), trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if device.xp is not chainerx:
        trainer.extend(extensions.DumpGraph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the whole training under an nvprof-visible CUDA profiling hook.
    cuda_hook = function_hooks.CUDAProfileHook()
    with cuda_hook:
        trainer.run()

    # Post-run throughput summary from consecutive log entries.
    # NOTE(review): assumes the log holds at least two entries — otherwise
    # the average below divides by zero.
    with open('%s/log' % args.out) as f:
        logs = json.load(f)
    elapsed_times = []
    for prev, cur in zip(logs, logs[1:]):
        iters = cur['iteration'] - prev['iteration']
        elapsed = cur['elapsed_time'] - prev['elapsed_time']
        elapsed_times.append(elapsed / iters)
    sec_per_iter = sum(elapsed_times) / len(elapsed_times)
    print(sec_per_iter * 1000, 'msec/iter')
    print(args.batchsize / sec_per_iter, 'images/sec')
def main():
    """CosmoFlow hybrid (spatial + data) parallel training via ChainerMNX."""
    # These two lines help with memory. If they are not included training
    # runs out of memory. Keep them until the real cause of the
    # out-of-memory behavior is found.
    pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
    cp.cuda.set_allocator(pool.malloc)
    chainer.disable_experimental_feature_warning = True
    parser = argparse.ArgumentParser(
        description='CosmoFlow Multi-Node Training')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epochs', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--out', '-o', default='results',
                        help='Output directory')
    args = parser.parse_args()
    batch_size = args.batchsize
    epochs = args.epochs
    out = args.out

    # Prepare communicators: a global hybrid communicator plus node-local
    # (spatial) and cross-node (data-parallel) sub-communicators.
    comm = chainermnx.create_communicator("spatial_hybrid_nccl")
    local_comm = create_local_comm(comm)
    data_comm = create_data_comm(comm)
    device = comm.intra_rank

    # Only the data-parallel root loads the dataset; other local roots get
    # shards via scatter_dataset, and non-root local ranks build an empty
    # mirror dataset (presumably length-matched — TODO confirm).
    if local_comm.rank == 0:
        if data_comm.rank == 0:
            train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
            # train, val = datasets.split_dataset_random(
            #     training_data,
            #     first_size=(int(training_data.__len__() * 0.80)))
        else:
            train = None
            # val = None
        train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
        # val = chainermn.scatter_dataset(val, data_comm, shuffle=True)
    else:
        train = CosmoDataset("/groups2/gaa50004/cosmoflow_data")
        train = chainermn.datasets.create_empty_dataset(train)
        # val = chainermn.datasets.create_empty_dataset(val)

    train_iterator = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.MultithreadIterator(train, batch_size, n_threads=20,
                                              shuffle=True),
        local_comm)
    # vali_iterator = chainermn.iterators.create_multi_node_iterator(
    #     chainer.iterators.MultithreadIterator(
    #         val, batch_size, repeat=False, shuffle=False, n_threads=20),
    #     local_comm)

    model = CosmoFlow(local_comm)
    # model = L.Classifier(model, lossfun=F.mean_squared_error,
    #                      accfun=F.mean_squared_error)
    # print("Model Created successfully")
    # NOTE(review): `ch` is presumably an alias for chainer — confirm the
    # import; the rest of this function uses `chainer` directly.
    ch.backends.cuda.get_device_from_id(device).use()
    model.to_gpu()  # Copy the model to the GPU

    # Hybrid optimizer: data-parallel allreduce across data_comm combined
    # with spatial parallelism within local_comm.
    optimizer = chainermnx.create_hybrid_multi_node_optimizer_alpha(
        chainer.optimizers.Adam(), data_comm, local_comm)
    optimizer.setup(model)

    # Create the updater, using the optimizer
    updater = training.StandardUpdater(train_iterator, optimizer,
                                       device=device)

    # Set up a trainer
    trainer = training.Trainer(updater, (epochs, 'epoch'), out=out)
    # trainer.extend(extensions.Evaluator(vali_iterator, model,
    #                                     device=device))

    # Timestamped log file name, one log per run.
    filename = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".log"
    log_interval = (1, 'epoch')
    # Reporting extensions only on the global root rank.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(
            extensions.LogReport(trigger=log_interval, filename=filename))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss',
                'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', filename='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'], 'epoch',
                filename='accuracy.png'))
        trainer.extend(extensions.ProgressBar(update_interval=1))
    print("Starting Training ")
    trainer.run()
def main():
    """Distributed ImageNet classifier training with ChainerMN (GPU only)."""
    # This example needs CUDA — bail out early on CPU-only installations.
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    arch_table = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=arch_table.keys(),
                        default='nin', help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # One MPI process per GPU; intra_rank picks the process-local device.
    comm = chainermn.create_communicator(args.communicator)
    device_id = comm.intra_rank

    if comm.rank == 0:
        # Configuration banner, printed by the master process only.
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    net = arch_table[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, net)

    chainer.cuda.get_device_from_id(device_id).use()  # Make the GPU current
    net.to_gpu()

    # Rank 0 loads the whole dataset; scatter_dataset then hands every
    # worker an even share.
    mean_image = np.load(args.mean)
    if comm.rank == 0:
        train_data = PreprocessedDataset(args.train, args.root, mean_image,
                                         net.insize)
        val_data = PreprocessedDataset(args.val, args.root, mean_image,
                                       net.insize, False)
    else:
        train_data = None
        val_data = None
    train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
    val_data = chainermn.scatter_dataset(val_data, comm)

    # Forking after MPI initialization often crashes with InfiniBand, so
    # spawn loader workers via forkserver instead.
    # (c.f., https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    multiprocessing.set_start_method('forkserver')
    train_iterator = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iterator = chainer.iterators.MultiprocessIterator(
        val_data, args.val_batchsize, repeat=False,
        n_processes=args.loaderjob)

    # Wrap a plain MomentumSGD into a multi-node (allreduce) optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(net)

    updater = training.StandardUpdater(train_iterator, optimizer,
                                       device=device_id)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # In --test mode everything fires every 10 iterations; otherwise per
    # epoch.
    fast = (10, 'iteration')
    slow = (1, 'epoch')
    checkpoint_interval = fast if args.test else slow
    val_interval = fast if args.test else slow
    log_interval = fast if args.test else slow

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Validation runs on every worker; results are reduced across nodes.
    evaluator = chainermn.create_multi_node_evaluator(
        TestModeEvaluator(val_iterator, net, device=device_id), comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Display/output extensions only on the master worker — otherwise every
    # rank would emit duplicate reports.
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    """Train a pairwise-molecule binary classifier end to end.

    Parses CLI arguments, preprocesses the train/valid SMILES-pair CSVs,
    optionally augments the training set, builds the predictor/classifier,
    then runs a Chainer trainer with early stopping and a battery of
    accuracy/ROC-AUC/PRC-AUC/F1 evaluators before pickling the model.
    """
    # Parse the arguments.
    args = parse_arguments()
    # String-valued flags: anything except the literal 'False' enables them.
    augment = args.augment != 'False'
    multi_gpu = args.multi_gpu != 'False'
    if not args.label:
        raise ValueError('No target label was specified.')
    labels = args.label
    class_num = len(labels) if isinstance(labels, list) else 1

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return np.asarray(label_list, dtype=np.int32)

    # Apply a preprocessor to the dataset.
    logging.info('Preprocess train dataset and valid dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParserForPair(preprocessor,
                                  postprocess_label=postprocess_label,
                                  labels=labels,
                                  smiles_cols=['smiles_1', 'smiles_2'])
    train_dict = parser.parse(args.train_datafile,
                              return_smiles_pair_original=True)
    train = train_dict['dataset']
    train_smiles_pairs = train_dict['smiles_pair_original']
    valid_dict = parser.parse(args.valid_datafile,
                              return_smiles_pair_original=True)
    valid = valid_dict['dataset']
    valid_smiles_pairs = valid_dict['smiles_pair_original']

    if augment:
        logging.info('Utilizing data augmentation in train set')
        train = augment_dataset(train)
        if train_smiles_pairs is not None:
            train_smiles_pairs = augment_smiles_pairs(train_smiles_pairs)
    # Super nodes are appended to both splits regardless of augmentation.
    train = add_super_nodes(train, train_smiles_pairs)
    valid = add_super_nodes(valid, valid_smiles_pairs)

    num_train = train.get_datasets()[0].shape[0]
    num_valid = valid.get_datasets()[0].shape[0]
    logging.info('Train/valid split: {}/{}'.format(num_train, num_valid))

    # Set up the predictor.
    if len(args.net_hidden_dims):
        # Comma-separated list of hidden-layer widths, e.g. "128,64".
        net_hidden_dims = tuple(
            int(dim) for dim in args.net_hidden_dims.split(','))
    else:
        net_hidden_dims = ()
    fp_attention = bool(args.fp_attention)
    update_attention = bool(args.update_attention)
    weight_tying = args.weight_tying != 'False'
    attention_tying = args.attention_tying != 'False'
    fp_batch_normalization = args.fp_bn == 'True'
    layer_aggregator = args.layer_aggregator or None
    context = args.context != 'False'
    output_activation = (functions.relu
                         if args.output_activation == 'relu' else None)
    n_heads = args.n_heads
    dropout_ratio = args.dropout_ratio
    # NOTE: the callee's keyword is spelled `weight_typing`; keep it as-is.
    predictor = set_up_predictor(
        method=args.method,
        fp_hidden_dim=args.fp_hidden_dim,
        fp_out_dim=args.fp_out_dim,
        conv_layers=args.conv_layers,
        concat_hidden=args.concat_hidden,
        layer_aggregator=layer_aggregator,
        fp_dropout_rate=args.fp_dropout_rate,
        fp_batch_normalization=fp_batch_normalization,
        net_hidden_dims=net_hidden_dims,
        class_num=class_num,
        sim_method=args.sim_method,
        fp_attention=fp_attention,
        weight_typing=weight_tying,
        attention_tying=attention_tying,
        update_attention=update_attention,
        context=context,
        context_layers=args.context_layers,
        context_dropout=args.context_dropout,
        message_function=args.message_function,
        readout_function=args.readout_function,
        num_timesteps=args.num_timesteps,
        num_output_hidden_layers=args.num_output_hidden_layers,
        output_hidden_dim=args.output_hidden_dim,
        output_activation=output_activation,
        symmetric=args.symmetric,
        n_heads=n_heads,
        dropout_ratio=dropout_ratio)

    train_iter = SerialIterator(train, args.batchsize)
    valid_iter = SerialIterator(valid, args.batchsize,
                                repeat=False, shuffle=False)

    metrics_fun = {'accuracy': F.binary_accuracy}
    loss_func = F.sigmoid_cross_entropy
    classifier = Classifier(predictor, lossfun=loss_func,
                            metrics_fun=metrics_fun, device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam(alpha=args.learning_rate,
                                weight_decay_rate=args.weight_decay_rate)
    optimizer.setup(classifier)
    # add regularization
    if args.max_norm > 0:
        optimizer.add_hook(
            chainer.optimizer.GradientClipping(threshold=args.max_norm))
    if args.l2_rate > 0:
        optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.l2_rate))
    if args.l1_rate > 0:
        optimizer.add_hook(chainer.optimizer.Lasso(rate=args.l1_rate))

    # Set up the updater.
    if multi_gpu:
        logging.info('Using multiple GPUs')
        updater = training.ParallelUpdater(
            train_iter, optimizer,
            devices={'main': 0, 'second': 1},
            converter=concat_mols)
    else:
        logging.info('Using single GPU')
        updater = training.StandardUpdater(
            train_iter, optimizer, device=args.gpu, converter=concat_mols)

    # Set up the trainer.
    logging.info('Training...')
    # Early stopping caps training at 500 epochs, with 10 epochs patience
    # on the validation loss.
    early_stop = triggers.EarlyStoppingTrigger(
        monitor='validation/main/loss', patients=10,
        max_trigger=(500, 'epoch'))
    out = 'output' + '/' + args.out
    trainer = training.Trainer(updater, stop_trigger=early_stop, out=out)

    trainer.extend(
        E.Evaluator(valid_iter, classifier, device=args.gpu,
                    converter=concat_mols))
    # A non-shuffling, non-repeating iterator over the train set, shared by
    # all of the train-side metric evaluators below.
    train_eval_iter = SerialIterator(train, args.batchsize,
                                     repeat=False, shuffle=False)
    trainer.extend(
        AccuracyEvaluator(train_eval_iter, classifier, eval_func=predictor,
                          device=args.gpu, converter=concat_mols,
                          name='train_acc', pos_labels=1, ignore_labels=-1,
                          raise_value_error=False))
    # extension name='validation' is already used by `Evaluator`,
    # instead extension name `val` is used.
    trainer.extend(
        AccuracyEvaluator(valid_iter, classifier, eval_func=predictor,
                          device=args.gpu, converter=concat_mols,
                          name='val_acc', pos_labels=1, ignore_labels=-1))
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='train_roc', pos_labels=1, ignore_labels=-1,
                        raise_value_error=False))
    # extension name='validation' is already used by `Evaluator`,
    # instead extension name `val` is used.
    trainer.extend(
        ROCAUCEvaluator(valid_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='val_roc', pos_labels=1, ignore_labels=-1))
    trainer.extend(
        PRCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='train_prc', pos_labels=1, ignore_labels=-1,
                        raise_value_error=False))
    # extension name='validation' is already used by `Evaluator`,
    # instead extension name `val` is used.
    trainer.extend(
        PRCAUCEvaluator(valid_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='val_prc', pos_labels=1, ignore_labels=-1))
    trainer.extend(
        F1Evaluator(train_eval_iter, classifier, eval_func=predictor,
                    device=args.gpu, converter=concat_mols,
                    name='train_f', pos_labels=1, ignore_labels=-1,
                    raise_value_error=False))
    # extension name='validation' is already used by `Evaluator`,
    # instead extension name `val` is used.
    trainer.extend(
        F1Evaluator(valid_iter, classifier, eval_func=predictor,
                    device=args.gpu, converter=concat_mols,
                    name='val_f', pos_labels=1, ignore_labels=-1))

    # apply shift strategy to learning rate at the scheduled epochs
    if args.exp_shift_strategy == 1:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [10, 20, 30, 40, 50, 60], 'epoch'))
    elif args.exp_shift_strategy == 2:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [5, 10, 15, 20, 25, 30], 'epoch'))
    elif args.exp_shift_strategy == 3:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [5, 10, 15, 20, 25, 30, 40, 50, 60, 70], 'epoch'))
    else:
        raise ValueError('No such strategy to adapt learning rate')

    # observation of learning rate
    trainer.extend(E.observe_lr(), trigger=(1, 'iteration'))

    entries = [
        'epoch',
        'main/loss', 'train_acc/main/accuracy', 'train_roc/main/roc_auc',
        'train_prc/main/prc_auc', 'train_f/main/f1',
        'validation/main/loss', 'val_acc/main/accuracy',
        'val_roc/main/roc_auc', 'val_prc/main/prc_auc', 'val_f/main/f1',
        'lr', 'elapsed_time'
    ]
    trainer.extend(E.PrintReport(entries=entries))
    trainer.extend(E.snapshot(), trigger=(10, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.ProgressBar())
    trainer.extend(
        E.PlotReport(['main/loss', 'validation/main/loss'], 'epoch',
                     file_name='loss.png'))
    trainer.extend(
        E.PlotReport(['train_acc/main/accuracy', 'val_acc/main/accuracy'],
                     'epoch', file_name='accuracy.png'))

    if args.resume:
        resume_path = os.path.join(out, args.resume)
        logging.info(
            'Resume training according to snapshot in {}'.format(resume_path))
        chainer.serializers.load_npz(resume_path, trainer)

    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(out, args.model_filename)
    logging.info('Saving the trained models to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args.protocol)
def main():
    """Train an ImageNet convnet with Chainer.

    Supports several architectures, an optional NVIDIA DALI input pipeline,
    and device selection via a ChainerX-style device specifier.
    """
    # Architecture name -> model class.
    archs = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnext50.ResNeXt50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.set_defaults(test=False)
    parser.add_argument('--dali', action='store_true')
    parser.set_defaults(dali=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = parse_device(args)

    print('Device: {}'.format(device))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize the model to train
    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from {}'.format(args.initmodel))
        chainer.serializers.load_npz(args.initmodel, model)
    model.to_device(device)
    device.use()

    # Load the mean file
    mean = np.load(args.mean)

    if args.dali:
        if not dali_util._dali_available:
            raise RuntimeError('DALI seems not available on your system.')
        num_threads = args.loaderjob
        if num_threads is None or num_threads <= 0:
            num_threads = 1
        # Per-channel mean/std for the DALI pipeline.
        ch_mean = list(np.average(mean, axis=(1, 2)))
        ch_std = [255.0, 255.0, 255.0]
        # Setup DALI pipelines
        train_pipe = dali_util.DaliPipelineTrain(
            args.train, args.root, model.insize, args.batchsize,
            num_threads, args.gpu, True, mean=ch_mean, std=ch_std)
        val_pipe = dali_util.DaliPipelineVal(
            args.val, args.root, model.insize, args.val_batchsize,
            num_threads, args.gpu, False, mean=ch_mean, std=ch_std)
        train_iter = chainer.iterators.DaliIterator(train_pipe)
        val_iter = chainer.iterators.DaliIterator(val_pipe, repeat=False)
        converter = dali_util.DaliConverter(mean=mean, crop_size=model.insize)
    else:
        # Load the dataset files
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(
            args.val, args.root, mean, model.insize, False)
        # These iterators load the images with subprocesses running in
        # parallel to the training/validation.
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=args.loaderjob)
        val_iter = chainer.iterators.MultiprocessIterator(
            val, args.val_batchsize, repeat=False,
            n_processes=args.loaderjob)
        converter = dataset.concat_examples

    # Set up an optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # Much shorter intervals in --test mode so a dry run finishes quickly.
    val_interval = (1 if args.test else 100000), 'iteration'
    log_interval = (1 if args.test else 1000), 'iteration'

    trainer.extend(
        extensions.Evaluator(val_iter, model, converter=converter,
                             device=device),
        trigger=val_interval)
    # TODO(sonots): Temporarily disabled for chainerx. Fix it.
    if not (chainerx.is_available() and isinstance(device, chainerx.Device)):
        trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
        trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def setup_trainer(self):
    """Construct ``self.updater``/``self.trainer`` with all extensions.

    Wires up evaluation, best-model snapshotting, graph dumping, logging,
    plotting, and YAML parameter dumps around the pre-built iterators,
    optimizer, and model held on ``self``.
    """
    self.updater = chainer.training.updater.StandardUpdater(
        self.train_iterator, self.optimizer, device=self.gpu)
    self.trainer = chainer.training.Trainer(
        self.updater, (self.max_epoch, 'epoch'), out=self.out_dir)

    # Periodic validation.
    self.trainer.extend(
        extensions.Evaluator(
            self.val_iterator, self.model, device=self.gpu),
        trigger=(self.eval_interval, self.eval_interval_type))

    # Save snapshot whenever the validation loss hits a new minimum.
    self.trainer.extend(
        extensions.snapshot_object(
            self.model, savefun=S.save_npz, filename='model_snapshot.npz'),
        trigger=chainer.training.triggers.MinValueTrigger(
            'validation/main/loss',
            (self.save_interval, self.save_interval_type)))

    # Dump network architecture
    self.trainer.extend(
        extensions.dump_graph(
            root_name='main/loss', out_name='network_architecture.dot'))

    # Logging
    self.trainer.extend(
        extensions.ProgressBar(
            update_interval=self.progressbar_update_interval))
    self.trainer.extend(
        extensions.observe_lr(),
        trigger=(self.log_interval, self.log_interval_type))
    self.trainer.extend(
        extensions.LogReport(
            log_name='log.json',
            trigger=(self.log_interval, self.log_interval_type)))
    self.trainer.extend(
        extensions.PrintReport([
            'iteration',
            'epoch',
            'elapsed_time',
            'lr',
            'main/loss',
            'validation/main/loss',
        ]),
        trigger=(self.print_interval, self.print_interval_type))

    # Plot
    self.trainer.extend(
        extensions.PlotReport([
            'main/loss',
            'validation/main/loss',
        ],
            file_name='loss_plot.png',
            x_key=self.plot_interval_type,
            trigger=(self.plot_interval, self.plot_interval_type)),
        trigger=(self.plot_interval, self.plot_interval_type))

    # Dump params
    params = {
        'model_name': self.model_name,
        'train_dataset_dir': self.train_dataset_dir,
        'val_dataset_dir': self.val_dataset_dir,
        'class_names': self.train_dataset.class_names,
        'timestamp': self.timestamp_iso,
        'out_dir': self.out_dir,
        'gpu': self.gpu,
        'batch_size': self.batch_size,
        'max_epoch': self.max_epoch,
        'lr': self.lr,
        'weight_decay': self.weight_decay,
    }
    self.trainer.extend(
        fcn.extensions.ParamsReport(params, file_name='params.yaml'))

    # Dump param for fcn_object_segmentation.py
    model_name = {'model_name': self.model_name}
    self.trainer.extend(
        fcn.extensions.ParamsReport(
            model_name, file_name='model_name.yaml'))
    target_names = {'target_names': self.train_dataset.class_names}
    self.trainer.extend(
        fcn.extensions.ParamsReport(
            target_names, file_name='target_names.yaml'))