def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = PER_GPU_BATCH_SIZE

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        StepTimeCallback(),
        GPUUtilizationTracker(),
    ]
    if args.fake:
        dataset_train = FakeData([[batch, 224, 224, 3], [batch]], 1000,
                                 random=False, dtype=['uint8', 'int32'])

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    num_gpus = get_nr_gpu()

    return TrainConfig(
        model=model,
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281 // (PER_GPU_BATCH_SIZE * get_nr_gpu()),
        max_epoch=3,
    )
def get_config(args, model):
    ds_train, train_sample_num = get_data('train', args.batch_size_per_gpu)
    ds_test, _ = get_data('test', 1)  # args.batch_size_per_gpu

    training_number_of_steps = 300 * train_sample_num // (args.batch_size_per_gpu * get_nr_gpu())
    steps_per_epoch = train_sample_num // (args.batch_size_per_gpu * get_nr_gpu())
    epoch_num = int(cfg.max_itr_num / steps_per_epoch)

    callbacks = [
        ModelSaver(),
        PeriodicTrigger(InferenceRunner(ds_test, CalMIOU()), every_k_epochs=3),
        HyperParamSetterWithFunc(
            'learning_rate',
            lambda e, x: (((cfg.base_lr - cfg.end_lr) *
                           (1 - steps_per_epoch * (e + 9) / cfg.max_itr_num) ** cfg.learning_power)
                          + cfg.end_lr)),
        HumanHyperParamSetter('learning_rate'),
    ]

    return TrainConfig(
        dataflow=ds_train,
        callbacks=callbacks,
        model=model,
        steps_per_epoch=steps_per_epoch,
        max_epoch=epoch_num,
    )
def get_config():
    assert tf.test.is_gpu_available()
    nr_gpu = get_nr_gpu()
    batch = TOTAL_BATCH_SIZE // nr_gpu
    logger.info("Running on {} GPUs. Batch size per GPU: {}".format(nr_gpu, batch))

    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_gpu == 1:
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs, list(range(nr_gpu))))

    return TrainConfig(
        model=Model(),
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_gpu,
    )
def get_config():
    assert tf.test.is_gpu_available()
    nr_gpu = get_nr_gpu()
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu
    logger.info("Running on {} GPUs. Batch size per GPU: {}".format(nr_gpu, BATCH_SIZE))

    dataset_train = get_data('train')
    dataset_val = get_data('val')

    return TrainConfig(
        model=Model(),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_gpu,
    )
def get_config():
    nr_tower = max(get_nr_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    BASE_LR = 0.01 * (total_batch / 256.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate', [(0, 0.01), (3, max(BASE_LR, 0.01))], interp='linear'),
        ScheduledHyperParamSetter(
            'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    return TrainConfig(
        model=Model(),
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )
def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=100,
        nr_tower=nr_tower,
    )
def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 0.1), (3, BASE_LR)], interp='linear'),
        ScheduledHyperParamSetter('learning_rate',
                                  [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        PeriodicTrigger(
            DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))),
            every_k_epochs=1),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    return TrainConfig(
        model=model,
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // TOTAL_BATCH_SIZE,
        max_epoch=100,
    )
def get_config():
    nr_gpu = get_nr_gpu()
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu

    dataset_train = get_data('train')
    dataset_val = get_data('val')

    return TrainConfig(
        model=Model(),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]),
                every_k_epochs=2),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (55, 1e-3), (75, 1e-4), (95, 1e-5)]),
        ],
        steps_per_epoch=5000,
        max_epoch=105,
        nr_tower=nr_gpu,
    )
def get_config(fake=False, data_format='NCHW'):
    nr_tower = max(get_nr_gpu(), 1)
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = dataset_val = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, BATCH_SIZE))
        dataset_train = get_data('train')
        dataset_val = get_data('val')

    return TrainConfig(
        model=Model(data_format=data_format),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower,
    )
def get_config():
    global BATCH
    nr_tower = max(get_nr_gpu(), 1)
    BATCH = TOTAL_BATCH_SIZE // nr_tower

    logger.set_logger_dir()

    ds_train = get_data('train')
    ds_test = get_data('test')

    return TrainConfig(
        model=Model(),
        data=QueueInput(ds_train),
        callbacks=[
            ModelSaver(),
            InferenceRunner(ds_test, [ScalarStats('total_costs')]),
        ],
        extra_callbacks=[
            MovingAverageSummary(),
            ProgressBar(['']),
            MergeAllSummaries(),
            RunUpdateOps(),
        ],
        steps_per_epoch=ds_train.size(),
        max_epoch=100,
    )
def main(_):
    if FLAGS.inference_input:
        print("Not implemented")
        sys.exit()

    if FLAGS.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
        nr_tower = max(get_nr_gpu(), 1)
        batch_size_per_gpu = FLAGS.total_batch_size // nr_tower
    else:
        nr_tower = 1
        batch_size_per_gpu = FLAGS.total_batch_size

    config = get_config(batch_size_per_gpu, nr_tower)
    if FLAGS.load:
        config.session_init = SaverRestore(FLAGS.load)

    logger.info("Using {} prefetch threads".format(FLAGS.num_prefetch_threads))
    if FLAGS.gpu:
        logger.info("Using GPU training. Num towers: {} Batch per tower: {}".format(
            nr_tower, batch_size_per_gpu))
        config.nr_tower = nr_tower
        SyncMultiGPUTrainer(config).train()
    else:
        logger.info("Using CPU. Batch size: {}".format(batch_size_per_gpu))
        QueueInputTrainer(config).train()
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower,
    )
def get_config():
    log_dir = 'train_log/cifar10-bottleneck-var-N%s-L%s' % (str(BOTTLE_NUM), str(LAYER_NUM))
    logger.set_logger_dir(log_dir)
    nr_tower = max(get_nr_gpu(), 1)

    # prepare dataset
    dataset_train = get_data('train')
    steps_per_epoch = dataset_train.size()
    dataset_test = get_data('test')

    callbacks = [
        ModelSaver(),
        # InferenceRunner(dataset_test,
        #                 [ScalarStats('cost'), ClassificationError()]),
        ScheduledHyperParamSetter('learning_rate',
                                  [(1, 0.1), (args.drop_1, 0.01), (args.drop_2, 0.001)]),
        TensorPrinter(['MASK/dense_layer.{}/mask_con_var:0'.format(i) for i in xrange(7)])
        # + ['block2/dense_bottleneck.0/dense_layer.{}/mask_con/EMA:0'.format(j) for j in xrange(10)]
        # + ['block3/dense_bottleneck.0/dense_layer.{}/mask_con/EMA:0'.format(k) for k in xrange(10)]
    ]
    if nr_tower == 1:
        callbacks.append(InferenceRunner(
            dataset_test, [ScalarStats('cost'), ClassificationError()]))
    else:
        callbacks.append(DataParallelInferenceRunner(
            dataset_test, [ScalarStats('cost'), ClassificationError()], list(range(nr_tower))))

    return TrainConfig(
        dataflow=dataset_train,
        callbacks=callbacks,
        model=Model(depth=args.depth),
        steps_per_epoch=steps_per_epoch // nr_tower,
        max_epoch=args.max_epoch,
    )
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. "
                    "This probably will lead to worse accuracy than reported.")

    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=105,
    )
def get_config():
    nr_gpu = get_nr_gpu()
    global PREDICTOR_THREAD
    if nr_gpu > 0:
        if nr_gpu > 1:
            # use half gpus for inference
            predict_tower = list(range(nr_gpu))[-nr_gpu // 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    base_port = args.base_port
    name_base = str(uuid.uuid1())[:6]
    PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR', '.').rstrip('/')
    namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
    names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c, base_port=base_port)
             for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    M = Model()
    master = MySimulatorMaster(namec2s, names2c, M, predict_tower)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    return TrainConfig(
        model=M,
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(200, 0.0003), (1200, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(800, 0.005)]),
            HumanHyperParamSetter('learning_rate'),
            HumanHyperParamSetter('entropy_beta'),
            master,
            StartProcOrThread(master),
            # PeriodicTrigger(Evaluator(
            #     EVAL_EPISODE, ['state'], ['policy'], get_player),
            #     every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=10000,
        tower=train_tower,
    )
def get_config(model, fake=False):
    start_ = 0
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1), (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3), (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3), (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3), (105 - start_, BASE_LR * 1e-4)]
        else:
            raise ValueError("No learning rate schedule defined for start_ = {}".format(start_))

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0',
            #                'tower1/group3/block2/conv2/Abs_1:0',
            #                'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
def train():
    dirname = os.path.join('train_log', 'train-atari-{}'.format(ENV_NAME))
    logger.set_logger_dir(dirname)

    # assign GPUs for training & inference
    nr_gpu = get_nr_gpu()
    global PREDICTOR_THREAD
    if nr_gpu > 0:
        if nr_gpu > 1:
            # use half gpus for inference
            predict_tower = list(range(nr_gpu))[-nr_gpu // 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    name_base = str(uuid.uuid1())[:6]
    prefix = '@' if sys.platform.startswith('linux') else ''
    namec2s = 'ipc://{}sim-c2s-{}'.format(prefix, name_base)
    names2c = 'ipc://{}sim-s2c-{}'.format(prefix, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, predict_tower)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    config = TrainConfig(
        model=Model(),
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            HumanHyperParamSetter('learning_rate'),
            HumanHyperParamSetter('entropy_beta'),
            master,
            StartProcOrThread(master),
            PeriodicTrigger(Evaluator(
                EVAL_EPISODE, ['state'], ['policy'], get_player),
                every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        session_init=get_model_loader(args.load) if args.load else None,
        max_epoch=1000,
    )
    trainer = SimpleTrainer() if config.nr_tower == 1 else AsyncMultiGPUTrainer(train_tower)
    launch_train_with_config(config, trainer)
def get_config(model, scales, distill=False, fake=False, data_aug=True):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [ModelSaver()]
        if data_aug:
            callbacks.append(ScheduledHyperParamSetter(
                'learning_rate',
                [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))

        infs = []
        for scale in scales:
            infs.append(ClassificationError('wrong-scale%03d-top1' % scale,
                                            'val-error-scale%03d-top1' % scale))
            infs.append(ClassificationError('wrong-scale%03d-top5' % scale,
                                            'val-error-scale%03d-top5' % scale))
        if distill:
            infs.append(ClassificationError('wrong-scale_ensemble-top1',
                                            'val-error-scale_ensemble-top1'))
            infs.append(ClassificationError('wrong-scale_ensemble-top5',
                                            'val-error-scale_ensemble-top5'))

        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=120 if data_aug else 64,
        nr_tower=nr_tower,
    )
def _get_optimizer(self):
    lr = tf.get_variable('learning_rate', initializer=0.01, trainable=False)
    tf.summary.scalar('learning_rate', lr)
    print("get_nr_gpu", get_nr_gpu())

    if config.BIG:
        factor = 32 // (config.BATCH * get_nr_gpu())
        if factor != 1:
            lr = lr / float(factor)
            opt = tf.train.MomentumOptimizer(lr, 0.9)
            opt = optimizer.AccumGradOptimizer(opt, factor)
        else:
            opt = tf.train.MomentumOptimizer(lr, 0.9)
            # opt = tf.train.AdamOptimizer(lr)
    else:
        opt = tf.train.MomentumOptimizer(lr, 0.9)
    return opt
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                       (85, BASE_LR * 1e-3), (95, BASE_LR * 1e-4),
                                       (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, 0.1), (3, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )
def get_config(model, option):
    dataset_train = get_data('train', option)
    dataset_val = get_data('val', option)

    nr_tower = max(get_nr_gpu(), 1)
    total_batch = int(option.batch) * nr_tower
    lr_string = 'learning_rate'

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]

    START_LR = option.base_lr
    BASE_LR = START_LR
    callbacks = [
        ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=1000),
        EstimatedTimeLeft(),
        MinSaver('val-error-top1'),
        ScheduledHyperParamSetter(lr_string,
                                  [(0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1),
                                   (60, BASE_LR * 1e-2), (90, BASE_LR * 1e-3),
                                   (100, BASE_LR * 1e-4)]),
    ]
    if nr_tower == 1:
        call = [PeriodicTrigger(InferenceRunner(dataset_val, infs), every_k_epochs=2)]
    else:
        call = [PeriodicTrigger(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))), every_k_epochs=2)]
    call.extend(callbacks)

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)

    if option.cub:
        steps_per_epoch = 25 * (256 / total_batch) * option.stepscale
    else:
        steps_per_epoch = 5000 * (256 / total_batch) * option.stepscale

    return TrainConfig(
        model=model,
        data=input,
        callbacks=call,
        steps_per_epoch=int(steps_per_epoch),
        max_epoch=option.epoch,
    )
def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = PER_GPU_BATCH_SIZE

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    if not args.skip_ray:
        ray.init()

    if args.fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype=['uint8', 'int32'])
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        StepTimeCallback(),
        EpochTimeCallback(),
        GPUUtilizationTracker(),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    num_gpus = get_nr_gpu()

    return TrainConfig(
        model=model,
        data=input,
        callbacks=callbacks,
        extra_callbacks=train.DEFAULT_CALLBACKS() + [
            MergeAllSummaries(period=1),
        ],
        steps_per_epoch=DATASET_SIZE // (PER_GPU_BATCH_SIZE * get_nr_gpu()),
        max_epoch=2,
    )
def train():
    dirname = os.path.join('train_log', 'a3c_small')
    logger.set_logger_dir(dirname)

    # assign GPUs for training & inference
    nr_gpu = get_nr_gpu()
    global PREDICTOR_THREAD
    if nr_gpu > 0:
        if nr_gpu > 1:
            # use all gpus for inference
            predict_tower = list(range(nr_gpu))
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    name_base = str(uuid.uuid1())[:6]
    prefix = '@' if sys.platform.startswith('linux') else ''
    namec2s = 'ipc://{}sim-c2s-{}'.format(prefix, name_base)
    names2c = 'ipc://{}sim-s2c-{}'.format(prefix, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, predict_tower)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    config = AutoResumeTrainConfig(
        model=Model(),
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            # ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            # ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            master,
            StartProcOrThread(master),
            HumanHyperParamSetter('learning_rate'),
            Evaluator(
                100,
                ['role_id', 'policy_state_in', 'last_cards_in', 'minor_type_in'],
                ['passive_decision_prob', 'passive_bomb_prob', 'passive_response_prob',
                 'active_decision_prob', 'active_response_prob', 'active_seq_prob',
                 'minor_response_prob'],
                get_player),
        ],
        # session_init=ModelLoader('policy_network_2', 'SL_policy_network',
        #                          'value_network', 'SL_value_network'),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
    trainer = SimpleTrainer() if config.nr_tower == 1 else AsyncMultiGPUTrainer(train_tower)
    launch_train_with_config(config, trainer)
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5),
                                       (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Finetune COCO:
        #   [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        # JT COCO:
        #   [(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5), (80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        # Finetune to VOC:
        #   [(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60, 1.25e-5), (80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]

        # infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #         ClassificationError('wrong-top5', 'val-error-top5')]
        infs = [ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
                ClassificationError('loss-wrong-top5', 'loss-val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1522,
        max_epoch=140,
        nr_tower=nr_tower,
    )
def run(args):
    num_gpus = get_nr_gpu()
    num_towers = max(num_gpus, 1)

    config = get_config(args, AvatarSynthModel(args), num_gpus, num_towers)
    if args.load_path:
        config.session_init = SaverRestore(args.load_path)

    # trainer = SyncMultiGPUTrainerParameterServer(num_towers)
    # trainer = QueueInputTrainer()
    trainer = SyncMultiGPUTrainerReplicated(num_towers)
    launch_train_with_config(config, trainer)
def train():
    dirname = os.path.join('train_log', 'train-SL-1.4')
    logger.set_logger_dir(dirname)

    # assign GPUs for training & inference
    nr_gpu = get_nr_gpu()
    if nr_gpu > 0:
        train_tower = list(range(nr_gpu)) or [0]
        logger.info("[Batch-SL] Train on gpu {}".format(','.join(map(str, train_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        train_tower = [0]

    dataflow = DataFromGeneratorRNG(data_generator)
    if os.name == 'nt':
        dataflow = PrefetchData(dataflow,
                                nr_proc=multiprocessing.cpu_count() // 2,
                                nr_prefetch=multiprocessing.cpu_count() // 2)
    else:
        dataflow = PrefetchDataZMQ(dataflow, nr_proc=multiprocessing.cpu_count() // 2)
    dataflow = BatchData(dataflow, BATCH_SIZE)

    config = TrainConfig(
        model=Model(),
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            EstimatedTimeLeft(),
            # ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            # ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            # HumanHyperParamSetter('learning_rate'),
            # HumanHyperParamSetter('entropy_beta'),
            PeriodicTrigger(Evaluator(
                100,
                ['state_in', 'last_cards_in', 'minor_type_in'],
                ['passive_decision_prob', 'passive_bomb_prob', 'passive_response_prob',
                 'active_decision_prob', 'active_response_prob', 'active_seq_prob',
                 'minor_response_prob'],
                get_player), every_k_epochs=1),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=100,
    )
    trainer = AsyncMultiGPUTrainer(train_tower) if nr_gpu > 1 else SimpleTrainer()
    launch_train_with_config(config, trainer)
def get_config(model, fake=False, xla=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(10, 1e-2), (20, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))))

    config = tf.ConfigProto()
    jit_level = 0
    if xla:
        # Turns on XLA JIT compilation
        jit_level = tf.OptimizerOptions.ON_1
    config.graph_options.optimizer_options.global_jit_level = jit_level

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=10,
        max_epoch=1,
        nr_tower=nr_tower,
    )
def get_config(args, model):
    ds_train, sample_num = get_data('train', args.batch_size_per_gpu)
    ds_val, _ = get_data('test', args.batch_size_per_gpu)

    return TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(InferenceRunner(ds_val, [ScalarStats('cost')]), every_k_epochs=5),
            HumanHyperParamSetter('learning_rate'),
        ],
        model=model,
        steps_per_epoch=sample_num // (args.batch_size_per_gpu * get_nr_gpu()),
    )
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top3', 'val-error-top3')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        # train set: 437513 images, val set: 24426 images
        steps_per_epoch=100 if args.fake else 437513 // args.batch,
        max_epoch=105,
    )
def run(model):
    instance = Model(model, model.conf.data_format)
    if not model.conf.is_train:
        batch = 64
        dataset = get_data(model.conf.data_dir, 'val', batch)
        eval_on_ILSVRC12(
            instance,
            get_model_loader(model.conf.logdir + '/' + model.conf.test_step),
            dataset)
    else:
        logger.set_logger_dir(os.path.join(model.conf.logdir))
        config = get_config(instance, model.conf)
        if model.conf.reload_step:
            config.session_init = get_model_loader(
                model.conf.logdir + '/' + model.conf.reload_step)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=105,
    )
def get_config(model, checkpoint_dir, target_shape, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, target_shape, target_shape, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, target_shape)
        dataset_val = get_data('val', batch, target_shape)
        callbacks = [
            ModelSaver(checkpoint_dir=checkpoint_dir),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))))

    # 7.5 it / sec testing
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 300,  # 5000
        max_epoch=110,
        nr_tower=nr_tower,
    )
def get_config(model, fake=False, data_aug=True):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [
            ModelSaver(),
        ]
        if data_aug:
            callbacks.append(ScheduledHyperParamSetter(
                'learning_rate',
                [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=110 if data_aug else 64,
        nr_tower=nr_tower,
    )
def get_config():
    nr_tower = max(get_nr_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    assert total_batch >= 256  # otherwise the learning rate warmup is wrong.
    BASE_LR = 0.01 * (total_batch / 256.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate', [(0, 0.01), (3, max(BASE_LR, 0.01))], interp='linear'),
        ScheduledHyperParamSetter(
            'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    return TrainConfig(
        model=Model(),
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )
model = Model()

if args.eval:
    batch = 128    # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
elif args.flops:
    # manually build the graph with batch=1
    input_desc = [
        InputDesc(tf.float32, [1, 224, 224, 3], 'input'),
        InputDesc(tf.int32, [1], 'label'),
    ]
    input = PlaceholderInput()
    input.setup(input_desc)
    with TowerContext('', is_training=True):
        model.build_graph(*input.get_input_tensors())

    tf.profiler.profile(
        tf.get_default_graph(),
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.float_operation())
else:
    logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))

    nr_tower = max(get_nr_gpu(), 1)
    config = get_config(model, nr_tower)
    if args.load:
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_tower))
parser.add_argument('-n', '--num_units',
                    help='number of units in each stage', type=int, default=18)
parser.add_argument('--load', help='load model for training')
args = parser.parse_args()
NUM_UNITS = args.num_units

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

logger.auto_set_dir()

dataset_train = get_data('train')
dataset_test = get_data('test')

config = TrainConfig(
    model=Model(n=NUM_UNITS),
    dataflow=dataset_train,
    callbacks=[
        ModelSaver(),
        InferenceRunner(dataset_test,
                        [ScalarStats('cost'), ClassificationError('wrong_vector')]),
        ScheduledHyperParamSetter('learning_rate',
                                  [(1, 0.1), (82, 0.01), (123, 0.001), (300, 0.0002)]),
    ],
    max_epoch=400,
    session_init=SaverRestore(args.load) if args.load else None,
)
nr_gpu = max(get_nr_gpu(), 1)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
def get_batch_factor():
    nr_gpu = get_nr_gpu()
    assert nr_gpu in [1, 2, 4, 8], nr_gpu
    return 8 // nr_gpu
stepnum = config.STEPS_PER_EPOCH

# warmup is step based, lr is epoch based
warmup_schedule = [(0, config.BASE_LR / 3), (config.WARMUP * factor, config.BASE_LR)]
warmup_end_epoch = config.WARMUP * factor * 1. / stepnum
lr_schedule = [(int(np.ceil(warmup_end_epoch)), warmup_schedule[-1][1])]
for idx, steps in enumerate(config.LR_SCHEDULE[:-1]):
    mult = 0.1 ** (idx + 1)
    lr_schedule.append(
        (steps * factor // stepnum, config.BASE_LR * mult))

cfg = TrainConfig(
    model=Model(),
    data=QueueInput(get_train_dataflow(add_mask=config.MODE_MASK)),
    callbacks=[
        ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
        # linear warmup
        ScheduledHyperParamSetter(
            'learning_rate', warmup_schedule, interp='linear', step_based=True),
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        EvalCallback(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
    ],
    steps_per_epoch=stepnum,
    max_epoch=config.LR_SCHEDULE[2] * factor // stepnum,
    session_init=get_model_loader(args.load) if args.load else None,
)
trainer = SyncMultiGPUTrainerReplicated(get_nr_gpu())
launch_train_with_config(cfg, trainer)
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-top1-error'),
                ClassificationError('wrong-top5', 'val-top5-error')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(8, 0.03), (14, 0.02), (17, 5e-3), (19, 3e-3),
                                       (24, 1e-3), (26, 2e-4), (30, 5e-5)])
        ],
        model=Model(),
        steps_per_epoch=5000,
        max_epoch=80,
    )


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--data', help='ImageNet data root directory', required=True)
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    config = get_config()
    if args.load:
        config.session_init = SaverRestore(args.load)

    nr_tower = get_nr_gpu()
    assert nr_tower == NR_GPU
    launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
            pred = outputs[k][0]
            cv2.imwrite("out{}.png".format(
                '-fused' if k == 5 else str(k + 1)), pred * 255)
    else:
        pred = outputs[5][0]
        cv2.imwrite(output, pred * 255)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--view', help='view dataset', action='store_true')
    parser.add_argument('--run', help='run model on images')
    parser.add_argument('--output', help='fused output filename. default to out-fused.png')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.view:
        view_data()
    elif args.run:
        run(args.load, args.run, args.output)
    else:
        config = get_config()
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(
            config,
            SyncMultiGPUTrainer(max(get_nr_gpu(), 1)))
                    type=str, default='NCHW')
parser.add_argument('-d', '--depth', help='resnet depth',
                    type=int, default=18, choices=[18, 34, 50, 101, 152])
parser.add_argument('--eval', action='store_true')
parser.add_argument('--batch', default=256, type=int,
                    help='total batch size. 32 per GPU gives best accuracy, '
                         'higher values should be similarly good')
parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                    help='variants of resnet to use', default='resnet')
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

model = Model(args.depth, args.data_format, args.mode)
if args.eval:
    batch = 128    # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    if args.fake:
        logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
    else:
        logger.set_logger_dir(
            os.path.join('train_log', 'imagenet-{}-d{}'.format(args.mode, args.depth)))

    config = get_config(model, fake=args.fake)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
        if cnt == 500:
            return


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--depth', type=int, default=18)
    parser.add_argument('--load', help='load model')
    parser.add_argument('--cam', action='store_true')
    args = parser.parse_args()

    DEPTH = args.depth
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    nr_gpu = get_nr_gpu()
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu

    if args.cam:
        BATCH_SIZE = 128    # something that can run on one gpu
        viz_cam(args.load, args.data)
        sys.exit()

    logger.auto_set_dir()
    config = get_config()
    if args.load:
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))