示例#1
0
def get_config(model):
    """Build a short (3-epoch) TrainConfig with a staged queue input.

    Args:
        model: the ModelDesc to train.

    Returns:
        TrainConfig reading from a StagingInput-wrapped queue.

    Fix: removed unused locals (``infs`` metric list and ``num_gpus``) that
    were built but never referenced, and renamed ``input`` to avoid
    shadowing the builtin.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = PER_GPU_BATCH_SIZE

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    # NOTE(review): the validation dataflow is built but never wired into a
    # callback -- confirm whether an InferenceRunner was intended here.
    dataset_val = get_data('val', batch)

    callbacks = [
        StepTimeCallback(),
        GPUUtilizationTracker(),
    ]

    if args.fake:
        dataset_train = FakeData([[batch, 224, 224, 3], [batch]],
                                 1000,
                                 random=False,
                                 dtype=['uint8', 'int32'])

    data = QueueInput(dataset_train)
    data = StagingInput(data, nr_stage=1)

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=1281 // (PER_GPU_BATCH_SIZE * get_nr_gpu()),
        max_epoch=3,
    )
示例#2
0
def get_config(args, model):
    """Build a TrainConfig with a polynomial learning-rate decay schedule.

    Args:
        args: parsed CLI args; only ``batch_size_per_gpu`` is read.
        model: the ModelDesc to train.

    Returns:
        TrainConfig whose epoch count is derived from ``cfg.max_itr_num``.

    Fix: removed ``training_number_of_steps``, which was computed but never
    used (pure arithmetic, no side effects).
    """
    ds_train, train_sample_num = get_data('train', args.batch_size_per_gpu)
    # Test-time inference runs with batch size 1.
    ds_test, _ = get_data('test', 1) #args.batch_size_per_gpu)

    steps_per_epoch = train_sample_num // (args.batch_size_per_gpu * get_nr_gpu())

    epoch_num = int(cfg.max_itr_num / steps_per_epoch)

    callbacks = [
      ModelSaver(),
      PeriodicTrigger(InferenceRunner(ds_test, CalMIOU()), every_k_epochs=3),
      # Polynomial decay from cfg.base_lr to cfg.end_lr over cfg.max_itr_num
      # iterations.  NOTE(review): the (e+9) epoch offset is undocumented
      # (warm restart?) -- confirm it is intentional.
      HyperParamSetterWithFunc('learning_rate', lambda e, x: (((cfg.base_lr - cfg.end_lr) * (1 - steps_per_epoch * (e+9) / cfg.max_itr_num) ** cfg.learning_power) + cfg.end_lr)),
      HumanHyperParamSetter('learning_rate'),
    ]

    return TrainConfig(
        dataflow=ds_train,
        callbacks=callbacks,
        model=model,
        steps_per_epoch=steps_per_epoch,
        max_epoch=epoch_num,
    )
示例#3
0
def get_config():
    """Build a multi-GPU training config with step-decay LR and validation.

    Returns:
        TrainConfig tracking top-1/top-5 validation error.

    Fix: the original referenced ``nr_tower`` (never defined) in the
    inference branch and the TrainConfig, which raises NameError at
    runtime; the GPU count variable is ``nr_gpu`` and is now used
    consistently.
    """
    assert tf.test.is_gpu_available()
    nr_gpu = get_nr_gpu()
    # Total batch is split evenly across GPUs.
    batch = TOTAL_BATCH_SIZE // nr_gpu
    logger.info("Running on {} GPUs. Batch size per GPU: {}".format(
        nr_gpu, batch))

    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(30, 1e-2), (60, 1e-3),
                                                    (85, 1e-4), (95, 1e-5),
                                                    (105, 1e-6)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_gpu == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference sharded across all towers
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_gpu))))

    return TrainConfig(model=Model(),
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=110,
                       nr_tower=nr_gpu)
示例#4
0
def get_config():
    """Assemble the data-parallel training configuration.

    Side effect: rebinds the global BATCH_SIZE to the per-GPU batch.
    """
    assert tf.test.is_gpu_available()
    nr_gpu = get_nr_gpu()
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu
    logger.info("Running on {} GPUs. Batch size per GPU: {}".format(nr_gpu, BATCH_SIZE))

    dataset_train = get_data('train')
    dataset_val = get_data('val')

    validation_metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]
    lr_schedule = [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]
    callbacks = [
        ModelSaver(),
        InferenceRunner(dataset_val, validation_metrics),
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
    ]

    return TrainConfig(
        model=Model(),
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_gpu,
    )
示例#5
0
def get_config():
    """Training config: linear warmup then step decay, LR scaled by total batch."""
    nr_tower = max(get_nr_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    # Linear LR scaling rule: 0.01 per 256 samples of total batch.
    BASE_LR = 0.01 * (total_batch / 256.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]
    warmup = ScheduledHyperParamSetter(
        'learning_rate', [(0, 0.01), (3, max(BASE_LR, 0.01))], interp='linear')
    decay = ScheduledHyperParamSetter(
        'learning_rate',
        [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)])
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        warmup,
        decay,
        DataParallelInferenceRunner(dataset_val, metrics, list(range(nr_tower))),
    ]

    # Named to avoid shadowing the builtin input().
    train_input = StagingInput(QueueInput(dataset_train), nr_stage=1)
    return TrainConfig(
        model=Model(),
        data=train_input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )
示例#6
0
def get_config(model):
    """Training config with step-decay LR and top-1/top-5 validation."""
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        inference = InferenceRunner(QueueInput(dataset_val), metrics)
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        inference = DataParallelInferenceRunner(
            dataset_val, metrics, list(range(nr_tower)))
    callbacks.append(inference)

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=100,
                       nr_tower=nr_tower)
示例#7
0
def get_config(model):
    """ImageNet-scale training config: warmup + step decay, validation every epoch."""
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        # Linear warmup to BASE_LR over the first 3 epochs.
        ScheduledHyperParamSetter('learning_rate', [(0, 0.1), (3, BASE_LR)],
                                  interp='linear'),
        # Step decay afterwards.
        ScheduledHyperParamSetter('learning_rate', [(30, BASE_LR * 1e-1),
                                                    (60, BASE_LR * 1e-2),
                                                    (80, BASE_LR * 1e-3)]),
        PeriodicTrigger(
            DataParallelInferenceRunner(dataset_val, metrics,
                                        list(range(nr_tower))),
            every_k_epochs=1),
    ]

    # Named to avoid shadowing the builtin input().
    train_input = StagingInput(QueueInput(dataset_train), nr_stage=1)
    return TrainConfig(
        model=model,
        data=train_input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // TOTAL_BATCH_SIZE,
        max_epoch=100,
    )
示例#8
0
def get_config():
    """Training config; validation every 2 epochs, step-decay LR.

    Side effect: rebinds the global BATCH_SIZE to the per-GPU batch.
    """
    nr_gpu = get_nr_gpu()
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu

    dataset_train = get_data('train')
    dataset_val = get_data('val')

    metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]
    callbacks = [
        ModelSaver(),
        PeriodicTrigger(InferenceRunner(dataset_val, metrics),
                        every_k_epochs=2),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(30, 1e-2), (55, 1e-3), (75, 1e-4), (95, 1e-5)]),
    ]

    return TrainConfig(
        model=Model(),
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=105,
        nr_tower=nr_gpu)
示例#9
0
def get_config(fake=False, data_format='NCHW'):
    """Training config; with ``fake=True`` a fixed synthetic dataflow is used.

    Side effect: rebinds the global BATCH_SIZE to the per-tower batch.
    """
    nr_tower = max(get_nr_gpu(), 1)
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = dataset_val = FakeData([[64, 224, 224, 3], [64]],
                                               1000,
                                               random=False,
                                               dtype='uint8')
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, BATCH_SIZE))
        dataset_train = get_data('train')
        dataset_val = get_data('val')

    metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]
    lr_schedule = [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]
    return TrainConfig(
        model=Model(data_format=data_format),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_val, metrics),
            ScheduledHyperParamSetter('learning_rate', lr_schedule),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower)
示例#10
0
def get_config():
    """Queue-fed training config with ScalarStats validation each epoch.

    Side effect: rebinds the global BATCH and sets the logger directory.
    """
    global BATCH
    nr_tower = max(get_nr_gpu(), 1)
    BATCH = TOTAL_BATCH_SIZE // nr_tower
    logger.set_logger_dir()

    ds_train = get_data('train')
    ds_test = get_data('test')

    validation = InferenceRunner(ds_test, [ScalarStats('total_costs')])
    return TrainConfig(
        model=Model(),
        data=QueueInput(ds_train),
        callbacks=[ModelSaver(), validation],
        extra_callbacks=[
            MovingAverageSummary(),
            ProgressBar(['']),
            MergeAllSummaries(),
            RunUpdateOps(),
        ],
        steps_per_epoch=ds_train.size(),
        max_epoch=100,
    )
def main(_):
    """Entry point: pick CPU/GPU mode, build the config, launch the trainer."""
    if FLAGS.inference_input:
        print("Not implemented")
        sys.exit()

    if FLAGS.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
        nr_tower = max(get_nr_gpu(), 1)
        batch_size_per_gpu = FLAGS.total_batch_size // nr_tower
    else:
        nr_tower = 1
        batch_size_per_gpu = FLAGS.total_batch_size

    config = get_config(batch_size_per_gpu, nr_tower)
    if FLAGS.load:
        config.session_init = SaverRestore(FLAGS.load)
    logger.info("Using {} prefetch threads".format(FLAGS.num_prefetch_threads))

    if not FLAGS.gpu:
        # CPU fallback: single queue-fed trainer.
        logger.info("Using CPU. Batch size: {}".format(batch_size_per_gpu))
        QueueInputTrainer(config).train()
        return

    logger.info(
        "Using GPU training. Num towers: {} Batch per tower: {}".format(
            nr_tower, batch_size_per_gpu))
    config.nr_tower = nr_tower
    SyncMultiGPUTrainer(config).train()
示例#12
0
def get_config(model, fake=False):
    """Training config; ``fake=True`` benchmarks on synthetic data, no callbacks."""
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter(
                'learning_rate',
                [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        metrics = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5'),
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            validation = InferenceRunner(QueueInput(dataset_val), metrics)
        else:
            # multi-GPU inference sharded across all towers
            validation = DataParallelInferenceRunner(
                dataset_val, metrics, list(range(nr_tower)))
        callbacks.append(validation)

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower
    )
示例#13
0
def get_config():
    """CIFAR-10 bottleneck-DenseNet training config.

    Side effect: sets the logger directory from BOTTLE_NUM/LAYER_NUM.

    Fixes: ``xrange`` is Python-2 only and raises NameError on Python 3
    (the rest of the file uses Python-3 idioms such as ``list(range(...))``)
    -- replaced with ``range``; tab-indented continuation lines normalized
    to spaces; dead commented-out code removed.
    """
    log_dir = 'train_log/cifar10-bottleneck-var-N%s-L%s' % (str(BOTTLE_NUM), str(LAYER_NUM))
    logger.set_logger_dir(log_dir)
    nr_tower = max(get_nr_gpu(), 1)

    # prepare dataset
    dataset_train = get_data('train')
    steps_per_epoch = dataset_train.size()
    dataset_test = get_data('test')

    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(1, 0.1), (args.drop_1, 0.01), (args.drop_2, 0.001)]),
        # Print the 7 per-layer mask variables during training.
        TensorPrinter(['MASK/dense_layer.{}/mask_con_var:0'.format(i)
                       for i in range(7)]),
    ]
    inference_metrics = [ScalarStats('cost'), ClassificationError()]
    if nr_tower == 1:
        callbacks.append(InferenceRunner(dataset_test, inference_metrics))
    else:
        callbacks.append(DataParallelInferenceRunner(
            dataset_test, inference_metrics, list(range(nr_tower))))

    return TrainConfig(
        dataflow=dataset_train,
        callbacks=callbacks,
        model=Model(depth=args.depth),
        steps_per_epoch=steps_per_epoch // nr_tower,
        max_epoch=args.max_epoch,
    )
示例#14
0
def get_config(model, fake=False):
    """Build a TrainConfig with linearly-scaled LR, warmup, and validation.

    Args:
        model: the ModelDesc to train.
        fake: if True, train on synthetic data with no callbacks (benchmark).

    Returns:
        TrainConfig reading from a queue input.

    Fix: ``steps_per_epoch`` previously tested the global ``args.fake``
    while the rest of the function branched on the ``fake`` parameter; it
    now uses ``fake`` consistently.
    """
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )
    if fake:
        data = QueueInput(
            FakeData([[batch, 224, 224, 3], [batch]],
                     1000,
                     random=False,
                     dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        # Linear LR scaling rule: 0.1 per total batch of 256.
        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (90, BASE_LR * 1e-3),
                                       (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            # Warm up linearly over the first 5 epochs when LR is scaled up.
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, START_LR),
                                                            (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if fake else 1280000 // args.batch,
        max_epoch=105,
    )
def get_config():
    """Build the Batch-A3C TrainConfig: split GPUs between training and
    inference, spawn simulator worker processes, and wire the experience
    queue into a dataflow.

    Side effects: rebinds the global PREDICTOR_THREAD and starts the
    simulator worker processes before returning.
    """
    nr_gpu = get_nr_gpu()
    global PREDICTOR_THREAD
    if nr_gpu > 0:
        if nr_gpu > 1:
            # use half gpus for inference
            predict_tower = list(range(nr_gpu))[-nr_gpu // 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        # The remaining GPUs train; fall back to GPU 0 if the slice is empty.
        train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str,
                                                          predict_tower))))
    else:
        logger.warn(
            "Without GPU this model will never learn! CPU is only useful for debug."
        )
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    # Unique IPC endpoint names so multiple runs can coexist on one machine.
    base_port = args.base_port
    name_base = str(uuid.uuid1())[:6]
    PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR', '.').rstrip('/')
    namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
    names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)
    procs = [
        MySimulatorWorker(k, namec2s, names2c, base_port=base_port)
        for k in range(SIMULATOR_PROC)
    ]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    M = Model()
    master = MySimulatorMaster(namec2s, names2c, M, predict_tower)
    # Experience batches come from the master's queue, not a dataset.
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    return TrainConfig(
        model=M,
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(200, 0.0003),
                                                        (1200, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(800, 0.005)]),
            HumanHyperParamSetter('learning_rate'),
            HumanHyperParamSetter('entropy_beta'),
            # The master itself is a callback and also runs as a thread.
            master,
            StartProcOrThread(master),
            #PeriodicTrigger(Evaluator(
            #    EVAL_EPISODE, ['state'], ['policy'], get_player),
            #    every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=10000,
        tower=train_tower)
示例#16
0
def get_config(model, fake=False):
    """TrainConfig for a run (re)starting at epoch ``start_``, with the LR
    schedule shifted accordingly.

    Args:
        model: the ModelDesc to train.
        fake: if True, benchmark on synthetic data with no callbacks.

    Fixes: the final schedule branch only printed a message, leaving
    ``lr_setting`` undefined and causing a NameError when it was used --
    it now raises ValueError.  ``steps_per_epoch`` now tests the ``fake``
    parameter instead of the global ``args.fake``.
    """
    start_ = 0  # epoch this run resumes from; shifts the LR milestones left
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        # Linear LR scaling rule: 0.1 per total batch of 256.
        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        # Usual 30/60/90/105 milestones, shifted by the resume epoch.
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            raise ValueError(
                'no learning rate setting for start_={}'.format(start_))

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_setting),
        ]
        if BASE_LR > START_LR:
            # Warm up linearly over the first 5 epochs when LR is scaled up.
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if fake else 1280000 // args.batch,
        max_epoch=120,
    )
示例#17
0
def train():
    """Launch A3C Atari training: split GPUs between training and inference,
    start simulator worker processes, and run the trainer.

    Side effects: sets the logger directory, rebinds the global
    PREDICTOR_THREAD, and starts simulator worker processes.
    """
    dirname = os.path.join('train_log', 'train-atari-{}'.format(ENV_NAME))
    logger.set_logger_dir(dirname)

    # assign GPUs for training & inference
    nr_gpu = get_nr_gpu()
    global PREDICTOR_THREAD
    if nr_gpu > 0:
        if nr_gpu > 1:
            # use half gpus for inference
            predict_tower = list(range(nr_gpu))[-nr_gpu // 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        # The remaining GPUs train; fall back to GPU 0 if the slice is empty.
        train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    # '@' prefix on Linux -- presumably the abstract-socket IPC namespace;
    # TODO confirm against the transport library in use.
    name_base = str(uuid.uuid1())[:6]
    prefix = '@' if sys.platform.startswith('linux') else ''
    namec2s = 'ipc://{}sim-c2s-{}'.format(prefix, name_base)
    names2c = 'ipc://{}sim-s2c-{}'.format(prefix, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, predict_tower)
    # Experience batches come from the master's queue, not a dataset.
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    config = TrainConfig(
        model=Model(),
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            HumanHyperParamSetter('learning_rate'),
            HumanHyperParamSetter('entropy_beta'),
            # The master itself is a callback and also runs as a thread.
            master,
            StartProcOrThread(master),
            PeriodicTrigger(Evaluator(
                EVAL_EPISODE, ['state'], ['policy'], get_player),
                every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        session_init=get_model_loader(args.load) if args.load else None,
        max_epoch=1000,
    )
    # NOTE(review): nr_tower is never passed to TrainConfig above, so this
    # reads its default -- confirm the multi-GPU branch is ever taken.
    trainer = SimpleTrainer() if config.nr_tower == 1 else AsyncMultiGPUTrainer(train_tower)
    launch_train_with_config(config, trainer)
示例#18
0
def get_config(model, scales, distill=False, fake=False, data_aug=True):
    """Auto-resumable multi-scale training config with per-scale top-1/top-5 metrics."""
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [ModelSaver()]
        if data_aug:
            lr_schedule = [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                           (95, 1e-5), (105, 1e-6)]
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', lr_schedule))
        callbacks.append(HumanHyperParamSetter('learning_rate'))

        # One top-1/top-5 error pair per evaluated scale.
        metrics = []
        for scale in scales:
            metrics.append(
                ClassificationError('wrong-scale%03d-top1' % scale,
                                    'val-error-scale%03d-top1' % scale))
            metrics.append(
                ClassificationError('wrong-scale%03d-top5' % scale,
                                    'val-error-scale%03d-top5' % scale))
        if distill:
            # The scale-ensemble prediction gets its own error pair.
            metrics.append(
                ClassificationError('wrong-scale_ensemble-top1',
                                    'val-error-scale_ensemble-top1'))
            metrics.append(
                ClassificationError('wrong-scale_ensemble-top5',
                                    'val-error-scale_ensemble-top5'))
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            validation = InferenceRunner(QueueInput(dataset_val), metrics)
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            validation = DataParallelInferenceRunner(
                dataset_val, metrics, list(range(nr_tower)))
        callbacks.append(validation)

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=120 if data_aug else 64,
        nr_tower=nr_tower)
示例#19
0
    def _get_optimizer(self):
        """Return a Momentum optimizer; when ``config.BIG`` and the effective
        batch is small, scale the LR down and accumulate gradients instead.
        """
        learning_rate = tf.get_variable('learning_rate',
                                        initializer=0.01,
                                        trainable=False)
        tf.summary.scalar('learning_rate', learning_rate)
        print("get_nr_gpu", get_nr_gpu())

        if not config.BIG:
            return tf.train.MomentumOptimizer(learning_rate, 0.9)

        factor = 32 // (config.BATCH * get_nr_gpu())
        if factor == 1:
            return tf.train.MomentumOptimizer(learning_rate, 0.9)

        # Accumulate `factor` steps with a proportionally smaller LR --
        # presumably to emulate an effective batch of 32; confirm.
        scaled_lr = learning_rate / float(factor)
        momentum_opt = tf.train.MomentumOptimizer(scaled_lr, 0.9)
        return optimizer.AccumGradOptimizer(momentum_opt, factor)
示例#20
0
def get_config(model, fake=False):
    """TrainConfig with linearly-scaled LR, warmup, and top-1/top-5 validation.

    Args:
        model: the ModelDesc to train.
        fake: if True, benchmark on synthetic data with no callbacks.

    Fix: ``steps_per_epoch`` previously tested the global ``args.fake``
    while the rest of the function branched on the ``fake`` parameter; it
    now uses ``fake`` consistently.
    """
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        # Linear LR scaling rule: 0.1 per total batch of 256.
        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (85, BASE_LR * 1e-3),
                                       (95, BASE_LR * 1e-4),
                                       (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            # Warm up linearly over the first 3 epochs when LR is scaled up.
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, 0.1),
                                                            (3, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if fake else 1280000 // args.batch,
        max_epoch=110,
    )
示例#21
0
def get_config(model, option):
    """Option-driven training config; validation (every 2 epochs) runs before
    the other callbacks."""
    dataset_train = get_data('train', option)
    dataset_val = get_data('val', option)

    nr_tower = max(get_nr_gpu(), 1)
    total_batch = int(option.batch) * nr_tower
    lr_string = 'learning_rate'
    metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]

    START_LR = option.base_lr
    BASE_LR = START_LR

    common_callbacks = [
        ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=1000),
        EstimatedTimeLeft(),
        MinSaver('val-error-top1'),
        ScheduledHyperParamSetter(lr_string, [(0, min(START_LR, BASE_LR)),
                                              (30, BASE_LR * 1e-1),
                                              (60, BASE_LR * 1e-2),
                                              (90, BASE_LR * 1e-3),
                                              (100, BASE_LR * 1e-4)]),
    ]

    if nr_tower == 1:
        runner = InferenceRunner(dataset_val, metrics)
    else:
        runner = DataParallelInferenceRunner(
            dataset_val, metrics, list(range(nr_tower)))
    call = [PeriodicTrigger(runner, every_k_epochs=2)]
    call.extend(common_callbacks)

    # Named to avoid shadowing the builtin input().
    train_input = StagingInput(QueueInput(dataset_train), nr_stage=1)

    # Scale the step count so an "epoch" covers a fixed sample budget
    # regardless of the total batch size.
    base_steps = 25 if option.cub else 5000
    steps_per_epoch = base_steps * (256 / total_batch) * option.stepscale

    return TrainConfig(
        model=model,
        data=train_input,
        callbacks=call,
        steps_per_epoch=int(steps_per_epoch),
        max_epoch=option.epoch,
    )
示例#22
0
def get_config(model):
    """Short (2-epoch) benchmark TrainConfig with timing callbacks.

    Args:
        model: the ModelDesc to train.

    Side effect: initializes Ray unless ``args.skip_ray`` is set.

    Fix: removed unused locals (``infs`` metric list and ``num_gpus``) that
    were built but never referenced, and renamed ``input`` to avoid
    shadowing the builtin.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = PER_GPU_BATCH_SIZE

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    if not args.skip_ray:
        ray.init()

    if args.fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000,
            random=False, dtype=['uint8', 'int32'])
    else:
        dataset_train = get_data('train', batch)
        # NOTE(review): the validation dataflow is built but never wired into
        # a callback -- confirm whether an InferenceRunner was intended.
        dataset_val = get_data('val', batch)

    callbacks = [
        StepTimeCallback(),
        EpochTimeCallback(),
        GPUUtilizationTracker(),
    ]

    data = StagingInput(QueueInput(dataset_train), nr_stage=1)

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        extra_callbacks=train.DEFAULT_CALLBACKS() + [
            MergeAllSummaries(period=1),
        ],
        steps_per_epoch=DATASET_SIZE // (PER_GPU_BATCH_SIZE * get_nr_gpu()),
        max_epoch=2,
    )
示例#23
0
def train():
    """Run Batch-A3C training: spawn simulator workers, connect them to the
    master over ZeroMQ IPC, and launch the (possibly multi-GPU) trainer.

    Mutates the module-global PREDICTOR_THREAD to match the number of
    inference towers.
    """
    dirname = os.path.join('train_log', 'a3c_small')
    logger.set_logger_dir(dirname)

    # assign GPUs for training & inference
    nr_gpu = get_nr_gpu()
    global PREDICTOR_THREAD
    if nr_gpu > 0:
        if nr_gpu > 1:
            # use all gpus for inference
            predict_tower = list(range(nr_gpu))
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        # Train on the first half of the GPUs (drops the last nr_gpu//2);
        # `or [0]` covers nr_gpu == 1, where the slice is empty.
        train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # Set up simulator processes talking over ZMQ IPC sockets; the '@' prefix
    # on Linux presumably selects the abstract socket namespace (no file on
    # disk) — TODO confirm against the ZMQ ipc transport docs.
    name_base = str(uuid.uuid1())[:6]
    prefix = '@' if sys.platform.startswith('linux') else ''
    namec2s = 'ipc://{}sim-c2s-{}'.format(prefix, name_base)
    names2c = 'ipc://{}sim-s2c-{}'.format(prefix, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, predict_tower)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    config = AutoResumeTrainConfig(
        model=Model(),
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            # ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            # ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            master,
            StartProcOrThread(master),
            HumanHyperParamSetter('learning_rate'),
            # Periodic evaluation of all policy heads via self-play.
            Evaluator(
                100, ['role_id', 'policy_state_in', 'last_cards_in', 'minor_type_in'],
                ['passive_decision_prob', 'passive_bomb_prob', 'passive_response_prob',
                 'active_decision_prob', 'active_response_prob', 'active_seq_prob', 'minor_response_prob'], get_player),
        ],
        # session_init=ModelLoader('policy_network_2', 'SL_policy_network', 'value_network', 'SL_value_network'),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
    # NOTE(review): relies on `config.nr_tower` existing on
    # AutoResumeTrainConfig — confirm this attribute is available.
    trainer = SimpleTrainer() if config.nr_tower == 1 else AsyncMultiGPUTrainer(train_tower)
    launch_train_with_config(config, trainer)
示例#24
0
def get_config(model, fake=False):
    """Assemble the TrainConfig; `fake` swaps in synthetic benchmark data."""
    num_towers = max(get_nr_gpu(), 1)
    per_tower_batch = TOTAL_BATCH_SIZE // num_towers

    if fake:
        # Benchmark mode: fixed synthetic batches, no callbacks at all.
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        train_ds = FakeData([[64, 224, 224, 3], [64]],
                            1000,
                            random=False,
                            dtype='uint8')
        return TrainConfig(model=model,
                           dataflow=train_ds,
                           callbacks=[],
                           steps_per_epoch=1522,
                           max_epoch=140,
                           nr_tower=num_towers)

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, per_tower_batch))
    train_ds = get_data('train', per_tower_batch)
    val_ds = get_data('val', per_tower_batch)

    # Stepwise LR decay; the human setter allows manual overrides at runtime.
    lr_schedule = [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4),
                   (60, 5e-5), (80, 2.5e-5),
                   (100, 1.25e-5), (120, 5e-6)]
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        HumanHyperParamSetter('learning_rate'),
    ]

    # Validation top-1 / top-5 error metrics.
    metrics = [
        ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
        ClassificationError('loss-wrong-top5', 'loss-val-error-top5'),
    ]
    if num_towers == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(val_ds), metrics))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(val_ds, metrics,
                                        list(range(num_towers))))

    return TrainConfig(model=model,
                       dataflow=train_ds,
                       callbacks=callbacks,
                       steps_per_epoch=1522,
                       max_epoch=140,
                       nr_tower=num_towers)
示例#25
0
def run(args):
    """Launch replicated multi-GPU training of the avatar-synth model."""
    gpu_count = get_nr_gpu()
    tower_count = max(gpu_count, 1)

    config = get_config(args, AvatarSynthModel(args), gpu_count, tower_count)
    if args.load_path:
        # Warm-start / resume from a saved checkpoint.
        config.session_init = SaverRestore(args.load_path)

    launch_train_with_config(config, SyncMultiGPUTrainerReplicated(tower_count))
示例#26
0
def train():
    """Train the supervised-learning model on all available GPUs.

    Falls back to a single (CPU) tower with a warning when no GPU is
    visible. Relies on module globals: ``data_generator``, ``BATCH_SIZE``,
    ``STEPS_PER_EPOCH``, ``Model``, ``Evaluator`` and ``get_player``.
    """
    dirname = os.path.join('train_log', 'train-SL-1.4')
    logger.set_logger_dir(dirname)

    # assign GPUs for training & inference
    nr_gpu = get_nr_gpu()
    if nr_gpu > 0:
        train_tower = list(range(nr_gpu)) or [0]
        logger.info("[Batch-SL] Train on gpu {}".format(','.join(
            map(str, train_tower))))
    else:
        logger.warn(
            "Without GPU this model will never learn! CPU is only useful for debug."
        )
        # BUGFIX: was `train_tower = [0], [0]`, which bound the *tuple*
        # ([0], [0]) to train_tower (leftover from a two-target unpacking)
        # and would break any consumer expecting a list of tower indices.
        train_tower = [0]

    dataflow = DataFromGeneratorRNG(data_generator)
    # Parallelize the generator; the ZMQ-based prefetcher is not usable on
    # Windows, so fall back to process-based PrefetchData there.
    if os.name == 'nt':
        dataflow = PrefetchData(dataflow,
                                nr_proc=multiprocessing.cpu_count() // 2,
                                nr_prefetch=multiprocessing.cpu_count() // 2)
    else:
        dataflow = PrefetchDataZMQ(dataflow,
                                   nr_proc=multiprocessing.cpu_count() // 2)
    dataflow = BatchData(dataflow, BATCH_SIZE)
    config = TrainConfig(
        model=Model(),
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            EstimatedTimeLeft(),
            # Periodic evaluation of all policy heads.
            PeriodicTrigger(Evaluator(
                100, ['state_in', 'last_cards_in', 'minor_type_in'], [
                    'passive_decision_prob', 'passive_bomb_prob',
                    'passive_response_prob', 'active_decision_prob',
                    'active_response_prob', 'active_seq_prob',
                    'minor_response_prob'
                ], get_player),
                            every_k_epochs=1),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=100,
    )
    trainer = AsyncMultiGPUTrainer(
        train_tower) if nr_gpu > 1 else SimpleTrainer()
    launch_train_with_config(config, trainer)
示例#27
0
def get_config(model, fake=False, xla=False):
    """Build a tiny TrainConfig (10 steps, 1 epoch) — benchmark-oriented.

    Args:
        model: ModelDesc to train.
        fake: use synthetic 64-per-tower batches instead of real data.
        xla: intended to enable XLA JIT compilation (see NOTE below).
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            # Step LR decay; the human setter allows live overrides.
            ScheduledHyperParamSetter('learning_rate', [(10, 1e-2), (20, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Validation top-1 / top-5 error metrics.
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

        config = tf.ConfigProto()
        jit_level = 0
        if xla:
            # Turns on XLA JIT compilation
            jit_level = tf.OptimizerOptions.ON_1
        config.graph_options.optimizer_options.global_jit_level = jit_level
        # NOTE(review): this ConfigProto is never passed to the session or to
        # the returned TrainConfig, so the `xla` flag currently has no
        # effect; it is also only built in the non-fake branch. Wire it into
        # the trainer's session config or delete it.

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=10,
                       max_epoch=1,
                       nr_tower=nr_tower)
示例#28
0
def get_config(args, model):
    """Build a TrainConfig whose epoch length matches the training-set size."""
    train_flow, num_train_samples = get_data('train', args.batch_size_per_gpu)
    val_flow, _ = get_data('test', args.batch_size_per_gpu)

    # Evaluate the scalar cost on the test split every 5 epochs.
    eval_cb = PeriodicTrigger(
        InferenceRunner(val_flow, [ScalarStats('cost')]), every_k_epochs=5)

    return TrainConfig(
        dataflow=train_flow,
        callbacks=[
            ModelSaver(),
            eval_cb,
            HumanHyperParamSetter('learning_rate'),
        ],
        model=model,
        steps_per_epoch=num_train_samples // (args.batch_size_per_gpu * get_nr_gpu()),
    )
示例#29
0
def get_config(model, fake=False):
    """Build a TrainConfig with LR warmup + step decay.

    Args:
        model: ModelDesc to train.
        fake: if True, benchmark with synthetic data.

    Relies on the module-level ``args`` for the total batch size.
    """
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        # Synthetic input: measures the training step without the pipeline.
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        # Linear-scaling rule: base LR grows with the total batch size.
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            # Warm up linearly over the first 5 epochs when LR is scaled up.
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top3', 'val-error-top3')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        # BUGFIX: use the `fake` parameter, not the global `args.fake`,
        # which may disagree with how this function was called.
        # train set: 437513 images, val set: 24426.
        steps_per_epoch=100 if fake else 437513 // args.batch,
        max_epoch=105,
    )
示例#30
0
def run(model):
    """Evaluate on ILSVRC12 or train, depending on model.conf.is_train."""
    instance = Model(model, model.conf.data_format)

    if model.conf.is_train:
        logger.set_logger_dir(os.path.join(model.conf.logdir))
        config = get_config(instance, model.conf)
        if model.conf.reload_step:
            # Resume from the requested checkpoint inside the log directory.
            ckpt = model.conf.logdir + '/' + model.conf.reload_step
            config.session_init = get_model_loader(ckpt)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
    else:
        # Pure evaluation: score a saved checkpoint on the validation set
        # with a batch size that fits on one GPU.
        dataset = get_data(model.conf.data_dir, 'val', 64)
        eval_on_ILSVRC12(
            instance,
            get_model_loader(model.conf.logdir + '/' + model.conf.test_step),
            dataset)
示例#31
0
def get_config(model, fake=False):
    """Build an ImageNet TrainConfig (ResNet-style schedule).

    Args:
        model: ModelDesc to train.
        fake: if True, benchmark with synthetic 64-per-tower batches.

    Relies on the module-level ``args`` for the total batch size.
    """
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        # Linear-scaling rule: base LR proportional to total batch size.
        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > 0.1:
            # Warm up over the first 3 epochs when the LR is scaled up.
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        # BUGFIX: use the `fake` parameter instead of the global `args.fake`,
        # which may disagree with how this function was called.
        steps_per_epoch=100 if fake else 1280000 // args.batch,
        max_epoch=105,
    )
def get_config(model, checkpoint_dir, target_shape, fake=False):
    """Build a TrainConfig for images of size ``target_shape``.

    Args:
        model: ModelDesc to train.
        checkpoint_dir: directory where ModelSaver writes checkpoints.
        target_shape: square image side length fed to the network.
        fake: if True, benchmark with synthetic 64-per-tower batches.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, target_shape, target_shape, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, target_shape)
        dataset_val = get_data('val', batch, target_shape)
        callbacks = [
            ModelSaver(checkpoint_dir=checkpoint_dir),
            # Step LR decay; the human setter allows live overrides.
            ScheduledHyperParamSetter('learning_rate', [(30, 1e-2), (60, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    # ~7.5 it/sec observed in testing.
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        # BUGFIX: use the `fake` parameter, not the global `args.fake`.
        steps_per_epoch=100 if fake else 300,  # 5000 for a full epoch
        max_epoch=110,
        nr_tower=nr_tower)
示例#33
0
def get_config(model, fake=False, data_aug=True):
    """Create an AutoResumeTrainConfig.

    `fake` benchmarks with synthetic data and no callbacks; `data_aug`
    toggles augmentation in the dataflow, the LR-decay schedule, and the
    epoch budget.
    """
    num_towers = max(get_nr_gpu(), 1)
    per_tower_batch = TOTAL_BATCH_SIZE // num_towers

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        train_flow = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        cbs = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(num_towers, per_tower_batch))
        train_flow = get_data('train', per_tower_batch, data_aug)
        val_flow = get_data('val', per_tower_batch, data_aug)

        cbs = [ModelSaver()]
        if data_aug:
            # Step decay only applies to the long (augmented) run.
            cbs.append(ScheduledHyperParamSetter(
                'learning_rate',
                [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]))
        cbs.append(HumanHyperParamSetter('learning_rate'))

        metrics = [ClassificationError('wrong-top1', 'val-error-top1'),
                   ClassificationError('wrong-top5', 'val-error-top5')]
        if num_towers == 1:
            # single-GPU inference with queue prefetch
            cbs.append(InferenceRunner(QueueInput(val_flow), metrics))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            cbs.append(DataParallelInferenceRunner(
                val_flow, metrics, list(range(num_towers))))

    epoch_steps = 5000 if TOTAL_BATCH_SIZE == 256 else 10000
    return AutoResumeTrainConfig(
        model=model,
        dataflow=train_flow,
        callbacks=cbs,
        steps_per_epoch=epoch_steps,
        max_epoch=110 if data_aug else 64,
        nr_tower=num_towers
    )
示例#34
0
def get_config():
    """ImageNet TrainConfig with linear LR warmup scaled by total batch size."""
    num_towers = max(get_nr_gpu(), 1)
    per_gpu_batch = args.batch
    total_batch = per_gpu_batch * num_towers
    assert total_batch >= 256   # otherwise the learning rate warmup is wrong.
    BASE_LR = 0.01 * (total_batch / 256.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(num_towers, per_gpu_batch))
    train_flow = get_data('train', per_gpu_batch)
    val_flow = get_data('val', per_gpu_batch)

    # Validation top-1 / top-5 error metrics.
    metrics = [ClassificationError('wrong-top1', 'val-error-top1'),
               ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        # Warm up linearly for 3 epochs, then apply step decay.
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, 0.01), (3, max(BASE_LR, 0.01))], interp='linear'),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(val_flow, metrics,
                                    list(range(num_towers))),
    ]

    # Queue the dataflow and stage one batch ahead on the GPU.
    staged_input = StagingInput(QueueInput(train_flow), nr_stage=1)
    return TrainConfig(
        model=Model(),
        data=staged_input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )
示例#35
0
    model = Model()

    if args.eval:
        batch = 128    # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    elif args.flops:
        # manually build the graph with batch=1
        input_desc = [
            InputDesc(tf.float32, [1, 224, 224, 3], 'input'),
            InputDesc(tf.int32, [1], 'label')
        ]
        input = PlaceholderInput()
        input.setup(input_desc)
        with TowerContext('', is_training=True):
            model.build_graph(*input.get_input_tensors())

        tf.profiler.profile(
            tf.get_default_graph(),
            cmd='op',
            options=tf.profiler.ProfileOptionBuilder.float_operation())
    else:
        logger.set_logger_dir(
            os.path.join('train_log', 'shufflenet'))

        nr_tower = max(get_nr_gpu(), 1)
        config = get_config(model, nr_tower)
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_tower))
示例#36
0
    parser.add_argument('-n', '--num_units',
                        help='number of units in each stage',
                        type=int, default=18)
    parser.add_argument('--load', help='load model for training')
    args = parser.parse_args()
    NUM_UNITS = args.num_units

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    logger.auto_set_dir()

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=Model(n=NUM_UNITS),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'), ClassificationError('wrong_vector')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(1, 0.1), (82, 0.01), (123, 0.001), (300, 0.0002)])
        ],
        max_epoch=400,
        session_init=SaverRestore(args.load) if args.load else None
    )
    nr_gpu = max(get_nr_gpu(), 1)
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
示例#37
0
def get_batch_factor():
    """Return ``8 // nr_gpu`` for a supported GPU count (1, 2, 4 or 8)."""
    num_gpus = get_nr_gpu()
    assert num_gpus in (1, 2, 4, 8), num_gpus
    return 8 // num_gpus
示例#38
0
        stepnum = config.STEPS_PER_EPOCH

        # warmup is step based, lr is epoch based
        warmup_schedule = [(0, config.BASE_LR / 3), (config.WARMUP * factor, config.BASE_LR)]
        warmup_end_epoch = config.WARMUP * factor * 1. / stepnum
        lr_schedule = [(int(np.ceil(warmup_end_epoch)), warmup_schedule[-1][1])]
        for idx, steps in enumerate(config.LR_SCHEDULE[:-1]):
            mult = 0.1 ** (idx + 1)
            lr_schedule.append(
                (steps * factor // stepnum, config.BASE_LR * mult))

        cfg = TrainConfig(
            model=Model(),
            data=QueueInput(get_train_dataflow(add_mask=config.MODE_MASK)),
            callbacks=[
                ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
                # linear warmup
                ScheduledHyperParamSetter(
                    'learning_rate', warmup_schedule, interp='linear', step_based=True),
                ScheduledHyperParamSetter('learning_rate', lr_schedule),
                EvalCallback(),
                GPUUtilizationTracker(),
                EstimatedTimeLeft(),
            ],
            steps_per_epoch=stepnum,
            max_epoch=config.LR_SCHEDULE[2] * factor // stepnum,
            session_init=get_model_loader(args.load) if args.load else None,
        )
        trainer = SyncMultiGPUTrainerReplicated(get_nr_gpu())
        launch_train_with_config(cfg, trainer)
示例#39
0
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-top1-error'),
                ClassificationError('wrong-top5', 'val-top5-error')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(8, 0.03), (14, 0.02), (17, 5e-3),
                                       (19, 3e-3), (24, 1e-3), (26, 2e-4),
                                       (30, 5e-5)])
        ],
        model=Model(),
        steps_per_epoch=5000,
        max_epoch=80,
    )


if __name__ == '__main__':
    # CLI entry point: parse flags, build the training config, and launch.
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--data', help='ImageNet data root directory', required=True)
    args = parser.parse_args()

    # Must be set before any GPU initialization happens in get_config().
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    config = get_config()
    if args.load:
        # Warm-start / resume from the given checkpoint.
        config.session_init = SaverRestore(args.load)
    nr_tower = get_nr_gpu()
    # The script is hard-wired to exactly NR_GPU visible devices.
    assert nr_tower == NR_GPU
    launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
示例#40
0
            pred = outputs[k][0]
            cv2.imwrite("out{}.png".format(
                '-fused' if k == 5 else str(k + 1)), pred * 255)
    else:
        pred = outputs[5][0]
        cv2.imwrite(output, pred * 255)


if __name__ == '__main__':
    # CLI entry point: view the dataset, run inference on images,
    # or (default) train.
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--view', help='view dataset', action='store_true')
    parser.add_argument('--run', help='run model on images')
    parser.add_argument('--output', help='fused output filename. default to out-fused.png')
    args = parser.parse_args()
    # Must be set before any GPU initialization downstream.
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.view:
        view_data()
    elif args.run:
        run(args.load, args.run, args.output)
    else:
        # Default action: train on all visible GPUs (at least one tower).
        config = get_config()
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(
            config,
            SyncMultiGPUTrainer(max(get_nr_gpu(), 1)))
示例#41
0
                        type=str, default='NCHW')
    parser.add_argument('-d', '--depth', help='resnet depth',
                        type=int, default=18, choices=[18, 34, 50, 101, 152])
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--batch', default=256, type=int,
                        help='total batch size. 32 per GPU gives best accuracy, higher values should be similarly good')
    parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use', default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.data_format, args.mode)
    if args.eval:
        batch = 128    # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join('train_log', 'imagenet-{}-d{}'.format(args.mode, args.depth)))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
示例#42
0
            if cnt == 500:
                return


if __name__ == '__main__':
    # CLI entry point: CAM visualization (--cam) or multi-GPU training.
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--depth', type=int, default=18)
    parser.add_argument('--load', help='load model')
    parser.add_argument('--cam', action='store_true')
    args = parser.parse_args()

    # Module-level globals consumed elsewhere in this script.
    DEPTH = args.depth
    # Must be set before any GPU initialization downstream.
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    nr_gpu = get_nr_gpu()
    # NOTE(review): raises ZeroDivisionError when no GPU is visible —
    # confirm a CPU-only run is intentionally unsupported here.
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu

    if args.cam:
        # Class-activation-map visualization path; exits without training.
        BATCH_SIZE = 128    # something that can run on one gpu
        viz_cam(args.load, args.data)
        sys.exit()

    logger.auto_set_dir()
    config = get_config()
    if args.load:
        # Warm-start / resume from the given checkpoint.
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))