Example #1
def train(args, logdir):

    # model
    # model = Net1()

    # dataflow
    # df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True))

    train_conf = TrainConfig(
        # model=model,
        # data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        # max_epoch=hp.train1.num_epochs,
        # steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(
        logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
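
The snippet above is truncated: it builds train_conf but never starts training. A minimal sketch of the missing launch step, assuming the same tensorpack entry points used in Examples #2, #16, and #20 (launch_train_with_config and SyncMultiGPUTrainerParameterServer):

# Hypothetical continuation of train() -- not part of the original snippet.
# nr_tower falls back to 1 when no GPU list is given.
nr_tower = len(args.gpu.split(',')) if args.gpu else 1
launch_train_with_config(train_conf,
                         SyncMultiGPUTrainerParameterServer(nr_tower))
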
Example #2
File: train.py Project: EJOOSTEROP/dsf-cnn
    def run_once(self, opt, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(opt['train_batch_size'], mode='train')
        valid_datagen = self.get_datagen(opt['infer_batch_size'], mode='valid')

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        ######
        model_flags = opt['model_flags']
        model = self.get_model()(**model_flags)
        ######
        callbacks = [
            ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=None),
        ]

        for param_name, param_info in opt['manual_parameters'].items():
            model.add_manual_variable(param_name, param_info[0])
            callbacks.append(
                ScheduledHyperParamSetter(param_name, param_info[1]))
        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs,
                                        list(range(nr_gpus))))
        if self.model_mode == 'seg_gland':
            callbacks.append(MaxSaver('valid_dice_obj'))
        elif self.model_mode == 'seg_nuc':
            callbacks.append(MaxSaver('valid_dice_np'))
        else:
            callbacks.append(MaxSaver('valid_auc'))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        config = TrainConfig(
            model=model,
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=opt['nr_epochs'],
        )
        config.session_init = sess_init

        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        return
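
Note that nr_gpus is a free variable in run_once above (unlike Example #20, where it is a parameter); it is presumably a module-level value in the project. A plausible definition, assuming the GPU detection helper seen in Examples #3 and #6:

# Assumption: nr_gpus is module-level, derived as in Examples #3 and #6.
nr_gpus = max(get_nr_gpu(), 1)
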
Example #3
def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = PER_GPU_BATCH_SIZE

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    callbacks = [
        StepTimeCallback(),
        GPUUtilizationTracker(),
    ]

    if args.fake:
        dataset_train = FakeData([[batch, 224, 224, 3], [batch]],
                                 1000,
                                 random=False,
                                 dtype=['uint8', 'int32'])

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)

    num_gpus = get_nr_gpu()
    return TrainConfig(
        model=model,
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281 // (PER_GPU_BATCH_SIZE * num_gpus),
        max_epoch=3,
    )
Example #4
def get_config(X_train, X_valid, y_train, y_valid, model_path=None):
    data_train, data_valid = get_data(X_train, X_valid, y_train, y_valid)

    steps_per_epoch = data_train.size()

    cur_loss = 'tot_loss'
    triggerk = 15
    visualk = 5
    config = TrainConfig(
        model=Model(),
        dataflow=data_train,
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=triggerk),
            PeriodicTrigger(MinSaver('validation_' + cur_loss),
                            every_k_epochs=triggerk),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 2e-4), (150, 1e-4), (300, 5e-5),
                                       (600, 1e-5), (800, 1e-6)],
                                      interp='linear'),
            PeriodicTrigger(VisualizeRunner(), every_k_epochs=visualk),
            PeriodicTrigger(InferenceRunner(data_valid,
                                            [ScalarStats(cur_loss)]),
                            every_k_epochs=5)
        ],
        session_init=SaverRestore(model_path) if model_path is not None else None,
        # session_config=session_config,
        steps_per_epoch=steps_per_epoch,
        max_epoch=2000)
    return config
Example #5
def get_config(args):
    if args.gpu is not None:
        NR_GPU = len(args.gpu.split(','))
        batch_size = int(args.batch_size) // NR_GPU
    else:
        batch_size = int(args.batch_size)

    ds_train = get_data('train', batch_size)
    ds_test = get_data('test', batch_size)

    callbacks = [
        ModelSaver(),
        # ScheduledHyperParamSetter('learning_rate',
        #                           [(0, 1e-4), (3, 2e-4), (6, 3e-4), (10, 6e-4), (15, 1e-3), (60, 1e-4), (90, 1e-5)]),
        HumanHyperParamSetter('learning_rate')
    ]
    config = edict()
    config.stage = 1

    return TrainConfig(
        dataflow=ds_train,
        callbacks=callbacks,
        model=Model(stage=1),
        max_epoch=cfg.me_max_iteration * batch_size - ds_train.size()
    )
Example #6
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower
    )
Example #7
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )
    if fake:
        data = QueueInput(
            FakeData([[batch, 224, 224, 3], [batch]],
                     1000,
                     random=False,
                     dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, min(START_LR, BASE_LR)),
                                       (30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (90, BASE_LR * 1e-3),
                                       (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, START_LR),
                                                            (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
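
The warmup logic in Example #7 follows the linear scaling rule: BASE_LR = START_LR * (args.batch / 256), and when the scaled rate exceeds START_LR a second, linearly interpolated schedule ramps the rate up to it over the first 5 epochs. A quick worked check of the arithmetic, using a hypothetical --batch 512:

# Hypothetical numbers for illustration only.
START_LR = 0.1
batch = 512
BASE_LR = START_LR * (batch / 256.0)   # 0.2
assert BASE_LR > START_LR              # so the warmup schedule is appended:
# learning_rate rises linearly from 0.1 (epoch 0) to 0.2 (epoch 5),
# then decays by 10x at epochs 30, 60, 90, and 100.
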
Example #8
def get_config(fake=False, data_format='NCHW'):
    nr_tower = max(get_nr_gpu(), 1)
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = dataset_val = FakeData([[64, 224, 224, 3], [64]],
                                               1000,
                                               random=False,
                                               dtype='uint8')
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, BATCH_SIZE))
        dataset_train = get_data('train')
        dataset_val = get_data('val')

    return TrainConfig(
        model=Model(data_format=data_format),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')
            ]),
            ScheduledHyperParamSetter('learning_rate', [(30, 1e-2), (60, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower)
Example #9
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
        steps_per_epoch = 100
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))

        dataset_train = get_imagenet_dataflow(args.data, 'train', batch)
        dataset_val = get_imagenet_dataflow(args.data, 'val', min(64, batch))
        steps_per_epoch = 1281167 // args.batch

        BASE_LR = 0.1 * args.batch / 256.0
        logger.info("BASELR: {}".format(BASE_LR))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            GPUUtilizationTracker(),
            ScheduledHyperParamSetter('learning_rate', [(0, BASE_LR),
                                                        (30, BASE_LR * 1e-1),
                                                        (60, BASE_LR * 1e-2),
                                                        (90, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, 0.1),
                                           (5 * steps_per_epoch, BASE_LR)],
                                          interp='linear',
                                          step_based=True))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,
    )
Example #10
File: run.py Project: hewr1993/nn_expr
def get_config():
    # prepare dataset
    step_per_epoch = 1024
    dataset_train = get_data('train')
    dataset_test = get_data('test', 128)

    sess_config = get_default_sess_config(0.5)

    nr_gpu = get_nr_gpu()
    lr = tf.train.exponential_decay(
        learning_rate=1e-5,
        global_step=get_global_step_var(),
        decay_steps=step_per_epoch * 10,
        decay_rate=0.8,
        staircase=True, name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=dataset_train,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-5),
        callbacks=Callbacks([
            StatPrinter(),
            ModelSaver(),
            InferenceRunner(dataset_test, ScalarStats('combined_cost')),
        ]),
        session_config=sess_config,
        model=Model(),
        step_per_epoch=step_per_epoch,
        max_epoch=1000,
    )
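
Examples #10, #11, #14, and #22 target an older tensorpack/TensorFlow API surface (dataset=, step_per_epoch=, optimizer= on the config, Callbacks([...]), StatPrinter, tf.scalar_summary) than the model=/dataflow=/steps_per_epoch= style used in the other examples; the two styles are not interchangeable across versions. A rough mapping, offered as an assumption about version differences rather than an official migration guide:

# Legacy style (this example):
#   TrainConfig(dataset=df, step_per_epoch=n, callbacks=Callbacks([...]), ...)
# Newer style (e.g. Examples #6 and #12):
#   TrainConfig(dataflow=df, steps_per_epoch=n, callbacks=[...], ...)
# Likewise, tf.scalar_summary(...) became tf.summary.scalar(...) in TF 1.0.
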
Example #11
def get_config():
    # prepare dataset
    dataset_train = get_data('train')
    step_per_epoch = dataset_train.size()
    dataset_test = get_data('test')

    sess_config = get_default_sess_config(0.5)

    nr_gpu = get_nr_gpu()
    lr = tf.train.exponential_decay(learning_rate=1e-2,
                                    global_step=get_global_step_var(),
                                    decay_steps=step_per_epoch *
                                    (30 if nr_gpu == 1 else 20),
                                    decay_rate=0.5,
                                    staircase=True,
                                    name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=dataset_train,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
        callbacks=Callbacks([
            StatPrinter(),
            ModelSaver(),
            InferenceRunner(dataset_test, ClassificationError())
        ]),
        session_config=sess_config,
        model=Model(),
        step_per_epoch=step_per_epoch,
        max_epoch=300,
    )
Example #12
def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(0, 3e-1), (30, 3e-2),
                                                    (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_tower))))

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=100,
                       nr_tower=nr_tower)
Example #13
def get_config(args):
    if args.gpu is not None:
        NR_GPU = len(args.gpu.split(','))
        batch_size = int(args.batch_size) // NR_GPU
    else:
        batch_size = int(args.batch_size)

    ds_train = get_data('train', batch_size)
    ds_test = get_data('test', batch_size)

    callbacks = [
        ModelSaver(),
        HumanHyperParamSetter('learning_rate')
    ]
    if args.stage == 1:
        max_epoch = cfg.me_max_iteration * batch_size - ds_train.size()
    elif args.stage == 2:
        max_epoch = cfg.spmc_max_iteration * batch_size - ds_train.size()
    else:
        max_epoch = 99999

    return TrainConfig(
        dataflow=ds_train,
        callbacks=callbacks,
        model=Model(stage=args.stage),
        max_epoch=max_epoch
    )
Example #14
def get_config():
    # prepare dataset
    dataset_train = get_data('train')
    step_per_epoch = dataset_train.size()
    dataset_test = get_data('test')

    sess_config = get_default_sess_config(0.9)

    lr = tf.Variable(0.01, trainable=False, name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=dataset_train,
        optimizer=tf.train.MomentumOptimizer(lr, 0.9),
        callbacks=Callbacks([
            StatPrinter(),
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'),
                             ClassificationError()]),
            ScheduledHyperParamSetter('learning_rate', [(1, 0.1), (82, 0.01),
                                                        (123, 0.001),
                                                        (300, 0.0002)])
        ]),
        session_config=sess_config,
        model=Model(n=5),
        step_per_epoch=step_per_epoch,
        max_epoch=500,
    )
Example #15
def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        ScheduledHyperParamSetter('learning_rate', [(0, 0.1), (3, BASE_LR)],
                                  interp='linear'),
        ScheduledHyperParamSetter('learning_rate', [(30, BASE_LR * 1e-1),
                                                    (60, BASE_LR * 1e-2),
                                                    (80, BASE_LR * 1e-3)]),
        PeriodicTrigger(DataParallelInferenceRunner(dataset_val, infs,
                                                    list(range(nr_tower))),
                        every_k_epochs=1),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    return TrainConfig(
        model=model,
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // TOTAL_BATCH_SIZE,
        max_epoch=100,
    )
Example #16
    def run_once(self, opt, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(opt["train_batch_size"], mode="train")
        valid_datagen = self.get_datagen(opt["infer_batch_size"], mode="valid")

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        ######
        model_flags = opt["model_flags"]
        model = self.get_model()(**model_flags)
        ######
        callbacks = [
            # ModelSaver(max_to_keep=20), # TODO dynamic this
            ModelSaver(max_to_keep=opt["nr_epochs"]),
            # InjectShell(file='/tools/hover_net/src/config.yml', shell='ipython'),
        ]

        for param_name, param_info in opt["manual_parameters"].items():
            model.add_manual_variable(param_name, param_info[0])
            callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))
        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus)))
        )
        callbacks.append(MaxSaver("valid_dice"))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        config = TrainConfig(
            model=model,
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=opt["nr_epochs"],
        )
        config.session_init = sess_init

        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        # TODO: save
        return
Example #17
def get_config(model, fake=False):
    start_ = 0
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            # lr_setting would be unbound past this point; fail loudly here
            raise ValueError('no learning rate setting for start_ = {}'.format(start_))

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0', 'tower1/group3/block2/conv2/Abs_1:0', 'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
Example #18
    def pred_config(self, args, df, callbacks) -> TrainConfig:
        return TrainConfig(
            model=self.train_model(args),
            data=StagingInput(QueueInput(df)),
            callbacks=callbacks,
            max_epoch=args.epochs,
            steps_per_epoch=args.steps,
            session_init=SaverRestore(args.load) if args.load else None,
        )
Example #19
def get_config(model):
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    callbacks = [ThroughputTracker(args.batch)]
    if args.fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
    else:
        data = QueueInput(
            get_imagenet_dataflow(args.data, 'train', batch),
            # use a larger queue
            queue=tf.FIFOQueue(200, [tf.uint8, tf.int32], [[batch, 224, 224, 3], [batch]])
        )

        BASE_LR = 30
        SCALED_LR = BASE_LR * (args.batch / 256.0)
        callbacks.extend([
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, SCALED_LR),
                    (60, SCALED_LR * 1e-1),
                    (70, SCALED_LR * 1e-2),
                    (80, SCALED_LR * 1e-3),
                    (90, SCALED_LR * 1e-4),
                ]),
        ])

        dataset_val = get_imagenet_dataflow(args.data, 'val', 64)
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    if args.load.endswith(".npz"):
        # a released model in npz format
        init = SmartInit(args.load)
    else:
        # a pre-trained checkpoint
        init = SaverRestore(args.load, ignore=("learning_rate", "global_step"))
    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        session_init=init,
        max_epoch=100,
    )
Example #20
File: train.py Project: haelkemary/xy_net
    def run_once(self, nr_gpus, freeze, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(mode='train')
        valid_datagen = self.get_datagen(mode='valid')

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        callbacks = [
            ModelSaver(max_to_keep=200),
            ScheduledHyperParamSetter('learning_rate', self.lr_sched),
        ]
        ######

        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs,
                                        list(range(nr_gpus))))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        MODEL_MAKER = Model_NP_XY if self.model_mode == 'np+xy' else Model_NP_DIST

        config = TrainConfig(
            model=MODEL_MAKER(freeze),
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=self.nr_epochs,
        )
        config.session_init = sess_init

        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        return
Example #21
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (85, BASE_LR * 1e-3),
                                       (95, BASE_LR * 1e-4),
                                       (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, 0.1),
                                                            (3, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )
Example #22
def get_config():
    #anchors = np.mgrid[0:4,0:4][:,1:,1:].transpose(1,2,0).reshape((-1,2)) / 4.0
    # prepare dataset
    d1 = dataset.SVHNDigit('train')
    d2 = dataset.SVHNDigit('extra')
    train = RandomMixData([d1, d2])
    test = dataset.SVHNDigit('test')

    augmentors = [
        imgaug.Resize((40, 40)),
        imgaug.BrightnessAdd(30),
        imgaug.Contrast((0.5, 1.5)),
        imgaug.GaussianDeform(  # this is slow
            [(0.2, 0.2), (0.2, 0.8), (0.8, 0.8), (0.8, 0.2)], (40, 40), 0.2,
            3),
    ]
    train = AugmentImageComponent(train, augmentors)
    train = BatchData(train, 128)
    nr_proc = 5
    train = PrefetchData(train, 5, nr_proc)
    step_per_epoch = train.size()

    augmentors = [
        imgaug.Resize((40, 40)),
    ]
    test = AugmentImageComponent(test, augmentors)
    test = BatchData(test, 128, remainder=True)

    sess_config = get_default_sess_config(0.8)

    lr = tf.train.exponential_decay(learning_rate=1e-3,
                                    global_step=get_global_step_var(),
                                    decay_steps=train.size() * 60,
                                    decay_rate=0.2,
                                    staircase=True,
                                    name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=train,
        optimizer=tf.train.AdamOptimizer(lr),
        callbacks=Callbacks([
            StatPrinter(),
            ModelSaver(),
            InferenceRunner(test,
                            [ScalarStats('cost'),
                             ClassificationError()])
        ]),
        session_config=sess_config,
        model=Model(),
        step_per_epoch=step_per_epoch,
        max_epoch=350,
    )
Example #23
File: resnext.py Project: mfkiwl/camuy
def get_config(model):
    batch = 1

    logger.info("For benchmark, batch size is fixed to 1 per tower.")
    data = QueueInput(
        FakeData([[1, 224, 224, 3], [1]], 1, random=False, dtype='uint8'))

    return TrainConfig(model=model,
                       data=data,
                       callbacks=[],
                       steps_per_epoch=1,
                       max_epoch=1)
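
Example #23 is a pure benchmark configuration: one fake sample, one step, one epoch. A sketch of how such a config is typically consumed, assuming the launch helper from Examples #2 and #16 and tensorpack's basic SimpleTrainer:

# Assumption: model is an instance of the project's ModelDesc subclass.
launch_train_with_config(get_config(model), SimpleTrainer())
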
Example #24
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4),
                                       (60, 5e-5), (80, 2.5e-5),
                                       (100, 1.25e-5), (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Finetune COCO
        # [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        # JT COCO
        # [(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5), (80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        # Finetune to VOC
        # [(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60, 1.25e-5), (80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]

        #infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #        ClassificationError('wrong-top5', 'val-error-top5')]
        infs = [
            ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
            ClassificationError('loss-wrong-top5', 'loss-val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=1522,
                       max_epoch=140,
                       nr_tower=nr_tower)
Example #25
def get_config(model):
    input_sig = model.get_input_signature()
    nr_tower = max(hvd.size(), 1)
    batch = args.batch // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))

    callbacks = [ThroughputTracker(args.batch), UpdateMomentumEncoder()]

    if args.fake:
        data = QueueInput(
            FakeData([x.shape for x in input_sig],
                     1000,
                     random=False,
                     dtype='uint8'))
    else:
        zmq_addr = 'ipc://@imagenet-train-b{}'.format(batch)
        data = ZMQInput(zmq_addr, 25, bind=False)

        dataset = data.to_dataset(input_sig).repeat().prefetch(15)
        dataset = dataset.apply(
            tf.data.experimental.prefetch_to_device('/gpu:0'))
        data = TFDatasetInput(dataset)

        callbacks.extend([
            ModelSaver(),
            EstimatedTimeLeft(),
        ])

        if not args.v2:
            # step-wise LR in v1
            SCALED_LR = BASE_LR * (args.batch / 256.0)
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, min(BASE_LR, SCALED_LR)),
                                           (120, SCALED_LR * 1e-1),
                                           (160, SCALED_LR * 1e-2)]))
            if SCALED_LR > BASE_LR:
                callbacks.append(
                    ScheduledHyperParamSetter('learning_rate',
                                              [(0, BASE_LR), (5, SCALED_LR)],
                                              interp='linear'))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=200,
    )
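
In Example #25, hvd (Horovod), BASE_LR, and args are free variables resolved at module level, and ZMQInput/TFDatasetInput stream training data in from an external process. A plausible sketch of the assumed module-level setup, not taken from the original project:

# Assumptions: Horovod initialization plus a module-level base learning rate.
import horovod.tensorflow as hvd
hvd.init()
BASE_LR = 0.03  # hypothetical value; the real constant is defined elsewhere
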
Example #26
def get_config(model, fake=False, xla=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(10, 1e-2), (20, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    # Build the session config at function scope (so it also exists when
    # fake=True) and pass it to TrainConfig so the xla flag takes effect.
    config = tf.ConfigProto()
    jit_level = 0
    if xla:
        # Turns on XLA JIT compilation
        jit_level = tf.OptimizerOptions.ON_1
    config.graph_options.optimizer_options.global_jit_level = jit_level

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=10,
                       max_epoch=1,
                       session_config=config,
                       nr_tower=nr_tower)
Example #27
def get_config(args, model, num_gpus, num_towers):
    """
    Create the TensorPack Trainer configuration.

    :param args: The cli arguments.
    :param model: The model object to train.
    :param num_gpus: The number of gpus on which to train
    :param num_towers: The number of data parallel towers to create

    :return: A TrainConfig object.
    """
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, args.batch_size))

    df_train = avatar_synth_df(args.train_dir, args.batch_size,
                               args.num_threads)
    df_test = avatar_synth_df(args.test_dir, args.batch_size, args.num_threads)

    def update_lr(epoch, cur_lr):
        """ Approximate exponential decay of the learning rate """
        if args.resume_lr:
            return cur_lr * args.lr_decay
        else:
            return args.lr * args.lr_decay**epoch

    callbacks = [
        cb.ModelSaver(),
        cb.MinSaver('val-error-top1'),
        cb.HyperParamSetterWithFunc('tower0/Avatar_Synth/LR:0', update_lr),
        cb.MergeAllSummaries(period=args.summary_freq),
    ]
    infs = [cb.ScalarStats('Avatar_Synth/Cost')]

    if num_gpus > 0:
        callbacks.append(cb.GPUUtilizationTracker())

    if num_towers == 1:  # single-GPU inference with queue prefetch
        callbacks.append(cb.InferenceRunner(QueueInput(df_test), infs))
    else:  # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            cb.DataParallelInferenceRunner(df_test, infs,
                                           list(range(num_towers))))

    return TrainConfig(model=model,
                       dataflow=df_train,
                       callbacks=callbacks,
                       max_epoch=args.epochs,
                       nr_tower=num_towers)
Example #28
def get_config(model, checkpoint_dir, target_shape, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, target_shape, target_shape, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, target_shape)
        dataset_val = get_data('val', batch, target_shape)
        callbacks = [
            ModelSaver(checkpoint_dir=checkpoint_dir),
            ScheduledHyperParamSetter('learning_rate', [(30, 1e-2), (60, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))


    # ~7.5 it/sec during testing
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 300,  # 5000
        max_epoch=110,
        nr_tower=nr_tower)
Example #29
def get_config(model):
    nr_tower = get_nr_gpu()

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, args.batch_size_per_gpu))
    dataset_train = get_data('train', args.batch_size_per_gpu)
    dataset_val = get_data('val', args.batch_size_per_gpu)

    BASE_LR = 1e-3 * (args.batch_size_per_gpu * nr_tower / 256.0)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(0, BASE_LR),
                                                    (60, BASE_LR * 1e-1),
                                                    (90, BASE_LR * 1e-2)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    # if BASE_LR > 0.1:
    #     callbacks.append(
    #         ScheduledHyperParamSetter(
    #             'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1280000 // (args.batch_size_per_gpu * nr_tower),
        max_epoch=110,
    )
Example #30
def get_config(lmdb_path, txt_path, nr_gpu):

    df = ResNet.get_data(lmdb_path, txt_path)

    return TrainConfig(
        model=ResNet(),
        dataflow=df,
        callbacks=[ModelSaver()],
        extra_callbacks=[
            MovingAverageSummary(),
            ProgressBar(["EMA/cost", "EMA/loss/pos-dist",
                         "EMA/loss/neg-dist"]),
            MergeAllSummaries(),
            RunUpdateOps()
        ],
        nr_tower=nr_gpu,
        steps_per_epoch=1800,
        max_epoch=30,
        session_config=tf.ConfigProto(allow_soft_placement=True))
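
Taken together, the examples share one pattern: build a dataflow, assemble callbacks, wrap everything in a TrainConfig, and hand it to a trainer. A minimal end-to-end sketch of that skeleton, assuming placeholder get_data() and Model() implementations like those referenced throughout:

# Skeleton only -- get_data() and Model() stand in for the project-specific
# dataflow and ModelDesc used in the examples above.
df = get_data('train', 32)
config = TrainConfig(
    model=Model(),
    dataflow=df,
    callbacks=[ModelSaver()],
    steps_per_epoch=df.size(),
    max_epoch=10,
)
launch_train_with_config(config, SimpleTrainer())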