Example #1
def train_vqvae(params, dataset, checkpoint_dir):
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'],
        trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,  # `recover` is assumed to be defined in the enclosing scope (e.g. a resume flag)
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            MaxSaver(monitor_stat='validation_loss'),  # note: for a loss statistic, MinSaver is usually intended
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, SimpleTrainer())
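
For context, here is a minimal sketch of the configuration this function appears to expect. The keys are exactly the ones read above, the values are placeholders, and the to_file() call suggests `params` is a config wrapper object rather than the plain dict shown here.

# Hypothetical params structure matching the accesses in train_vqvae above.
params = {
    'dataset': {},                    # read above but not used further in the visible snippet
    'model': {
        'image_shape': (32, 32, 3),   # placeholder shape consumed by BaseVQVAE
    },
    'trainer': {
        'batch_size': 64,
        'num_parallel': 4,
        'steps_per_epoch': 1000,
        'max_epochs': 100,
    },
}
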
Example #2
    def train(self, args):
        self.args = args
        # Make sure the save path exists
        if not os.path.exists(self.args.save):
            os.makedirs(self.args.save)

        with change_gpu(self.args.gpu):
            train_df = self._dataflow()
            trainer = (SimpleTrainer() if get_num_gpu() <= 1 else
                       SyncMultiGPUTrainerReplicated(get_num_gpu()))
            print("Found %d gpus. Using trainer:" % get_num_gpu(), trainer)
            # Setup callbacks
            self._default_callbacks()
            try:
                launch_train_with_config(
                    self.pred_config(self.args, train_df, self.callbacks),
                    trainer)
            except Exception as error:
                traceback.print_exc()
            else:
                # If everything worked, save a compact model
                self.export(os.path.join(self.args.save, "compact.pb"))
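
Only args.save and args.gpu are touched by this method, so a hypothetical command-line setup that satisfies it could look like the sketch below; everything beyond those two attribute names is an assumption.

# Hypothetical argparse setup covering the attributes read by train() above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--save', default='train_log/run',
                    help='directory for checkpoints and the exported compact.pb')
parser.add_argument('--gpu', default='0',
                    help='device string handed to change_gpu()')
args = parser.parse_args()
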
Example #3
    # (snippet starts mid-function: `M` is a Keras Sequential model being built layer by layer)
    M.add(KL.Conv2D(32, 3, activation='relu', padding='same'))
    M.add(KL.Conv2D(32, 3, activation='relu', padding='same'))
    M.add(KL.MaxPooling2D())
    M.add(KL.Conv2D(32, 3, padding='same', activation='relu'))
    M.add(KL.Flatten())
    M.add(
        KL.Dense(512,
                 activation='relu',
                 kernel_regularizer=regularizers.l2(1e-5)))
    M.add(KL.Dropout(0.5))
    M.add(
        KL.Dense(10, activation=None,
                 kernel_regularizer=regularizers.l2(1e-5)))
    M.add(KL.Activation('softmax'))

    trainer = SimpleTrainer()

    setup_keras_trainer(trainer,
                        model=M,
                        input=QueueInput(dataset_train),
                        optimizer=tf.train.AdamOptimizer(1e-3),
                        loss='categorical_crossentropy',
                        metrics=['accuracy'])
    trainer.train_with_defaults(
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats(['total_loss', 'accuracy'])]),
        ],
        steps_per_epoch=dataset_train.size(),
    )
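
The snippet leaves dataset_train and dataset_test undefined. It mirrors tensorpack's Keras/MNIST example, so an MNIST dataflow is a reasonable, though assumed, way to fill that gap:

from tensorpack.dataflow import BatchData, dataset

# Assumed input pipeline: MNIST dataflows yielding [image, label] batches.
dataset_train = BatchData(dataset.Mnist('train'), 128)
dataset_test = BatchData(dataset.Mnist('test'), 256, remainder=True)
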
Example #4
        # (snippet starts mid-helper: walking `keys` into a nested config dict to set `value`)
            temp = temp[keys[i]]
        temp[keys[-1]] = value

    # set GPU device(s)
    if config['gpu'] in [None, 'None', '']:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        num_gpu = 0
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu']
        num_gpu = max(get_num_gpu(), 1)
    config['num_gpu'] = num_gpu

    # set log directory
    if config['logdir'] in [None, 'None', '']:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir('train_log/' + config['logdir'], action='d')
    # save configuration
    with open(logger.get_logger_dir() + '/config.json', 'w') as outfile:
        json.dump(config, outfile)

    # get train config
    train_config = get_train_config(config)

    # train the model
    if num_gpu > 1:
        launch_train_with_config(train_config,
                                 SyncMultiGPUTrainerReplicated(num_gpu))
    else:
        launch_train_with_config(train_config, SimpleTrainer())
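
Only two configuration keys are consulted by this snippet, so a minimal, hypothetical config would be the one below; get_train_config() presumably reads additional keys that are not visible here.

# Hypothetical minimal config covering the keys used above.
config = {
    'gpu': '0,1',        # comma-separated device ids, or None/'' to train on CPU
    'logdir': 'my_run',  # subdirectory of train_log/, or None to auto-pick a directory
}
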
Example #5
def train(args, cfg):
    out_dirs = gen_outdirs(args, "tp")
    output_dir, out_res_dir = out_dirs["output_dir"], out_dirs["out_res_dir"]
    df = PneuSegDF(args.mode, out_res_dir, args.train_dir, args.testset_dir,
                   args.min_num_workers, cfg)
    num_gpu = max(get_num_gpu(), 1)
    ds = df.prepared(num_gpu, cfg.batch_size)

    # Avoid overwriting the config file
    if os.path.exists(pj(output_dir, os.path.basename(args.config))):
        input(
            "Config file will NOT be overwritten. Press Enter to continue...")
    else:
        shutil.copy(args.config, output_dir)
    logger.set_logger_dir(pj(output_dir, "log"))
    callback_list = [
        # PeriodicCallback overrides the trigger frequency of the wrapped callback
        PeriodicCallback(ModelSaver(50, checkpoint_dir=output_dir),
                         every_k_epochs=1),
        GPUUtilizationTracker(),
        MergeAllSummaries(1 if args.train_debug else 0),
        # ProgressBar(["Loss"])
    ]
    if cfg.network["norm_layer"] == "BN_layers":
        callback_list.append(BN_layers_update())
    if cfg.lr_schedule["type"] == "epoch_wise_constant":
        schedule = [(ep, lr / num_gpu) for ep, lr in zip(
            [0] + cfg.lr_schedule["epoch_to_drop_lr"], cfg.lr_schedule["lr"])]
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    elif cfg.lr_schedule["type"] == "halved":
        schedule = [(0, cfg.lr_schedule["init_lr"])]
        for i in range(cfg.lr_schedule["first_epoch2drop"], cfg.max_epoch,
                       cfg.lr_schedule["period"]):
            schedule.append(
                (i, schedule[int((i - cfg.lr_schedule["first_epoch2drop"]) /
                                 cfg.lr_schedule["period"])][1] /
                 (cfg.lr_schedule["decay_rate"] * num_gpu)))
        print(schedule)
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    steps_per_epoch = len(ds) // num_gpu + 1
    train_cfg = TrainConfig(
        model=Tensorpack_model(cfg, steps_per_epoch),
        data=QueueInput(ds),
        steps_per_epoch=steps_per_epoch,
        callbacks=callback_list,
        monitors=[
            # ScalarPrinter(True, whitelist=["Loss", "LR"]),
            ScalarPrinter(True),
            # ScalarPrinter(),
            TFEventWriter(),
            # JSONWriter()
        ],
        max_epoch=cfg.max_epoch,
        session_init=SmartInit(args.resume),
        starting_epoch=args.resume_epoch)
    launch_train_with_config(
        train_cfg,
        SyncMultiGPUTrainerReplicated(num_gpu)
        if num_gpu > 1 else SimpleTrainer())
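
The "halved" branch divides the learning rate of an earlier schedule entry by decay_rate * num_gpu once every `period` epochs after first_epoch2drop. A short worked example with hypothetical values shows the schedule it produces:

# Hypothetical values: init_lr=0.01, first_epoch2drop=50, period=20,
# decay_rate=2, num_gpu=1, max_epoch=100.
schedule = [(0, 0.01)]
for i in range(50, 100, 20):
    schedule.append((i, schedule[(i - 50) // 20][1] / (2 * 1)))
print(schedule)  # [(0, 0.01), (50, 0.005), (70, 0.0025), (90, 0.00125)]
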
Example #6
def critic_train(ctrl,
                 data,
                 log_dir,
                 model_dir,
                 prev_dir,
                 vs_name,
                 split_train_val=False):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    lr_schedule = []
    max_epoch = ctrl.critic_train_epoch
    lr = ctrl.critic_init_lr
    # decay the learning rate by 10% every epoch
    for epoch in range(max_epoch):
        lr_schedule.append((epoch + 1, lr))
        lr *= 0.9
    ds_size = len(data[0])
    idxs = list(range(ds_size))
    np.random.shuffle(idxs)

    if split_train_val:
        train_size = ds_size * 9 // 10
        if train_size == 0:
            train_size = ds_size
        val_start = train_size
    else:
        train_size = ds_size
        val_start = ds_size * 9 // 10
    if ds_size - val_start == 0:
        val_start = 0

    data_train = [[col[k] for k in idxs[:train_size]] for col in data]
    data_val = [[col[k] for k in idxs[val_start:]] for col in data]

    model = critic_factory(ctrl, is_train=True, vs_name=vs_name)
    ds_train = critic_dataflow_factory(ctrl, data_train, is_train=True)
    ds_val = critic_dataflow_factory(ctrl, data_val, is_train=False)
    session_config = None
    device = 0
    if ctrl.critic_type == CriticTypes.LSTM:
        session_config = tf.ConfigProto(device_count={'GPU': 0})
        device = -1
    extra_callbacks = DEFAULT_CALLBACKS()
    extra_callbacks = list(
        filter(lambda x: not isinstance(x, ProgressBar), extra_callbacks))
    logger.info("Extra callbacks are {}".format(
        list(map(lambda x: x.__class__, extra_callbacks))))
    # Put this into callbacks for in-training validation/inferencing
    inference_callback = InferenceRunner(
        ds_val, [ScalarStats('{}/cost'.format(vs_name))], device=device)
    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(checkpoint_dir=model_dir,
                       max_to_keep=1,
                       keep_checkpoint_every_n_hours=100),
            ScheduledHyperParamSetter('learning_rate', lr_schedule)
        ],
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  #, TFEventWriter()],
        steps_per_epoch=ds_train.size(),
        max_epoch=max_epoch,
        session_config=session_config)
    ckpt = tf.train.latest_checkpoint(prev_dir if prev_dir else model_dir)
    if ckpt:
        config.session_init = SaverRestore(ckpt)
    launch_train_with_config(config, SimpleTrainer())
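
For reference, the schedule built at the top of this example lowers the learning rate by 10% per epoch, starting from epoch 1. With a hypothetical critic_init_lr of 1e-3 and four epochs it evaluates to:

# Reproduces the lr_schedule loop above with assumed values
# (critic_init_lr = 1e-3, critic_train_epoch = 4).
lr, lr_schedule = 1e-3, []
for epoch in range(4):
    lr_schedule.append((epoch + 1, lr))
    lr *= 0.9
print(lr_schedule)  # [(1, 0.001), (2, 0.0009), (3, 0.00081), (4, 0.000729)]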