def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'], trainer_params['num_parallel'])

    # Persist the configuration next to the checkpoints for reproducibility.
    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            # Keep the checkpoint with the lowest validation loss.
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs'])
    launch_train_with_config(trainer_config, SimpleTrainer())
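
# Hypothetical invocation sketch for train_vqvae(); `Params.from_file`
# (the counterpart of the to_file() call above), the config path, the
# dataset name, and the checkpoint directory are all assumptions.
if __name__ == '__main__':
    params = Params.from_file('configs/vqvae.json')
    train_vqvae(params, dataset='mnist', checkpoint_dir='train_log/vqvae')
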
def train(self, args):
    self.args = args
    # Make sure the save path exists
    if not os.path.exists(self.args.save):
        os.makedirs(self.args.save)

    with change_gpu(self.args.gpu):
        train_df = self._dataflow()
        # Use a single-GPU trainer unless more than one GPU is available.
        trainer = (SimpleTrainer() if get_num_gpu() <= 1
                   else SyncMultiGPUTrainerReplicated(get_num_gpu()))
        print("Found %d GPU(s). Using trainer:" % get_num_gpu(), trainer)

        # Set up callbacks
        self._default_callbacks()
        try:
            launch_train_with_config(
                self.pred_config(self.args, train_df, self.callbacks),
                trainer)
        except Exception:
            traceback.print_exc()
        else:
            # If everything worked, save a compact model
            self.export(os.path.join(self.args.save, "compact.pb"))
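
# Sketch of how a compact .pb export like the one above is commonly done
# with tensorpack's ModelExporter; the original `export` body is not shown,
# so this helper (and the `pred_conf` PredictConfig argument) is an assumption.
from tensorpack.tfutils.export import ModelExporter

def export_compact_model(pred_conf, filename="compact.pb"):
    # Freeze variables and prune the graph down to the inference subgraph.
    ModelExporter(pred_conf).export_compact(filename)
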
M.add(KL.Conv2D(32, 3, activation='relu', padding='same'))
M.add(KL.Conv2D(32, 3, activation='relu', padding='same'))
M.add(KL.MaxPooling2D())
M.add(KL.Conv2D(32, 3, padding='same', activation='relu'))
M.add(KL.Flatten())
M.add(KL.Dense(512, activation='relu',
               kernel_regularizer=regularizers.l2(1e-5)))
M.add(KL.Dropout(0.5))
M.add(KL.Dense(10, activation=None,
               kernel_regularizer=regularizers.l2(1e-5)))
M.add(KL.Activation('softmax'))

trainer = SimpleTrainer()
setup_keras_trainer(
    trainer, model=M,
    input=QueueInput(dataset_train),
    optimizer=tf.train.AdamOptimizer(1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy'])
trainer.train_with_defaults(
    callbacks=[
        ModelSaver(),
        InferenceRunner(
            dataset_test,
            [ScalarStats(['total_loss', 'accuracy'])]),
    ],
    steps_per_epoch=dataset_train.size(),
)
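
# The dataflows assumed above must yield one-hot labels to match
# 'categorical_crossentropy'. A minimal sketch, mirroring tensorpack's
# Keras MNIST example (the dataset and batch sizes are assumptions):
import numpy as np
from tensorpack.dataflow import BatchData, MapData, dataset

def to_one_hot(dp):
    # dp is [image, label]; replace the integer label with a one-hot vector.
    return [dp[0], np.eye(10)[dp[1]]]

dataset_train = BatchData(MapData(dataset.Mnist('train'), to_one_hot), 128)
dataset_test = BatchData(MapData(dataset.Mnist('test'), to_one_hot), 256)
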
# Apply a nested override: walk `keys` down the config dict, then set
# the value at the final key.
temp = config
for i in range(len(keys) - 1):
    temp = temp[keys[i]]
temp[keys[-1]] = value

# select GPUs
if config['gpu'] in [None, 'None', '']:
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    num_gpu = 0
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu']
    num_gpu = max(get_num_gpu(), 1)
config['num_gpu'] = num_gpu

# set log directory
if config['logdir'] in [None, 'None', '']:
    logger.auto_set_dir()
else:
    logger.set_logger_dir('train_log/' + config['logdir'], action='d')

# save configuration
with open(os.path.join(logger.get_logger_dir(), 'config.json'), 'w') as outfile:
    json.dump(config, outfile)

# get train config
train_config = get_train_config(config)

# train the model
if num_gpu > 1:
    launch_train_with_config(train_config,
                             SyncMultiGPUTrainerReplicated(num_gpu))
else:
    launch_train_with_config(train_config, SimpleTrainer())
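
# Self-contained version of the nested-key update above (the helper name
# and the dotted-key format are assumptions, shown for illustration):
def set_nested(config, dotted_key, value):
    """Set config['a']['b'] = value for dotted_key 'a.b'."""
    keys = dotted_key.split('.')
    temp = config
    for key in keys[:-1]:
        temp = temp[key]
    temp[keys[-1]] = value

cfg = {'trainer': {'lr': 0.1}}
set_nested(cfg, 'trainer.lr', 0.01)  # cfg is now {'trainer': {'lr': 0.01}}
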
def train(args, cfg):
    out_dirs = gen_outdirs(args, "tp")
    output_dir, out_res_dir = out_dirs["output_dir"], out_dirs["out_res_dir"]
    df = PneuSegDF(args.mode, out_res_dir, args.train_dir, args.testset_dir,
                   args.min_num_workers, cfg)
    num_gpu = max(get_num_gpu(), 1)
    ds = df.prepared(num_gpu, cfg.batch_size)

    # Avoid overwriting the config file
    if os.path.exists(pj(output_dir, os.path.basename(args.config))):
        input("Config file will NOT be overwritten. Press Enter to continue...")
    else:
        shutil.copy(args.config, output_dir)
    logger.set_logger_dir(pj(output_dir, "log"))

    callback_list = [
        # PeriodicCallback overrides the trigger frequency of the callback it wraps
        PeriodicCallback(ModelSaver(50, checkpoint_dir=output_dir),
                         every_k_epochs=1),
        GPUUtilizationTracker(),
        MergeAllSummaries(1 if args.train_debug else 0),
        # ProgressBar(["Loss"])
    ]
    if cfg.network["norm_layer"] == "BN_layers":
        callback_list.append(BN_layers_update())

    if cfg.lr_schedule["type"] == "epoch_wise_constant":
        # Piecewise-constant schedule: one learning rate per epoch interval,
        # scaled down by the number of GPUs.
        schedule = [(ep, lr / num_gpu) for ep, lr in zip(
            [0] + cfg.lr_schedule["epoch_to_drop_lr"], cfg.lr_schedule["lr"])]
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    elif cfg.lr_schedule["type"] == "halved":
        # Drop the learning rate every `period` epochs, starting at
        # `first_epoch2drop`; each entry divides the previous one by
        # decay_rate * num_gpu. See the worked example after this function.
        schedule = [(0, cfg.lr_schedule["init_lr"])]
        for i in range(cfg.lr_schedule["first_epoch2drop"], cfg.max_epoch,
                       cfg.lr_schedule["period"]):
            prev_lr = schedule[(i - cfg.lr_schedule["first_epoch2drop"])
                               // cfg.lr_schedule["period"]][1]
            schedule.append(
                (i, prev_lr / (cfg.lr_schedule["decay_rate"] * num_gpu)))
        print(schedule)
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))

    steps_per_epoch = len(ds) // num_gpu + 1
    train_cfg = TrainConfig(
        model=Tensorpack_model(cfg, steps_per_epoch),
        data=QueueInput(ds),
        steps_per_epoch=steps_per_epoch,
        callbacks=callback_list,
        monitors=[
            # ScalarPrinter(True, whitelist=["Loss", "LR"]),
            ScalarPrinter(True),
            # ScalarPrinter(),
            TFEventWriter(),
            # JSONWriter()
        ],
        max_epoch=cfg.max_epoch,
        session_init=SmartInit(args.resume),
        starting_epoch=args.resume_epoch)
    launch_train_with_config(
        train_cfg,
        SyncMultiGPUTrainerReplicated(num_gpu) if num_gpu > 1 else SimpleTrainer())
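
# Worked example of the "halved" branch above, in isolation, with
# hypothetical values: init_lr=0.01, first_epoch2drop=10, period=10,
# decay_rate=2, num_gpu=1, max_epoch=40.
schedule = [(0, 0.01)]
for i in range(10, 40, 10):
    prev_lr = schedule[(i - 10) // 10][1]
    schedule.append((i, prev_lr / (2 * 1)))
print(schedule)  # [(0, 0.01), (10, 0.005), (20, 0.0025), (30, 0.00125)]
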
def critic_train(ctrl, data, log_dir, model_dir, prev_dir, vs_name,
                 split_train_val=False):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Build a per-epoch schedule that decays the learning rate by 0.9x.
    lr_schedule = []
    max_epoch = ctrl.critic_train_epoch
    lr = ctrl.critic_init_lr
    for epoch in range(0, max_epoch):
        lr_schedule.append((epoch + 1, lr))
        lr *= 0.9

    ds_size = len(data[0])
    idxs = list(range(ds_size))
    np.random.shuffle(idxs)
    if split_train_val:
        # Train on the first 90% of the shuffled data, validate on the rest.
        train_size = ds_size * 9 // 10
        if train_size == 0:
            train_size = ds_size
        val_start = train_size
    else:
        # Train on everything; the validation slice overlaps the training set.
        train_size = ds_size
        val_start = ds_size * 9 // 10
        if ds_size - val_start == 0:
            val_start = 0
    data_train = [[col[k] for k in idxs[:train_size]] for col in data]
    data_val = [[col[k] for k in idxs[val_start:]] for col in data]

    model = critic_factory(ctrl, is_train=True, vs_name=vs_name)
    ds_train = critic_dataflow_factory(ctrl, data_train, is_train=True)
    ds_val = critic_dataflow_factory(ctrl, data_val, is_train=False)

    session_config = None
    device = 0
    if ctrl.critic_type == CriticTypes.LSTM:
        # The LSTM critic trains on CPU only.
        session_config = tf.ConfigProto(device_count={'GPU': 0})
        device = -1

    extra_callbacks = DEFAULT_CALLBACKS()
    extra_callbacks = list(
        filter(lambda x: not isinstance(x, ProgressBar), extra_callbacks))
    logger.info("Extra callbacks are {}".format(
        list(map(lambda x: x.__class__, extra_callbacks))))

    # Put this into callbacks for in-training validation/inference
    inference_callback = InferenceRunner(
        ds_val, [ScalarStats('{}/cost'.format(vs_name))], device=device)

    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(checkpoint_dir=model_dir, max_to_keep=1,
                       keep_checkpoint_every_n_hours=100),
            ScheduledHyperParamSetter('learning_rate', lr_schedule)
        ],
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  # , TFEventWriter()
        steps_per_epoch=ds_train.size(),
        max_epoch=max_epoch,
        session_config=session_config)

    # Resume from the latest checkpoint of a previous run if one exists.
    ckpt = tf.train.latest_checkpoint(prev_dir if prev_dir else model_dir)
    if ckpt:
        config.session_init = SaverRestore(ckpt)
    launch_train_with_config(config, SimpleTrainer())
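
# For illustration, the schedule-building loop above in isolation, with
# hypothetical values critic_init_lr=0.1 and critic_train_epoch=4:
lr_schedule, lr = [], 0.1
for epoch in range(4):
    lr_schedule.append((epoch + 1, lr))
    lr *= 0.9
print(lr_schedule)  # approx. [(1, 0.1), (2, 0.09), (3, 0.081), (4, 0.0729)]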