def train(args, logdir):
    """Prepare a training configuration that checkpoints into ``logdir``.

    The model and dataflow wiring is commented out in this version, so the
    resulting ``TrainConfig`` only carries the ``ModelSaver`` callback plus
    the optional checkpoint restore and tower count.

    Args:
        args: Parsed options. ``args.ckpt`` is a checkpoint name inside
            ``logdir``; ``args.gpu`` is a comma-separated device string.
        logdir: Directory that receives logs and checkpoints.
    """
    # model
    # ;model = Net1()

    # dataflow
    # ;df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)

    # Route event files and saved models to the run directory.
    logger.set_logger_dir(logdir)

    # Constructed as in the original, but not handed to TrainConfig while the
    # `session_config` argument below remains commented out.
    gpu_opts = tf.GPUOptions(allow_growth=True)
    session_conf = tf.ConfigProto(gpu_options=gpu_opts)

    saver = ModelSaver(checkpoint_dir=logdir)
    train_conf = TrainConfig(
        # ;model=model,
        # ;data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            saver,
            # TODO EvalCallback()
        ],
        # ;max_epoch=hp.train1.num_epochs,
        # ;steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )

    # An explicitly named checkpoint wins; otherwise use the newest one.
    if args.ckpt:
        ckpt = '{}/{}'.format(logdir, args.ckpt)
    else:
        ckpt = tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # One tower per comma-separated entry.
        train_conf.nr_tower = len(args.gpu.split(','))
def get_config():
    """Assemble the ``TrainConfig`` for the actor/critic model.

    Returns:
        A ``TrainConfig`` that feeds the model from ``data_io``, runs for
        1000 epochs of ``STEPS_PER_EPOCH`` steps, and registers the saver,
        hyper-parameter setters, weight-sync callback, ``data_io`` itself
        and the module-level ``evaluators``.
    """
    import functools

    import tensorpack.tfutils.symbolic_functions as symbf
    from tensorpack.callbacks.base import Callback
    from tensorpack.callbacks.graph import RunOp
    from tensorpack.callbacks.param import (
        HumanHyperParamSetter,
        HyperParamSetterWithFunc,
        ScheduledHyperParamSetter,
    )
    from tensorpack.callbacks.saver import ModelSaver
    from tensorpack.tfutils import sesscreate
    from tensorpack.tfutils.common import get_default_sess_config
    from tensorpack.train.config import TrainConfig

    M = Model()
    dataflow = data_io

    class CBSyncWeight(Callback):
        """Request the prediction sync op on every 10th local step."""

        def _before_run(self, ctx):
            if self.local_step % 10 != 0:
                return None
            return [M._sync_op_pred]

    # The returned handles are not used below; the calls are kept because
    # they register the scalar variables that the scheduled setters target.
    sigma_beta_steering = symbf.get_scalar_var(
        'actor/sigma_beta_steering', 0.3, summary=True, trainable=False)
    sigma_beta_accel = symbf.get_scalar_var(
        'actor/sigma_beta_accel', 0.3, summary=True, trainable=False)

    callbacks = [
        ModelSaver(),
        HyperParamSetterWithFunc(
            'learning_rate/actor',
            functools.partial(M._calc_learning_rate, 'actor')),
        HyperParamSetterWithFunc(
            'learning_rate/critic',
            functools.partial(M._calc_learning_rate, 'critic')),
        # ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
        ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
        # HumanHyperParamSetter('learning_rate'),
        # HumanHyperParamSetter('entropy_beta'),
        ScheduledHyperParamSetter('actor/sigma_beta_accel', [(1, 0.2), (2, 0.01)]),
        ScheduledHyperParamSetter('actor/sigma_beta_steering', [(1, 0.1), (2, 0.01)]),
        CBSyncWeight(),
        data_io,
        # PeriodicTrigger(Evaluator(
        #     EVAL_EPISODE, ['state'], ['policy'], get_player),
        #     every_k_epochs=3),
    ]

    session_creator = sesscreate.NewSessionCreator(
        config=get_default_sess_config(0.5))

    return TrainConfig(
        model=M,
        data=dataflow,
        callbacks=callbacks + evaluators,
        session_creator=session_creator,
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
def main(cfg):
    """Run training for the category selected by ``cfg.cat_name``.

    Prints the configuration, resets the default TF graph, logs into
    ``tflogs``, copies ``model.py`` and ``dataflow.py`` into the working
    directory, then trains for 10 epochs with a ``SimpleTrainer``.

    Args:
        cfg: Run configuration; ``cat_name``, ``batch_size``,
            ``data.train_txt`` and ``data.val_txt`` are read here.
    """
    print(cfg)
    tf.reset_default_graph()
    logger.set_logger_dir('tflogs', action='d')

    # Keep a copy of the sources that produced this run next to its outputs.
    for source_name in ('model.py', 'dataflow.py'):
        copyfile(hydra.utils.to_absolute_path(source_name), source_name)

    if cfg.cat_name == 'smpl':
        train_df = SMPLDataFlow(cfg, True, 1000)
        val_df = VisSMPLDataFlow(cfg, True, 1000, port=1080)
    else:
        train_df = ShapeNetDataFlow(cfg, cfg.data.train_txt, True)
        val_df = VisDataFlow(cfg, cfg.data.val_txt, False, port=1080)

    model = Model(cfg)

    half_cpus = cpu_count() // 2
    prefetched = PrefetchData(train_df, half_cpus, half_cpus)
    batched = BatchData(prefetched, cfg.batch_size)

    tracked_losses = [
        'recon_loss',
        'GAN/loss_d',
        'GAN/loss_g',
        'GAN/gp_loss',
        'symmetry_loss',
    ]
    callbacks = [
        ModelSaver(),
        SimpleMovingAverage(tracked_losses, 100),
        # The validation dataflow object is itself triggered every 30 steps.
        PeriodicTrigger(val_df, every_k_steps=30),
    ]
    step_printer = ScalarPrinter(enable_step=True, enable_epoch=False)
    monitors = tensorpack.train.DEFAULT_MONITORS() + [step_printer]

    config = TrainConfig(
        model=model,
        dataflow=batched,
        callbacks=callbacks,
        monitors=monitors,
        max_epoch=10,
    )
    launch_train_with_config(config, SimpleTrainer())
def train(args, logdir1, logdir2):
    """Train Net2, restoring Net2 and then Net1 checkpoints when present.

    Fixes relative to the previous version:
      * the data-path banner used the invalid escape ``\\d``; it is now
        tab-indented like the other summary lines;
      * the multi-GPU trainer receives the number of visible devices rather
        than the physical ids, because the ids are masked through
        ``CUDA_VISIBLE_DEVICES`` before the trainer is built;
      * without ``args.gpu`` the function used to crash; it now falls back
        to ``SimpleTrainer`` like the sibling Net1 training functions.

    Args:
        args: Parsed options. ``args.ckpt`` is a checkpoint name inside
            ``logdir2``; ``args.gpu`` is a comma-separated device string.
        logdir1: Directory searched for the latest Net1 checkpoint.
        logdir2: Directory for Net2 logs and checkpoints.
    """
    # model
    model = Net2()
    preprocessing(data_path, logdir2)

    # dataflow
    df = Net2DataFlow(data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    dataset_size = len(glob.glob(data_path + '/wav/*.wav'))
    steps_per_epoch = dataset_size // hp.train2.batch_size

    print("\t\tdata_path : ", data_path)
    print("\t\tDataset Size : ", dataset_size)
    print("\t\tBatch Size : ", hp.train2.batch_size)
    print("\t\tSteps per epoch : ", steps_per_epoch)

    # NOTE(review): presumably a pause so the summary above can be read
    # before training output starts - confirm it is still wanted.
    from time import sleep
    sleep(10)

    # Restore order matters: the Net2 checkpoint first, then Net1 weights
    # with `global_step` ignored.
    session_inits = []
    if args.ckpt:
        ckpt2 = '{}/{}'.format(logdir2, args.ckpt)
    else:
        ckpt2 = tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=steps_per_epoch,
        session_init=ChainInit(session_inits))

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        num_gpu = len(args.gpu.split(','))
        train_conf.nr_tower = num_gpu
        # Once CUDA_VISIBLE_DEVICES is set, the visible devices are numbered
        # from 0, so the trainer is given a count instead of the raw ids.
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        # No device list supplied: train with a single tower.
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
def train(args, logdir):
    """Train Net1 with per-epoch evaluation on the test dataflow.

    Both ``data_path`` and ``test_path`` are preprocessed first. With
    ``args.gpu`` set, a replicated multi-GPU trainer sized by the number of
    listed devices is used; otherwise a ``SimpleTrainer``.

    Args:
        args: Parsed options. ``args.ckpt`` is a checkpoint name inside
            ``logdir``; ``args.gpu`` is a comma-separated device string.
        logdir: Directory for logs and checkpoints.
    """
    net = Net1()

    for path in (data_path, test_path):
        preprocessing(path)

    train_flow = Net1DataFlow(data_path, hp.train1.batch_size)
    test_flow = Net1DataFlow(test_path, hp.train1.batch_size)

    # Event files and checkpoints both go to `logdir`.
    logger.set_logger_dir(logdir)

    # Based on
    # https://github.com/tensorpack/tensorpack/blob/master/examples/boilerplate.py
    train_input = QueueInput(
        train_flow(n_prefetch=hp.train1.batch_size * 10, n_thread=1))
    saver = ModelSaver(checkpoint_dir=logdir)
    evaluator = InferenceRunner(
        test_flow(n_prefetch=1),
        ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix=''))

    train_conf = AutoResumeTrainConfig(
        model=net,
        data=train_input,
        callbacks=[saver, evaluator],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
    )

    if args.ckpt:
        ckpt = '{}/{}'.format(logdir, args.ckpt)
    else:
        ckpt = tf.train.latest_checkpoint(logdir)

    # Configured default; replaced below when an explicit device list is given.
    num_gpu = hp.train1.num_gpu

    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if not args.gpu:
        trainer = SimpleTrainer()
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
        num_gpu = len(args.gpu.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)

    launch_train_with_config(train_conf, trainer=trainer)
def train(args, logdir1, logdir2):
    """Train Net2 with a ``SimpleTrainer``, restoring Net2 then Net1 weights.

    Args:
        args: Parsed options. ``args.ckpt`` is a checkpoint name inside
            ``logdir2``; ``args.gpu`` is a comma-separated device string.
        logdir1: Directory searched for the latest Net1 checkpoint.
        logdir2: Directory for Net2 logs and checkpoints.
    """
    net = Net2()
    flow = Net2DataFlow(hp.train2.data_path, hp.train2.batch_size)

    # Event files and checkpoints both go to `logdir2`.
    logger.set_logger_dir(logdir2)

    gpu_opts = tf.GPUOptions(
        # allow_growth=True,
        per_process_gpu_memory_fraction=0.6,
    )
    session_conf = tf.ConfigProto(
        # log_device_placement=True,
        allow_soft_placement=True,
        gpu_options=gpu_opts,
    )

    # Restorers are chained in this order: Net2 first, then Net1 with
    # `global_step` ignored.
    restorers = []
    if args.ckpt:
        net2_ckpt = '{}/{}'.format(logdir2, args.ckpt)
    else:
        net2_ckpt = tf.train.latest_checkpoint(logdir2)
    if net2_ckpt:
        restorers.append(SaverRestore(net2_ckpt))

    net1_ckpt = tf.train.latest_checkpoint(logdir1)
    if net1_ckpt:
        restorers.append(SaverRestore(net1_ckpt, ignore=['global_step']))

    train_conf = TrainConfig(
        model=net,
        data=QueueInput(flow(n_prefetch=1000, n_thread=4)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=hp.train2.steps_per_epoch,
        session_init=ChainInit(restorers),
        session_config=session_conf,
    )

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    # trainer = SyncMultiGPUTrainerParameterServer(hp.train2.num_gpu)
    launch_train_with_config(train_conf, trainer=SimpleTrainer())
def train(args, logdir):
    """Train Net1 on ``hp.Train1.num_gpu`` GPUs, printing a marker per stage.

    Args:
        args: Parsed options. ``args.ckpt`` is a checkpoint name inside
            ``logdir``; ``args.gpu`` is a comma-separated device string.
        logdir: Directory for logs and checkpoints.
    """
    print("####model")
    net = Net1()

    print("####dataflow")
    flow = Net1DataFlow(hp.Train1.data_path, hp.Train1.batch_size)

    # Event files and checkpoints both go to `logdir`.
    print("####logger")
    logger.set_logger_dir(logdir)

    print("####session_conf")
    gpu_opts = tf.GPUOptions(allow_growth=True)
    session_conf = tf.ConfigProto(
        gpu_options=gpu_opts,
        allow_soft_placement=True,
    )

    print("####train_conf")
    train_conf = TrainConfig(
        model=net,
        data=QueueInput(flow(n_prefetch=1000, n_thread=5)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.Train1.num_epochs,
        steps_per_epoch=hp.Train1.steps_per_epoch,
        session_config=session_conf,
    )

    print("####ckpt")
    if args.ckpt:
        ckpt = '{}/{}'.format(logdir, args.ckpt)
    else:
        ckpt = tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    print("####trainer")
    trainer = SyncMultiGPUTrainerReplicated(hp.Train1.num_gpu)

    print("####launch_train_with_config")
    launch_train_with_config(train_conf, trainer=trainer)
def train(args, logdir):
    """Train Net1 on preprocessed TIMIT ``.npz`` files with evaluation.

    The train/test glob patterns are built once from
    ``hp.train1.preproc_data_path`` and ``args.case`` and printed before use.
    Device selection comes from ``hp.default`` rather than from ``args``.

    Args:
        args: Parsed options. ``args.case`` selects the sub-directory of the
            preprocessed data; ``args.ckpt`` is a checkpoint name inside
            ``logdir``.
        logdir: Directory for logs and checkpoints.
    """
    # model
    model = Net1()

    # dataflow
    TIMIT_TRAIN_WAV = 'TIMIT/TRAIN/*/*/*.npz'
    TIMIT_TEST_WAV = 'TIMIT/TEST/*/*/*.npz'

    case_dir = os.path.join(hp.train1.preproc_data_path, args.case)
    train_pattern = os.path.join(case_dir, TIMIT_TRAIN_WAV)
    test_pattern = os.path.join(case_dir, TIMIT_TEST_WAV)
    print(train_pattern)
    print(test_pattern)

    df = Net1DataFlow(train_pattern, hp.train1.batch_size)
    df_test = Net1DataFlow(test_pattern, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(
                df_test(n_prefetch=1),
                ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
    )

    if args.ckpt:
        ckpt = '{}/{}'.format(logdir, args.ckpt)
    else:
        ckpt = tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    # Plain truthiness test instead of `== True`; identical for a boolean flag.
    if hp.default.use_gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = hp.default.gpu_list
        num_gpu = len(hp.default.gpu_list.split(','))
        train_conf.nr_tower = num_gpu
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        # An empty device list is exported so that no GPU is visible.
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
def train(args, logdir):
    """Train ``Net`` with a replicated trainer on ``hp.train.num_gpu`` GPUs.

    Args:
        args: Parsed options. ``args.ckpt`` is a checkpoint name inside
            ``logdir``; ``args.gpu`` is a comma-separated device string.
        logdir: Directory for logs and checkpoints.
    """
    net = Net()
    flow = NetDataFlow(hp.train.data_path, hp.train.batch_size)

    # Event files and checkpoints both go to `logdir`.
    logger.set_logger_dir(logdir)

    # Memory growth on, per-process GPU memory fraction set to 0.45.
    # This object is not passed to TrainConfig while `session_config`
    # below stays commented out.
    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
            per_process_gpu_memory_fraction=0.45,
        ),
    )

    train_conf = TrainConfig(
        model=net,
        data=QueueInput(flow(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train.num_epochs,
        steps_per_epoch=hp.train.steps_per_epoch,
        # session_config=session_conf
    )

    if args.ckpt:
        ckpt = '{}/{}'.format(logdir, args.ckpt)
    else:
        ckpt = tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    launch_train_with_config(
        train_conf,
        trainer=SyncMultiGPUTrainerReplicated(hp.train.num_gpu),
    )
def train(args, logdir2):
    """Train Net2 from mel/PPG data, resuming from ``logdir2`` if possible.

    Changes relative to the previous version: the start-up message is
    spelled correctly ("started trainer") and the Net1 restore code that was
    parked inside a bare string literal has been turned into a comment.

    Args:
        args: Parsed options. ``args.ckpt`` is a checkpoint name inside
            ``logdir2``; ``args.gpu`` is a comma-separated device string.
        logdir2: Directory for Net2 checkpoints.
    """
    # model
    model = Net2()

    # dataflow
    df = Net2DataFlow(
        hp.train2.mel_path, hp.train2.ppgs_path, hp.train2.batch_size)

    session_inits = []
    if args.ckpt:
        ckpt2 = '{}/{}'.format(logdir2, args.ckpt)
    else:
        ckpt2 = tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    # Restoring Net1 weights is disabled in this variant:
    # ckpt1 = tf.train.latest_checkpoint(logdir1)
    # if ckpt1:
    #     session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=hp.train2.steps_per_epoch,
        session_init=ChainInit(session_inits))

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.train2.num_gpu)
    print("started trainer")
    launch_train_with_config(train_conf, trainer=trainer)
# Vocoder network sized by the configured batch size and signal length.
model = IAFVocoder(
    batch_size=hp.train.batch_size,
    length=hp.signal.length,
)

# dataset
dataset = Dataset(
    hp.data_path,
    hp.train.batch_size,
    length=hp.signal.length,
)
print('dataset size is {}'.format(len(dataset.wav_files)))

# Event files and checkpoints both go to `hp.logdir`.
logger.set_logger_dir(hp.logdir)

train_conf = TrainConfig(
    model=model,
    data=TFDatasetInput(dataset()),
    callbacks=[
        ModelSaver(checkpoint_dir=hp.logdir),
        RunUpdateOps(),  # for batch norm, exponential moving average
        # TODO GenerateCallback()
    ],
    max_epoch=hp.train.num_epochs,
    steps_per_epoch=hp.train.steps_per_epoch,
)

# `ckpt` arrives as a checkpoint name (or a falsy value) and is rebound to
# the path that should be restored, if any.
if ckpt:
    ckpt = '{}/{}'.format(hp.logdir, ckpt)
else:
    ckpt = tf.train.latest_checkpoint(hp.logdir)
if ckpt:
    train_conf.session_init = SaverRestore(ckpt)

if gpu is not None:
    # `gpu` is a collection of device ids; export it comma-separated.
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(dev) for dev in gpu)
    train_conf.nr_tower = len(gpu)
def get_config():
    """Start the simulator workers and build the ``TrainConfig``.

    Steps, in order:
      1. Create the model and pick a fresh 6-character pipe name suffix.
      2. Ensure the IPC pipe directory exists (``TENSORPACK_PIPEDIR`` or
         ``/tmp/.ipcpipe``) and delete stale ``sim-*`` entries in it.
      3. Start ``SIMULATOR_PROC * 2`` ``MySimulatorWorker`` processes.
      4. Create the ``MySimulatorMaster`` and batch its queue into a dataflow.

    Stale pipes are removed with ``glob``/``os.remove`` instead of a shell
    ``rm -f`` command line, so a pipe directory containing spaces or shell
    metacharacters is handled literally.

    Returns:
        A ``TrainConfig`` running 1000 epochs of ``STEPS_PER_EPOCH`` steps.
    """
    import functools
    import glob

    M = Model()

    name_base = str(uuid.uuid1())[:6]
    PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR', '/tmp/.ipcpipe').rstrip('/')
    if not os.path.exists(PIPE_DIR):
        os.makedirs(PIPE_DIR)
    else:
        # Equivalent of `rm -f PIPE_DIR/sim-*` without going through a shell.
        stale_pattern = os.path.join(glob.escape(PIPE_DIR), 'sim-*')
        for stale_pipe in glob.glob(stale_pattern):
            try:
                os.remove(stale_pipe)
            except OSError:
                # Best effort, as with the previous `rm -f`: an entry that
                # cannot be removed must not abort start-up.
                pass
    namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
    names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)

    # AgentTorcs * SIMULATOR_PROC, AgentReplay * SIMULATOR_PROC
    procs = [
        MySimulatorWorker(k, namec2s, names2c)
        for k in range(SIMULATOR_PROC * 2)
    ]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, M)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)

    class CBSyncWeight(Callback):
        """Callback that requests the model's weight-sync ops."""

        def _after_run(self, ctx, _):
            # NOTE(review): confirm that the framework consumes the value
            # returned from `_after_run`; if it does not, this is a no-op.
            if self.local_step > 1 and self.local_step % SIMULATOR_PROC == 0:
                return [M._td_sync_op]

        def _before_run(self, ctx):
            # Every 10th local step, run both sync ops with the train step.
            # (A former second branch guarded by `... and 0` could never
            # execute and has been removed.)
            if self.local_step % 10 == 0:
                return [M._sync_op, M._td_sync_op]

    return TrainConfig(
        model=M,
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            HyperParamSetterWithFunc(
                'learning_rate/actor',
                functools.partial(M._calc_learning_rate, 'actor')),
            HyperParamSetterWithFunc(
                'learning_rate/critic',
                functools.partial(M._calc_learning_rate, 'critic')),
            # ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            # HumanHyperParamSetter('learning_rate'),
            # HumanHyperParamSetter('entropy_beta'),
            # ScheduledHyperParamSetter('actor/sigma_beta_accel', [(1, 0.2), (2, 0.01), (3, 1e-3), (4, 1e-4)]),
            # ScheduledHyperParamSetter('actor/sigma_beta_steering', [(1, 0.1), (2, 0.01), (3, 1e-3), (4, 1e-4)]),
            master,
            StartProcOrThread(master),
            CBSyncWeight(),
            # CBTDSyncWeight()
            # PeriodicTrigger(Evaluator(
            #     EVAL_EPISODE, ['state'], ['policy'], get_player),
            #     every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
# Metadata for the training audio; also supplies the speaker count below.
audio_meta = AudioMeta(hp.train.data_path)

if args.remote:
    df = get_remote_dataflow(args.port, hp.train.batch_size)
else:
    loader = DataLoader(audio_meta, hp.train.batch_size)
    # Thread count is the CPU count floor-divided by 1.5.
    n_threads = int(multiprocessing.cpu_count() // 1.5)
    df = loader.dataflow(nr_prefetch=5000, nr_thread=n_threads)

# Event files and checkpoints both go to `hp.logdir`.
logger.set_logger_dir(hp.logdir)

# Always-true guard kept from the original layout.
if True:
    classifier = ClassificationModel(
        num_classes=audio_meta.num_speaker, **hp.model)
    queue_input = FlexibleQueueInput(df, capacity=500)
    train_conf = TrainConfig(
        model=classifier,
        data=queue_input,
        callbacks=[ModelSaver(checkpoint_dir=hp.logdir), EvalCallback()],
        steps_per_epoch=hp.train.steps_per_epoch,
        # session_config=session_config
    )

    # Here `args.ckpt` is used as given, not joined onto the log directory.
    ckpt = args.ckpt or tf.train.latest_checkpoint(hp.logdir)
    # `args.r` suppresses restoring even when a checkpoint is available.
    if ckpt and not args.r:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.train.num_gpu)