def train(name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01, seed=None,
          logdir='data/logs', max_epoch=100, patience=2, train_sampling=1.0, eval_sampling=1.0,
          eval_memsize=5, gpu=0, gpu_allow_growth=False, save_best_model=False, forward_split=False,
          write_summaries=False, verbose=False, asgd_decay=None, tqdm=True, side_split=True,
          max_steps=None, save_from_step=None, do_eval=True, predict_window=63):
    """Build and train one or more seq2seq forecasting models on the page-hits data.

    Builds the input pipelines and models (one per split, optionally one per GPU),
    runs the epoch/step training loop with periodic evaluation, optional EMA
    (averaged-SGD) evaluation, checkpointing, and early stopping.

    NOTE(review): a second `train` definition later in this file shadows this one
    at import time — confirm which copy is intended to be live.

    :param name: Run name; used for checkpoint (`data/cpt/<name>`) and summary paths.
    :param hparams: Hyperparameter object (batch_size, train_window, rnn_depth, ...).
    :param multi_gpu: If True, place model i on /gpu:i; otherwise all on `gpu`.
    :param n_models: Number of models to train (averaged externally).
    :param train_completeness_threshold: Min fraction of non-missing days for a
        training series to be used.
    :param seed: Random seed; also offset per-model as `seed + i`.
    :param logdir: Root directory for TensorBoard summaries.
    :param max_epoch: Maximum number of training epochs.
    :param patience: Early-stopping patience (epochs without improvement).
    :param train_sampling, eval_sampling: Subsampling rates for train/eval sets.
    :param eval_memsize: Memory budget used to derive the eval batch size.
    :param gpu: GPU ordinal used when not multi_gpu.
    :param gpu_allow_growth: TF GPUOptions.allow_growth.
    :param save_best_model: Save a checkpoint whenever eval SMAPE improves.
    :param forward_split: Evaluate on a forward (future) time split.
    :param write_summaries: Write TensorBoard summaries.
    :param verbose: Verbose input pipelines.
    :param asgd_decay: EMA decay for averaged SGD; None disables ASGD.
    :param tqdm: Show a tqdm progress bar per epoch.
    :param side_split: Evaluate on a side (page-wise) split.
    :param max_steps: Hard stop after this many global steps.
    :param save_from_step: Save a checkpoint at every eval point once step >= this.
    :param do_eval: Enable evaluation stages.
    :param predict_window: Number of days to predict.
    :return: Mean of the per-epoch best eval SMAPE values (np.float64).
    """
    # Derive eval batch size from a memory budget: deeper/wider RNNs get smaller
    # eval batches. 26214 ~= 2**18 / 10 (empirical constant from the original run).
    eval_k = int(round(26214 * eval_memsize / n_models))
    eval_batch_size = int(eval_k / (hparams.rnn_depth * hparams.encoder_rnn_layers))  # 128 -> 1024, 256 -> 512, 512 -> 256
    eval_pct = 0.1  # evaluate on 10% of eval pages, every 10% of an epoch
    batch_size = hparams.batch_size
    train_window = hparams.train_window
    tf.reset_default_graph()
    # BUGFIX: `if seed:` skipped seeding for the legitimate value seed=0.
    if seed is not None:
        tf.set_random_seed(seed)

    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars("data/vars")
        if side_split:
            # Page-wise train/test split (holds out whole pages for side eval).
            splitter = Splitter(page_features(inp), inp.page_map, 3, train_sampling=train_sampling,
                                test_sampling=eval_sampling, seed=seed)
        else:
            # No side split: train and "test" sets cover the same pages.
            splitter = FakeSplitter(page_features(inp), 3, seed=seed, test_sampling=eval_sampling)

    real_train_pages = splitter.splits[0].train_size
    real_eval_pages = splitter.splits[0].test_size

    items_per_eval = real_eval_pages * eval_pct
    eval_batches = int(np.ceil(items_per_eval / eval_batch_size))
    steps_per_epoch = real_train_pages // batch_size
    # Run evaluation roughly 1/eval_pct times per epoch.
    eval_every_step = int(round(steps_per_epoch * eval_pct))
    # eval_every_step = int(round(items_per_eval * train_sampling / batch_size))

    global_step = tf.train.get_or_create_global_step()
    inc_step = tf.assign_add(global_step, 1)

    all_models: List[ModelTrainerV2] = []

    def create_model(scope, index, prefix, seed):
        """Build train/eval pipelines + models for split `index`; return its trainer."""
        with tf.variable_scope('input') as inp_scope:
            with tf.device("/cpu:0"):
                split = splitter.splits[index]
                pipe = InputPipe(inp, features=split.train_set, n_pages=split.train_size,
                                 mode=ModelMode.TRAIN, batch_size=batch_size, n_epoch=None, verbose=verbose,
                                 train_completeness_threshold=train_completeness_threshold,
                                 predict_completeness_threshold=train_completeness_threshold,
                                 train_window=train_window,
                                 predict_window=predict_window, rand_seed=seed,
                                 train_skip_first=hparams.train_skip_first,
                                 # Reserve the last predict_window days for forward eval.
                                 back_offset=predict_window if forward_split else 0)
                inp_scope.reuse_variables()
                if side_split:
                    side_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size,
                                               mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None,
                                               verbose=verbose, predict_window=predict_window,
                                               train_completeness_threshold=0.01, predict_completeness_threshold=0,
                                               train_window=train_window, rand_seed=seed, runs_in_burst=eval_batches,
                                               back_offset=predict_window * (2 if forward_split else 1))
                else:
                    side_eval_pipe = None
                if forward_split:
                    forward_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size,
                                                  mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None,
                                                  verbose=verbose, predict_window=predict_window,
                                                  train_completeness_threshold=0.01, predict_completeness_threshold=0,
                                                  train_window=train_window, rand_seed=seed, runs_in_burst=eval_batches,
                                                  back_offset=predict_window)
                else:
                    forward_eval_pipe = None
        avg_sgd = asgd_decay is not None
        # asgd_decay = 0.99 if avg_sgd else None
        train_model = Model(pipe, hparams, is_train=True, graph_prefix=prefix, asgd_decay=asgd_decay, seed=seed)
        scope.reuse_variables()  # eval models share the train model's weights

        eval_stages = []
        if side_split:
            side_eval_model = Model(side_eval_pipe, hparams, is_train=False,
                                    # loss_mask=np.concatenate([np.zeros(50, dtype=np.float32), np.ones(10, dtype=np.float32)]),
                                    seed=seed)
            eval_stages.append((Stage.EVAL_SIDE, side_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_SIDE_EMA, side_eval_model))
        if forward_split:
            forward_eval_model = Model(forward_eval_pipe, hparams, is_train=False, seed=seed)
            eval_stages.append((Stage.EVAL_FRWD, forward_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_FRWD_EMA, forward_eval_model))

        if write_summaries:
            summ_path = f"{logdir}/{name}_{index}"
            if os.path.exists(summ_path):
                shutil.rmtree(summ_path)
            summ_writer = tf.summary.FileWriter(summ_path)  # , graph=tf.get_default_graph()
        else:
            summ_writer = None
        if do_eval and forward_split:
            # Early stopping watches forward-split SMAPE.
            stop_metric = lambda metrics: metrics[Stage.EVAL_FRWD]['SMAPE'].avg_epoch
        else:
            stop_metric = None
        return ModelTrainerV2(train_model, eval_stages, index, patience=patience,
                              stop_metric=stop_metric,
                              summary_writer=summ_writer)

    if n_models == 1:
        with tf.device(f"/gpu:{gpu}"):
            scope = tf.get_variable_scope()
            all_models = [create_model(scope, 0, None, seed=seed)]
    else:
        for i in range(n_models):
            device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}"
            with tf.device(device):
                prefix = f"m_{i}"
                with tf.variable_scope(prefix) as scope:
                    # Per-model seed offset keeps the ensemble members decorrelated.
                    all_models.append(create_model(scope, i, prefix=prefix, seed=seed + i))
    trainer = MultiModelTrainer(all_models, inc_step)
    if save_best_model or save_from_step:
        saver_path = f'data/cpt/{name}'
        if os.path.exists(saver_path):
            shutil.rmtree(saver_path)
        os.makedirs(saver_path)
        saver = tf.train.Saver(max_to_keep=10, name='train_saver')
    else:
        saver = None
    avg_sgd = asgd_decay is not None
    if avg_sgd:
        from itertools import chain

        def ema_vars(model):
            # Map shadow (EMA) variable names -> live variables, so the loader
            # can overwrite live weights with their averaged values.
            ema = model.train_model.ema
            return {ema.average_name(v): v for v in model.train_model.ema._averages}

        ema_names = dict(chain(*[ema_vars(model).items() for model in all_models]))
        # ema_names = all_models[0].train_model.ema.variables_to_restore()
        ema_loader = tf.train.Saver(var_list=ema_names, max_to_keep=1, name='ema_loader')
        ema_saver = tf.train.Saver(max_to_keep=1, name='ema_saver')
    else:
        ema_loader = None

    init = tf.global_variables_initializer()

    if forward_split and do_eval:
        eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE')
        eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE')
    else:
        eval_smape = DummyMetric()
        eval_mae = DummyMetric()

    if side_split and do_eval:
        eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE')
        eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE')
    else:
        eval_mae_side = DummyMetric()
        eval_smape_side = DummyMetric()

    train_smape = trainer.metric(Stage.TRAIN, 'SMAPE')
    train_mae = trainer.metric(Stage.TRAIN, 'MAE')
    grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm')
    eval_stages = []
    ema_eval_stages = []
    if forward_split and do_eval:
        eval_stages.append(Stage.EVAL_FRWD)
        ema_eval_stages.append(Stage.EVAL_FRWD_EMA)
    if side_split and do_eval:
        eval_stages.append(Stage.EVAL_SIDE)
        ema_eval_stages.append(Stage.EVAL_SIDE_EMA)

    # gpu_options=tf.GPUOptions(allow_growth=False),
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(
                                              allow_growth=gpu_allow_growth))) as sess:
        sess.run(init)
        # pipe.load_vars(sess)
        inp.restore(sess)
        for model in all_models:
            model.init(sess)
        # if beholder:
        #    visualizer = Beholder(session=sess, logdir=summ_path)
        step = 0
        prev_top = np.inf
        best_smape = np.inf
        # Contains best value (first item) and subsequent values.
        best_epoch_smape = []

        for epoch in range(max_epoch):
            # n_steps = pusher.n_pages // batch_size
            if tqdm:
                tqr = trange(steps_per_epoch, desc="%2d" % (epoch + 1), leave=False)
            else:
                tqr = range(steps_per_epoch)

            for _ in tqr:
                try:
                    step = trainer.train_step(sess, epoch)
                except tf.errors.OutOfRangeError:
                    break
                # if beholder:
                #   if step % 5 == 0:
                #     # noinspection PyUnboundLocalVariable
                #     visualizer.update()
                if step % eval_every_step == 0:
                    if eval_stages:
                        trainer.eval_step(sess, epoch, step, eval_batches, stages=eval_stages)

                    if save_best_model and epoch > 0 and eval_smape.last < best_smape:
                        best_smape = eval_smape.last
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)
                    if save_from_step and step >= save_from_step:
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)

                    if avg_sgd and ema_eval_stages:
                        # Snapshot live weights, swap in EMA weights, evaluate,
                        # then restore the live weights.
                        ema_saver.save(sess, 'data/cpt_tmp/ema', write_meta_graph=False)
                        # restore ema-backed vars
                        ema_loader.restore(sess, 'data/cpt_tmp/ema')

                        trainer.eval_step(sess, epoch, step, eval_batches, stages=ema_eval_stages)
                        # restore normal vars
                        ema_saver.restore(sess, 'data/cpt_tmp/ema')

                # Progress-bar metrics: forward / side / train.
                MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last, train_mae.last)
                improvement = '↑' if eval_smape.improved else ' '
                SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last, eval_smape_side.last, train_smape.last)
                if tqdm:
                    tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE)
                if not trainer.has_active() or (max_steps and step > max_steps):
                    break

            if tqdm:
                tqr.close()
            trainer.end_epoch()
            if not best_epoch_smape or eval_smape.avg_epoch < best_epoch_smape[0]:
                best_epoch_smape = [eval_smape.avg_epoch]
            else:
                best_epoch_smape.append(eval_smape.avg_epoch)

            current_top = eval_smape.top
            if prev_top > current_top:
                prev_top = current_top
                has_best_indicator = '↑'
            else:
                has_best_indicator = ' '
            status = "%2d: Best top SMAPE=%.3f%s (%s)" % (
                epoch + 1, current_top, has_best_indicator,
                ",".join(["%.3f" % m.top for m in eval_smape.metrics]))

            if trainer.has_active():
                status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d am" % \
                          (eval_mae.best_epoch, eval_mae_side.best_epoch,
                           eval_smape.best_epoch, eval_smape_side.best_epoch,
                           eval_mae.avg_epoch, eval_mae_side.avg_epoch,
                           eval_smape.avg_epoch, eval_smape_side.avg_epoch,
                           trainer.has_active())
                print(status, file=sys.stderr)
            else:
                print(status, file=sys.stderr)
                print("Early stopping!", file=sys.stderr)
                break
            if max_steps and step > max_steps:
                print("Max steps calculated", file=sys.stderr)
                break
            sys.stderr.flush()

    # noinspection PyUnboundLocalVariable
    return np.mean(best_epoch_smape, dtype=np.float64)
def train(name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01, seed=None,
          logdir='data/logs', max_epoch=100, patience=2, train_sampling=1.0, eval_sampling=1.0,
          eval_memsize=5, gpu=0, gpu_allow_growth=False, save_best_model=False, forward_split=False,
          write_summaries=False, verbose=False, asgd_decay=None, tqdm=True, side_split=True,
          max_steps=None, save_from_step=None, do_eval=True, predict_window=63):
    """Annotated copy of `train`: build and train seq2seq forecasting model(s).

    NOTE(review): this is a duplicate definition of `train` (an earlier copy
    exists above in this file); being defined later, THIS copy is the one that
    is live at import time. Unlike the earlier copy, this one does not print
    the per-epoch `status` line while the trainer is still active.
    """
    eval_k = int(round(26214 * eval_memsize / n_models))
    eval_batch_size = int(
        eval_k / (hparams.rnn_depth * hparams.encoder_rnn_layers))  # 128 -> 1024, 256->512, 512->256
    eval_pct = 0.1
    batch_size = hparams.batch_size
    train_window = hparams.train_window
    # NOTE: observed values on the reference run:
    # eval_k=43690, eval_batch_size=163, batch_size=128, train_window=283
    # print("eval_k = %d,eval_batch_size = %d,eval_pct = %d,batch_size = %d,train_window = %d" %(eval_k,eval_batch_size,eval_pct,batch_size,train_window))
    tf.reset_default_graph()
    if seed:
        tf.set_random_seed(seed)

    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars("data/vars")
        # NOTE: on the reference run `inp` is a feeder.FeederVars holding the dataset
        # tensors, e.g. hits (145036, 805), lagged_ix (867, 4), page_map (52752, 4),
        # page_ix, pf_agent/pf_country/pf_site, page_popularity, year/quarter_autocorr,
        # dow (867, 2), plus scalars: features_days=867, data_days=805, n_pages=145036,
        # data_start='2015-07-01', data_end/features_end timestamps.
        # if True:
        if side_split:
            splitter = Splitter(page_features(inp), inp.page_map, 3, train_sampling=train_sampling,
                                test_sampling=eval_sampling, seed=seed)
        else:
            splitter = FakeSplitter(page_features(inp), 3, seed=seed, test_sampling=eval_sampling)

    real_train_pages = splitter.splits[0].train_size
    real_eval_pages = splitter.splits[0].test_size

    items_per_eval = real_eval_pages * eval_pct
    eval_batches = int(np.ceil(items_per_eval / eval_batch_size))
    steps_per_epoch = real_train_pages // batch_size
    eval_every_step = int(round(steps_per_epoch * eval_pct))
    # NOTE: reference run: real_train_pages=145036, real_eval_pages=145036,
    # items_per_eval=14503, eval_batches=89, steps_per_epoch=1133, eval_every_step=113
    # -- i.e. each epoch has 1133 steps and metrics are evaluated every 113 steps.
    # print("real_train_pages = %d,real_eval_pages= %d,items_per_eval= %d,eval_batches= %d,steps_per_epoch= %d,eval_every_step= %d; eval_pct" % (
    #     real_train_pages, real_eval_pages, items_per_eval, eval_batches, steps_per_epoch, eval_every_step,eval_pct
    # ))
    # return
    # eval_every_step = int(round(items_per_eval * train_sampling / batch_size))
    # get_or_create_global_step returns (creating if necessary) the global-step tensor.
    global_step = tf.train.get_or_create_global_step()
    # tf.assign_add(ref, value): updates ref in place, i.e. ref = ref + value.
    inc_step = tf.assign_add(global_step, 1)

    all_models: List[ModelTrainerV2] = []

    def create_model(scope, index, prefix, seed):
        # Builds the model for split `index`. The input pipelines are wired into the
        # model at construction time (the data is fed via the pipes, not per-call).
        with tf.variable_scope('input') as inp_scope:
            with tf.device("/cpu:0"):
                split = splitter.splits[index]
                pipe = InputPipe(
                    inp, features=split.train_set, n_pages=split.train_size,
                    mode=ModelMode.TRAIN, batch_size=batch_size, n_epoch=None,
                    verbose=verbose,
                    train_completeness_threshold=train_completeness_threshold,
                    predict_completeness_threshold=train_completeness_threshold,
                    train_window=train_window, predict_window=predict_window,
                    rand_seed=seed, train_skip_first=hparams.train_skip_first,
                    back_offset=predict_window if forward_split else 0)
                inp_scope.reuse_variables()
                # NOTE: reference run: side_split=False, forward_split=False -> eval_stages=[]
                if side_split:
                    side_eval_pipe = InputPipe(
                        inp, features=split.test_set, n_pages=split.test_size,
                        mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None,
                        verbose=verbose, predict_window=predict_window,
                        train_completeness_threshold=0.01,
                        predict_completeness_threshold=0, train_window=train_window,
                        rand_seed=seed, runs_in_burst=eval_batches,
                        back_offset=predict_window * (2 if forward_split else 1))
                else:
                    side_eval_pipe = None
                if forward_split:
                    forward_eval_pipe = InputPipe(
                        inp, features=split.test_set, n_pages=split.test_size,
                        mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None,
                        verbose=verbose, predict_window=predict_window,
                        train_completeness_threshold=0.01,
                        predict_completeness_threshold=0, train_window=train_window,
                        rand_seed=seed, runs_in_burst=eval_batches,
                        back_offset=predict_window)
                else:
                    forward_eval_pipe = None
        avg_sgd = asgd_decay is not None
        #asgd_decay = 0.99 if avg_sgd else None
        train_model = Model(pipe, hparams, is_train=True, graph_prefix=prefix,
                            asgd_decay=asgd_decay, seed=seed)
        scope.reuse_variables()

        eval_stages = []
        if side_split:
            # print('2 side_split side_eval_model')
            side_eval_model = Model(
                side_eval_pipe, hparams, is_train=False,
                #loss_mask=np.concatenate([np.zeros(50, dtype=np.float32), np.ones(10, dtype=np.float32)]),
                seed=seed)
            # print("2 side_eval_model -- 2")
            # Stage enum values: TRAIN=0, EVAL_SIDE=1, EVAL_FRWD=2, EVAL_SIDE_EMA=3, EVAL_FRWD_EMA=4
            eval_stages.append((Stage.EVAL_SIDE, side_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_SIDE_EMA, side_eval_model))
        if forward_split:
            # print("3 forward_split forward_eval_model")
            # tf.reset_default_graph()
            forward_eval_model = Model(forward_eval_pipe, hparams, is_train=False, seed=seed)
            # print("3 forward_split forward_eval_model -- 2")
            eval_stages.append((Stage.EVAL_FRWD, forward_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_FRWD_EMA, forward_eval_model))

        if write_summaries:
            summ_path = f"{logdir}/{name}_{index}"
            # print("write_summaries summ_path",summ_path)
            if os.path.exists(summ_path):
                shutil.rmtree(summ_path)
            summ_writer = tf.summary.FileWriter(summ_path)  # , graph=tf.get_default_graph()
        else:
            summ_writer = None
        if do_eval and forward_split:
            stop_metric = lambda metrics: metrics[Stage.EVAL_FRWD]['SMAPE'].avg_epoch
        else:
            stop_metric = None
        # NOTE: reference run: side_split=False, forward_split=False,
        # eval_stages=[], stop_metric=None, patience=2, index=0
        # print(f"side_split: {side_split}; forward_split:{forward_split}; summ_writer={summ_writer};"
        #       f"eval_stages: {eval_stages}; stop_metric={stop_metric}; patience={patience}; index={index}")
        return ModelTrainerV2(train_model, eval_stages, index, patience=patience,
                              stop_metric=stop_metric,
                              summary_writer=summ_writer)

    # NOTE: reference run uses n_models == 3
    if n_models == 1:
        with tf.device(f"/gpu:{gpu}"):
            scope = tf.get_variable_scope()
            all_models = [create_model(scope, 0, None, seed=seed)]
    else:
        for i in range(n_models):
            device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}"
            with tf.device(device):
                prefix = f"m_{i}"
                with tf.variable_scope(prefix) as scope:
                    all_models.append(
                        create_model(scope, i, prefix=prefix, seed=seed + i))
    # inc_step = tf.assign_add(global_step, 1) -- shared step counter for all trainers
    trainer = MultiModelTrainer(all_models, inc_step)
    # return
    # NOTE: reference run: save_best_model=False, save_from_step=10500
    # print("save_best_model or save_from_step: ", save_best_model, save_from_step)
    if save_best_model or save_from_step:
        saver_path = f'data/cpt/{name}'
        # e.g. saver_path: data/cpt/s32
        # print("saver_path: ",saver_path)
        if os.path.exists(saver_path):
            shutil.rmtree(saver_path)
        os.makedirs(saver_path)
        # max_to_keep: number of recent checkpoints to retain (tf default is 5).
        saver = tf.train.Saver(max_to_keep=10, name='train_saver')
    else:
        saver = None
    # EMA decay for averaged SGD; ASGD is disabled when asgd_decay is None.
    avg_sgd = asgd_decay is not None
    # NOTE: reference run: asgd_decay=0.99 -> avg_sgd=True
    # print(f"asgd_decay={asgd_decay}; avg_sgd={avg_sgd}")
    if avg_sgd:
        from itertools import chain

        def ema_vars(model):
            ema = model.train_model.ema
            # average_name() gives access to the shadow (EMA) variables' names.
            return {
                ema.average_name(v): v
                for v in model.train_model.ema._averages
            }

        ema_names = dict(
            chain(*[ema_vars(model).items() for model in all_models]))
        # NOTE: ema_names maps shadow names like
        #   'm_0/m_0/<layer>/ExponentialMovingAverage' -> live tf.Variable 'm_0/<layer>:0'
        # for every trainable (cudnn_gru opaque_kernel, fingerprint convnet conv1d_0..5
        # kernels/biases, fc_convnet fc_encoder/out_encoder, attn_focus, gru_cell
        # w_ru/b_ru/w_c/b_c, decoder_output_proj), repeated per model m_0..m_2.
        # print(f"ema_names={ema_names}")
        #ema_names = all_models[0].train_model.ema.variables_to_restore()
        ema_loader = tf.train.Saver(var_list=ema_names, max_to_keep=1, name='ema_loader')
        ema_saver = tf.train.Saver(max_to_keep=1, name='ema_saver')
    else:
        ema_loader = None

    init = tf.global_variables_initializer()

    # print(f"forward_split={forward_split}; do_eval={do_eval}; side_split={side_split}")
    if forward_split and do_eval:
        eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE')
        eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE')
    else:
        eval_smape = DummyMetric()
        eval_mae = DummyMetric()

    if side_split and do_eval:
        eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE')
        eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE')
    else:
        eval_mae_side = DummyMetric()
        eval_smape_side = DummyMetric()

    train_smape = trainer.metric(Stage.TRAIN, 'SMAPE')
    train_mae = trainer.metric(Stage.TRAIN, 'MAE')
    grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm')
    eval_stages = []
    ema_eval_stages = []
    if forward_split and do_eval:
        eval_stages.append(Stage.EVAL_FRWD)
        ema_eval_stages.append(Stage.EVAL_FRWD_EMA)
    if side_split and do_eval:
        eval_stages.append(Stage.EVAL_SIDE)
        ema_eval_stages.append(Stage.EVAL_SIDE_EMA)
    # NOTE: reference run: eval_stages=[], ema_eval_stages=[]
    # print(f"eval_stages={eval_stages}; ema_eval_stages={ema_eval_stages}")

    # gpu_options=tf.GPUOptions(allow_growth=False),
    with tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True,
                                  gpu_options=tf.GPUOptions(
                                      allow_growth=gpu_allow_growth))) as sess:
        sess.run(init)
        # pipe.load_vars(sess)
        # restore() loads the dataset tensors held by `inp` into this session.
        inp.restore(sess)
        for model in all_models:
            # init() prepares each trainer's pipeline (e.g. runs its iterator init op);
            # the data itself is fed through the InputPipes wired in at build time.
            model.init(sess)
        # if beholder:
        #    visualizer = Beholder(session=sess, logdir=summ_path)
        step = 0
        prev_top = np.inf
        best_smape = np.inf
        # Contains best value (first item) and subsequent values
        best_epoch_smape = []

        for epoch in range(max_epoch):
            # n_steps = pusher.n_pages // batch_size
            if tqdm:
                # trange(i) is shorthand for tqdm(range(i)); desc= sets the text to the
                # left of the bar; leave=False discards the bar when the epoch finishes.
                tqr = trange(steps_per_epoch, desc="%2d" % (epoch + 1), leave=False)
            else:
                tqr = range(steps_per_epoch)

            for _ in tqr:
                try:
                    # print("PRINT step = trainer.train_step")
                    # The actual training happens in this single call.
                    step = trainer.train_step(sess, epoch)
                    # if epoch == 0:
                    #     print(f"step={step}, _={_}, epoch = {epoch}")
                except tf.errors.OutOfRangeError:
                    break
                # if beholder:
                #   if step % 5 == 0:
                #     # noinspection PyUnboundLocalVariable
                #     visualizer.update()
                # Evaluate periodically within the epoch (eval_every_step=113 on the
                # reference run, i.e. ~10 evals per 1133-step epoch).
                if step % eval_every_step == 0:
                    # NOTE: reference run: eval_stages=[], save_best_model=False,
                    # save_from_step=10500, avg_sgd=True, ema_eval_stages=[]
                    # print(f"eval_stages={eval_stages};save_best_model={save_best_model}; save_from_step={save_from_step}; avg_sgd={avg_sgd}; ema_eval_stages={ema_eval_stages}")
                    if eval_stages:
                        trainer.eval_step(sess, epoch, step, eval_batches, stages=eval_stages)

                    if save_best_model and epoch > 0 and eval_smape.last < best_smape:
                        best_smape = eval_smape.last
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)
                    if save_from_step and step >= save_from_step:
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)

                    if avg_sgd and ema_eval_stages:
                        ema_saver.save(sess, 'data/cpt_tmp/ema', write_meta_graph=False)
                        # restore ema-backed vars
                        ema_loader.restore(sess, 'data/cpt_tmp/ema')

                        trainer.eval_step(sess, epoch, step, eval_batches, stages=ema_eval_stages)
                        # restore normal vars
                        ema_saver.restore(sess, 'data/cpt_tmp/ema')

                MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last, train_mae.last)
                improvement = '↑' if eval_smape.improved else ' '
                SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last,
                                              eval_smape_side.last, train_smape.last)
                if tqdm:
                    # set_postfix() sets the info displayed to the right of the bar
                    # (set_description() would set the left side).
                    tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE)
                if not trainer.has_active() or (max_steps and step > max_steps):
                    break

            if tqdm:
                tqr.close()
            trainer.end_epoch()
            if not best_epoch_smape or eval_smape.avg_epoch < best_epoch_smape[0]:
                best_epoch_smape = [eval_smape.avg_epoch]
            else:
                best_epoch_smape.append(eval_smape.avg_epoch)

            current_top = eval_smape.top
            if prev_top > current_top:
                prev_top = current_top
                has_best_indicator = '↑'
            else:
                has_best_indicator = ' '
            status = "%2d: Best top SMAPE=%.3f%s (%s)" % (
                epoch + 1, current_top, has_best_indicator,
                ",".join(["%.3f" % m.top for m in eval_smape.metrics]))

            if trainer.has_active():
                status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d am" % \
                          (eval_mae.best_epoch, eval_mae_side.best_epoch,
                           eval_smape.best_epoch, eval_smape_side.best_epoch,
                           eval_mae.avg_epoch, eval_mae_side.avg_epoch,
                           eval_smape.avg_epoch, eval_smape_side.avg_epoch,
                           trainer.has_active())
            else:
                print("Early stopping!", file=sys.stderr)
                break
            if max_steps and step > max_steps:
                print("Max steps calculated", file=sys.stderr)
                break
            sys.stderr.flush()
            # NOTE: observed without eval stages: best_epoch_smape=[nan],
            # eval_smape.avg_epoch=nan, trainer.has_active()=3, prev_top=inf, current_top=nan
            # print(f"best_epoch_smape={best_epoch_smape}; eval_smape.avg_epoch={eval_smape.avg_epoch}; "
            #       f"trainer.has_active()={trainer.has_active()}; prev_top={prev_top}; current_top={current_top}")

    # noinspection PyUnboundLocalVariable
    return np.mean(best_epoch_smape, dtype=np.float64)