def create_or_load_hparams(default_hparams, hparams_path):
  """Create hparams from defaults, optionally overridden by hparams_path."""
  hparams = utils.maybe_parse_standard_hparams(default_hparams, hparams_path)
  hparams = extend_hparams(hparams)
  # Print HParams
  utils.print_hparams(hparams)
  return hparams
def create_or_load_hparams(out_dir, default_hparams, hparams_path, save_hparams=True):
  """Create hparams or load hparams from out_dir."""
  print('[new hparams]\n')
  hparams = default_hparams
  hparams = utils.maybe_parse_standard_hparams(hparams, hparams_path)
  hparams = extend_hparams(hparams)
  '''
  hparams = utils.load_hparams(out_dir)
  if not hparams:
    print('[new hparams]\n')
    hparams = default_hparams
    hparams = utils.maybe_parse_standard_hparams(
        hparams, hparams_path)
    hparams = extend_hparams(hparams)
  else:
    print('[load hparams]\n')
    hparams = ensure_compatible_hparams(hparams, default_hparams, hparams_path)
  '''

  # Save HParams
  if save_hparams:
    utils.save_hparams(out_dir, hparams)
    for metric in hparams.metrics:
      utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"), hparams)

  # Print HParams
  utils.print_hparams(hparams)
  return hparams
def create_or_load_hparams(load_dir, default_hparams, hparams_path, save_hparams):
  """Create hparams or load hparams from load_dir."""
  hparams = utils.load_hparams(load_dir)
  if not hparams:
    hparams = default_hparams
    # Override hparams values with existing standard hparams config
    hparams = utils.maybe_parse_standard_hparams(hparams, hparams_path)
    hparams = process_input_path(hparams)
    hparams = extend_hparams(hparams)
  else:
    hparams = ensure_compatible_hparams(hparams, default_hparams, hparams_path)
    hparams = process_input_path(hparams)

  # Save HParams
  if save_hparams:
    utils.save_hparams(default_hparams.out_dir, hparams)
    for metric in hparams.metrics:
      utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"), hparams)

  # Print HParams
  utils.print_hparams(hparams)
  return hparams
def create_or_load_hparams(out_dir, default_hparams, flags):
  """Create hparams or load hparams from out_dir."""
  hparams = utils.load_hparams(out_dir, verbose=not flags.chat)
  if not hparams:
    # Parse the ones from the command line
    hparams = default_hparams
    hparams = utils.maybe_parse_standard_hparams(
        hparams, flags.hparams_path, verbose=not flags.chat)
    hparams = extend_hparams(hparams)
  else:
    hparams = ensure_compatible_hparams(hparams, default_hparams, flags)

  # Save HParams
  utils.save_hparams(out_dir, hparams, verbose=not flags.chat)
  for metric in hparams.metrics:
    utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"), hparams,
                       verbose=not flags.chat)

  # Print HParams
  if not flags.chat:
    utils.print_hparams(hparams)
  return hparams
def create_or_load_hparams(out_dir, default_hparams, hparams_path):
  """Create hparams or load hparams from out_dir."""
  hparams = utils.load_hparams(out_dir)
  # print(hparams); assert False  # debug
  if not hparams:
    hparams = default_hparams
    hparams = utils.maybe_parse_standard_hparams(
        hparams, hparams_path)
    hparams = extend_hparams(hparams)
  else:
    hparams = ensure_compatible_hparams(hparams, default_hparams, hparams_path)

  if FLAGS.inference_input_file:
    hparams.src_vocab_file = os.path.join(out_dir, "../data/vocab.cor")
    hparams.tgt_vocab_file = os.path.join(out_dir, "../data/vocab.man")
    hparams.out_dir = out_dir
    hparams.best_bleu_dir = os.path.join(out_dir, "best_bleu")
    hparams.train_prefix = os.path.join(out_dir, "../data/train")
    hparams.dev_prefix = os.path.join(out_dir, "../data/dev_test")
    hparams.vocab_prefix = os.path.join(out_dir, "../data/vocab")
    hparams.rc_vocab_file = os.path.join(out_dir, "../data/vocab.cor")
    hparams.test_prefix = os.path.join(out_dir, "../data/test")

  # Save HParams
  utils.save_hparams(out_dir, hparams)
  for metric in hparams.metrics:
    utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"), hparams)

  # Print HParams
  utils.print_hparams(hparams)
  return hparams
def create_or_load_hparams(out_dir, default_hparams, save_hparams=True):
  """Create hparams or load hparams from out_dir."""
  hparams = utils.load_hparams(out_dir)
  if not hparams:
    hparams = default_hparams
    hparams = extend_hparams(hparams)

  # Save HParams
  if save_hparams:
    utils.save_hparams(out_dir, hparams)

  # Print HParams
  utils.print_hparams(hparams)
  return hparams
def create_or_load_hparams(out_dir, default_hparams, hparams_path):
  """Create hparams or load hparams from out_dir."""
  hparams = utils.load_hparams(out_dir)
  if not hparams:
    hparams = default_hparams
    hparams = utils.maybe_parse_standard_hparams(hparams, hparams_path)
    hparams = extend_hparams(hparams)
  else:
    hparams = ensure_compatible_hparams(hparams, default_hparams, hparams_path)

  # Save HParams
  utils.save_hparams(out_dir, hparams)
  for metric in hparams.metrics:
    # Save hparams under each metric's best-checkpoint directory
    # (the original looked up "best_bleu_dir" for every metric).
    utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"), hparams)

  # Print HParams
  utils.print_hparams(hparams)
  return hparams
def create_or_load_hparams(out_dir, default_hparams, hparams_path, save_hparams=True):
  """Create hparams or load hparams from out_dir."""
  hparams = utils.load_hparams(out_dir)
  if not hparams:
    hparams = default_hparams
    hparams = utils.maybe_parse_standard_hparams(hparams, hparams_path)
    hparams = extend_hparams(hparams)
  else:
    hparams = ensure_compatible_hparams(hparams, default_hparams, hparams_path)

  if save_hparams:
    utils.save_hparams(out_dir, hparams)
    for metric in hparams.metrics:
      utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"), hparams)

  utils.print_hparams(hparams)
  return hparams
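All of the variants above follow the same create-or-load-then-save shape and are invoked the same way from a driver script. As a rough, hypothetical usage sketch (the `flags` object, `create_hparams`, and the directory layout are placeholders, not taken from any one snippet):

# Hypothetical driver: shows how a create_or_load_hparams variant is typically
# wired up; `flags`, `create_hparams`, and the paths are illustrative only.
def setup_hparams(flags):
  default_hparams = create_hparams(flags)  # defaults built from CLI flags
  hparams = create_or_load_hparams(
      flags.output_dir,        # reuse previously saved hparams if present
      default_hparams,
      flags.hparams_path,      # optional standard-hparams overrides
      save_hparams=True)       # persist for later runs
  return hparams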
def main(unused_argv):
  experiment_start = time.time()
  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.use_fp16 and FLAGS.use_dist_strategy:
    raise ValueError("use_fp16 and use_dist_strategy aren't compatible")
  if FLAGS.use_fp16 + FLAGS.use_amp + FLAGS.use_fastmath > 1:
    raise ValueError("Only one of use_fp16, use_amp, use_fastmath can be set")

  if FLAGS.use_amp:
    utils.print_out('Enabling TF-AMP')
    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
  if FLAGS.use_fastmath:
    utils.print_out('Enabling FastMath')
    os.environ["TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32"] = '1'
    os.environ["TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32"] = '1'
    os.environ["TF_ENABLE_CUDNN_RNN_TENSOR_OP_MATH_FP32"] = '1'

  # Set up hacky envvars.
  # Hack that affects Defun in attention_wrapper.py
  active_xla_option_nums = np.sum([FLAGS.use_xla, FLAGS.use_autojit_xla,
                                   FLAGS.xla_compile])
  if active_xla_option_nums > 1:
    raise ValueError(
        "Only one of use_xla, xla_compile, use_autojit_xla can be set")

  os.environ["use_xla"] = str(FLAGS.use_xla).lower()
  if FLAGS.use_xla:
    os.environ["use_defun"] = str(True).lower()
  else:
    os.environ["use_defun"] = str(FLAGS.use_defun).lower()
  utils.print_out("use_defun is %s for attention" % os.environ["use_defun"])

  # TODO(jamesqin): retire this config after Cuda9.1
  os.environ["use_fp32_batch_matmul"] = ("true" if FLAGS.use_fp32_batch_matmul
                                         else "false")
  os.environ["xla_compile"] = "true" if FLAGS.xla_compile else "false"
  os.environ["force_inputs_padding"] = (
      "true" if FLAGS.force_inputs_padding else "false")

  if FLAGS.mode == "train":
    utils.print_out("Running training mode.")
    default_hparams = create_hparams(FLAGS)
    run_main(FLAGS, default_hparams, estimator.train_fn)
  elif FLAGS.mode == "infer" or FLAGS.mode == "translate":
    if FLAGS.mode == "infer":
      utils.print_out("Running inference mode.")
      translate_mode = False
    else:
      utils.print_out("Running translate mode on file {}.".format(
          FLAGS.translate_file))
      translate_mode = True

    # Random
    random_seed = FLAGS.random_seed
    if random_seed is not None and random_seed > 0:
      utils.print_out("# Set random seed to %d" % random_seed)
      random.seed(random_seed)
      np.random.seed(random_seed)
      tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = FLAGS.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
      utils.print_out("# Creating output directory %s ..." % output_dir)
      tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    default_hparams = create_hparams(FLAGS)
    default_hparams.num_buckets = 1
    # The estimator model_fn is written in a way allowing train hparams to be
    # passed in infer mode.
    hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
    utils.print_out("infer_hparams:")
    utils.print_hparams(hparams)

    if translate_mode:
      tokenize(hparams, hparams.translate_file, hparams.translate_file + ".tok")

    eval_sentences, eval_src_tokens, _ = iterator_utils.get_effective_epoch_size(hparams, train=False)

    # Run evaluation when there's a new checkpoint
    tf.logging.info("Starting to evaluate...")
    eval_start = time.time()
    _, (eval_speed, eval_latencies), eval_output_tokens = estimator.eval_fn(
        hparams, hparams.ckpt, only_translate=translate_mode)
    eval_end = time.time()
    eval_delta = eval_end - eval_start
    utils.print_out(
        "eval time for ckpt: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
        (eval_delta / 60., eval_speed,
         eval_speed * (eval_src_tokens + eval_output_tokens) / eval_sentences),
        f=sys.stderr)
    for lat in sorted(eval_latencies):
      utils.print_out("eval latency_%s for ckpt: %.2f ms" %
                      (lat, eval_latencies[lat] * 1000))

    if translate_mode:
      detokenize(hparams, hparams.translate_file + ".trans.tok",
                 hparams.translate_file + ".trans")
  else:
    assert FLAGS.mode == "train_and_eval"
    utils.print_out("Running train and eval mode.")

    # Random
    random_seed = FLAGS.random_seed
    if random_seed is not None and random_seed > 0:
      utils.print_out("# Set random seed to %d" % random_seed)
      random.seed(random_seed)
      np.random.seed(random_seed)
      tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = FLAGS.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
      utils.print_out("# Creating output directory %s ..." % output_dir)
      tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    default_hparams = create_hparams(FLAGS)
    hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
    utils.print_out("training hparams:")
    utils.print_hparams(hparams)
    with tf.gfile.GFile(os.path.join(output_dir, "train_hparams.txt"), "w") as f:
      f.write(utils.serialize_hparams(hparams) + "\n")

    # The estimator model_fn is written in a way allowing train hparams to be
    # passed in infer mode.
    infer_hparams = tf.contrib.training.HParams(**hparams.values())
    infer_hparams.num_buckets = 1
    utils.print_out("infer_hparams:")
    utils.print_hparams(infer_hparams)
    with tf.gfile.GFile(os.path.join(output_dir, "infer_hparams.txt"), "w") as f:
      f.write(utils.serialize_hparams(infer_hparams) + "\n")

    epochs = 0
    should_stop = epochs >= FLAGS.max_train_epochs

    train_sentences, train_src_tokens, train_tgt_tokens = iterator_utils.get_effective_epoch_size(hparams)
    eval_sentences, eval_src_tokens, _ = iterator_utils.get_effective_epoch_size(hparams, train=False)

    while not should_stop:
      utils.print_out("Starting epoch %d" % epochs)
      try:
        train_start = time.time()
        train_speed, _ = estimator.train_fn(hparams)
      except tf.errors.OutOfRangeError:
        utils.print_out("training hits OutOfRangeError", f=sys.stderr)

      train_end = time.time()
      train_delta = train_end - train_start
      utils.print_out(
          "training time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
          (epochs + 1, train_delta / 60., train_speed,
           train_speed * (train_src_tokens + train_tgt_tokens) / train_sentences),
          f=sys.stderr)

      # This is probably sub-optimal, doing eval per-epoch
      eval_start = time.time()
      bleu_score, (eval_speed, eval_latencies), eval_output_tokens = estimator.eval_fn(infer_hparams)
      eval_end = time.time()
      eval_delta = eval_end - eval_start
      utils.print_out(
          "eval time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
          (epochs + 1, eval_delta / 60., eval_speed,
           eval_speed * (eval_src_tokens + eval_output_tokens) / eval_sentences),
          f=sys.stderr)
      for lat in sorted(eval_latencies):
        utils.print_out("eval latency_%s for epoch %d: %.2f ms" %
                        (lat, epochs + 1, eval_latencies[lat] * 1000))

      if FLAGS.debug or (FLAGS.target_bleu is not None and
                         bleu_score > FLAGS.target_bleu):
        should_stop = True
        utils.print_out(
            "Stop job since target bleu is reached at epoch %d ." % epochs,
            f=sys.stderr)

      epochs += 1
      if epochs >= FLAGS.max_train_epochs:
        should_stop = True
        utils.print_out("Stop job since max_train_epochs is reached.",
                        f=sys.stderr)

  experiment_end = time.time()
  utils.print_out('Experiment took {} min'.format(
      (experiment_end - experiment_start) / 60))
def main(unused_argv):
  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.use_fp16 and FLAGS.use_dist_strategy:
    raise ValueError("use_fp16 and use_dist_strategy aren't compatible")

  # Set up hacky envvars.
  # Hack that affects Defun in attention_wrapper.py
  active_xla_option_nums = np.sum([FLAGS.use_xla, FLAGS.use_autojit_xla,
                                   FLAGS.xla_compile])
  if active_xla_option_nums > 1:
    raise ValueError(
        "Only one of use_xla, xla_compile, use_autojit_xla can be set")

  os.environ["use_xla"] = str(FLAGS.use_xla).lower()
  if FLAGS.use_xla:
    os.environ["use_defun"] = str(True).lower()
  else:
    os.environ["use_defun"] = str(FLAGS.use_defun).lower()
  utils.print_out("use_defun is %s for attention" % os.environ["use_defun"])

  # TODO(jamesqin): retire this config after Cuda9.1
  os.environ["use_fp32_batch_matmul"] = ("true" if FLAGS.use_fp32_batch_matmul
                                         else "false")
  os.environ["xla_compile"] = "true" if FLAGS.xla_compile else "false"
  os.environ["force_inputs_padding"] = (
      "true" if FLAGS.force_inputs_padding else "false")

  if FLAGS.mode == "train":
    utils.print_out("Running training mode.")
    FLAGS.num_buckets = 5
    default_hparams = create_hparams(FLAGS)
    run_main(FLAGS, default_hparams, estimator.train_fn)
  elif FLAGS.mode == "infer":
    utils.print_out("Running inference mode.")

    # Random
    random_seed = FLAGS.random_seed
    if random_seed is not None and random_seed > 0:
      utils.print_out("# Set random seed to %d" % random_seed)
      random.seed(random_seed)
      np.random.seed(random_seed)
      tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = FLAGS.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
      utils.print_out("# Creating output directory %s ..." % output_dir)
      tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    default_hparams = create_hparams(FLAGS)
    default_hparams.num_buckets = 1
    # The estimator model_fn is written in a way allowing train hparams to be
    # passed in infer mode.
    hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
    utils.print_out("infer_hparams:")
    utils.print_hparams(hparams)

    # Run evaluation when there's a new checkpoint
    for i, ckpt in enumerate(
        evaluation_utils.get_all_checkpoints(FLAGS.output_dir)):
      tf.logging.info("Starting to evaluate...")
      eval_start = time.time()
      bleu_score = estimator.eval_fn(hparams, ckpt)
      eval_end = time.time()
      utils.print_out("eval time for %d th ckpt: %.2f mins" %
                      (i, (eval_end - eval_start) / 60.),
                      f=sys.stderr)
  else:
    assert FLAGS.mode == "train_and_eval"
    utils.print_out("Running train and eval mode.")

    # Random
    random_seed = FLAGS.random_seed
    if random_seed is not None and random_seed > 0:
      utils.print_out("# Set random seed to %d" % random_seed)
      random.seed(random_seed)
      np.random.seed(random_seed)
      tf.set_random_seed(random_seed)

    # Model output directory
    output_dir = FLAGS.output_dir
    if output_dir and not tf.gfile.Exists(output_dir):
      utils.print_out("# Creating output directory %s ..." % output_dir)
      tf.gfile.MakeDirs(output_dir)

    # Load hparams.
    default_hparams = create_hparams(FLAGS)
    default_hparams.num_buckets = 5

    hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path)
    utils.print_out("training hparams:")
    utils.print_hparams(hparams)
    with tf.gfile.GFile(os.path.join(output_dir, "train_hparams.txt"), "w") as f:
      f.write(utils.serialize_hparams(hparams) + "\n")

    # The estimator model_fn is written in a way allowing train hparams to be
    # passed in infer mode.
    infer_hparams = tf.contrib.training.HParams(**hparams.values())
    infer_hparams.num_buckets = 1
    utils.print_out("infer_hparams:")
    utils.print_hparams(infer_hparams)
    with tf.gfile.GFile(os.path.join(output_dir, "infer_hparams.txt"), "w") as f:
      f.write(utils.serialize_hparams(infer_hparams) + "\n")

    epochs = 0
    should_stop = epochs >= FLAGS.max_train_epochs

    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)
    mlperf_log.gnmt_print(key=mlperf_log.EVAL_TARGET, value=hparams.target_bleu)
    while not should_stop:
      utils.print_out("Starting epoch %d" % epochs)
      mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epochs)

      mlperf_log.gnmt_print(
          key=mlperf_log.INPUT_SIZE,
          value=iterator_utils.get_effective_train_epoch_size(hparams))
      mlperf_log.gnmt_print(
          key=mlperf_log.TRAIN_CHECKPOINT,
          value=("Under " + hparams.output_dir))
      try:
        train_start = time.time()
        estimator.train_fn(hparams)
      except tf.errors.OutOfRangeError:
        utils.print_out("training hits OutOfRangeError", f=sys.stderr)

      train_end = time.time()
      utils.print_out("training time for epoch %d: %.2f mins" %
                      (epochs, (train_end - train_start) / 60.),
                      f=sys.stderr)

      # This is probably sub-optimal, doing eval per-epoch
      mlperf_log.gnmt_print(key=mlperf_log.EVAL_START)
      eval_start = time.time()
      bleu_score = estimator.eval_fn(infer_hparams)
      eval_end = time.time()
      utils.print_out("eval time for epoch %d: %.2f mins" %
                      (epochs, (eval_end - eval_start) / 60.),
                      f=sys.stderr)
      mlperf_log.gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                            value={"epoch": epochs, "value": bleu_score})
      mlperf_log.gnmt_print(key=mlperf_log.EVAL_STOP, value=epochs)

      if FLAGS.debug or bleu_score > FLAGS.target_bleu:
        should_stop = True
        utils.print_out(
            "Stop job since target bleu is reached at epoch %d ." % epochs,
            f=sys.stderr)
        mlperf_log.gnmt_print(mlperf_log.RUN_STOP, {"success": True})

      if epochs >= FLAGS.max_train_epochs:
        should_stop = True
        utils.print_out("Stop job since max_train_epochs is reached.",
                        f=sys.stderr)
        mlperf_log.gnmt_print(mlperf_log.RUN_STOP, {"success": False})
      epochs += 1

    mlperf_log.gnmt_print(key=mlperf_log.RUN_FINAL)
def main(_):
  ####################################################################################
  feats = Features()

  # hyper params
  hparam = tf.contrib.training.HParams(
      model=cfg.model,
      norm=True,  # use batch norm
      seed=cfg.seed,
      batch_norm_decay=0.9,
      hidden_size=[1024, 512],
      cross_layer_sizes=[128, 128],
      k=16,  # multi_features embedding dim
      single_k=16,  # single_features embedding dim
      max_length=100,  # hash length
      cross_hash_num=int(5e6),
      single_hash_num=int(5e6),
      multi_hash_num=int(1e6),
      batch_size=1024,
      infer_batch_size=2**14,
      optimizer="adam",
      dropout=0,
      kv_batch_num=20,
      learning_rate=0.00005,
      num_display_steps=100,  # every number of steps to display results
      num_save_steps=1000,  # every number of steps to save model
      num_eval_steps=2000,  # every number of steps to evaluate model
      epoch=10,  # train epoch
      metric='softmax_loss',
      activation=['relu', 'relu', 'relu'],
      init_method='tnormal',
      cross_activation='relu',
      init_value=0.001,
      single_features=None,
      cross_features=None,
      multi_features=feats.multi_features,
      dense_features=feats.dense_features,
      kv_features=None,
      label=feats.label_features,
      label_dim=4,  # output label dim (gender - 1, age - 4, age_all - 10)
      label_name='age',
      model_name=cfg.model,
      checkpoint_dir=os.path.join(cfg.data_path, FLAGS.log_dir))
  utils.print_hparams(hparam)

  ####################################################################################
  if FLAGS.mode == 'train':
    # read data
    train_log = read_all_feature_data(feats, label_name=hparam.label_name)
    # build model
    model = model_utils.build_model(hparam)
    # train model
    model.train(train_log, None)

  ####################################################################################
  elif FLAGS.mode == 'test':
    # read data
    test_log = read_all_feature_data(feats, mode='test')
    # build model
    model = model_utils.build_model(hparam)
    # infer model
    preds = model.infer(test_log)  # shape: [length, 20]
    if hparam.label_name == 'age':
      _ = output_labels_v2(test_log, preds,
                           pred_path=os.path.join(cfg.data_path, FLAGS.log_dir,
                                                  'preds.csv'))
    elif hparam.label_name == 'gender':
      _ = output_labels_v3(test_log, preds,
                           pred_path=os.path.join(cfg.data_path, FLAGS.log_dir,
                                                  'preds.csv'))

    # K_fold = []
    # for i in range(5):
    #   if i == 4:
    #     tmp = index
    #   else:
    #     tmp = random.sample(index, int(1.0 / 5 * train.shape[0]))
    #   index = index - set(tmp)
    #   print("Number:", len(tmp))
    #   K_fold.append(tmp)
    #
    # train_preds = np.zeros(len(train))
    # test_preds = np.zeros(len(test))
    # scores = []
    # train['gold'] = True
    # for i in range(5):
    #   print("Fold", i)
    #   dev_index = K_fold[i]
    #   train_index = []
    #   for j in range(5):
    #     if j != i:
    #       train_index += K_fold[j]
    #   for k in range(2):
    #     model = model_utils.build_model(hparam)
    #     score = model.train(train.loc[train_index], train.loc[dev_index])
    #     scores.append(score)
    #     train_preds[list(dev_index)] += model.infer(train.loc[list(dev_index)]) / 2
    #     test_preds += model.infer(test) / 10
    #     print(np.mean((np.exp(test_preds * 10 / (i * 2 + k + 1)) - 1)))
    #     try:
    #       del model
    #       gc.collect()
    #     except:
    #       pass
    # train_preds = np.exp(train_preds) - 1
    # test_preds = np.exp(test_preds) - 1

  ####################################################################################
  elif FLAGS.mode == 'val':
    # read data
    train_log, val_log = read_all_feature_data(feats, mode='val')
    # build model
    model = model_utils.build_model(hparam)
    # train model
    model.train(train_log, None, is_val=True)
    # infer model
    preds = model.infer(val_log)  # shape: [length, 20]
    if hparam.label_name == 'age':
      val_log = output_labels_v2(val_log, preds,
                                 pred_path=os.path.join(cfg.data_path,
                                                        FLAGS.log_dir,
                                                        'val_preds.csv'),
                                 is_train=True)
    elif hparam.label_name == 'gender':
      val_log = output_labels_v3(val_log, preds,
                                 pred_path=os.path.join(cfg.data_path,
                                                        FLAGS.log_dir,
                                                        'val_preds.csv'),
                                 is_train=True)

    # print results
    age_acc = sum((val_log.age == val_log.predicted_age).astype(np.int)) / len(val_log)
    gender_acc = sum((val_log.gender == val_log.predicted_gender).astype(np.int)) / len(val_log)
    print("Final Age Accuracy: %.4f" % age_acc)
    print("Final Gender Accuracy: %.4f" % gender_acc)
def build_graph(self, features, labels, mode, params):
  """docstring."""
  del labels, params
  misc_utils.print_out("Running fast mode_fn")

  hparams = self.hparams

  # Create global_step
  tf.train.get_or_create_global_step()

  if mode == tf.contrib.learn.ModeKeys.INFER:
    # Doing inference only on one GPU
    inf_hparams = tf.contrib.training.HParams(**hparams.values())
    inf_hparams.set_hparam("num_gpus", 1)
    # Inference is done in fp32 and in the same way as that of dist_strategy.
    inf_hparams.set_hparam("use_fp16", False)

    misc_utils.print_out("inference hparmas:")
    misc_utils.print_hparams(inf_hparams)

    # Create variable_mgr
    var_mgr = self._get_variable_mgr(inf_hparams)

    with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
        "tower_0"), var_mgr.create_outer_variable_scope(0):
      model = gnmt_model.GNMTModel(inf_hparams, mode=mode, features=features)
      sample_ids = model.sample_id
      reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
          inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
      sample_words = reverse_target_vocab_table.lookup(tf.to_int64(sample_ids))
      # make sure outputs is of shape [batch_size, time] or [beam_width,
      # batch_size, time] when using beam search.
      if inf_hparams.time_major:
        sample_words = tf.transpose(sample_words)
      elif sample_words.shape.ndims == 3:
        # beam search output in [batch_size, time, beam_width] shape.
        sample_words = tf.transpose(sample_words, [2, 0, 1])
      predictions = {"predictions": sample_words}
      # return loss, vars, grads, predictions, train_op, scaffold
      return None, None, None, predictions, None, None
  elif mode == tf.contrib.learn.ModeKeys.TRAIN:
    num_towers = hparams.num_gpus
    # Shard inputs
    tower_features = self._shard_inputs(features, num_towers)
    # Create loss scale vars if necessary
    loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars()

    # Create variable_mgr
    var_mgr = self._get_variable_mgr(hparams)

    # Build per-tower fprop and bprop
    devices = var_mgr.get_devices()
    tower_gradvars = []
    tower_scopes = []
    var_scopes = []
    train_losses = []
    learning_rates = []
    batch_sizes = []
    opts = []

    def fprop_and_bprop(tid):
      """docstring."""
      model = gnmt_model.GNMTModel(hparams, mode=mode,
                                   features=tower_features[tid])
      # sync training.
      assert model.learning_rate is not None
      # The following handles shouldn't be built in when doing manual
      assert model.grad_norm is None
      assert model.update is None
      tower_loss = model.train_loss
      # Only check loss numerics if in fp16
      if hparams.use_fp16 and hparams.check_tower_loss_numerics:
        tower_loss = tf.check_numerics(
            tower_loss, "tower_%d has Inf/NaN loss" % tid)
      # Cast to fp32, otherwise would easily overflow.
      tower_loss = tf.to_float(tower_loss)
      var_params, grads, opt = self._compute_tower_grads(
          tower_loss,
          var_mgr.trainable_variables_on_device(tid, tid),
          model.learning_rate,
          use_fp16=hparams.use_fp16,
          loss_scale=loss_scale,
          colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
      self._print_varinfo(var_params, tid)
      res = [model.train_loss, model.learning_rate, model.batch_size]
      res.extend(grads)
      opts.append(opt)
      return res

    def unpack_fprop_and_bprop_output(output):
      train_loss = output[0]
      learning_rate = output[1]
      batch_size = output[2]
      grads = output[3:]
      return train_loss, learning_rate, batch_size, grads

    with mixed_precision_scope():
      for tid in range(num_towers):
        with tf.device(devices[tid % len(devices)]), tf.name_scope(
            "tower_%s" % tid) as scope:
          tower_scopes.append(scope)
          with var_mgr.create_outer_variable_scope(tid) as var_scope:
            var_scopes.append(var_scope)

            outputs = maybe_xla_compile(hparams, fprop_and_bprop, tid)
            (train_loss, learning_rate, batch_size,
             grads) = unpack_fprop_and_bprop_output(outputs)
            train_losses.append(train_loss)
            learning_rates.append(learning_rate)
            batch_sizes.append(batch_size)
            var_params = var_mgr.trainable_variables_on_device(tid, tid)
            tower_gradvars.append(list(zip(grads, var_params)))

    # Add summaries
    if hparams.show_metrics:
      tf.summary.scalar("learning_rate", learning_rates[0])
      if loss_scale:
        tf.summary.scalar("loss_scale", loss_scale)
        if hparams.enable_auto_loss_scale:
          tf.summary.scalar("loss_scale_normal_steps", loss_scale_normal_steps)
    misc_utils.print_out("Finish building fprop and per-tower bprop.")

    # Aggregate gradients
    # The following compute the aggregated grads for each tower, stored in
    # opaque grad_states structure.
    apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
        tower_gradvars)
    master_grads = None
    master_params = None
    update_ops = []
    for i, device in enumerate(apply_grads_devices):
      with tf.device(device), tf.name_scope(tower_scopes[i]):
        # Get per-tower grads.
        with tf.name_scope("get_gradients_to_apply"):
          avg_gradvars = var_mgr.get_gradients_to_apply(i, grad_states)
          avg_grads = [gv[0] for gv in avg_gradvars]

        # gradients post-processing
        with tf.name_scope("clip_gradients"):
          if hparams.clip_grads:
            clipped_grads, grad_norm = model_helper.gradient_clip(
                avg_grads, max_gradient_norm=hparams.max_gradient_norm)
            # summary the grad on the 1st tower
            if i == 0 and hparams.show_metrics:
              tf.summary.scalar("grad_norm", grad_norm)
              tf.summary.scalar("clipped_grad_norm",
                                tf.global_norm(clipped_grads))
          else:
            clipped_grads = avg_grads
          if i == 0:
            master_grads = clipped_grads

        # Build apply-gradients ops
        clipped_gradvars = list(
            zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
        if i == 0:
          master_params = [gv[1] for gv in avg_gradvars]
        with tf.name_scope("append_gradient_ops"):
          loss_scale_params = variable_mgr_util.AutoLossScaleParams(
              enable_auto_loss_scale=hparams.enable_auto_loss_scale,
              loss_scale=loss_scale,
              loss_scale_normal_steps=loss_scale_normal_steps,
              inc_loss_scale_every_n=hparams.fp16_inc_loss_scale_every_n,
              is_chief=True)
          opt = opts[i]
          var_mgr.append_apply_gradients_ops(grad_states, opt,
                                             clipped_gradvars, update_ops,
                                             loss_scale_params)
    misc_utils.print_out("Finish building grad aggregation.")

    assert len(update_ops) == num_towers
    train_op = tf.group(update_ops)
    with tf.control_dependencies([train_op]):
      global_step = tf.train.get_global_step()
      train_op = global_step.assign_add(1)

    # Compute loss on the first gpu
    # TODO(jamesqin): optimize it?
    with tf.device("gpu:0"):
      loss = misc_utils.weighted_avg(train_losses, batch_sizes)

    # Create local init_ops
    # TODO(jamesqin): handle resource variables!
    # At present if not using mirror strategy, not using resource vars.
    local_init_ops = []
    local_init_op = tf.local_variables_initializer()
    with tf.control_dependencies([local_init_op]):
      local_init_ops.append(var_mgr.get_post_init_ops())
    local_init_ops.extend([local_init_op, tf.tables_initializer()])

    saveable_vars = var_mgr.savable_variables()
    # Add saveables for cudnn vars in master tower.
    saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    saveable_objects = [x for x in saveable_objects if "v0" in x.name]

    misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
    for mv in saveable_vars:
      misc_utils.print_out(mv.name)

    misc_utils.print_out("All global trainable vars(%d): " %
                         len(tf.trainable_variables()))
    for tv in tf.trainable_variables():
      misc_utils.print_out(tv.name)

    misc_utils.print_out("All global vars(%d): " % len(tf.global_variables()))
    for gv in tf.global_variables():
      misc_utils.print_out(gv.name)

    misc_utils.print_out("master backproped params(%d): " % len(master_params))
    for mp in master_params:
      misc_utils.print_out(mp.name)

    # Note the cudnn vars are skipped the init check. :(
    scaffold = tf.train.Scaffold(
        ready_op=tf.report_uninitialized_variables(saveable_vars),
        ready_for_local_init_op=tf.report_uninitialized_variables(
            saveable_vars),
        local_init_op=tf.group(*local_init_ops),
        saver=tf.train.Saver(saveable_vars + saveable_objects,
                             save_relative_paths=True))

    misc_utils.print_out("Finish building model_fn")
    # return loss, vars, grads, predictions, train_op, scaffold
    return loss, master_params, master_grads, None, train_op, scaffold
def main(_):
  ####################################################################################
  feats = Features()

  # hyper params
  hparam = tf.contrib.training.HParams(
      num_classes=1999,  # number of labels
      model=cfg.model,
      norm=True,  # use batch norm
      seed=cfg.seed,
      batch_norm_decay=0.9,
      hidden_size=[1024, 512],
      cross_layer_sizes=[128, 128],
      k=16,  # multi_features embedding dim
      single_k=16,  # single_features embedding dim
      sequence_length=100,  # max sentence length
      embed_size=100,  # embedding size
      cross_hash_num=int(5e6),
      single_hash_num=int(5e6),
      multi_hash_num=int(1e6),
      batch_size=1024,
      infer_batch_size=2**14,
      optimizer="adam",
      dropout=0,
      kv_batch_num=20,
      learning_rate=0.01,
      decay_steps=12000,  # how many steps before decay learning rate
      decay_rate=0.9,  # Rate of decay for learning rate
      num_display_steps=1000,  # every number of steps to display results
      num_save_steps=1000,  # every number of steps to save model
      num_eval_steps=1000,  # every number of steps to evaluate model
      epoch=20,  # train epoch
      metric='softmax_loss',
      activation=['relu', 'relu', 'relu'],
      init_method='tnormal',
      cross_activation='relu',
      init_value=0.001,
      l2_lambda=0.0001,
      single_features=None,
      cross_features=None,
      multi_features=feats.multi_features,
      dense_features=feats.dense_features,
      kv_features=None,
      label=feats.label_features,
      label_dim=1,  # output label dim (gender - 1, age - 4, age_all - 10)
      label_name='gender',
      model_name=cfg.model,
      checkpoint_dir=os.path.join(cfg.data_path, FLAGS.log_dir))
  utils.print_hparams(hparam)

  ####################################################################################
  if FLAGS.mode == 'train':
    # read train data
    train_log = read_all_feature_data(feats, label_name=hparam.label_name)
    # build model
    model = model_utils.build_model(hparam)
    # train model
    model.train(train_log, None)

    # read test data
    test_log = read_all_feature_data(feats, mode='test',
                                     label_name=hparam.label_name)
    # infer model
    preds = model.infer(test_log)  # shape: [length, 20]
    if hparam.label_name == 'age':
      _ = output_labels_v2(test_log, preds,
                           pred_path=os.path.join(cfg.data_path, FLAGS.log_dir,
                                                  'preds.csv'))
    elif hparam.label_name == 'gender':
      _ = output_labels_v3(test_log, preds,
                           pred_path=os.path.join(cfg.data_path, FLAGS.log_dir,
                                                  'preds.csv'))

  ####################################################################################
  elif FLAGS.mode == 'val':
    # read data
    train_log, val_log = read_all_feature_data(feats, mode='val',
                                               label_name=hparam.label_name)
    # build model
    model = model_utils.build_model(hparam)
    # train model
    model.train(train_log, None, is_val=True)
    # infer model
    preds = model.infer(val_log)  # shape: [length, 20]
    if hparam.label_name == 'age':
      val_log = output_labels_v2(val_log, preds,
                                 pred_path=os.path.join(cfg.data_path,
                                                        FLAGS.log_dir,
                                                        'val_preds.csv'),
                                 is_train=True)
    elif hparam.label_name == 'gender':
      val_log = output_labels_v3(val_log, preds,
                                 pred_path=os.path.join(cfg.data_path,
                                                        FLAGS.log_dir,
                                                        'val_preds.csv'),
                                 is_train=True)

    # print results
    age_acc = sum((val_log.age == val_log.predicted_age).astype(np.int)) / len(val_log)
    gender_acc = sum((val_log.gender == val_log.predicted_gender).astype(np.int)) / len(val_log)
    print("Final Age Accuracy: %.4f" % age_acc)
    print("Final Gender Accuracy: %.4f" % gender_acc)

  # ####################################################################################
  # 1. load data (X: list of int, y: int).
  # if os.path.exists(FLAGS.cache_path):  # if the cache file exists, load the vocabulary-indexed data from it
  #   with open(FLAGS.cache_path, 'r') as data_f:
  #     trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
  #     vocab_size = len(vocabulary_index2word)
  # else:
  if 1 == 1:
    # 1. get vocabulary of X and label.
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        simple='simple',
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="biLstmTextRelation")
    vocab_size = len(vocabulary_word2index)
    print("rnn_model.vocab_size:", vocab_size)
    # vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="biLstmTextRelation")
    vocabulary_word2index_label = {'1': 1, '0': 0}
    vocabulary_index2word_label = {0: '0', 1: '1'}
    train, test, _ = load_data(vocabulary_word2index,
                               vocabulary_word2index_label,
                               valid_portion=0.005,
                               training_data_path=FLAGS.traning_data_path)
    # train, test, _ = load_data_multilabel_new_twoCNN(vocabulary_word2index, vocabulary_word2index_label, multi_label_flag=False, traning_data_path=FLAGS.traning_data_path)
    # train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label, multi_label_flag=False, traning_data_path=FLAGS.traning_data_path)
    trainX, trainY = train
    testX, testY = test

    # 2. Data preprocessing. Sequence padding
    print("start padding & transform to one hot...")
    trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    ###############################################################################################
    # with open(FLAGS.cache_path, 'w') as data_f:  # save data to cache file, so we can use it next time quickly.
    #   pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
    ###############################################################################################
    print("trainX[0]:", trainX[0])  # ;print("trainY[0]:", trainY[0])
    # Converting labels to binary vectors
    print("end padding & transform to one hot...")

  # 2. create session.
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  with tf.Session(config=config) as sess:
    # Instantiate Model
    biLstmTR = BiLstm(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                      FLAGS.decay_steps, FLAGS.decay_rate,
                      FLAGS.sequence_length, vocab_size, FLAGS.embed_size,
                      FLAGS.is_training)
    # Initialize Saver
    saver = tf.train.Saver()
    if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
      print("Restoring Variables from Checkpoint for rnn model.")
      saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
    else:
      print('Initializing Variables')
      sess.run(tf.global_variables_initializer())
      if FLAGS.use_embedding:  # load pre-trained word embedding
        assign_pretrained_word_embedding(
            sess, vocabulary_index2word, vocab_size, biLstmTR,
            word2vec_model_path=FLAGS.word2vec_model_path)
    curr_epoch = sess.run(biLstmTR.epoch_step)

    # 3. feed data & training
    number_of_training_data = len(trainX)
    batch_size = FLAGS.batch_size
    for epoch in range(curr_epoch, hparam.epoch):
      loss, acc, counter = 0.0, 0.0, 0
      for start, end in zip(range(0, number_of_training_data, batch_size),
                            range(batch_size, number_of_training_data, batch_size)):
        if epoch == 0 and counter == 0:
          print("trainX[start:end]:", trainX[start:end])  # ;print("trainY[start:end]:", trainY[start:end])
        curr_loss, curr_acc, _ = sess.run(
            [biLstmTR.loss_val, biLstmTR.accuracy, biLstmTR.train_op],
            feed_dict={
                biLstmTR.input_x: trainX[start:end],
                biLstmTR.input_y: trainY[start:end],
                biLstmTR.dropout_keep_prob: 1.0
            })  # curr_acc ---> TextCNN.accuracy, textRNN.dropout_keep_prob: 1
        loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
        if counter % 500 == 0:
          print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" %
                (epoch, counter, loss / float(counter), acc / float(counter)))
          # Train Accuracy ---> acc / float(counter)

      # epoch increment
      print("going to increment epoch counter....")
      sess.run(biLstmTR.epoch_increment)

      # 4. validation
      print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
      if epoch % FLAGS.validate_every == 0:
        eval_loss, eval_acc = do_eval(sess, biLstmTR, testX, testY, batch_size,
                                      vocabulary_index2word_label)
        print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" %
              (epoch, eval_loss, eval_acc))
        # save model to checkpoint
        save_path = FLAGS.ckpt_dir + "model.ckpt"
        if not os.path.exists(FLAGS.ckpt_dir):
          os.mkdir(FLAGS.ckpt_dir)
        saver.save(sess, save_path, global_step=epoch)
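One caveat about the mini-batching idiom in the training loop above: zipping `range(0, n, batch_size)` with `range(batch_size, n, batch_size)` silently drops the final partial batch. A tiny self-contained illustration of that behavior (not taken from the snippet):

# Demonstrates the zip-based batching used above: the last partial batch is
# skipped because the second range stops before n.
n, batch_size = 10, 4
batches = list(zip(range(0, n, batch_size), range(batch_size, n, batch_size)))
print(batches)  # [(0, 4), (4, 8)] -- examples 8 and 9 are never fed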