def main(_):
    """Load the JSON config named by ``FLAGS.config`` and run the requested task.

    ``FLAGS.task_type`` selects what happens:

    * ``"train"``   - build a ``tf.estimator.Estimator`` and run
      ``tf.estimator.train_and_evaluate``.
    * ``"export"``  - call ``export_saved_model(config)``.
    * ``"predict"`` - call ``predict(config, worker_count, task_index)``.

    Any other value only logs a warning.

    Args:
        _: Unused positional argument.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info("Tensorflow Version: " + TF_VERSION)

    # load config file
    tf.logging.info("***** loding config *****")
    tf.logging.info(FLAGS.config)
    with open(FLAGS.config, 'r') as f:
        config = json.load(f)

    # A path shaped like "<root>/biz/<name>/..." carries the business name in
    # its third component. Check the length first so that shorter paths
    # (e.g. a bare "config.json") do not raise IndexError.
    path_parts = FLAGS.config.split('/')
    if len(path_parts) > 2 and path_parts[1] == 'biz':
        config[C.BIZ_NAME] = path_parts[2]

    if FLAGS.task_type == "train":
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        run_config = tf.estimator.RunConfig(
            model_dir=FLAGS.checkpoint_path,
            save_checkpoints_steps=config.get('save_checkpoints_steps'),
            session_config=sess_config,
            log_step_count_steps=10)

        model_fn = model_fn_builder(config, FLAGS.init_checkpoint_path)
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            params={"batch_size": FLAGS.batch_size},
            config=run_config)

        # NOTE(review): presumably one task is reserved for a non-training
        # role, so the worker count and index are shifted down by one --
        # confirm against the cluster setup.
        if FLAGS.worker_count > 1:
            FLAGS.worker_count -= 1
        if FLAGS.task_index > 0:
            FLAGS.task_index -= 1

        train_input_fn = input_fn_builder(table=FLAGS.train_table,
                                          config=config)
        tf.logging.info("***** Running training *****")
        tf.logging.info("Batch size = %d", FLAGS.batch_size)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=FLAGS.num_train_steps)

        # do eval
        eval_input_fn = input_eval_fn_builder(table=FLAGS.eval_table,
                                              config=config)
        tf.logging.info("***** Running evaluation *****")
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          steps=FLAGS.max_eval_steps,
                                          start_delay_secs=30,
                                          throttle_secs=30)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    elif FLAGS.task_type == "export":
        export_saved_model(config)
    elif FLAGS.task_type == 'predict':
        predict(config, FLAGS.worker_count, FLAGS.task_index)
    else:
        # Previously an unknown task type fell through silently.
        tf.logging.warning("Unknown task_type: %s", FLAGS.task_type)
def _raw_results_from_prediction(result):
    """Convert one ``estimator.predict`` output into a list of ``RawResult``.

    Args:
        result: Mapping with ``"unique_ids"``, ``"start_logits"`` and
            ``"end_logits"`` entries. ``unique_ids`` may be a batch (an array
            with at least one dimension) or a single scalar.

    Returns:
        A list with one ``RawResult`` per example in ``result``.
    """
    unique_ids = result["unique_ids"]
    # An empty shape tuple (0-d array / numpy scalar) or no ``shape`` attribute
    # at all means a single example; anything else is treated as a batch.
    if getattr(unique_ids, 'shape', ()):
        return [
            RawResult(
                unique_id=int(unique_id),
                start_logits=[
                    float(x) for x in result["start_logits"][i].flat
                ],
                end_logits=[float(x) for x in result["end_logits"][i].flat])
            for i, unique_id in enumerate(unique_ids)
        ]
    return [
        RawResult(
            unique_id=int(unique_ids),
            start_logits=[float(x) for x in result["start_logits"].flat],
            end_logits=[float(x) for x in result["end_logits"].flat])
    ]


def main(_):
    """Run prediction on the dev set and write the prediction JSON files.

    Builds a ``TPUEstimator``, loads the pickled dev examples and features
    from ``FLAGS.features_dir``, collects a ``RawResult`` for every predicted
    example and passes everything to ``write_predictions``.

    Args:
        _: Unused positional argument.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=is_per_host)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=INIT_CHECKPOINT,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=1,
                                num_warmup_steps=0,
                                config=config,
                                use_tpu=FLAGS.use_tpu,
                                create_model_fn=create_model,
                                fine_tune=FLAGS.fine_tune)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        log_step_count_steps=1,
        save_summary_steps=2,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=2,
        tpu_config=tpu_config)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.predict_batch_size)

    suffix = '_fine_tune' if FLAGS.fine_tune else ''

    # NOTE: pickle.load executes arbitrary code from the file; only point
    # FLAGS.features_dir at trusted, locally generated feature dumps.
    with tf.gfile.GFile(
            '%s/dev_examples%s.pickle' % (FLAGS.features_dir, suffix),
            'rb') as in_file:
        eval_examples = pickle.load(in_file)
    with tf.gfile.GFile(
            '%s/dev_features%s.pickle' % (FLAGS.features_dir, suffix),
            'rb') as in_file:
        eval_features = pickle.load(in_file)

    tf.logging.info("***** Running predictions *****")
    tf.logging.info(" Num orig examples = %d", len(eval_examples))
    tf.logging.info(" Num split examples = %d", len(eval_features))
    tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

    predict_input_fn = input_fn_builder(input_file=DEV_FILENAME,
                                        seq_length=FLAGS.max_seq_length,
                                        bert_config=bert_config,
                                        is_training=False,
                                        drop_remainder=False,
                                        fine_tune=FLAGS.fine_tune)

    # If running eval on the TPU, you will need to specify the number of
    # steps.
    all_results = []
    for result in estimator.predict(predict_input_fn,
                                    yield_single_examples=False):
        if len(all_results) % 1000 == 0:
            tf.logging.info("Processing example: %d" % (len(all_results)))
        all_results.extend(_raw_results_from_prediction(result))

    # Make sure the target directory exists before anything is written to it.
    predictions_dir = os.path.join(FLAGS.output_dir,
                                   FLAGS.predictions_output_directory)
    tf.gfile.MakeDirs(predictions_dir)

    output_prediction_file = os.path.join(predictions_dir, "predictions.json")
    output_nbest_file = os.path.join(predictions_dir,
                                     "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(predictions_dir,
                                             "null_odds.json")

    write_predictions(eval_examples, eval_features, all_results,
                      FLAGS.n_best_size, FLAGS.max_answer_length,
                      FLAGS.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file)
# Flags consumed by the dataset fixture below.
flags.DEFINE_integer(
    "max_seq_length", 384,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")
flags.DEFINE_integer("batch_size", 100, ".")
flags.DEFINE_string("data_bert_directory", 'data/uncased_L-12_H-768_A-12',
                    'directory containing BERT config and checkpoints')

# Module-level fixture: the dataset is built once at import time from the
# evaluation TFRecord file and shared by the tests in this module.
bert_config = modeling.BertConfig.from_json_file(
    "%s/bert_config.json" % FLAGS.data_bert_directory)
input_fn = input_fn_builder('out/features/eval.tf_record',
                            FLAGS.max_seq_length, False, False, bert_config)
dataset: tf.data.TFRecordDataset = input_fn({'batch_size': FLAGS.batch_size})


def test_embedding_dimensions():
    """Check the static shapes of the dataset's feature tensors.

    ``input_ids`` and ``input_mask`` must be rank 2 with an unknown first
    dimension and ``FLAGS.max_seq_length`` as the second one; ``unique_ids``
    must have an unknown first dimension.
    """
    shapes = dataset.output_shapes
    for feature_name in ('input_ids', 'input_mask'):
        dims = shapes[feature_name].dims
        assert dims[0].value is None
        assert dims[1].value == FLAGS.max_seq_length
        assert len(dims) == 2
    assert shapes['unique_ids'].dims[0].value is None
def train(config, worker_count, task_index, cluster, is_chief, target):
    """Run the distributed training loop for one worker task.

    Builds the input pipeline on this worker's CPU device, places the model
    under a ``replica_device_setter`` for ``cluster``, optionally initialises
    the trainable variables in scope ``'reinforce'`` from
    ``FLAGS.init_checkpoint_path``, then trains inside a
    ``MonitoredTrainingSession`` until the global step reaches
    ``FLAGS.num_train_steps`` or the session asks to stop.

    Args:
        config: Configuration object handed to the input builder and to the
            module loaders in ``utils``.
        worker_count: Passed to the input builder as ``slice_count``.
        task_index: Index of this worker task; used in the device strings and
            to decide whether this task prints progress.
        cluster: Cluster specification for ``replica_device_setter``.
        is_chief: Whether this task is the chief of the monitored session.
        target: Session master address.
    """
    worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, 0)
    print("worker_deivce = %s" % worker_device)

    # assign io related variables and ops to local worker device
    with tf.device(worker_device):
        # NOTE(review): slice_id is read from FLAGS.task_index, not from the
        # task_index argument -- confirm the two are always equal.
        train_input_fn = input_fn_builder(
            table=FLAGS.train_table,
            config=config,
            slice_id=FLAGS.task_index,
            slice_count=worker_count
        )
        d = train_input_fn()
        iterator = d.make_one_shot_iterator()
        features = iterator.get_next()

    # assign global variables to ps nodes
    available_worker_device = "/job:worker/task:%d" % (task_index)
    with tf.device(tf.train.replica_device_setter(
            worker_device=available_worker_device, cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # construct the model structure
        policy_network_module = utils.load_policy_network_module(config)
        simulator_network_module = utils.load_simulator_network_module(config)
        trainer_module = utils.load_trainer_module(config)
        trainReinforce = trainer_module.TrainReinforce(
            config,
            features,
            policy_network_module.PGNetwork,
            simulator_network_module.PGNetwork,
            global_step,
            FLAGS.simulator_checkpoint_path)

        if FLAGS.init_checkpoint_path is not None:
            t_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope='reinforce')
            (assignment_map, initialized_variable_names
             ) = get_assignment_map_from_checkpoint(
                 t_vars, FLAGS.init_checkpoint_path)
            tf.train.init_from_checkpoint(FLAGS.init_checkpoint_path,
                                          assignment_map)

            tf.logging.info("**** Trainable Variables ****")
            for var in t_vars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                tf.logging.info(" name = %s, shape = %s%s", var.name,
                                var.shape, init_string)

    hooks = []
    # Last observed value of the global step; reported when training ends.
    last_global_step = 0
    with tf.train.MonitoredTrainingSession(master=target,
                                           checkpoint_dir=FLAGS.checkpoint_path,
                                           save_checkpoint_secs=120,
                                           is_chief=is_chief,
                                           hooks=hooks) as mon_sess:
        # Honour stop requests from the session instead of looping blindly.
        while not mon_sess.should_stop():
            trainReinforce.train(mon_sess)
            last_global_step = mon_sess.run(global_step)
            if task_index == 0:
                # Parenthesised single-argument print works on Python 2 and 3.
                print('step:{}'.format(last_global_step))
            if last_global_step >= FLAGS.num_train_steps:
                break

    # Report the real step count (the old counter was never incremented).
    print("%d steps finished." % last_global_step)
def trian_and_eval_on_single_worker(config):
    """Train on a single worker and periodically evaluate fresh checkpoints.

    The training graph is built in the default graph and run inside a
    ``MonitoredTrainingSession``. A separate evaluation graph with its own
    ``Saver`` is built up front. Every 100 global steps, if a checkpoint newer
    than the last evaluated one exists in ``FLAGS.checkpoint_path``, it is
    restored into a fresh session on the evaluation graph and evaluated.
    Training ends when the global step reaches ``FLAGS.num_train_steps`` or
    the session asks to stop.

    Args:
        config: Configuration object handed to the input builders and to the
            module loaders in ``utils``.
    """
    train_input_fn = input_fn_builder(
        table=FLAGS.train_table,
        config=config
    )
    d = train_input_fn()
    iterator = d.make_one_shot_iterator()
    features = iterator.get_next()

    global_step = tf.train.get_or_create_global_step()

    # construct the model structure
    policy_network_module = utils.load_policy_network_module(config)
    simulator_network_module = utils.load_simulator_network_module(config)
    trainer_module = utils.load_trainer_module(config)
    trainer = trainer_module.Trainer(
        config,
        features,
        policy_network_module.PGNetwork,
        simulator_network_module.PGNetwork,
        global_step,
        FLAGS.simulator_checkpoint_path)

    # Evaluation lives in its own graph so it can be restored from a
    # checkpoint without touching the training graph.
    eval_graph = tf.Graph()
    with eval_graph.as_default() as g:
        eval_input_fn = input_eval_fn_builder(
            table=FLAGS.eval_table,
            config=config
        )
        eval_d = eval_input_fn()
        eval_iterator = eval_d.make_one_shot_iterator()
        eval_features = eval_iterator.get_next()

        eval_module = utils.load_evaluator_module(config)
        evalutor = eval_module.Evaluator(
            config,
            eval_features,
            policy_network_module.PGNetwork,
            simulator_network_module.PGNetwork,
            FLAGS.simulator_checkpoint_path)
        eval_saver = tf.train.Saver(max_to_keep=10)

    if FLAGS.init_checkpoint_path is not None:
        t_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='reinforce')
        (assignment_map, initialized_variable_names
         ) = get_assignment_map_from_checkpoint(t_vars,
                                                FLAGS.init_checkpoint_path)
        tf.train.init_from_checkpoint(FLAGS.init_checkpoint_path,
                                      assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in t_vars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

    hooks = []
    # Last observed value of the global step; reported when training ends.
    last_global_step = 0
    previous_ckpt_path = ''
    with tf.train.MonitoredTrainingSession(master='',
                                           checkpoint_dir=FLAGS.checkpoint_path,
                                           save_checkpoint_secs=60,
                                           is_chief=True,
                                           hooks=hooks) as mon_sess:
        # Honour stop requests from the session instead of looping blindly.
        while not mon_sess.should_stop():
            trainer.train(mon_sess)
            last_global_step = mon_sess.run(global_step)
            # Parenthesised single-argument print works on Python 2 and 3.
            print('step:{}'.format(last_global_step))

            # eval
            if last_global_step > 0 and last_global_step % 100 == 0:
                latest_ckpt_path = tf.train.latest_checkpoint(
                    FLAGS.checkpoint_path)
                # Only evaluate checkpoints not seen yet. This is an ``if``
                # and not a ``continue`` so the termination check below is
                # never skipped.
                if (latest_ckpt_path is not None
                        and latest_ckpt_path != previous_ckpt_path):
                    print('latest_ckpt_path %s' % latest_ckpt_path)
                    with tf.Session(graph=eval_graph) as eval_sess:
                        eval_sess.run(tf.global_variables_initializer())
                        eval_sess.run(tf.local_variables_initializer())
                        eval_saver.restore(eval_sess, latest_ckpt_path)
                        evalutor.eval(eval_sess)
                    previous_ckpt_path = latest_ckpt_path

            if last_global_step >= FLAGS.num_train_steps:
                break

    # Report the real step count (the old counter was never incremented).
    print("%d steps finished." % last_global_step)
def main(test_file='test.json'):
    """Train the joint domain/intent/slot model and/or predict on a test file.

    Behaviour is driven by the module-level ``config`` mapping:

    * ``config['do_train']``   - convert the training examples to a TFRecord
      file, train the estimator and return.
    * ``config['do_predict']`` - predict on ``test_file`` and dump one JSON
      record per example (text, domain, intent, slots) to ``sys.argv[2]``.

    Args:
        test_file: Path handed to ``processor.get_test_examples``.

    Returns:
        None.

    Raises:
        ValueError: If ``config['task_name']`` names no known processor.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # 1. Set up the data processor.
    processors = {'joint': Joint_Processor}
    task_name = config['task_name'].lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)
    processor = processors[task_name]()

    # 1.1 Load the label mappings (and the domain/intent weights).
    (id2domain, domain2id, id2intent, intent2id, id2slot, slot2id,
     domain_w, intent_w) = processor.get_labels(
         config["data_dir"], "train" if config['do_train'] else "test")

    # Build the tokenizer.
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config['vocab_file'],
        do_lower_case=config['do_lower_case'])

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    save_checkpoints_steps = config['save_checkpoints_steps']

    # 1.2 Read the training data and convert it to examples.
    if config['do_train']:
        tf.logging.info("***** Loading training examples *****")
        train_examples = processor.get_train_examples(config['data_dir'])
        num_train_steps = int(
            len(train_examples) / config['train_batch_size'] *
            config['num_train_epochs'])
        num_warmup_steps = int(num_train_steps * config['warmup_proportion'])
        # Checkpoint roughly once per epoch.
        save_checkpoints_steps = int(
            len(train_examples) / config['train_batch_size']) + 1

        # Write the examples to a TFRecord file that the input fn can read.
        train_file = os.path.join(config['data_dir'], 'train.tf_record')
        file_based_convert_examples_to_features(
            train_examples, domain2id, intent2id, slot2id,
            config['max_seq_length'], tokenizer, train_file)
        # File-based input pipeline.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=config['max_seq_length'],
            is_training=True,
            drop_remainder=False)

    # 2. Create the model.
    # 2.1 Run-time configuration.
    bert_config = modeling.BertConfig.from_json_file(
        config['bert_config_file'])
    tf_cfg = tf.ConfigProto()
    tf_cfg.gpu_options.per_process_gpu_memory_fraction = 0.8
    run_config = tf.estimator.RunConfig(
        model_dir=config['output_dir'],
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=1,
        session_config=tf_cfg,
        log_step_count_steps=100,
    )

    # 2.2 Build the model function and the estimator.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_domain=len(domain2id),
                                num_intent=len(intent2id),
                                num_slot=len(slot2id),
                                init_checkpoint=config['init_checkpoint'],
                                learning_rate=config['learning_rate'],
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=config['use_tpu'],
                                use_one_hot_embeddings=config['use_tpu'],
                                do_serve=config['do_serve'],
                                domain_w=domain_w,
                                intent_w=intent_w)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
    )

    # 3. Training.
    if config['do_train']:
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", config['train_batch_size'])
        tf.logging.info(" Num steps = %d", num_train_steps)
        if config['do_eval']:
            # NOTE(review): the original comment here says there is no eval
            # stage. eval_input_fn and eval_steps are not defined in this
            # function, so this branch only works if they exist at module
            # level -- confirm before enabling do_eval.
            train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                                max_steps=num_train_steps)
            eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                              steps=eval_steps,
                                              start_delay_secs=60,
                                              throttle_secs=0)
            tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        else:
            estimator.train(input_fn=train_input_fn,
                            max_steps=num_train_steps)
        return None

    # 4. Prediction.
    if config['do_predict']:
        # 4.1 Load the data to predict on.
        tf.logging.info("***** Loading test examples *****")
        test_examples = processor.get_test_examples(test_file)
        tf.logging.info("the number of test_examples is %d" %
                        len(test_examples))
        test_features = convert_examples_to_features(
            test_examples, domain2id, intent2id, slot2id,
            config['max_seq_length'], tokenizer)
        tf.logging.info("the number of test_features is %d" %
                        len(test_features))

        predict_input_fn = input_fn_builder(
            features=test_features,
            seq_length=config['max_seq_length'],
            is_training=False,
            drop_remainder=False,
        )
        predictions = estimator.predict(input_fn=predict_input_fn)
        print(predictions)

        pred_results = []
        for pred_line, prediction in zip(test_examples, predictions):
            data = {}
            data['text'] = pred_line.text
            domain_pred = prediction["domain_pred"]
            intent_pred = prediction["intent_pred"]
            slot_pred = prediction["slot_pred"]
            data['domain'] = id2domain[domain_pred]
            data['intent'] = id2intent[
                intent_pred] if id2intent[intent_pred] != 'NaN' else np.nan

            # Collect at most one slot label per character of the text,
            # skipping id 0 and the [CLS]/[SEP] marker labels.
            idx = 0
            len_seq = len(pred_line.text)
            slot_labels = []
            for sid in slot_pred:
                if idx >= len_seq:
                    break
                if sid == 0:
                    continue
                cur_slot = id2slot[sid]
                if cur_slot in ['[CLS]', '[SEP]']:
                    continue
                slot_labels.append(cur_slot)
                idx += 1
            data['slots'] = get_slot_name(pred_line.text, slot_labels)

            # The first matching pattern supplies the 'code' slot. The match
            # uses its own name so it no longer shadows the prediction
            # generator.
            for p in code_pattern:
                code_match = re.match(p, data['text'])
                if code_match:
                    data['slots']['code'] = code_match.group(1)
                    break
            pred_results.append(data)

        # Use a context manager so the output file is flushed and closed.
        with open(sys.argv[2], 'w', encoding='utf8') as out_file:
            json.dump(pred_results, out_file, ensure_ascii=False)
def main(_):
    """Build the TPU estimator and, when ``FLAGS.do_train`` is set, train it.

    Creates ``OUTPUT_DIR``, loads the run configuration through
    ``load_and_save_config``, derives the number of training and warm-up
    steps, and runs ``tf.estimator.train_and_evaluate`` on the training and
    evaluation TFRecord files.

    Args:
        _: Unused positional argument.
    """
    tf.gfile.MakeDirs(OUTPUT_DIR)
    tf.logging.set_verbosity(tf.logging.INFO)
    config, create_model = load_and_save_config(FLAGS.config)

    # A cluster resolver is only needed when a named TPU is requested.
    if FLAGS.use_tpu and FLAGS.tpu_name:
        resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        resolver = None

    run_config = tf.contrib.tpu.RunConfig(
        cluster=resolver,
        master=FLAGS.master,
        log_step_count_steps=1,
        save_summary_steps=2,
        model_dir=OUTPUT_DIR,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=2,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=(
                tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)))

    # Step counts are only meaningful (and only computed) when training.
    if FLAGS.do_train:
        example_count = N_TRAIN_EXAMPLES
        if example_count is None:
            example_count = math.ceil(N_TOTAL_SQUAD_EXAMPLES *
                                      (1. - FLAGS.eval_percent))
        total_steps = int(example_count / FLAGS.train_batch_size *
                          FLAGS.num_train_epochs)
        warmup_steps = int(total_steps * FLAGS.warmup_proportion)
        print("Total training steps = %d" % total_steps)
        time.sleep(2)
    else:
        total_steps = None
        warmup_steps = None

    # If TPU is not available, TPUEstimator falls back to a normal Estimator
    # on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn_builder(bert_config=bert_config,
                                  init_checkpoint=INIT_CHECKPOINT,
                                  learning_rate=FLAGS.learning_rate,
                                  num_train_steps=total_steps,
                                  num_warmup_steps=warmup_steps,
                                  config=config,
                                  use_tpu=FLAGS.use_tpu,
                                  create_model_fn=create_model,
                                  fine_tune=FLAGS.fine_tune),
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if not FLAGS.do_train:
        return

    training_input = input_fn_builder(input_file=TRAIN_FILE_NAME,
                                      seq_length=FLAGS.max_seq_length,
                                      is_training=True,
                                      bert_config=bert_config,
                                      drop_remainder=True,
                                      fine_tune=FLAGS.fine_tune)
    # The eval set is built with is_training=False (no need to shuffle it).
    evaluation_input = input_fn_builder(input_file=EVAL_FILE_NAME,
                                        seq_length=FLAGS.max_seq_length,
                                        is_training=False,
                                        bert_config=bert_config,
                                        drop_remainder=True,
                                        fine_tune=FLAGS.fine_tune)

    tf.estimator.train_and_evaluate(
        estimator,
        tf.estimator.TrainSpec(input_fn=training_input,
                               max_steps=total_steps),
        tf.estimator.EvalSpec(input_fn=evaluation_input,
                              steps=FLAGS.eval_steps,
                              throttle_secs=FLAGS.eval_throttle_secs))