def train_slot_model(bert_config, bert_init_checkpoint, run_config,
                     num_slot_labels, num_train_steps, num_warmup_steps,
                     train_data_file):
    """Fine-tune the slot-labeling model on a pre-built tf_record file.

    Builds the slot model function from the given BERT config and
    checkpoint, wraps it in a TPUEstimator under `run_config`, and trains
    for `num_train_steps` steps on `train_data_file`.  Batch sizes,
    learning rate and the TPU switches are read from module-level FLAGS.
    """
    slot_model_fn = slot_model_fn_builder(
        bert_config=bert_config,
        num_slot_labels=num_slot_labels,
        init_checkpoint=bert_init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    slot_estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=slot_model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
    # Read the tf_record file and assemble training batches.
    batched_input_fn = file_based_input_fn_builder(
        input_file=train_data_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    slot_estimator.train(input_fn=batched_input_fn, max_steps=num_train_steps)
def main(_):
    """Entry point: train / evaluate / predict a BERT-based NER tagger.

    All configuration is read from module-level FLAGS; the function
    dispatches on FLAGS.do_train / FLAGS.do_eval / FLAGS.do_predict.

    NOTE(review): this file defines ``main`` more than once; only the last
    definition survives at import time.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}

    # Load the pretrained BERT hyper-parameters.
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only clean previous outputs when training; keep them for prediction.
    if FLAGS.clean and FLAGS.do_train:
        if os.path.exists(FLAGS.output_dir):
            def del_file(path):
                # Recursively delete every file under `path`.
                for name in os.listdir(path):
                    c_path = os.path.join(path, name)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)
            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)
        if os.path.exists(FLAGS.data_config_path):
            try:
                os.remove(FLAGS.data_config_path)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()

    # Persist the label list so inference-time code can rebuild id<->label.
    label_list_path = os.path.join(FLAGS.output_dir, 'label_list.pkl')
    if not os.path.exists(label_list_path):
        with open(label_list_path, 'wb') as fd:
            pickle.dump(label_list, fd)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # TPU / estimator run configuration.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # data.conf caches dataset statistics between runs.
    if os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path) as fd:
            data_config = json.load(fd)
    else:
        data_config = {}

    if FLAGS.do_train:
        # Load the training data (only on a fresh run; otherwise reuse the
        # statistics cached in data.conf).
        if len(data_config) == 0:
            train_examples = processor.get_train_examples(FLAGS.data_dir)
            num_train_steps = int(
                len(train_examples) / FLAGS.train_batch_size
                * FLAGS.num_train_epochs)
            num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
            data_config['num_train_steps'] = num_train_steps
            data_config['num_warmup_steps'] = num_warmup_steps
            data_config['num_train_size'] = len(train_examples)
        else:
            num_train_steps = int(data_config['num_train_steps'])
            num_warmup_steps = int(data_config['num_warmup_steps'])

    # model_fn defines the model plus its train/eval behaviour and restores
    # the pretrained BERT weights via scaffold hooks; the Estimator API then
    # drives training, prediction and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # 1. Convert the examples into a tf_record file (cached in data.conf).
        if data_config.get('train.tf_record_path', '') == '':
            train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
            filed_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file)
            # BUGFIX: record the path so later runs reuse the tf_record
            # (mirrors the eval branch below, which already did this).
            data_config['train.tf_record_path'] = train_file
        else:
            train_file = data_config.get('train.tf_record_path')
        # BUGFIX: num_train_size was previously assigned only in the `else`
        # branch (with a redundant `x = x = ...` double assignment), raising
        # NameError on a fresh training run; data_config always has the key.
        num_train_size = int(data_config['num_train_size'])
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", num_train_size)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        # 2. Read the tf_record data and assemble training batches.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        if data_config.get('eval.tf_record_path', '') == '':
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            filed_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
                eval_file)
            data_config['eval.tf_record_path'] = eval_file
            data_config['num_eval_size'] = len(eval_examples)
        else:
            eval_file = data_config['eval.tf_record_path']
        # Log dev-set statistics.
        num_eval_size = data_config.get('num_eval_size', 0)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", num_eval_size)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        if FLAGS.use_tpu:
            # TPU needs a fixed number of steps (and complete batches).
            eval_steps = int(num_eval_size / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    # Cache dataset statistics so later runs skip re-reading the datasets.
    if not os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd:
            json.dump(data_config, fd)

    if FLAGS.do_predict:
        token_path = os.path.join(FLAGS.output_dir, "test_token.txt")
        if os.path.exists(token_path):
            os.remove(token_path)
        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file, mode="test")
        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)
        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "test_label.txt")

        def result_to_pair(writer):
            # Write "token gold_label predicted_label" lines, one token per
            # line with a blank line between sentences (conlleval format).
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                for id in prediction:
                    # BUGFIX: was `idx > len_seq`, which let idx == len_seq
                    # through and triggered the "idx out of range" error the
                    # original comment complained about.
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        if curr_labels == '[SEP]':
                            break
                        continue
                    try:
                        line += (line_token[idx] + ' ' + label_token[idx]
                                 + ' ' + curr_labels + '\n')
                    except Exception as e:
                        # Keep best-effort behaviour: log and drop the line.
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        # Write predictions next to gold labels so conlleval.py can compute
        # entity-level scores.
        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)
        from conlleval import return_report
        eval_result = return_report(output_predict_file)
        print(''.join(eval_result))
        with codecs.open(os.path.join(FLAGS.output_dir,
                                      'entity_level_predicted_result.txt'),
                         'a', encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
def main():
    """Entry point (variant): trains into a fresh, timestamped output
    directory instead of cleaning a previous run, and writes bare predicted
    label sequences to label_test.txt.

    NOTE(review): this file defines ``main`` more than once; only the last
    definition survives at import time.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}

    # Load the pretrained BERT hyper-parameters.
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # If the output dir already exists, branch to a new timestamped one
    # rather than deleting the previous run.
    if os.path.exists(FLAGS.output_dir):
        import time
        # BUGFIX: the old code used output_dir[:-1], which silently dropped
        # the last character of the path whenever it lacked a trailing '/'.
        # rstrip('/') produces the same result when the slash is present.
        FLAGS.output_dir = (FLAGS.output_dir.rstrip('/') + "_"
                            + str(int(time.time())))
        os.mkdir(FLAGS.output_dir)
        FLAGS.data_config_path = os.path.join(FLAGS.output_dir, 'data.conf')

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # TPU / estimator run configuration.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # data.conf caches dataset statistics between runs.
    if os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path) as fd:
            data_config = json.load(fd)
    else:
        data_config = {}

    if FLAGS.do_train:
        # Load the training data on a fresh run; otherwise reuse cached stats.
        if len(data_config) == 0:
            train_examples = processor.get_train_examples(FLAGS.data_dir)
            num_train_steps = int(
                len(train_examples) / FLAGS.train_batch_size
                * FLAGS.num_train_epochs)
            num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
            data_config['num_train_steps'] = num_train_steps
            data_config['num_warmup_steps'] = num_warmup_steps
            data_config['num_train_size'] = len(train_examples)
        else:
            num_train_steps = int(data_config['num_train_steps'])
            num_warmup_steps = int(data_config['num_warmup_steps'])

    # model_fn defines the model plus train/eval behaviour and restores the
    # pretrained BERT weights via scaffold hooks; the Estimator API then
    # drives training, prediction and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # 1. Convert the examples into a tf_record file (cached in data.conf).
        if data_config.get('train.tf_record_path', '') == '':
            train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
            filed_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file)
            # BUGFIX: record the path so later runs reuse the tf_record
            # (mirrors the eval branch below, which already did this).
            data_config['train.tf_record_path'] = train_file
        else:
            train_file = data_config.get('train.tf_record_path')
        # BUGFIX: previously assigned only in the `else` branch (and with a
        # redundant `x = x = ...`), raising NameError on a fresh run.
        num_train_size = int(data_config['num_train_size'])
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", num_train_size)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        # 2. Read the tf_record data and assemble training batches.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        if data_config.get('eval.tf_record_path', '') == '':
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            filed_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
                eval_file)
            data_config['eval.tf_record_path'] = eval_file
            data_config['num_eval_size'] = len(eval_examples)
        else:
            eval_file = data_config['eval.tf_record_path']
        # Log dev-set statistics.
        num_eval_size = data_config.get('num_eval_size', 0)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", num_eval_size)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        if FLAGS.use_tpu:
            # TPU needs a fixed number of steps (and complete batches).
            eval_steps = int(num_eval_size / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    # Cache dataset statistics so later runs skip re-reading the datasets.
    if not os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd:
            json.dump(data_config, fd)

    if FLAGS.do_predict:
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)
        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file, mode="test")
        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)
        # This variant also runs the evaluation metrics over the test set.
        predicted_result = estimator.evaluate(input_fn=predict_input_fn)
        output_eval_file = os.path.join(FLAGS.output_dir,
                                        "predicted_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Predict results *****")
            for key in sorted(predicted_result.keys()):
                tf.logging.info(" %s = %s", key, str(predicted_result[key]))
                writer.write("%s = %s\n" % (key, str(predicted_result[key])))
        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        # Dump the raw predicted label sequence for each example
        # (padding id 0 is skipped).
        with open(output_predict_file, 'w', encoding='utf-8') as writer:
            for prediction in result:
                output_line = "\n".join(
                    id2label[id] for id in prediction if id != 0) + "\n"
                writer.write(output_line)
def main(_):
    """Entry point (variant): benchmark prediction speed on pku98-gold.txt
    in growing batches of 100k characters, optionally scoring each batch
    with conlleval.

    NOTE(review): this file defines ``main`` more than once; only the last
    definition survives at import time.  Assumes ``time`` is imported at
    module level — TODO confirm.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}

    # Load the pretrained BERT hyper-parameters.
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only clean previous outputs when training; keep them for prediction.
    if FLAGS.clean and FLAGS.do_train:
        if os.path.exists(FLAGS.output_dir):
            def del_file(path):
                # Recursively delete every file under `path`.
                for name in os.listdir(path):
                    c_path = os.path.join(path, name)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)
            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)
        if os.path.exists(FLAGS.data_config_path):
            try:
                os.remove(FLAGS.data_config_path)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    # Load the label list from data_dir if present; otherwise derive it from
    # the processor and persist it for later runs.
    label_list_file = os.path.join(FLAGS.data_dir, 'label_list.txt')
    if os.path.exists(label_list_file):
        label_list = []
        with codecs.open(label_list_file, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                label_list.append(line.strip())
    else:
        label_list = processor.get_labels(FLAGS.data_dir)
        with codecs.open(label_list_file, 'w', encoding='utf-8') as f:
            for label in label_list:
                f.write(label + '\n')
    print("the label list is:", label_list)
    predict_label_list = processor.get_predict_labels(FLAGS.data_dir)
    print("predict_label_list", predict_label_list)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # TPU / estimator run configuration.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # data.conf caches dataset statistics between runs.
    if os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path) as fd:
            data_config = json.load(fd)
    else:
        data_config = {}

    if FLAGS.do_train:
        # Load the training data on a fresh run; otherwise reuse cached stats.
        if len(data_config) == 0:
            train_examples, train_examples_num = processor.get_train_examples(
                FLAGS.data_dir)
            num_train_steps = int(
                len(train_examples) / FLAGS.train_batch_size
                * FLAGS.num_train_epochs)
            num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
            data_config['num_train_steps'] = num_train_steps
            data_config['num_warmup_steps'] = num_warmup_steps
            data_config['num_train_size'] = len(train_examples)
        else:
            num_train_steps = int(data_config['num_train_steps'])
            num_warmup_steps = int(data_config['num_warmup_steps'])

    # model_fn defines the model plus train/eval behaviour and restores the
    # pretrained BERT weights via scaffold hooks; the Estimator API then
    # drives training, prediction and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # 1. Convert the examples into a tf_record file (cached in data.conf).
        if data_config.get('train.tf_record_path', '') == '':
            train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
            filed_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file, predict_label_list=predict_label_list)
            # BUGFIX: record the path so later runs reuse the tf_record
            # (mirrors the eval branch below, which already did this).
            data_config['train.tf_record_path'] = train_file
        else:
            train_file = data_config.get('train.tf_record_path')
        # BUGFIX: assigned after the branch so a fresh run cannot hit
        # NameError; the key exists in data_config on both paths.
        num_train_size = int(data_config['num_train_size'])
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", num_train_size)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        # 2. Read the tf_record data and assemble training batches.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        if data_config.get('eval.tf_record_path', '') == '':
            eval_examples, eval_examples_number = processor.get_dev_examples(
                FLAGS.data_dir)
            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            filed_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
                eval_file, predict_label_list)
            data_config['eval.tf_record_path'] = eval_file
            data_config['num_eval_size'] = len(eval_examples)
        else:
            eval_file = data_config['eval.tf_record_path']
        # Log dev-set statistics.
        num_eval_size = data_config.get('num_eval_size', 0)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", num_eval_size)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        if FLAGS.use_tpu:
            # TPU needs a fixed number of steps (and complete batches).
            eval_steps = int(num_eval_size / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)
        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        print("label2id is", label2id)
        print("id2label is", id2label)

        gold_path = os.path.join(FLAGS.data_dir, "pku98-gold.txt")
        # Total character count of the gold file; bounds the benchmark loop.
        # BUGFIX: the handle was previously opened without a context manager
        # and leaked on any exception between open() and close().
        with open(gold_path) as f:
            CharNum = sum(
                [len(word) for line in f for word in line.strip().split()])
        batch_size = 0
        i = 0  # benchmark round counter
        # BUGFIX: speed_writer likewise now closed via a context manager.
        with open(os.path.join(FLAGS.output_dir, "speed_evaluate.txt"),
                  'w', encoding='utf-8') as speed_writer:
            # Predict in growing slices of 100k characters until the whole
            # gold file has been covered.
            while batch_size < CharNum:
                batch_size += 100000
                i += 1
                print("Start do it")
                predict_examples, predict_examples_number = \
                    processor.get_test_examples(FLAGS.data_dir, batch_size)
                predict_file = os.path.join(FLAGS.output_dir,
                                            "predict.tf_record_1")
                filed_based_convert_examples_to_features(
                    predict_examples, label_list, FLAGS.max_seq_length,
                    tokenizer, predict_file, mode="test",
                    predict_label_list=predict_label_list)
                file_size = os.path.getsize(gold_path) / 1000
                tf.logging.info("***** Running prediction*****")
                tf.logging.info(" Num examples = %d", len(predict_examples))
                tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
                tf.logging.info(" Character Number = %d",
                                predict_examples_number)
                tf.logging.info(" The size of file = %.2f kB" % file_size)
                if FLAGS.use_tpu:
                    # Warning: According to tpu_estimator.py Prediction on TPU
                    # is an experimental feature and hence not supported here
                    raise ValueError("Prediction in TPU not supported")
                predict_drop_remainder = True if FLAGS.use_tpu else False
                predict_input_fn = file_based_input_fn_builder(
                    input_file=predict_file,
                    seq_length=FLAGS.max_seq_length,
                    is_training=False,
                    drop_remainder=predict_drop_remainder)
                if FLAGS.do_predict_eval and FLAGS.is_label:
                    # Also score this slice with the estimator's eval metrics.
                    predicted_result = estimator.evaluate(
                        input_fn=predict_input_fn)
                    predict_eval_file = "predicted_results_%03d.txt" % i
                    output_eval_file = os.path.join(FLAGS.output_dir,
                                                    predict_eval_file)
                    with codecs.open(output_eval_file, "w",
                                     encoding='utf-8') as writer:
                        tf.logging.info("***** Predict results *****")
                        for key in sorted(predicted_result.keys()):
                            tf.logging.info(" %s = %s", key,
                                            str(predicted_result[key]))
                            writer.write(
                                "%s = %s\n" % (key, str(predicted_result[key])))
                # Time the prediction pass.
                start_time = time.time()
                result = estimator.predict(input_fn=predict_input_fn)
                stop_time = time.time()
                time_cost = (stop_time - start_time) * 1000
                char_speed = batch_size / time_cost
                byte_speed = file_size / time_cost
                predict_file = "label_test_%03d.txt" % i
                output_predict_file = os.path.join(FLAGS.output_dir,
                                                   predict_file)

                def result_to_pair(writer):
                    # Write "token [gold] predicted" lines (conlleval format).
                    for predict_line, prediction in zip(predict_examples,
                                                        result):
                        idx = 0
                        line = ''
                        line_token = str(predict_line.text).split(' ')
                        label_token = str(predict_line.label).split(' ')
                        if len(line_token) != len(label_token):
                            tf.logging.info(predict_line.text)
                            tf.logging.info(predict_line.label)
                        for id in prediction:
                            if id == 0:
                                continue
                            curr_labels = id2label[id]
                            if curr_labels in ['[CLS]', '[SEP]']:
                                continue
                            # Index errors past the token list are caught and
                            # the whole sentence is dropped (best effort).
                            try:
                                if FLAGS.is_label:
                                    line += (line_token[idx] + ' '
                                             + label_token[idx] + ' '
                                             + curr_labels + '\n')
                                else:
                                    line += (line_token[idx] + ' '
                                             + curr_labels + '\n')
                            except Exception as e:
                                tf.logging.info(e)
                                tf.logging.info(predict_line.text)
                                tf.logging.info(predict_line.label)
                                line = ''
                                break
                            idx += 1
                        writer.write(line + '\n')

                with codecs.open(output_predict_file, 'w',
                                 encoding='utf-8') as writer:
                    result_to_pair(writer)
                print("字符数:%d 耗费时间:%dms 速度:%.3f字符/ms" %
                      (batch_size, time_cost, char_speed))
                speed_writer.write("字符数:%d 耗费时间:%dms 速度:%.3f字符/ms \n" %
                                   (batch_size, time_cost, char_speed))
                if batch_size > CharNum:
                    # Final (complete-file) round: also report byte throughput.
                    print("字节数:%.3fk 耗费时间:%dms 速度:%.3fk/ms" %
                          (file_size, time_cost, byte_speed))
                    speed_writer.write("字节数:%.3fk 耗费时间:%dms 速度:%.3fk/ms \n" %
                                       (file_size, time_cost, byte_speed))
                if FLAGS.do_predict_eval and FLAGS.is_label:
                    from conlleval import return_report
                    eval_result = return_report(output_predict_file)
                    tf.logging.info("***** Colleval Predict results *****")
                    print(eval_result)
                    eval_result_str = ' '.join(eval_result)
                    predict_conlleval_file = \
                        "predicted_conlleval_results_%03d.txt" % i
                    output_predict_eval_file = os.path.join(
                        FLAGS.output_dir, predict_conlleval_file)
                    with codecs.open(output_predict_eval_file, "w",
                                     encoding='utf-8') as writer:
                        writer.write(eval_result_str)
def main(_=None):
    """Train / evaluate / predict entry point for the BERT NER task.

    Driven entirely by FLAGS: builds a TPUEstimator from the BERT config,
    optionally cleans the previous run's outputs, then runs whichever of
    do_train / do_eval / do_predict are enabled.  The `_` parameter absorbs
    the argv list that `tf.app.run()` passes to `main`; it defaults to None
    so direct no-argument calls keep working.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # Load the downloaded BERT config file.
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # max_seq_length must not exceed max_position_embeddings (512 for the
    # released BERT checkpoints), otherwise position embeddings break.
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only clean the previous round's artifacts when training; prediction
    # must keep them.
    if FLAGS.clean and FLAGS.do_train:
        # Before training, delete every file inside output_dir.
        if os.path.exists(FLAGS.output_dir):

            def del_file(path):
                # Recursively delete every file below `path`.
                for name in os.listdir(path):
                    child = os.path.join(path, name)
                    if os.path.isdir(child):
                        del_file(child)
                    else:
                        os.remove(child)

            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)
        # Delete the cached data.config file as well.
        if os.path.exists(FLAGS.data_config_path):
            try:
                os.remove(FLAGS.data_config_path)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    # Instantiate the NER processor and fetch the dataset's label set.
    processor = processors[task_name]()
    label_list = processor.get_labels()

    # Tokenizer: splits text into (sub)word pieces and maps them to vocab
    # ids; vocab_file is the vocabulary path, do_lower_case lowercases input.
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # TPU/estimator run configuration (also used for CPU/GPU runs).
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # data_config caches dataset sizes and tf_record paths between runs so
    # the raw datasets do not have to be re-read every time.
    if os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path) as fd:
            data_config = json.load(fd)
    else:
        data_config = {}

    if FLAGS.do_train:
        # Load the training data (or reuse the cached sizes).
        if len(data_config) == 0:
            train_examples = processor.get_train_examples(FLAGS.data_dir)
            # Total number of optimizer steps over all epochs.
            num_train_steps = int(
                len(train_examples) / FLAGS.train_batch_size *
                FLAGS.num_train_epochs)
            num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
            data_config['num_train_steps'] = num_train_steps
            data_config['num_warmup_steps'] = num_warmup_steps
            data_config['num_train_size'] = len(train_examples)
        else:
            num_train_steps = int(data_config['num_train_steps'])
            num_warmup_steps = int(data_config['num_warmup_steps'])

    # model_fn defines the model together with its train/eval/predict
    # behaviour and initializes this model's variables from the pre-trained
    # BERT checkpoint; the Estimator API then drives training, prediction
    # and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        # Initial checkpoint: the pre-trained BERT weights.
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # 1. Convert the examples into tf_record data (or reuse the cached
        #    file recorded in data_config).
        if data_config.get('train.tf_record_path', '') == '':
            train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
            filed_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file)
        else:
            train_file = data_config.get('train.tf_record_path')
        num_train_size = int(data_config['num_train_size'])
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", num_train_size)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        # 2. Read the record data back and batch it.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # Early stopping: abort training when the 'f1' metric has not
        # improved for 500 steps (checked every 120s, never before step 1500).
        stop_hook = tf.contrib.estimator.stop_if_no_increase_hook(
            estimator, 'f1', 500, min_steps=1500, run_every_secs=120)
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=[stop_hook])

    if FLAGS.do_eval:
        if data_config.get('eval.tf_record_path', '') == '':
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            filed_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
                eval_file)
            data_config['eval.tf_record_path'] = eval_file
            data_config['num_eval_size'] = len(eval_examples)
        else:
            eval_file = data_config['eval.tf_record_path']
        # Log dev-set information.
        num_eval_size = data_config.get('num_eval_size', 0)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", num_eval_size)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        if FLAGS.use_tpu:
            eval_steps = int(num_eval_size / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    # Persist the data config so later runs skip re-reading the train/test
    # datasets.  NOTE(review): opened in 'a' (append) mode, but the
    # os.path.exists guard means the file is always new here, so this
    # behaves like 'w'.
    if not os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd:
            json.dump(data_config, fd)

    if FLAGS.do_predict:
        # Delete the token_test.txt produced by a previous prediction run.
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)
        # Build the id -> label lookup from the pickled label2id mapping.
        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        # Load the test-set data and convert it to tf_record features.
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, predict_file,
                                                 mode="test")
        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)
        predicted_result = estimator.evaluate(input_fn=predict_input_fn)
        output_eval_file = os.path.join(FLAGS.output_dir,
                                        "predicted_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Predict results *****")
            for key in sorted(predicted_result.keys()):
                tf.logging.info(" %s = %s", key, str(predicted_result[key]))
                writer.write("%s = %s\n" % (key, str(predicted_result[key])))

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")

        def result_to_pair(writer):
            # Write "<token> <gold-label> <predicted-label>" lines, one
            # sentence per blank-line-separated block (CoNLL-style output).
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                # Renamed from `id` to avoid shadowing the builtin.
                for label_id in prediction:
                    if label_id == 0:
                        continue
                    curr_labels = id2label[label_id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    # idx can run past the token list (the original author
                    # observed "idx out of range" here — presumably subword
                    # tokenization yields more predictions than whitespace
                    # tokens); drop the whole sentence in that case.
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w',
                         encoding='utf-8') as writer:
            result_to_pair(writer)
        # Score the predictions with the standard conlleval script.
        from conlleval import return_report
        eval_result = return_report(output_predict_file)
        print(eval_result)
def main(_=None):
    """Train / evaluate / predict entry point for the BERT NER task.

    Variant of the pipeline without the early-stopping hook: builds a
    TPUEstimator from the BERT config, optionally cleans previous outputs,
    then runs whichever of do_train / do_eval / do_predict are enabled.
    The `_` parameter absorbs the argv list that `tf.app.run()` passes to
    `main`; it defaults to None so direct no-argument calls keep working.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "ner": NerProcessor
    }
    # if not FLAGS.do_train and not FLAGS.do_eval:
    #     raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # max_seq_length must not exceed max_position_embeddings the checkpoint
    # was trained with.
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only clean the previous round's artifacts when training; prediction
    # must keep them.
    if FLAGS.clean and FLAGS.do_train:
        if os.path.exists(FLAGS.output_dir):

            def del_file(path):
                # Recursively delete every file below `path`.
                for name in os.listdir(path):
                    child = os.path.join(path, name)
                    if os.path.isdir(child):
                        del_file(child)
                    else:
                        os.remove(child)

            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)
        if os.path.exists(FLAGS.data_config_path):
            try:
                os.remove(FLAGS.data_config_path)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # TPU/estimator run configuration (also used for CPU/GPU runs).
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # data_config caches dataset sizes and tf_record paths between runs so
    # the raw datasets do not have to be re-read every time.
    if os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path) as fd:
            data_config = json.load(fd)
    else:
        data_config = {}

    if FLAGS.do_train:
        # Load the training data (or reuse the cached sizes).
        if len(data_config) == 0:
            train_examples = processor.get_train_examples(FLAGS.data_dir)
            # Total number of optimizer steps over all epochs.
            num_train_steps = int(
                len(train_examples) / FLAGS.train_batch_size *
                FLAGS.num_train_epochs)
            num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
            data_config['num_train_steps'] = num_train_steps
            data_config['num_warmup_steps'] = num_warmup_steps
            data_config['num_train_size'] = len(train_examples)
        else:
            num_train_steps = int(data_config['num_train_steps'])
            num_warmup_steps = int(data_config['num_warmup_steps'])

    # model_fn defines the model together with its train/eval/predict
    # behaviour and initializes this model's variables from the pre-trained
    # BERT checkpoint; the Estimator API then drives training, prediction
    # and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # 1. Convert the examples into tf_record data (or reuse the cached
        #    file recorded in data_config).
        if data_config.get('train.tf_record_path', '') == '':
            train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
            filed_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file)
        else:
            train_file = data_config.get('train.tf_record_path')
        num_train_size = int(data_config['num_train_size'])
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", num_train_size)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        # 2. Read the record data back and batch it.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        if data_config.get('eval.tf_record_path', '') == '':
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            filed_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
                eval_file)
            data_config['eval.tf_record_path'] = eval_file
            data_config['num_eval_size'] = len(eval_examples)
        else:
            eval_file = data_config['eval.tf_record_path']
        # Log dev-set information.
        num_eval_size = data_config.get('num_eval_size', 0)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", num_eval_size)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        if FLAGS.use_tpu:
            eval_steps = int(num_eval_size / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    # Persist the data config so later runs skip re-reading the train/test
    # datasets.  NOTE(review): opened in 'a' (append) mode, but the
    # os.path.exists guard means the file is always new here, so this
    # behaves like 'w'.
    if not os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd:
            json.dump(data_config, fd)

    if FLAGS.do_predict:
        # Delete the token_test.txt produced by a previous prediction run.
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)
        # Build the id -> label lookup from the pickled label2id mapping.
        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, predict_file,
                                                 mode="test")
        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)
        predicted_result = estimator.evaluate(input_fn=predict_input_fn)
        output_eval_file = os.path.join(FLAGS.output_dir,
                                        "predicted_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Predict results *****")
            for key in sorted(predicted_result.keys()):
                tf.logging.info(" %s = %s", key, str(predicted_result[key]))
                writer.write("%s = %s\n" % (key, str(predicted_result[key])))

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")

        def result_to_pair(writer):
            # Write "<token> <gold-label> <predicted-label>" lines, one
            # sentence per blank-line-separated block (CoNLL-style output).
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                # Renamed from `id` to avoid shadowing the builtin.
                for label_id in prediction:
                    if label_id == 0:
                        continue
                    curr_labels = id2label[label_id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    # idx can run past the token list (the original author
                    # observed "idx out of range" here — presumably subword
                    # tokenization yields more predictions than whitespace
                    # tokens); drop the whole sentence in that case.
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w',
                         encoding='utf-8') as writer:
            result_to_pair(writer)
        # Score the predictions with the standard conlleval script.
        from conlleval import return_report
        eval_result = return_report(output_predict_file)
        print(eval_result)