def __init__(self, bert_config_file, vocab_file, init_checkpoint, batch_size=4):
    # Load the BERT config first so max_position_embeddings can be read from it
    # (the original assigned the raw file path to self.bert_config before loading it).
    self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    self.max_seq_length = self.bert_config.max_position_embeddings
    self.batch_size = batch_size
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                do_lower_case=True)
    self.model_fn = model_fn_builder(bert_config=self.bert_config,
                                     init_checkpoint=init_checkpoint,
                                     use_one_hot_embeddings=False)
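A minimal instantiation sketch for the constructor above; the enclosing class name ("BertPredictor") and the checkpoint paths are assumptions used only for illustration, not part of the original code.

# Hypothetical usage:
# predictor = BertPredictor(
#     bert_config_file='chinese_L-12_H-768_A-12/bert_config.json',
#     vocab_file='chinese_L-12_H-768_A-12/vocab.txt',
#     init_checkpoint='chinese_L-12_H-768_A-12/bert_model.ckpt',
#     batch_size=4)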
def process_function(data_dir, vocab_file_path, do_train, do_eval, do_test,
                     max_seq_length, max_sent_length, batch_size):
    train_input = None
    eval_input = None
    test_input = None
    processor = MyProcessor()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)

    if do_train:
        # train_examples is a list; each element is an
        # InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
        train_examples = processor.get_train_examples(data_dir)
        # InputExample ---> features
        train_input = file_based_convert_examples_to_features(
            train_examples, max_seq_length, max_sent_length, tokenizer, data_mode='train')
        print('*** Starting training ***')
        print('  Number of training examples: %d' % len(train_examples))
        print('  Batch size: %d' % batch_size)

    if do_eval:
        eval_examples = processor.get_dev_examples(data_dir)
        eval_input = file_based_convert_examples_to_features(
            eval_examples, max_seq_length, max_sent_length, tokenizer, data_mode='dev')
        print('*** Starting validation ***')
        print('  Number of validation examples: %d' % len(eval_examples))
        print('  Batch size: %d' % batch_size)

    if do_test:
        test_examples = processor.get_test_examples(data_dir)
        test_input = file_based_convert_examples_to_features(
            test_examples, max_seq_length, max_sent_length, tokenizer, data_mode='test')
        print('*** Starting testing ***')
        print('  Number of test examples: %d' % len(test_examples))
        print('  Batch size: %d' % batch_size)

    # train_input, eval_input and test_input are all lists of features
    return train_input, eval_input, test_input
def process_function(data_dir, vocab_file_path, do_train, do_eval, do_predict,
                     output_dir, max_seq_length, batch_size):
    train_input = None
    eval_input = None
    predict_input = None
    processor = MyProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)

    if do_train:
        # train_examples is a list; each element is an
        # InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
        train_examples = processor.get_train_examples(data_dir)
        train_file = os.path.join(output_dir, "train.tf_record")
        # InputExample ---> features
        train_input = file_based_convert_examples_to_features(
            train_examples, label_list, max_seq_length, tokenizer, train_file)
        print('*** Starting training ***')
        print('  Number of training examples: %d' % len(train_examples))
        print('  Batch size: %d' % batch_size)

    if do_eval:
        eval_examples = processor.get_dev_examples(data_dir)
        eval_file = os.path.join(output_dir, "eval.tf_record")
        eval_input = file_based_convert_examples_to_features(
            eval_examples, label_list, max_seq_length, tokenizer, eval_file)
        print('*** Starting validation ***')
        print('  Number of validation examples: %d' % len(eval_examples))
        print('  Batch size: %d' % batch_size)

    if do_predict:
        predict_examples = processor.get_test_examples(data_dir)
        predict_file = os.path.join(output_dir, "predict.tf_record")
        predict_input = file_based_convert_examples_to_features(
            predict_examples, label_list, max_seq_length, tokenizer, predict_file)
        print('*** Starting prediction ***')
        print('  Number of prediction examples: %d' % len(predict_examples))
        print('  Batch size: %d' % batch_size)

    # train_input, eval_input and predict_input are all lists of features
    return train_input, eval_input, predict_input
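A minimal call sketch for the second process_function variant above; the directory paths and hyper-parameters below are placeholders, not values from the original project.

# Placeholder paths and sizes; adjust to the actual data layout.
train_input, eval_input, predict_input = process_function(
    data_dir='./data',
    vocab_file_path='./chinese_L-12_H-768_A-12/vocab.txt',
    do_train=True,
    do_eval=True,
    do_predict=False,
    output_dir='./output',
    max_seq_length=128,
    batch_size=32)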
def load_model():
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=master,
        model_dir=output_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=num_tpu_cores,
            per_host_input_for_training=is_per_host))

    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=init_checkpoint,
        learning_rate=learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=use_tpu,
        use_one_hot_embeddings=use_tpu)

    # estimator = tf.estimator.Estimator(model_fn=model_fn,
    #                                    params=params,
    #                                    model_dir="./weibo_and_t_train_20/")
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=8,
        eval_batch_size=8,
        predict_batch_size=8)

    return estimator, tokenizer
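A sketch of how the returned (estimator, tokenizer) pair might drive prediction. It assumes a predict.tf_record has already been written with the matching seq_length, and that file_based_input_fn_builder from the original run_classifier module is available; the paths are placeholders.

estimator, tokenizer = load_model()

# Hypothetical: record file and seq_length must match how the features were built.
predict_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=os.path.join(output_dir, "predict.tf_record"),
    seq_length=max_seq_length,
    is_training=False,
    drop_remainder=False)

for prediction in estimator.predict(input_fn=predict_input_fn):
    print(prediction["probabilities"])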
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "fake_news": FakeNewsProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
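For completeness, the usual script entry point for a run_classifier-style main(_) is sketched below; it assumes flags = tf.flags is defined at module level, and the exact set of required flags is an assumption based on the standard BERT script.

if __name__ == "__main__":
    # Required flags follow the upstream run_classifier.py convention (assumed here).
    flags.mark_flag_as_required("data_dir")
    flags.mark_flag_as_required("task_name")
    flags.mark_flag_as_required("vocab_file")
    flags.mark_flag_as_required("bert_config_file")
    flags.mark_flag_as_required("output_dir")
    tf.app.run()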
def main(input_data, task_name):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "sim": SimProcessor,
        "sent": SentProcessor
    }
    classif = []

    if task_name == 'sim':
        tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                      FLAGS.sim_init_checkpoint)
    elif task_name == 'sent':
        tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                      FLAGS.sent_init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    print(label_list)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if task_name == 'sim':
        model_fn = model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=FLAGS.sim_init_checkpoint,
            learning_rate=FLAGS.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu)
    elif task_name == 'sent':
        model_fn = model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=FLAGS.sent_init_checkpoint,
            learning_rate=FLAGS.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(input_data)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        for prediction in result:
            probabilities = prediction["probabilities"]
            classif.append(probabilities)

    return classif, label_list
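A sketch of consuming the (classif, label_list) return value by taking the argmax of each probability vector. The input path below is a placeholder; input_data is simply forwarded to processor.get_test_examples.

import numpy as np

# Placeholder input; task_name must be 'sim' or 'sent'.
classif, label_list = main('./data/test_input.tsv', 'sim')
for probs in classif:
    print(label_list[int(np.argmax(probs))], probs)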
def main():
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cluener": NerProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    # if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    #     raise ValueError(
    #         "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    tag_list = processor.get_tags()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # num_labels = 2 * len(tag_list) + 1: a B- and an I- tag per entity type plus one O tag
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=2 * len(tag_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.data_dir, "train.tf_record")
        file_based_convert_examples_to_features(
            train_examples, tag_list, FLAGS.max_seq_length, tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.data_dir, "eval.tf_record")
        file_based_convert_examples_to_features(
            eval_examples, tag_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        # Mapping from predicted label ids back to BIO tag strings
        tag_ids = {0: 'O',
                   1: 'B-address', 2: 'I-address',
                   3: 'B-book', 4: 'I-book',
                   5: 'B-company', 6: 'I-company',
                   7: 'B-game', 8: 'I-game',
                   9: 'B-government', 10: 'I-government',
                   11: 'B-movie', 12: 'I-movie',
                   13: 'B-name', 14: 'I-name',
                   15: 'B-organization', 16: 'I-organization',
                   17: 'B-position', 18: 'I-position',
                   19: 'B-scene', 20: 'I-scene'}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)

        test_file = os.path.join(FLAGS.data_dir, "test.tf_record")
        file_based_convert_examples_to_features(predict_examples, tag_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                test_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=test_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        results = estimator.predict(input_fn=predict_input_fn)

        output_file = os.path.join(FLAGS.data_dir, 'clue_predict.json')
        with open(output_file, 'w', encoding='utf-8') as fr:
            for example, result in zip(predict_examples, results):
                pre_id = result['predictions']
                # print(f'text is {example.text_a}')
                # print(f'pre_id is {pre_id}')
                text = example.text_a
                # Keep only the tags that cover the original text:
                # skip [CLS] at position 0 and truncate to len(text).
                tags = [tag_ids[tag] for tag in pre_id][1:len(text) + 1]
                res_words, res_pos = get_result(text, tags)
                rs = {}
                for w, t in zip(res_words, res_pos):
                    rs[t] = rs.get(t, []) + [w]
                pres = {}
                for t, ws in rs.items():
                    temp = {}
                    for w in ws:
                        word = text[w[0]: w[1] + 1]
                        temp[word] = temp.get(word, []) + [w]
                    pres[t] = temp
                output_line = json.dumps({'id': example.guid, 'label': pres},
                                         ensure_ascii=False) + '\n'
                fr.write(output_line)
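The tag_ids dictionary above hard-codes the CLUENER label map. As a sketch, it could instead be derived from processor.get_tags(), consistent with num_labels = 2 * len(tag_list) + 1 used when building the model; this assumes get_tags() returns the entity names in the same order used during training.

def build_tag_ids(tag_list):
    # id 0 is reserved for the O tag; each entity type gets a B- and an I- id.
    tag_ids = {0: 'O'}
    for i, tag in enumerate(tag_list):
        tag_ids[2 * i + 1] = 'B-' + tag
        tag_ids[2 * i + 2] = 'I-' + tag
    return tag_ids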
def main():
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "tnews": TnewsProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.data_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.data_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        # Mapping from predicted class index to TNEWS label id and description
        label_dict = {
            0: 100, 1: 101, 2: 102, 3: 103, 4: 104, 5: 106, 6: 107, 7: 108,
            8: 109, 9: 110, 10: 112, 11: 113, 12: 114, 13: 115, 14: 116
        }
        label_desc = {
            100: "news_story", 101: "news_culture", 102: "news_entertainment",
            103: "news_sports", 104: "news_finance", 106: "news_house",
            107: "news_car", 108: "news_edu", 109: "news_tech",
            110: "news_military", 112: "news_travel", 113: "news_world",
            114: "news_stock", 115: "news_agriculture", 116: "news_game"
        }

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)

        test_file = os.path.join(FLAGS.data_dir, "test.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                test_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=test_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        results = estimator.predict(input_fn=predict_input_fn)

        output_file = os.path.join(FLAGS.output_dir, 'news_predict.json')
        with open(output_file, 'w', encoding='utf-8') as fr:
            print(results)
            for index, result in enumerate(results):
                pre_id = result['predictions']
                print(f'the index is {index}, pre_id is {pre_id}')
                label = label_dict.get(pre_id)
                label_d = label_desc.get(label)
                json_str = json.dumps({
                    "id": index,
                    "label": str(label),
                    "label_desc": label_d
                })
                fr.write(json_str)
                fr.write('\n')
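Each line of news_predict.json is a standalone JSON object; the snippet below reads the file back as a quick sanity check (the example values in the comment are illustrative only, not real predictions).

# Example of one output line (illustrative values):
#   {"id": 0, "label": "109", "label_desc": "news_tech"}
with open(os.path.join(FLAGS.output_dir, 'news_predict.json'), encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record['id'], record['label'], record['label_desc'])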
def main():
    """Main training entry point."""
    tf.logging.info('start to train')

    # Basic setup: processor, labels and tokenizer
    process = AllProcessor()
    label_list = process.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    train_examples = process.get_train_examples(FLAGS.data_dir)
    train_cnt = file_based_convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenizer,
        FLAGS.data_dir, 'train')
    dev_examples = process.get_dev_examples(FLAGS.data_dir)
    dev_cnt = file_based_convert_examples_to_features(
        dev_examples, label_list, FLAGS.max_seq_length, tokenizer,
        FLAGS.data_dir, 'dev')

    # Input placeholders
    input_ids = tf.placeholder(tf.int64, shape=[None, FLAGS.max_seq_length], name='input_ids')
    input_mask = tf.placeholder(tf.int64, shape=[None, FLAGS.max_seq_length], name='input_mask')
    segment_ids = tf.placeholder(tf.int64, shape=[None, FLAGS.max_seq_length], name='segment_ids')
    labels = tf.placeholder(tf.int64, shape=[None], name='labels')
    task = tf.placeholder(tf.int64, name='task')

    # BERT model and optimizer setup
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    loss, logits, acc, pre_id = create_model(
        bert_config, True, input_ids, input_mask, segment_ids, labels, False, task)
    num_train_steps = int(len(train_examples) / FLAGS.train_batch_size)
    num_warmup_steps = math.ceil(
        num_train_steps * FLAGS.train_batch_size * FLAGS.warmup_proportion)
    train_op = optimization.create_optimizer(
        loss, FLAGS.learning_rate, num_train_steps * FLAGS.num_train_epochs,
        num_warmup_steps, False)

    # Variable initialization and saver (Adam slot variables are excluded from the checkpoint)
    init_global = tf.global_variables_initializer()
    saver = tf.train.Saver(
        [v for v in tf.global_variables()
         if 'adam_v' not in v.name and 'adam_m' not in v.name])

    with tf.Session() as sess:
        sess.run(init_global)
        print('start to load bert params')
        if FLAGS.init_checkpoint:
            # tvars = tf.global_variables()
            tvars = tf.trainable_variables()
            print("global_variables", len(tvars))
            assignment_map, initialized_variable_names = \
                modeling.get_assignment_map_from_checkpoint(tvars, FLAGS.init_checkpoint)
            print("initialized_variable_names:", len(initialized_variable_names))
            saver_ = tf.train.Saver([v for v in tvars if v.name in initialized_variable_names])
            saver_.restore(sess, FLAGS.init_checkpoint)

            tvars = tf.global_variables()
            # initialized_vars = [v for v in tvars if v.name in initialized_variable_names]
            not_initialized_vars = [v for v in tvars if v.name not in initialized_variable_names]
            print('all size %s; not initialized size %s' % (len(tvars), len(not_initialized_vars)))
            if len(not_initialized_vars):
                sess.run(tf.variables_initializer(not_initialized_vars))
            # for v in initialized_vars:
            #     print('initialized: %s, shape = %s' % (v.name, v.shape))
            # for v in not_initialized_vars:
            #     print('not initialized: %s, shape = %s' % (v.name, v.shape))
        else:
            print('the bert init checkpoint is None!!!')
            sess.run(tf.global_variables_initializer())

        # One training step
        def train_step(ids, mask, seg, true_y, task_id):
            feed = {input_ids: ids, input_mask: mask, segment_ids: seg,
                    labels: true_y, task: task_id}
            _, logits_out, loss_out = sess.run([train_op, logits, loss], feed_dict=feed)
            return logits_out, loss_out

        # One validation step
        def dev_step(ids, mask, seg, true_y, task_id):
            feed = {input_ids: ids, input_mask: mask, segment_ids: seg,
                    labels: true_y, task: task_id}
            pre_out, acc_out = sess.run([pre_id, acc], feed_dict=feed)
            return pre_out, acc_out

        # Training loop
        for epoch in range(FLAGS.num_train_epochs):
            tf.logging.info(f'start to train, epoch: {epoch}')
            epoch_loss = do_train(sess, train_cnt, train_step, epoch)
            tf.logging.info(f'the epoch {epoch} loss is {epoch_loss}')
            saver.save(sess, FLAGS.output_dir + 'bert.ckpt', global_step=epoch)
            # Validate the model at the end of each epoch
            do_eval(sess, dev_cnt, dev_step)

        # Run prediction and save the results
        do_predict(label_list, process, tokenizer, dev_step)
        tf.logging.info('the training is over!!!!')
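A typical entry point for this training script, assuming the FLAGS used above are defined with tf.flags elsewhere in the same file.

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    main()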