def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {
      "sst-2": run_classifier.SST2Processor,
      "mnli": run_classifier.MnliProcessor
  }

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint1)
  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint2)

  if not tf.train.checkpoint_exists(FLAGS.init_checkpoint1):
    raise TFCheckpointNotFoundError("checkpoint1 does not exist!")

  if not tf.train.checkpoint_exists(FLAGS.init_checkpoint2) and \
      not FLAGS.use_random:
    raise TFCheckpointNotFoundError("checkpoint2 does not exist!")

  bert_config1 = modeling.BertConfig.from_json_file(FLAGS.bert_config_file1)
  bert_config2 = modeling.BertConfig.from_json_file(FLAGS.bert_config_file2)

  if FLAGS.max_seq_length > bert_config1.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config1.max_position_embeddings))

  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  processor = processors[task_name]()
  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                         do_lower_case=FLAGS.do_lower_case)

  all_results = []
  predict_examples = processor.get_test_examples(FLAGS.diff_input_file)
  num_actual_predict_examples = len(predict_examples)

  # For single-sentence tasks (like SST-2), eg.text_b is None.
  original_data = [(eg.text_a, eg.text_b) for eg in predict_examples]

  if FLAGS.use_tpu:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on.
    while len(predict_examples) % FLAGS.predict_batch_size != 0:
      predict_examples.append(run_classifier.PaddingInputExample())

  predict_file = os.path.join(FLAGS.init_checkpoint1,
                              FLAGS.exp_name + ".predict.tf_record")
  run_classifier.file_based_convert_examples_to_features(
      predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
      predict_file)

  for bert_config_type, output_dir in [(bert_config1, FLAGS.init_checkpoint1),
                                       (bert_config2, FLAGS.init_checkpoint2)]:
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
      tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config_type,
        num_labels=len(label_list),
        # This init checkpoint is eventually overridden by the estimator.
        init_checkpoint=FLAGS.output_dir,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_drop_remainder = True if FLAGS.use_tpu else False
    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result = [x for x in estimator.predict(input_fn=predict_input_fn)]
    all_results.append(result)

  all_results[0] = all_results[0][:num_actual_predict_examples]
  all_results[1] = all_results[1][:num_actual_predict_examples]
  assert len(all_results[0]) == len(all_results[1])

  # Assuming model1's predictions are gold labels, calculate model2's
  # accuracy.
  score = 0
  for prob1, prob2 in zip(all_results[0], all_results[1]):
    if np.argmax(prob1["probabilities"]) == np.argmax(prob2["probabilities"]):
      score += 1
  tf.logging.info("Agreement score = %.6f",
                  float(score) / num_actual_predict_examples)

  # Calculate the average value of |p1 - p2|, the distance on the simplex.
  # Unlike KL divergence, this is a bounded metric. However, these results
  # are not comparable across tasks with different numbers of classes.
  distances = []
  for prob1, prob2 in zip(all_results[0], all_results[1]):
    distances.append(
        np.linalg.norm(prob1["probabilities"] - prob2["probabilities"]))
  tf.logging.info("Average length |p1 - p2| = %.8f", np.mean(distances))
  tf.logging.info("Max length |p1 - p2| = %.8f", np.max(distances))
  tf.logging.info("Min length |p1 - p2| = %.8f", np.min(distances))
  tf.logging.info("Std length |p1 - p2| = %.8f", np.std(distances))

  if FLAGS.diff_type == "kld1":
    all_kld = []
    for prob1, prob2 in zip(all_results[0], all_results[1]):
      all_kld.append(
          stats.entropy(prob1["probabilities"], prob2["probabilities"]))
    tf.logging.info("Average kl-divergence (p1, p2) = %.8f", np.mean(all_kld))
    tf.logging.info("Max kl-divergence (p1, p2) = %.8f", np.max(all_kld))
    tf.logging.info("Min kl-divergence (p1, p2) = %.8f", np.min(all_kld))
    tf.logging.info("Std kl-divergence (p1, p2) = %.8f", np.std(all_kld))
  elif FLAGS.diff_type == "kld2":
    all_kld = []
    for prob1, prob2 in zip(all_results[0], all_results[1]):
      all_kld.append(
          stats.entropy(prob2["probabilities"], prob1["probabilities"]))
    tf.logging.info("Average kl-divergence (p2, p1) = %.8f", np.mean(all_kld))
    tf.logging.info("Max kl-divergence (p2, p1) = %.8f", np.max(all_kld))
    tf.logging.info("Min kl-divergence (p2, p1) = %.8f", np.min(all_kld))
    tf.logging.info("Std kl-divergence (p2, p1) = %.8f", np.std(all_kld))

  if FLAGS.diff_output_file:
    output = ""
    # Remove padded examples.
    all_results[0] = all_results[0][:len(original_data)]
    all_results[1] = all_results[1][:len(original_data)]
    with tf.gfile.GFile(FLAGS.diff_output_file, "w") as f:
      for i, (eg, prob1, prob2) in enumerate(
          zip(original_data, all_results[0], all_results[1])):
        if i % 1000 == 0:
          tf.logging.info("Writing instance %d", i + 1)
        p1_items = [p1.item() for p1 in prob1["probabilities"]]
        p2_items = [p2.item() for p2 in prob2["probabilities"]]
        prob1_str = "%.6f\t%.6f\t%.6f" % (p1_items[0], p1_items[1],
                                          p1_items[2])
        prob2_str = "%.6f\t%.6f\t%.6f" % (p2_items[0], p2_items[1],
                                          p2_items[2])
        output = "%s\t%s\t%s\t%s\n" % (eg[0], eg[1], prob1_str, prob2_str)
        f.write(output)
  return
def get_tokenizer():
  # Note: relies on a module-level `vocab_file` being defined elsewhere.
  return tokenization.FullTokenizer(vocab_file=vocab_file,
                                    do_lower_case=False)
def prepare_bert(bert_vocab_file, bert_config_file, init_checkpoint, sen_len,
                 select_layers, batch_size, graph_file, model_dir):
  tokenizer = tokenization.FullTokenizer(bert_vocab_file)
  estimator = get_estimator(bert_config_file, init_checkpoint, sen_len,
                            select_layers, batch_size, graph_file, model_dir)
  return tokenizer, estimator
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)
  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                         do_lower_case=FLAGS.do_lower_case)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train`, `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    num_train_steps = int(
        FLAGS.train_data_size / FLAGS.train_batch_size) * FLAGS.epochs
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(bert_config=bert_config,
                              init_checkpoint=FLAGS.init_checkpoint,
                              learning_rate=FLAGS.learning_rate,
                              num_train_steps=num_train_steps,
                              num_warmup_steps=num_warmup_steps,
                              use_tpu=FLAGS.use_tpu,
                              use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    if not tf.gfile.Exists(FLAGS.train_file):
      tf.logging.info("DANITER: File doesn't exist, creating tfrecord data")
      examples = model_builder.load_hellaswag(FLAGS.train_raw_data)
      tf.logging.info("DANITER: Read raw data as json")
      model_builder.file_based_convert_examples_for_bilinear(
          examples, 512, tokenizer, FLAGS.train_file, do_copa=True)
    train_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, steps=num_train_steps)

  if FLAGS.do_eval:
    # eval_steps = None tells the estimator to run through the entire set.
    if FLAGS.eval_data_size < 0:
      eval_steps = None
    else:
      eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    if not tf.gfile.Exists(FLAGS.eval_file):
      examples = model_builder.load_hellaswag(FLAGS.eval_raw_data)
      model_builder.file_based_convert_examples_for_bilinear(
          examples, 512, tokenizer, FLAGS.eval_file, do_copa=True)
    eval_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    def _find_valid_cands(curr_step):
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      candidates = []
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          idx = ckpt_name.split("-")[-1]
          if idx != "best" and int(idx) > curr_step:
            candidates.append(filename)
      return candidates

    tf.logging.info("Evaluating all models in the output dir")
    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    key_name = "eval_accuracy"
    tf.logging.info("Checkpoint path " + checkpoint_path)
    if tf.gfile.Exists(checkpoint_path + ".index"):
      tf.logging.info("Found a best model... not good")
      result = estimator.evaluate(input_fn=eval_input_fn,
                                  steps=eval_steps,
                                  checkpoint_path=checkpoint_path)
      best_perf = result[key_name]
      global_step = result["global_step"]
    else:
      tf.logging.info("Setting global step to -1")
      global_step = -1
      best_perf = -1
      checkpoint_path = None

    tf.logging.info("Opening writer " + output_eval_file)
    writer = tf.gfile.GFile(output_eval_file, "w")

    steps_and_files = {}
    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
    tf.logging.info("Models found " + "\n".join(filenames))
    for filename in filenames:
      if filename.endswith(".index"):
        ckpt_name = filename[:-6]
        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
        if cur_filename.split("-")[-1] == "best":
          continue
        gstep = int(cur_filename.split("-")[-1])
        if gstep not in steps_and_files:
          tf.logging.info("Add {} to eval list.".format(cur_filename))
          steps_and_files[gstep] = cur_filename
    tf.logging.info("found {} files.".format(len(steps_and_files)))

    if not steps_and_files:
      tf.logging.info(
          "found 0 files, global step: {}. Sleeping.".format(global_step))
    else:
      for ele in sorted(steps_and_files.items()):
        step, checkpoint_path = ele
        if global_step >= step:
          if len(_find_valid_cands(step)) > 1:
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tf.logging.info("removing {}".format(src_ckpt))
              # Why should we remove checkpoints?
              # tf.gfile.Remove(src_ckpt)
          tf.logging.info("Skipping already-evaluated candidate")
          continue
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=eval_steps,
                                    checkpoint_path=checkpoint_path)
        global_step = result["global_step"]
        tf.logging.info("***** Eval results *****")
        for key in sorted(result.keys()):
          tf.logging.info("  %s = %s", key, str(result[key]))
          writer.write("%s = %s\n" % (key, str(result[key])))
        writer.write("best = {}\n".format(best_perf))
        if len(_find_valid_cands(global_step)) > 1:
          for ext in ["meta", "data-00000-of-00001", "index"]:
            src_ckpt = checkpoint_path + ".{}".format(ext)
            tf.logging.info("removing {}".format(src_ckpt))
            # tf.gfile.Remove(src_ckpt)
        writer.write("=" * 50 + "\n")
    writer.close()
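# The eval loop above leans on TensorFlow's checkpoint naming convention; a
# small sketch of the filename parsing that `_find_valid_cands` performs
# (illustrative values, not from the source):
def _demo_ckpt_parsing():
  filename = "model.ckpt-1500.index"
  ckpt_name = filename[:-6]        # strip ".index" -> "model.ckpt-1500"
  step = ckpt_name.split("-")[-1]  # "1500", or "best" for the saved best model
  assert (ckpt_name, step) == ("model.ckpt-1500", "1500")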
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {"answer_sent_labeling": AnswerSentenceLabelingProcessor}

  if (not FLAGS.do_train_and_eval and not FLAGS.do_predict
      and not FLAGS.do_train and not FLAGS.do_eval):
    raise ValueError(
        "At least one of `do_train_and_eval`, `do_predict`, `do_train`, or "
        "`do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.work_dir)

  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  processor = processors[task_name]()
  label_list = processor.get_labels()
  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                         do_lower_case=FLAGS.do_lower_case)

  config = tf.compat.v1.ConfigProto()
  run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.work_dir,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
  )

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train_and_eval or FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

  # If GPU is not available, this will fall back to normal Estimator on CPU.
  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={
          "train_batch_size": FLAGS.train_batch_size,
          "predict_batch_size": FLAGS.predict_batch_size
      })

  # Early stopping: stop if f1_score has not improved within
  # `max_steps_without_increase` steps.
  early_stopping_hook = tf.contrib.estimator.stop_if_no_increase_hook(
      estimator=estimator,
      metric_name="f1_score",
      max_steps_without_increase=FLAGS.max_steps_without_increase,
      min_steps=1000,
      run_every_secs=None,
      run_every_steps=FLAGS.save_checkpoints_steps,
  )

  if FLAGS.do_train_and_eval:
    train_file = os.path.join(FLAGS.work_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples,
                                            FLAGS.max_answer_num,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        max_answer_num=FLAGS.max_answer_num,
        is_training=True,
        drop_remainder=True)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=num_train_steps,
                                        hooks=[early_stopping_hook])

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.work_dir, "eval.tf_record")
    file_based_convert_examples_to_features(eval_examples,
                                            FLAGS.max_answer_num,
                                            FLAGS.max_seq_length, tokenizer,
                                            eval_file)
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d", len(eval_examples))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
    eval_drop_remainder = False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        max_answer_num=FLAGS.max_answer_num,
        is_training=False,
        drop_remainder=eval_drop_remainder)
    # steps=None tells the estimator to run through the entire set.
    # throttle_secs sets the minimum seconds before the model is evaluated
    # again.
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=None,
                                      throttle_secs=10)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

  if FLAGS.do_train:
    train_file = os.path.join(FLAGS.work_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples,
                                            FLAGS.max_answer_num,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        max_answer_num=FLAGS.max_answer_num,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_eval:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.work_dir, "eval.tf_record")
    file_based_convert_examples_to_features(eval_examples,
                                            FLAGS.max_answer_num,
                                            FLAGS.max_seq_length, tokenizer,
                                            eval_file)
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d", len(eval_examples))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
    # eval_steps = None tells the estimator to run through the entire set.
    eval_steps = None
    eval_drop_remainder = False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        max_answer_num=FLAGS.max_answer_num,
        is_training=False,
        drop_remainder=eval_drop_remainder)
    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    output_eval_file = os.path.join(FLAGS.work_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("\n  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    num_predict_examples = len(predict_examples)
    predict_file = os.path.join(FLAGS.work_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples,
                                            FLAGS.max_answer_num,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)
    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d ", len(predict_examples))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        max_answer_num=FLAGS.max_answer_num,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)
    result = estimator.predict(input_fn=predict_input_fn)

    output_predict_file = os.path.join(FLAGS.work_dir, "test_results.tsv")
    output_pos_prob_np_file = os.path.join(FLAGS.work_dir, "test_results.npy")
    pos_prob_list = []
    with open(output_predict_file, "w") as writer:
      num_written_lines = 0
      for (query_id, prediction) in enumerate(result):
        if query_id % 200 == 0:
          tf.logging.info("***** query_id: {} *****".format(query_id))
        predict = prediction["predict"]
        answer_num = prediction["answer_num"]
        positive_probabilities = prediction[
            "positive_probabilities"][:answer_num]
        pos_prob_list.extend(positive_probabilities)
        for answer_id, class_id in enumerate(predict):
          if answer_id < answer_num:
            output_line = "\t".join(
                str(x) for x in [query_id, answer_id, class_id]) + "\n"
            writer.write(output_line)
        num_written_lines += 1
    np.save(output_pos_prob_np_file, np.array(pos_prob_list))
    assert num_written_lines == num_predict_examples
def main(argv):
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.config.set_soft_device_placement(True)

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                         do_lower_case=FLAGS.do_lower_case)

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  train_input_fn = None
  ft_known_train_file = None
  train_file = None

  if FLAGS.do_train:
    current_seed = 0
    num_known_classes = FLAGS.num_domains * FLAGS.num_labels_per_domain
    data_output_dir = FLAGS.data_output_dir
    if not tf.gfile.Exists(data_output_dir):
      tf.gfile.MakeDirs(data_output_dir)
    known_ft_path = os.path.join(data_output_dir, "known_ft_train.tf_record")
    unknown_ft_path = os.path.join(data_output_dir,
                                   "unknown_ft_train.tf_record")
    if not tf.gfile.Glob(known_ft_path):
      preprocess_few_shot_training_data(tokenizer, known_ft_path,
                                        unknown_ft_path, current_seed)

    if FLAGS.continual_learning is None:
      assert False, "Not Implemented"
    elif FLAGS.continual_learning == "pretrain":
      train_file = os.path.join(FLAGS.data_output_dir,
                                "known_ft_train.tf_record")
      num_classes = num_known_classes
      num_train_examples = num_known_classes * FLAGS.known_num_shots
      num_shots_per_class = FLAGS.known_num_shots
    elif FLAGS.continual_learning == "few_shot":
      train_file = os.path.join(FLAGS.data_output_dir,
                                "unknown_ft_train.tf_record")
      ft_known_train_file = os.path.join(FLAGS.data_output_dir,
                                         "known_ft_train.tf_record")
      num_unknown_classes = NUM_CLASSES - num_known_classes
      num_classes = num_unknown_classes
      num_train_examples = num_unknown_classes * FLAGS.few_shot
      num_shots_per_class = FLAGS.few_shot

    tpu_split = FLAGS.tpu_split if FLAGS.use_tpu else 1
    if num_shots_per_class < tpu_split:
      steps_per_epoch = 1
    else:
      steps_per_epoch = num_shots_per_class // tpu_split
    num_train_steps = int(steps_per_epoch * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    FLAGS.num_train_steps = num_train_steps
    FLAGS.save_checkpoints_steps = int(steps_per_epoch *
                                       FLAGS.save_every_epoch)

    tf.logging.info("***** Running training *****")
    tf.logging.info("  train_file: %s" % train_file)
    tf.logging.info("  use_tpu: %s" % FLAGS.use_tpu)
    tf.logging.info("  Num examples = %d", num_train_examples)
    tf.logging.info("  Batch size = %d", FLAGS.batch_size)
    tf.logging.info("  Save checkpoints steps = %d",
                    FLAGS.save_checkpoints_steps)
    tf.logging.info("  warmup steps = %d", num_warmup_steps)
    tf.logging.info("  Num epochs = %d", FLAGS.num_train_epochs)
    tf.logging.info("  Num steps = %d", num_train_steps)
    tf.logging.info("  Reduce method = %s", FLAGS.reduce_method)
    tf.logging.info("  Max Seq Length = %d", FLAGS.max_seq_length)
    tf.logging.info("  learning_rate = %.7f", FLAGS.learning_rate)
    tf.logging.info("  dropout rate = %.4f", DROPOUT_PROB)

    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        ft_known_train_file=ft_known_train_file,
        use_tpu=FLAGS.use_tpu)

  model_fn = model_fn_builder(bert_config=bert_config,
                              init_checkpoint=FLAGS.init_checkpoint,
                              learning_rate=FLAGS.learning_rate,
                              num_train_steps=num_train_steps,
                              num_warmup_steps=num_warmup_steps,
                              use_tpu=FLAGS.use_tpu,
                              use_one_hot_embeddings=FLAGS.use_tpu)

  FLAGS.do_eval = False
  eval_input_fn = None

  params = _get_hparams()
  params.update(num_train_steps=num_train_steps)
  if not FLAGS.do_train:
    train_input_fn = eval_input_fn
  experiment_utils.run_experiment(model_fn=model_fn,
                                  train_input_fn=train_input_fn,
                                  eval_input_fn=train_input_fn,
                                  params=params)
def main(_):
  tf.gfile.MakeDirs(os.path.dirname(FLAGS.output_tfrecord))
  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_path,
                                         do_lower_case=True)
  annotations_zipfn = os.path.join(FLAGS.data_dir, "vcr1annots.zip")
  images_zipfn = os.path.join(FLAGS.data_dir, "vcr1images.zip")
  # Generate data for all splits:
  for split in ["train", "val", "test"]:
    jsonl_file = split + ".jsonl"
    output_tfrecord = "-".join([
        FLAGS.output_tfrecord, split,
        "%05d" % FLAGS.shard, "of",
        "%05d" % FLAGS.num_shards
    ])
    with tf.python_io.TFRecordWriter(output_tfrecord) as writer:
      with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        with zipfile.ZipFile(
            tf.gfile.Open(annotations_zipfn)) as annotations_zip:
          with zipfile.ZipFile(tf.gfile.Open(images_zipfn)) as images_zip:
            with annotations_zip.open(jsonl_file) as jsonl:
              for idx, line in enumerate(jsonl):
                # Each worker only processes its own shard of the data.
                if idx % FLAGS.num_shards != FLAGS.shard:
                  continue
                example = json.loads(line)
                meta_filename = "vcr1images/" + example["metadata_fn"]
                meta = json.loads(images_zip.open(meta_filename).read())
                del meta["segms"]
                try:
                  image_filename = "vcr1images/" + example["img_fn"]
                  tf.logging.info("Reading %s", image_filename)
                  with images_zip.open(image_filename) as image:
                    image_string = image.read()
                except zipfile.BadZipfile as e:
                  tf.logging.error("Bad Zip file: " + str(e))
                  image_string = BLANK_JPEG
                # Overwrite every box with the full-image box [0, 0, 1, 1].
                for box in meta["boxes"]:
                  box[0] = 0.0
                  box[1] = 0.0
                  box[2] = 1.0
                  box[3] = 1.0
                is_test = (split == "test")
                for tf_example in create_tf_examples(tokenizer,
                                                     example,
                                                     image_string,
                                                     meta,
                                                     is_test=is_test):
                  writer.write(tf_example.SerializeToString())
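# The `idx % num_shards == shard` filter above partitions examples disjointly
# across workers. A toy sketch with hypothetical shard settings:
def _demo_sharding():
  num_shards, shard = 4, 1  # e.g. worker 1 of 4
  assigned = [idx for idx in range(10) if idx % num_shards == shard]
  assert assigned == [1, 5, 9]  # every worker gets a disjoint slice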
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  validate_flags_or_throw(bert_config)
  tf.gfile.MakeDirs(FLAGS.output_dir)
  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                         do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  save_checkpoints_steps = int(FLAGS.train_num_precomputed /
                               FLAGS.train_batch_size)
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      keep_checkpoint_max=100,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    num_train_features = FLAGS.train_num_precomputed
    num_train_steps = int(
        int(num_train_features / FLAGS.train_batch_size) *
        FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      model_dir=FLAGS.output_dir)

  if FLAGS.do_train:
    tf.logging.info("***** Running training on precomputed features *****")
    tf.logging.info("  Num split examples = %d", num_train_features)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_filename = FLAGS.train_precomputed_file
    train_input_fn = input_fn_builder(
        input_file=train_filename,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {"race": race_utils.RaceProcessor}

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  processor = processors[task_name](
      use_spm=True if FLAGS.spm_model_file else False,
      do_lower_case=FLAGS.do_lower_case,
      high_only=FLAGS.high_only,
      middle_only=FLAGS.middle_only)
  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case)  # ,
  # spm_model_file=FLAGS.spm_model_file)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  if FLAGS.do_train:
    iterations_per_loop = int(
        min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
  else:
    iterations_per_loop = FLAGS.iterations_per_loop
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=int(FLAGS.save_checkpoints_steps),
      keep_checkpoint_max=0,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)

  model_fn = race_utils.model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=int(FLAGS.train_step),
      num_warmup_steps=int(FLAGS.warmup_step),
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      max_seq_length=FLAGS.max_seq_length,
      dropout_prob=FLAGS.dropout_prob)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  # if FLAGS.use_tpu:
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)
  # else:
  #   estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

  if FLAGS.do_train:
    if not tf.gfile.Exists(FLAGS.train_file):
      race_utils.file_based_convert_examples_to_features(
          train_examples, label_list, FLAGS.max_seq_length, tokenizer,
          FLAGS.train_file, FLAGS.max_qa_length)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", FLAGS.train_step)
    train_input_fn = classifier_utils.file_based_input_fn_builder(
        input_file=FLAGS.train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        task_name=task_name,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.train_batch_size,
        multiple=len(label_list))
    estimator.train(input_fn=train_input_fn, max_steps=int(FLAGS.train_step))

  if FLAGS.do_eval:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on. These do NOT count towards the metric (all tf.metrics
      # support a per-instance weight, and these get a weight of 0.0).
      while len(eval_examples) % FLAGS.eval_batch_size != 0:
        eval_examples.append(classifier_utils.PaddingInputExample())

    if not tf.gfile.Exists(FLAGS.eval_file):
      race_utils.file_based_convert_examples_to_features(
          eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
          FLAGS.eval_file, FLAGS.max_qa_length)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
      assert len(eval_examples) % FLAGS.eval_batch_size == 0
      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = classifier_utils.file_based_input_fn_builder(
        input_file=FLAGS.eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder,
        task_name=task_name,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.eval_batch_size,
        multiple=len(label_list))

    def _find_valid_cands(curr_step):
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      candidates = []
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          idx = ckpt_name.split("-")[-1]
          if idx != "best" and int(idx) > curr_step:
            candidates.append(filename)
      return candidates

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    key_name = "eval_accuracy"
    if tf.gfile.Exists(checkpoint_path + ".index"):
      result = estimator.evaluate(input_fn=eval_input_fn,
                                  steps=eval_steps,
                                  checkpoint_path=checkpoint_path)
      best_perf = result[key_name]
      global_step = result["global_step"]
    else:
      global_step = -1
      best_perf = -1
      checkpoint_path = None
    writer = tf.gfile.GFile(output_eval_file, "w")

    while global_step < FLAGS.train_step:
      steps_and_files = {}
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
          if cur_filename.split("-")[-1] == "best":
            continue
          gstep = int(cur_filename.split("-")[-1])
          if gstep not in steps_and_files:
            tf.logging.info("Add {} to eval list.".format(cur_filename))
            steps_and_files[gstep] = cur_filename
      tf.logging.info("found {} files.".format(len(steps_and_files)))
      if not steps_and_files:
        tf.logging.info(
            "found 0 files, global step: {}. Sleeping.".format(global_step))
        time.sleep(1)
      else:
        for ele in sorted(steps_and_files.items()):
          step, checkpoint_path = ele
          if global_step >= step:
            if len(_find_valid_cands(step)) > 1:
              for ext in ["meta", "data-00000-of-00001", "index"]:
                src_ckpt = checkpoint_path + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)
            continue
          result = estimator.evaluate(input_fn=eval_input_fn,
                                      steps=eval_steps,
                                      checkpoint_path=checkpoint_path)
          global_step = result["global_step"]
          tf.logging.info("***** Eval results *****")
          for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
          writer.write("best = {}\n".format(best_perf))
          if result[key_name] > best_perf:
            best_perf = result[key_name]
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tgt_ckpt = checkpoint_path.rsplit(
                  "-", 1)[0] + "-best.{}".format(ext)
              tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
              tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
              writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt))
          if len(_find_valid_cands(global_step)) > 1:
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tf.logging.info("removing {}".format(src_ckpt))
              tf.gfile.Remove(src_ckpt)
          writer.write("=" * 50 + "\n")
    writer.close()

  if FLAGS.do_predict:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    num_actual_predict_examples = len(predict_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on.
      while len(predict_examples) % FLAGS.predict_batch_size != 0:
        predict_examples.append(classifier_utils.PaddingInputExample())
    assert len(predict_examples) % FLAGS.predict_batch_size == 0
    predict_steps = int(len(predict_examples) // FLAGS.predict_batch_size)

    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    race_utils.file_based_convert_examples_to_features(
        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
        predict_file, FLAGS.max_qa_length)

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_drop_remainder = True if FLAGS.use_tpu else False
    predict_input_fn = classifier_utils.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder,
        task_name=task_name,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.predict_batch_size,
        multiple=len(label_list))

    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    result = estimator.evaluate(input_fn=predict_input_fn,
                                steps=predict_steps,
                                checkpoint_path=checkpoint_path)

    output_predict_file = os.path.join(FLAGS.output_dir,
                                       "predict_results.txt")
    with tf.gfile.GFile(output_predict_file, "w") as pred_writer:
      # num_written_lines = 0
      tf.logging.info("***** Predict results *****")
      pred_writer.write("***** Predict results *****\n")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        pred_writer.write("%s = %s\n" % (key, str(result[key])))
      pred_writer.write("best = {}\n".format(best_perf))
def main():
  MAX_SEQ_LENGTH, LABELS_LIST, VOCAB_FILE_PATH = get_config("cn")

  credentials = grpc.ssl_channel_credentials(
      root_certificates=ROOT_CERT.encode())
  channel = grpc.secure_channel(
      '{}:{}'.format(MODEL_SERVER_HOST, MODEL_SERVER_PORT), credentials)
  stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

  # Get the input sentence.
  # sentence = str(globals.request.headers.getlist('Text')[0])
  # sentence = globals.request.form.to_dict()
  # "The configuration is very good, with many thoughtful touches that make
  # you feel warm."
  sentence = "配置很不错,有很多的贴心配置,让人感到很温暖"

  # Convert the single sentence to a feature.
  tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE_PATH,
                                         do_lower_case=True)
  example = run_classifier.InputExample(
      guid="test-0",
      text_a=tokenization.convert_to_unicode(sentence),
      text_b=None,
      label=LABELS_LIST[0])
  feature = run_classifier.convert_single_example(0, example, LABELS_LIST,
                                                  MAX_SEQ_LENGTH, tokenizer)

  # Build the model inputs.
  input_ids = np.reshape([feature.input_ids], (1, MAX_SEQ_LENGTH))
  input_mask = np.reshape([feature.input_mask], (1, MAX_SEQ_LENGTH))
  segment_ids = np.reshape([feature.segment_ids], (1, MAX_SEQ_LENGTH))
  label_ids = [feature.label_id]

  # Construct the request to TensorFlow Serving.
  request = predict_pb2.PredictRequest()
  request.model_spec.name = MODEL_NAME
  request.model_spec.signature_name = 'serving_default'
  # Package the inputs into the request; note that the input format must
  # follow the model's serving signature.
  request.inputs['input_ids'].CopyFrom(
      tf.contrib.util.make_tensor_proto(input_ids,
                                        shape=[1, MAX_SEQ_LENGTH],
                                        dtype=tf.int32))
  request.inputs['input_mask'].CopyFrom(
      tf.contrib.util.make_tensor_proto(input_mask,
                                        shape=[1, MAX_SEQ_LENGTH],
                                        dtype=tf.int32))
  request.inputs['label_ids'].CopyFrom(
      tf.contrib.util.make_tensor_proto(label_ids, shape=[1],
                                        dtype=tf.int32))
  request.inputs['segment_ids'].CopyFrom(
      tf.contrib.util.make_tensor_proto(segment_ids,
                                        shape=[1, MAX_SEQ_LENGTH],
                                        dtype=tf.int32))

  # Run the prediction (100-second timeout).
  result = stub.Predict(request, 100, metadata=metadata_transformer())

  # Parse the result.
  probabilities_tensor_proto = result.outputs["probabilities"]
  probabilities = list(probabilities_tensor_proto.float_val)
  probabilities_np = np.array(probabilities)
  top3_index_np = probabilities_np.argsort()[-3:][::-1]
  probabilities_top3 = probabilities_np[top3_index_np]
  label_top3 = np.array(LABELS_LIST)[top3_index_np]
  # shape = tf.TensorShape(probabilities_tensor_proto.tensor_shape)
  # probabilities = np.array(probabilities_tensor_proto.float_val).reshape(
  #     shape.as_list())

  result_list = []
  for index in range(3):
    result_list.append({
        "label": label_top3[index],
        "score": str(probabilities_top3[index])
    })
  output_json = {"predictions": [{"results": result_list}]}
  return Response(json.dumps(output_json), mimetype='application/json')
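# The top-3 extraction above is a common argsort idiom; a tiny illustration
# with made-up probabilities:
def _demo_top3():
  probs = np.array([0.05, 0.6, 0.1, 0.25])
  top3 = probs.argsort()[-3:][::-1]  # indices of the 3 largest, descending
  assert list(top3) == [1, 3, 2]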
def __init__(self, is_training):
  self.is_training = is_training
  self.tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
maxlen = 512
custom_objects = get_custom_objects()
custom_objects["tf"] = tf
model = load_model("../bert_best_finetuned.h5", custom_objects=custom_objects)
with open("../old_complete_output.json_class_labels.txt") as f:
  label_mapping = np.array(json.load(f))
with open("../mesh_mapping.json") as f:
  mesh_mapping = json.load(f)
graph = tf.get_default_graph()
tokenizer = tokenization.FullTokenizer("../biobert_pubmed/vocab.txt",
                                       do_lower_case=False)


def make_multilabel_prediction(abstract):
  abstract = ["[CLS]"] + tokenizer.tokenize(abstract)[0:maxlen - 2] + ["[SEP]"]
  vocab = tokenizer.vocab
  print(abstract)
  token_vectors = np.asarray([vocab[token] for token in abstract] +
                             [0] * (maxlen - len(abstract)))
  # The model expects a list of samples; we only have one.
  token_vectors = np.asarray([token_vectors])
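  # The function is cut off here in the source. A hedged sketch of how it
  # presumably continues -- assuming the Keras model takes
  # [token ids, segment ids] (the keras-bert convention) and that
  # `label_mapping` aligns with the model's sigmoid outputs; the 0.5
  # threshold is an illustrative choice, not from the source.
  segment_vectors = np.zeros_like(token_vectors)  # single-segment input
  with graph.as_default():  # needed when predicting outside the main thread
    probs = model.predict([token_vectors, segment_vectors])[0]
  return label_mapping[probs > 0.5]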
          encoding='utf-8') as detail:
    for idx in range(len(logits_list)):
      item = {}
      item['trans'] = trans_list[idx]
      item['lengths'] = lengths_list[idx]
      item['logit'] = logits_list[idx]
      item['pred'] = y_pred_list[idx]
      item['ldct_list'] = ldct_list[idx]
      detail.write(
          json.dumps(item, ensure_ascii=False, cls=NpEncoder) + '\n')


if __name__ == '__main__':
  config = Config()
  vocab_file = config.vocab_file
  do_lower_case = False
  tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                         do_lower_case=do_lower_case)

  print('Predicting test.txt..........')
  dev_iter = DataIterator(config.batch_size,
                          data_file=result_data_dir + 'test.txt',
                          use_bert=config.use_bert,
                          seq_length=config.sequence_length,
                          is_test=True,
                          tokenizer=tokenizer)
  # print('Predicting dev.txt..........')
  # dev_iter = DataIterator(config.batch_size,
  #                         data_file=result_data_dir + 'dev.txt',
  #                         use_bert=config.use_bert,
  #                         seq_length=config.sequence_length,
  #                         is_test=True, tokenizer=tokenizer)
  set_test(dev_iter, config.checkpoint_path)
def main(_):
  logging.set_verbosity(logging.INFO)

  processors = {"ner": NerProcessor}

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  processor = processors[task_name]()
  label_list = processor.get_labels()
  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                         do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    print("# training examples", len(train_examples))
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(bert_config=bert_config,
                              num_labels=len(label_list),
                              init_checkpoint=FLAGS.init_checkpoint,
                              learning_rate=FLAGS.learning_rate,
                              num_train_steps=num_train_steps,
                              num_warmup_steps=num_warmup_steps,
                              use_tpu=FLAGS.use_tpu,
                              use_one_hot_embeddings=FLAGS.use_tpu)

  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    _, _ = filed_based_convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenizer,
        train_file)
    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", len(train_examples))
    logging.info("  Batch size = %d", FLAGS.train_batch_size)
    logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_eval:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    batch_tokens, batch_labels = filed_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

    logging.info("***** Running evaluation *****")
    logging.info("  Num examples = %d", len(eval_examples))
    logging.info("  Batch size = %d", FLAGS.eval_batch_size)
    # if FLAGS.use_tpu:
    #   eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
    # eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)
    result = estimator.evaluate(input_fn=eval_input_fn)

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with open(output_eval_file, "w", encoding="utf-8") as wf:
      logging.info("***** Eval results *****")
      confusion_matrix = result["confusion_matrix"]
      p, r, f = metrics.calculate(confusion_matrix, len(label_list) - 1)
      logging.info("***********************************************")
      logging.info("********************P = %s*********************", str(p))
      logging.info("********************R = %s*********************", str(r))
      logging.info("********************F = %s*********************", str(f))
      logging.info("***********************************************")

  if FLAGS.do_predict:
    with open(FLAGS.middle_output + '/label2id.pkl', 'rb') as rf:
      label2id = pickle.load(rf)
      id2label = {value: key for key, value in label2id.items()}
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    batch_tokens, batch_labels = filed_based_convert_examples_to_features(
        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
        predict_file)

    logging.info("***** Running prediction *****")
    logging.info("  Num examples = %d", len(predict_examples))
    logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    result = estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
    # If a tag is "X", it belongs to the preceding token; for convenient
    # evaluation with conlleval.pl we simply discard it here.
    Writer(output_predict_file, result, batch_tokens, batch_labels, id2label)
def __init__(self, config):
  self.config = config
  self.max_segment_len = config['max_segment_len']
  self.max_span_width = config["max_span_width"]
  self.genres = {g: i for i, g in enumerate(config["genres"])}
  self.subtoken_maps = {}
  self.gold = {}
  self.eval_data = None  # Load eval data lazily.
  self.eval_test_data = None
  self.bert_config = modeling.BertConfig.from_json_file(
      config["bert_config_file"])
  self.tokenizer = tokenization.FullTokenizer(
      vocab_file=config['vocab_file'], do_lower_case=False)

  input_props = []
  input_props.append((tf.int32, [None, None]))  # Input ids.
  input_props.append((tf.int32, [None, None]))  # Input mask.
  input_props.append((tf.int32, [None]))        # Text lengths.
  input_props.append((tf.int32, [None, None]))  # Speaker ids.
  input_props.append((tf.int32, []))            # Genre.
  input_props.append((tf.bool, []))             # Is training.
  input_props.append((tf.int32, [None]))        # Gold starts.
  input_props.append((tf.int32, [None]))        # Gold ends.
  input_props.append((tf.int32, [None]))        # Cluster ids.
  input_props.append((tf.int32, [None]))        # Sentence map.

  self.queue_input_tensors = [
      tf.placeholder(dtype, shape) for dtype, shape in input_props
  ]
  dtypes, shapes = zip(*input_props)
  queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes)
  self.enqueue_op = queue.enqueue(self.queue_input_tensors)
  self.input_tensors = queue.dequeue()

  self.predictions, self.loss = self.get_predictions_and_loss(
      *self.input_tensors)

  # BERT initialization.
  tvars = tf.trainable_variables()
  # If you're using TF weights only, tf_checkpoint and init_checkpoint can be
  # the same. Get the assignment map from the TensorFlow checkpoint;
  # depending on the extension, use TF or PyTorch to load the weights.
  assignment_map, initialized_variable_names = (
      modeling.get_assignment_map_from_checkpoint(tvars,
                                                  config['tf_checkpoint']))
  init_from_checkpoint = tf.train.init_from_checkpoint if config[
      'init_checkpoint'].endswith('ckpt') else load_from_pytorch_checkpoint
  init_from_checkpoint(config['init_checkpoint'], assignment_map)

  print("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    # tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
    #                 init_string)
    print("  name = %s, shape = %s%s" % (var.name, var.shape, init_string))

  num_train_steps = int(self.config['num_docs'] * self.config['num_epochs'])
  num_warmup_steps = int(num_train_steps * 0.1)
  self.global_step = tf.train.get_or_create_global_step()
  self.train_op, self.bert_lr, self.task_lr = (
      optimization.create_custom_optimizer(
          tvars,
          self.loss,
          self.config['bert_learning_rate'],
          self.config['task_learning_rate'],
          num_train_steps,
          num_warmup_steps,
          False,
          self.global_step,
          freeze=-1,
          task_opt=self.config['task_optimizer'],
          eps=config['adam_eps']))
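# A minimal sketch (not from the source) of how such a PaddingFIFOQueue is
# typically fed: a daemon thread repeatedly runs `enqueue_op` with one
# tensorized document while the training loop dequeues. `next_example` is a
# hypothetical callable returning tensors matching `queue_input_tensors`.
def _start_enqueue_thread(self, sess, next_example):
  import threading

  def _enqueue_loop():
    while True:
      feed = dict(zip(self.queue_input_tensors, next_example()))
      sess.run(self.enqueue_op, feed_dict=feed)

  thread = threading.Thread(target=_enqueue_loop, daemon=True)
  thread.start()
  return thread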
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                         do_lower_case=FLAGS.do_lower_case)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      master=FLAGS.master,
      tpu_config=tf.contrib.tpu.TPUConfig(
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  examples = read_examples(FLAGS.input_file)

  features = convert_examples_to_features(examples=examples,
                                          seq_length=FLAGS.max_seq_length,
                                          tokenizer=tokenizer)

  unique_id_to_feature = {}
  for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      layer_indexes=layer_indexes,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      predict_batch_size=FLAGS.batch_size)

  input_fn = input_fn_builder(features=features,
                              seq_length=FLAGS.max_seq_length)

  with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
                                               "w")) as writer:
    for result in estimator.predict(input_fn, yield_single_examples=True):
      unique_id = int(result["unique_id"])
      feature = unique_id_to_feature[unique_id]
      output_json = collections.OrderedDict()
      output_json["linex_index"] = unique_id
      all_features = []
      for (i, token) in enumerate(feature.tokens):
        all_layers = []
        for (j, layer_index) in enumerate(layer_indexes):
          layer_output = result["layer_output_%d" % j]
          layers = collections.OrderedDict()
          layers["index"] = layer_index
          layers["values"] = [
              round(float(x), 6) for x in layer_output[i:(i + 1)].flat
          ]
          all_layers.append(layers)
        features = collections.OrderedDict()
        features["token"] = token
        features["layers"] = all_layers
        all_features.append(features)
      output_json["features"] = all_features
      writer.write(json.dumps(output_json) + "\n")
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written.")

    ## Answering abilities
    parser.add_argument("--span_extraction",
                        default=False,
                        action='store_true',
                        help="Whether to use span extraction.")
    parser.add_argument("--addition_subtraction",
                        default=False,
                        action='store_true',
                        help="Whether to use addition/subtraction.")
    parser.add_argument("--counting",
                        default=False,
                        action='store_true',
                        help="Whether to use counting.")
    parser.add_argument("--negation",
                        default=False,
                        action='store_true',
                        help="Whether to use negation.")
    # NOTE: with default=True and action='store_true' this flag can never be
    # turned off from the command line; it is effectively always True.
    parser.add_argument("--include_more_numbers",
                        default=True,
                        action='store_true',
                        help="Whether to include more numbers.")
    parser.add_argument("--beam_size",
                        default=3,
                        type=int,
                        help="The size of beam search.")
    parser.add_argument("--max_count",
                        default=4,
                        type=int,
                        help="The maximum number of add_sub expressions.")
    parser.add_argument("--max_answer_number",
                        default=8,
                        type=int,
                        help="The maximum number of answers.")

    ## Other parameters
    parser.add_argument("--do_debug",
                        default=False,
                        action='store_true',
                        help="Whether to run in debug mode.")
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="DROP json for training. E.g., drop_dataset_train.json")
    parser.add_argument("--predict_file",
                        default=None,
                        type=str,
                        help="DROP json for predictions.")
    parser.add_argument(
        "--prediction_dir",
        default=None,
        type=str,
        help="Nitish added: directory for predictions and metrics")
    parser.add_argument("--predictions_json",
                        default=None,
                        type=str,
                        help="Nitish added: file name to write predictions")
    parser.add_argument("--metrics_json",
                        default=None,
                        type=str,
                        help="Nitish added: filename for metrics")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece "
        "tokenization. Sequences longer than this will be truncated, and "
        "sequences shorter than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.05,
        type=float,
        help="Proportion of training to perform linear learning rate warmup "
        "for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--length_heuristic",
                        default=0.05,
                        type=float,
                        help="Weight on length heuristic.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the "
        "nbest_predictions.json output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is "
        "needed because the start and end predictions are not conditioned "
        "on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help="If true, all of the warnings related to data processing will be "
        "printed. A number of warnings are expected for a normal SQuAD "
        "evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--data_parallel",
                        default=False,
                        action='store_true',
                        help="Whether to use data parallel")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a "
        "backward/update pass.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help="Whether to perform optimization and keep the optimizer averages "
        "on CPU")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=128,
        help='Loss scaling; positive power-of-2 values can improve fp16 '
        'convergence.')
    args = parser.parse_args()

    if not args.span_extraction and not args.addition_subtraction \
            and not args.counting and not args.negation:
        raise ValueError(
            "At least one of `span_extraction`, `addition_subtraction`, "
            "`counting` or `negation` must be True.")
    args.answering_abilities = []
    if args.span_extraction:
        args.answering_abilities.append("span_extraction")
    if args.addition_subtraction:
        args.answering_abilities.append("addition_subtraction")
    if args.counting:
        args.answering_abilities.append("counting")
    if args.negation:
        args.answering_abilities.append("negation")
    logger.info("Answering abilities: {}".format(args.answering_abilities))
    # NOTE: despite the "at least one" check above, the current code path
    # requires both span extraction and addition/subtraction.
    assert "span_extraction" in args.answering_abilities and \
        "addition_subtraction" in args.answering_abilities

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logger.info('output_dir: {}'.format(args.output_dir))
    save_path = os.path.join(args.output_dir, 'checkpoint.pth.tar')
    log_path = os.path.join(args.output_dir, 'performance.txt')
    network_path = os.path.join(args.output_dir, 'network.txt')
    parameter_path = os.path.join(args.output_dir, 'parameter.txt')
    f = open(parameter_path, "w")
    for arg in sorted(vars(args)):
        print("{}: {}".format(arg, getattr(args, arg)), file=f)
    f.close()

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")
    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bit training is currently not supported in "
                        "distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info(
        "torch_version: {} device: {} n_gpu: {}, distributed training: {}, "
        "16-bit training: {}".format(torch.__version__, device, n_gpu,
                                     bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # --- Prepare model ---
    logger.info("***** Preparing model *****")
    model = MTMSN(bert_config, args.answering_abilities,
                  args.max_answer_number)
    if args.init_checkpoint is not None and not os.path.isfile(save_path):
        logger.info("Loading model from pretrained checkpoint: {}".format(
            args.init_checkpoint))
        model = bert_load_state_dict(
            model, torch.load(args.init_checkpoint, map_location='cpu'))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank)
    elif n_gpu > 1 or args.data_parallel:
        model = torch.nn.DataParallel(model)

    if os.path.isfile(save_path):
        checkpoint = torch.load(save_path)
        model.load_state_dict(checkpoint['model'])
        logger.info(
            "Loading model from finetuned checkpoint: '{}' (step {}, epoch {})"
            .format(save_path, checkpoint['step'], checkpoint['epoch']))
    f = open(network_path, "w")
    for n, param in model.named_parameters():
        print("name: {}, size: {}, dtype: {}, requires_grad: {}".format(
            n, param.size(), param.dtype, param.requires_grad),
              file=f)
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total trainable parameters: {}".format(total_trainable_params),
          file=f)
    print("Total parameters: {}".format(total_params), file=f)
    f.close()

    # --- Prepare data ---
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)
    train_examples, train_features, num_train_steps = None, None, None
    eval_examples, eval_features = None, None
    if args.do_train:
        logger.info("***** Preparing training *****")
        train_examples, train_features, num_train_steps = read_train_data(
            args, tokenizer, logger)
        logger.info("***** Preparing evaluation *****")
        eval_examples, eval_features = read_eval_data(args, tokenizer, logger)
    if args.do_predict and eval_features is None:
        logger.info("***** Preparing prediction *****")
        eval_examples, eval_features = read_eval_data(args, tokenizer, logger)

    # --- Prepare optimizer ---
    logger.info("***** Preparing optimizer *****")
    if args.fp16:
        param_optimizer = [
            (n, param.clone().detach().to('cpu').float().requires_grad_())
            for n, param in model.named_parameters()
        ]
    elif args.optimize_on_cpu:
        param_optimizer = [
            (n, param.clone().detach().to('cpu').requires_grad_())
            for n, param in model.named_parameters()
        ]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # Match substrings of parameter names: the previous `n not in no_decay`
    # compared the full name against the list and never excluded anything.
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.0
    }]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step, global_epoch = 0, 1
    if os.path.isfile(save_path) and args.do_train:
        checkpoint = torch.load(save_path)
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info(
            "Load optimizer from finetuned checkpoint: '{}' (step {}, epoch {})"
            .format(save_path, checkpoint['step'], checkpoint['epoch']))
        global_step = checkpoint['step']
        global_epoch = checkpoint['epoch'] + 1

    # --- Run training ---
    if args.do_train and global_epoch < int(args.num_train_epochs) + 1:
        logger.info("***** Running training *****")
        best_f1 = 0
        for epoch in range(global_epoch, int(args.num_train_epochs) + 1):
            logger.info("***** Epoch: {} *****".format(epoch))
            global_step, model, best_f1 = run_train_epoch(
                args, global_step, n_gpu, device, model, param_optimizer,
                optimizer, train_examples, train_features, eval_examples,
                eval_features, logger, log_path, save_path, best_f1, epoch)

    # --- Run prediction ---
    if args.do_predict:
        logger.info("***** Running prediction *****")
        # Restore from the best checkpoint.
        if save_path and os.path.isfile(save_path):
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            logger.info(
                "Loading model from finetuned checkpoint: '{}' (step {}, "
                "epoch {})".format(save_path, checkpoint['step'],
                                   checkpoint['epoch']))
            global_step = checkpoint['step']
            torch.save(
                {
                    'model': model.state_dict(),
                    'step': checkpoint['step'],
                    'epoch': checkpoint['epoch']
                }, save_path)
        model.eval()
        metrics = evaluate(args,
                           model,
                           device,
                           eval_examples,
                           eval_features,
                           logger,
                           write_pred=True)
        metrics_path = os.path.join(args.prediction_dir, args.metrics_json)
        f = open(metrics_path, "w")
        metrics_dict = {'f1': metrics['f1'], 'em': metrics['em']}
        json.dump(metrics_dict, f)
        f.close()
        logger.info("Prediction written to: {}".format(metrics_path))
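# The script above keeps everything needed for resumption in one dict; a
# minimal sketch of that save/resume pattern, using the same keys ('model',
# 'optimizer', 'step', 'epoch') this script reads back (the helper names are
# illustrative, not part of the original code):
import os

import torch


def save_progress(save_path, model, optimizer, step, epoch):
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'step': step,
                'epoch': epoch}, save_path)


def maybe_resume(save_path, model, optimizer):
    """Returns (global_step, next_epoch); (0, 1) if no checkpoint exists."""
    if not os.path.isfile(save_path):
        return 0, 1
    ckpt = torch.load(save_path, map_location='cpu')
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    return ckpt['step'], ckpt['epoch'] + 1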
# evaluate tf.logging.info("Precision, Recall and F1-Score...") tf.logging.info( metrics.classification_report(y_test_cls, y_pred_cls, target_names=label_list)) tf.logging.info("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) tf.logging.info(cm) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_TCM.py [train / test]""") tf.logging.set_verbosity(tf.logging.INFO) config = TextConfig() label_list = Processor().get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file, do_lower_case=False) model = TextCNN(config) if sys.argv[1] == 'train': train() elif sys.argv[1] == 'test': test() else: exit()
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    # Make sure the output_dir exists.
    tf.gfile.MakeDirs(FLAGS.output_dir)
    processors = {"ner": NerProcessor}
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True."
        )
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    model_fn = model_fn_builder(
        bert_config=bert_config,
        # +1 because label ids start at 1; id 0 is reserved for padding.
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        filed_based_convert_examples_to_features(train_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            is_eval=False,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which
            # are ignored later on.
            # These do NOT count towards the metric (all tf.metrics support
            # a per-instance weight, and these get a weight of 0.0).
            # Pad to a multiple of eval_batch_size * num_tpu_cores
            # (num_tpu_cores is 8 here).
            while len(eval_examples) % (FLAGS.eval_batch_size *
                                        FLAGS.num_tpu_cores) != 0:
                eval_examples.append(PaddingInputExample())
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(eval_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, eval_file)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
            # The total batch size should be a multiple of 64 (8 per TPU
            # core), and feature dimensions should be a multiple of 128:
            # https://cloud.google.com/tpu/docs/troubleshooting
            # eval_steps = eval_steps // 8 * 8  # solved by padding
        eval_drop_remainder = True if FLAGS.use_tpu else False
        # eval_drop_remainder = False
        if eval_steps is None:
            tf.logging.info("  eval_steps: None")
        else:
            tf.logging.info("  eval_steps = %d", eval_steps)
        tf.logging.info("  eval_drop_remainder = %d", int(eval_drop_remainder))
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            is_eval=True,
            drop_remainder=eval_drop_remainder)
        try:
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps)
        except tf.errors.OutOfRangeError:
            tf.logging.info("OutOfRange error caught")
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_predict:
        # token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                           'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        # if os.path.exists(token_path):
        #     os.remove(token_path)
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which
            # are ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 mode="test")
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            is_eval=False,
            drop_remainder=predict_drop_remainder)
        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(example_col_sep)
                label_token = str(predict_line.label).split(example_col_sep)
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                for id in prediction:
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    # Unclear why, but an "idx out of range" error can occur
                    # here when tokens and labels get out of sync.
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with tf.gfile.GFile(output_predict_file, 'w') as writer:
            result_to_pair(writer)
        with tf.gfile.GFile(output_predict_file, 'r') as reader:
            eval_result = return_report(reader)
            print(eval_result)
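# Several scripts in this file repeat the pad-then-trim pattern for fixed
# TPU batch sizes. A minimal, self-contained sketch of that pattern (the
# helper name is illustrative; the scripts inline it as a while loop):
def pad_for_tpu(examples, batch_size, padding_factory):
    """Appends fake examples until len(examples) % batch_size == 0."""
    examples = list(examples)
    while len(examples) % batch_size != 0:
        examples.append(padding_factory())
    return examples

# Predictions over the padded set are then cut back afterwards, e.g.
#   results = results[:num_actual_predict_examples]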
config.read(os.path.join(directory, 'defaults.cfg')) gpu_config = tf.ConfigProto() gpu_config.gpu_options.allow_growth = True set_session(tf.Session(config=gpu_config)) # concepts concept, smpl_dev_data, dictionary, corpus_dev_sampled = load_concepts( config['terminology']['dict_file']) # mentions corpus_train = load_mentions(config['corpus']['training_file'], 'training corpus') corpus_dev = load_mentions(config['corpus']['development_file'], 'dev corpus') tokenizer = tokenization.FullTokenizer(config['bert']['vocab_file'], do_lower_case=False) # FIXME: only using one concept name per mention positives_training, positives_dev, positives_dev_sampled = load_data( 'data/gitig_positive_indices.pickle') del positives_dev positives_training = [(_, span.lower()) for _, span in positives_training] positives_dev_sampled = [(_, span.lower()) for _, span in positives_dev_sampled] # generators for training and validation instances train_examples = examples(concept, positives_training, tokenizer, config.getint('training', 'neg_count'), config.getint('training', 'mmaxlen'), config.getint('training', 'cmaxlen')) dev_examples = examples(concept, positives_dev_sampled, tokenizer,
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "bc5cdr": BC5CDRProcessor, "clefe": CLEFEProcessor, } # if not FLAGS.do_train and not FLAGS.do_eval: # raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) tf.gfile.MakeDirs(FLAGS.output_dir) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") filed_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") filed_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d", len(eval_examples)) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_steps = None if FLAGS.use_tpu: eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( 
input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.Open(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf: label2id = pickle.load(rf) id2label = {value: key for key, value in label2id.items()} token_path = os.path.join(FLAGS.output_dir, "token_test.txt") if tf.gfile.Exists(token_path): tf.gfile.Remove(token_path) predict_examples = processor.get_test_examples(FLAGS.data_dir) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, mode="test") tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d", len(predict_examples)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) if FLAGS.use_tpu: # Warning: According to tpu_estimator.py Prediction on TPU is an # experimental feature and hence not supported here raise ValueError("Prediction in TPU not supported") predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) prf = estimator.evaluate(input_fn=predict_input_fn, steps=None) output_test_file = os.path.join(FLAGS.output_dir, "test_results.txt") with tf.gfile.Open(output_test_file, "w") as writer: tf.logging.info("***** TEST results *****") for key in sorted(prf.keys()): tf.logging.info(" %s = %s", key, str(prf[key])) writer.write("%s = %s\n" % (key, str(prf[key]))) result = estimator.predict(input_fn=predict_input_fn) result = list(result) output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") with tf.gfile.Open(output_predict_file, 'w') as writer: print(id2label) for prediction in result: output_line = "\n".join(id2label[id] for id in prediction if id != 0) + "\n" writer.write(output_line) output_predict_file = os.path.join(FLAGS.output_dir, "test_labels.txt") output_err_file = os.path.join(FLAGS.output_dir, "test_labels_errs.txt") result_to_pair(predict_examples, result, id2label, output_predict_file, output_err_file) tf.logging.info('Reading: %s', output_predict_file) with tf.gfile.Open(output_predict_file, "r") as f: counts = evaluate(f) eval_result = report_notprint(counts) print(''.join(eval_result)) with tf.gfile.Open( os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'), 'w') as fd: fd.write(''.join(eval_result))
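# result_to_pair and evaluate are defined elsewhere in this repo; assuming
# the conlleval-style "token gold-label predicted-label" layout (one triple
# per line, blank lines between sentences) that the evaluate() call expects,
# a hedged sketch of reading that file back:
def read_conll_style(path):
    """Parses (token, gold, pred) triples; blank lines separate sentences."""
    sentences, current = [], []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                if current:
                    sentences.append(current)
                    current = []
                continue
            token, gold, pred = parts[0], parts[1], parts[2]
            current.append((token, gold, pred))
    if current:
        sentences.append(current)
    return sentences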
] test_documents = [ os.path.join(documents_dir, name + '.json') for name in test_domains ] train_mentions = mentions_dir + '/train.json' val_mentions = mentions_dir + '/val.json' test_mentions = mentions_dir + '/test.json' train_cands = cands_dir + '/train.json' val_cands = cands_dir + '/val.json' test_cands = cands_dir + '/test.json' tf.logging.set_verbosity(tf.logging.INFO) tokenizer = tokenization.FullTokenizer(vocab_file=dict_path, do_lower_case='uncased' in BERT_BASE_DIR) from collections import namedtuple TrainingInstance = namedtuple( 'TrainingInstance', 'mention_context_id mention_context_title ' 'mention_context_tokens cand_tokens mention_start' ' mention_end mention_guid cand_guids label_id') def pad_sequence(tokens, max_len): assert len(tokens) <= max_len return tokens + [0] * (max_len - len(tokens))
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.data_type == "onehop": dataset_class = input_fns.OneHopDataset eval_fn = evaluate.multihop_eval_fn elif FLAGS.data_type == "twohop": dataset_class = input_fns.TwoHopDataset eval_fn = evaluate.multihop_eval_fn elif FLAGS.data_type == "threehop": dataset_class = input_fns.ThreeHopDataset eval_fn = evaluate.multihop_eval_fn elif (FLAGS.data_type == "wikimovie" or FLAGS.data_type == "wikimovie-2hop" or FLAGS.data_type == "wikimovie-3hop"): dataset_class = input_fns.WikiMovieDataset eval_fn = evaluate.wikimovie_eval_fn elif FLAGS.data_type == "hotpotqa": dataset_class = input_fns.HotpotQADataset eval_fn = evaluate.hotpot_eval_fn if FLAGS.model_type == "onehop": create_model_fn = model_fns.create_onehop_model elif FLAGS.model_type == "twohop": create_model_fn = model_fns.create_twohop_model elif FLAGS.model_type == "twohop-cascaded": create_model_fn = model_fns.create_twohopcascade_model elif FLAGS.model_type == "threehop": create_model_fn = functools.partial(model_fns.create_twohop_model, num_hops=3) elif FLAGS.model_type == "threehop-cascaded": create_model_fn = functools.partial( model_fns.create_twohopcascade_model, num_hops=3) elif FLAGS.model_type == "wikimovie": create_model_fn = model_fns.create_wikimovie_model elif FLAGS.model_type == "wikimovie-2hop": create_model_fn = functools.partial(model_fns.create_wikimovie_model, num_hops=2) elif FLAGS.model_type == "wikimovie-3hop": create_model_fn = functools.partial(model_fns.create_wikimovie_model, num_hops=3) elif FLAGS.model_type == "hotpotqa": create_model_fn = functools.partial(model_fns.create_hotpotqa_model, num_hops=FLAGS.num_hops) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) # Load mention and entity files. 
mention2text = json.load( tf.gfile.Open(os.path.join(FLAGS.train_data_dir, "mention2text.json"))) tf.logging.info("Loading metadata about entities and mentions...") entity2id, entity2name = json.load( tf.gfile.Open(os.path.join(FLAGS.train_data_dir, "entities.json"))) entityid2name = {str(i): entity2name[e] for e, i in entity2id.items()} # all_paragraphs = json.load(tf.gfile.Open(os.path.join( # FLAGS.train_data_dir, "subparas.json"))) # all_mentions = np.load(tf.gfile.Open(os.path.join( # FLAGS.train_data_dir, "mentions.npy"))) all_paragraphs = None all_mentions = None qa_config = QAConfig( qry_layers_to_use=FLAGS.qry_layers_to_use, qry_aggregation_fn=FLAGS.qry_aggregation_fn, dropout=FLAGS.question_dropout, qry_num_layers=FLAGS.question_num_layers, projection_dim=FLAGS.projection_dim, load_only_bert=FLAGS.load_only_bert, num_entities=len(entity2id), max_entity_len=FLAGS.max_entity_len, ensure_answer_sparse=FLAGS.ensure_answer_sparse, ensure_answer_dense=FLAGS.ensure_answer_dense, train_with_sparse=FLAGS.train_with_sparse, predict_with_sparse=FLAGS.predict_with_sparse, fix_sparse_to_one=FLAGS.fix_sparse_to_one, supervision=FLAGS.supervision, l2_normalize_db=FLAGS.l2_normalize_db, entity_score_aggregation_fn=FLAGS.entity_score_aggregation_fn, entity_score_threshold=FLAGS.entity_score_threshold, softmax_temperature=FLAGS.softmax_temperature, sparse_reduce_fn=FLAGS.sparse_reduce_fn, intermediate_loss=FLAGS.intermediate_loss, light=FLAGS.light, sparse_strategy=FLAGS.sparse_strategy, train_batch_size=FLAGS.train_batch_size, predict_batch_size=FLAGS.predict_batch_size) mips_config = MIPSConfig(ckpt_path=os.path.join(FLAGS.train_data_dir, "mention_feats"), ckpt_var_name="db_emb", num_mentions=len(mention2text), emb_size=FLAGS.projection_dim * 2, num_neighbors=FLAGS.num_mips_neighbors) validate_flags_or_throw() tf.gfile.MakeDirs(FLAGS.output_dir) if FLAGS.do_train: json.dump( tf.app.flags.FLAGS.flag_values_dict(), tf.gfile.Open(os.path.join(FLAGS.output_dir, "flags.json"), "w")) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, keep_checkpoint_max=8, tpu_config=tf.estimator.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), session_config=tf.ConfigProto(log_device_placement=False)) num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_dataset = dataset_class( in_file=FLAGS.train_file, tokenizer=tokenizer, subject_mention_probability=FLAGS.subject_mention_probability, max_qry_length=FLAGS.max_query_length, is_training=True, entity2id=entity2id, tfrecord_filename=os.path.join(FLAGS.output_dir, "train.tf_record")) num_train_steps = int(train_dataset.num_examples / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) summary_obj = None model_fn = model_fn_builder( bert_config=bert_config, qa_config=qa_config, mips_config=mips_config, init_checkpoint=FLAGS.init_checkpoint, e2m_checkpoint=os.path.join(FLAGS.train_data_dir, "ent2ment.npz"), 
m2e_checkpoint=os.path.join(FLAGS.train_data_dir, "coref.npz"), entity_id_checkpoint=os.path.join(FLAGS.train_data_dir, "entity_ids"), entity_mask_checkpoint=os.path.join(FLAGS.train_data_dir, "entity_mask"), learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, create_model_fn=create_model_fn, summary_obj=summary_obj) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = tf.estimator.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Num orig examples = %d", train_dataset.num_examples) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train(train_dataset, estimator, num_train_steps) if FLAGS.do_predict: eval_dataset = dataset_class(in_file=FLAGS.predict_file, tokenizer=tokenizer, subject_mention_probability=0.0, max_qry_length=FLAGS.max_query_length, is_training=False, entity2id=entity2id, tfrecord_filename=os.path.join( FLAGS.output_dir, "eval.tf_record")) continuous_eval(eval_dataset, estimator, mention2text, entityid2name, qa_config.supervision, eval_fn, paragraphs=all_paragraphs, mentions=all_mentions) if FLAGS.do_test: # Load mention and entity files. mention2text = json.load( tf.gfile.Open( os.path.join(FLAGS.test_data_dir, "mention2text.json"))) entity2id, entity2name = json.load( tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "entities.json"))) entityid2name = {str(i): entity2name[e] for e, i in entity2id.items()} all_paragraphs = json.load( tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "subparas.json"))) all_mentions = np.load( tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "mentions.npy"))) qa_config.num_entities = len(entity2id) mips_config = MIPSConfig(ckpt_path=os.path.join( FLAGS.test_data_dir, "mention_feats"), ckpt_var_name="db_emb", num_mentions=len(mention2text), emb_size=FLAGS.projection_dim * 2, num_neighbors=FLAGS.num_mips_neighbors) model_fn = model_fn_builder( bert_config=bert_config, qa_config=qa_config, mips_config=mips_config, init_checkpoint=FLAGS.init_checkpoint, e2m_checkpoint=os.path.join(FLAGS.test_data_dir, "ent2ment.npz"), m2e_checkpoint=os.path.join(FLAGS.test_data_dir, "coref.npz"), entity_id_checkpoint=os.path.join(FLAGS.test_data_dir, "entity_ids"), entity_mask_checkpoint=os.path.join(FLAGS.test_data_dir, "entity_mask"), learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, create_model_fn=create_model_fn, summary_obj=summary_obj) estimator = tf.estimator.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, predict_batch_size=FLAGS.predict_batch_size) eval_dataset = dataset_class(in_file=FLAGS.test_file, tokenizer=tokenizer, subject_mention_probability=0.0, max_qry_length=FLAGS.max_query_length, is_training=False, entity2id=entity2id, tfrecord_filename=os.path.join( FLAGS.output_dir, "test.tf_record")) if tf.gfile.Exists(os.path.join(FLAGS.output_dir, "best_model.meta")): ckpt_path = os.path.join(FLAGS.output_dir, "best_model") else: ckpt_path = None output_prediction_file = os.path.join(FLAGS.output_dir, "test_predictions.json") metrics = single_eval(eval_dataset, 
estimator, ckpt_path, mention2text, entityid2name, qa_config.supervision, output_prediction_file, eval_fn, paragraphs=all_paragraphs, mentions=all_mentions) with tf.gfile.Open(os.path.join(FLAGS.output_dir, "test_metrics.txt"), "w") as fo: for metric, value in metrics.items(): tf.logging.info("%s: %.4f", metric, value) fo.write("%s %.4f\n" % (metric, value))
def main(_): if not tf.gfile.Exists(FLAGS.multihop_output_dir): tf.gfile.MakeDirs(FLAGS.multihop_output_dir) # Initialize tokenizer. tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) # Read entities. if FLAGS.do_preprocess: tf.logging.info("Reading entities.") entity2id, entity2name = {}, {} with tf.gfile.Open(FLAGS.entity_file) as f: entities = json.load(f) tf.logging.info("Read %d entities", len(entities)) for e, (_, n) in entities.items(): if e.lower() in entity2id: continue # tf.logging.warn("%s entity repeated", e) entity2id[e.lower()] = len(entity2id) entity2name[e.lower()] = n tf.logging.info("Kept %d entities", len(entity2id)) # Read paragraphs, mentions and entities. if FLAGS.do_preprocess: mentions = [] ent_rows, ent_cols, ent_vals = [], [], [] mention2text = {} total_sub_paras = [0] all_sub_paras = [] num_skipped_mentions = 0. tf.logging.info("Reading paragraphs from %s", FLAGS.wiki_file) with tf.gfile.Open(FLAGS.wiki_file) as f: for ii, line in tqdm(enumerate(f)): if ii == FLAGS.max_total_paragraphs: tf.logging.info("Processed maximum number of paragraphs, breaking.") break if ii > 0 and ii % 100000 == 0: tf.logging.info("Skipped / Kept mentions = %.3f", num_skipped_mentions / len(mentions)) orig_para = json.loads(line.strip()) if orig_para["kb_id"].lower() not in entity2id: tf.logging.warn("%s not in entities. Skipping %s para", orig_para["kb_id"], orig_para["title"]) continue sub_para_objs = _get_sub_paras(orig_para, tokenizer, FLAGS.max_seq_length, FLAGS.doc_stride, total_sub_paras) for para_obj in sub_para_objs: # Add mentions from this paragraph. local2global = {} title_entity_mention = None for im, mention in enumerate( para_obj["mentions"][:FLAGS.max_mentions_per_entity]): if mention["kb_id"].lower() not in entity2id: # tf.logging.warn("%s not in entities. Skipping mention %s", # mention["kb_id"], mention["text"]) num_skipped_mentions += 1 continue mention2text[len(mentions)] = mention["text"] local2global[im] = len(mentions) if mention["kb_id"] == orig_para["kb_id"]: title_entity_mention = len(mentions) mentions.append( (entity2id[mention["kb_id"].lower()], para_obj["id"], mention["start_token"], mention["end_token"])) for im, gm in local2global.items(): # entity to mention matrix. ent_rows.append(entity2id[orig_para["kb_id"].lower()]) ent_cols.append(gm) ent_vals.append(1.) if title_entity_mention is not None: ent_rows.append(mentions[gm][0]) ent_cols.append(title_entity_mention) ent_vals.append(1.) 
all_sub_paras.append(para_obj["tokens"]) assert len(all_sub_paras) == total_sub_paras[0], (len(all_sub_paras), total_sub_paras) tf.logging.info("Num paragraphs = %d, Num mentions = %d", total_sub_paras[0], len(mentions)) tf.logging.info("Saving coreference map.") search_utils.write_to_checkpoint( "coref", np.array([m[0] for m in mentions], dtype=np.int32), tf.int32, os.path.join(FLAGS.multihop_output_dir, "coref.npz")) tf.logging.info("Creating entity to mentions matrix.") sp_entity2mention = sp.csr_matrix((ent_vals, (ent_rows, ent_cols)), shape=[len(entity2id), len(mentions)]) tf.logging.info("Num nonzero = %d", sp_entity2mention.getnnz()) tf.logging.info("Saving as ragged tensor %s.", str(sp_entity2mention.shape)) search_utils.write_ragged_to_checkpoint( "ent2ment", sp_entity2mention, os.path.join(FLAGS.multihop_output_dir, "ent2ment.npz")) tf.logging.info("Saving mentions metadata.") np.save( tf.gfile.Open( os.path.join(FLAGS.multihop_output_dir, "mentions.npy"), "w"), np.array(mentions, dtype=np.int64)) json.dump( mention2text, tf.gfile.Open( os.path.join(FLAGS.multihop_output_dir, "mention2text.json"), "w")) tf.logging.info("Saving entities metadata.") json.dump([entity2id, entity2name], tf.gfile.Open( os.path.join(FLAGS.multihop_output_dir, "entities.json"), "w")) tf.logging.info("Saving split paragraphs.") json.dump( all_sub_paras, tf.gfile.Open( os.path.join(FLAGS.multihop_output_dir, "subparas.json"), "w")) # Store entity tokens. if FLAGS.do_preprocess: tf.logging.info("Processing entities.") entity_ids = np.zeros((len(entity2id), FLAGS.max_entity_length), dtype=np.int32) entity_mask = np.zeros((len(entity2id), FLAGS.max_entity_length), dtype=np.float32) num_exceed_len = 0. for entity in tqdm(entity2id): ei = entity2id[entity] entity_tokens = tokenizer.tokenize(entity2name[entity]) entity_token_ids = tokenizer.convert_tokens_to_ids(entity_tokens) if len(entity_token_ids) > FLAGS.max_entity_length: num_exceed_len += 1 entity_token_ids = entity_token_ids[:FLAGS.max_entity_length] entity_ids[ei, :len(entity_token_ids)] = entity_token_ids entity_mask[ei, :len(entity_token_ids)] = 1. tf.logging.info("Saving %d entity ids. %d exceed max-length of %d.", len(entity2id), num_exceed_len, FLAGS.max_entity_length) search_utils.write_to_checkpoint( "entity_ids", entity_ids, tf.int32, os.path.join(FLAGS.multihop_output_dir, "entity_ids")) search_utils.write_to_checkpoint( "entity_mask", entity_mask, tf.float32, os.path.join(FLAGS.multihop_output_dir, "entity_mask")) # Copy BERT checkpoint for future use. if FLAGS.do_preprocess: tf.logging.info("Copying BERT checkpoint.") if tf.gfile.Exists(os.path.join(FLAGS.pretrain_dir, "best_model.index")): bert_ckpt = os.path.join(FLAGS.pretrain_dir, "best_model") else: bert_ckpt = tf.train.latest_checkpoint(FLAGS.pretrain_dir) tf.logging.info("%s.data-00000-of-00001", bert_ckpt) tf.gfile.Copy( bert_ckpt + ".data-00000-of-00001", os.path.join(FLAGS.multihop_output_dir, "bert_init.data-00000-of-00001"), overwrite=True) tf.logging.info("%s.index", bert_ckpt) tf.gfile.Copy( bert_ckpt + ".index", os.path.join(FLAGS.multihop_output_dir, "bert_init.index"), overwrite=True) tf.logging.info("%s.meta", bert_ckpt) tf.gfile.Copy( bert_ckpt + ".meta", os.path.join(FLAGS.multihop_output_dir, "bert_init.meta"), overwrite=True) if FLAGS.do_embed: # Get mention embeddings from BERT. 
bert_ckpt = os.path.join(FLAGS.multihop_output_dir, "bert_init") if not FLAGS.do_preprocess: with tf.gfile.Open( os.path.join(FLAGS.multihop_output_dir, "mentions.npy")) as f: mentions = np.load(f) with tf.gfile.Open( os.path.join(FLAGS.multihop_output_dir, "subparas.json")) as f: all_sub_paras = json.load(f) tf.logging.info("Computing embeddings for %d mentions over %d paras.", len(mentions), len(all_sub_paras)) shard_size = len(mentions) // FLAGS.num_shards bert_predictor = bert_utils_v2.BERTPredictor(tokenizer, bert_ckpt) if FLAGS.my_shard is None: shard_range = range(FLAGS.num_shards + 1) else: shard_range = [FLAGS.my_shard] for ns in shard_range: min_ = ns * shard_size max_ = (ns + 1) * shard_size if min_ >= len(mentions): break if max_ > len(mentions): max_ = len(mentions) min_subp = mentions[min_][1] max_subp = mentions[max_ - 1][1] tf.logging.info("Processing shard %d of %d mentions and %d paras.", ns, max_ - min_, max_subp - min_subp + 1) para_emb = bert_predictor.get_doc_embeddings( all_sub_paras[min_subp:max_subp + 1]) assert para_emb.shape[2] == 2 * FLAGS.projection_dim mention_emb = np.empty((max_ - min_, 2 * bert_predictor.emb_dim), dtype=np.float32) for im, mention in enumerate(mentions[min_:max_]): mention_emb[im, :] = np.concatenate([ para_emb[mention[1] - min_subp, mention[2], :FLAGS.projection_dim], para_emb[mention[1] - min_subp, mention[3], FLAGS.projection_dim:2 * FLAGS.projection_dim] ]) del para_emb tf.logging.info("Saving %d mention features to tensorflow checkpoint.", mention_emb.shape[0]) with tf.device("/cpu:0"): search_utils.write_to_checkpoint( "db_emb_%d" % ns, mention_emb, tf.float32, os.path.join(FLAGS.multihop_output_dir, "mention_feats_%d" % ns)) if FLAGS.do_combine: # Combine sharded DB into one. if FLAGS.shards_to_combine is None: shard_range = range(FLAGS.num_shards + 1) else: shard_range = range(FLAGS.shards_to_combine) with tf.device("/cpu:0"): all_db = [] for i in shard_range: ckpt_path = os.path.join(FLAGS.multihop_output_dir, "mention_feats_%d" % i) reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path) var_to_shape_map = reader.get_variable_to_shape_map() tf.logging.info("Reading %s from %s with shape %s", "db_emb_%d" % i, ckpt_path, str(var_to_shape_map["db_emb_%d" % i])) tf_db = search_utils.load_database("db_emb_%d" % i, var_to_shape_map["db_emb_%d" % i], ckpt_path) all_db.append(tf_db) tf.logging.info("Reading all variables.") session = tf.Session() session.run(tf.global_variables_initializer()) session.run(tf.local_variables_initializer()) np_db = session.run(all_db) tf.logging.info("Concatenating and storing.") np_db = np.concatenate(np_db, axis=0) search_utils.write_to_checkpoint( "db_emb", np_db, tf.float32, os.path.join(FLAGS.multihop_output_dir, "mention_feats"))
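# search_utils.write_to_checkpoint is not shown in this file; purely as a
# sketch of one way a numpy array can be stored as a single variable in a
# TF1 checkpoint (an assumption about its behavior, not the actual
# implementation):
import tensorflow as tf


def write_array_to_checkpoint(var_name, np_db, dtype, checkpoint_path):
    """Saves a numpy array under `var_name` in a TF1 checkpoint."""
    with tf.Graph().as_default():
        tf_db = tf.get_variable(
            var_name, initializer=np_db.astype(dtype.as_numpy_dtype))
        saver = tf.train.Saver([tf_db])
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            saver.save(session, checkpoint_path)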
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "ske_2019": SKE_2019_Multi_Label_Classification_Processor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) # if not FLAGS.do_predict: # raise ValueError( # "`do_predict' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() # label_length = len(label_list) tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) # is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( # cluster=tpu_cluster_resolver, # master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, # tpu_config=tf.contrib.tpu.TPUConfig( # iterations_per_loop=FLAGS.iterations_per_loop, # num_shards=FLAGS.num_tpu_cores, # per_host_input_for_training=is_per_host)) ) # num_train_steps = None # num_warmup_steps = None model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, # learning_rate=FLAGS.learning_rate, # num_train_steps=num_train_steps, # num_warmup_steps=num_warmup_steps, # use_tpu=FLAGS.use_tpu, # use_one_hot_embeddings=FLAGS.use_tpu ) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=False, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, # eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) # if FLAGS.do_predict: # predict_examples = processor.get_test_examples(FLAGS.data_dir) # num_actual_predict_examples = len(predict_examples) # predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") # file_based_convert_examples_to_features(predict_examples, label_list, # FLAGS.max_seq_length, tokenizer, # predict_file) # tf.logging.info("***** Running prediction*****") # tf.logging.info(" Num examples = %d (%d actual, %d padding)", # len(predict_examples), num_actual_predict_examples, # len(predict_examples) - num_actual_predict_examples) # tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_test_data = [ "《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡", "你是最爱词:许常德李素珍/曲:刘天健你的故事写到你离去后为止", "《苏州商会档案丛编第二辑》是2012年华中师范大学出版社出版的图书,作者是马敏、祖苏、肖芃" ] num_actual_predict_examples = len(predict_test_data) # dataset = string_tokenizer( # examples=predict_test_data, # label_list=label_list, # max_seq_length=FLAGS.max_seq_length, # tokenizer=tokenizer) predict_input_fn = string_based_input_fn_builder( data=predict_test_data, seq_length=FLAGS.max_seq_length, label_list=label_list, tokenizer=tokenizer) result = estimator.predict(input_fn=predict_input_fn) num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, prediction) in enumerate(result): print("\n\n prediction:\n{}".format(prediction)) # continue # probabilities = prediction["probabilities"] # if i >= num_actual_predict_examples: # break # output_line_score_value = " ".join( # str(class_probability) # for class_probability in probabilities) + "\n" # predicate_predict = [] # for idx, class_probability in enumerate(probabilities): # if class_probability > 0.5: # predicate_predict.append(label_list[idx]) # output_line_predicate_predict = " ".join( # predicate_predict) + "\n" # predicate_predict_writer.write( # output_line_predicate_predict) # score_value_writer.write(output_line_score_value) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
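# string_based_input_fn_builder is defined elsewhere in this repo; a hedged
# sketch of one way such a builder could work (tokenize the raw strings up
# front and feed them through tf.data; label_list is unused at predict time
# but kept for signature parity -- this is a guess at the interface, not the
# repo's implementation):
import tensorflow as tf


def string_based_input_fn_builder(data, seq_length, label_list, tokenizer):
    all_input_ids, all_input_mask, all_segment_ids = [], [], []
    for text in data:
        tokens = ["[CLS]"] + tokenizer.tokenize(text)[:seq_length - 2] + [
            "[SEP]"
        ]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        pad = [0] * (seq_length - len(input_ids))
        all_input_ids.append(input_ids + pad)
        all_input_mask.append(input_mask + pad)
        all_segment_ids.append([0] * seq_length)

    def input_fn(params):
        d = tf.data.Dataset.from_tensor_slices({
            "input_ids": tf.constant(all_input_ids, dtype=tf.int32),
            "input_mask": tf.constant(all_input_mask, dtype=tf.int32),
            "segment_ids": tf.constant(all_segment_ids, dtype=tf.int32),
        })
        return d.batch(params["batch_size"], drop_remainder=False)

    return input_fn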
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval_dev and not FLAGS.do_eval_test: raise ValueError("At least one of `do_train`, `do_eval_dev` or " "`do_eval_test' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) label_list = ["Yes", "No"] if FLAGS.from_three_class_model: label_list.append("Neutral") tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = get_train() num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features( examples=train_examples, label_list=label_list, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer, output_file=train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) eval_on = [] if FLAGS.do_eval_dev: eval_on.append((get_dev(), "dev")) if FLAGS.do_eval_test: eval_on.append((get_test(), "test")) for eval_examples, name in eval_on: eval_file = os.path.join(FLAGS.output_dir, "%s.tf_record" % name) file_based_convert_examples_to_features( examples=eval_examples, label_list=label_list, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer, output_file=eval_file) tf.logging.info("***** Running %s *****" % name) tf.logging.info(" Num examples = %d", len(eval_examples)) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_steps = None if FLAGS.use_tpu: eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = 
file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "%s_eval_results.txt" % name) with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** %s eval results *****" % name) for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, } if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") filed_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") filed_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d", len(eval_examples)) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. 
eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: # Eval will be slightly WRONG on the TPU because it will truncate # the last batch. eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
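# The comment above notes that TPU eval truncates the last partial batch.
# How many examples get silently dropped is simple arithmetic (this helper
# is illustrative, not part of the script):
def num_dropped_by_truncation(num_examples, eval_batch_size):
    """Examples lost when drop_remainder=True, e.g. 1042 % 32 = 18."""
    return num_examples % eval_batch_size
# It is zero only when the dataset size is a multiple of the eval batch
# size -- hence the PaddingInputExample trick used by the other scripts here.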
def main(args):
  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

  run_config = tf.estimator.RunConfig(
      model_dir=save_model_dir,
      save_summary_steps=SAVE_SUMMARY_STEPS,
      keep_checkpoint_max=1,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

  bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(LABEL_COLUMNS),
      init_checkpoint=BERT_INIT_CHKPNT,
      learning_rate=LEARNING_RATE,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=False,
      use_one_hot_embeddings=False)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"batch_size": BATCH_SIZE})

  train_input_fn = file_based_input_fn_builder(
      input_file=train_tf_record_path,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)
  eval_input_fn = file_based_input_fn_builder(
      input_file=eval_tf_record_path,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)

  if FLAGS.do_train and FLAGS.do_eval:
    print("Beginning training and evaluating!")
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn, max_steps=num_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=None)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  elif FLAGS.do_train:
    print("Beginning training!")
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", datetime.now() - current_time)
  elif FLAGS.do_eval:
    print("Beginning evaluating!")
    current_time = datetime.now()
    # steps=None runs through the entire dataset.
    result = estimator.evaluate(input_fn=eval_input_fn, steps=None)
    print("Evaluation took time ", datetime.now() - current_time)
    output_eval_file = os.path.join("data", "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict:
    predict_sample = (
        "If you have a look back at the source, the information I updated "
        "was the correct form. I can only guess the source hadn't updated. "
        "I shall update the information once again but thank you for your "
        "message.")
    input_sample = InputExample(
        guid=0, text_a=predict_sample, labels=[0, 0, 0, 0, 0, 0])
    tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=BERT_VOCAB, do_lower_case=True)
    feature = convert_single_example(input_sample, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = input_fn_builder([feature], MAX_SEQ_LENGTH, False,
                                        False)
    predictions = estimator.predict(predict_input_fn)
    probabilities = []
    for (i, prediction) in enumerate(predictions):
      preds = prediction["probabilities"]
      probabilities.append(preds)
    print(probabilities[0])
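# The do_predict branch above classifies a single hard-coded sample. A small
# helper like the sketch below (hypothetical, not part of the original
# script) generalizes it to a list of texts, reusing the same InputExample /
# convert_single_example / input_fn_builder pipeline:

def predict_texts(estimator, tokenizer, texts):
  """Returns one probability vector per input text (sketch)."""
  features = []
  for i, text in enumerate(texts):
    # Labels are unused at predict time; all-zero dummies keep the feature
    # shape consistent with training.
    example = InputExample(guid=i, text_a=text,
                           labels=[0] * len(LABEL_COLUMNS))
    features.append(convert_single_example(example, MAX_SEQ_LENGTH, tokenizer))
  predict_input_fn = input_fn_builder(features, MAX_SEQ_LENGTH, False, False)
  return [p["probabilities"] for p in estimator.predict(predict_input_fn)]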
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {
      "cola": ColaProcessor,
      "mnli": MnliProcessor,
      "mrpc": MrpcProcessor,
      "xnli": XnliProcessor,
      "ske": SkeProcessor,
  }

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if (not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict
      and not FLAGS.do_export):
    raise ValueError(
        "At least one of `do_train`, `do_eval`, `do_predict` or `do_export` "
        "must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()
  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    # Reuse a cached TFRecord file if it already exists. tf.gfile.Exists is
    # used instead of os.path.exists so that GCS paths also work on TPU.
    if not tf.gfile.Exists(train_file):
      file_based_convert_examples_to_features(
          train_examples, label_list, FLAGS.max_seq_length, tokenizer,
          train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_eval:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on. These do NOT count towards the metric (all tf.metrics
      # support a per-instance weight, and these get a weight of 0.0).
      while len(eval_examples) % FLAGS.eval_batch_size != 0:
        eval_examples.append(PaddingInputExample())

    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    if not tf.gfile.Exists(eval_file):
      file_based_convert_examples_to_features(
          eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
          eval_file)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
      assert len(eval_examples) % FLAGS.eval_batch_size == 0
      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    num_actual_predict_examples = len(predict_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on.
      while len(predict_examples) % FLAGS.predict_batch_size != 0:
        predict_examples.append(PaddingInputExample())

    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    if not tf.gfile.Exists(predict_file):
      file_based_convert_examples_to_features(
          predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
          predict_file)

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_drop_remainder = True if FLAGS.use_tpu else False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result = estimator.predict(input_fn=predict_input_fn)

    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer:
      num_written_lines = 0
      tf.logging.info("***** Predict results *****")
      for (i, prediction) in enumerate(result):
        probabilities = prediction["probabilities"]
        if i >= num_actual_predict_examples:
          break
        output_line = "\t".join(
            str(class_probability)
            for class_probability in probabilities) + "\n"
        writer.write(output_line)
        num_written_lines += 1
    tf.logging.info("num_written_lines = %d, num_actual_predict_examples = %d",
                    num_written_lines, num_actual_predict_examples)
    assert num_written_lines == num_actual_predict_examples

  if FLAGS.do_export:
    tf.logging.info("Doing saved model export!")
    # TPUEstimator exports a TPU-specific serving graph by default; disable
    # that so a plain CPU/GPU serving graph is written.
    estimator._export_to_tpu = False
    estimator.export_saved_model(FLAGS.export_dir, serving_input_fn)
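# The do_export branch above passes a `serving_input_fn` that is not defined
# in this snippet. A minimal sketch (an assumption, using the same feature
# names that file_based_convert_examples_to_features writes) could be:

def serving_input_fn():
  # Raw placeholders for the serving signature; shapes match the fixed
  # max_seq_length used during training.
  input_ids = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length],
                             name="input_ids")
  input_mask = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length],
                              name="input_mask")
  segment_ids = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length],
                               name="segment_ids")
  label_ids = tf.placeholder(tf.int32, [None], name="label_ids")
  return tf.estimator.export.build_raw_serving_input_receiver_fn({
      "input_ids": input_ids,
      "input_mask": input_mask,
      "segment_ids": segment_ids,
      "label_ids": label_ids,
  })()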
def __init__(self, args, is_training=True, emb_class='glove', use_crf=True):
    self.emb_path = args.emb_path
    # resources (glove, vocab, path, etc.)
    self.embvec = pkl.load(open(self.emb_path, 'rb'))
    self.wrd_dim = args.wrd_dim          # size of word embedding (glove)
    self.chr_dim = 50                    # size of character embedding
    self.pos_dim = 7                     # size of part-of-speech embedding
    self.class_size = len(self.embvec.tag_vocab)  # number of classes (tags)
    self.word_length = args.word_length  # maximum character length of a word for convolution
    self.restore = args.restore          # checkpoint path, if available
    self.use_crf = use_crf               # use CRF decoder or not
    self.emb_class = emb_class           # class of embedding (glove, elmo, bert)
    self.starter_learning_rate = 0.001   # 0.001 (default), 0.0003 (transformer)
    self.decay_steps = 12000
    self.decay_rate = 0.9
    self.clip_norm = 10
    self.keep_prob = 0.7                 # keep probability for dropout
    self.chr_conv_type = 'conv1d'        # conv1d | conv2d
    self.filter_sizes = [3]              # filter sizes
    self.num_filters = 25                # number of filters
    self.rnn_used = True                 # use RNN layer or not
    self.rnn_num_layers = 2              # number of RNN layers
    self.rnn_type = 'fused'              # normal | fused
    self.rnn_size = 200                  # size of RNN hidden unit
    self.tf_used = False                 # use transformer encoder layer or not
    self.tf_num_layers = 4               # number of layers for transformer encoder
    self.tf_keep_prob = 0.8              # keep probability for transformer encoder
    self.tf_mh_num_heads = 4             # number of heads for multi-head attention
    self.tf_mh_num_units = 64            # Q, K, V dimension for multi-head attention
    self.tf_mh_keep_prob = 0.8           # keep probability for multi-head attention
    self.tf_ffn_kernel_size = 3          # conv1d kernel size for feed-forward net
    self.tf_ffn_keep_prob = 0.8          # keep probability for feed-forward net
    self.is_training = is_training
    if self.is_training:
        self.epoch = args.epoch
        self.batch_size = args.batch_size
        self.dev_batch_size = 2 * self.batch_size
        self.checkpoint_dir = args.checkpoint_dir
        self.summary_dir = args.summary_dir
    if self.emb_class == 'elmo':
        from bilm import Batcher, BidirectionalLanguageModel
        # replace with the fixed word length used by the pre-trained ELMo
        # ('max_characters_per_token')
        self.word_length = 50
        # maps text to character ids
        self.elmo_batcher = Batcher(self.embvec.elmo_vocab_path,
                                    self.word_length)
        # biLM graph
        self.elmo_bilm = BidirectionalLanguageModel(
            self.embvec.elmo_options_path, self.embvec.elmo_weight_path)
        self.elmo_keep_prob = 0.8
    if self.emb_class == 'bert':
        from bert import modeling
        from bert import tokenization
        self.bert_config = modeling.BertConfig.from_json_file(
            self.embvec.bert_config_path)
        self.bert_tokenizer = tokenization.FullTokenizer(
            vocab_file=self.embvec.bert_vocab_path,
            do_lower_case=self.embvec.bert_do_lower_case)
        self.bert_init_checkpoint = self.embvec.bert_init_checkpoint
        self.bert_max_seq_length = self.embvec.bert_max_seq_length
        self.bert_keep_prob = 0.8
        # settings modified for BERT
        self.rnn_size = 256
        self.starter_learning_rate = 2e-5
        self.decay_steps = 5000
        self.decay_rate = 0.9
        self.clip_norm = 1.5
        if self.is_training:
            # set dev_batch_size == batch_size
            self.dev_batch_size = self.batch_size
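# A sketch (an assumption, not taken from the original class) of how the
# starter_learning_rate / decay_steps / decay_rate / clip_norm fields above
# are typically wired into a TF1 training op with exponential decay and
# global-norm gradient clipping:

def build_train_op(config, loss, global_step):
    # Exponentially decayed learning rate, stepped every decay_steps.
    learning_rate = tf.train.exponential_decay(
        config.starter_learning_rate, global_step,
        config.decay_steps, config.decay_rate, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads, variables = zip(*optimizer.compute_gradients(loss))
    # Clip gradients by global norm to config.clip_norm.
    grads, _ = tf.clip_by_global_norm(grads, config.clip_norm)
    return optimizer.apply_gradients(zip(grads, variables),
                                     global_step=global_step)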