def process_inputs(input_data):
    bert_config = modeling.BertConfig.from_json_file(rs.FLAGS.bert_config_file)
    eval_examples = read_squad_data(input_data, is_training=False)
    eval_features = []
    eval_writer = rs.FeatureWriter(
        filename=os.path.join("./colab_output", "train.tf_record"),
        is_training=False)

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    # Set up the tokenizer with the vocab file and the do_lower_case flag.
    tokenizer = tokenization.FullTokenizer(
        vocab_file=rs.FLAGS.vocab_file, do_lower_case=rs.FLAGS.do_lower_case)

    rs.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=rs.FLAGS.max_seq_length,
        doc_stride=rs.FLAGS.doc_stride,
        max_query_length=rs.FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    return eval_examples, eval_features
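For reference, a minimal sketch of how process_inputs might be called. The SQuAD-style payload below is only an assumed shape (read_squad_data is expected to take SQuAD-format JSON); the field values and the qas id are illustrative, not taken from this project.

# Hypothetical SQuAD-style payload; the exact schema expected by
# read_squad_data is an assumption, not confirmed by the code above.
sample_input = {
    "data": [{
        "paragraphs": [{
            "context": "BERT was published by researchers at Google in 2018.",
            "qas": [{"id": "sample-qid-1",
                     "question": "Who published BERT?"}]
        }]
    }]
}

eval_examples, eval_features = process_inputs(sample_input)
print(len(eval_examples), len(eval_features))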
def mrc():
    data_from_post = getData()
    data = preprocess_data(data_from_post)

    eval_writer = mainfile.FeatureWriter(
        filename=os.path.join(mrc_inference_config["output_dir"], "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    mainfile.convert_examples_to_features(
        examples=data,
        tokenizer=tokenizer,
        max_seq_length=mrc_inference_config["max_seq_length"],
        doc_stride=mrc_inference_config["doc_stride"],
        max_query_length=mrc_inference_config["max_query_length"],
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    predict_input_fn = mainfile.input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=mrc_inference_config["max_seq_length"],
        is_training=False,
        drop_remainder=False)

    all_results = []
    for result in estimator.predict(predict_input_fn, yield_single_examples=True):
        unique_id = int(result["unique_ids"])
        start_logits = [float(x) for x in result["start_logits"].flat]
        end_logits = [float(x) for x in result["end_logits"].flat]
        all_results.append(
            mainfile.RawResult(unique_id=unique_id,
                               start_logits=start_logits,
                               end_logits=end_logits))

    # Positional arguments: n_best_size=20, max_answer_length, do_lower_case=True,
    # then the three output file paths (None, since the answers are used in memory).
    answer = mainfile.write_predictions(
        data, eval_features, all_results, 20,
        mrc_inference_config["max_answer_length"], True, None, None, None)

    return sendResponse({"Answer": answer.get(data_from_post.get("qas_id"))})
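To make the request/response contract of this endpoint concrete, here is a hypothetical example. getData() and sendResponse() are framework helpers whose exact behavior is not shown, and the field names below are assumptions inferred from the data_from_post.get("qas_id") call above.

# Hypothetical request body as getData() might return it.
example_post_body = {
    "qas_id": "sample-qid-1",
    "question": "Who published BERT?",
    "context": "BERT was published by researchers at Google in 2018.",
}

# write_predictions is expected to return a {qas_id: answer_text} mapping,
# so the endpoint would answer with something like:
example_response = {"Answer": "researchers at Google"}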
def do_predict(self, json_data):
    eval_examples = self.read_squad_examples(input_data=json_data, is_training=False)

    eval_writer = run_squad.FeatureWriter(
        filename=os.path.join(run_squad.FLAGS.output_dir, "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    run_squad.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=self.tokenizer,
        max_seq_length=run_squad.FLAGS.max_seq_length,
        doc_stride=run_squad.FLAGS.doc_stride,
        max_query_length=run_squad.FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", run_squad.FLAGS.predict_batch_size)

    predict_input_fn = run_squad.input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=run_squad.FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    # If running eval on the TPU, you will need to specify the number of steps.
    all_results = []
    for result in self.estimator.predict(predict_input_fn, yield_single_examples=True):
        if len(all_results) % 1000 == 0:
            tf.logging.info("Processing example: %d" % (len(all_results)))
        unique_id = int(result["unique_ids"])
        start_logits = [float(x) for x in result["start_logits"].flat]
        end_logits = [float(x) for x in result["end_logits"].flat]
        all_results.append(
            run_squad.RawResult(unique_id=unique_id,
                                start_logits=start_logits,
                                end_logits=end_logits))

    output_prediction_file = os.path.join(run_squad.FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(run_squad.FLAGS.output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(run_squad.FLAGS.output_dir, "null_odds.json")

    return self.write_predictions(
        eval_examples, eval_features, all_results,
        run_squad.FLAGS.n_best_size, run_squad.FLAGS.max_answer_length,
        run_squad.FLAGS.do_lower_case, output_prediction_file,
        output_nbest_file, output_null_log_odds_file)
num_train_steps = int(
    len(train_examples) / global_batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)

# Pre-shuffle the input to avoid having to make a very large shuffle
# buffer in the `input_fn`.
rng = random.Random(12345)
rng.shuffle(train_examples)

start_index = 0
end_index = len(train_examples)
tmp_filenames = os.path.join(output_dir, "train.tf_record")

# We write to a temporary file to avoid storing very large constant tensors
# in memory.
train_writer = run_squad.FeatureWriter(filename=tmp_filenames, is_training=True)

# run_squad.convert_examples_to_features(
#     examples=train_examples[start_index:end_index],
#     tokenizer=tokenizer,
#     max_seq_length=max_seq_length,
#     doc_stride=doc_stride,
#     max_query_length=max_query_length,
#     is_training=True,
#     output_fn=train_writer.process_feature)
# train_writer.close()

#%%
tf.compat.v1.logging.info("***** Running training *****")
tf.compat.v1.logging.info("  Num orig examples = %d", end_index - start_index)
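As a quick sanity check on the step arithmetic above, a small worked example; the example count and hyperparameters here are illustrative, not taken from this run.

# Illustrative numbers only: with ~88,000 training examples, a global batch
# size of 32, 2 epochs, and a warmup proportion of 0.1:
#   num_train_steps  = int(88000 / 32 * 2)   -> 5500
#   num_warmup_steps = int(5500 * 0.1)       -> 550
assert int(88000 / 32 * 2) == 5500
assert int(int(88000 / 32 * 2) * 0.1) == 550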
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = rs.modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    rs.validate_flags_or_throw(bert_config)
    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = rs.tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = rs.read_squad_examples(
            input_file=FLAGS.train_file, is_training=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

    model_fn = rs.model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # We write to a temporary file to avoid storing very large constant
        # tensors in memory.
        train_writer = rs.FeatureWriter(
            filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
            is_training=True)
        rs.convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=True,
            output_fn=train_writer.process_feature)
        train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", len(train_examples))
        tf.logging.info("  Num split examples = %d", train_writer.num_features)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        del train_examples

        train_input_fn = rs.input_fn_builder(
            input_file=train_writer.filename,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_predict:
        eval_examples = rs.read_squad_examples(
            input_file=FLAGS.predict_file, is_training=False)

        act_seq_len = get_act_seq_len(eval_examples, tokenizer,
                                      FLAGS.max_seq_length,
                                      FLAGS.doc_stride,
                                      FLAGS.max_query_length)

        eval_writer = rs.FeatureWriter(
            filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
            is_training=False)
        eval_features = []

        def append_feature(feature):
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        rs.convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=False,
            output_fn=append_feature)
        eval_writer.close()

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num orig examples = %d", len(eval_examples))
        tf.logging.info("  Num split examples = %d", len(eval_features))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = rs.input_fn_builder(
            input_file=eval_writer.filename,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # If running eval on the TPU, you will need to specify the number of
        # steps.
        all_results = []
        for idx, result in enumerate(estimator.predict(
                predict_input_fn, yield_single_examples=True)):
            if len(all_results) % 1000 == 0:
                tf.logging.info("Processing example: %d" % (len(all_results)))
            unique_id = int(result["unique_ids"])
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                rs.RawResult(
                    unique_id=unique_id,
                    # Truncate the logits to the actual (non-padded) sequence length.
                    start_logits=start_logits[:act_seq_len[idx]],
                    end_logits=end_logits[:act_seq_len[idx]]))

        output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")

        rs.write_predictions(eval_examples, eval_features, all_results,
                             FLAGS.n_best_size, FLAGS.max_answer_length,
                             FLAGS.do_lower_case, output_prediction_file,
                             output_nbest_file, output_null_log_odds_file)
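After write_predictions runs, predictions.json holds a mapping from each qas_id to its best answer text (the standard run_squad output layout). A minimal sketch of reading it back, assuming the same FLAGS.output_dir used above:

import json
import os

# Minimal sketch: load the best-answer map written by rs.write_predictions.
with open(os.path.join(FLAGS.output_dir, "predictions.json")) as f:
    predictions = json.load(f)

for qas_id, answer_text in list(predictions.items())[:5]:
    print(qas_id, "->", answer_text)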