def load_from_bert(vocab_file, input_file_a, input_file_b, do_lower_case=True,
                   max_seq_length=128, vocab_file1=None, align_file=None,
                   n_max_sent=None, align_punc=False, policy='1to1'):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    tokenizer1 = tokenization.FullTokenizer(
        vocab_file=vocab_file1, do_lower_case=do_lower_case)
    examples = load_bert(input_file_a, input_file_b, n_max_sent=n_max_sent)
    aligns = None
    if align_file:
        aligns = load_aligns(align_file, n_max_sent=n_max_sent, examples=examples,
                             align_punc=align_punc, policy=policy)
        # Length check only applies when alignments were actually loaded;
        # the original asserted unconditionally, which crashed on aligns=None.
        if len(examples) != len(aligns):
            raise ValueError("Number of examples ({}) and alignments ({}) mismatch!".format(
                len(examples), len(aligns)))
    features = convert_bert_examples_to_features(
        examples=examples, seq_length=max_seq_length,
        tokenizer=tokenizer, tokenizer1=tokenizer1, aligns=aligns)
    unique_id_to_feature = {feature.unique_id: feature for feature in features}
    all_input_embs_a = torch.tensor([f.input_embs_a for f in features], dtype=torch.float)
    all_input_embs_b = torch.tensor([f.input_embs_b for f in features], dtype=torch.float)
    all_input_mask_a = torch.tensor([f.input_mask_a for f in features], dtype=torch.long)
    all_input_mask_b = torch.tensor([f.input_mask_b for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_mask_a.size(0), dtype=torch.long)
    if align_file:
        all_align_ids_a = torch.tensor([f.align_ids_a for f in features], dtype=torch.long)
        all_align_ids_b = torch.tensor([f.align_ids_b for f in features], dtype=torch.long)
        all_align_mask = torch.tensor([f.align_mask for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_embs_a, all_input_mask_a,
                                all_input_embs_b, all_input_mask_b,
                                all_align_ids_a, all_align_ids_b, all_align_mask,
                                all_example_index)
    else:
        dataset = TensorDataset(all_input_embs_a, all_input_mask_a,
                                all_input_embs_b, all_input_mask_b,
                                all_example_index)
    return dataset, unique_id_to_feature, features
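# Usage sketch for load_from_bert (all file paths below are placeholders,
# not files shipped with this code): wrap the returned TensorDataset in a
# DataLoader, reinstating the sampler wiring the original left commented out.
from torch.utils.data import DataLoader, SequentialSampler

def _demo_load_from_bert():
    dataset, unique_id_to_feature, features = load_from_bert(
        vocab_file="vocab_a.txt", input_file_a="corpus.a", input_file_b="corpus.b",
        vocab_file1="vocab_b.txt", align_file="corpus.align")
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)
    for batch in loader:
        input_embs_a, input_mask_a, input_embs_b, input_mask_b = batch[:4]
        # align_ids_a, align_ids_b, align_mask, example_index follow in the
        # batch when an alignment file was supplied.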
def convert(vocab_file, sents, batch_size=32, do_lower_case=True,
            max_seq_length=128, local_rank=-1):
    # batch_size and local_rank are accepted for API symmetry but unused here;
    # batching is left to the caller's DataLoader.
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    features = convert_sents_to_features(sents=sents, seq_length=max_seq_length,
                                         tokenizer=tokenizer)
    unique_id_to_feature = {feature.unique_id: feature for feature in features}
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    return dataset, unique_id_to_feature, features
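# Usage sketch for convert (vocab path is a placeholder): tokenize raw
# sentences into a TensorDataset and batch it with a DataLoader.
def _demo_convert():
    from torch.utils.data import DataLoader, SequentialSampler
    sents = ["the quick brown fox", "jumps over the lazy dog"]
    dataset, unique_id_to_feature, features = convert("vocab.txt", sents,
                                                      max_seq_length=32)
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=2)
    for input_ids, input_mask, example_index in loader:
        print(input_ids.shape, input_mask.sum(dim=1))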
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    bpe_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file)
    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))
    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)
    rng = random.Random(FLAGS.random_seed)
    create_training_instances(
        input_files,
        FLAGS.output_file,
        bpe_tokenizer,
        FLAGS.max_seq_length,
        FLAGS.dupe_factor,
        FLAGS.short_seq_prob,
        FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq,
        rng,
    )
def submit(model=None, path="", vocab_file="", use_crf="", label_file="",
           tag_to_index=None):
    """ submit task """
    tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
    data = []
    with open(path) as fin:
        for line in fin:
            if not line.strip():
                continue
            oneline = json.loads(line.strip())
            res = process(model=model, text=oneline["text"], tokenizer_=tokenizer_,
                          use_crf=use_crf, tag_to_index=tag_to_index,
                          vocab=vocab_file)
            data.append(json.dumps({"label": res}, ensure_ascii=False))
    with open("ner_predict.json", "w") as fout:
        fout.write("\n".join(data))
    labels = []
    with open(label_file) as f:
        for label in f:
            labels.append(label.strip())
    get_result(labels, "ner_predict.json", path)
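# Input-format sketch for submit (contents illustrative): `path` is a
# JSON-lines file with one {"text": ...} object per line; predictions come
# back as one {"label": ...} object per line in ner_predict.json.
def _demo_submit_input(tmp_path="demo_ner_input.json"):
    with open(tmp_path, "w") as f:
        f.write(json.dumps({"text": "John lives in Berlin"}, ensure_ascii=False) + "\n")
    return tmp_path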
def do_eval(dataset=None, vocab_file="", eval_json="", load_checkpoint_path="",
            seq_length=384):
    """ do eval """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune checkpoint missing: the evaluation task must load a finetuned model!")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    eval_examples = read_squad_examples(eval_json, False)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=seq_length,
                                                 doc_stride=128,
                                                 max_query_length=64,
                                                 is_training=False,
                                                 output_fn=None,
                                                 verbose_logging=False)
    net = BertSquad(bert_net_cfg, False, 2)
    net.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net, param_dict)
    model = Model(net)
    output = []
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    columns_list = ["input_ids", "input_mask", "segment_ids", "unique_ids"]
    for data in dataset.create_dict_iterator():
        input_data = []
        for i in columns_list:
            input_data.append(Tensor(data[i]))
        input_ids, input_mask, segment_ids, unique_ids = input_data
        start_positions = Tensor([1], mstype.float32)
        end_positions = Tensor([1], mstype.float32)
        is_impossible = Tensor([1], mstype.float32)
        logits = model.predict(input_ids, input_mask, segment_ids,
                               start_positions, end_positions, unique_ids,
                               is_impossible)
        ids = logits[0].asnumpy()
        start = logits[1].asnumpy()
        end = logits[2].asnumpy()
        for i in range(bert_net_cfg.batch_size):
            unique_id = int(ids[i])
            start_logits = [float(x) for x in start[i].flat]
            end_logits = [float(x) for x in end[i].flat]
            output.append(RawResult(unique_id=unique_id,
                                    start_logits=start_logits,
                                    end_logits=end_logits))
    write_predictions(eval_examples, eval_features, output, 20, 30, True,
                      "./predictions.json", None, None)
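# Usage sketch for do_eval (paths, checkpoint name, and create_squad_dataset
# argument values are illustrative assumptions; create_squad_dataset is the
# helper used in run_squad further below): predictions are written to
# ./predictions.json via write_predictions.
def _demo_do_eval():
    ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                              data_file_path="squad_eval.mindrecord",
                              schema_file_path="", is_training=False,
                              do_shuffle=False)
    do_eval(dataset=ds, vocab_file="vocab.txt", eval_json="dev-v1.1.json",
            load_checkpoint_path="squad_finetune.ckpt", seq_length=384)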
def test_eval():
    """Evaluation function for SQuAD task"""
    tokenizer = tokenization.FullTokenizer(vocab_file="./vocab.txt", do_lower_case=True)
    input_file = "dataset/v1.1/dev-v1.1.json"
    eval_examples = read_squad_examples(input_file, False)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=384,
                                                 doc_stride=128,
                                                 max_query_length=64,
                                                 is_training=False,
                                                 output_fn=None,
                                                 verbose_logging=False)
    # Default to device 0 when DEVICE_ID is unset instead of crashing on int(None).
    device_id = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend',
                        device_id=device_id)
    dataset = get_squad_dataset(bert_net_cfg.batch_size, 1)
    net = BertSquad(bert_net_cfg, False, 2)
    net.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(net, param_dict)
    model = Model(net)
    output = []
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    columns_list = ["input_ids", "input_mask", "segment_ids", "unique_ids"]
    for data in dataset.create_dict_iterator():
        input_data = []
        for i in columns_list:
            input_data.append(Tensor(data[i]))
        input_ids, input_mask, segment_ids, unique_ids = input_data
        start_positions = Tensor([1], mstype.float32)
        end_positions = Tensor([1], mstype.float32)
        is_impossible = Tensor([1], mstype.float32)
        logits = model.predict(input_ids, input_mask, segment_ids,
                               start_positions, end_positions, unique_ids,
                               is_impossible)
        ids = logits[0].asnumpy()
        start = logits[1].asnumpy()
        end = logits[2].asnumpy()
        for i in range(bert_net_cfg.batch_size):
            unique_id = int(ids[i])
            start_logits = [float(x) for x in start[i].flat]
            end_logits = [float(x) for x in end[i].flat]
            output.append(RawResult(unique_id=unique_id,
                                    start_logits=start_logits,
                                    end_logits=end_logits))
    write_predictions(eval_examples, eval_features, output, 20, 30, True,
                      "./predictions.json", None, None, False, False)
def submit(model=None, path="", vocab_file="", use_crf="", label_file="",
           tag_to_index=None):
    """ submit task """
    tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
    data = []
    if cfg.schema_file is not None:
        with open(cfg.schema_file, 'r') as f1:
            numRows = json.load(f1)
        up_num = numRows["numRows"]
    else:
        up_num = 600000000000
    num = 0
    # The original snippet wrote per-example results to an undefined handle
    # `f`; opening an explicit output file (name assumed) restores that intent.
    f = open("submit_results.txt", "w")
    with open(path) as fin:
        for line in fin:
            num = num + 1
            if num > up_num:
                break
            if not line.strip():
                continue
            oneline = json.loads(line.strip())
            if cfg.task == 'Classification':
                res = process(model=model, text=oneline["sentence"],
                              tokenizer_=tokenizer_, use_crf=use_crf,
                              tag_to_index=tag_to_index, vocab=vocab_file)
                print("text", oneline["sentence"])
            elif cfg.task == 'NER':
                res = process(model=model, text=oneline["text"],
                              tokenizer_=tokenizer_, use_crf=use_crf,
                              tag_to_index=tag_to_index, vocab=vocab_file)
                print("text", oneline["text"])
            else:
                raise Exception("Task error")
            print("res:", res)
            f.write("result: " + str(res) + '\n')
            # `data` is accumulated but, as in the original, never written out.
            data.append(json.dumps({"label": res}, ensure_ascii=False))
    f.close()
def submit(model=None, path="", vocab_file="", use_crf="", label2id_file=""):
    """ submit task """
    tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
    data = []
    with open(path) as fin:
        for line in fin:
            if not line.strip():
                continue
            oneline = json.loads(line.strip())
            res = process(model=model, text=oneline["text"], tokenizer_=tokenizer_,
                          use_crf=use_crf, label2id_file=label2id_file)
            print("text", oneline["text"])
            print("res:", res)
            data.append(json.dumps({"label": res}, ensure_ascii=False))
    with open("ner_predict.json", "w") as fout:
        fout.write("\n".join(data))
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "chn": data_processors.ChnSentiCorpDataProcessor,
        "lcqmc": data_processors.LCQMCProcessor,
        "xnli": data_processors.XnliProcessor,
        "book_review": data_processors.BookReviewProcessor,
        "shopping": data_processors.ShoppingProcessor,
        "weibo": data_processors.WeiboProcessor,
        "law_qa": data_processors.LawQAProcessor,
        "nlpcc_dbqa": data_processors.NlpccDbqaProcessor,
    }
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")
    if not FLAGS.albert_config_file and not FLAGS.albert_hub_module_handle:
        raise ValueError("At least one of `--albert_config_file` and "
                         "`--albert_hub_module_handle` must be set")
    if FLAGS.albert_config_file:
        albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file)
        if FLAGS.max_seq_length > albert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length %d because the ALBERT model "
                "was only trained up to sequence length %d" %
                (FLAGS.max_seq_length, albert_config.max_position_embeddings))
    else:
        albert_config = None  # Get the config from TF-Hub.
    tf.gfile.MakeDirs(FLAGS.output_dir)
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)
    processor = processors[task_name](FLAGS)
    label_list = processor.get_labels()
    bpe_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.do_train:
        iterations_per_loop = int(min(FLAGS.iterations_per_loop,
                                      FLAGS.save_checkpoints_steps))
    else:
        iterations_per_loop = FLAGS.iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=int(FLAGS.save_checkpoints_steps),
        keep_checkpoint_max=0,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    train_examples = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
    model_fn = classifier_utils.model_fn_builder(
        albert_config=albert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.train_step,
        num_warmup_steps=FLAGS.warmup_step,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        task_name=task_name,
        hub_module=FLAGS.albert_hub_module_handle,
        optimizer=FLAGS.optimizer)
    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
    if FLAGS.do_train:
        cached_dir = FLAGS.cached_dir
        if not cached_dir:
            cached_dir = FLAGS.output_dir
        train_file = os.path.join(cached_dir, task_name + "_train.tf_record")
        classifier_utils.file_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, bpe_tokenizer,
            train_file, task_name)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", FLAGS.train_step)
        train_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step)
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which
            # are ignored later on. These do NOT count towards the metric
            # (all tf.metrics support a per-instance weight, and these get a
            # weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(data_processors.PaddingInputExample())
        cached_dir = FLAGS.cached_dir
        if not cached_dir:
            cached_dir = FLAGS.output_dir
        eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record")
        classifier_utils.file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, bpe_tokenizer,
            eval_file, task_name)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.eval_batch_size)
        best_trial_info_file = os.path.join(FLAGS.output_dir, "best_trial.txt")

        def _best_trial_info():
            """Returns information about which checkpoints have been evaled so far."""
            if tf.gfile.Exists(best_trial_info_file):
                with tf.gfile.GFile(best_trial_info_file, "r") as best_info:
                    global_step, best_metric_global_step, metric_value = (
                        best_info.read().split(":"))
                global_step = int(global_step)
                best_metric_global_step = int(best_metric_global_step)
                metric_value = float(metric_value)
            else:
                metric_value = -1
                best_metric_global_step = -1
                global_step = -1
            tf.logging.info(
                "Best trial info: Step: %s, Best Value Step: %s, Best Value: %s",
                global_step, best_metric_global_step, metric_value)
            return global_step, best_metric_global_step, metric_value

        def _remove_checkpoint(checkpoint_path):
            for ext in ["meta", "data-00000-of-00001", "index"]:
                src_ckpt = checkpoint_path + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        if task_name == "sts-b":
            key_name = "pearson"
        elif task_name == "cola":
            key_name = "matthew_corr"
        elif task_name == "nlpcc_dbqa":
            key_name = "f1_score"
        else:
            key_name = "eval_accuracy"
        global_step, best_perf_global_step, best_perf = _best_trial_info()
        writer = tf.gfile.GFile(output_eval_file, "w")
        while global_step < FLAGS.train_step:
            steps_and_files = {}
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                    if cur_filename.split("-")[-1] == "best":
                        continue
                    gstep = int(cur_filename.split("-")[-1])
                    if gstep not in steps_and_files:
                        tf.logging.info("Add {} to eval list.".format(cur_filename))
                        steps_and_files[gstep] = cur_filename
            tf.logging.info("found {} files.".format(len(steps_and_files)))
            if not steps_and_files:
                tf.logging.info(
                    "found 0 file, global step: {}. Sleeping.".format(global_step))
                time.sleep(60)
            else:
                for checkpoint in sorted(steps_and_files.items()):
                    step, checkpoint_path = checkpoint
                    if global_step >= step:
                        if (best_perf_global_step != step and
                                len(_find_valid_cands(step)) > 1):
                            _remove_checkpoint(checkpoint_path)
                        continue
                    result = estimator.evaluate(
                        input_fn=eval_input_fn,
                        steps=eval_steps,
                        checkpoint_path=checkpoint_path)
                    global_step = result["global_step"]
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write("best = {}\n".format(best_perf))
                    if result[key_name] > best_perf:
                        best_perf = result[key_name]
                        best_perf_global_step = global_step
                    elif len(_find_valid_cands(global_step)) > 1:
                        _remove_checkpoint(checkpoint_path)
                    writer.write("=" * 50 + "\n")
                    writer.flush()
                    with tf.gfile.GFile(best_trial_info_file, "w") as best_info:
                        best_info.write("{}:{}:{}".format(
                            global_step, best_perf_global_step, best_perf))
        writer.close()
        for ext in ["meta", "data-00000-of-00001", "index"]:
            src_ckpt = "model.ckpt-{}.{}".format(best_perf_global_step, ext)
            tgt_ckpt = "model.ckpt-best.{}".format(ext)
            tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
            tf.io.gfile.rename(os.path.join(FLAGS.output_dir, src_ckpt),
                               os.path.join(FLAGS.output_dir, tgt_ckpt),
                               overwrite=True)
    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which
            # are ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(data_processors.PaddingInputExample())
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        classifier_utils.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, bpe_tokenizer,
            predict_file, task_name)
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.predict_batch_size)
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        result = estimator.predict(input_fn=predict_input_fn,
                                   checkpoint_path=checkpoint_path)
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as pred_writer, \
                tf.gfile.GFile(output_submit_file, "w") as sub_writer:
            sub_writer.write("index" + "\t" + "prediction\n")
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, (example, prediction)) in enumerate(zip(predict_examples, result)):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                pred_writer.write(output_line)
                if task_name != "sts-b":
                    actual_label = label_list[int(prediction["predictions"])]
                else:
                    actual_label = str(prediction["predictions"])
                sub_writer.write(example.guid + "\t" + actual_label + "\n")
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
        truth_json_dir = os.path.join(FLAGS.data_dir, "test.json")
        pred_tsv_dir = output_submit_file
        accuracy, results = cal_metrics(pred_tsv_dir, truth_json_dir)
        tf.logging.info("***** Predict metrics *****")
        tf.logging.info("accuracy: %f" % accuracy)
        tf.logging.info("results: ")
        tf.logging.info(json.dumps(results, ensure_ascii=False, indent=2))
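# Format sketch: best_trial.txt (written in the eval loop above) persists
# progress as a single colon-separated line,
# "global_step:best_metric_global_step:metric_value". A minimal round trip
# matching what _best_trial_info() parses (values illustrative):
def _demo_best_trial_round_trip(path="best_trial.txt"):
    with open(path, "w") as best_info:
        best_info.write("{}:{}:{}".format(2000, 1500, 0.8213))
    with open(path) as best_info:
        step, best_step, value = best_info.read().split(":")
    return int(step), int(best_step), float(value)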
def run_squad():
    """run squad task"""
    parser = argparse.ArgumentParser(description="run squad")
    parser.add_argument("--device_target", type=str, default="Ascend",
                        choices=["Ascend", "GPU"],
                        help="Device type, default is Ascend")
    parser.add_argument("--do_train", type=str, default="false",
                        choices=["true", "false"],
                        help="Enable train, default is false")
    parser.add_argument("--do_eval", type=str, default="false",
                        choices=["true", "false"],
                        help="Enable eval, default is false")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--epoch_num", type=int, default=3,
                        help="Epoch number, default is 3.")
    parser.add_argument("--num_class", type=int, default=2,
                        help="The number of class, default is 2.")
    parser.add_argument("--train_data_shuffle", type=str, default="true",
                        choices=["true", "false"],
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle", type=str, default="false",
                        choices=["true", "false"],
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--train_batch_size", type=int, default=32,
                        help="Train batch size, default is 32")
    parser.add_argument("--eval_batch_size", type=int, default=1,
                        help="Eval batch size, default is 1")
    parser.add_argument("--vocab_file_path", type=str, default="",
                        help="Vocab file path")
    parser.add_argument("--eval_json_path", type=str, default="",
                        help="Evaluation json file path, can be eval.json")
    parser.add_argument("--save_finetune_checkpoint_path", type=str, default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--train_data_file_path", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_file_path", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    epoch_num = args_opt.epoch_num
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path
    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower() == "false":
        raise ValueError("At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower() == "true" and args_opt.train_data_file_path == "":
        raise ValueError("'train_data_file_path' must be set when do finetune task")
    if args_opt.do_eval.lower() == "true":
        if args_opt.vocab_file_path == "":
            raise ValueError("'vocab_file_path' must be set when do evaluation task")
        if args_opt.eval_json_path == "":
            raise ValueError("'eval_json_path' must be set when do evaluation task")
    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                            device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only support fp32 temporarily, run with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")
    netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)
    if args_opt.do_train.lower() == "true":
        ds = create_squad_dataset(
            batch_size=args_opt.train_batch_size, repeat_count=1,
            data_file_path=args_opt.train_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path,
                 save_finetune_checkpoint_path, epoch_num)
    if args_opt.do_eval.lower() == "true":
        if save_finetune_checkpoint_path == "":
            load_finetune_checkpoint_dir = _cur_dir
        else:
            load_finetune_checkpoint_dir = make_directory(save_finetune_checkpoint_path)
        # Note: picking the newest checkpoint relies on the training branch
        # above having run in this process, so that `ds` is defined.
        load_finetune_checkpoint_path = LoadNewestCkpt(load_finetune_checkpoint_dir,
                                                       ds.get_dataset_size(),
                                                       epoch_num, "squad")
        from src import tokenization
        from src.create_squad_data import read_squad_examples, convert_examples_to_features
        from src.squad_get_predictions import write_predictions
        from src.squad_postprocess import SQuad_postprocess
        tokenizer = tokenization.FullTokenizer(
            vocab_file=args_opt.vocab_file_path, do_lower_case=True)
        eval_examples = read_squad_examples(args_opt.eval_json_path, False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=bert_net_cfg.seq_length,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            output_fn=None,
            vocab_file=args_opt.vocab_file_path)
        ds = create_squad_dataset(
            batch_size=args_opt.eval_batch_size, repeat_count=1,
            data_file_path=eval_features,
            schema_file_path=args_opt.schema_file_path,
            is_training=False,
            do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        outputs = do_eval(ds, load_finetune_checkpoint_path, args_opt.eval_batch_size)
        all_predictions = write_predictions(eval_examples, eval_features, outputs,
                                            20, 30, True)
        SQuad_postprocess(args_opt.eval_json_path, all_predictions,
                          output_metrics="output.json")
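# Invocation sketch for run_squad (script name and paths are placeholders;
# flags take the string values "true"/"false" per the argparse choices above):
#   python run_squad.py --device_target Ascend --do_train true --do_eval true \
#       --vocab_file_path vocab.txt --eval_json_path dev-v1.1.json \
#       --train_data_file_path train.mindrecord \
#       --load_pretrain_checkpoint_path bert_base.ckpt \
#       --save_finetune_checkpoint_path ./ckpt/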
# Tail of proc_single_sent (the head of this per-character cleaning helper is
# missing from this excerpt; the commented call below names it).
        else:
            sent_new_ += char_
    # drop redundant blanks
    sent_new_ = drop_extra_blank(sent_new_)
    return sent_new_


def tokenize_single_sent(sent, tokenizer=None):
    # sent = proc_single_sent(sent)
    line_seg = tokenizer.tokenize(sent)
    return line_seg


if __name__ == "__main__":
    bpe_tokenizer = tokenization.FullTokenizer(
        vocab_file="data_proc/tokenizers/sentencepiece/char_no_space-21128-clean.vocab",
        do_lower_case=True,
        spm_model_file="data_proc/tokenizers/sentencepiece/char_no_space-21128-clean.model")
    sent = "我喜欢篮球"  # "I like basketball"
    line_seg = tokenize_single_sent(sent, tokenizer=bpe_tokenizer)
    line_seg = ["[CLS]"] + line_seg + ["[SEP]"]
    print(line_seg)
    print(" ".join(line_seg))
    # expected output: [CLS] ▁我 喜欢 篮球 [SEP]