Example #1
def report_results(strings,
                   predicts,
                   goldens,
                   id_to_char,
                   id_to_tag,
                   output_path,
                   verbose=False):
    results = []
    for i in range(len(strings)):
        result = []
        string = [x for x in strings[i]]
        pred = iobes_iob([id_to_tag[int(x)] for x in predicts[i]])
        gold = iobes_iob([id_to_tag[int(x)] for x in goldens[i]])
        for char, g, p in zip(string, gold, pred):
            result.append(" ".join([char, g, p]))
        results.append(result)

    with codecs.open(output_path, 'w', 'utf-8') as f:
        for sentence in results:
            for line in sentence:
                f.write(line + '\n')
            f.write('\n')

    eval_lines = return_report(output_path)

    if verbose:
        for line in eval_lines[1:]:
            print(line.strip())

    f1 = float(eval_lines[1].strip().split()[-1])
    return f1
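All of these snippets share one output convention: a "char gold pred" triple per line and a blank line between sentences, which is the format the CoNLL-2000 conlleval script (wrapped by return_report in these examples) expects. A minimal, self-contained sketch of that write step, with the helper name, file name and demo data invented purely for illustration:

def write_conll_blocks(results, output_file):
    # results: list of sentences, each a list of "char gold pred" strings
    with open(output_file, "w", encoding="utf-8") as f:
        for block in results:
            for line in block:
                f.write(line + "\n")
            f.write("\n")  # blank line terminates a sentence

# Two one-token sentences written to a throwaway file:
write_conll_blocks([["我 B-PER B-PER"], ["京 B-LOC O"]], "demo_predict.utf8")
# eval_lines = return_report("demo_predict.utf8")  # conlleval wrapper, assumed importable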
Example #2
def result_write_evaluate(results, path, name, size_train_data):
    """
    将对验证集的预测识别结果写入到原数据中并进行输出,然后计算识别的性能;将对测试集的预测识别结果写入到原数据中并进行输出
    :param results:
    :param path:
    :param name:
    :return:
    """
    if name == "dev":
        # output_file = os.path.join(path, "3000_predict_dev.utf8")
        # output_file = os.path.join(path, "5000_predict_dev.utf8")
        # output_file = os.path.join(path, "7000_predict_dev.utf8")
        # output_file = os.path.join(path, "10000_predict_dev.utf8")
        output_file = os.path.join(path,
                                   str(size_train_data) + "_predict_dev.utf8")
        with open(output_file, "w", encoding="utf8") as f:
            to_write = []
            for block in results:
                for line in block:
                    to_write.append(line + "\n")
                to_write.append("\n")
            f.writelines(to_write)
        eval_lines = return_report(output_file)
        return eval_lines
    elif name == "test":
        output_file = os.path.join(path, "ner_predict_test.utf8")
        with open(output_file, "w", encoding="utf8") as f:
            to_write = []
            for block in results:
                for line in block:
                    to_write.append(line + "\n")
                to_write.append("\n")
            f.writelines(to_write)
Example #3
def test_ner(results, path, filename):
    output_file = os.path.join(path, filename)
    with open(output_file, "w", encoding='utf-8') as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #4
def evaluate_results(results, result_path):
    output_file = os.path.join(result_path, "ner_predict.utf8")
    with open(output_file, "w", encoding="utf8") as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")
        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #5
def test_ner(results, path):
    output_file = os.path.join(path, "ner_predict.utf8")
    with open(output_file, "w", encoding='utf8') as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")
        f.writelines(to_write)
    # Return the evaluation report
    eval_lines = return_report(output_file)
    return eval_lines
Example #6
def evaluate_ner(results, conf):
    with open(conf.result_file, "w", encoding='utf-8') as f:
        to_write = []
        for block in results:
            print(block)
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")
        f.writelines(to_write)
    eval_lines = return_report(conf.result_file)
    for line in eval_lines:
        print(line)
    f1 = float(eval_lines[1].strip().split()[-1])
    return f1
Example #7
    def report_ner(self, results, output_file):
        """
        Run perl script to evaluate model
        """
        with open(output_file, "w", encoding='utf8') as f:
            to_write = []
            for block in results:
                for line in block:
                    to_write.append(line + "\n")
                to_write.append("\n")

            f.writelines(to_write)
        eval_lines = return_report(output_file)
        return eval_lines
Example #8
def test_ner(results, path):
    """
    Run perl script to evaluate model
    """
    output_file = os.path.join(path, "ner_predict.utf8")
    with open(output_file, "w") as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")
        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #9
def test_srl(results, path):
	"""
	Evaluate using the perl script
	"""
	output_file = os.path.join(path, "srl_predict.utf8")
	with open(output_file, "w", encoding='utf-8') as f:
		to_write = []
		for block in results:
			for line in block:
				to_write.append(line + "\n")
			to_write.append("\n")

		f.writelines(to_write)
	eval_lines = return_report(output_file)
	return eval_lines
Example #10
def test_skill(results, path):
    """
    Run perl script to evaluate model
    """
    output_file = os.path.join(path, "skill_predict.utf8")
    with codecs.open(output_file, "w", encoding="utf-8") as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(to_write)
    eval_lines, results_skill = return_report(output_file)
    return eval_lines, results_skill
Example #11
def test_ner(results, path):
    """
    Report the performance.
    """
    output_file = os.path.join(path, 'Brands_ner_predict.utf8')
    with open(output_file, 'w', encoding='utf-8') as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + '\n')
            to_write.append('\n')

        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #12
def test_ner(results, path):
    """
    Run perl script to evaluate model
    """
    output_file = path + "_predict.utf8"
    with open(output_file, "w", encoding='utf8') as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #13
def test_ner(results, path):
    """
    Run perl script to evaluate model
    """
    output_file = os.path.join(path, "ner_predict.utf8")
    with open(output_file, "w") as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #14
def test_ner(results, path):
    """
    Evaluate the model on the validation set
    """
    output_file = os.path.join(path, "ner_predict.utf8")
    with open(output_file, "w") as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #15
def test_ner(results, path, epoch, name):
    """
    Run perl script to evaluate model
    """
    output_file = os.path.join(path,
                               "result_" + str(epoch) + "_" + name + ".txt")
    with codecs.open(output_file, "w", 'utf8') as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #16
def test_ner(results, path):
    """
    results dimension: (number of eval/test samples) x LenSentence; each element: "char gold_tag pred_tag"
    """
    output_file = os.path.join(path, "ner_predict.utf8")
    with open(output_file, "w") as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(
            to_write)  # to_write dimension: (total number of chars in the samples) x 1; each item is the str "char gold_tag pred_tag"
    eval_lines = return_report(output_file)
    return eval_lines
Example #17
def test_ner(results, path):
    """
    用CoNLL-2000的实体识别评估脚本来评估模型
    """
    """ 用CoNLL-2000的脚本,需要把预测结果保存为文件,再读取 """
    output_file = os.path.join(path, "ner_predict.utf8")
    with open(output_file, "w", encoding='utf8') as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #18
def test_ner(results, path):
    """
    Run perl script to evaluate model
    """
    output_file = os.path.join(path, "ner_predict.utf8")
    # Write the validation results to a file
    with open(output_file, "w", encoding='utf8') as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(to_write)
    eval_lines = return_report(output_file)  # compute metrics such as accuracy and F1
    return eval_lines
Example #19
def test_ner(results, path):
    """
    :param results:
    :param path:
    :return:
    """
    output_file = os.path.join(path, 'ner_predict.utf8')
    with codecs.open(output_file, "w", encoding="utf-8") as f_write:
        to_write = []
        for line in results:
            for iner_line in line:
                to_write.append(iner_line + "\n")
            to_write.append("\n")
        f_write.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #20
def test_ner(results, path):
    """
    :param results:
    :param path:
    :return:
    """
    output_file = os.path.join(path, 'ner_predict.utf8')
    with open(output_file, 'w', encoding='UTF-8') as f:
        to_write = []
        for line in results:
            for iner_line in line:
                to_write.append(iner_line + '\n')
            to_write.append('\n')
        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #21
def test_ner(results, path):
    """
    Run perl script to evaluate model
    """
    output_file = os.path.join(path, "ner_predict.utf8")
    with open(output_file, "w") as f:
        to_write = []
        for block in results:
            for line in block:
                f.write(line)
                f.write('\n')
            f.write('\n')
            # tmp_str1 = str(to_write).replace('u\'', '\'')
            # tmp_str1 = tmp_str1.decode("unicode-escape")
            # f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines
Example #23
def test_ner(results, path):
    """
    ner_results: [batch, ["char true_label pred_label"]
    result_path: save path
    """
    output_file = os.path.join(path, "ner_predict.utf-8")
    with open(output_file, "w") as f:
        to_write = []
        for block in results:
            for line in block:
                to_write.append(line + "\n")
            to_write.append("\n")

        f.writelines(to_write)
    eval_lines = return_report(output_file)

    return eval_lines
Example #24
def test_ner(results, path):
    """
    :param results:
    :param path:
    :return:
    """
    output_file = os.path.join(path, 'ner_predict.utf8')
    with open(output_file, 'w', encoding='UTF-8') as f:
        to_write = []
        for line in results:
            for iner_line in line:
                to_write.append(iner_line + '\n')
            to_write.append('\n')
        f.writelines(to_write)
    eval_lines = return_report(output_file)
    # output_file = os.path.join(path, 'ner_predict.utf8')
    # with open(output_file, 'w', encoding='UTF-8') as f:
    #     to_write = []
    #     for line in results:
    #         for iner_line in line:
    #             to_write.append(iner_line + '\n')
    #         to_write.append('\n')
    #     f.writelines(to_write)
    # golden_lists = []
    # predic_lists = []
    # glod = []
    # pred = []
    # with open(output_file, 'r', encoding='UTF-8') as f:
    #     for line in f:
    #         if len(line) > 1:
    #             line = line.strip().split()
    #             glod.append(line[1])
    #             pred.append(line[2])
    #         else:
    #             golden_lists.append(glod)
    #             predic_lists.append(pred)
    #             glod = []
    #             pred = []
    # accuracy, precision, recall, f_score = get_ner_measure(golden_lists=golden_lists, predict_lists=predic_lists, label_type='BIO')
    # print('acc: {}, precision: {}, recall: {}, f_score: {}'.format(accuracy, precision, recall, f_score))
    # return accuracy, precision, recall, f_score
    return eval_lines
Example #25
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "ner": NerProcessor
    }
    #     if not FLAGS.do_train and not FLAGS.do_eval:
    #         raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)  # load the BERT model configuration

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the files produced by the previous run when training; do not clean when predicting
    if FLAGS.clean and FLAGS.do_train:
        if os.path.exists(FLAGS.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)
        if os.path.exists(FLAGS.data_config_path):
            try:
                os.remove(FLAGS.data_config_path)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)
    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    # print(label_list)
    # exit()
    if not os.path.exists(os.path.join(FLAGS.output_dir, 'label_list.pkl')):
        with open(os.path.join(FLAGS.output_dir, 'label_list.pkl'), 'wb') as fd:
            pickle.dump(label_list, fd)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # TPU configuration
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path) as fd:
            data_config = json.load(fd)
    else:
        data_config = {}
    # print(data_config)  # empty at this point
    # exit()

    if FLAGS.do_train:
        # Load the training data
        if len(data_config) == 0:
            train_examples = processor.get_train_examples(FLAGS.data_dir)
            # number of training steps
            num_train_steps = int(
                len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
            # number of warmup steps
            num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

            data_config['num_train_steps'] = num_train_steps  # number of training steps
            data_config['num_warmup_steps'] = num_warmup_steps  # number of warmup steps
            data_config['num_train_size'] = len(train_examples)  # number of training examples (example objects)
        else:
            num_train_steps = int(data_config['num_train_steps'])
            num_warmup_steps = int(data_config['num_warmup_steps'])
    # print(data_config) #{'num_train_steps': 4890, 'num_warmup_steps': 489, 'num_train_size': 20864}
    # exit()
    # print(bert_config)
    # exit()

    # At this point the data preparation is done.
    # model_fn is a function that defines the model, training and evaluation logic, and uses hook
    # parameters to initialise this model's weights from the pretrained BERT checkpoint.
    # This is TF's Estimator-style architecture: define a model_fn, then let the Estimator API
    # drive training, prediction and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)  # one-hot embeddings are faster on TPU; otherwise a GPU is faster

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # 1. Convert the data into tf_record files
        if data_config.get('train.tf_record_path', '') == '':
            train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
            filed_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)  # write the examples to TFRecord
        else:
            train_file = data_config.get('train.tf_record_path')
        num_train_size = int(data_config['num_train_size'])
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", num_train_size)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        # 2. Read the TFRecord data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        if data_config.get('eval.tf_record_path', '') == '':
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            filed_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
            data_config['eval.tf_record_path'] = eval_file
            data_config['num_eval_size'] = len(eval_examples)
        else:
            eval_file = data_config['eval.tf_record_path']
        # Print information about the dev set
        num_eval_size = data_config.get('num_eval_size', 0)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", num_eval_size)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_steps = None
        if FLAGS.use_tpu:
            eval_steps = int(num_eval_size / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    # Save the data config file so later runs do not have to re-read the training and test sets, which is time-consuming
    if not os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd:
            json.dump(data_config, fd)

    if FLAGS.do_predict:
        token_path = os.path.join(FLAGS.output_dir, "test_token.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length, tokenizer,
                                                 predict_file, mode="test")

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        # predicted_result = estimator.evaluate(input_fn=predict_input_fn)
        # output_eval_file = os.path.join(FLAGS.output_dir, "predicted_results.txt")
        # with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
        #     tf.logging.info("***** Predict results *****")
        #     for key in sorted(predicted_result.keys()):
        #         tf.logging.info("  %s = %s", key, str(predicted_result[key]))
        #         writer.write("%s = %s\n" % (key, str(predicted_result[key])))

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "test_label.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                for id in prediction:
                    if idx > len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        if curr_labels == '[SEP]':
                            break
                        continue
                    # For unknown reasons, an "idx out of range" error can occur here
                    try:
                        line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        # Write the model predictions and the gold labels to a file, separated by spaces, then use the conlleval.py script to compute and print entity-level results
        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)
        from conlleval import return_report
        eval_result = return_report(output_predict_file)
        print(''.join(eval_result))
        with codecs.open(os.path.join(FLAGS.output_dir, 'entity_level_predicted_result.txt'), 'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
Example #26
def iob_iobes(tags):
    "I - 中间字; B - 起始字; E - 结束字; S - 单字; O - 非entity"
    new_tags = []
    for i, tag in enumerate(tags):
        if tag=='O':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'B':
            if (i+1) != len(tags) and tags[i+1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('B-', 'S-'))
        elif tag.split('-')[0] == 'I':
            if (i+1) != len(tags) and tags[i+1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('I-', 'E-'))
        else:
            raise Exception('# >>> Invalid format !!! <<< #')
    return new_tags


def iobes_iob(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        tag_prefix = tag.split('-')[0]
        if tag_prefix == 'B':
            new_tags.append(tag)
        elif tag_prefix == 'I':
            new_tags.append(tag)
        elif tag_prefix == 'S':
            new_tags.append(tag.replace('S-', 'B-'))
        elif tag_prefix == 'E':
            new_tags.append(tag.replace('E-', 'I-'))
        elif tag_prefix == 'O':
            new_tags.append(tag)
        else:
            raise Exception('Invalid format!')
    return new_tags
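# Quick sanity check of the two converters above (illustrative tag sequences, not from the original repo):
#   iob_iobes(['B-PER', 'I-PER', 'O', 'B-LOC'])  -> ['B-PER', 'E-PER', 'O', 'S-LOC']
#   iobes_iob(['B-PER', 'E-PER', 'O', 'S-LOC'])  -> ['B-PER', 'I-PER', 'O', 'B-LOC']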


def load_word2vec(emb_path, id_to_word, word_dim, weights):
    "Load pretrained word embeddings; note that the dimensions must match"
    # 1. read pretrained weights
    print('=> Loading pretrained embeddings from {}...'.format(emb_path))
    pre_trained = {}
    emb_invalid = 0
    for i, line in enumerate(codecs.open(emb_path, 'r', 'utf-8')):
        line = line.rstrip().split()
        if len(line) == word_dim + 1:
            pre_trained[line[0]] = np.array(
                [float(x) for x in line[1:]]).astype(np.float32)
        else:
            emb_invalid += 1
    if emb_invalid > 0:
        print('=> WARNING: %i invalid lines' % emb_invalid)

    # 2. weights assignment
    c_found = 0
    n_words = len(id_to_word)
    for i in range(n_words):
        word = id_to_word[i]
        if word in pre_trained:
            weights[i] = pre_trained[word]
            c_found += 1
        elif word.lower() in pre_trained:
            weights[i] = pre_trained[word.lower()]
            c_found += 1
        elif re.sub(r'\d', '0', word.lower()) in pre_trained:
            weights[i] = pre_trained[re.sub(r'\d', '0', word.lower())]
            c_found += 1
    print('=> Loaded %i pretrained embeddings.' % len(pre_trained))
    print('=> %i / %i words have been initialized with pretrained embeddings.' % (c_found, n_words))
    return weights


def test_ner(results, path):
    """
    Run perl script to evaluate model
    """
    output_file = os.path.join(path, "ner_predict.utf8")
    with open(output_file, "w", encoding='utf8') as f:
        to_write = []
        for res in results:
            for line in res:
                to_write.append(line + "\n")
            to_write.append("\n")
        f.writelines(to_write)
    eval_lines = return_report(output_file)
    return eval_lines


def input_from_line(line, char_to_id):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    line = full_to_half(line)
    line = replace_html(line)
    inputs = list()
    inputs.append([line])
    line.replace(" ", "$")
    inputs.append([[char_to_id[char] if char in char_to_id else char_to_id["<UNK>"]
                   for char in line]])
    inputs.append([get_seg_features(line)])
    inputs.append([[]])
    return inputs


def full_to_half(s):
    """
    Convert full-width character to half-width one
    """
    n = []
    for char in s:
        num = ord(char)
        if num == 0x3000:
            num = 32
        elif 0xFF01 <= num <= 0xFF5E:
            num -= 0xfee0
        char = chr(num)
        n.append(char)
    return ''.join(n)


def replace_html(s):
    s = s.replace('&quot;','"')
    s = s.replace('&amp;','&')
    s = s.replace('&lt;','<')
    s = s.replace('&gt;','>')
    s = s.replace('&nbsp;',' ')
    s = s.replace("&ldquo;", "")
    s = s.replace("&rdquo;", "")
    s = s.replace("&mdash;","")
    s = s.replace("\xa0", " ")
    return(s)
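# Behaviour of the two normalisation helpers above (illustrative values):
#   full_to_half('Ｈｅｌｌｏ，ｗｏｒｌｄ！')    -> 'Hello,world!'   (full-width ASCII folded to half-width)
#   replace_html('AT&amp;T &lt;b&gt;&nbsp;x')  -> 'AT&T <b> x'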


def get_seg_features(string):
    """
    Segment text with jieba.
    Features are represented in BIES format, where S denotes a single-character word.
    Note: if the target NER vocabulary has not been permanently added to jieba's dictionary,
    it must be added manually before use.
    """
    seg_feature = []
    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            seg_feature.extend(tmp)
    return seg_feature
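# Illustrative output (the exact values depend on how jieba segments the input):
# if jieba keeps "北京大学" as a single word, get_seg_features("北京大学") returns [1, 2, 2, 3]
# (0 = single-character word, 1 = word begin, 2 = word middle, 3 = word end)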


def jieba_dict_prepare(dict_path="./source_data/DICT_NOW.csv"):
    "根据语料资源,向jieba中添加自定义的词。在使用jieba进行分词前可使用。"
    import jieba, csv

    # Data notes:
    #     DICT_NOW.csv:
    #           a dictionary built from the text of all annotated terms;
    #           it is produced by simply pairing each entity mention in ***.txt with its entity
    #           category. (In practice, medical entities can also be crawled from medical
    #           websites or medical encyclopedias.)
    dics = csv.reader(open(dict_path, 'r', encoding='utf8'))

    # Use jieba's user dictionary to register these domain-specific terms:
    # each recognition target is added to jieba's vocabulary, with its label treated as the POS tag
    for row in dics:
        if len(row) == 2:
            jieba.add_word(row[0].strip(), tag=row[1].strip())
            # force the added word to be kept as one joined token
            jieba.suggest_freq(row[0].strip())


def result_to_json(string, tags):
    item = {"string": string, "entities": []}
    entity_name = ""
    entity_start = 0
    idx = 0
    for char, tag in zip(string, tags):
        prefix = tag[0]
        if prefix == "S":
            item["entities"].append({"word": char, "start": idx, "end": idx+1, "type":tag[2:]})
        elif prefix == "B":
            entity_name += char
            entity_start = idx
        elif prefix == "I":
            entity_name += char
        elif prefix == "E":
            entity_name += char
            item["entities"].append({"word": entity_name, "start": entity_start, "end": idx + 1, "type": tag[2:]})
            entity_name = ""
        else:
            entity_name = ""
            entity_start = idx
        idx += 1
    return item
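# Illustrative call (tags are in IOBES format and must be as long as the string):
#   result_to_json("张三在北京", ["B-PER", "E-PER", "O", "B-LOC", "E-LOC"])
# returns
#   {"string": "张三在北京",
#    "entities": [{"word": "张三", "start": 0, "end": 2, "type": "PER"},
#                 {"word": "北京", "start": 3, "end": 5, "type": "LOC"}]}
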
Example #27
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "ner": NerProcessor
    }
#     if not FLAGS.do_train and not FLAGS.do_eval:
#         raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the files produced by the previous run when training; do not clean when predicting
    if FLAGS.clean and FLAGS.do_train:
        if os.path.exists(FLAGS.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)
            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)
        if os.path.exists(FLAGS.data_config_path):
            try:
                os.remove(FLAGS.data_config_path)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path) as fd:
            data_config = json.load(fd)
    else:
        data_config = {}

    if FLAGS.do_train:
        # Load the training data
        if len(data_config) == 0:
            train_examples = processor.get_train_examples(FLAGS.data_dir)
            num_train_steps = int(
                len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
            num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

            data_config['num_train_steps'] = num_train_steps
            data_config['num_warmup_steps'] = num_warmup_steps
            data_config['num_train_size'] = len(train_examples)
        else:
            num_train_steps = int(data_config['num_train_steps'])
            num_warmup_steps = int(data_config['num_warmup_steps'])
    # model_fn is a function that defines the model, training and evaluation logic, and uses hook
    # parameters to initialise this model's weights from the pretrained BERT checkpoint.
    # This is TF's Estimator-style architecture: define a model_fn, then let the Estimator API
    # drive training, prediction and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # 1. Convert the data into tf_record files
        if data_config.get('train.tf_record_path', '') == '':
            train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
            filed_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        else:
            train_file = data_config.get('train.tf_record_path')
        num_train_size = int(data_config['num_train_size'])
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", num_train_size)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        # 2. Read the TFRecord data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        if data_config.get('eval.tf_record_path', '') == '':
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            filed_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
            data_config['eval.tf_record_path'] = eval_file
            data_config['num_eval_size'] = len(eval_examples)
        else:
            eval_file = data_config['eval.tf_record_path']
        # Print information about the dev set
        num_eval_size = data_config.get('num_eval_size', 0)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", num_eval_size)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_steps = None
        if FLAGS.use_tpu:
            eval_steps = int(num_eval_size / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    # Save the data config file so later runs do not have to re-read the training and test sets, which is time-consuming
    if not os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd:
            json.dump(data_config, fd)

    if FLAGS.do_predict:
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length, tokenizer,
                                                 predict_file, mode="test")

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        predicted_result = estimator.evaluate(input_fn=predict_input_fn)
        output_eval_file = os.path.join(FLAGS.output_dir, "predicted_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Predict results *****")
            for key in sorted(predicted_result.keys()):
                tf.logging.info("  %s = %s", key, str(predicted_result[key]))
                writer.write("%s = %s\n" % (key, str(predicted_result[key])))

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                for id in prediction:
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    # For unknown reasons, an "idx out of range" error can occur here
                    try:
                        line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)
        from conlleval import return_report
        eval_result = return_report(output_predict_file)
        print(eval_result)
Example #28
def train(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    processors = {"ner": NerProcessor}
    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the files produced by the previous run when re-training; do not clean when predicting
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    # check output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    processor = processors[args.ner](args.output_dir)
    logger.info(args.data_dir)

    # Load the vocabulary
    tokenizer = FullTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,
        save_summary_steps=500,  # hard-coded here, so the value defined earlier has no effect
        save_checkpoints_steps=500,
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # Load the training data
        train_examples = processor.get_train_examples(args.data_dir)
        logger.info(len(train_examples))
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size *
            args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is so small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        # Load the dev (evaluation) data
        eval_examples = processor.get_dev_examples(args.data_dir)

        # Print information about the dev set
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.batch_size)

    # labels = ["B_at", "I_at", "B_ot", "I_ot", "O"]
    # label_list = processor.get_labels(labels)
    label_list = processor.get_labels()
    # model_fn is a function that defines the model, training and evaluation logic, and uses hook
    # parameters to initialise this model's weights from the pretrained BERT checkpoint.
    # This is TF's Estimator-style architecture: define a model_fn, then let the Estimator API
    # drive training, prediction and evaluation.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}

    # How to set dropout differently for different scenarios???? How to implement it?
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    if args.do_train and args.do_eval:
        # 1. 将数据转化为tf_record 数据
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, train_file,
                                                     args.output_dir)
        # 2. Read the training TFRecord data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        # TFRecord file for eval
        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, eval_file,
                                                     args.output_dir)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together
        # early stopping hook
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',  # stop early when the loss stops improving; why not use the dev loss???
            max_steps_without_decrease=num_train_steps,  # the maximum is set here?????
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])

        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)

        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 args.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 args.output_dir,
                                                 mode="test")

        logger.info("***** Running prediction*****")
        logger.info("  Num examples = %d", len(predict_examples))
        logger.info("  Batch size = %d", args.batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        # No Viterbi decoding is done here; how to obtain the sequence-labelling result????
        result = estimator.predict(input_fn=predict_input_fn)
        logger.info(result)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]', 'X']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)

        eval_result = return_report(output_predict_file)
        print(''.join(eval_result))

        # Write the results to a file
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'),
                         'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))

    # filter model
    if args.filter_adam_var:
        adam_filter(args.output_dir)
Example #29
def main():
    ''' Prepare and check files '''

    # Check the correctness of the checkpoint configuration
    tokenization.validate_case_matches_checkpoint(arg_dic['do_lower_case'],
                                                  arg_dic['init_checkpoint'])

    if not arg_dic['do_train'] and not arg_dic['do_eval'] and not arg_dic[
            'do_predict']:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    # Load the BERT configuration
    bert_config = modeling.BertConfig.from_json_file(
        arg_dic['bert_config_file'])

    if arg_dic['max_seq_length'] > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (arg_dic['max_seq_length'], bert_config.max_position_embeddings))
    ''' Estimator Config '''

    processors = {"ner": SelfProcessor}

    processor = processors[arg_dic["ner"]]()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=arg_dic["vocab_file"],
        do_lower_case=arg_dic["do_lower_case"])
    '''
        tf.Session execution options:

            log_device_placement: log which device each TensorFlow operation runs on
            inter_op_parallelism_threads: number of threads used to run independent operations in
                parallel (e.g. c = a + b and d = e + f can run concurrently); 0 means use the
                optimal number of threads
            intra_op_parallelism_threads: number of threads used inside a single operation, such as
                a matrix multiplication; 0 means use the optimal number of threads
            allow_soft_placement: automatically fall back to an available GPU or CPU when the
                requested device is not available
    '''
    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    '''
        Estimator RunConfig:

            model_dir: directory where model parameters, the graph, etc. are stored
            save_summary_steps: save a summary every this many steps
            save_checkpoints_steps: save a checkpoint every this many steps
    '''

    run_config = tf.estimator.RunConfig(
        model_dir=arg_dic["ckpt_dir"],
        save_summary_steps=arg_dic["save_summary_steps"],
        save_checkpoints_steps=arg_dic["save_checkpoints_steps"],
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None
    ''' Load Data and Model about train and eval '''
    if arg_dic["do_train"] and arg_dic["do_eval"]:

        # train
        train_examples = processor.get_train_examples(arg_dic["data_dir"])

        num_train_steps = int(
            len(train_examples) * 1.0 / arg_dic["train_batch_size"] *
            arg_dic["num_train_epochs"])

        if num_train_steps < 1:
            raise AttributeError('training data is so small...')

        num_warmup_steps = int(num_train_steps * arg_dic["warmup_proportion"])

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", arg_dic["train_batch_size"])
        logger.info("  Num steps = %d", num_train_steps)

        # eval
        eval_examples = processor.get_dev_examples(arg_dic["data_dir"])

        # Print information about the dev set
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", arg_dic["train_batch_size"])

    label_list = processor.get_labels(arg_dic["data_dir"] + "label.txt")
    ''' Model of Estimator'''
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=arg_dic["init_checkpoint"],
                                learning_rate=arg_dic["learning_rate"],
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps)

    params = {'batch_size': arg_dic["train_batch_size"]}

    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)
    ''' Train and evaluate the Estimator '''
    if arg_dic["do_train"] and arg_dic["do_eval"]:
        '''data input_fn'''
        # 1. Convert the training examples into a TFRecord file
        train_file = os.path.join(arg_dic["tfrecord_dir"], "train.tf_record")

        # Generate the train TFRecord file only if it does not exist yet
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples,
                                                     label_list,
                                                     arg_dic["max_seq_length"],
                                                     tokenizer, train_file,
                                                     arg_dic["tfrecord_dir"])

        # 2. Read the TFRecord data and batch it
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=arg_dic["max_seq_length"],
            is_training=True,
            drop_remainder=True)

        # 1. Convert the dev examples into a TFRecord file
        eval_file = os.path.join(arg_dic["tfrecord_dir"], "eval.tf_record")

        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     arg_dic["max_seq_length"],
                                                     tokenizer, eval_file,
                                                     arg_dic["tfrecord_dir"])

        # 2. Read the eval TFRecord data and batch it
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=arg_dic["max_seq_length"],
            is_training=False,
            drop_remainder=False)
        '''estimator train'''
        '''
            max_steps_without_decrease: stop training if the monitored metric has not
                decreased within this many steps.
            eval_dir: directory holding the evaluation summary files; defaults to estimator.eval_dir().
            run_every_secs: how often (in seconds) should_stop_fn is invoked.
        '''

        early_stopping_hook = tf.estimator.experimental.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=arg_dic["save_checkpoints_steps"])

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        '''
            throttle_secs: minimum number of seconds before the next evaluation; if no new
                checkpoint has been produced, no evaluation runs, so this is a lower bound.
        '''
        eval_spec = tf.estimator.EvalSpec(
            input_fn=eval_input_fn, throttle_secs=arg_dic["eval_model_steps"])

        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # Prediction
    if arg_dic["do_predict"]:

        token_path = os.path.join(arg_dic["output_dir"], "token_test.txt")

        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(arg_dic["tfrecord_dir"], 'label2id.pkl'),
                         'rb') as rf:

            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        # Load the test data
        predict_examples = processor.get_test_examples(arg_dic["data_dir"])

        predict_file = os.path.join(arg_dic["output_dir"], "predict.tf_record")

        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 arg_dic["max_seq_length"],
                                                 tokenizer,
                                                 predict_file,
                                                 arg_dic["output_dir"],
                                                 mode="test")

        logger.info("***** Running prediction*****")
        logger.info("  Num examples = %d", len(predict_examples))
        logger.info("  Batch size = %d", arg_dic["train_batch_size"])

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=arg_dic["max_seq_length"],
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(
            input_fn=predict_input_fn,
            checkpoint_path=".\output\ckpt\model.ckpt-30")

        output_predict_file = os.path.join(arg_dic["output_dir"],
                                           "label_test.txt")

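        # Pair each token with its gold label and predicted label, one
        # "token gold_label pred_label" line per token, in CoNLL format.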
        def result_to_pair(writer):
            print("********")
            print(predict_examples)
            for predict_line, prediction in zip(predict_examples, result):

                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')

                len_seq = len(label_token)

                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        # Write the prediction results to a file
        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)

        import conlleval

        # Evaluate the predictions with conlleval
        eval_result = conlleval.return_report(output_predict_file)

        print(''.join(eval_result))

        # Append the evaluation report to a score file
        with codecs.open(os.path.join(arg_dic["output_dir"],
                                      'predict_score.txt'),
                         'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
Example #30
0
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError("Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    is_train()

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2  # == 3
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=FLAGS.iterations_per_loop,num_shards=FLAGS.num_tpu_cores,per_host_input_for_training=is_per_host))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path) as fd:
            data_config = json.load(fd)
    else:
        data_config = {}

    if FLAGS.do_train:
        # Load the training data
        if len(data_config) == 0:
            train_examples = processor.get_train_examples(FLAGS.data_dir)
            num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
            num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

            data_config['num_train_steps'] = num_train_steps
            data_config['num_warmup_steps'] = num_warmup_steps
            data_config['num_train_size'] = len(train_examples)
        else:
            num_train_steps = int(data_config['num_train_steps'])
            num_warmup_steps = int(data_config['num_warmup_steps'])
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        if data_config.get('train.tf_record_path', '') == '':
            train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
            filed_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        else:
            train_file = data_config.get('train.tf_record_path')
        num_train_size = int(data_config['num_train_size'])
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", num_train_size)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = file_based_input_fn_builder(input_file=train_file,seq_length=FLAGS.max_seq_length,is_training=True,drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        if data_config.get('eval.tf_record_path', '') == '':
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            filed_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)  # write the examples into a TFRecord file
            data_config['eval.tf_record_path'] = eval_file
            data_config['num_eval_size'] = len(eval_examples)
        else:
            eval_file = data_config['eval.tf_record_path']
        num_eval_size = data_config.get('num_eval_size', 0)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", num_eval_size)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_steps = None
        if FLAGS.use_tpu:
            eval_steps = int(num_eval_size / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(input_file=eval_file,seq_length=FLAGS.max_seq_length,is_training=False,drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if not os.path.exists(FLAGS.data_config_path):
        with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd:
            json.dump(data_config, fd)

    if FLAGS.do_predict:
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,FLAGS.max_seq_length, tokenizer,predict_file, mode="test")

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(input_file=predict_file,seq_length=FLAGS.max_seq_length,is_training=False,drop_remainder=predict_drop_remainder)


        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")

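        # Pair each token with its gold label and predicted label, one
        # "token gold_label pred_label" line per token, in CoNLL format.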
        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                print(prediction)
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                for id in prediction:
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)
        from conlleval import return_report
        eval_result = return_report(output_predict_file)
        print(''.join(eval_result))
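Example #31
0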
def predict(args, processor, tokenizer, bert_config, sess_config, label_list):
    """
    Prediction function: run the restored model over the test/train/dev sets.
    """
    # Build the example sets for the three splits (test/train/dev)
    predict_examples = processor.get_test_examples(args.data_dir)
    predict_file = os.path.join(args.output_dir, "predict.tf_record")
    filed_based_convert_examples_to_features(predict_examples,
                                             label_list,
                                             args.max_seq_length,
                                             tokenizer,
                                             predict_file,
                                             args.output_dir,
                                             mode="test")
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    train_file = os.path.join(args.output_dir, "train.tf_record")
    eval_file = os.path.join(args.output_dir, "eval.tf_record")
    # Build the tf.data datasets
    train_data = file_based_dataset(input_file=train_file,
                                    batch_size=args.batch_size,
                                    seq_length=args.max_seq_length,
                                    is_training=False,
                                    drop_remainder=False)
    eval_data = file_based_dataset(input_file=eval_file,
                                   batch_size=args.batch_size,
                                   seq_length=args.max_seq_length,
                                   is_training=False,
                                   drop_remainder=False)
    predict_data = file_based_dataset(input_file=predict_file,
                                      batch_size=args.batch_size,
                                      seq_length=args.max_seq_length,
                                      is_training=False,
                                      drop_remainder=False)
    train_iter = train_data.make_one_shot_iterator().get_next()
    eval_iter = eval_data.make_one_shot_iterator().get_next()
    predict_iter = predict_data.make_one_shot_iterator().get_next()

    # Open the session and build the graph
    with tf.Session(config=sess_config) as sess:
        # (Alternative) read the computation graph from the checkpoint meta file
        save_dir = os.path.join(args.output_dir, 'model')
        # saver = tf.train.import_meta_graph(
        #     tf.train.latest_checkpoint(save_dir) + ".meta")
        # sess.run(tf.global_variables_initializer())
        # Print tensor names
        # tensor_list = [
        #     n.name for n in tf.get_default_graph().as_graph_def().node if 'older' in n.name]
        # print(tensor_list)
        # Build the model
        input_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                   dtype=tf.int32,
                                   name='input_ids')
        input_mask = tf.placeholder(shape=[None, args.max_seq_length],
                                    dtype=tf.int32,
                                    name='input_mask')
        segment_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                     dtype=tf.int32,
                                     name='segment_ids')
        label_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                   dtype=tf.int32,
                                   name='label_ids')
        is_training = tf.get_variable("is_training",
                                      shape=[],
                                      dtype=tf.bool,
                                      trainable=False)

        total_loss, logits, trans, pred_ids = create_model(
            bert_config,
            is_training, input_ids, input_mask, segment_ids, label_ids,
            len(label_list), False, args.dropout_rate, args.lstm_size,
            args.cell, args.num_layers)

        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(save_dir))
        # (Alternative) fetch the model placeholders and parameters by tensor name
        # input_ids = tf.get_default_graph().get_tensor_by_name('input_ids:0')
        # input_mask = tf.get_default_graph().get_tensor_by_name('input_mask:0')
        # segment_ids = tf.get_default_graph().get_tensor_by_name('segment_ids:0')
        # label_ids = tf.get_default_graph().get_tensor_by_name('label_ids:0')
        # sess.run(tf.assign(tf.get_default_graph().get_tensor_by_name(
        #     'is_training:0'), tf.constant(False, dtype=tf.bool)))
        # # Find the CRF output; its tensor name comes from the crf_decode source and can be looked up in the graph
        # pred_ids = tf.get_default_graph().get_tensor_by_name('ReverseSequence_1:0')

        sess.run(tf.assign(is_training, tf.constant(False, dtype=tf.bool)))

        # Predictions on the test set
        predict_total = np.array([[0] * 150], dtype=np.int32)
        for _ in range(0, int(len(predict_examples) / args.batch_size) + 1):
            # predict feed
            predict_batch = sess.run(predict_iter)
            predict_res = sess.run(pred_ids,
                                   feed_dict={
                                       input_ids: predict_batch['input_ids'],
                                       input_mask: predict_batch['input_mask'],
                                       segment_ids:
                                       predict_batch['segment_ids'],
                                       label_ids: predict_batch['label_ids']
                                   })
            predict_total = np.concatenate((predict_total, predict_res),
                                           axis=0)
        # Post-process the predictions and compute recall / F1
        predict_total = predict_total[1:]
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")
        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(args, writer, predict_examples, predict_total)

        # Predictions on the training set
        train_total = np.array([[0] * 150], dtype=np.int32)
        for _ in range(0, int(len(train_examples) / args.batch_size) + 1):
            # predict feed
            train_batch = sess.run(train_iter)
            train_res = sess.run(pred_ids,
                                 feed_dict={
                                     input_ids: train_batch['input_ids'],
                                     input_mask: train_batch['input_mask'],
                                     segment_ids: train_batch['segment_ids'],
                                     label_ids: train_batch['label_ids']
                                 })
            train_total = np.concatenate((train_total, train_res), axis=0)
        # Post-process the predictions and compute recall / F1
        train_total = train_total[1:]
        output_train_file = os.path.join(args.output_dir, "label_train.txt")
        with codecs.open(output_train_file, 'w', encoding='utf-8') as writer:
            result_to_pair(args, writer, train_examples, train_total)
        train_score, _ = conlleval.return_report(output_train_file)
        print(''.join(train_score))

        # Predictions on the dev set
        eval_total = np.array([[0] * 150], dtype=np.int32)
        for _ in range(0, int(len(eval_examples) / args.batch_size) + 1):
            # predict feed
            eval_batch = sess.run(eval_iter)
            eval_res = sess.run(pred_ids,
                                feed_dict={
                                    input_ids: eval_batch['input_ids'],
                                    input_mask: eval_batch['input_mask'],
                                    segment_ids: eval_batch['segment_ids'],
                                    label_ids: eval_batch['label_ids']
                                })
            eval_total = np.concatenate((eval_total, eval_res), axis=0)
        # Post-process the predictions and compute recall / F1
        eval_total = eval_total[1:]
        output_eval_file = os.path.join(args.output_dir, "label_dev.txt")
        with codecs.open(output_eval_file, 'w', encoding='utf-8') as writer:
            result_to_pair(args, writer, eval_examples, eval_total)
        eval_score, _ = conlleval.return_report(output_eval_file)
        print(''.join(eval_score))
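Example #32
0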
def train_and_eval(args, processor, tokenizer, bert_config, sess_config,
                   label_list):
    """
    Training and evaluation loop.
    """

    # Generate the TFRecord files
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs)
    if num_train_steps < 1:
        raise AttributeError('training data is too small...')
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", args.batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d", len(eval_examples))
    tf.logging.info("  Batch size = %d", args.batch_size)

    # Write the TFRecord files
    train_file = os.path.join(args.output_dir, "train.tf_record")
    if not os.path.exists(train_file):
        filed_based_convert_examples_to_features(train_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer, train_file,
                                                 args.output_dir)
    eval_file = os.path.join(args.output_dir, "eval.tf_record")
    if not os.path.exists(eval_file):
        filed_based_convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer, eval_file,
                                                 args.output_dir)
    """
    -------------分割线-------------
    """
    # Output paths for logs and checkpoints
    log_dir = os.path.join(args.output_dir, 'log')
    save_dir = os.path.join(args.output_dir, 'model')
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # # Load the data
    # train_file = os.path.join(args.output_dir, "train.tf_record")
    # eval_file = os.path.join(args.output_dir, "eval.tf_record")
    # if not os.path.exists(train_file) or not os.path.exists(eval_file):
    #     raise ValueError
    # Build the tf.data datasets
    train_data = file_based_dataset(input_file=train_file,
                                    batch_size=args.batch_size,
                                    seq_length=args.max_seq_length,
                                    is_training=True,
                                    drop_remainder=False)
    eval_data = file_based_dataset(input_file=eval_file,
                                   batch_size=args.batch_size,
                                   seq_length=args.max_seq_length,
                                   is_training=False,
                                   drop_remainder=False)
    train_iter = train_data.make_one_shot_iterator().get_next()

    # Open the session and build the graph
    with tf.Session(config=sess_config) as sess:
        # Build the model
        input_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                   dtype=tf.int32,
                                   name='input_ids')
        input_mask = tf.placeholder(shape=[None, args.max_seq_length],
                                    dtype=tf.int32,
                                    name='input_mask')
        segment_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                     dtype=tf.int32,
                                     name='segment_ids')
        label_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                   dtype=tf.int32,
                                   name='label_ids')
        is_training = tf.get_variable("is_training",
                                      shape=[],
                                      dtype=tf.bool,
                                      trainable=False)

        total_loss, logits, trans, pred_ids = create_model(
            bert_config,
            is_training, input_ids, input_mask, segment_ids, label_ids,
            len(label_list), False, args.dropout_rate, args.lstm_size,
            args.cell, args.num_layers)

        # Optimizer
        train_op = optimization.create_optimizer(total_loss,
                                                 args.learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, False)
        sess.run(tf.global_variables_initializer())

        # Initialize from the pretrained BERT checkpoint
        tvars = tf.trainable_variables()
        if args.init_checkpoint:
            (assignment_map, initialized_variable_names) = \
                modeling.get_assignment_map_from_checkpoint(
                    tvars, args.init_checkpoint)
            tf.train.init_from_checkpoint(args.init_checkpoint, assignment_map)

        # Log which variables were initialized from the checkpoint
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        # Initialize the summary writer and saver
        writer = tf.summary.FileWriter(log_dir, sess.graph)
        saver = tf.train.Saver()

        # State for early stopping
        best_eval_loss = 1000000.0
        patience = 0

        # Start training
        sess.run(tf.assign(is_training, tf.constant(True, dtype=tf.bool)))
        for go in range(1, num_train_steps + 1):
            # feed
            train_batch = sess.run(train_iter)
            loss, preds, op = sess.run(
                [total_loss, pred_ids, train_op],
                feed_dict={
                    input_ids: train_batch['input_ids'],
                    input_mask: train_batch['input_mask'],
                    segment_ids: train_batch['segment_ids'],
                    label_ids: train_batch['label_ids']
                })

            if go % args.save_summary_steps == 0:
                # Training summaries
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="loss/train_loss",
                                         simple_value=loss / args.batch_size),
                    ]), sess.run(tf.train.get_global_step()))
                writer.flush()

            if go % args.save_checkpoints_steps == 0:
                # Evaluate on the dev set
                sess.run(
                    tf.assign(is_training, tf.constant(False, dtype=tf.bool)))
                eval_loss_total = 0.0
                eval_preds_total = np.array([[0] * 150], dtype=np.int32)
                eval_truth_total = np.array([[0] * 150], dtype=np.int32)
                # Re-create the dev set iterator
                eval_data = eval_data.repeat()
                eval_iter = eval_data.make_one_shot_iterator().get_next()
                for _ in range(0,
                               int(len(eval_examples) / args.batch_size) + 1):
                    # eval feed
                    eval_batch = sess.run(eval_iter)
                    eval_loss, eval_preds, eval_truth = sess.run(
                        [total_loss, pred_ids, label_ids],
                        feed_dict={
                            input_ids: eval_batch['input_ids'],
                            input_mask: eval_batch['input_mask'],
                            segment_ids: eval_batch['segment_ids'],
                            label_ids: eval_batch['label_ids']
                        })
                    # Accumulate results
                    eval_loss_total += eval_loss
                    eval_preds_total = np.concatenate(
                        (eval_preds_total, eval_preds), axis=0)
                    eval_truth_total = np.concatenate(
                        (eval_truth_total, eval_truth), axis=0)

                # Post-process the predictions and compute recall / F1
                eval_preds_total = eval_preds_total[1:]
                eval_truth_total = eval_truth_total[1:]
                eval_f1 = metrics.f1_score(eval_truth_total.reshape(-1),
                                           eval_preds_total.reshape(-1),
                                           average='macro')
                eval_recall = metrics.recall_score(
                    eval_truth_total.reshape(-1),
                    eval_preds_total.reshape(-1),
                    average='macro')
                eval_acc = metrics.accuracy_score(eval_truth_total.reshape(-1),
                                                  eval_preds_total.reshape(-1))
                eval_loss_aver = eval_loss_total / len(eval_examples)

                # Evaluate the NER metrics with conlleval
                output_eval_file = os.path.join(args.output_dir,
                                                "label_eval.txt")
                with codecs.open(output_eval_file, 'w',
                                 encoding='utf-8') as writer_1:
                    result_to_pair(args, writer_1, eval_examples,
                                   eval_preds_total)
                eval_score, over_all = conlleval.return_report(
                    output_eval_file)
                print(''.join(eval_score))

                # Evaluation summaries
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="loss/eval_loss",
                                         simple_value=eval_loss_aver),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="eval/f1", simple_value=eval_f1),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="eval/recall",
                                         simple_value=eval_recall),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="eval/acc",
                                         simple_value=eval_acc),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="ner/f1",
                                         simple_value=over_all.fscore),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="ner/recall",
                                         simple_value=over_all.rec),
                    ]), sess.run(tf.train.get_global_step()))
                writer.flush()

                # Early stopping and checkpointing
                if eval_loss_aver >= best_eval_loss:
                    patience += 1
                    if patience >= 5:
                        print("early stoping!")
                        return

                if eval_loss_aver < best_eval_loss:
                    patience = 0
                    best_eval_loss = eval_loss_aver
                    saver.save(
                        sess,
                        os.path.join(
                            save_dir, "model_{}_loss_{:.4f}.ckpt".format(
                                sess.run(tf.train.get_global_step()),
                                best_eval_loss)))

                sess.run(
                    tf.assign(is_training, tf.constant(False, dtype=tf.bool)))
Example #33
0
# Each prediction block consists of three tab-separated lines
# (tokens, gold labels, predicted labels) followed by a blank line.
with codecs.open(output_predict_file, 'r', encoding='utf-8') as f:
    counter = 0
    line_1 = []
    line_2 = []
    line_3 = []
    lines = ''
    for line in f:
        if line.strip():
            content = line.strip()
            tokens = content.split('\t')
            if counter == 0:      # token line
                line_1 = tokens
                counter += 1
            elif counter == 1:    # gold label line
                line_2 = tokens
                counter += 1
            elif counter == 2:    # predicted label line
                line_3 = tokens
        else:
            # Blank line: emit "token gold pred" rows, skipping special
            # tokens and padding labels, then start the next block.
            for a, b, c in zip(line_1, line_2, line_3):
                if a not in ["[PAD]", "[CLS]", "[SEP]"] \
                        and b not in ["X", "APAD"]:
                    lines += a + " " + b + " " + c + '\n'
            counter = 0

with codecs.open(output_predict_file_processed, 'w',
                 encoding='utf-8') as writer:
    writer.write(lines + '\n')

eval_result = conlleval.return_report(output_predict_file_processed)
print(''.join(eval_result))