Example #1
        bert_config=bert_config,
        is_training=False,
        input_ids=input_ids_p,
        input_mask=input_mask_p,
        segment_ids=None,
        labels=None,
        num_labels=num_labels,
        use_one_hot_embeddings=False,
        dropout_rate=1.0,
    )

    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(model_dir))

tokenizer = tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_dir, "vocab.txt"),
    do_lower_case=args.do_lower_case)

ckpt = tf.train.get_checkpoint_state(model_dir)
ckpt_path = ckpt.model_checkpoint_path


def read_model_param_and_value():
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
    param_dict = reader.get_variable_to_shape_map()

    for key, val in param_dict.items():
        try:
            # if "crf_loss" in key or "project" in key:
            # print(key)  # , reader.get_tensor(key))
            if "bert/encoder/Reshape_13" in key:
Example #2
def zjb_eval(output_dir, data_dir, max_seq_length, vocab_file, batch_size,
             learning_rate, init_checkpoint, bert_config_file,
             num_train_epochs):
    tf.logging.set_verbosity(tf.logging.INFO)
    from bert_base.train.train_helper import get_args_parser
    args = get_args_parser()
    token_path = os.path.join(output_dir, "token_test.txt")
    if os.path.exists(token_path):
        os.remove(token_path)

    with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.do_lower_case)
    processors = {"ner": NerProcessor}
    processor = processors[args.ner](output_dir)
    label_list = processor.get_labels()
    predict_examples = processor.get_test_examples(data_dir)
    predict_file = os.path.join(output_dir, "predict.tf_record")
    filed_based_convert_examples_to_features(predict_examples,
                                             label_list,
                                             max_seq_length,
                                             tokenizer,
                                             predict_file,
                                             output_dir,
                                             mode="test")

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d", len(predict_examples))
    tf.logging.info("  Batch size = %d", batch_size)

    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    # session_config.gpu_options.per_process_gpu_memory_fraction = 0.8  # use 80% of GPU memory

    run_config = tf.estimator.RunConfig(model_dir=output_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    train_examples = processor.get_train_examples(data_dir)
    num_train_steps = int(
        len(train_examples) * 1.0 / batch_size * num_train_epochs)
    if num_train_steps < 1:
        raise AttributeError('training data is too small...')
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=init_checkpoint,
                                learning_rate=learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}

    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    result = estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(output_dir, "label_test.txt")

    def result_to_pair(writer):
        for predict_line, prediction in zip(predict_examples, result):
            idx = 0
            line = ''
            line_token = str(predict_line.text).split(' ')
            label_token = str(predict_line.label).split(' ')
            len_seq = len(label_token)
            if len(line_token) != len(label_token):
                tf.logging.info(predict_line.text)
                tf.logging.info(predict_line.label)
                break
            for id in prediction:
                if idx >= len_seq:
                    break
                if id == 0:
                    continue
                curr_labels = id2label[id]
                if curr_labels in ['[CLS]', '[SEP]']:
                    continue
                try:
                    line += line_token[idx] + ' ' + label_token[
                        idx] + ' ' + curr_labels + '\n'
                except Exception as e:
                    tf.logging.info(e)
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                    line = ''
                    break
                idx += 1
            writer.write(line + '\n')

    with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
        result_to_pair(writer)
    from bert_base.train import conlleval
    eval_result = conlleval.return_report(output_predict_file)
    print(''.join(eval_result))
    # Write the results to a file
    with codecs.open(os.path.join(output_dir, 'predict_score.txt'),
                     'a',
                     encoding='utf-8') as fd:
        fd.write(''.join(eval_result))
    # filter Adam variables out of the saved model
    if args.filter_adam_var:
        adam_filter(output_dir)
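Example #3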
def train(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Delete the previous run's outputs only when retraining; never clean when predicting
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files in the output dir and data.conf')
                exit(-1)

    # check that the output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    processor = processors[args.ner](args.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(model_dir=args.output_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # Load the training data
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size *
            args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        eval_examples = processor.get_dev_examples(args.data_dir)

        # Log the dev set info
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.batch_size)

    label_list = processor.get_labels()
    # The returned model_fn is a function that defines the model plus its training
    # and evaluation logic; via an initialization hook it loads the BERT checkpoint
    # to initialize this model's own parameters.
    # This is TF's newer Estimator architecture: define the model in model_fn, then
    # let the Estimator API drive training, prediction and evaluation.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}

    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    if args.do_train and args.do_eval:
        # 1. Convert the examples to tf_record files
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, train_file,
                                                     args.output_dir)

        # 2. Read the record data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, eval_file,
                                                     args.output_dir)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together
        # early stop hook
        early_stopping_hook = tf.estimator.experimental.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 args.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 args.output_dir,
                                                 mode="test")

        logger.info("***** Running prediction*****")
        logger.info("  Num examples = %d", len(predict_examples))
        logger.info("  Batch size = %d", args.batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)
        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        print(''.join(eval_result))
        # Write the results to a file
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'),
                         'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
    # filter Adam variables out of the saved model
    if args.filter_adam_var:
        adam_filter(args.output_dir)
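Example #4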
def train(args):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "RASA": RasaProcessor,
    }

    tokenization.validate_case_matches_checkpoint(args.do_lower_case,
                                                  args.init_checkpoint)

    if not args.do_train and not args.do_eval and not args.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(args.output_dir)

    processor = processors[args.ner]()

    label_list = processor.get_labels(args.data_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    tpu_cluster_resolver = None
    if args.use_tpu and args.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            args.tpu_name, zone=args.tpu_zone, project=args.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=args.master,
        model_dir=args.output_dir,
        save_checkpoints_steps=args.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=args.iterations_per_loop,
            num_shards=args.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.batch_size * args.num_train_epochs)
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=args.use_tpu,
                                use_one_hot_embeddings=args.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(use_tpu=args.use_tpu,
                                            model_fn=model_fn,
                                            config=run_config,
                                            train_batch_size=args.batch_size,
                                            eval_batch_size=args.batch_size,
                                            predict_batch_size=args.batch_size)

    if args.do_train:
        train_file = os.path.join(args.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                args.max_seq_length, tokenizer,
                                                train_file, args.output_dir)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if args.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % args.batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                args.max_seq_length, tokenizer,
                                                eval_file, args.output_dir)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", args.batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if args.use_tpu:
            assert len(eval_examples) % args.batch_size == 0
            eval_steps = int(len(eval_examples) // args.batch_size)

        eval_drop_remainder = True if args.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_predict:
        predict_examples = processor.get_test_examples(args.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if args.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % args.batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                args.max_seq_length, tokenizer,
                                                predict_file, args.output_dir)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", args.batch_size)

        predict_drop_remainder = True if args.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        print("result: {}".format(result))

        output_predict_file = os.path.join(args.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                #print("prediction: {}".format(prediction))
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
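
The while-loops in Example #4 pad the eval and predict sets up to a multiple of the batch size because TPUs require fixed-size batches. A small helper capturing that pattern (a sketch; PaddingInputExample is the sentinel class from the BERT codebase):

def pad_to_batch_multiple(examples, batch_size):
    # TPU batches must all be the same size, so pad with fake examples;
    # the padded items later receive a per-instance metric weight of 0.0.
    padded = list(examples)
    while len(padded) % batch_size != 0:
        padded.append(PaddingInputExample())
    return padded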
Example #5
def train(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "ner": NerProcessor
    }
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Delete the previous run's outputs only when retraining; never clean when predicting
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files in the output dir and data.conf')
                exit(-1)

    # check that the output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    if not os.path.exists(os.path.join(args.output_dir, 'eval')):
        os.mkdir(os.path.join(args.output_dir, 'eval'))

    processor = processors[args.ner](args.output_dir)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,  # model save path
        keep_checkpoint_max=10,  # maximum number of checkpoints to keep
        save_summary_steps=args.save_summary_steps,  # save a summary every N steps
        save_checkpoints_steps=args.save_checkpoints_steps,  # save a checkpoint every N steps
        session_config=session_config
    )

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # Load the training data
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d" %len(train_examples))
        logger.info("  Batch size = %d" %args.batch_size)
        logger.info("  Num steps = %d" %num_train_steps)

        eval_examples = processor.get_dev_examples(args.data_dir)

        # Log the dev set info
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d" % len(eval_examples))
        logger.info("  Batch size = %d" % args.batch_size)
    if not os.path.exists(os.path.join(args.output_dir, 'label_list.pkl')):
        label_list = processor.get_labels(labels=args.label_list)
    else:
        with open(os.path.join(args.output_dir, 'label_list.pkl'), 'rb') as p:
            label_list = pickle.load(p)
    # The returned model_fn is a function that defines the model plus its training
    # and evaluation logic; via an initialization hook it loads the BERT checkpoint
    # to initialize this model's own parameters.
    # This is TF's newer Estimator architecture: define the model in model_fn, then
    # let the Estimator API drive training, prediction and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,  # the label list has no pad label (id 0), so add 1
        init_checkpoint=args.init_checkpoint,
        learning_rate=args.learning_rate,
        num_train_steps=num_train_steps,
        # warmup steps: while global_step < num_warmup_steps, the learning rate is
        # global_step / num_warmup_steps * init_learning_rate
        num_warmup_steps=num_warmup_steps,
        args=args)

    params = {
        'batch_size': args.batch_size
    }

    estimator = tf.estimator.Estimator(
        model_fn,  # the model_fn covers three modes: train, eval and predict
        # the model dir can be set here or in the RunConfig; if both are set they must match
        model_dir=args.output_dir,
        params=params,
        config=run_config)

    if args.do_train and args.do_eval:
        # ckpt_file = tf.train.latest_checkpoint(args.output_dir)
        # print('loading checkpoint {} for training'.format(ckpt_file))
        # 1. Convert the examples to tf_record files

        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, train_file, args.output_dir)

        # 2. Read the record data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, eval_file, args.output_dir)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together
        # early stop hook
        early_stopping_hook = tf.contrib.estimator.stop_if_no_increase_hook(
            estimator=estimator,
            metric_name='f1',
            max_steps_without_increase=args.max_steps_without_decrease,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        # steps: number of eval batches; None evaluates the whole dataset. Evaluation
        # runs only after a checkpoint save, and at least throttle_secs (120 s) apart.
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=None, throttle_secs=120)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
            print('id2label:{}'.format(id2label))

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 args.max_seq_length, tokenizer,
                                                 predict_file, args.output_dir, mode="test")

        logger.info("***** Running prediction*****")
        logger.info("  Num examples = %d" %len(predict_examples))
        logger.info("  Batch size = %d"%args.batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq + 2:  # skip predictions for padding
                        break
                    if idx == 0:  # skip the [CLS] prediction
                        idx += 1
                        continue
                    if idx == len_seq + 1:  # skip the [SEP] prediction
                        idx += 1
                        continue
                    curr_labels = id2label[id]
                    # any label predicted as [CLS] or [SEP] is forced to O
                    if curr_labels in ['[CLS]', '[SEP]']:
                        # if idx == 1:
                        #     if id2label[prediction[idx+1]][0] in ['B', 'O']:
                        curr_labels = 'O'
                        #     else:
                        #         curr_labels = 'B' + id2label[prediction[idx+1]][1:]
                        # else:
                        #     if id2label[prediction[idx-1]] == 'O':
                        #         curr_labels = 'O'
                        #     else:
                        #         curr_labels = 'I' + id2label[prediction[idx-1]][1:]
                    try:
                        line += line_token[idx - 1] + ' ' + label_token[idx - 1] + ' ' + iobes_iob([curr_labels])[0] + '\n'
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)
        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        print(''.join(eval_result))
        # The block below is specific to the daGuan competition; comment it out otherwise.
        tmp_file = open('dg_result.txt', 'w', encoding='utf-8')
        with open(output_predict_file) as f:
            lines = f.readlines()
            for line in lines:
                if line == '\n':
                    tmp_file.write('\n')
                    continue
                lis_line = line.strip().split()
                tmp_file.write(lis_line[0] + '\t' + lis_line[-1] + '\n')
        tmp_file.close()
        tf_metrics.recover_reduce_sentence_length('dg_result.txt', 'dg_NERdata/test_raw.txt', 'dg_rc_result.txt')
        tf_metrics.BIO2line_file('dg_rc_result.txt', 'dg_NERdata/result_file.txt')
        f1score = tf_metrics.get_f1score(result_file='dg_NERdata/result_file.txt', target_file='dg_NERdata/train_v_8.txt')
        print('dg_f1score: {}'.format(f1score))
        # End of the daGuan-specific block.

        # Write the results to a file
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'), 'a', encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
            fd.write('dg_f1score: {}\n'.format(f1score))  # comment out if not running the daGuan task
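
Example #5 maps IOBES tags back to IOB via iobes_iob before scoring with conlleval. That helper is not shown in the snippet; a sketch of the standard conversion it presumably implements:

def iobes_iob(tags):
    # Standard IOBES -> IOB mapping: S-X becomes B-X, E-X becomes I-X,
    # and B-/I-/O tags pass through unchanged.
    converted = []
    for tag in tags:
        if tag.startswith('S-'):
            converted.append('B-' + tag[2:])
        elif tag.startswith('E-'):
            converted.append('I-' + tag[2:])
        else:
            converted.append(tag)
    return converted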
Example #6
def train(FLAGS):
    print(FLAGS.bert_config_file)

    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)  # load the config with BERT's own helper

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # check that the output dir exists
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)

    processor = processors['ner'](FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    session_config = tf.ConfigProto(
        log_device_placement=False,  # whether to log device placement
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True)  # allow ops to fall back from GPU to CPU

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train and FLAGS.do_dev:
        # Load the training data
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(
            num_train_steps * FLAGS.warmup_proportion
        )  # BERT starts with a small learning rate; after the warmup fraction it returns to the configured value

        logger.info("***** Running training *****")

        eval_examples = processor.get_dev_examples(FLAGS.data_dir)

    label_list = processor.get_labels()

    # 1. Convert the examples to tf_record files
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    if not os.path.exists(train_file):
        filed_based_convert_examples_to_features(train_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, train_file,
                                                 FLAGS.output_dir)

    # 2. Read the record data and assemble batches
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    if not os.path.exists(eval_file):
        filed_based_convert_examples_to_features(eval_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, eval_file,
                                                 FLAGS.output_dir)

    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    # The returned model_fn is a function that defines the model plus its training
    # and evaluation logic; via an initialization hook it loads the BERT checkpoint
    # to initialize this model's own parameters.
    # This is TF's newer Estimator architecture: define the model in model_fn, then
    # let the Estimator API drive training, prediction and evaluation.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                FLAGS=FLAGS)

    params = {'batch_size': FLAGS.train_batch_size}

    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
        estimator=estimator,
        metric_name='loss',  # the metric to monitor
        max_steps_without_decrease=num_train_steps,  # stop if the metric has not decreased within this many steps
        eval_dir=None,  # defaults to estimator.eval_dir(), where the eval summary files are stored
        min_steps=0,  # minimum number of training steps before early stopping may trigger
        run_every_secs=None,
        run_every_steps=FLAGS.save_checkpoints_steps
    )  # how often (in steps) should_stop_fn is invoked

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=num_train_steps,
                                        hooks=[early_stopping_hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
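
Several comments above describe BERT's linear warmup: while global_step is below num_warmup_steps, the learning rate is scaled by global_step / num_warmup_steps. Stripped of the Estimator machinery, the schedule reduces to the following plain-Python sketch (BERT's optimization.py additionally applies polynomial decay after warmup):

def warmup_learning_rate(init_lr, global_step, num_warmup_steps):
    # Linear warmup as described in the comments above.
    if num_warmup_steps and global_step < num_warmup_steps:
        return init_lr * float(global_step) / float(num_warmup_steps)
    return init_lr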
Example #7
    def train(self):
        if ARGS.bert:
            from bert_data_utils import BertDataUtils
            tokenizer = tokenization.FullTokenizer(vocab_file=ARGS.vocab_dir, )
            self.train_data = BertDataUtils(tokenizer, batch_size=1)
            self.dev_data = BertDataUtils(tokenizer, batch_size=20)
            self.dev_batch = self.dev_data.iteration()
        else:
            from data_utils import DataBatch
            self.train_data = DataBatch(data_type='train', batch_size=1)

            data = {
                "batch_size": self.train_data.batch_size,
                "input_size": self.train_data.input_size,
                "vocab": self.train_data.vocab,
                "tag_map": self.train_data.tag_map,
            }

            f = open("data/data_map.pkl", "wb")
            cPickle.dump(data, f)
            f.close()
            self.vocab = self.train_data.vocab
            self.input_size = len(self.vocab.values()) + 1
            self.dev_data = DataBatch(data_type='dev', batch_size=300)
            self.dev_batch = self.dev_data.iteration()
        self.nums_tags = len(self.train_data.tag_map.keys())
        self.tag_map = self.train_data.tag_map
        self.train_length = len(self.train_data.data)

        # self.test_data = DataBatch(data_type='test', batch_size=100)
        # self.test_batch = self.test_data.get_batch()
        # save vocab
        print("-" * 50)
        print("train data:\t", self.train_length)
        print("nums of tags:\t", self.nums_tags)

        self.__creat_model()
        with tf.Session() as sess:
            with tf.device("/gpu:0"):
                ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
                if ckpt and tf.train.checkpoint_exists(
                        ckpt.model_checkpoint_path):
                    print("restore model")
                    self.saver.restore(sess, ckpt.model_checkpoint_path)
                else:
                    sess.run(tf.global_variables_initializer())

                tvars = tf.trainable_variables()
                (assignment_map, initialized_variable_names) = \
                    modeling.get_assignment_map_from_checkpoint(tvars,
                                                             ARGS.init_checkpoint)
                tf.train.init_from_checkpoint(ARGS.init_checkpoint,
                                              assignment_map)
                for var in tvars:
                    init_string = ""
                    if var.name in initialized_variable_names:
                        init_string = ", *INIT_FROM_CKPT*"
                    print("  name = %s, shape = %s%s", var.name, var.shape,
                          init_string)
                for i in range(self.max_epoch):
                    print("-" * 50)
                    print("epoch {}".format(i))

                    steps = 0
                    for batch in self.train_data.get_batch():
                        steps += 1
                        if ARGS.bert:
                            global_steps, loss, logits, acc, length = self.bert_step(
                                sess, batch)
                        else:
                            global_steps, loss, logits, acc, length = self.step(
                                sess, batch)
                        if steps % 1 == 0:
                            print("[->] step {}/{}\tloss {:.2f}\tacc {:.2f}".
                                  format(steps,
                                         len(self.train_data.batch_data), loss,
                                         acc))
                    if ARGS.bert:
                        self.bert_evaluate(sess, "ORG")
                        self.bert_evaluate(sess, "PER")
                    else:
                        self.evaluate(sess, "ORG")
                        self.evaluate(sess, "PER")
                    self.saver.save(sess, self.checkpoint_path)
Example #8
    # pred_ids = tf.identity(pred_ids, 'pred_ids')
    # probabilities = tf.identity(probabilities, 'pred_prob')
    saver = tf.train.Saver()

    # (total_loss, logits, trans, pred_ids) = create_model(
    #     bert_config=bert_config, is_training=False, input_ids=input_ids_p, input_mask=input_mask_p, segment_ids=None,
    #     labels=None, num_labels=num_labels, use_one_hot_embeddings=False, dropout_rate=1.0)

    saver = tf.train.Saver()

    print("model_dir: ", model_dir)

    saver.restore(sess, tf.train.latest_checkpoint(model_dir))

tokenizer = tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_dir, 'vocab.txt'),
    do_lower_case=DO_LOWER_CASE)


class InputFeatures(object):
    """A single set of features of data."""
    def __init__(
        self,
        input_ids,
        input_mask,
        segment_ids,
    ):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        # self.label_ids = label_ids
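
A hypothetical sketch of how an InputFeatures instance like the one above is typically filled for a single sentence (the zero-padding mirrors Example #9 below; the tokenizer and max_seq_length are assumed to be in scope):

def convert_single_sentence(text, tokenizer, max_seq_length):
    # Reserve two positions for the [CLS] and [SEP] markers.
    tokens = ["[CLS]"] + tokenizer.tokenize(text)[:max_seq_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(input_ids)
    pad_len = max_seq_length - len(input_ids)  # zero-pad up to max_seq_length
    return InputFeatures(input_ids=input_ids + [0] * pad_len,
                         input_mask=input_mask + [0] * pad_len,
                         segment_ids=segment_ids + [0] * pad_len)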
Example #9
            tag_ids = tag_ids + (max_length - len(tag_ids)) * [0]
            inputs_ids = inputs_ids + (max_length - len(inputs_ids)) * [0]
            segment_ids = segment_ids + (max_length - len(segment_ids)) * [0]
            input_mask = input_mask + (max_length - len(input_mask)) * [0]
            assert len(tag_ids) == len(inputs_ids) == len(segment_ids) == len(
                input_mask)
            padded_data.append(
                [ntokens, tag_ids, inputs_ids, segment_ids, input_mask])
        return padded_data

    def iteration(self):
        idx = 0
        while True:
            yield self.batch_data[idx]
            idx += 1
            if idx > len(self.batch_data) - 1:
                idx = 0

    def get_batch(self):
        for data in self.batch_data:
            yield data


if __name__ == "__main__":
    from bert_base.bert import tokenization
    tokenizer = tokenization.FullTokenizer(vocab_file="data/vocab.txt", )
    bert_data_util = BertDataUtils(tokenizer)
    bert_data_util.load_data()
    bert_data_util.prepare_batch()
    import pdb
    pdb.set_trace()
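
The hand-rolled index wrapping in iteration() above can be written equivalently with itertools.cycle, which makes the cycle-forever intent explicit:

import itertools

def iteration(self):
    # Equivalent to the index-wrapping loop above: cycle through the
    # prepared batches indefinitely.
    for batch in itertools.cycle(self.batch_data):
        yield batch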
Example #10
def main(_):
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    processor = ThuProcessor()
    # define the tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    # estimator run parameters
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=5,
        log_step_count_steps=500,
        session_config=tf.ConfigProto(log_device_placement=True)
        #session_config=tf.ConfigProto(log_device_placement=True,
        #                               device_count={'GPU': 1}))
    )

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # get_labels() must be called after get_train_examples (or the other get_*_examples)
    label_list = processor.get_labels()
    logger.info('************ label_list= %s', ' '.join(label_list))
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps)

    # params is a dict: its keys are the parameter names used inside model_fn, its values the corresponding data
    params = {
        'batch_size': FLAGS.train_batch_size,
    }

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=params,
    )

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file, 'train')
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", FLAGS.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            num_label=len(label_list),
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file, 'eval')
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            num_label=len(label_list),
            is_training=False,
            drop_remainder=False)

        result = estimator.evaluate(input_fn=eval_input_fn)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file,
                                                'test')

        logger.info("***** Running prediction*****")
        logger.info("  Num examples = %d", len(predict_examples))
        logger.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            num_label=len(label_list),
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.txt")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            logger.info("***** Predict results *****")
            for prediction in result:
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in prediction) + "\n"
                writer.write(output_line)
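Example #11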
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    examples = read_examples(FLAGS.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=FLAGS.max_seq_length,
                                            tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(features=features,
                                seq_length=FLAGS.max_seq_length)

    with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
                                                 "w")) as writer:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            all_features = []
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = result["layer_output_%d" % j]
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [
                        round(float(x), 6)
                        for x in layer_output[i:(i + 1)].flat
                    ]
                    all_layers.append(layers)
                features = collections.OrderedDict()
                features["token"] = token
                features["layers"] = all_layers
                all_features.append(features)
            output_json["features"] = all_features
            writer.write(json.dumps(output_json) + "\n")
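
Example #11 writes one JSON object per input line. Reading the embeddings back is symmetric; a sketch using only the field names emitted above (features, token, layers, index, values):

import json

def load_bert_features(path):
    # Yield (token, {layer_index: vector}) pairs from the JSONL file
    # produced by the extraction loop above.
    with open(path, encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            for feat in record["features"]:
                vectors = {layer["index"]: layer["values"] for layer in feat["layers"]}
                yield feat["token"], vectors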
Example #12
def train(FLAGS):
    print(FLAGS.bert_config_file)

    processors = {'ner': NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # max_seq_length must not exceed the model's max_position_embeddings (512 here)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            'Cannot use sequence length %d because the BERT model '
            'was only trained up to sequence length %d' %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Check that the output dir exists
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)

    processor = processors['ner'](FLAGS.output_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    """
    tf.ConfigProto: tensorflow config protocol,tensorflow配置协议
    :param log_device_placement: 如果是True,我们可以看到我们的tensor、op是在哪台设备、哪颗CPU上运行的。如果是Flase就看不到。
    :param inter_op_parallelism_threads: 每个进程可用的为进行阻塞操作节点准备线程池中线程数量,设置为0代表让系统选择合适数值。
    :param intra_op_parallelism_threads: 线程池中线程的数量,如果设置为0代表让系统设置合适的数值。
    :param allow_soft_placement: 这个参数制定是否允许计算的“软分配”。
                                 如果这个参数设置为True,那么一个操作在下列情况下会被放在CPU上运行:
                                     1、操作没有GPU的实现
                                     2、没有已知的GPU
                                     3、需要与来自CPU的reftype输入进行协同定位
    """
    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    """
    tf.estimator.RunConfig: tensorflow运行配置文件
    :param model_dir: 模型的输出路径
    :param save_summary_steps: 多少步进行可视化更新
    :param save_checkpoints_steps: 多少步进行存储ck文件
    :param session_config: session的配置
    """
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train and FLAGS.do_dev:
        # Load the training data
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(1.0 * len(train_examples) /
                              FLAGS.train_batch_size * FLAGS.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        logger.info('***** Running training *****')

        eval_examples = processor.get_dev_examples(FLAGS.data_dir)

    label_list = processor.get_labels()

    # 1. Convert the training data to TFRecord format
    train_file = os.path.join(FLAGS.output_dir, 'train.tf_record')
    if not os.path.exists(train_file):
        filed_based_convert_examples_to_features(
            examples=train_examples,
            label_list=label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer,
            output_file=train_file,
            output_dir=FLAGS.output_dir)

    # 2. Read the TFRecord training data and batch it
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    # 1. Convert the dev data to TFRecord format
    eval_file = os.path.join(FLAGS.output_dir, 'eval.tf_record')
    if not os.path.exists(eval_file):
        filed_based_convert_examples_to_features(
            examples=eval_examples,
            label_list=label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer,
            output_file=eval_file,
            output_dir=FLAGS.output_dir)

    # 2. Read the TFRecord dev data and batch it
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)
    """
    返回的model_fn是一个函数,其定义了模型、训练、评测方法
    并且使用了钩子参数,加载了Bert模型的参数进行了自己模型的参数初始化过程
    tf新的架构方法,通过定义model_fn函数,定义模型,然后通过EstimatorAPI进行模型的其他工作
    EstimatorAPI就可以控制模型的训练、预测和评估工作等。
    """
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                FLAGS=FLAGS)

    params = {'batch_size': FLAGS.train_batch_size}

    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    # Set up early stopping to guard against overfitting. Note that with
    # max_steps_without_decrease=num_train_steps the hook can only fire if the
    # loss never improves for the entire run, so it effectively never stops
    # training early; pass a smaller value to get real early stopping.
    early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
        estimator=estimator,
        metric_name='loss',
        max_steps_without_decrease=num_train_steps,
        eval_dir=None,
        min_steps=0,
        run_every_secs=None,
        run_every_steps=FLAGS.save_checkpoints_steps)

    # Train and evaluate
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=num_train_steps,
                                        hooks=[early_stopping_hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
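
file_based_input_fn_builder is called throughout these examples but never shown. A minimal sketch of what it typically looks like in this codebase, assuming TF 1.x and the feature names used by the NER conversion step (the shuffle buffer size is an assumption):

def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder):
    # Schema of the serialized examples: four fixed-length int64 sequences.
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([seq_length], tf.int64),
    }

    def _decode_record(record):
        # Parse one serialized tf.Example and cast int64 features to int32.
        example = tf.parse_single_example(record, name_to_features)
        return {name: tf.to_int32(t) if t.dtype == tf.int64 else t
                for name, t in example.items()}

    def input_fn(params):
        batch_size = params["batch_size"]
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=300)  # buffer size is an assumption
        return d.apply(
            tf.contrib.data.map_and_batch(_decode_record,
                                          batch_size=batch_size,
                                          drop_remainder=drop_remainder))

    return input_fn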
Example #13
def main(_):
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True."
        )

    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.device_map

    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the previous run's artifacts when retraining; do not clean when predicting
    if FLAGS.clean and FLAGS.do_train:
        if os.path.exists(FLAGS.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print("pleace remove the files of output dir and data.conf")
                exit(-1)

    # check output dir exists
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)
    history_max_steps = load_global_step_from_checkpoint_dir(FLAGS.output_dir)
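    # load_global_step_from_checkpoint_dir is not shown here; presumably it
    # reads the global_step tensor from the latest checkpoint in output_dir and
    # returns 0 when no checkpoint exists, so a retrain extends the step budget.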

    processor = processors[FLAGS.task_name](FLAGS.data_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    session_config = tf.compat.v1.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True,
    )

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_checkpoints_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        session_config=session_config,
    )

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train and FLAGS.do_eval:
        # Load the training data
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = history_max_steps + int(
            len(train_examples) * 1.0 / FLAGS.batch_size *
            FLAGS.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError("training data is too small...")
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        eval_examples = processor.get_dev_examples(FLAGS.data_dir)

        # Print dev-set statistics
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)

    label_list = processor.get_labels()
    # The returned model_fn is a function that defines the model plus its
    # training and evaluation behavior, and uses an initialization hook to load
    # the pretrained BERT weights into this model's parameters.
    # TF's newer architecture: define the model via model_fn, then let the
    # Estimator API handle training, prediction, and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        args=FLAGS,
    )

    params = {"batch_size": FLAGS.batch_size}
    estimator = tf.estimator.Estimator(
        model_fn,
        params=params,
        config=run_config,
        # warm_start_from=run_config.model_dir,
    )

    if FLAGS.do_train and FLAGS.do_eval:
        # 1. Convert the data to TFRecord files
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        # 2. Read the records and batch them
        train_input_fn = get_tf_record_data(train_file, train_examples,
                                            label_list, tokenizer)

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        eval_input_fn = get_tf_record_data(eval_file, eval_examples,
                                           label_list, tokenizer)

        train(estimator, num_train_steps, train_input_fn, eval_input_fn)

    if FLAGS.do_predict:
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(FLAGS.output_dir, "label2id.pkl"),
                         "rb") as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        predict_input_fn = get_tf_record_data(predict_file, predict_examples,
                                              label_list, tokenizer, False,
                                              False)
        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ""
                line_token = predict_line.words
                label_token = predict_line.labels
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                    break
                for label_id in prediction:
                    if idx >= len_seq:
                        break
                    if label_id == 0:
                        continue
                    curr_labels = id2label[label_id]
                    if curr_labels in ["[CLS]", "[SEP]"]:
                        continue
                    try:
                        line += (line_token[idx] + " " + label_token[idx] +
                                 " " + curr_labels + "\n")
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ""
                        break
                    idx += 1
                writer.write(line + "\n")

        with codecs.open(output_predict_file, "w", encoding="utf-8") as writer:
            result_to_pair(writer)

        eval_result = conlleval.return_report(output_predict_file)
        print("".join(eval_result))
        # Append the results to a file
        with codecs.open(os.path.join(FLAGS.output_dir, "predict_score.txt"),
                         "a",
                         encoding="utf-8") as fd:
            fd.write("".join(eval_result))

    adam_filter(FLAGS.output_dir)
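
get_tf_record_data is a wrapper that is not shown in this example. Judging from its call sites above, it converts the examples to a TFRecord file (if one does not already exist) and returns the matching input_fn; a sketch under that assumption, reusing FLAGS and the helpers from the other examples:

def get_tf_record_data(record_file, examples, label_list, tokenizer,
                       is_training=True, drop_remainder=True):
    # Write the TFRecord file once; later runs reuse it.
    if not os.path.exists(record_file):
        filed_based_convert_examples_to_features(
            examples, label_list, FLAGS.max_seq_length, tokenizer,
            record_file, FLAGS.output_dir)
    # Build the batching input_fn over the serialized records.
    return file_based_input_fn_builder(input_file=record_file,
                                       seq_length=FLAGS.max_seq_length,
                                       is_training=is_training,
                                       drop_remainder=drop_remainder)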
Example #14
def train(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the previous run's artifacts when retraining; no cleaning is
    # done when predicting. args.clean and args.do_train default to True, so a
    # second training run deletes the files produced by the first.
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files in the output dir and data.conf')
                exit(-1)

    # Check the output dir exists (defaults to root path + 'output')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    processor = processors[args.ner](args.output_dir)

    # Build the tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(
        # presumably the number of physical CPUs
        device_count={"CPU": 2},  # limit to num_cpu_core CPU usage

        # logical CPUs = cores per CPU * hyperthreads per core * physical CPUs
        log_device_placement=False,  # True logs device placement; False does not
        inter_op_parallelism_threads=6,
        intra_op_parallelism_threads=6,
        allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,
        save_summary_steps=500,  # save TensorBoard summaries every 500 steps
        save_checkpoints_steps=500,  # save a checkpoint every 500 steps
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    # default args.do_train and args.do_eval are True
    if args.do_train and args.do_eval:
        # Load the training data
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size *
            args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        eval_examples = processor.get_dev_examples(args.data_dir)

        # Print dev-set statistics
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

    tf.logging.info("get labels")
    label_list = processor.get_labels()
    # The returned model_fn is a function that defines the model plus its
    # training and evaluation behavior, and uses an initialization hook to load
    # the pretrained BERT weights into this model's parameters.
    # TF's newer architecture: define the model via model_fn, then let the
    # Estimator API handle training, prediction, and evaluation.
    tf.logging.info('build model_fn via model_fn_builder')
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}

    tf.logging.info('create estimator')
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    # Train and evaluate only when both do_train and do_eval are True
    if args.do_train and args.do_eval:
        # 1. Convert the data to TFRecord files
        tf.logging.info('convert data into train tf_record')
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, train_file,
                                                     args.output_dir)

        # 2. Read the records and batch them
        tf.logging.info('read train record and convert to batch')
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)

        tf.logging.info('convert data into eval tf_record')
        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, eval_file,
                                                     args.output_dir)

        tf.logging.info('read eval record and convert to batch')
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together
        tf.logging.info('set up early stopping hook')
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)
        '''estimator.train(input_fn=lambda: my_input_fn(TRAIN_DATA), steps=300)
            # evaluate after training, passing in our test data
            test_result = estimator.evaluate(input_fn=lambda: my_input_fn(TEST_DATA))
            # print the evaluation result'''
        t0 = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=[early_stopping_hook])  # commented out by default

        t1 = time.time()
        tf.logging.info('train spent time:{}s'.format(t1 - t0))
        # added by hand (not in the original script)
        eval_loss = estimator.evaluate(input_fn=eval_input_fn)
        t2 = time.time()
        tf.logging.info('eval_loss=\n{}'.format(eval_loss))
        tf.logging.info('eval spent time:{}s'.format(t2 - t1))

        # tf.logging.info('call train_spec')
        # train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_train_steps,
        #                                      hooks=[early_stopping_hook]
        #                                     )
        # tf.logging.info('call eval_spec')
        # eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        # tf.logging.info('call tf.estimator.train_and_evaluate')
        # works on both a single machine and in distributed settings
        # tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # default do_predict is True
    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)

        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 args.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 args.output_dir,
                                                 mode="test.txt")

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

        predict_drop_remainder = False

        tf.logging.info('build predict_input_fn')
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        # prediction also runs in batches of batch_size
        tf.logging.info('start predict: estimator.predict')
        result = estimator.predict(input_fn=predict_input_fn)

        # The prediction output appears to be written here; each line has three
        # columns: the token, the gold label, and the predicted label.
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    tf.logging.info('predict_line.text:\n{}'.format(
                        predict_line.text))
                    tf.logging.info('predict_line.label:\n{}'.format(
                        predict_line.label))
                    break
                for label_id in prediction:
                    if idx >= len_seq:
                        break
                    if label_id == 0:
                        continue
                    curr_labels = id2label[label_id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info('e:\n{}'.format(e))

                        tf.logging.info('predict_line.text:\n{}'.format(
                            predict_line.text))
                        tf.logging.info('predict_line.label:\n{}'.format(
                            predict_line.label))
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        # Open the file and write out the predictions
        tf.logging.info('save predicted result: label_test.txt')
        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)

        tf.logging.info('import conlleval to evaluate and get eval_result')
        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        tf.logging.info('eval_result:\n{}'.format(''.join(eval_result)))
        # Append the results to a file
        tf.logging.info('save predict_score.txt')
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'),
                         'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
    # filter Adam variables out of the final checkpoint
    if args.filter_adam_var:
        adam_filter(args.output_dir)
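
adam_filter strips the Adam optimizer's slot variables (adam_m/adam_v), which are useless at inference time and roughly triple the checkpoint's size. A sketch of how it is commonly implemented in this codebase (treat the details as an approximation):

def adam_filter(model_path):
    last_name = tf.train.latest_checkpoint(model_path)
    if last_name is None:
        return
    with tf.Graph().as_default() as graph:
        # Rebuild the training graph from the .meta file.
        imported_meta = tf.train.import_meta_graph(last_name + '.meta')
        with tf.Session(graph=graph) as sess:
            imported_meta.restore(sess, last_name)
            # Keep every variable except the Adam moment accumulators.
            need_vars = [var for var in tf.global_variables()
                         if 'adam_v' not in var.name and 'adam_m' not in var.name]
            saver = tf.train.Saver(need_vars)
            saver.save(sess, os.path.join(model_path, 'model.ckpt'))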