Example #1
def main(unused_argv):
    del unused_argv  # Unused

    tf.logging.set_verbosity(tf.logging.INFO)

    assert FLAGS.seq_len > 0
    assert FLAGS.perm_size > 0

    FLAGS.n_token = data_utils.VOCAB_SIZE
    tf.logging.info("n_token {}".format(FLAGS.n_token))

    if not tf.gfile.Exists(FLAGS.model_dir):
        tf.gfile.MakeDirs(FLAGS.model_dir)

    # Get train input function
    train_input_fn, train_record_info_dict = get_input_fn("train")

    tf.logging.info("num of batches {}".format(
        train_record_info_dict["num_batch"]))

    # Get train cache function
    train_cache_fn = get_cache_fn(FLAGS.mem_len)

    ##### Get model function
    model_fn = get_model_fn()

    ##### Create TPUEstimator
    # TPU Configuration
    run_config = model_utils.configure_tpu(FLAGS)

    # TPU Estimator
    estimator = tpu_estimator.TPUEstimator(
        model_fn=model_fn,
        train_cache_fn=train_cache_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        params={"track_mean": FLAGS.track_mean},
        train_batch_size=FLAGS.train_batch_size,
        eval_on_tpu=FLAGS.use_tpu)

    #### Training
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)
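For context, `get_input_fn` above returns an input function that TPUEstimator later calls with a `params` dict carrying the per-core batch size. The helper below is only a minimal sketch of that contract (hypothetical names, TF 1.x tf.data), not the project's actual implementation:

def make_train_input_fn(record_glob, seq_len):  # hypothetical helper, sketch only
    def input_fn(params):
        # TPUEstimator injects the per-core batch size via params.
        batch_size = params["batch_size"]

        def parse(serialized):
            spec = {"input": tf.FixedLenFeature([seq_len], tf.int64)}
            return tf.parse_single_example(serialized, spec)

        dataset = tf.data.TFRecordDataset(tf.gfile.Glob(record_glob))
        dataset = dataset.map(parse).repeat()
        # Fixed batch shapes are required on TPU, hence drop_remainder=True.
        return dataset.batch(batch_size, drop_remainder=True)
    return input_fn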
Example #2
File: run_embed.py  Project: w-h-m/coqa
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    
    tpu_config = model_utils.configure_tpu(FLAGS)
    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)
    
    model_builder = XLNetModelBuilder(
        default_model_config=model_config,
        default_run_config=run_config,
        default_init_checkpoint=FLAGS.init_checkpoint,
        use_tpu=FLAGS.use_tpu)
    
    model_fn = model_builder.get_model_fn(model_config, run_config, FLAGS.init_checkpoint, FLAGS.model_type)
    
    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=tpu_config,
        export_to_tpu=FLAGS.use_tpu,
        train_batch_size=1)
    
    tokenizer = XLNetTokenizer(
        sp_model_file=FLAGS.spiece_model_file,
        lower_case=FLAGS.lower_case)
    
    example_converter = XLNetExampleConverter(
        label_list=[],
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer)
    
    features = example_converter.convert_examples_to_features([PaddingInputExample()])
    
    input_fn = XLNetInputBuilder.get_input_builder(features, FLAGS.max_seq_length, True, False)
    estimator.train(input_fn, max_steps=1)
    
    tf.gfile.MakeDirs(FLAGS.export_dir)
    serving_input_fn = XLNetInputBuilder.get_serving_input_fn(FLAGS.max_seq_length)
    estimator.export_savedmodel(FLAGS.export_dir, serving_input_fn, as_text=False)
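XLNetInputBuilder.get_serving_input_fn is not shown in this snippet; as a rough sketch (the feature names and dtypes are assumptions), a serving input function for an export like the one above typically exposes raw placeholders and wraps them in a ServingInputReceiver:

def make_serving_input_fn(max_seq_length):  # sketch only
    def serving_input_fn():
        receiver_tensors = {
            "input_ids": tf.placeholder(tf.int32, [None, max_seq_length], name="input_ids"),
            "input_mask": tf.placeholder(tf.float32, [None, max_seq_length], name="input_mask"),
            "segment_ids": tf.placeholder(tf.int32, [None, max_seq_length], name="segment_ids"),
        }
        # Features and receiver tensors coincide for raw-tensor serving.
        return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)
    return serving_input_fn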
Example #3
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    # Validate flags
    if FLAGS.save_steps is not None:
        FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

    if FLAGS.do_predict:
        predict_dir = FLAGS.predict_dir
        if not tf.gfile.Exists(predict_dir):
            tf.gfile.MakeDirs(predict_dir)

    processor = ccfKeywordProcessor()

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval, `do_predict` or "
            "`do_submit` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    run_config = model_utils.configure_tpu(FLAGS)

    model_fn = get_model_fn(
        len(label_list) if label_list is not None else None)

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size,
            eval_batch_size=FLAGS.eval_batch_size)
    else:
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config)

    if FLAGS.do_train:
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)
        tf.logging.info("Use tfrecord file {}".format(train_file))

        train_examples = processor.get_train_examples(FLAGS.train_data)
        np.random.shuffle(train_examples)
        tf.logging.info("Num of train samples: {}".format(len(train_examples)))

        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, train_file,
                                                FLAGS.num_passes)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.dev_data)
        tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))

        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        if FLAGS.eval_only_one:
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps,
                                        checkpoint_path=FLAGS.eval_ckpt)
            pre, rec, f1 = get_metrics(result["conf_mat"], 3)
            tf.logging.info("eval ckpt" + FLAGS.eval_ckpt)
            tf.logging.info("eval_precision: {}".format(pre))
            tf.logging.info("eval_recall: {}".format(rec))
            tf.logging.info("eval_f1: {}".format(f1))
            tf.logging.info("eval_accuracy: {}".format(
                result["eval_accuracy"]))
            tf.logging.info("eval_loss: {}".format(result["eval_loss"]))
            tf.logging.info("-------------------------\n\n")

        else:
            # Filter out all checkpoints in the directory
            steps_and_files = []
            filenames = tf.gfile.ListDirectory(FLAGS.model_dir)

            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    cur_filename = join(FLAGS.model_dir, ckpt_name)
                    global_step = int(cur_filename.split("-")[-1])
                    tf.logging.info(
                        "Add {} to eval list.".format(cur_filename))
                    steps_and_files.append([global_step, cur_filename])
            steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

            # Decide whether to evaluate all ckpts
            if not FLAGS.eval_all_ckpt:
                steps_and_files = steps_and_files[-1:]

            result_list = list()
            for global_step, filename in sorted(steps_and_files,
                                                key=lambda x: x[0]):
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=filename)
                result_list.append([global_step, result])

            for step, result in result_list:
                tf.logging.info("\n\n------ step -------" + str(step))
                pre, rec, f1 = get_metrics(result["conf_mat"], 3)
                tf.logging.info("eval_precision: {}".format(pre))
                tf.logging.info("eval_recall: {}".format(rec))
                tf.logging.info("eval_f1: {}".format(f1))
                tf.logging.info("eval_accuracy: {}".format(
                    result["eval_accuracy"]))
                tf.logging.info("eval_loss: {}".format(result["eval_loss"]))
                tf.logging.info("-------------------------\n\n")

    if FLAGS.do_predict:
        """
        首先对使用的ckpt进行eval,防止加载错模型
        """
        eval_examples = processor.get_dev_examples(FLAGS.dev_data)
        tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))

        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=eval_steps,
                                    checkpoint_path=FLAGS.predict_ckpt)
        pre, rec, f1 = get_metrics(result["conf_mat"], FLAGS.label_num)

        tf.logging.info("\n\n\n加载的模型的效果为:")
        tf.logging.info("eval_precision: {}".format(pre))
        tf.logging.info("eval_recall: {}".format(rec))
        tf.logging.info("eval_f1: {}".format(f1))
        tf.logging.info("eval_accuracy: {}".format(result["eval_accuracy"]))
        tf.logging.info("eval_loss: {}".format(result["eval_loss"]))
        tf.logging.info("-------------------------\n\n\n")
        """
        做完eval后,进行模型的predict
        """
        eval_examples = processor.get_test_examples(FLAGS.test_data)

        num_actual_predict_examples = len(eval_examples)

        eval_file_base = "{}.len-{}.{}.predict.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, eval_file)

        pred_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # Decide whether to evaluate all ckpts
        result = estimator.predict(input_fn=pred_input_fn,
                                   checkpoint_path=FLAGS.predict_ckpt)
        with tf.gfile.GFile(FLAGS.predict_file_write_path, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                p = prediction["p"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability) for class_probability in p) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        tf.logging.info("模型预测完成。。。。")
Example #4
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

    bert_config = modeling.XLNetConfig(json_path=FLAGS.bert_config_file)

    tokenizer = tokenization.FullTokenizer(
        spm_model_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = configure_tpu(FLAGS)
    
    # tf.contrib.tpu.RunConfig(
    #     master=FLAGS.master,
    #     tpu_config=tf.contrib.tpu.TPUConfig(
    #         num_shards=FLAGS.num_tpu_cores,
    #         per_host_input_for_training=is_per_host))

    # examples = read_examples(FLAGS.input_file)
    json_examples = []
    for x in ['test', 'train', 'dev']:
        with open(os.path.join(FLAGS.input_file, x + '.english.jsonlines')) as f:
            json_examples.extend((json.loads(jsonline) for jsonline in f.readlines()))

    orig_examples = []
    bert_examples = []
    for i, json_e in enumerate(json_examples):
        e = process_example(json_e, i, should_filter_embedded_mentions=True)
        orig_examples.append(e)
        bert_examples.append(e.bertify(tokenizer))

    model_fn = model_fn_builder(
        bert_config=bert_config,
        run_config=run_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(
        examples=bert_examples, window_size=FLAGS.window_size, stride=FLAGS.stride, tokenizer=tokenizer)

    writer = h5py.File(FLAGS.output_file, 'w')
    with tqdm(total=sum(len(e.tokens) for e in orig_examples)) as t:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            document_index = int(result["unique_ids"])
            bert_example = bert_examples[document_index]
            orig_example = orig_examples[document_index]
            file_key = bert_example.doc_key.replace('/', ':')

            t.update(n=(result['extract_indices'] >= 0).sum())

            for output_index, bert_token_index in enumerate(result['extract_indices']):
                if bert_token_index < 0:
                    continue

                token_index = bert_example.bert_to_orig_map[bert_token_index]
                sentence_index, token_index = orig_example.unravel_token_index(token_index)

                dataset_key ="{}/{}".format(file_key, sentence_index)
                if dataset_key not in writer:
                    writer.create_dataset(dataset_key,
                                          (len(orig_example.sentence_tokens[sentence_index]), bert_config.hidden_size, len(layer_indexes)),
                                          dtype=np.float32)

                dset = writer[dataset_key]
                for j, layer_index in enumerate(layer_indexes):
                    layer_output = result["layer_output_%d" % j]
                    dset[token_index, :, j] = layer_output[output_index]
    writer.close()
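The script writes one HDF5 dataset per `<doc_key>/<sentence_index>` pair, shaped [num_tokens, hidden_size, num_layers]. A small usage sketch for reading the embeddings back (the file path is a placeholder for FLAGS.output_file):

with h5py.File("bert_embeddings.h5", "r") as reader:  # assumed path
    for doc_key in reader:
        for sentence_index in reader[doc_key]:
            embeddings = reader[doc_key][sentence_index][...]  # numpy array
            print(doc_key, sentence_index, embeddings.shape)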
Example #5
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    #### Validate flags
    if FLAGS.save_steps is not None:
        FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

    if FLAGS.do_predict:
        predict_dir = FLAGS.predict_dir
        if not tf.gfile.Exists(predict_dir):
            tf.gfile.MakeDirs(predict_dir)

    processors = {
        "mnli_matched": MnliMatchedProcessor,
        "mnli_mismatched": MnliMismatchedProcessor,
        'sts-b': StsbProcessor,
        'imdb': ImdbProcessor,
        "yelp5": Yelp5Processor
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval, `do_predict` or "
            "`do_submit` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    run_config = model_utils.configure_tpu(FLAGS)

    model_fn = get_model_fn(
        len(label_list) if label_list is not None else None)

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size,
            eval_batch_size=FLAGS.eval_batch_size)
    else:
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config)

    if FLAGS.do_train:
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)
        tf.logging.info("Use tfrecord file {}".format(train_file))

        train_examples = processor.get_train_examples(FLAGS.data_dir)
        tf.logging.info("Num of train samples: {}".format(len(train_examples)))

        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, train_file,
                                                FLAGS.num_passes)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_eval or FLAGS.do_predict:
        if FLAGS.eval_split == "dev":
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        else:
            eval_examples = processor.get_test_examples(FLAGS.data_dir)

        tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))

    if FLAGS.do_eval:
        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on. These do NOT count towards the metric (all tf.metrics
        # support a per-instance weight, and these get a weight of 0.0).
        #
        # Modified in XL: We also adopt the same mechanism for GPUs.
        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        # Filter out all checkpoints in the directory
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.model_dir)

        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = join(FLAGS.model_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        # Decide whether to evaluate all ckpts
        if not FLAGS.eval_all_ckpt:
            steps_and_files = steps_and_files[-1:]

        eval_results = []
        for global_step, filename in sorted(steps_and_files,
                                            key=lambda x: x[0]):
            ret = estimator.evaluate(input_fn=eval_input_fn,
                                     steps=eval_steps,
                                     checkpoint_path=filename)

            ret["step"] = global_step
            ret["path"] = filename

            eval_results.append(ret)

            tf.logging.info("=" * 80)
            log_str = "Eval result | "
            for key, val in sorted(ret.items(), key=lambda x: x[0]):
                log_str += "{} {} | ".format(key, val)
            tf.logging.info(log_str)

        key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy"
        eval_results.sort(key=lambda x: x[key_name], reverse=True)

        tf.logging.info("=" * 80)
        log_str = "Best result | "
        for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]):
            log_str += "{} {} | ".format(key, val)
        tf.logging.info(log_str)

    if FLAGS.do_predict:
        eval_file_base = "{}.len-{}.{}.predict.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, eval_file)

        pred_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        predict_results = []
        with tf.gfile.Open(
                os.path.join(predict_dir, "{}.tsv".format(task_name)),
                "w") as fout:
            fout.write("index\tprediction\n")

            for pred_cnt, result in enumerate(
                    estimator.predict(input_fn=pred_input_fn,
                                      yield_single_examples=True,
                                      checkpoint_path=FLAGS.predict_ckpt)):
                if pred_cnt % 1000 == 0:
                    tf.logging.info(
                        "Predicting submission for example: {}".format(
                            pred_cnt))

                logits = [float(x) for x in result["logits"].flat]
                predict_results.append(logits)

                if len(logits) == 1:
                    label_out = logits[0]
                elif len(logits) == 2:
                    if logits[1] - logits[0] > FLAGS.predict_threshold:
                        label_out = label_list[1]
                    else:
                        label_out = label_list[0]
                elif len(logits) > 2:
                    max_index = np.argmax(np.array(logits, dtype=np.float32))
                    label_out = label_list[max_index]
                else:
                    raise NotImplementedError

                fout.write("{}\t{}\n".format(pred_cnt, label_out))

        predict_json_path = os.path.join(predict_dir,
                                         "{}.logits.json".format(task_name))

        with tf.gfile.Open(predict_json_path, "w") as fp:
            json.dump(predict_results, fp, indent=4)
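As a follow-up, the dumped logits file can be inspected offline; the sketch below (illustrative only, task name and path are assumptions) converts each row of a classification task's `<task_name>.logits.json` into softmax probabilities:

with tf.gfile.Open("predict/imdb.logits.json") as fp:  # assumed path under predict_dir
    logits = np.array(json.load(fp), dtype=np.float32)

# Numerically stable softmax over the class dimension.
probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)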
Example #6
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    
    np.random.seed(cf.random_seed)
    
    processor = NerProcessor(
        data_dir=cf.train_data,
        task_name=cf.task_name.lower())
    
    # label_list = processor.get_labels()
    label_list = processor.labels
    
    model_config = xlnet.XLNetConfig(json_path=cf.model_config_path)
    
    model_builder = XLNetModelBuilder(
        model_config=model_config,
        use_tpu=cf.use_tpu)
    
    model_fn = model_builder.get_model_fn(label_list)
    
    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    tpu_config = model_utils.configure_tpu(cf)
    
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=cf.use_tpu,
        model_fn=model_fn,
        config=tpu_config,
        export_to_tpu=cf.use_tpu,
        train_batch_size=cf.train_batch_size,
        eval_batch_size=cf.eval_batch_size,
        predict_batch_size=cf.predict_batch_size)
    
    tokenizer = XLNetTokenizer(
        sp_model_file=cf.spiece_model_file,
        lower_case=cf.lower_case)
    
    example_converter = XLNetExampleConverter(
        label_list=label_list,
        max_seq_length=cf.max_seq_length,
        tokenizer=tokenizer)
    
    if cf.do_train and cf.do_eval:   # start training

        train_file = os.path.join(cf.output_dir, "train.tf_record")
        tf.logging.info("Use tfrecord samples: {}".format(len(train_file)))

        train_examples = processor.get_train_examples()    # train data
        np.random.shuffle(train_examples)

        example_converter.file_based_convert_examples_to_features(train_examples, train_file)
        train_steps = int(len(train_examples) * cf.num_train_epochs / cf.train_batch_size)
        cf.warmup_steps = int(0.1 * train_steps)

        
        tf.logging.info("***** Run training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", cf.train_batch_size)
        tf.logging.info("  Num steps = %d", cf.train_steps)

        
        # train_features = example_converter.convert_examples_to_features(train_examples)

        # if not os.path.exists(train_file):
        #     train_features = example_converter.file_based_convert_examples_to_features(train_examples, train_file)


        # Read the TFRecord data
        # train_input_fn = XLNetInputBuilder.get_input_builder(train_features, cf.max_seq_length, True, True)
        train_input_fn = XLNetInputBuilder.get_file_based_input_fn(
            input_file=train_file,
            seq_length=cf.max_seq_length,
            is_training=True,
            drop_remainder=True)
        
        estimator.train(input_fn=train_input_fn, max_steps=train_steps)

        eval_examples = processor.get_dev_examples()
        
        tf.logging.info("***** Run evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", cf.eval_batch_size)

        # early stop hook
        # early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
        #                             estimator=estimator,
        #                             metric_name='loss',
        #                             max_steps_without_decrease=cf.num_train_steps,
        #                             eval_dir=None,
        #                             min_steps=0,
        #                             run_every_secs=None,
        #                             run_every_steps=cf.save_checkpoints_steps
        # )

        eval_features = example_converter.convert_examples_to_features(eval_examples)
        eval_input_fn = XLNetInputBuilder.get_input_builder(eval_features, cf.max_seq_length, False, False)

        # train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=cf.num_train_steps,
        #                                     hooks=[early_stopping_hook])
        # eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        # tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

        result = estimator.evaluate(input_fn=eval_input_fn)
        
        precision = result["precision"]
        recall = result["recall"]
        f1_score = 2.0 * precision * recall / (precision + recall)
        
        tf.logging.info("***** Evaluation result *****")
        tf.logging.info("  Precision (token-level) = %s", str(precision))
        tf.logging.info("  Recall (token-level) = %s", str(recall))
        tf.logging.info("  F1 score (token-level) = %s", str(f1_score))
    
    if cf.do_predict:
        predict_examples = processor.get_test_examples()
        
        tf.logging.info("***** Run prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", cf.predict_batch_size)
        
        predict_features = example_converter.convert_examples_to_features(predict_examples)
        predict_input_fn = XLNetInputBuilder.get_input_builder(predict_features, cf.max_seq_length, False, False)
        
        result = estimator.predict(input_fn=predict_input_fn)
        
        predict_recorder = XLNetPredictRecorder(
            output_dir=cf.output_dir,
            label_list=label_list,
            max_seq_length=cf.max_seq_length,
            tokenizer=tokenizer,
            predict_tag=cf.predict_tag)
        
        predicts = [{
            "input_ids": feature.input_ids,
            "input_masks": feature.input_masks,
            "label_ids": feature.label_ids,
            "predict_ids": predict["predict"].tolist()
        } for feature, predict in zip(predict_features, result)]
        
        predict_recorder.record(predicts)
    
    if cf.do_export:
        tf.logging.info("***** Running exporting *****")
        tf.io.gfile.makedirs(cf.export_dir)
        serving_input_fn = XLNetInputBuilder.get_serving_input_fn(cf.max_seq_length)
        estimator.export_saved_model(cf.export_dir, serving_input_fn, as_text=False)
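If the commented-out early-stopping path were re-enabled, a minimal wiring sketch (TF 1.x contrib API; the patience value is an assumption) would pair the hook with tf.estimator.train_and_evaluate instead of the separate train/evaluate calls above:

early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
    estimator=estimator,
    metric_name="loss",
    max_steps_without_decrease=1000,       # patience, assumed value
    run_every_secs=None,
    run_every_steps=cf.save_checkpoints_steps)

train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                    max_steps=train_steps,
                                    hooks=[early_stopping_hook])
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)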
Example #7
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    #### Validate flags
    if FLAGS.save_steps is not None:
        FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    # TPU Configuration
    run_config = model_utils.configure_tpu(FLAGS)

    model_fn = get_model_fn()

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size)
    else:
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config)

    if FLAGS.do_train:
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)

        if not tf.gfile.Exists(train_file) or FLAGS.overwrite_data:
            train_examples = get_examples(FLAGS.data_dir, "train")
            random.shuffle(train_examples)
            file_based_convert_examples_to_features(train_examples,
                                                    tokenize_fn, train_file)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_eval:
        eval_examples = get_examples(FLAGS.data_dir, FLAGS.eval_split)
        tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))

        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on. These do NOT count towards the metric (all tf.metrics
        # support a per-instance weight, and these get a weight of 0.0).
        #
        # Modified in XL: We also adopt the same mechanism for GPUs.

        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)

        if FLAGS.high_only:
            eval_file_base = "high." + eval_file_base
        elif FLAGS.middle_only:
            eval_file_base = "middle." + eval_file_base

        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)
        file_based_convert_examples_to_features(eval_examples, tokenize_fn,
                                                eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        ret = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Log current result
        tf.logging.info("=" * 80)
        log_str = "Eval | "
        for key, val in ret.items():
            log_str += "{} {} | ".format(key, val)
        tf.logging.info(log_str)
        tf.logging.info("=" * 80)
Example #8
File: run_nlu.py  Project: w-h-m/coqa
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    np.random.seed(FLAGS.random_seed)

    processor = NluProcessor(data_dir=FLAGS.data_dir,
                             task_name=FLAGS.task_name.lower())

    token_label_list = processor.get_token_labels()
    sent_label_list = processor.get_sent_labels()

    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)

    model_builder = XLNetModelBuilder(model_config=model_config,
                                      use_tpu=FLAGS.use_tpu)

    model_fn = model_builder.get_model_fn(token_label_list, sent_label_list)

    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    tpu_config = model_utils.configure_tpu(FLAGS)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=tpu_config,
        export_to_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    tokenizer = XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file,
                               lower_case=FLAGS.lower_case)

    example_converter = XLNetExampleConverter(
        token_label_list=token_label_list,
        sent_label_list=sent_label_list,
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer)

    if FLAGS.do_train:
        train_examples = processor.get_train_examples()

        tf.logging.info("***** Run training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", FLAGS.train_steps)

        train_features = example_converter.convert_examples_to_features(
            train_examples)
        train_input_fn = XLNetInputBuilder.get_input_builder(
            train_features, FLAGS.max_seq_length, True, True)

        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples()

        tf.logging.info("***** Run evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_features = example_converter.convert_examples_to_features(
            eval_examples)
        eval_input_fn = XLNetInputBuilder.get_input_builder(
            eval_features, FLAGS.max_seq_length, False, False)

        result = estimator.evaluate(input_fn=eval_input_fn)

        token_precision = result["token_precision"]
        token_recall = result["token_recall"]
        token_f1_score = 2.0 * token_precision * token_recall / (
            token_precision + token_recall)

        sent_accuracy = result["sent_accuracy"]

        tf.logging.info("***** Evaluation result *****")
        tf.logging.info("  Precision (token-level) = %s", str(token_precision))
        tf.logging.info("  Recall (token-level) = %s", str(token_recall))
        tf.logging.info("  F1 score (token-level) = %s", str(token_f1_score))
        tf.logging.info("  Accuracy (sent-level) = %s", str(sent_accuracy))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples()

        tf.logging.info("***** Run prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_features = example_converter.convert_examples_to_features(
            predict_examples)
        predict_input_fn = XLNetInputBuilder.get_input_builder(
            predict_features, FLAGS.max_seq_length, False, False)

        result = estimator.predict(input_fn=predict_input_fn)

        predict_recorder = XLNetPredictRecorder(
            output_dir=FLAGS.output_dir,
            token_label_list=token_label_list,
            sent_label_list=sent_label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer,
            predict_tag=FLAGS.predict_tag)

        predicts = [{
            "input_ids": feature.input_ids,
            "input_masks": feature.input_masks,
            "token_label_ids": feature.token_label_ids,
            "sent_label_id": feature.sent_label_id,
            "token_predict_ids": predict["token_predict"].tolist(),
            "sent_predict_id": predict["sent_predict"].tolist()
        } for feature, predict in zip(predict_features, result)]

        predict_recorder.record(predicts)

    if FLAGS.do_export:
        tf.logging.info("***** Running exporting *****")
        tf.gfile.MakeDirs(FLAGS.export_dir)
        serving_input_fn = XLNetInputBuilder.get_serving_input_fn(
            FLAGS.max_seq_length)
        estimator.export_savedmodel(FLAGS.export_dir,
                                    serving_input_fn,
                                    as_text=False)
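XLNetInputBuilder.get_input_builder is used throughout these examples but not shown; a rough sketch (the feature fields are assumptions) of an in-memory input_fn built from converted features, in the spirit of that helper:

def build_input_fn(features, seq_length, is_training, drop_remainder):  # sketch only
    input_ids = np.array([f.input_ids for f in features], dtype=np.int32)
    input_masks = np.array([f.input_masks for f in features], dtype=np.float32)

    def input_fn(params):
        batch_size = params["batch_size"]  # injected by TPUEstimator
        dataset = tf.data.Dataset.from_tensor_slices({
            "input_ids": input_ids,
            "input_masks": input_masks,
        })
        if is_training:
            dataset = dataset.shuffle(buffer_size=len(features)).repeat()
        return dataset.batch(batch_size, drop_remainder=drop_remainder)

    return input_fn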
Example #9
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    if FLAGS.do_prepro:
        preprocess()
        return

    #### Validate flags
    if FLAGS.save_steps is not None:
        FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

    if not FLAGS.do_train and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train` and `do_predict` must be True.")

    if FLAGS.do_predict and not tf.gfile.Exists(FLAGS.predict_dir):
        tf.gfile.MakeDirs(FLAGS.predict_dir)

    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(FLAGS.spiece_model_file)

    ### TPU Configuration
    run_config = model_utils.configure_tpu(FLAGS)

    model_fn = get_model_fn()
    spm_basename = _get_spm_basename()

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)
    else:
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config)

    if FLAGS.do_train:
        train_rec_glob = os.path.join(
            FLAGS.output_dir, "{}.*.slen-{}.qlen-{}.train.tf_record".format(
                spm_basename, FLAGS.max_seq_length, FLAGS.max_query_length))

        train_input_fn = input_fn_builder(input_glob=train_rec_glob,
                                          seq_length=FLAGS.max_seq_length,
                                          is_training=True,
                                          drop_remainder=True,
                                          num_hosts=FLAGS.num_hosts)

        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_predict:
        eval_examples = read_squad_examples(FLAGS.predict_file,
                                            is_training=False)

        with tf.gfile.Open(FLAGS.predict_file) as f:
            orig_data = json.load(f)["data"]

        eval_rec_file = os.path.join(
            FLAGS.output_dir,
            "{}.slen-{}.qlen-{}.eval.tf_record".format(spm_basename,
                                                       FLAGS.max_seq_length,
                                                       FLAGS.max_query_length))
        eval_feature_file = os.path.join(
            FLAGS.output_dir, "{}.slen-{}.qlen-{}.eval.features.pkl".format(
                spm_basename, FLAGS.max_seq_length, FLAGS.max_query_length))

        if tf.gfile.Exists(eval_rec_file) and tf.gfile.Exists(
                eval_feature_file) and not FLAGS.overwrite_data:
            tf.logging.info(
                "Loading eval features from {}".format(eval_feature_file))
            with tf.gfile.Open(eval_feature_file, 'rb') as fin:
                eval_features = pickle.load(fin)
        else:
            eval_writer = FeatureWriter(filename=eval_rec_file,
                                        is_training=False)
            eval_features = []

            def append_feature(feature):
                eval_features.append(feature)
                eval_writer.process_feature(feature)

            convert_examples_to_features(
                examples=eval_examples,
                sp_model=sp_model,
                max_seq_length=FLAGS.max_seq_length,
                doc_stride=FLAGS.doc_stride,
                max_query_length=FLAGS.max_query_length,
                is_training=False,
                output_fn=append_feature)
            eval_writer.close()

            with tf.gfile.Open(eval_feature_file, 'wb') as fout:
                pickle.dump(eval_features, fout)

        eval_input_fn = input_fn_builder(input_glob=eval_rec_file,
                                         seq_length=FLAGS.max_seq_length,
                                         is_training=False,
                                         drop_remainder=False,
                                         num_hosts=1)

        cur_results = []
        for result in estimator.predict(input_fn=eval_input_fn,
                                        yield_single_examples=True):

            if len(cur_results) % 1000 == 0:
                tf.logging.info("Processing example: %d" % (len(cur_results)))

            unique_id = int(result["unique_ids"])
            start_top_log_probs = ([
                float(x) for x in result["start_top_log_probs"].flat
            ])
            start_top_index = [int(x) for x in result["start_top_index"].flat]
            end_top_log_probs = ([
                float(x) for x in result["end_top_log_probs"].flat
            ])
            end_top_index = [int(x) for x in result["end_top_index"].flat]

            cls_logits = float(result["cls_logits"].flat[0])

            cur_results.append(
                RawResult(unique_id=unique_id,
                          start_top_log_probs=start_top_log_probs,
                          start_top_index=start_top_index,
                          end_top_log_probs=end_top_log_probs,
                          end_top_index=end_top_index,
                          cls_logits=cls_logits))

        output_prediction_file = os.path.join(FLAGS.predict_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(FLAGS.predict_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.predict_dir,
                                                 "null_odds.json")

        ret = write_predictions(eval_examples, eval_features, cur_results,
                                FLAGS.n_best_size, FLAGS.max_answer_length,
                                output_prediction_file, output_nbest_file,
                                output_null_log_odds_file, orig_data)

        # Log current result
        tf.logging.info("=" * 80)
        log_str = "Result | "
        for key, val in ret.items():
            log_str += "{} {} | ".format(key, val)
        tf.logging.info(log_str)
        tf.logging.info("=" * 80)