def write_eval_records(filepath: Path):
    """Convert NQ test examples into BERT input features and write a TFRecord file.

    Args:
        filepath: Destination path for the serialized feature records.

    Side effects: reads TEST_FILE and the vocab file under DATASET_PATH,
    streams features to a FeatureWriter, and prints a summary line.
    """
    eval_writer = bert_utils.FeatureWriter(filename=str(filepath),
                                           is_training=False)
    tokenizer = tokenization.FullTokenizer(vocab_file=str(DATASET_PATH /
                                                          'vocab-nq.txt'),
                                           do_lower_case=True)
    # `convert` tokenizes one example, forwards the resulting feature(s) to the
    # writer via output_fn, and returns the number of examples consumed.
    convert = bert_utils.ConvertExamples2Features(
        tokenizer=tokenizer,
        is_training=False,
        output_fn=eval_writer.process_feature,
        collect_stat=False)
    n_examples = 0
    for examples in bert_utils.nq_examples_iter(input_file=TEST_FILE,
                                                is_training=False,
                                                tqdm=tqdm.tqdm):
        for example in examples:
            n_examples += convert(example)
    eval_writer.close()
    print('number of test examples: %d, written to file: %d' %
          (n_examples, eval_writer.num_features))
# --- Exemplo n.º 2 (snippet separator left over from the scrape source; the
# stray "0" below it was the snippet's vote count) ---
    # NOTE(review): orphaned function body — the enclosing `def` was lost when
    # this snippet was scraped; `eval_records` and `nq_test_file` are presumably
    # its parameters. Confirm against the original source before reuse.
    # NOTE(review): os.path.join() with a single argument is a no-op — this
    # probably meant to join `eval_records` onto a directory; verify.
    eval_writer = bert_utils.FeatureWriter(filename=os.path.join(eval_records),
                                           is_training=False)

    # Tokenizer matching the BERT-joint baseline vocab (lower-cased model).
    tokenizer = tokenization.FullTokenizer(vocab_file='vocab-nq.txt',
                                           do_lower_case=True)

    # NOTE(review): `features` is assigned but never used in this fragment.
    features = []
    # `convert` tokenizes one example and streams the resulting feature(s)
    # straight to the writer via output_fn; returns examples consumed.
    convert = bert_utils.ConvertExamples2Features(
        tokenizer=tokenizer,
        is_training=False,
        output_fn=eval_writer.process_feature,
        collect_stat=False)

    n_examples = 0
    # tqdm=None: no progress bar in this variant.
    for examples in bert_utils.nq_examples_iter(input_file=nq_test_file,
                                                is_training=False,
                                                tqdm=None):
        for example in examples:
            n_examples += convert(example)

    eval_writer.close()
#     print('number of test examples: %d, written to file: %d' % (n_examples,eval_writer.num_features))

# Maximum sequence length the model was exported with
# (same value as config['max_position_embeddings']).
seq_length = bert_utils.FLAGS.max_seq_length

# Parsing schema for the eval TFRecords: one scalar id plus three
# fixed-length int64 sequences of `seq_length` tokens each.
name_to_features = {"unique_id": tf.io.FixedLenFeature([], tf.int64)}
name_to_features.update(
    (key, tf.io.FixedLenFeature([seq_length], tf.int64))
    for key in ("input_ids", "input_mask", "segment_ids"))
# --- Exemplo n.º 3 (snippet separator left over from the scrape source; the
# stray "0" below it was the snippet's vote count) ---
    # NOTE(review): orphaned function body — the enclosing `def` was lost when
    # this snippet was scraped. `eval_writer`, `nq_test_file` and
    # `on_kaggle_server` are referenced but never defined in this fragment;
    # confirm against the original source before reuse.
    tokenizer = tokenization.FullTokenizer(
        vocab_file='../input/bert-joint-baseline/vocab-nq.txt',
        do_lower_case=True)

    # NOTE(review): `features` is assigned but never used in this fragment.
    features = []
    # `convert` tokenizes one example and streams the resulting feature(s)
    # straight to the writer via output_fn; returns examples consumed.
    convert = bert_utils.ConvertExamples2Features(
        tokenizer=tokenizer,
        is_training=False,
        output_fn=eval_writer.process_feature,
        collect_stat=False)

    n_examples = 0
    # Only show a notebook progress bar when running interactively.
    tqdm_notebook = tqdm.tqdm_notebook if not on_kaggle_server else None
    for examples in bert_utils.nq_examples_iter(input_file=nq_test_file,
                                                is_training=False,
                                                tqdm=tqdm_notebook):
        for example in examples:
            n_examples += convert(example)

    eval_writer.close()
    print('number of test examples: %d, written to file: %d' %
          (n_examples, eval_writer.num_features))
# Sequence length the model expects (== config['max_position_embeddings']).
seq_length = FLAGS.max_seq_length

# Feature spec used to deserialize each tf.Example from the eval TFRecords.
_scalar_id = tf.io.FixedLenFeature([], tf.int64)
_token_seq = tf.io.FixedLenFeature([seq_length], tf.int64)
name_to_features = {
    "unique_id": _scalar_id,
    "input_ids": _token_seq,
    "input_mask": _token_seq,
    "segment_ids": _token_seq,
}
# --- Exemplo n.º 4 (snippet separator left over from the scrape source; the
# stray "0" below it was the snippet's vote count) ---
# Build the eval TFRecord only once: skip the (slow) conversion when the
# file already exists at NQ_TEST_TFRECORD_PATH.
if not url_exists(NQ_TEST_TFRECORD_PATH):
    eval_writer = bert_utils.FeatureWriter(filename=NQ_TEST_TFRECORD_PATH,
                                           is_training=False)
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_PATH,
                                           do_lower_case=True)
    # `convert` tokenizes one NQ example into BERT input features and streams
    # them to the writer via output_fn; returns the number of examples consumed.
    convert = bert_utils.ConvertExamples2Features(tokenizer=tokenizer,
                                                  is_training=False,
                                                  output_fn=eval_writer.process_feature,
                                                  collect_stat=False)
    n_examples = 0
    for examples in bert_utils.nq_examples_iter(input_file=NQ_TEST_JSONL_PATH,
                                                is_training=False,
                                                tqdm=tqdm):
        for example in examples:
            n_examples += convert(example)
    eval_writer.close()
    print('number of test examples: %d, written to file: %d' % (n_examples, eval_writer.num_features))

#%%

# Input pipeline: raw serialized tf.Examples from the eval TFRecord.
raw_ds = tf.data.TFRecordDataset(NQ_TEST_TFRECORD_PATH)
# Parse each record into tensors (presumably per the name_to_features
# schema defined above — confirm inside _decode_record).
decoded_ds = raw_ds.map(_decode_record)
# drop_remainder only when a TPU is attached — TPUs require static batch shapes.
batched_ds = decoded_ds.batch(batch_size=BATCH_SIZE, drop_remainder=(TPU is not None))

result = model.predict(batched_ds, verbose=1)

#%%