Пример #1
0
    def _generate_tf_record_from_squad_file(cls,
                                            input_file_path,
                                            tokenizer,
                                            output_path,
                                            is_training,
                                            predict_batch_size=8,
                                            max_seq_length=384,
                                            max_query_length=64,
                                            doc_stride=128,
                                            version_2_with_negative=False):
        """Generates and saves training/validation data into a tf record file.

        SQuAD examples are read from `input_file_path`, converted to features
        with `tokenizer`, and every produced feature is written to
        `output_path` through a `squad_lib.FeatureWriter`.

        Args:
            input_file_path: Path of the SQuAD file handed to
                `squad_lib.read_squad_examples`.
            tokenizer: Tokenizer forwarded to
                `squad_lib.convert_examples_to_features`.
            output_path: Destination file name for the feature writer.
            is_training: Whether the data is prepared for training. Selects
                the output callback and the batch size (see below).
            predict_batch_size: Forwarded as `batch_size` when `is_training`
                is false; in training mode `batch_size` is `None`.
                NOTE(review): presumably used by `squad_lib` to pad the
                prediction set to a whole number of batches -- confirm.
            max_seq_length: Forwarded to the feature conversion.
            max_query_length: Forwarded to the feature conversion.
            doc_stride: Forwarded to the feature conversion.
            version_2_with_negative: Forwarded to the example reader and
                echoed back in the returned metadata.

        Returns:
            A tuple `(meta_data, examples, features)`:
              * `meta_data` is a dict with keys `'size'` (the value returned
                by `convert_examples_to_features`) and
                `'version_2_with_negative'`.
              * `examples` is the list of examples that was read, or an
                empty list when `is_training` is true.
              * `features` holds the non-padding features collected in
                prediction mode; it stays empty in training mode.
        """
        examples = squad_lib.read_squad_examples(
            input_file=input_file_path,
            is_training=is_training,
            version_2_with_negative=version_2_with_negative)
        writer = squad_lib.FeatureWriter(filename=output_path,
                                         is_training=is_training)

        features = []

        def _append_feature(feature, is_padding):
            # Padding features are still written to the record file, but
            # they are not handed back to the caller.
            if not is_padding:
                features.append(feature)
            writer.process_feature(feature)

        # Training writes straight to the file; prediction additionally
        # keeps the real (non-padding) features in memory.
        batch_size = None if is_training else predict_batch_size
        output_fn = writer.process_feature if is_training else _append_feature

        try:
            number_of_examples = squad_lib.convert_examples_to_features(
                examples=examples,
                tokenizer=tokenizer,
                max_seq_length=max_seq_length,
                doc_stride=doc_stride,
                max_query_length=max_query_length,
                is_training=is_training,
                output_fn=output_fn,
                batch_size=batch_size)
        finally:
            # Always release the underlying record writer, even when the
            # conversion fails part-way through.
            writer.close()

        meta_data = {
            'size': number_of_examples,
            'version_2_with_negative': version_2_with_negative
        }

        # The raw examples are only returned for prediction.
        if is_training:
            examples = []
        return meta_data, examples, features
Пример #2
0
        
        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f}".format(
                epoch+1, batch, train_loss.result()))
        
        if batch % 500 == 0:
            ckpt_save_path = ckpt_manager.save()
            print("Saving checkpoint for epoch {} at {}".format(epoch+1,
                                                                ckpt_save_path))
    print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))


# Evaluation
#
# Module-level setup for the evaluation stage: reads the dev examples,
# creates the record writer for the evaluation features and builds the
# tokenizer. The names defined here are plain globals.

# Read the examples from the SQuAD v1.1 dev file in non-training mode.
eval_examples = read_squad_examples(
    "/content/drive/My Drive/BERT/data/squad/dev-v1.1.json",
    is_training=False,
    version_2_with_negative=False)

# Writer targeting "eval.tf_record" inside the SQuAD data directory.
eval_writer = FeatureWriter(
    filename=os.path.join("/content/drive/My Drive/BERT/data/squad/",
                          "eval.tf_record"),
    is_training=False)

# Load the BERT layer from TF Hub with trainable=False; it is used here to
# obtain the vocabulary file path and the lower-casing flag for the tokenizer.
my_bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False)
vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

def _append_feature(feature, is_padding):
Пример #3
0
        seq_output = self.apply_bert(inputs)

        start_logits, end_logits = self.squad_layer(seq_output)
        
        return start_logits, end_logits

# ********** Stage 5: Evaluation **********
'''
* This code is specific to SQuAD evaluation inputs.
* Google wrote hundreds of lines to produce output in exactly the format the evaluation script expects, so we reuse Google's functions here.
'''

## -- Prepare evaluation

# Read the examples from the SQuAD v1.1 dev file in non-training mode.
eval_examples = read_squad_examples("dev-v1.1.json", is_training=False, version_2_with_negative=False)

# Create the writer that will produce the eval.tf_record file for the dev set.
eval_writer = FeatureWriter(filename=os.path.join("./data/squad/", "eval.tf_record"),  is_training=False)

# Build a tokenizer from the vocabulary file and lower-casing flag exposed
# by the TF Hub BERT layer (loaded with trainable=False).
# For full-version training, change the URL to
# "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4".
my_bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

# Define the function that adds the features (feature is a protocol in tensorflow) to our eval_features list
def _append_feature(feature, is_padding):
    """Collect `feature` in the module-level `eval_features` list.

    Features flagged with `is_padding` are ignored.
    """
    if is_padding:
        return
    eval_features.append(feature)
Пример #4
0
import time
import json
import collections
import os

from joblib import dump, load

import app.config as cf

from app.model.BERTSquad import BERTSquad
from app.model.squad_loss_fn import squad_loss_fn

if __name__ == "__main__":

    eval_examples = read_squad_examples(cf.INPUTS_FILE_DEV,
                                        is_training=False,
                                        version_2_with_negative=False)

    my_bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=False)

    vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()

    do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()

    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    dataset_size = convert_examples_to_features(examples=eval_examples,
                                                tokenizer=tokenizer,
                                                max_seq_length=384,