Example No. 1
def convert_examples_to_features(self, examples, is_training, output_fn,
                                 batch_size):
    """Converts examples to features and writes them into a TFRecord file."""
    return squad_lib.convert_examples_to_features(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=self.seq_len,
        doc_stride=self.doc_stride,
        max_query_length=self.query_len,
        is_training=is_training,
        output_fn=output_fn,
        batch_size=batch_size)
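This method simply forwards its owner's tokenizer and length settings to squad_lib.convert_examples_to_features. A minimal usage sketch follows; the name spec, the file paths, and the import path are assumptions (the SQuAD helpers shipped with the TensorFlow Models garden), not part of the original snippet.

# Sketch only: `spec` stands for an object exposing the method above together
# with tokenizer/seq_len/doc_stride/query_len attributes. The import path is
# an assumption and may differ between tf-models versions.
from official.nlp.data import squad_lib

train_examples = squad_lib.read_squad_examples(
    input_file='train-v1.1.json',
    is_training=True,
    version_2_with_negative=False)
writer = squad_lib.FeatureWriter(filename='train.tf_record', is_training=True)
num_features = spec.convert_examples_to_features(
    examples=train_examples,
    is_training=True,
    output_fn=writer.process_feature,
    batch_size=None)
writer.close()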
Example No. 2
    @classmethod
    def _generate_tf_record_from_squad_file(cls,
                                            input_file_path,
                                            tokenizer,
                                            output_path,
                                            is_training,
                                            predict_batch_size=8,
                                            max_seq_length=384,
                                            max_query_length=64,
                                            doc_stride=128,
                                            version_2_with_negative=False):
        """Generates and saves training/validation data into a tf record file."""
        examples = squad_lib.read_squad_examples(
            input_file=input_file_path,
            is_training=is_training,
            version_2_with_negative=version_2_with_negative)
        writer = squad_lib.FeatureWriter(filename=output_path,
                                         is_training=is_training)

        features = []

        def _append_feature(feature, is_padding):
            # Padding features only exist to fill out the last predict batch;
            # keep them out of the in-memory list but still write them to the
            # TFRecord file.
            if not is_padding:
                features.append(feature)
            writer.process_feature(feature)

        # During training every feature is consumed as-is, so no batch padding
        # (and hence no batch_size) is needed; for prediction the features are
        # padded up to a multiple of predict_batch_size.
        if is_training:
            batch_size = None
        else:
            batch_size = predict_batch_size

        number_of_examples = squad_lib.convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=is_training,
            output_fn=writer.process_feature
            if is_training else _append_feature,
            batch_size=batch_size)
        writer.close()

        meta_data = {
            'size': number_of_examples,
            'version_2_with_negative': version_2_with_negative
        }

        if is_training:
            # The raw examples are only needed at evaluation time, where they
            # are used to post-process predictions; drop them for training.
            examples = []
        return meta_data, examples, features
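For completeness, here is how the classmethod above might be invoked for an evaluation split. SquadDataLoader is only a placeholder for whichever class actually hosts the method, the file paths are illustrative, and tokenizer is assumed to be a BERT FullTokenizer such as the one built in Example No. 3.

# Placeholder call: `SquadDataLoader` names the (unspecified) class that
# defines the classmethod above; paths and tokenizer are illustrative.
meta_data, eval_examples, eval_features = (
    SquadDataLoader._generate_tf_record_from_squad_file(
        input_file_path='dev-v1.1.json',
        tokenizer=tokenizer,
        output_path='eval.tf_record',
        is_training=False,
        predict_batch_size=8,
        version_2_with_negative=False))
print(meta_data)  # {'size': <number of features written>, 'version_2_with_negative': False}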
Example No. 3
# The import paths below are an assumption: they point at the SQuAD helpers
# shipped with the TensorFlow Models garden (tf-models-official) and may
# differ between versions.
from official.nlp.bert.tokenization import FullTokenizer
from official.nlp.data.squad_lib import convert_examples_to_features
from official.nlp.bert.input_pipeline import create_squad_dataset

# `my_bert_layer` is assumed to be a hub.KerasLayer wrapping a TF2 BERT
# SavedModel; its resolved_object exposes the vocab asset and casing flag,
# from which a WordPiece tokenizer is built.
vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

def _append_feature(feature, is_padding):
    if not is_padding:
        eval_features.append(feature)
    eval_writer.process_feature(feature)


# `eval_examples` and `eval_writer` are assumed to have been created earlier
# in the notebook (e.g. via read_squad_examples on the SQuAD dev file and a
# FeatureWriter pointed at eval.tf_record).
eval_features = []
dataset_size = convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=_append_feature,
    batch_size=4)

eval_writer.close()

BATCH_SIZE = 4

eval_dataset = create_squad_dataset(
    "/content/drive/My Drive/BERT/data/squad/eval.tf_record",
    384,  # input_meta_data['max_seq_length']
    BATCH_SIZE,
    is_training=False)
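The resulting eval_dataset is an ordinary tf.data pipeline, so it can be sanity-checked directly before wiring it into a model. The short sketch below uses only standard TensorFlow APIs (element_spec, take, tf.nest.map_structure) and makes no further assumptions about the feature names emitted by create_squad_dataset.

import tensorflow as tf

# How many features were written to eval.tf_record (including any padding
# features added to fill the last predict batch).
print('features written:', dataset_size)

# Inspect the structure and shapes of one batch produced by the pipeline.
print(eval_dataset.element_spec)
for batch in eval_dataset.take(1):
    print(tf.nest.map_structure(lambda t: t.shape, batch))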