  def _create_examples(self, lines, set_type):
    """Creates examples for the training, dev, and test sets."""
    examples = []
    for (i, line) in enumerate(lines):
      if i == 0:
        continue
      guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0]))
      if set_type == "test":
        text_a = tokenization.convert_to_unicode(line[-2])
        text_b = tokenization.convert_to_unicode(line[-1])
        label = 0.0  # the test set has no score, so use 0.0 as a dummy label
      else:
        text_a = tokenization.convert_to_unicode(line[-3])
        text_b = tokenization.convert_to_unicode(line[-2])
        label = float(line[-1])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
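To make the column handling above concrete, here is a minimal, self-contained sketch of what the regression-style processor produces for the train/dev split; `InputExample` is stubbed with a namedtuple, the rows are made up, and the unicode conversion is omitted.

from collections import namedtuple

# Stand-in for the real InputExample class; only used for this illustration.
InputExample = namedtuple("InputExample", ["guid", "text_a", "text_b", "label"])

# Made-up rows: a header, then index, sentence A, sentence B, similarity score.
lines = [
    ["index", "sentence1", "sentence2", "score"],
    ["0", "A man is playing a guitar.", "A man plays the guitar.", "4.8"],
    ["1", "A woman is slicing an onion.", "Two kids are running outside.", "0.6"],
]

examples = []
for i, line in enumerate(lines):
    if i == 0:      # skip the header row
        continue
    examples.append(InputExample(guid="train-%s" % line[0],
                                 text_a=line[-3],
                                 text_b=line[-2],
                                 label=float(line[-1])))
print(examples[0])  # InputExample(guid='train-0', text_a='A man is playing a guitar.', ...)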
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   for (i, line) in enumerate(lines):
     # Only the test set has a header
     if set_type == "test" and i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     if set_type == "test":
       text_a = tokenization.convert_to_unicode(line[1])
       label = "0"
     else:
       text_a = tokenization.convert_to_unicode(line[3])
       label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
   return examples
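For the single-sentence variant above, the train/dev TSV keeps the acceptability label in column 1 and the sentence in column 3; a made-up row for illustration (the column names are assumptions):

# [source_code, label, original_annotation, sentence] -- CoLA-style layout
line = ["gj04", "1", "", "The book was written by John."]
text_a = line[3]   # "The book was written by John."
label = line[1]    # "1"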
Example #3
    def _create_examples(self, lines, set_type):
        examples = []
        for line in lines:
            qid = line['id']
            question = tokenization.convert_to_unicode(
                line['question']['stem'])
            answers = np.array([
                tokenization.convert_to_unicode(choice['text'])
                for choice in sorted(line['question']['choices'],
                                     key=lambda c: c['label'])
            ])
            # the test set has no answer key so use 'A' as a dummy label
            label = self.LABELS.index(line.get('answerKey', 'A'))

            examples.append(
                InputExample(qid=qid,
                             question=question,
                             answers=answers,
                             label=label))

        return examples
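Each element of `lines` in the snippet above is expected to be a parsed JSON record in the CommonsenseQA style (`np` is assumed to be `numpy`, and `self.LABELS` an ordered list of choice letters). A made-up record showing the keys the method reads:

LABELS = ["A", "B", "C", "D", "E"]              # assumed value of self.LABELS
line = {
    "id": "q-001",
    "question": {
        "stem": "Where would you keep a book you are currently reading?",
        "choices": [                             # unsorted, hence the sort by 'label'
            {"label": "B", "text": "nightstand"},
            {"label": "A", "text": "library"},
            {"label": "C", "text": "backpack"},
        ],
    },
    "answerKey": "B",   # missing in the test set, hence the 'A' fallback above
}
label = LABELS.index(line.get("answerKey", "A"))   # -> 1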
Example #4
    def _create_examples_variant_D(self, lines, set_type):
        examples = []
        for line in lines:
            qid = line['idx']

            premise = tokenization.convert_to_unicode(line['premise'])

            answers = np.array([
                tokenization.convert_to_unicode(line["choice1"]),
                tokenization.convert_to_unicode(line["choice2"])
            ])

            # the test set has no answer key so use '0' as a dummy label
            label = line.get('label', 0)

            examples.append(
                InputExample(qid=qid,
                             question=premise,
                             answers=answers,
                             label=label))

        return examples
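The variant above follows the COPA layout: each line is a JSON record with a premise, two candidate continuations, and an integer label that the test split omits. A made-up record:

line = {
    "idx": 17,
    "premise": "The man broke his toe.",
    "choice1": "He got a hole in his sock.",
    "choice2": "He dropped a hammer on his foot.",
    "label": 1,   # absent in the test set, hence the `line.get('label', 0)` fallback
}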
# Assumed imports for this snippet: TensorFlow 1.x (for tf.gfile), tqdm, and the
# BERT repo's `tokenization` module; `create_instances_from_document` is expected
# to be defined in the same file.
import tensorflow as tf
from tqdm import tqdm

import tokenization


def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
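    #
    # For illustration only (made-up sentences), an input file would look like:
    #
    #   He went to the corner store.
    #   He bought a gallon of milk.
    #
    #   The second document starts after the blank line.
    #   It also has exactly one sentence per line.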

    print('reading and tokenizing input files...')
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    print('done')
    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []

    print('processing documents...')
    for dupe_index in range(dupe_factor):
        print('duplication pass %d of %d' % (dupe_index + 1, dupe_factor))
        for document_index in tqdm(range(len(all_documents))):
            instances.extend(
                create_instances_from_document(all_documents, document_index,
                                               max_seq_length, short_seq_prob,
                                               masked_lm_prob,
                                               max_predictions_per_seq,
                                               vocab_words, rng))

    rng.shuffle(instances)
    return instances
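A hedged usage sketch: the file names are placeholders and the numeric values mirror common BERT pretraining defaults, not anything stated in the snippet; `tokenization` is the BERT module imported above.

import random

# Placeholder paths; a WordPiece vocab and a plain-text corpus are assumed to exist.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",
                                       do_lower_case=True)
rng = random.Random(12345)

instances = create_training_instances(
    input_files=["corpus.txt"],
    tokenizer=tokenizer,
    max_seq_length=128,
    dupe_factor=10,
    short_seq_prob=0.1,
    masked_lm_prob=0.15,
    max_predictions_per_seq=20,
    rng=rng)
print("created %d instances" % len(instances))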