def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text, label = self._get_text_and_label(line)
         text = tokenization.convert_to_unicode(text)
         label = tokenization.convert_to_unicode(label)
         examples.append(InputExample(guid=guid, text_a=text, label=label))
     return examples
示例#2
0
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     text_a = tokenization.convert_to_unicode(line[3])
     text_b = tokenization.convert_to_unicode(line[4])
     if set_type == "test":
       label = "0"
     else:
       label = tokenization.convert_to_unicode(line[0])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
示例#3
0
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   for (i, line) in enumerate(lines):
     # Only the test set has a header
     if set_type == "test" and i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     if set_type == "test":
       text_a = tokenization.convert_to_unicode(line[1])
       label = "0"
     else:
       text_a = tokenization.convert_to_unicode(line[3])
       label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
   return examples
示例#4
0
 def get_dev_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "dev-%d" % (i)
     language = tokenization.convert_to_unicode(line[0])
     if language != tokenization.convert_to_unicode(self.language):
       continue
     text_a = tokenization.convert_to_unicode(line[6])
     text_b = tokenization.convert_to_unicode(line[7])
     label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
示例#5
0
 def get_train_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(
       os.path.join(data_dir, "multinli",
                    "multinli.train.%s.tsv" % self.language))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "train-%d" % (i)
     text_a = tokenization.convert_to_unicode(line[0])
     text_b = tokenization.convert_to_unicode(line[1])
     label = tokenization.convert_to_unicode(line[2])
     if label == tokenization.convert_to_unicode("contradictory"):
       label = tokenization.convert_to_unicode("contradiction")
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
    def convert_single_example_to_feature(self, guid, text, label=None):
        text = tokenization.convert_to_unicode(text)
        if label:
            label = tokenization.convert_to_unicode(label)
            label_id = self.label_map.get(label)
        else:
            label_id = None

        tokens = self.tokenizer.tokenize(text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)
        segment_id = [0] * len(input_id)

        if int(guid.split("-")[1]) < 5:
            self.log.info("*** Example ***")
            self.log.info("guid: %s" % (guid))
            self.log.info(
                "tokens: %s" %
                " ".join([tokenization.printable_text(x) for x in tokens]))
            self.log.info("input_ids: %s" %
                          " ".join([str(x) for x in input_id]))
            self.log.info("input_mask: %s" %
                          " ".join([str(x) for x in input_mask]))
            self.log.info("segment_ids: %s" %
                          " ".join([str(x) for x in segment_id]))
            if label:
                self.log.info("label: %s (id = %d)" % (label, label_id))

        pad_input_id, pad_input_mask, pad_segment_id = self.zero_padding(
            input_id, input_mask, segment_id)

        feature = InputFeatures(input_ids=pad_input_id,
                                input_mask=pad_input_mask,
                                segment_ids=pad_segment_id,
                                label_id=label_id,
                                is_real_example=True)

        return feature
示例#7
0
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(all_documents, document_index,
                                               max_seq_length, short_seq_prob,
                                               masked_lm_prob,
                                               max_predictions_per_seq,
                                               vocab_words, rng))

    rng.shuffle(instances)
    return instances
示例#8
0
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a,
                             text_b=text_b))
            unique_id += 1
    return examples