Exemplo n.º 1
0
    def get_test_iter(self, data_dir):
        """Build an iterator factory over the Chinese XNLI test examples.

        Reads ``xnli.test.tsv`` from ``data_dir`` through ``self._read_tsv``,
        skips the first (header) row and keeps only the rows whose first
        column equals ``self.language`` (set to ``"zh"`` here). Columns 6 and
        7 become ``text_a`` / ``text_b``; column 1 becomes the label.

        Side effects: sets ``self.language`` and stores the collected
        examples on ``self.test_examples``.

        Args:
            data_dir: Directory that contains ``xnli.test.tsv``.

        Returns:
            A zero-argument generator function yielding
            ``(0, example_idx, example)`` tuples in file order.
        """
        self.language = "zh"
        tsv_path = os.path.join(data_dir, "xnli.test.tsv")
        rows = self._read_tsv(tsv_path)

        collected = []
        for row_no, row in enumerate(rows):
            # Row 0 is the header line.
            if row_no == 0:
                continue
            guid = "test-%d" % (row_no)
            row_language = tokenization.convert_to_unicode(row[0])
            wanted_language = tokenization.convert_to_unicode(self.language)
            if row_language == wanted_language:
                first_text = tokenization.convert_to_unicode(row[6])
                second_text = tokenization.convert_to_unicode(row[7])
                gold_label = tokenization.convert_to_unicode(row[1])
                collected.append(
                    InputExample(guid=guid,
                                 text_a=first_text,
                                 text_b=second_text,
                                 label=gold_label))

        self.test_examples = collected

        def wrapper():
            # Single pass over the data, so the epoch index is always 0.
            for position, item in enumerate(collected):
                yield 0, position, item

        return wrapper
Exemplo n.º 2
0
 def mnli_line_processor(line_id, line):
     """Turn one MNLI TSV row into a ``BertInputExample``.

     Args:
         line_id: Row identifier; the row is treated as the header and
             skipped when it equals the string ``"0"``.
         line: Sequence of column values. Column 0 is the uid, columns 8
             and 9 are the two texts, and the last column is the label.

     Returns:
         ``None`` for the header row, otherwise a ``BertInputExample``. Any
         label other than "contradiction", "entailment" or "neutral" is
         replaced with "contradiction".
     """
     if line_id == "0":
         return None

     to_unicode = tokenization.convert_to_unicode
     known_labels = ("contradiction", "entailment", "neutral")

     example_uid = to_unicode(line[0])
     first_text = to_unicode(line[8])
     second_text = to_unicode(line[9])
     raw_label = to_unicode(line[-1])
     # Unrecognised labels fall back to a fixed, valid class.
     final_label = raw_label if raw_label in known_labels else "contradiction"

     return BertInputExample(uid=example_uid,
                             text_a=first_text,
                             text_b=second_text,
                             label=final_label)
Exemplo n.º 3
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         text_a = tokenization.convert_to_unicode(line[3])
         text_b = tokenization.convert_to_unicode(line[4])
         if set_type == "test":
             label = "0"
         else:
             label = tokenization.convert_to_unicode(line[0])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Exemplo n.º 4
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # Only the test set has a header
         if set_type == "test" and i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         if set_type == "test":
             text_a = tokenization.convert_to_unicode(line[1])
             label = "0"
         else:
             text_a = tokenization.convert_to_unicode(line[3])
             label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Exemplo n.º 5
0
    def get_train_iter(self,
                       data_dir,
                       epoch_num=1,
                       shuffle=True,
                       shuffle_seed=None):
        """Build an iterator factory over the Chinese MultiNLI training data.

        Reads ``multinli/multinli.train.zh.tsv`` under ``data_dir`` through
        ``self._read_tsv`` and skips the first (header) row. Columns 0 and 1
        become ``text_a`` / ``text_b`` and column 2 the label, with the
        label "contradictory" rewritten to "contradiction".

        Side effects: sets ``self.language`` to ``"zh"`` and stores the
        collected examples on ``self.train_examples``. When shuffling is
        enabled the returned generator shuffles that same list in place and,
        if a seed is given, seeds NumPy's global random state.

        Args:
            data_dir: Directory that contains the ``multinli`` folder.
            epoch_num: Number of passes the generator makes over the data.
            shuffle: Whether to reshuffle the examples before each epoch.
            shuffle_seed: Optional seed applied once before the first epoch.

        Returns:
            A zero-argument generator function yielding
            ``(epoch_idx, example_idx, example)`` tuples.
        """
        self.language = "zh"
        train_file = "multinli.train.%s.tsv" % self.language
        rows = self._read_tsv(os.path.join(data_dir, "multinli", train_file))

        collected = []
        for row_no, row in enumerate(rows):
            # Row 0 is the header line.
            if row_no == 0:
                continue
            first_text = tokenization.convert_to_unicode(row[0])
            second_text = tokenization.convert_to_unicode(row[1])
            tag = tokenization.convert_to_unicode(row[2])
            # Normalise the label spelling used by this file.
            if tag == tokenization.convert_to_unicode("contradictory"):
                tag = tokenization.convert_to_unicode("contradiction")
            collected.append(
                InputExample(guid="train-%d" % (row_no),
                             text_a=first_text,
                             text_b=second_text,
                             label=tag))

        self.train_examples = collected

        def wrapper():
            if shuffle and shuffle_seed is not None:
                np.random.seed(shuffle_seed)
            for epoch_idx in range(epoch_num):
                if shuffle:
                    np.random.shuffle(collected)
                for position, item in enumerate(collected):
                    yield epoch_idx, position, item

        return wrapper
Exemplo n.º 6
0
def convert_single_example_to_unicode(guid, single_example):
    """Convert a raw ``(text_a, text_b, label)`` record to a ``BertInputExample``.

    Args:
        guid: Identifier for the example; forwarded unchanged as the
            example's ``uid``.
        single_example: Indexable record whose items 0, 1 and 2 hold
            text_a, text_b and the label respectively.

    Returns:
        A ``BertInputExample`` whose text and label fields have been passed
        through ``tokenization.convert_to_unicode``.
    """
    text_a = tokenization.convert_to_unicode(single_example[0])
    text_b = tokenization.convert_to_unicode(single_example[1])
    label = tokenization.convert_to_unicode(single_example[2])
    # Use the caller-supplied ``guid`` as the uid: no separate ``uid`` name is
    # bound in this function, so referencing one would raise NameError.
    return BertInputExample(uid=guid, text_a=text_a, text_b=text_b, label=label)