示例#1
0
    def create_examples(self, x_s, y_s=None):
        """Build labelled `InputExample` objects from texts and one-hot labels.

        Args:
            x_s: iterable of input texts.
            y_s: one-hot label matrix aligned with ``x_s``. Despite the
                ``None`` default, this argument is required — ``y_s.shape[0]``
                is read unconditionally.

        Returns:
            A ``(examples, valid_classes)`` pair: the kept ``InputExample``
            objects and a zero-initialized matrix of shape
            ``(len(y_s), len(self.list_classes))`` whose first
            ``len(examples)`` rows hold the one-hot vectors of the kept
            samples (rows for skipped samples remain zero).
        """
        examples = []
        valid_classes = np.zeros((y_s.shape[0], len(self.list_classes)))
        accumul = 0
        for (i, x) in enumerate(x_s):
            y = y_s[i]
            guid = i
            text_a = tokenization.convert_to_unicode(x)
            ind, = np.where(y == 1)
            # Skip samples with no positive class at all: the original code
            # indexed ind[0] unconditionally and raised IndexError here.
            if ind.size == 0:
                continue
            the_class = self.list_classes[ind[0]]
            # Skip samples whose class is unknown or not handled.
            if the_class is None:
                continue
            if the_class not in self.list_classes:
                continue
            label = tokenization.convert_to_unicode(the_class)
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=None,
                             label=label))
            valid_classes[accumul] = y
            accumul += 1

        return examples, valid_classes
示例#2
0
 def create_inputs(self, x_s, dummy_label='dummy'):
     """Wrap each text in *x_s* in an `InputExample` for prediction.

     Every example carries the same placeholder label: the underlying BERT
     code requires a label field even when none is meaningful.
     """
     # Convert the placeholder once; it is shared by all examples.
     placeholder = tokenization.convert_to_unicode(dummy_label)
     return [
         InputExample(guid=idx,
                      text_a=tokenization.convert_to_unicode(text),
                      text_b=None,
                      label=placeholder)
         for idx, text in enumerate(x_s)
     ]
示例#3
0
 def _get_example(self, x, y):
     """
     Gets a collection of `InputExample` already labelled (for training and eval)
     """
     examples = []
     for guid in range(len(x)):
         sent = x[guid]
         tags = y[guid]
         # Convert tokens and labels in lockstep: label j belongs to token j.
         tokens = [tokenization.convert_to_unicode(sent[j])
                   for j in range(len(sent))]
         labels = [tokenization.convert_to_unicode(tags[j])
                   for j in range(len(sent))]
         examples.append(InputExample(guid=guid, tokens=tokens, labels=labels))
     return examples
示例#4
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for row_idx, row in enumerate(lines):
         # The first row is a header — skip it.
         if row_idx == 0:
             continue
         guid = "%s-%s" % (set_type, row_idx)
         sentence_a = tokenization.convert_to_unicode(row[3])
         sentence_b = tokenization.convert_to_unicode(row[4])
         # Test rows carry no gold label; use a constant placeholder.
         label = ("0" if set_type == "test"
                  else tokenization.convert_to_unicode(row[0]))
         examples.append(InputExample(guid=guid,
                                      text_a=sentence_a,
                                      text_b=sentence_b,
                                      label=label))
     return examples
示例#5
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     is_test = set_type == "test"
     examples = []
     for row_idx, row in enumerate(lines):
         # Only the test split ships with a header row.
         if is_test and row_idx == 0:
             continue
         guid = "%s-%s" % (set_type, row_idx)
         if is_test:
             # Test rows: text in column 1, no gold label available.
             sentence = tokenization.convert_to_unicode(row[1])
             label = "0"
         else:
             # Train/dev rows: text in column 3, label in column 1.
             sentence = tokenization.convert_to_unicode(row[3])
             label = tokenization.convert_to_unicode(row[1])
         examples.append(InputExample(guid=guid,
                                      text_a=sentence,
                                      text_b=None,
                                      label=label))
     return examples
示例#6
0
 def get_dev_examples(self, data_dir):
     """See base class."""
     rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
     # The dev file mixes all languages; keep only the configured one.
     wanted_language = tokenization.convert_to_unicode(self.language)
     examples = []
     for idx, row in enumerate(rows):
         # Skip the header row.
         if idx == 0:
             continue
         if tokenization.convert_to_unicode(row[0]) != wanted_language:
             continue
         examples.append(InputExample(
             guid="dev-%d" % (idx),
             text_a=tokenization.convert_to_unicode(row[6]),
             text_b=tokenization.convert_to_unicode(row[7]),
             label=tokenization.convert_to_unicode(row[1])))
     return examples
示例#7
0
 def get_train_examples(self, data_dir):
     """See base class."""
     path = os.path.join(data_dir, "multinli",
                         "multinli.train.%s.tsv" % self.language)
     examples = []
     for idx, row in enumerate(self._read_tsv(path)):
         # Skip the header row.
         if idx == 0:
             continue
         label = tokenization.convert_to_unicode(row[2])
         # The raw data says "contradictory" where MNLI expects "contradiction".
         if label == tokenization.convert_to_unicode("contradictory"):
             label = tokenization.convert_to_unicode("contradiction")
         examples.append(InputExample(
             guid="train-%d" % (idx),
             text_a=tokenization.convert_to_unicode(row[0]),
             text_b=tokenization.convert_to_unicode(row[1]),
             label=label))
     return examples
示例#8
0
 def create_inputs(self, x_s, dummy_label='O'):
     """
     Gets a collection of `InputExample` for input to be labelled (for prediction)
     """
     examples = []
     for guid, item in enumerate(x_s):
         # Accept both pre-segmented input (a list of tokens) and raw strings.
         if isinstance(item, list):
             pieces = item
         else:
             pieces = tokenizeAndFilterSimple(item)
         tokens = [tokenization.convert_to_unicode(p) for p in pieces]
         # One dummy label per token to avoid breaking the BERT base code.
         labels = [tokenization.convert_to_unicode(dummy_label)
                   for _ in pieces]
         examples.append(
             InputExample(guid=guid, tokens=tokens, labels=labels))
     return examples