def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets.

  Args:
    lines: Parsed TSV rows; column 1 holds the text, column 0 the label
      (the label column is ignored for the test set).
    set_type: "train", "dev", or "test"; embedded in each example's guid.

  Returns:
    A list of InputExample objects with text_a set and no text_b.
  """
  examples = []
  for (i, line) in enumerate(lines):
    guid = "%s-%s" % (set_type, i)
    # Both branches read the text from column 1; only the label differs.
    text_a = tokenization.convert_to_unicode(line[1])
    if set_type == "test":
      # Test rows carry no gold label, so a placeholder is used.
      label = "0"
    else:
      label = tokenization.convert_to_unicode(line[0])
    examples.append(
        InputExample(guid=guid, text_a=text_a, label=label))
  return examples
 def _create_examples(self, lines, set_type):#得到example list
   """Creates examples for the training and dev sets."""
   examples = []
   print("length of lines:",len(lines))
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     try:
         label = tokenization.convert_to_unicode(line[2])
         text_a = tokenization.convert_to_unicode(line[0])
         text_b = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     except Exception:
         print('###error.i:', i, line)
   return examples
 def get_dev_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "dev-%d" % (i)
     language = tokenization.convert_to_unicode(line[0])
     if language != tokenization.convert_to_unicode(self.language):
       continue
     text_a = tokenization.convert_to_unicode(line[6])
     text_b = tokenization.convert_to_unicode(line[7])
     label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
 def get_train_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(
       os.path.join(data_dir, "multinli",
                    "multinli.train.%s.tsv" % self.language))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "train-%d" % (i)
     text_a = tokenization.convert_to_unicode(line[0])
     text_b = tokenization.convert_to_unicode(line[1])
     label = tokenization.convert_to_unicode(line[2])
     if label == tokenization.convert_to_unicode("contradictory"):
       label = tokenization.convert_to_unicode("contradiction")
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
Example #5
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # Only the test set has a header
         guid = "%s-%s" % (set_type, i)
         if set_type == "test":
             text_a = tokenization.convert_to_unicode(line[1])
             label = "__label__unknown"
         else:
             text_a = tokenization.convert_to_unicode(line[1])
             label = tokenization.convert_to_unicode(line[0])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Example #6
0
    def process(self, text):
        """Encodes `text` and returns the model's "a"-side output layer.

        The text is tokenized, truncated so the final sequence (including
        [CLS] and [SEP]) fits within `self.max_seq_length`, zero-padded,
        and fed through `self.model`.  The same encoding is supplied to
        both the "a" and "b" inputs because the model expects a pair, and
        a dummy label of 0.0 fills the required `label_ids` feature.

        Args:
          text: Raw input string (bytes or unicode).

        Returns:
          The `'a_output_layer'` entry of the model's output.
        """
        # Normalize to unicode and tokenize with the BERT tokenizer.
        text = tokenization.convert_to_unicode(text)
        tokens_raw = self.tokenizer.tokenize(text)
        # Reserve two positions for [CLS] and [SEP].  The original check
        # compared against max_seq_length (not max_seq_length - 2), which
        # let token lists of length max_seq_length - 1 or max_seq_length
        # slip through untruncated and trip the length asserts below.
        if len(tokens_raw) > self.max_seq_length - 2:
            tokens_raw = tokens_raw[0:self.max_seq_length - 2]

        # Build the id / mask / segment arrays expected by BERT.
        tokens = ["[CLS]"] + tokens_raw + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        # Zero-pad every array out to the fixed sequence length.
        pad_len = self.max_seq_length - len(input_ids)
        input_ids.extend([0] * pad_len)
        input_mask.extend([0] * pad_len)
        segment_ids.extend([0] * pad_len)

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length

        label_id = 0.0  # dummy label; the model's input spec requires one

        output = self.model({
            'a_input_ids': [input_ids],
            'a_input_mask': [input_mask],
            'a_segment_ids': [segment_ids],
            'b_input_ids': [input_ids],
            'b_input_mask': [input_mask],
            'b_segment_ids': [segment_ids],
            'label_ids': [label_id]
        })['a_output_layer']

        return output