def get_sentence_examples(self, questions):
    """Yield an ``InputExample`` per question pair, all labeled '0'.

    :param questions: iterable of 2-item sequences (text_a, text_b)
    :return: generator of InputExample with guids 'test-<index>'
    """
    for idx, pair in enumerate(questions):
        first = tokenization.convert_to_unicode(str(pair[0]))
        second = tokenization.convert_to_unicode(str(pair[1]))
        # Label is a placeholder '0' — these are unlabeled test inputs.
        yield InputExample(
            guid='test-%d' % idx,
            text_a=first,
            text_b=second,
            label=str(0),
        )
 def _to_example(sentences):
     import re
     """
     sentences to InputExample
     :param sentences: list of strings
     :return: list of InputExample
     """
     unique_id = 0
     for ss in sentences:
         line = tokenization.convert_to_unicode(ss)
         if not line:
             continue
         line = line.strip()
         text_a = None
         text_b = None
         m = re.match(r"^(.*) \|\|\| (.*)$", line)
         if m is None:
             text_a = line
         else:
             text_a = m.group(1)
             text_b = m.group(2)
         yield InputExample(unique_id=unique_id,
                            text_a=text_a,
                            text_b=text_b)
         unique_id += 1
 def get_dev_examples(self, data_dir):
     file_path = os.path.join(data_dir, 'dev.txt')
     dev_df = pd.read_csv(file_path,
                          encoding='utf-8',
                          sep='\t',
                          header=None)
     dev_data = []
     for index, dev in enumerate(dev_df.values):
         guid = 'test-%d' % index
         text_a = tokenization.convert_to_unicode(str(dev[1]))
         text_b = tokenization.convert_to_unicode(str(dev[2]))
         label = str(dev[3])
         dev_data.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return dev_data
Example #4
0
    def get_dev_examples(self, data_dir):
        """Load dev examples from ``<data_dir>/dev.txt``.

        Each line is tab-separated: text_a, text_b, label.

        :param data_dir: directory containing dev.txt
        :return: list of InputExample with guids 'dev-<index>'
        """
        file_path = os.path.join(data_dir, 'dev.txt')
        dev_data = []
        with open(file_path, 'r', encoding='utf-8') as dev_f:
            # Iterate the file lazily with enumerate instead of
            # readlines() + range(len(...)) indexing.
            for i, raw_line in enumerate(dev_f):
                row_data = raw_line.strip().split('\t')
                dev_data.append(
                    InputExample(guid='dev-%d' % i,
                                 text_a=tokenization.convert_to_unicode(row_data[0]),
                                 text_b=tokenization.convert_to_unicode(row_data[1]),
                                 label=str(row_data[2])))
        # Return after the `with` block so the file handle is closed first.
        return dev_data
Example #5
0
    def get_train_examples(self, data_dir):
        """Load training examples from ``<data_dir>/train.txt``.

        Each line is tab-separated: text_a, text_b, label.

        :param data_dir: directory containing train.txt
        :return: list of InputExample with guids 'train-<index>'
        """
        file_path = os.path.join(data_dir, 'train.txt')
        train_data = []
        with open(file_path, 'r', encoding='utf-8') as train_f:
            # Iterate the file lazily with enumerate instead of
            # readlines() + range(len(...)) indexing.
            for i, raw_line in enumerate(train_f):
                row_data = raw_line.strip().split('\t')
                train_data.append(
                    InputExample(guid='train-%d' % i,
                                 text_a=tokenization.convert_to_unicode(row_data[0]),
                                 text_b=tokenization.convert_to_unicode(row_data[1]),
                                 label=str(row_data[2])))
        # Return after the `with` block so the file handle is closed first.
        return train_data
Example #6
0
    def get_test_examples(self, data_dir):
        """Build test examples from ``<data_dir>/sentiment_test.txt``.

        Each line: label <TAB> text. The file order is preserved
        (no shuffle) so results stay comparable across runs.
        """
        path = os.path.join(data_dir, 'sentiment_test.txt')
        with open(path, 'r', encoding="utf-8") as handle:
            rows = handle.readlines()
        # random.shuffle(reader)  # test set is NOT shuffled, to ease comparison

        examples = []
        for idx, raw in enumerate(rows):
            fields = raw.strip().split("\t")
            examples.append(InputExample(
                guid='test-%d' % idx,
                text_a=tokenization.convert_to_unicode(fields[1]),
                text_b=None,
                label=fields[0]))
        return examples
Example #7
0
    def get_train_examples(self, data_dir):
        """Build shuffled training examples from
        ``<data_dir>/sentiment_train.txt``.

        Each line: label <TAB> text. The shuffle is seeded for
        reproducibility. Labels are also accumulated on ``self.labels``.

        :param data_dir: directory containing sentiment_train.txt
        :return: list of InputExample with guids 'train-<index>'
        """
        file_path = os.path.join(data_dir, 'sentiment_train.txt')
        with open(file_path, 'r', encoding="utf-8") as f:
            reader = f.readlines()
        random.seed(0)
        random.shuffle(reader)  # note: training data must be shuffled

        examples, self.labels = [], []
        for index, line in enumerate(reader):
            split_line = line.strip().split("\t")
            # BUG FIX: the original only assigned text_a when the line had
            # exactly two fields, but appended an example regardless —
            # reusing text_a from the previous iteration (or raising
            # NameError on the first). Skip malformed lines instead.
            if len(split_line) != 2:
                continue
            guid = 'train-%d' % index
            text_a = tokenization.convert_to_unicode(split_line[1])
            label = split_line[0]
            examples.append(InputExample(guid=guid, text_a=text_a,
                                         text_b=None, label=label))
            self.labels.append(label)
        return examples