def get_train_examples(self, data_dir):
    """Load, shuffle and parse the training examples from ``train.csv``.

    The first column of every CSV row is expected to hold one
    tab-separated record with exactly four fields, in the order
    ``guid``, ``text_a``, ``text_b``, ``label``. Rows that do not match
    this shape are skipped.

    Args:
        data_dir: Directory that contains ``train.csv``.

    Returns:
        A list of ``InputExample`` objects in shuffled order.
    """
    import random

    file_path = os.path.join(data_dir, 'train.csv')
    try:
        # pandas >= 1.3: 'warn' drops malformed lines and reports them,
        # which is what error_bad_lines=False did by default.
        reader = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; use the legacy keyword.
        reader = pd.read_csv(file_path, encoding='utf-8', error_bad_lines=False)

    # The dataset is fairly large, so only a leading portion is used.
    reader = reader.head(50000)
    print(type(reader))

    # If the data is not already in random order it has to be shuffled.
    # Keep a reference to the list: shuffling a temporary list (as the
    # previous code did) leaves the iteration order untouched.
    rows = reader.values.tolist()
    random.shuffle(rows)
    print("train length:", len(rows))

    examples = []
    for row in rows:
        line = row[0]
        # Missing cells come back as NaN (a float); treat them as malformed.
        if not isinstance(line, str):
            continue
        split_line = line.strip().split("\t")
        if len(split_line) != 4:
            continue
        guid = split_line[0]
        text_a = tokenization.convert_to_unicode(split_line[1])
        text_b = tokenization.convert_to_unicode(split_line[2])
        label = split_line[3]
        examples.append(InputExample(guid=guid, text_a=text_a,
                                     text_b=text_b, label=label))
    return examples
def _create_examples(self, lines, set_type):
    """Turn pre-split rows into single-sentence ``InputExample`` objects.

    Args:
        lines: Iterable of indexable rows (presumably lists of column
            strings; confirm against the caller).
        set_type: Name of the split; used as the guid prefix. The value
            ``"test"`` selects the test layout: the first row is skipped
            as a header, the text is read from column 1 and the label is
            fixed to ``"0"``. Any other value reads the text from
            column 3 and the label from column 1.

    Returns:
        A list of ``InputExample`` objects with ``text_b`` set to None.
    """
    is_test = set_type == "test"
    collected = []
    for idx, fields in enumerate(lines):
        # Only the test set has a header row, so drop its first entry.
        if is_test and idx == 0:
            continue
        example_id = "%s-%s" % (set_type, idx)
        if is_test:
            sentence = tokenization.convert_to_unicode(fields[1])
            tag = "0"
        else:
            sentence = tokenization.convert_to_unicode(fields[3])
            tag = tokenization.convert_to_unicode(fields[1])
        collected.append(
            InputExample(guid=example_id, text_a=sentence, text_b=None,
                         label=tag))
    return collected
def get_train_examples(self, data_dir):
    """Load, shuffle and parse the training examples from ``data_1017.txt``.

    The file is read as JSON lines: every non-blank line must be a JSON
    object with the keys ``question``, ``similar`` and ``label``.

    Args:
        data_dir: Directory that contains ``data_1017.txt``.

    Returns:
        A list of ``InputExample`` objects in shuffled order. The guid of
        each example is its position in the shuffled data.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    import json
    import random

    file_path = os.path.join(data_dir, 'data_1017.txt')
    # The context manager closes the handle, which was previously leaked.
    # Blank lines (e.g. a trailing newline) are dropped here because
    # json.loads would fail on them.
    with open(file_path, 'r', encoding='utf8') as f:
        data = [row for row in f if row.strip()]

    random.shuffle(data)
    print("train length:", len(data))

    examples = []
    for i, row in enumerate(data):
        line = json.loads(row)
        guid = i
        text_a = tokenization.convert_to_unicode(line['question'])
        text_b = tokenization.convert_to_unicode(line['similar'])
        label = line['label']
        examples.append(InputExample(guid=guid, text_a=text_a,
                                     text_b=text_b, label=label))
    return examples
def get_test_examples(self, data_dir):
    """Load and parse the test examples from ``test.csv``.

    The first column of every CSV row is expected to hold one
    tab-separated record with exactly four fields, in the order
    ``guid``, ``text_a``, ``text_b``, ``label``. Rows that do not match
    this shape are skipped. File order is preserved.

    Args:
        data_dir: Directory that contains ``test.csv``.

    Returns:
        A list of ``InputExample`` objects.
    """
    file_path = os.path.join(data_dir, 'test.csv')
    try:
        # pandas >= 1.3: 'warn' drops malformed lines and reports them,
        # which is what error_bad_lines=False did by default.
        reader = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; use the legacy keyword.
        reader = pd.read_csv(file_path, encoding='utf-8', error_bad_lines=False)

    # The dataset is fairly large, so only a leading portion is used;
    # it is kept distinct from the validation-set data.
    reader = reader.head(10000)

    examples = []
    # Plain lists give true positional access; row[0] on a label-indexed
    # Series depends on a deprecated positional fallback.
    for row in reader.values.tolist():
        line = row[0]
        # Missing cells come back as NaN (a float); treat them as malformed.
        if not isinstance(line, str):
            continue
        split_line = line.strip().split("\t")
        if len(split_line) != 4:
            continue
        guid = split_line[0]
        text_a = tokenization.convert_to_unicode(split_line[1])
        text_b = tokenization.convert_to_unicode(split_line[2])
        label = split_line[3]
        examples.append(InputExample(guid=guid, text_a=text_a,
                                     text_b=text_b, label=label))
    return examples