def _read_file(self, input_file, phase=None):
    """Read a tab-separated file into a list of ``InputExample`` objects.

    The number of columns in the first row fixes the layout of the file:

    * any phase other than ``"predict"``: two columns are read as
      ``text_a, label`` and three columns as ``text_a, text_b, label``;
    * ``"predict"``: one column is read as ``text_a`` and two columns as
      ``text_a, text_b`` (a warning is logged once per call).

    The first row is skipped when ``self.if_file_with_header[phase]`` is
    truthy. Each example's ``guid`` is the zero-based row index, counting a
    skipped header row.

    A row whose column count differs from the first row is reported through
    ``logger.warning``. Rows with too few columns (for example blank lines)
    are skipped instead of failing with ``IndexError``; rows with extra
    columns are still used, reading only the leading columns.

    Args:
        input_file: Path of the UTF-8 encoded, tab-separated file.
        phase: Key into ``self.if_file_with_header``; ``"predict"`` selects
            the label-free layout.

    Returns:
        list: One ``InputExample`` per usable data row, in file order.

    Raises:
        Exception: If the column count of the first row is not supported
            for ``phase``.
    """
    has_warned = False
    is_predict = phase == "predict"
    examples = []
    with io.open(input_file, "r", encoding="UTF-8") as file:
        reader = csv.reader(file, delimiter="\t", quotechar=None)
        for (i, line) in enumerate(reader):
            if i == 0:
                # The first row decides the expected layout of every row.
                ncol = len(line)
                if self.if_file_with_header[phase]:
                    continue

            # Reject unsupported layouts as soon as the first data row is
            # seen (a header-only file therefore never raises).
            if not is_predict:
                if ncol == 1:
                    raise Exception(
                        "the %s file: %s only has one column but it is not a predict file"
                        % (phase, input_file))
                if ncol not in (2, 3):
                    raise Exception(
                        "the %s file: %s has too many columns (should <=3_实体识别)"
                        % (phase, input_file))
            elif ncol not in (1, 2):
                raise Exception(
                    "the predict file: %s has too many columns (should <=2)" %
                    (input_file))

            if len(line) != ncol:
                logger.warning(
                    "the %s file: %s row %d has %d columns but %d were expected: %s"
                    % (phase, input_file, i, len(line), ncol, line))
                if len(line) < ncol:
                    # Indexing a short row would raise IndexError; skip it.
                    continue

            if not is_predict:
                if ncol == 2:
                    example = InputExample(
                        guid=i, text_a=line[0], label=line[1])
                else:
                    example = InputExample(
                        guid=i, text_a=line[0], text_b=line[1], label=line[2])
            elif ncol == 1:
                example = InputExample(guid=i, text_a=line[0])
            else:
                if not has_warned:
                    logger.warning(
                        "the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
                        % (input_file))
                    has_warned = True
                example = InputExample(
                    guid=i, text_a=line[0], text_b=line[1])
            examples.append(example)
    return examples
def _read_file(self, input_file, phase=None):
    """Read a tab-separated file whose fields are Python expressions.

    The first line is treated as a header and skipped. On every following
    line the first tab-separated field is evaluated with ``eval`` to obtain
    ``text_a``; when a second field exists it is evaluated to obtain
    ``label``. Blank lines are skipped instead of crashing inside ``eval``.

    Args:
        input_file: Path of the UTF-8 encoded file.
        phase: Not used by this reader.

    Returns:
        list: One ``InputExample`` per non-blank data line. ``guid`` is the
        zero-based index of the line after the header (blank lines still
        consume an index).
    """
    examples = []
    with open(input_file, 'r', encoding='utf-8') as f:
        # Drop the header; the default keeps an empty file from raising.
        next(f, None)
        for i, raw_line in enumerate(f):
            if not raw_line.strip():
                # eval() of an empty string raises SyntaxError.
                continue
            fields = raw_line.split('\t')
            # SECURITY: eval() executes whatever expression the file
            # contains. Only use this reader on trusted data; if the fields
            # are plain literals, ast.literal_eval would be the safe choice.
            text = eval(fields[0])
            if len(fields) < 2:
                example = InputExample(guid=i, text_a=text)
            else:
                label = eval(fields[1])
                example = InputExample(guid=i, text_a=text, label=label)
            examples.append(example)
    return examples
def _read_file(self, input_file, phase=None):
    """Build ``InputExample`` objects from a ``_!_``-delimited text file.

    Each line is stripped and split on ``_!_``. Field 0 becomes ``guid``,
    field 1 becomes ``label`` and field 3 becomes ``text_a``; field 2 is
    not used.

    Args:
        input_file: Path of the UTF-8 encoded file.
        phase: Not used by this reader.

    Returns:
        list: One ``InputExample`` per line, in file order.
    """

    def to_example(raw_line):
        # Turn one raw line into an example.
        fields = raw_line.strip().split("_!_")
        return InputExample(
            guid=fields[0], label=fields[1], text_a=fields[3])

    with io.open(input_file, "r", encoding="UTF-8") as handle:
        return [to_example(raw_line) for raw_line in handle]
def _read_csv(self, input_file, quotechar=None):
    """Load a comma-separated file with pandas into ``InputExample`` objects.

    The file is parsed by ``pd.read_csv`` with its default separator. For
    every row, the ``id`` column becomes ``guid``, the ``comment_text``
    column becomes ``text_a``, and the values from the third position
    onward are converted with ``int`` to form the ``label`` list.

    Args:
        input_file: Path (or anything ``pd.read_csv`` accepts) of the
            UTF-8 encoded file.
        quotechar: Not used by this reader.

    Returns:
        list: One ``InputExample`` per row, in file order.
    """

    def to_example(record):
        # Convert one DataFrame row into an example.
        identifier = record["id"]
        comment = record["comment_text"]
        flags = [int(cell) for cell in record[2:]]
        return InputExample(guid=identifier, label=flags, text_a=comment)

    frame = pd.read_csv(input_file, encoding="UTF-8")
    return [to_example(record) for _, record in frame.iterrows()]
def _read_file(self, input_file, is_training):
    """Build sentence-pair ``InputExample`` objects from a ``_!_`` file.

    Each line is stripped and split on ``_!_``. Field 0 becomes ``label``,
    field 2 becomes ``text_a`` and field 3 becomes ``text_b``; field 1 is
    not used. ``guid`` is the zero-based line index.

    Args:
        input_file: Path of the UTF-8 encoded file.
        is_training: When truthy, the first line of the file is skipped.

    Returns:
        list: One ``InputExample`` per line that was not skipped.
    """
    collected = []
    with io.open(input_file, "r", encoding="UTF-8") as handle:
        for line_no, raw_line in enumerate(handle):
            is_skipped_first_line = line_no == 0 and is_training
            if not is_skipped_first_line:
                parts = raw_line.strip().split("_!_")
                collected.append(
                    InputExample(
                        guid=line_no,
                        label=parts[0],
                        text_a=parts[2],
                        text_b=parts[3]))
    return collected
def _read_tsv(input_file):
    """Convert already-split rows into ``InputExample`` objects.

    Nothing is opened here: ``input_file`` is iterated directly and each
    item is indexed, so it is presumably a sequence of parsed rows rather
    than a path (confirm against the caller).

    Args:
        input_file: Iterable of rows; item 0 of a row is the text and
            item 1 is the label.

    Returns:
        list: One ``InputExample`` per row, with ``guid`` counting from 0.
    """
    # Original author's note: this step matters. text_a receives the raw,
    # unsegmented string (the framework tokenizes internally) and label
    # carries the classification category.
    return [
        InputExample(guid=position, label=row[1], text_a=row[0])
        for position, row in enumerate(input_file)
    ]
def _read_file(self, input_file, phase=None):
    """Build ``InputExample`` objects from a ``_!_``-delimited text file.

    Each line is stripped and split on ``_!_``. Field 0 (as ``str``)
    becomes ``label`` and field 1 becomes ``text_a``; ``text_b`` is always
    ``None``. ``guid`` is the zero-based line index, so skipped lines leave
    gaps in the numbering.

    Lines with fewer than two fields (including blank lines) are skipped.
    This replaces a bare ``except: pass`` that hid every error, including
    ``KeyboardInterrupt``.

    Args:
        input_file: Path of the UTF-8 encoded file.
        phase: Not used by this reader.

    Returns:
        list: One ``InputExample`` per well-formed line, in file order.
    """
    examples = []
    with io.open(input_file, "r", encoding="UTF-8") as file:
        for (i, line) in enumerate(file):
            data = line.strip().split("_!_")
            if len(data) < 2:
                # Malformed line: no text field to read, so skip it.
                continue
            examples.append(
                InputExample(
                    guid=i, label=str(data[0]), text_a=data[1], text_b=None))
    return examples
def _read_csv(self, input_file, quotechar=None):
    """Read a comma-separated file with a header row into examples.

    The first row is skipped as a header. In every following row, column 0
    becomes ``label`` and column 1 becomes ``text_a``; ``guid`` counts the
    data rows from 0. An empty file yields an empty list instead of raising
    ``StopIteration``.

    Args:
        input_file: Path of the UTF-8 encoded CSV file.
        quotechar: Passed through to ``csv.reader``.

    Returns:
        list: One ``InputExample`` per data row, in file order.
    """
    with codecs.open(input_file, "r", encoding="UTF-8") as f:
        reader = csv.reader(f, delimiter=",", quotechar=quotechar)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        return [
            InputExample(guid=seq_id, label=line[0], text_a=line[1])
            for seq_id, line in enumerate(reader)
        ]
def _read_file(self, input_file, phase=None):
    """Load a header-less tab-separated file with pandas into examples.

    When ``self.model`` is ``'mcls'`` every row is unpacked as
    ``sent, entity, label`` and ``entity`` (as ``str``) is passed as
    ``text_b``; otherwise every row is unpacked as ``sent, label``.
    For any phase other than ``'predict'`` the label field is passed
    through ``eval``; for ``'predict'`` the label is ``None``.

    NOTE(review): ``eval`` runs on file content, so it executes whatever
    expression the label column holds. Confirm the data is trusted.

    Args:
        input_file: Path (or anything ``pd.read_csv`` accepts) of the file.
        phase: ``'predict'`` disables label evaluation.

    Returns:
        list: One ``InputExample`` per row, with ``guid`` counting from 0.
    """
    table = pd.read_csv(input_file, sep='\t', header=None)
    with_entity = self.model == 'mcls'
    labelled = phase != 'predict'
    examples = []
    for guid, row in enumerate(table.values):
        if with_entity:
            sent, entity, label = row
            examples.append(
                InputExample(
                    guid=guid,
                    text_a=sent,
                    text_b=str(entity),
                    label=eval(label) if labelled else None))
        else:
            sent, label = row
            examples.append(
                InputExample(
                    guid=guid,
                    text_a=sent,
                    label=eval(label) if labelled else None))
    return examples
def _read_file(self, input_file, phase=False):
    """Read a JSON dataset and build one multi-label example per paragraph.

    The top-level object must have a ``"data"`` list. Each entry holds
    ``"paragraphs"``, and each paragraph holds a ``"context"`` string and a
    ``"qas"`` list whose items carry ``"id"`` and ``"question"``. For every
    question, the suffix ``'的主体是什么?'`` is removed and the remainder is
    looked up in ``self.label_list``; that position of the paragraph's
    label vector is set to 1.

    The file is read as UTF-8 explicitly. The questions contain non-ASCII
    text, so relying on the platform's default encoding could mis-decode
    them.

    Args:
        input_file: Path of the UTF-8 encoded JSON file.
        phase: Not used by this reader.

    Returns:
        list: One ``InputExample`` per paragraph. ``guid`` is the string
        form of the list of distinct question ids (built through a ``set``,
        so the order of ids inside it is not guaranteed), ``label`` is the
        0/1 vector and ``text_a`` is the paragraph context.

    Raises:
        ValueError: Propagated from ``self.label_list.index`` when a
            stripped question is not in ``self.label_list``.
    """
    examples = []
    # Never incremented in this reader, so the warning below always reports 0.
    drop = 0
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            guid = []
            labels = [0] * len(self.label_list)
            for qa in paragraph["qas"]:
                guid.append(qa["id"])
                label_name = qa["question"].replace('的主体是什么?', '')
                labels[self.label_list.index(label_name)] = 1
            guid = str(list(set(guid)))
            example = InputExample(
                guid=guid, label=labels, text_a=paragraph_text)
            examples.append(example)
    logger.warning("%i bad examples has been dropped" % drop)
    return examples
def _read_tsv(self, input_file, quotechar=None, wo_label=False):
    """Read one tab-separated split of ``self.sub_dataset`` into examples.

    The columns holding the label and the one or two text fields depend on
    ``self.sub_dataset`` and on whether the file carries labels. The first
    row is skipped as a header unless the dataset is ``'CoLA'`` and
    ``wo_label`` is false.

    Rows that lack one of the required columns are logged with
    ``[Discard Incorrect Data]`` and skipped. An empty file yields an empty
    list instead of raising ``StopIteration``. If ``self.sub_dataset`` is
    not one of the known names, a single warning is logged and an empty
    list is returned (previously every row was discarded one by one).

    Args:
        input_file: Path of the UTF-8 encoded, tab-separated file.
        quotechar: Passed through to ``csv.reader``.
        wo_label: True when the file has no label column; examples then
            get ``label=None``.

    Returns:
        list: One ``InputExample`` per usable row; ``guid`` counts the
        accepted rows from 0.
    """
    # (label, text_a, text_b) column indices; None means "field absent".
    labelled_columns = {
        'MRPC': (0, -2, -1),
        'QNLI': (3, 1, 2),
        'QQP': (5, 3, 4),
        'RTE': (3, 1, 2),
        'SST-2': (1, 0, None),
        'MNLI': (-1, 8, 9),
        'CoLA': (1, 3, None),
        'STS-B': (-1, -3, -2),
    }
    unlabelled_columns = {
        'MRPC': (None, -2, -1),
        'QNLI': (None, 1, 2),
        'QQP': (None, 1, 2),
        'RTE': (None, 1, 2),
        'SST-2': (None, 1, None),
        'MNLI': (None, 8, 9),
        'CoLA': (None, 1, None),
        'STS-B': (None, -2, -1),
    }

    examples = []
    with io.open(input_file, "r", encoding="UTF-8") as f:
        reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
        if self.sub_dataset != 'CoLA' or wo_label:
            # Skip the header; the default avoids StopIteration on an
            # empty file.
            next(reader, None)

        layout = unlabelled_columns if wo_label else labelled_columns
        columns = layout.get(self.sub_dataset)
        if columns is None:
            logger.warning(
                "unknown sub_dataset %s, no example is read from %s" %
                (self.sub_dataset, input_file))
            return examples
        label_index, text_a_index, text_b_index = columns

        seq_id = 0
        for line in reader:
            try:
                text_a = line[text_a_index]
                text_b = (line[text_b_index]
                          if text_b_index is not None else None)
                label = (line[label_index]
                         if label_index is not None else None)
            except IndexError:
                # The row is too short for this dataset's layout.
                logger.info("[Discard Incorrect Data] " + "\t".join(line))
                continue
            examples.append(
                InputExample(
                    guid=seq_id, text_a=text_a, text_b=text_b, label=label))
            seq_id += 1
    return examples