def getTsvTestData(self, filepath, sep, sequence_length, y_value=False):
    """
    Load text pairs and their labels from a delimited, labelled file.

    Each line is stripped and split on ``sep``. Column 0 is parsed with
    ``int`` and compared to 1; columns 1 and 2 are the two texts (mention
    and entity, per the file layout ``label, mention, entity``). Both texts
    go through ``preprocess_unit`` and are cut to ``sequence_length``.

    Lines with fewer than three columns, or whose preprocessed text in
    column 1 or column 2 is empty, are skipped.

    :param filepath: path of the UTF-8 encoded input file
    :param sep: column separator passed to ``str.split``
    :param sequence_length: maximum kept length of each preprocessed text
    :param y_value: forwarded unchanged to ``self.add_y_helper``
    :return: three numpy arrays ``(x1, x2, y)``: column-1 texts, column-2
        texts, and the label collection built by ``self.add_y_helper``
    """
    print("Loading testing/labelled data from " + filepath)
    x1, x2 = [], []
    y = []
    line_num = 0
    # The context manager guarantees the file handle is released, even when
    # a malformed label makes int() raise part-way through the file.
    with codecs.open(filepath, "r", "utf-8") as reader:
        for line in reader:
            line_num += 1
            fields = line.strip().split(sep)
            if len(fields) < 3:
                continue
            first = preprocess_unit(fields[1])
            second = preprocess_unit(fields[2])
            if len(first) == 0 or len(second) == 0:
                continue
            # Slicing is a no-op for texts already within the limit, so no
            # explicit length comparison is needed before truncating.
            x1.append(first[:sequence_length])
            x2.append(second[:sequence_length])
            # The helper's return value replaces the label accumulator.
            y = self.add_y_helper(y_value, y, int(fields[0]) == 1)
    # Diagnostic output: the number of lines read differs from the number
    # of labels collected (for example because some lines were skipped).
    if line_num != len(y):
        print("ei")
        print(line_num)
    return np.asarray(x1), np.asarray(x2), np.asarray(y)
def getTsvTestData_Mul_Labels_Dyna(self, filepath, sep, sequence_length, y_value=False):
    """
    Load labelled multi-task data where a row may carry one or two tasks.

    Each line is stripped and split on ``sep``. Columns 0-2 hold the first
    task (label, text, text). A row with exactly three columns is treated
    as single-task: empty strings are stored for the second text pair, the
    second label is recorded as ``False`` and the row's indicator is 1.
    Any other row is expected to hold a second task in columns 3-5 (label,
    text, text) and gets indicator 0.

    All texts go through ``preprocess_unit`` and are cut to
    ``sequence_length``. Lines with fewer than three columns (such as blank
    lines) are skipped, matching ``getTsvTestData``. A row with four or
    five columns still raises ``IndexError``.

    :param filepath: path of the UTF-8 encoded input file
    :param sep: column separator passed to ``str.split``
    :param sequence_length: maximum kept length of each preprocessed text
    :param y_value: forwarded unchanged to ``self.add_y_helper``
    :return: ``(indicate, x1, x2, x3, x4, y, y2)`` where ``indicate`` is a
        plain list of 1/0 flags and the remaining six are numpy arrays
    """
    print("Loading testing/labelled data from " + filepath)
    x1, x2, x3, x4 = [], [], [], []
    y = []
    y2 = []
    indicate = []
    # The context manager guarantees the file handle is released, even when
    # a malformed row raises part-way through the file.
    with codecs.open(filepath, "r", "utf-8") as reader:
        for line in reader:
            fields = line.strip().split(sep)
            if len(fields) < 3:
                # Blank or truncated line: there is no complete first task.
                continue
            # Slicing is a no-op for texts already within the limit.
            x1.append(preprocess_unit(fields[1])[:sequence_length])
            x2.append(preprocess_unit(fields[2])[:sequence_length])
            y = self.add_y_helper(y_value, y, int(fields[0]) == 1)
            if len(fields) == 3:
                # Dynamic single-task row: pad the second task with blanks.
                x3.append("")
                x4.append("")
                y2 = self.add_y_helper(y_value, y2, False)
                indicate.append(1)
            else:
                x3.append(preprocess_unit(fields[4])[:sequence_length])
                x4.append(preprocess_unit(fields[5])[:sequence_length])
                indicate.append(0)
                y2 = self.add_y_helper(y_value, y2, int(fields[3]) == 1)
    return (indicate, np.asarray(x1), np.asarray(x2), np.asarray(x3),
            np.asarray(x4), np.asarray(y), np.asarray(y2))
def getTsvData(self, filepath, sep, max_record_entity, sequence_length, y_value=False):
    """
    Load grouped (text, text, label) triples from a delimited file.

    Each line is stripped and split on ``sep``; lines with fewer than
    ``max_record_entity * 3`` columns are skipped. From every kept line two
    runs of ``(max_record_entity // 2) * 3`` columns are taken: one from
    column 0 and one from column ``11 * 3``. The selected columns are read
    as consecutive triples. The first element of a triple is stored in the
    entity lists, the second in the mention lists, and the third is the
    label, compared with the string ``'1'``.

    Triple number ``k`` of every line is collected in slot ``k`` of each
    output. Labels are encoded as ``[1, 0]`` when the label column is
    ``'1'``, otherwise ``[0, 1]``, or ``[0, 0]`` when both preprocessed
    texts are empty. The mask is ``0.0`` for such empty pairs and ``1.0``
    otherwise. Slots that never receive data keep their initial value 0.

    :param filepath: path of the UTF-8 encoded input file
    :param sep: column separator passed to ``str.split``
    :param max_record_entity: number of slots in each returned collection
    :param sequence_length: maximum kept length of each preprocessed text
    :param y_value: unused by this method
    :return: four numpy arrays ``(mentions, entities, labels, masks)``
    """
    print("Loading labelled data from " + filepath)
    label_lists = [0] * max_record_entity
    mention_lists = [0] * max_record_entity
    entity_lists = [0] * max_record_entity
    mask_lists = [0] * max_record_entity
    # Floor division keeps slice bounds and slot indices integral; a plain
    # "/" produces floats on Python 3, which cannot be used as indices.
    half_span = (max_record_entity // 2) * 3
    line_num = 0
    # The context manager guarantees the file handle is released.
    with codecs.open(filepath, "r", "utf-8") as reader:
        for raw_line in reader:
            fields = raw_line.strip().split(sep)
            if len(fields) < max_record_entity * 3:
                continue
            # Only take the needed part of the row.
            # NOTE(review): the second run starts at the hard-coded column
            # 11 * 3 - presumably rows hold two groups of 11 triples; confirm.
            items = fields[:half_span]
            items.extend(fields[11 * 3:11 * 3 + half_span])
            for index in range(0, len(items), 3):
                slot = index // 3
                content1_fixed = preprocess_unit(items[index])
                content2_fixed = preprocess_unit(items[index + 1])
                # The mask is decided before truncation, as is the label.
                if content1_fixed == '' and content2_fixed == '':
                    flag_empty = 0.0
                else:
                    flag_empty = 1.0
                # truncate when length is bigger than the max_length
                if len(content1_fixed) > sequence_length:
                    content1_fixed = content1_fixed[:sequence_length]
                if len(content2_fixed) > sequence_length:
                    content2_fixed = content2_fixed[:sequence_length]
                if items[index + 2] == '1':
                    label = [1, 0]
                elif flag_empty == 1.0:
                    label = [0, 1]
                else:
                    label = [0, 0]
                if line_num == 0:
                    # First kept record: replace the 0 placeholders with
                    # fresh per-slot lists.
                    entity_lists[slot] = []
                    mention_lists[slot] = []
                    mask_lists[slot] = []
                    label_lists[slot] = []
                # Open question from the original author: do the entity and
                # mention lists need to be swapped?
                entity_lists[slot].append(content1_fixed)
                mention_lists[slot].append(content2_fixed)
                mask_lists[slot].append(flag_empty)
                label_lists[slot].append(label)
            line_num += 1
    print('load records %d' % (line_num))
    return np.asarray(mention_lists), np.asarray(entity_lists), np.asarray(
        label_lists), np.asarray(mask_lists)