def load_data(self, path):
    """Populate ``self.data`` from ATOMIC CSV files, or restore a pickled loader.

    Args:
        path: Directory holding the ``v4_atomic_{split}.csv`` files, or a
            ``.pickle`` file containing a previously saved data loader.

    Returns:
        True when a pickled loader was restored, False after reading the
        per-split CSV files.
    """
    # Short-circuit: a pickle path means a previously serialized loader.
    if ".pickle" in path:
        print("Loading data from: {}".format(path))
        data_utils.load_existing_data_loader(self, path)
        return True

    for split_name in self.data:
        csv_name = "v4_atomic_{}.csv".format(map_name(split_name))
        frame = pandas.read_csv("{}/{}".format(path, csv_name), index_col=0)
        # The first nine columns hold JSON-encoded values; decode in place.
        frame.iloc[:, :9] = frame.iloc[:, :9].apply(
            lambda col: col.apply(json.loads))
        # Emit flat (subject, <category>, object) triples per category.
        for category in self.categories:
            column = frame[category]
            marker = "<{}>".format(category)
            triples = zip(column.index, [marker] * len(column), column.values)
            self.data[split_name]["total"] += utils.zipped_flatten(triples)

    # Optionally subsample the training set.
    if do_take_partial_dataset(self.opt.data):
        self.data["train"]["total"] = select_partial_dataset(
            self.opt.data, self.data["train"]["total"])

    return False
def load_data(self, path):
    """Load evaluation tuples for each split from tab-separated text files.

    If *path* points at a pickled data loader, restore it in place instead.

    Args:
        path: Directory containing the split files (or a ``.pickle`` file).

    Returns:
        True when a pickled loader was restored, False otherwise.
    """
    if ".pickle" in path:
        print("Loading data from: {}".format(path))
        data_utils.load_existing_data_loader(self, path)
        return True

    for split in self.data:
        file_name = map_name(split, self.opt.data)
        # devversion "12" redirects the dev split to dev1.txt; every other
        # split (and dev under a different devversion) uses its mapped file.
        if split != "dev" or self.opt.data.devversion != "12":
            source = file_name
        else:
            source = "dev1.txt"
        # Use a context manager so the file handle is not leaked.
        with open("{}/{}".format(path, source), "r") as f:
            string_tuples = f.read().split("\n")
        tuples = [x.split("\t") for x in string_tuples if x]

        # The original dev/test and train branches built byte-identical
        # 11-field records, so a single normalization path suffices.
        # Indexing i[k] (not a slice) preserves the original IndexError
        # on malformed short rows.
        self.data[split]["total"] = [
            tuple(i[k].lower().strip() for k in range(11)) for i in tuples]

    return False
def load_data(self, path):
    """Load (head, relation, tail, label) tuples for each split.

    If *path* points at a pickled data loader, restore it in place instead.
    Rows are tab-separated as (relation, head, tail, label); the relation
    token is rendered either as natural language (``split_into_words``) or
    as a ``<relation>`` marker, per ``self.opt.data.rel``.

    Args:
        path: Directory containing the split files (or a ``.pickle`` file).

    Returns:
        True when a pickled loader was restored, False otherwise.
    """
    if ".pickle" in path:
        print("Loading data from: {}".format(path))
        data_utils.load_existing_data_loader(self, path)
        return True

    def _read_tuples(file_name):
        # Read one tab-separated file; the context manager closes the
        # handle instead of leaking it like the original open().read().
        with open("{}/{}".format(path, file_name), "r") as f:
            lines = f.read().split("\n")
        return [x.split("\t") for x in lines if x]

    for split in self.data:
        print('We are working on split:', split)
        file_name = map_name(split, self.opt.data)
        if split != "dev" or self.opt.data.devversion != "12":
            tuples = _read_tuples(file_name)
        else:
            # devversion "12" splits dev across two files.
            tuples = _read_tuples("dev1.txt") + _read_tuples("dev2.txt")

        # Choose how the relation token is rendered; None preserves the
        # original no-op behavior for an unrecognized rel setting.
        if self.opt.data.rel == "language":
            _rel = lambda i: split_into_words[i[0]]
        elif self.opt.data.rel == "relation":
            _rel = lambda i: "<{}>".format(i[0])
        else:
            _rel = None

        if _rel is not None:
            if split in ["dev", "test"]:
                # dev/test carry integer labels and positive/negative views.
                records = [(i[1].lower().strip(), _rel(i),
                            i[2].lower().strip(), int(i[3])) for i in tuples]
                self.data[split]["total"] = records
                self.data[split]["positive"] = [r for r in records if r[3]]
                self.data[split]["negative"] = [r for r in records if not r[3]]
            else:
                # train keeps the raw (string) label, as in the original.
                self.data[split]["total"] = [
                    (i[1].lower().strip(), _rel(i),
                     i[2].lower().strip(), i[3]) for i in tuples]

    return False
def load_data(self, path):
    """Populate ``self.data`` with (sentence, relation, motivation) triples.

    Restores a pickled loader instead when *path* names a pickle file.

    Returns:
        True for the pickle case, otherwise False.
    """
    if ".pickle" in path:
        print("Loading data from: {}".format(path))
        data_utils.load_existing_data_loader(self, path)
        return True

    for split in self.data:
        file_name = "motiv_sent_none_{}.csv".format(map_name(split))
        print(f"read file: {file_name}")
        df = pandas.read_csv("{}/{}".format(path, file_name))
        print(f"columns: {df.columns}")

        # Build one input sentence per row: the '|'-joined story context up
        # to and including the labelled line, then the character marker.
        sentences = []
        for row in range(len(df.values)):
            story_lines = df.loc[row, "context"].split('\t')[1:]
            upto = int(df.loc[row, "linenum"]) + 1
            char = df.loc[row, "char"]
            sentences.append('|'.join(story_lines[0:upto]) + f"</s>{char}<s>")

        targets = df["motivation"].values
        if len(self.categories) == 1:
            cat = self.categories[0]
            marker = "<{}>".format(cat)
            rows = zip(sentences, [marker] * len(sentences), targets)
            self.data[split]["total"] += [
                (str(sent), str(rel), str(tar)) for sent, rel, tar in rows]

    # Optionally subsample the training set.
    if do_take_partial_dataset(self.opt.data):
        self.data["train"]["total"] = select_partial_dataset(
            self.opt.data, self.data["train"]["total"])

    return False
def load_data(self, path):
    """Load ATOMIC data for every split, either as plain COMET triples or as
    graph-walk paths, depending on ``self.comet`` / ``self.pathcomet``.

    Args:
        path: Directory holding the ``v4_atomic_{split}.csv`` files, or a
            ``.pickle`` file containing a previously saved data loader.

    Returns:
        True when a pickled loader was restored, False after building
        ``self.data[split]["total"]`` from the CSV files.
    """
    # A pickle path means a previously serialized loader: restore and stop.
    if ".pickle" in path:
        print("Loading data from: {}".format(path))
        data_utils.load_existing_data_loader(self, path)
        return True

    for split in self.data:
        # Per-split cap on the number of generated examples (used only by the
        # graph-walk branch below).
        # NOTE(review): assumes every split key is one of train/dev/test;
        # any other key would leave n_data stale or unbound — confirm.
        if split == 'train':
            n_data = self.n_train
        elif split == 'dev':
            n_data = self.n_dev
        elif split == 'test':
            n_data = self.n_test

        # Read & load ATOMIC dataset file
        file_name = "v4_atomic_{}.csv".format(map_name(split))
        df = pandas.read_csv("{}/{}".format(path, file_name), index_col=0)
        # The first nine columns are JSON-encoded; decode them in place.
        df.iloc[:, :9] = df.iloc[:, :9].apply(lambda col: col.apply(json.loads))

        if self.comet:
            """
            For replicating original COMET settings we don't need a graph.
            """
            # Skip the synthetic 'Inverse' categories; emit flat
            # (subject, <category>, object) triples.
            for cat in [item for item in self.categories if not 'Inverse' in item]:
                attr = df[cat]
                self.data[split]["total"] += utils.zipped_flatten(zip(
                    attr.index, ["<{}>".format(cat)] * len(attr), attr.values))
            #ipdb.set_trace()
        elif self.pathcomet:
            """
            Replicate original COMET, but prepend every s,r with a path from a graph
            """
            comet_orig = {}
            comet_orig['train'] = {"total": []}
            comet_orig['dev'] = {"total": []}
            comet_orig['test'] = {"total": []}

            # First collect the plain COMET triples for this split.
            for cat in [item for item in self.categories if not 'Inverse' in item]:
                attr = df[cat]
                #self.data[split]["total"] += utils.zipped_flatten(zip(
                #    attr.index, ["<{}>".format(cat)] * len(attr), attr.values))
                comet_orig[split]["total"] += utils.zipped_flatten(zip(
                    attr.index, ["<{}>".format(cat)] * len(attr), attr.values))
            # Convert tuples into list
            comet_orig[split]["total"] = [list(item) for item in comet_orig[split]["total"]]

            # Build graph: nodes are subjects/objects; each directed edge
            # carries the relation and also gets an explicit inverse edge.
            #G=nx.Graph()
            G=nx.DiGraph()
            entities = set()
            for cat in [item for item in self.categories if not 'Inverse' in item]:
                attr = df[cat]
                triples = utils.zipped_flatten(zip(attr.index, ["<{}>".format(cat)] * len(attr), attr.values))
                # Add to the graph
                for triple in triples:
                    m1, rel, m2 = triple
                    entities.add(m1)
                    entities.add(m2)
                    G.add_node(m1, type='subj')
                    G.add_node(m2, type='obj')
                    G.add_edge(m1, m2, rel=rel)
                    G.add_edge(m2, m1, rel=rel.replace('>','Inverse>'))  # Inverse relation
            #ipdb.set_trace()
            #self.data[split]["total"] += utils.zipped_flatten(zip(
            #    attr.index, ["<{}>".format(cat)] * len(attr), attr.values))

            # For each original (s, r, o) triple, try to prepend up to
            # self.n_per_node[split] distinct reverse walks ending at s.
            examples = []
            for base_subj, base_rel, base_obj in comet_orig[split]["total"]:
                unique_paths = set()
                for _ in range(self.n_per_node[split]):
                    curr_node = base_subj
                    walk = data_utils.Path(curr_node)
                    # We don't want to see the target object in the input path
                    walk.nodes.add(base_obj)
                    n_attempts = 0
                    while len(walk.walk) * 1 < self.max_path_len:
                        obj, relation, dead_end = data_utils.single_step_reverse(curr_node, G)
                        if dead_end:
                            n_attempts += 1
                            break
                        updated = walk.update(obj, relation, prepend=True)
                        if updated:
                            curr_node = obj
                        else:
                            n_attempts += 1
                        # Give up on this walk after too many failed extensions.
                        if n_attempts > 10 :
                            break
                    # Keep the walk only if the full path (prefix + base
                    # triple) was not already generated for this triple.
                    if not ' '.join(walk.walk + [base_rel] + [base_obj]) in unique_paths:
                        assert walk.walk[-1] == base_subj
                        walk.walk.append(base_rel)
                        walk.walk.append(base_obj)
                        examples.append(walk.walk)
                        unique_paths.add(' '.join(walk.walk))
                        #ipdb.set_trace()
                        if len(examples) % 500 == 0:
                            print("\nGenerated {} {} examples".format(len(examples), split))
                            print(walk.walk)
                #if len(examples) >= n_data:
                #    break
            #ipdb.set_trace()
            #examples = examples[:n_data]
            # Optionally mix the original COMET triples in with the
            # path-prefixed examples.
            if self.add_orig[split]:
                self.data[split]["total"] += comet_orig[split]["total"]
                self.data[split]["total"] += examples
            else:
                self.data[split]["total"] = examples
            #ipdb.set_trace()
        else:
            """
            Graph based path data generation
            """
            # Same graph construction as the pathcomet branch above.
            #G=nx.Graph()
            G=nx.DiGraph()
            entities = set()
            for cat in [item for item in self.categories if not 'Inverse' in item]:
                attr = df[cat]
                triples = utils.zipped_flatten(zip(attr.index, ["<{}>".format(cat)] * len(attr), attr.values))
                # Add to the graph
                for triple in triples:
                    m1, rel, m2 = triple
                    entities.add(m1)
                    entities.add(m2)
                    G.add_node(m1, type='subj')
                    G.add_node(m2, type='obj')
                    G.add_edge(m1, m2, rel=rel)
                    G.add_edge(m2, m1, rel=rel.replace('>','Inverse>'))# Inverse relation
            #ipdb.set_trace()
            #self.data[split]["total"] += utils.zipped_flatten(zip(
            #    attr.index, ["<{}>".format(cat)] * len(attr), attr.values))

            # Random forward walks from shuffled start nodes; stop once
            # n_data examples have been collected.
            examples = []
            all_nodes = list(G.nodes())
            random.shuffle(all_nodes)
            for node in all_nodes:
                # Use for filtering out duplicate paths starting from the same start_node
                unique_paths = set()
                for _ in range(self.n_per_node[split]):
                    curr_node = node
                    walk = data_utils.Path(curr_node)
                    n_attempts = 0
                    while len(walk.walk) * 1 < self.max_path_len:
                        obj, relation, dead_end = data_utils.single_step(curr_node, G)
                        if dead_end:
                            n_attempts += 1
                            break
                        updated = walk.update(obj, relation)
                        if updated:
                            curr_node = obj
                        else:
                            #ipdb.set_trace()
                            n_attempts += 1
                            #print(walk.walk)
                        # Give up on this walk after too many failed extensions.
                        if n_attempts > 10 :
                            break
                    if not ' '.join(walk.walk) in unique_paths:
                        examples.append(walk.walk)
                        unique_paths.add(' '.join(walk.walk))
                        #print(' '.join(walk.walk))
                        if len(examples) % 500 == 0:
                            print("\nGenerated {} {} examples".format(len(examples), split))
                            print(walk.walk)
                #ipdb.set_trace()
                if len(examples) >= n_data:
                    break
            examples = examples[:n_data]
            self.data[split]["total"] = examples
            #ipdb.set_trace()

    # Optionally subsample the training set.
    if do_take_partial_dataset(self.opt.data):
        self.data["train"]["total"] = select_partial_dataset(
            self.opt.data, self.data["train"]["total"])

    return False
def load_data(self, path):
    """Restore a previously pickled data loader from *path*.

    Returns:
        True when *path* names a pickle file and the loader was restored;
        otherwise returns None (implicitly, as in the original).
    """
    # Guard clause: only pickle paths are handled by this loader.
    if ".pickle" not in path:
        return None
    data_utils.load_existing_data_loader(self, path)
    return True