def load_data(self, path):
    """Load ATOMIC data, either from a cached pickle or from raw CSV splits.

    When `path` points at a pickle, the pre-built loader state is restored
    and True is returned. Otherwise each split's CSV is parsed into
    (subject, <relation>, object) triples accumulated under
    self.data[split]["total"], and False is returned.
    """
    if ".pickle" in path:
        print("Loading data from: {}".format(path))
        data_utils.load_existing_data_loader(self, path)
        return True

    for split in self.data:
        csv_name = "v4_atomic_{}.csv".format(map_name(split))
        frame = pandas.read_csv("{}/{}".format(path, csv_name), index_col=0)
        # The first nine columns hold JSON-encoded annotation lists;
        # decode them in place.
        frame.iloc[:, :9] = frame.iloc[:, :9].apply(
            lambda column: column.apply(json.loads))

        for category in self.categories:
            series = frame[category]
            relation_tokens = ["<{}>".format(category)] * len(series)
            self.data[split]["total"] += utils.zipped_flatten(
                zip(series.index, relation_tokens, series.values))

    # Optionally subsample the training set.
    if do_take_partial_dataset(self.opt.data):
        self.data["train"]["total"] = select_partial_dataset(
            self.opt.data, self.data["train"]["total"])

    return False
def load_data(self, path):
    """Load ATOMIC data, either from a cached pickle or from raw CSV splits.

    Three generation modes, chosen by instance flags:
      * self.comet     -- replicate original COMET: plain (s, r, o) triples.
      * self.pathcomet -- COMET triples, each prefixed with a random reverse
                          path sampled from the knowledge graph.
      * otherwise      -- free random walks over the knowledge graph.

    Returns True when a pre-built pickle was loaded, False otherwise.
    Raises KeyError if a split in self.data is not train/dev/test.
    """
    if ".pickle" in path:
        print("Loading data from: {}".format(path))
        data_utils.load_existing_data_loader(self, path)
        return True

    # Forward (non-inverse) relation categories only.
    forward_categories = [c for c in self.categories if 'Inverse' not in c]

    def extract_triples(df, category):
        # Flatten one relation column into (subject, <relation>, object) tuples.
        attr = df[category]
        return utils.zipped_flatten(
            zip(attr.index, ["<{}>".format(category)] * len(attr), attr.values))

    def build_graph(df):
        # Directed KG over all forward categories; every edge also gets an
        # explicit reversed edge whose relation token is tagged 'Inverse>'.
        G = nx.DiGraph()
        for category in forward_categories:
            for subj, rel, obj in extract_triples(df, category):
                G.add_node(subj, type='subj')
                G.add_node(obj, type='obj')
                G.add_edge(subj, obj, rel=rel)
                G.add_edge(obj, subj, rel=rel.replace('>', 'Inverse>'))
        return G

    def sample_walk(start, G, step_fn, prepend, exclude=None):
        # Sample one random walk from `start` using `step_fn`
        # (data_utils.single_step or single_step_reverse). Gives up after a
        # dead end or more than 10 failed update attempts.
        curr = start
        walk = data_utils.Path(curr)
        if exclude is not None:
            # Never revisit the held-out target object.
            walk.nodes.add(exclude)
        attempts = 0
        while len(walk.walk) < self.max_path_len:
            obj, relation, dead_end = step_fn(curr, G)
            if dead_end:
                attempts += 1
                break
            if walk.update(obj, relation, prepend=True) if prepend \
                    else walk.update(obj, relation):
                curr = obj
            else:
                attempts += 1
            if attempts > 10:
                break
        return walk

    for split in self.data:
        # Target example count per split; an unknown split name is a
        # configuration error, so fail loudly (KeyError) instead of silently
        # reusing a stale n_data from a previous iteration.
        n_data = {'train': self.n_train,
                  'dev': self.n_dev,
                  'test': self.n_test}[split]

        # Read & load ATOMIC dataset file.
        file_name = "v4_atomic_{}.csv".format(map_name(split))
        df = pandas.read_csv("{}/{}".format(path, file_name), index_col=0)
        # First nine columns hold JSON-encoded lists; decode in place.
        df.iloc[:, :9] = df.iloc[:, :9].apply(lambda col: col.apply(json.loads))

        if self.comet:
            # For replicating original COMET settings we don't need a graph.
            for category in forward_categories:
                self.data[split]["total"] += extract_triples(df, category)

        elif self.pathcomet:
            # Replicate original COMET, but prepend every (s, r) with a
            # reverse path sampled from the graph.
            triples = []
            for category in forward_categories:
                triples += extract_triples(df, category)
            triples = [list(item) for item in triples]

            G = build_graph(df)

            examples = []
            for base_subj, base_rel, base_obj in triples:
                unique_paths = set()  # de-duplicate paths per source triple
                for _ in range(self.n_per_node[split]):
                    walk = sample_walk(base_subj, G,
                                       data_utils.single_step_reverse,
                                       prepend=True, exclude=base_obj)
                    candidate = ' '.join(walk.walk + [base_rel, base_obj])
                    if candidate not in unique_paths:
                        assert walk.walk[-1] == base_subj
                        walk.walk.append(base_rel)
                        walk.walk.append(base_obj)
                        examples.append(walk.walk)
                        unique_paths.add(candidate)
                        if len(examples) % 500 == 0:
                            print("\nGenerated {} {} examples".format(
                                len(examples), split))
                            print(walk.walk)

            if self.add_orig[split]:
                # Keep the plain COMET triples alongside the path examples.
                self.data[split]["total"] += triples
                self.data[split]["total"] += examples
            else:
                self.data[split]["total"] = examples

        else:
            # Graph-based path data generation: free forward random walks.
            G = build_graph(df)

            examples = []
            all_nodes = list(G.nodes())
            random.shuffle(all_nodes)
            for node in all_nodes:
                unique_paths = set()  # filter duplicates from this start node
                for _ in range(self.n_per_node[split]):
                    walk = sample_walk(node, G, data_utils.single_step,
                                       prepend=False)
                    key = ' '.join(walk.walk)
                    if key not in unique_paths:
                        examples.append(walk.walk)
                        unique_paths.add(key)
                        if len(examples) % 500 == 0:
                            print("\nGenerated {} {} examples".format(
                                len(examples), split))
                            print(walk.walk)
                if len(examples) >= n_data:
                    break
            self.data[split]["total"] = examples[:n_data]

    # Optionally subsample the training set.
    if do_take_partial_dataset(self.opt.data):
        self.data["train"]["total"] = select_partial_dataset(
            self.opt.data, self.data["train"]["total"])

    return False