def pre_process(self):
    processed_dir = os.path.join(self.root, 'processed')
    pre_processed_file_path = os.path.join(processed_dir, 'dgl_data_processed')

    if os.path.exists(pre_processed_file_path):
        self.graphs, label_dict = load_graphs(pre_processed_file_path)
        self.labels = label_dict['labels']
        self.ids = label_dict['ids']
    else:
        url = self.meta_info[self.name]["dgl url"]
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # delete the folder if it already exists
            try:
                shutil.rmtree(self.root)
            except:
                pass
            shutil.move(osp.join(self.original_root, self.download_name), self.root)
        else:
            print("Stop download.")
            exit(-1)

        self.graphs, label_dict = load_graphs(pre_processed_file_path)
        self.labels = label_dict['labels']
        self.ids = label_dict['ids']
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    raw_dir = osp.join(self.root, 'raw')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if os.path.exists(pre_processed_file_path):
        loaded_dict = torch.load(pre_processed_file_path)
        self.graphs, self.labels = loaded_dict['graphs'], loaded_dict['labels']
    else:
        ### download
        url = self.meta_info[self.name]["url"]
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # delete the folder if it already exists
            try:
                shutil.rmtree(self.root)
            except:
                pass
            shutil.move(osp.join(self.original_root, self.download_name), self.root)
        else:
            print("Stop download.")
            exit(-1)

        ### preprocess
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"
        self.graphs = read_csv_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge)
        self.labels = pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                                  compression="gzip", header=None).values

        print('Saving...')
        torch.save({'graphs': self.graphs, 'labels': self.labels}, pre_processed_file_path)
def download(self):
    if decide_download(self.url):
        path = download_url(self.url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
    else:
        print('Stop download.')
        exit(-1)
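# The download-extract-cleanup idiom above recurs throughout this file. Below
# is a minimal standalone sketch, assuming the OGB helpers from ogb.utils.url;
# the function name fetch_and_extract and its arguments are illustrative, not
# part of the original code.
from ogb.utils.url import decide_download, download_url, extract_zip
import os

def fetch_and_extract(url, root):
    """Download a zip archive into `root`, extract it, and delete the archive."""
    if not decide_download(url):  # prompts the user before large downloads
        raise SystemExit('Stop download.')
    path = download_url(url, root)  # returns the local path of the downloaded zip
    extract_zip(path, root)
    os.unlink(path)  # remove the zip once its contents are extracted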
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        self.graph, label_dict = load_graphs(pre_processed_file_path)
        self.labels = label_dict['labels']
    else:
        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"

        if self.meta_info[self.name]["additional node files"] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[self.name]["additional node files"].split(',')

        if self.meta_info[self.name]["additional edge files"] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[self.name]["additional edge files"].split(',')

        graph = read_csv_graph_dgl(raw_dir,
                                   add_inverse_edge=add_inverse_edge,
                                   additional_node_files=additional_node_files,
                                   additional_edge_files=additional_edge_files)[0]

        ### add the prediction target
        node_label = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                 compression="gzip", header=None).values
        if "classification" in self.task_type:
            node_label = torch.tensor(node_label, dtype=torch.long)
        else:
            node_label = torch.tensor(node_label, dtype=torch.float32)

        label_dict = {"labels": node_label}

        print('Saving...')
        save_graphs(pre_processed_file_path, graph, label_dict)

        self.graph, label_dict = load_graphs(pre_processed_file_path)
        self.labels = label_dict['labels']
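# Many pre_process variants in this file repeat the same parsing of meta_info
# fields. A hedged sketch of those two rules; the helper names
# parse_additional_files and parse_bool are illustrative, not part of the
# original code.

def parse_additional_files(value):
    """'None' means no extra files; otherwise a comma-separated list of names."""
    return [] if value == 'None' else value.split(',')

def parse_bool(value):
    """Fields such as "add_inverse_edge" are stored as the strings 'True'/'False'."""
    return value == 'True'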
def download(self):
    if not osp.exists(self.dir):
        if decide_download(self.url):
            path = download_url(self.url, self.root)
            extract_zip(path, self.root)
            os.unlink(path)
        else:
            print('Stop download.')
            exit(-1)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        self.graph, _ = load_graphs(pre_processed_file_path)
    else:
        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"

        ### pre-process and save
        if self.meta_info[self.name]["additional node files"] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[self.name]["additional node files"].split(',')

        if self.meta_info[self.name]["additional edge files"] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[self.name]["additional edge files"].split(',')

        graph = read_csv_graph_dgl(raw_dir,
                                   add_inverse_edge=add_inverse_edge,
                                   additional_node_files=additional_node_files,
                                   additional_edge_files=additional_edge_files)[0]

        print('Saving...')
        save_graphs(pre_processed_file_path, graph, {})

        self.graph, _ = load_graphs(pre_processed_file_path)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        with open(pre_processed_file_path, 'rb') as f:
            loaded_dict = pickle.load(f)
        self.graph, self.labels = loaded_dict['graph'], loaded_dict['labels']
    else:
        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"
        graph = read_csv_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph

        ### add the prediction target
        node_label = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                 compression="gzip", header=None).values

        with open(pre_processed_file_path, 'wb') as f:
            pickle.dump({'graph': graph, 'labels': node_label}, f, protocol=4)

        ### load preprocessed files
        with open(pre_processed_file_path, 'rb') as f:
            loaded_dict = pickle.load(f)
        self.graph, self.labels = loaded_dict['graph'], loaded_dict['labels']
def download(self):
    url = self.meta_info['url']
    if decide_download(url):
        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        shutil.rmtree(self.root)
        shutil.move(osp.join(self.original_root, self.download_name), self.root)
    else:
        print('Stop downloading.')
        shutil.rmtree(self.root)
        exit(-1)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    raw_dir = osp.join(self.root, 'raw')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    ### download
    url = 'https://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/pcba.zip'
    if decide_download(url):
        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # delete the folder if it already exists
        try:
            shutil.rmtree(self.root)
        except:
            pass
        shutil.move(osp.join(self.original_root, self.download_name), self.root)
    else:
        print("Stop download.")
        exit(-1)

    ### preprocess
    add_inverse_edge = True
    additional_node_files = []
    additional_edge_files = []
    graphs = read_csv_graph_dgl(raw_dir,
                                add_inverse_edge=add_inverse_edge,
                                additional_node_files=additional_node_files,
                                additional_edge_files=additional_edge_files)

    labels = pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                         compression="gzip", header=None).values
    has_nan = np.isnan(labels).any()

    if "classification" in self.task_type:
        if has_nan:
            labels = torch.from_numpy(labels)
        else:
            labels = torch.from_numpy(labels).to(torch.long)
    else:
        labels = torch.from_numpy(labels)

    print('Not Saving...')
    # save_graphs(pre_processed_file_path, graphs, labels={'labels': labels})

    ### load preprocessed files
    self.graphs = graphs
    self.labels = labels
def process(self):
    url = self.meta_info[self.name]["pyg url"]
    if decide_download(url):
        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        shutil.rmtree(self.root)
        shutil.move(osp.join(self.original_root, self.download_name), self.root)
    else:
        print("Stop download.")
        shutil.rmtree(self.root)
        exit(-1)
def pre_process(self): """pre_process downlaoding data """ processed_dir = osp.join(self.root, 'processed') pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed') if osp.exists(pre_processed_file_path): # TODO: Reload Preprocess files pass else: ### check download if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")): url = self.meta_info[self.name]["url"] if decide_download(url): path = download_url(url, self.original_root) extract_zip(path, self.original_root) os.unlink(path) # delete folder if there exists try: shutil.rmtree(self.root) except: pass shutil.move( osp.join(self.original_root, self.download_name), self.root) else: print("Stop download.") exit(-1) raw_dir = osp.join(self.root, "raw") ### pre-process and save add_inverse_edge = to_bool(self.meta_info[self.name][ "add_inverse_edge"]) self.graph = read_csv_graph_pgl( raw_dir, add_inverse_edge=add_inverse_edge) ### adding prediction target node_label = pd.read_csv( osp.join(raw_dir, 'node-label.csv.gz'), compression="gzip", header=None).values if "classification" in self.task_type: node_label = np.array(node_label, dtype=np.int64) else: node_label = np.array(node_label, dtype=np.float32) label_dict = {"labels": node_label} # TODO: SAVE preprocess graph self.labels = label_dict['labels']
def download(self):
    if decide_download(self.url):
        path = download_url(self.url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        try:
            shutil.rmtree(self.folder)
        except:
            pass
        shutil.move(osp.join(self.original_root, self.download_name), self.folder)
    else:
        print('Stop download.')
        exit(-1)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        loaded_dict = torch.load(pre_processed_file_path)
        self.graph, self.labels = loaded_dict['graph'], loaded_dict['labels']
    else:
        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"

        if self.meta_info[self.name]["additional node files"] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[self.name]["additional node files"].split(',')

        if self.meta_info[self.name]["additional edge files"] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[self.name]["additional edge files"].split(',')

        self.graph = read_csv_graph_raw(raw_dir,
                                        add_inverse_edge=add_inverse_edge,
                                        additional_node_files=additional_node_files,
                                        additional_edge_files=additional_edge_files)[0]  # only a single graph

        ### add the prediction target
        self.labels = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                  compression="gzip", header=None).values

        print('Saving...')
        torch.save({'graph': self.graph, 'labels': self.labels}, pre_processed_file_path)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        self.graph, _ = load_graphs(pre_processed_file_path)
    else:
        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")

        file_names = ["edge"]
        if self.meta_info[self.name]["has_node_attr"] == "True":
            file_names.append("node-feat")
        if self.meta_info[self.name]["has_edge_attr"] == "True":
            file_names.append("edge-feat")
        raw_file_names = [file_name + ".csv.gz" for file_name in file_names]

        ### pre-process and save
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"
        graph = read_csv_graph_dgl(raw_dir, raw_file_names, add_inverse_edge=add_inverse_edge)

        save_graphs(pre_processed_file_path, graph, {})

        self.graph, _ = load_graphs(pre_processed_file_path)
def download(self):
    url = self.meta_info['url']
    if decide_download(url):
        path = download_url(url, self.original_root)
        # hack to keep the zip file: copy it aside before it is deleted
        shutil.copyfile(path, path + '.tmp')
        extract_zip(path, self.original_root)
        os.unlink(path)
        shutil.rmtree(self.root)
        shutil.move(osp.join(self.original_root, self.download_name), self.root)
        # finish the hack: restore the saved zip
        shutil.move(path + '.tmp', path)
    else:
        print('Stop downloading.')
        shutil.rmtree(self.root)
        exit(-1)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        with open(pre_processed_file_path, 'rb') as f:
            self.graph = pickle.load(f)
    else:
        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"
        graph = read_csv_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph

        with open(pre_processed_file_path, 'wb') as f:
            pickle.dump(graph, f, protocol=4)

        with open(pre_processed_file_path, 'rb') as f:
            self.graph = pickle.load(f)
def pre_process(self): """Pre-processing""" processed_dir = osp.join(self.root, 'processed') raw_dir = osp.join(self.root, 'raw') pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed') if os.path.exists(pre_processed_file_path): # TODO: Load Preprocessed pass else: ### download url = self.meta_info[self.name]["url"] if decide_download(url): path = download_url(url, self.original_root) extract_zip(path, self.original_root) os.unlink(path) # delete folder if there exists try: shutil.rmtree(self.root) except: pass shutil.move(osp.join(self.original_root, self.download_name), self.root) else: print("Stop download.") exit(-1) ### preprocess add_inverse_edge = to_bool( self.meta_info[self.name]["add_inverse_edge"]) self.graphs = read_csv_graph_pgl(raw_dir, add_inverse_edge=add_inverse_edge) self.graphs = np.array(self.graphs) self.labels = np.array( pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"), compression="gzip", header=None).values)
def pre_process(self): """pre_process downlaoding data """ processed_dir = osp.join(self.root, 'processed') pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed') if osp.exists(pre_processed_file_path): #TODO: Reload Preprocess files pass else: ### check download if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")): url = self.meta_info[self.name]["url"] if decide_download(url): path = download_url(url, self.original_root) extract_zip(path, self.original_root) os.unlink(path) # delete folder if there exists try: shutil.rmtree(self.root) except: pass shutil.move( osp.join(self.original_root, self.download_name), self.root) else: print("Stop download.") exit(-1) raw_dir = osp.join(self.root, "raw") ### pre-process and save add_inverse_edge = to_bool( self.meta_info[self.name]["add_inverse_edge"]) self.graph = read_csv_graph_pgl(raw_dir, add_inverse_edge=add_inverse_edge)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        # loaded_dict = torch.load(pre_processed_file_path)
        loaded_dict = load_pickle(pre_processed_file_path)
        self.graph, self.labels = loaded_dict['graph'], loaded_dict['labels']
    else:
        ### check download
        if self.binary:
            # npz format
            has_necessary_file_simple = osp.exists(
                osp.join(self.root, 'raw', 'data.npz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(
                osp.join(self.root, 'raw', 'edge_index_dict.npz')) and self.is_hetero
        else:
            # csv file
            has_necessary_file_simple = osp.exists(
                osp.join(self.root, 'raw', 'edge.csv.gz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(
                osp.join(self.root, 'raw', 'triplet-type-list.csv.gz')) and self.is_hetero

        has_necessary_file = has_necessary_file_simple or has_necessary_file_hetero

        if not has_necessary_file:
            url = self.meta_info['url']
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print('Stop download.')
                exit(-1)

        raw_dir = osp.join(self.root, 'raw')

        ### pre-process and save
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        if self.is_hetero:
            if self.binary:
                self.graph = read_binary_heterograph_raw(
                    raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph

                tmp = np.load(osp.join(raw_dir, 'node-label.npz'))
                self.labels = {}
                for key in list(tmp.keys()):
                    self.labels[key] = tmp[key]
                del tmp
            else:
                self.graph = read_csv_heterograph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)[0]  # only a single graph

                self.labels = read_node_label_hetero(raw_dir)
        else:
            if self.binary:
                self.graph = read_binary_graph_raw(
                    raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph
                self.labels = np.load(osp.join(raw_dir, 'node-label.npz'))['node_label']
            else:
                self.graph = read_csv_graph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)[0]  # only a single graph
                self.labels = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                          compression='gzip', header=None).values

        print('Saving...')
        # torch.save({'graph': self.graph, 'labels': self.labels}, pre_processed_file_path, pickle_protocol=4)
        dump_pickle({'graph': self.graph, 'labels': self.labels}, pre_processed_file_path)
def download(self):
    path = download_url(self.url, self.root)
    extract_zip(path, self.root)
    os.unlink(path)
    shutil.move(osp.join(self.root, 'pcqm4m_kddcup2021/raw/data.csv.gz'),
                osp.join(self.root, 'raw/data.csv.gz'))
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    raw_dir = osp.join(self.root, 'raw')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if self.task_type == 'subtoken prediction':
        target_sequence_file_path = osp.join(processed_dir, 'target_sequence')

    if os.path.exists(pre_processed_file_path):
        if self.task_type == 'subtoken prediction':
            self.graphs, _ = load_graphs(pre_processed_file_path)
            self.labels = torch.load(target_sequence_file_path)
        else:
            self.graphs, label_dict = load_graphs(pre_processed_file_path)
            self.labels = label_dict['labels']
    else:
        ### check download
        if self.binary:
            # npz format
            has_necessary_file = osp.exists(osp.join(self.root, 'raw', 'data.npz'))
        else:
            # csv file
            has_necessary_file = osp.exists(osp.join(self.root, 'raw', 'edge.csv.gz'))

        ### download
        if not has_necessary_file:
            url = self.meta_info['url']
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print('Stop download.')
                exit(-1)

        ### preprocess
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        graphs = read_graph_dgl(raw_dir,
                                add_inverse_edge=add_inverse_edge,
                                additional_node_files=additional_node_files,
                                additional_edge_files=additional_edge_files,
                                binary=self.binary)

        if self.task_type == 'subtoken prediction':
            # the downloaded labels are initially joined by ' '
            labels_joined = pd.read_csv(osp.join(raw_dir, 'graph-label.csv.gz'),
                                        compression='gzip', header=None).values
            # split each element into subtokens
            labels = [str(labels_joined[i][0]).split(' ') for i in range(len(labels_joined))]

            print('Saving...')
            save_graphs(pre_processed_file_path, graphs)
            torch.save(labels, target_sequence_file_path)

            ### load preprocessed files
            self.graphs, _ = load_graphs(pre_processed_file_path)
            self.labels = torch.load(target_sequence_file_path)
        else:
            if self.binary:
                labels = np.load(osp.join(raw_dir, 'graph-label.npz'))['graph_label']
            else:
                labels = pd.read_csv(osp.join(raw_dir, 'graph-label.csv.gz'),
                                     compression='gzip', header=None).values

            has_nan = np.isnan(labels).any()

            if 'classification' in self.task_type:
                if has_nan:
                    labels = torch.from_numpy(labels).to(torch.float32)
                else:
                    labels = torch.from_numpy(labels).to(torch.long)
            else:
                labels = torch.from_numpy(labels).to(torch.float32)

            print('Saving...')
            save_graphs(pre_processed_file_path, graphs, labels={'labels': labels})

            ### load preprocessed files
            self.graphs, label_dict = load_graphs(pre_processed_file_path)
            self.labels = label_dict['labels']
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        # self.graph = torch.load(pre_processed_file_path, 'rb')
        self.graph = load_pickle(pre_processed_file_path)
    else:
        ### check download
        if self.binary:
            # npz format
            has_necessary_file_simple = osp.exists(
                osp.join(self.root, 'raw', 'data.npz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(
                osp.join(self.root, 'raw', 'edge_index_dict.npz')) and self.is_hetero
        else:
            # csv file
            has_necessary_file_simple = osp.exists(
                osp.join(self.root, 'raw', 'edge.csv.gz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(
                osp.join(self.root, 'raw', 'triplet-type-list.csv.gz')) and self.is_hetero

        has_necessary_file = has_necessary_file_simple or has_necessary_file_hetero

        if not has_necessary_file:
            url = self.meta_info['url']
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print('Stop download.')
                exit(-1)

        raw_dir = osp.join(self.root, 'raw')

        ### pre-process and save
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        if self.is_hetero:
            if self.binary:
                self.graph = read_binary_heterograph_raw(
                    raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph
            else:
                self.graph = read_csv_heterograph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)[0]  # only a single graph
        else:
            if self.binary:
                self.graph = read_binary_graph_raw(
                    raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph
            else:
                self.graph = read_csv_graph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)[0]  # only a single graph

        print('Saving...')
        # torch.save(self.graph, pre_processed_file_path, pickle_protocol=4)
        dump_pickle(self.graph, pre_processed_file_path)
if __name__ == "__main__": ## example code # if not osp.exists('dataset/tox21'): # url = 'https://ogb.stanford.edu/data/graphproppred/csv_mol_download/tox21.zip' # path = download_url(url, 'dataset') # extract_zip(path, 'dataset') # os.unlink(path) # graph_list = read_csv_graph_raw('dataset/tox21/raw', add_inverse_edge = True) if not osp.exists('dataset/proteinfunc_v2'): url = 'https://ogb.stanford.edu/data/nodeproppred/proteinfunc_v2.zip' path = download_url(url, 'dataset') extract_zip(path, 'dataset') os.unlink(path) graph_list = read_csv_graph_raw('dataset/proteinfunc_v2/raw', add_inverse_edge=True) # if not osp.exists('dataset/ppassoc_v2'): # url = 'https://ogb.stanford.edu/data/linkproppred/ppassoc_v2.zip' # path = download_url(url, 'dataset') # extract_zip(path, 'dataset') # os.unlink(path) #graph_list = read_csv_graph_raw('dataset/ppassoc_v2/raw', add_inverse_edge = True) print(len(graph_list)) print(graph_list[0])
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        if not self.is_hetero:
            self.graph, label_dict = load_graphs(pre_processed_file_path)
            self.labels = label_dict['labels']
        else:
            with open(pre_processed_file_path, 'rb') as f:
                self.graph, self.labels = pickle.load(f)
    else:
        ### check if the downloaded file exists
        has_necessary_file_simple = osp.exists(
            osp.join(self.root, "raw", "edge.csv.gz")) and (not self.is_hetero)
        has_necessary_file_hetero = osp.exists(
            osp.join(self.root, "raw", "triplet-type-list.csv.gz")) and self.is_hetero

        has_necessary_file = has_necessary_file_simple or has_necessary_file_hetero

        if not has_necessary_file:
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"

        if self.meta_info[self.name]["additional node files"] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[self.name]["additional node files"].split(',')

        if self.meta_info[self.name]["additional edge files"] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[self.name]["additional edge files"].split(',')

        if self.is_hetero:
            graph = read_csv_heterograph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            label_dict = read_node_label_hetero(raw_dir)

            # convert into torch tensors
            if "classification" in self.task_type:
                for nodetype in label_dict.keys():
                    # fall back to float32 if there is any nan
                    node_label = label_dict[nodetype]
                    if np.isnan(node_label).any():
                        label_dict[nodetype] = torch.from_numpy(node_label).to(torch.float32)
                    else:
                        label_dict[nodetype] = torch.from_numpy(node_label).to(torch.long)
            else:
                for nodetype in label_dict.keys():
                    node_label = label_dict[nodetype]
                    label_dict[nodetype] = torch.from_numpy(node_label).to(torch.float32)

            with open(pre_processed_file_path, 'wb') as f:
                pickle.dump(([graph], label_dict), f)

            with open(pre_processed_file_path, 'rb') as f:
                self.graph, self.labels = pickle.load(f)
        else:
            graph = read_csv_graph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            ### add the prediction target
            node_label = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                     compression="gzip", header=None).values
            if "classification" in self.task_type:
                # fall back to float32 if there is any nan
                if np.isnan(node_label).any():
                    node_label = torch.from_numpy(node_label).to(torch.float32)
                else:
                    node_label = torch.from_numpy(node_label).to(torch.long)
            else:
                node_label = torch.from_numpy(node_label).to(torch.float32)

            label_dict = {"labels": node_label}

            save_graphs(pre_processed_file_path, graph, label_dict)

            self.graph, label_dict = load_graphs(pre_processed_file_path)
            self.labels = label_dict['labels']
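# The label-conversion rule used above (and in several other snippets here)
# boils down to: classification labels become int64 tensors unless NaNs
# (missing targets) force float32; regression labels are always float32.
# A minimal sketch; the helper name labels_to_tensor is illustrative, not
# part of the original code.
import numpy as np
import torch

def labels_to_tensor(node_label, task_type):
    if 'classification' in task_type and not np.isnan(node_label).any():
        return torch.from_numpy(node_label).to(torch.long)
    # NaN-containing classification labels and regression targets stay float
    return torch.from_numpy(node_label).to(torch.float32)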
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    raw_dir = osp.join(self.root, 'raw')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if os.path.exists(pre_processed_file_path):
        loaded_dict = torch.load(pre_processed_file_path)
        self.graphs, self.labels = loaded_dict['graphs'], loaded_dict['labels']
    else:
        ### check download
        if self.binary:
            # npz format
            has_necessary_file = osp.exists(osp.join(self.root, 'raw', 'data.npz'))
        else:
            # csv file
            has_necessary_file = osp.exists(osp.join(self.root, 'raw', 'edge.csv.gz'))

        ### download
        if not has_necessary_file:
            url = self.meta_info['url']
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print('Stop download.')
                exit(-1)

        ### preprocess
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        if self.binary:
            self.graphs = read_binary_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge)
        else:
            self.graphs = read_csv_graph_raw(raw_dir,
                                             add_inverse_edge=add_inverse_edge,
                                             additional_node_files=additional_node_files,
                                             additional_edge_files=additional_edge_files)

        if self.task_type == 'subtoken prediction':
            labels_joined = pd.read_csv(osp.join(raw_dir, 'graph-label.csv.gz'),
                                        compression='gzip', header=None).values
            # split each element into subtokens
            self.labels = [str(labels_joined[i][0]).split(' ') for i in range(len(labels_joined))]
        else:
            if self.binary:
                self.labels = np.load(osp.join(raw_dir, 'graph-label.npz'))['graph_label']
            else:
                self.labels = pd.read_csv(osp.join(raw_dir, 'graph-label.csv.gz'),
                                          compression='gzip', header=None).values

        print('Saving...')
        torch.save({'graphs': self.graphs, 'labels': self.labels},
                   pre_processed_file_path, pickle_protocol=4)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        self.graph = torch.load(pre_processed_file_path)
    else:
        ### check download
        has_necessary_file_simple = osp.exists(
            osp.join(self.root, "raw", "edge.csv.gz")) and (not self.is_hetero)
        has_necessary_file_hetero = osp.exists(
            osp.join(self.root, "raw", "triplet-type-list.csv.gz")) and self.is_hetero

        has_necessary_file = has_necessary_file_simple or has_necessary_file_hetero

        if not has_necessary_file:
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"

        if self.meta_info[self.name]["additional node files"] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[self.name]["additional node files"].split(',')

        if self.meta_info[self.name]["additional edge files"] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[self.name]["additional edge files"].split(',')

        if self.is_hetero:
            self.graph = read_csv_heterograph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]  # only a single graph
        else:
            self.graph = read_csv_graph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]  # only a single graph

        print('Saving...')
        torch.save(self.graph, pre_processed_file_path, pickle_protocol=4)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        self.graph, _ = load_graphs(pre_processed_file_path)
    else:
        ### check if the downloaded file exists
        if self.binary:
            # npz format
            has_necessary_file_simple = osp.exists(
                osp.join(self.root, 'raw', 'data.npz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(
                osp.join(self.root, 'raw', 'edge_index_dict.npz')) and self.is_hetero
        else:
            # csv file
            has_necessary_file_simple = osp.exists(
                osp.join(self.root, 'raw', 'edge.csv.gz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(
                osp.join(self.root, 'raw', 'triplet-type-list.csv.gz')) and self.is_hetero

        has_necessary_file = has_necessary_file_simple or has_necessary_file_hetero

        if not has_necessary_file:
            url = self.meta_info['url']
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print('Stop download.')
                exit(-1)

        raw_dir = osp.join(self.root, 'raw')

        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        ### pre-process and save
        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        if self.is_hetero:
            graph = read_heterograph_dgl(raw_dir,
                                         add_inverse_edge=add_inverse_edge,
                                         additional_node_files=additional_node_files,
                                         additional_edge_files=additional_edge_files,
                                         binary=self.binary)[0]
        else:
            graph = read_graph_dgl(raw_dir,
                                   add_inverse_edge=add_inverse_edge,
                                   additional_node_files=additional_node_files,
                                   additional_edge_files=additional_edge_files,
                                   binary=self.binary)[0]

        print('Saving...')
        save_graphs(pre_processed_file_path, graph, {})

        self.graph, _ = load_graphs(pre_processed_file_path)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    raw_dir = osp.join(self.root, 'raw')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if os.path.exists(pre_processed_file_path):
        loaded_dict = torch.load(pre_processed_file_path)
        self.graphs, self.labels = loaded_dict['graphs'], loaded_dict['labels']
    else:
        ### download
        url = self.meta_info[self.name]["url"]
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # delete the folder if it already exists
            try:
                shutil.rmtree(self.root)
            except:
                pass
            shutil.move(osp.join(self.original_root, self.download_name), self.root)
        else:
            print("Stop download.")
            exit(-1)

        ### preprocess
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"

        if self.meta_info[self.name]["additional node files"] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[self.name]["additional node files"].split(',')

        if self.meta_info[self.name]["additional edge files"] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[self.name]["additional edge files"].split(',')

        self.graphs = read_csv_graph_raw(raw_dir,
                                         add_inverse_edge=add_inverse_edge,
                                         additional_node_files=additional_node_files,
                                         additional_edge_files=additional_edge_files)

        if self.task_type == 'subtoken prediction':
            labels_joined = pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                                        compression="gzip", header=None).values
            # split each element into subtokens
            self.labels = [str(labels_joined[i][0]).split(' ') for i in range(len(labels_joined))]
        else:
            self.labels = pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                                      compression="gzip", header=None).values

        print('Saving...')
        torch.save({'graphs': self.graphs, 'labels': self.labels},
                   pre_processed_file_path, pickle_protocol=4)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    raw_dir = osp.join(self.root, 'raw')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if self.task_type == 'sequence prediction':
        target_sequence_file_path = osp.join(processed_dir, 'target_sequence')

    if os.path.exists(pre_processed_file_path):
        if self.task_type == "sequence prediction":
            self.graphs, _ = load_graphs(pre_processed_file_path)
            self.labels = torch.load(target_sequence_file_path)
        else:
            self.graphs, label_dict = load_graphs(pre_processed_file_path)
            self.labels = label_dict['labels']
    else:
        ### download
        url = self.meta_info[self.name]["url"]
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # delete the folder if it already exists
            try:
                shutil.rmtree(self.root)
            except:
                pass
            shutil.move(osp.join(self.original_root, self.download_name), self.root)
        else:
            print("Stop download.")
            exit(-1)

        ### preprocess
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"

        if self.meta_info[self.name]["additional node files"] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[self.name]["additional node files"].split(',')

        if self.meta_info[self.name]["additional edge files"] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[self.name]["additional edge files"].split(',')

        graphs = read_csv_graph_dgl(raw_dir,
                                    add_inverse_edge=add_inverse_edge,
                                    additional_node_files=additional_node_files,
                                    additional_edge_files=additional_edge_files)

        if self.task_type == "sequence prediction":
            # the downloaded labels are initially joined by ' '
            labels_joined = pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                                        compression="gzip", header=None).values
            # split each element into subtokens
            labels = [str(labels_joined[i][0]).split(' ') for i in range(len(labels_joined))]

            print('Saving...')
            save_graphs(pre_processed_file_path, graphs)
            torch.save(labels, target_sequence_file_path)

            ### load preprocessed files
            self.graphs, _ = load_graphs(pre_processed_file_path)
            self.labels = torch.load(target_sequence_file_path)
        else:
            labels = torch.tensor(
                pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                            compression="gzip", header=None).values)

            print('Saving...')
            save_graphs(pre_processed_file_path, graphs, labels={'labels': labels})

            ### load preprocessed files
            self.graphs, label_dict = load_graphs(pre_processed_file_path)
            self.labels = label_dict['labels']
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

    if osp.exists(pre_processed_file_path):
        if not self.is_hetero:
            self.graph, _ = load_graphs(pre_processed_file_path)
        else:
            with open(pre_processed_file_path, 'rb') as f:
                self.graph = pickle.load(f)
    else:
        ### check if the downloaded file exists
        has_necessary_file_simple = osp.exists(
            osp.join(self.root, "raw", "edge.csv.gz")) and (not self.is_hetero)
        has_necessary_file_hetero = osp.exists(
            osp.join(self.root, "raw", "triplet-type-list.csv.gz")) and self.is_hetero

        has_necessary_file = has_necessary_file_simple or has_necessary_file_hetero

        if not has_necessary_file:
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print("Stop download.")
                exit(-1)

        raw_dir = osp.join(self.root, "raw")

        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"

        ### pre-process and save
        if self.meta_info[self.name]["additional node files"] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info[self.name]["additional node files"].split(',')

        if self.meta_info[self.name]["additional edge files"] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info[self.name]["additional edge files"].split(',')

        if self.is_hetero:
            graph = read_csv_heterograph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            with open(pre_processed_file_path, 'wb') as f:
                pickle.dump([graph], f)

            with open(pre_processed_file_path, 'rb') as f:
                self.graph = pickle.load(f)
        else:
            graph = read_csv_graph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            print('Saving...')
            save_graphs(pre_processed_file_path, graph, {})

            self.graph, _ = load_graphs(pre_processed_file_path)
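# Nearly every snippet above re-implements the same "replace a stale dataset
# root with the freshly extracted folder" step. A sketch of how it could be
# factored out, under the same assumptions as the snippets (original_root,
# download_name, root); replace_root is an illustrative name, not part of
# the original code.
import shutil
import os.path as osp

def replace_root(original_root, download_name, root):
    # ignore_errors=True mirrors the try/except-pass around shutil.rmtree above
    shutil.rmtree(root, ignore_errors=True)
    shutil.move(osp.join(original_root, download_name), root)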