示例#1
0
    def pre_process(self):
        """Load the pre-processed DGL graphs, downloading them first if needed.

        Looks for ``<root>/processed/dgl_data_processed``. When that file is
        missing, the archive at ``meta_info[name]["dgl url"]`` is downloaded
        and extracted into ``original_root`` and the extracted
        ``download_name`` folder is moved to ``root``. The file is then read
        with ``load_graphs`` to populate ``self.graphs``, ``self.labels`` and
        ``self.ids``.

        Calls ``exit(-1)`` when ``decide_download`` declines the download.
        """
        processed_dir = os.path.join(self.root, 'processed')
        pre_processed_file_path = os.path.join(processed_dir, 'dgl_data_processed')

        if not os.path.exists(pre_processed_file_path):
            url = self.meta_info[self.name]["dgl url"]
            if not decide_download(url):
                print("Stop download.")
                exit(-1)

            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Remove any existing dataset folder first; removal errors (e.g.
            # the folder is absent) are deliberately ignored.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(osp.join(self.original_root, self.download_name), self.root)

        # Both the cached and the freshly-downloaded case read the same file.
        self.graphs, label_dict = load_graphs(pre_processed_file_path)
        self.labels = label_dict['labels']
        self.ids = label_dict['ids']
示例#2
0
文件: dataset.py 项目: fagan2888/ogb
    def pre_process(self):
        """Load the cached graphs and labels, or download and build them.

        If ``<root>/processed/data_processed`` exists it is read with
        ``torch.load``. Otherwise the archive at ``meta_info[name]["url"]`` is
        downloaded and extracted, the raw CSV files under ``<root>/raw`` are
        parsed with ``read_csv_graph_raw``, labels are read from
        ``graph-label.csv.gz``, and the result is saved with ``torch.save``.

        Sets ``self.graphs`` and ``self.labels``. Calls ``exit(-1)`` when
        ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if os.path.exists(pre_processed_file_path):
            # torch.load's second positional parameter is map_location, not a
            # file mode, so the stray 'rb' argument is not passed.
            loaded_dict = torch.load(pre_processed_file_path)
            self.graphs, self.labels = loaded_dict['graphs'], loaded_dict['labels']
            return

        ### download
        url = self.meta_info[self.name]["url"]
        if not decide_download(url):
            print("Stop download.")
            exit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Remove any existing dataset folder first; removal errors (e.g. the
        # folder is absent) are deliberately ignored.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(osp.join(self.original_root, self.download_name), self.root)

        ### preprocess
        # The meta info stores the flag as the string "True"/"False".
        add_inverse_edge = self.meta_info[self.name]["add_inverse_edge"] == "True"
        self.graphs = read_csv_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge)
        self.labels = pd.read_csv(
            osp.join(raw_dir, "graph-label.csv.gz"),
            compression="gzip",
            header=None,
        ).values

        print('Saving...')
        torch.save({'graphs': self.graphs, 'labels': self.labels}, pre_processed_file_path)
示例#3
0
 def download(self):
     """Download and unpack the archive at ``self.url`` into ``original_root``.

     The downloaded zip file is deleted after extraction. Calls ``exit(-1)``
     when ``decide_download`` declines the download.
     """
     if not decide_download(self.url):
         print('Stop download.')
         exit(-1)

     archive = download_url(self.url, self.original_root)
     extract_zip(archive, self.original_root)
     os.unlink(archive)
示例#4
0
    def pre_process(self):
        """Load the DGL graph and node labels, building the cache if missing.

        When ``<root>/processed/dgl_data_processed`` does not exist, the raw
        data is downloaded if ``<root>/raw/edge.csv.gz`` is absent, parsed with
        ``read_csv_graph_dgl``, combined with the labels from
        ``node-label.csv.gz`` and written with ``save_graphs``. The file is
        then read back with ``load_graphs``.

        Sets ``self.graph`` and ``self.labels``. Calls ``exit(-1)`` when
        ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if not osp.exists(pre_processed_file_path):
            ### check download
            if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
                url = self.meta_info[self.name]["url"]
                if not decide_download(url):
                    print("Stop download.")
                    exit(-1)

                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # Remove any existing dataset folder first; removal errors
                # (e.g. the folder is absent) are deliberately ignored.
                shutil.rmtree(self.root, ignore_errors=True)
                shutil.move(osp.join(self.original_root, self.download_name), self.root)

            raw_dir = osp.join(self.root, "raw")

            ### pre-process and save
            meta = self.meta_info[self.name]
            # Flags and file lists are stored as strings in the meta info;
            # the literal 'None' marks an empty list.
            add_inverse_edge = meta["add_inverse_edge"] == "True"
            node_files = meta["additional node files"]
            additional_node_files = [] if node_files == 'None' else node_files.split(',')
            edge_files = meta["additional edge files"]
            additional_edge_files = [] if edge_files == 'None' else edge_files.split(',')

            # Only the first graph returned by the reader is used.
            graph = read_csv_graph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files,
            )[0]

            ### adding prediction target
            node_label = pd.read_csv(
                osp.join(raw_dir, 'node-label.csv.gz'),
                compression="gzip",
                header=None,
            ).values
            label_dtype = torch.long if "classification" in self.task_type else torch.float32
            node_label = torch.tensor(node_label, dtype=label_dtype)

            print('Saving...')
            save_graphs(pre_processed_file_path, graph, {"labels": node_label})

        # Both the cached and the freshly-built case read the same file.
        self.graph, label_dict = load_graphs(pre_processed_file_path)
        self.labels = label_dict['labels']
示例#5
0
 def download(self):
     """Download and unpack ``self.url`` into ``self.root`` if needed.

     Nothing happens when ``self.dir`` already exists. The downloaded zip
     file is deleted after extraction. Calls ``exit(-1)`` when
     ``decide_download`` declines the download.
     """
     if osp.exists(self.dir):
         return

     if not decide_download(self.url):
         print('Stop download.')
         exit(-1)

     archive = download_url(self.url, self.root)
     extract_zip(archive, self.root)
     os.unlink(archive)
示例#6
0
    def pre_process(self):
        """Load the DGL graph, building the pre-processed cache if missing.

        When ``<root>/processed/dgl_data_processed`` does not exist, the raw
        data is downloaded if ``<root>/raw/edge.csv.gz`` is absent, parsed with
        ``read_csv_graph_dgl`` and written with ``save_graphs`` (with an empty
        label dict). The file is then read back with ``load_graphs``.

        Sets ``self.graph``. Calls ``exit(-1)`` when ``decide_download``
        declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if not osp.exists(pre_processed_file_path):
            ### check download
            if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
                url = self.meta_info[self.name]["url"]
                if not decide_download(url):
                    print("Stop download.")
                    exit(-1)

                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # Remove any existing dataset folder first; removal errors
                # (e.g. the folder is absent) are deliberately ignored.
                shutil.rmtree(self.root, ignore_errors=True)
                shutil.move(
                    osp.join(self.original_root, self.download_name),
                    self.root)

            raw_dir = osp.join(self.root, "raw")

            ### pre-process and save
            meta = self.meta_info[self.name]
            # Flags and file lists are stored as strings in the meta info;
            # the literal 'None' marks an empty list.
            add_inverse_edge = meta["add_inverse_edge"] == "True"
            node_files = meta["additional node files"]
            additional_node_files = [] if node_files == 'None' else node_files.split(',')
            edge_files = meta["additional edge files"]
            additional_edge_files = [] if edge_files == 'None' else edge_files.split(',')

            # Only the first graph returned by the reader is used.
            graph = read_csv_graph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            print('Saving...')
            save_graphs(pre_processed_file_path, graph, {})

        # Both the cached and the freshly-built case read the same file.
        self.graph, _ = load_graphs(pre_processed_file_path)
示例#7
0
文件: dataset.py 项目: jzhou316/ogb
    def pre_process(self):
        """Load the pickled graph and node labels, building the cache if missing.

        When ``<root>/processed/data_processed`` does not exist, the raw data
        is downloaded if ``<root>/raw/edge.csv.gz`` is absent, parsed with
        ``read_csv_graph_raw``, combined with the labels from
        ``node-label.csv.gz`` and pickled (protocol 4). The file is then read
        back with ``pickle.load``.

        Sets ``self.graph`` and ``self.labels``. Calls ``exit(-1)`` when
        ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if not osp.exists(pre_processed_file_path):
            ### check download
            if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
                url = self.meta_info[self.name]["url"]
                if not decide_download(url):
                    print("Stop download.")
                    exit(-1)

                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # Remove any existing dataset folder first; removal errors
                # (e.g. the folder is absent) are deliberately ignored.
                shutil.rmtree(self.root, ignore_errors=True)
                shutil.move(
                    osp.join(self.original_root, self.download_name),
                    self.root)

            raw_dir = osp.join(self.root, "raw")

            ### pre-process and save
            # The meta info stores the flag as the string "True"/"False".
            add_inverse_edge = self.meta_info[
                self.name]["add_inverse_edge"] == "True"
            graph = read_csv_graph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge)[0]  # only a single graph

            ### adding prediction target
            node_label = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                     compression="gzip",
                                     header=None).values

            # Context manager guarantees the cache file is flushed and closed
            # before it is read back below.
            with open(pre_processed_file_path, 'wb') as cache_file:
                pickle.dump({'graph': graph, 'labels': node_label},
                            cache_file,
                            protocol=4)

        ### load preprocessed files
        with open(pre_processed_file_path, 'rb') as cache_file:
            loaded_dict = pickle.load(cache_file)
        self.graph, self.labels = loaded_dict['graph'], loaded_dict['labels']
示例#8
0
 def download(self):
     """Download the archive at ``meta_info['url']`` and install it at ``root``.

     The archive is extracted into ``original_root``, the zip file is
     deleted, the existing ``root`` folder is removed and the extracted
     ``download_name`` folder is moved into its place. When
     ``decide_download`` declines, ``root`` is removed and ``exit(-1)`` is
     called.
     """
     url = self.meta_info['url']

     if not decide_download(url):
         print('Stop downloading.')
         shutil.rmtree(self.root)
         exit(-1)

     archive = download_url(url, self.original_root)
     extract_zip(archive, self.original_root)
     os.unlink(archive)
     shutil.rmtree(self.root)
     extracted_dir = osp.join(self.original_root, self.download_name)
     shutil.move(extracted_dir, self.root)
示例#9
0
    def pre_process(self):
        """Download the PCBA CSV archive and build DGL graphs and labels.

        There is no cache check: the archive is downloaded on every call,
        extracted into ``original_root`` and moved to ``root``. Graphs are
        read from ``<root>/raw`` with ``read_csv_graph_dgl`` and labels from
        ``graph-label.csv.gz``. Nothing is written to disk (the
        ``save_graphs`` call is commented out).

        Sets ``self.graphs`` and ``self.labels``. Calls ``exit(-1)`` when
        ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        ### download
        url = 'https://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/pcba.zip'
        if not decide_download(url):
            print("Stop download.")
            exit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Remove any existing dataset folder first; removal errors (e.g. the
        # folder is absent) are deliberately ignored.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(osp.join(self.original_root, self.download_name),
                    self.root)

        ### preprocess
        graphs = read_csv_graph_dgl(
            raw_dir,
            add_inverse_edge=True,
            additional_node_files=[],
            additional_edge_files=[])

        labels = pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                             compression="gzip",
                             header=None).values

        has_nan = np.isnan(labels).any()
        labels = torch.from_numpy(labels)
        # Only NaN-free classification labels are cast to long; everything
        # else keeps the dtype produced by pandas.
        if "classification" in self.task_type and not has_nan:
            labels = labels.to(torch.long)

        print('Not Saving...')
        # save_graphs(pre_processed_file_path, graphs, labels={'labels': labels})

        self.graphs = graphs
        self.labels = labels
示例#10
0
 def process(self):
     """Download the archive at ``meta_info[name]["pyg url"]`` and install it.

     The archive is extracted into ``original_root``, the zip file is
     deleted, the existing ``root`` folder is removed and the extracted
     ``download_name`` folder is moved into its place. When
     ``decide_download`` declines, ``root`` is removed and ``exit(-1)`` is
     called.
     """
     url = self.meta_info[self.name]["pyg url"]

     if not decide_download(url):
         print("Stop download.")
         shutil.rmtree(self.root)
         exit(-1)

     archive = download_url(url, self.original_root)
     extract_zip(archive, self.original_root)
     os.unlink(archive)
     shutil.rmtree(self.root)
     extracted_dir = osp.join(self.original_root, self.download_name)
     shutil.move(extracted_dir, self.root)
示例#11
0
    def pre_process(self):
        """Download the raw data if needed and build the PGL graph and labels.

        When ``<root>/processed/pgl_data_processed`` already exists nothing is
        loaded yet (reloading is still a TODO), so ``self.graph`` and
        ``self.labels`` are not set by this method in that case. Otherwise the
        raw data is downloaded if ``<root>/raw/edge.csv.gz`` is absent, the
        graph is read with ``read_csv_graph_pgl`` and the labels are read from
        ``node-label.csv.gz``. The result is not saved to disk.

        Calls ``exit(-1)`` when ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

        if osp.exists(pre_processed_file_path):
            # TODO: Reload Preprocess files
            return

        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if not decide_download(url):
                print("Stop download.")
                exit(-1)

            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Remove any existing dataset folder first; removal errors (e.g.
            # the folder is absent) are deliberately ignored.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(
                osp.join(self.original_root, self.download_name),
                self.root)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        add_inverse_edge = to_bool(
            self.meta_info[self.name]["add_inverse_edge"])
        self.graph = read_csv_graph_pgl(
            raw_dir, add_inverse_edge=add_inverse_edge)

        ### adding prediction target
        node_label = pd.read_csv(
            osp.join(raw_dir, 'node-label.csv.gz'),
            compression="gzip",
            header=None).values
        label_dtype = np.int64 if "classification" in self.task_type else np.float32

        # TODO: SAVE preprocess graph
        self.labels = np.array(node_label, dtype=label_dtype)
示例#12
0
 def download(self):
     """Download the archive at ``self.url`` and install it at ``self.folder``.

     The archive is extracted into ``original_root``, the zip file is
     deleted, any existing ``folder`` is removed and the extracted
     ``download_name`` folder is moved into its place. Calls ``exit(-1)``
     when ``decide_download`` declines the download.
     """
     if not decide_download(self.url):
         print('Stop download.')
         exit(-1)

     path = download_url(self.url, self.original_root)
     extract_zip(path, self.original_root)
     os.unlink(path)
     # Remove any existing dataset folder first; removal errors (e.g. the
     # folder is absent) are deliberately ignored.
     shutil.rmtree(self.folder, ignore_errors=True)
     shutil.move(osp.join(self.original_root, self.download_name),
                 self.folder)
示例#13
0
    def pre_process(self):
        """Load the cached graph and node labels, or download and build them.

        If ``<root>/processed/data_processed`` exists it is read with
        ``torch.load``. Otherwise the raw data is downloaded if
        ``<root>/raw/edge.csv.gz`` is absent, parsed with
        ``read_csv_graph_raw``, combined with the labels from
        ``node-label.csv.gz`` and saved with ``torch.save``.

        Sets ``self.graph`` and ``self.labels``. Calls ``exit(-1)`` when
        ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if osp.exists(pre_processed_file_path):
            loaded_dict = torch.load(pre_processed_file_path)
            self.graph, self.labels = loaded_dict['graph'], loaded_dict['labels']
            return

        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if not decide_download(url):
                print("Stop download.")
                exit(-1)

            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Remove any existing dataset folder first; removal errors (e.g.
            # the folder is absent) are deliberately ignored.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(osp.join(self.original_root, self.download_name), self.root)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        meta = self.meta_info[self.name]
        # Flags and file lists are stored as strings in the meta info; the
        # literal 'None' marks an empty list.
        add_inverse_edge = meta["add_inverse_edge"] == "True"
        node_files = meta["additional node files"]
        additional_node_files = [] if node_files == 'None' else node_files.split(',')
        edge_files = meta["additional edge files"]
        additional_edge_files = [] if edge_files == 'None' else edge_files.split(',')

        self.graph = read_csv_graph_raw(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files,
        )[0]  # only a single graph

        ### adding prediction target
        self.labels = pd.read_csv(
            osp.join(raw_dir, 'node-label.csv.gz'),
            compression="gzip",
            header=None,
        ).values

        print('Saving...')
        torch.save({'graph': self.graph, 'labels': self.labels}, pre_processed_file_path)
示例#14
0
    def pre_process(self):
        """Load the DGL graph, building the pre-processed cache if missing.

        When ``<root>/processed/dgl_data_processed`` does not exist, the raw
        data is downloaded if ``<root>/raw/edge.csv.gz`` is absent. The list of
        raw files (``edge`` plus ``node-feat`` / ``edge-feat`` when the meta
        info flags them) is passed to ``read_csv_graph_dgl`` and the result is
        written with ``save_graphs``. The file is then read back with
        ``load_graphs``.

        Sets ``self.graph``. Calls ``exit(-1)`` when ``decide_download``
        declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if not osp.exists(pre_processed_file_path):
            ### check download
            if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
                url = self.meta_info[self.name]["url"]
                if not decide_download(url):
                    print("Stop download.")
                    exit(-1)

                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # Remove any existing dataset folder first; removal errors
                # (e.g. the folder is absent) are deliberately ignored.
                shutil.rmtree(self.root, ignore_errors=True)
                shutil.move(
                    osp.join(self.original_root, self.download_name),
                    self.root)

            raw_dir = osp.join(self.root, "raw")
            meta = self.meta_info[self.name]

            # Flags are stored as the strings "True"/"False" in the meta info.
            file_names = ["edge"]
            if meta["has_node_attr"] == "True":
                file_names.append("node-feat")
            if meta["has_edge_attr"] == "True":
                file_names.append("edge-feat")
            raw_file_names = [file_name + ".csv.gz" for file_name in file_names]

            ### pre-process and save
            add_inverse_edge = meta["add_inverse_edge"] == "True"
            graph = read_csv_graph_dgl(raw_dir,
                                       raw_file_names,
                                       add_inverse_edge=add_inverse_edge)

            save_graphs(pre_processed_file_path, graph, {})

        # Both the cached and the freshly-built case read the same file.
        self.graph, _ = load_graphs(pre_processed_file_path)
示例#15
0
    def download(self):
        """Download the archive at ``meta_info['url']``, install it, keep the zip.

        A temporary copy of the zip file is made before extraction so that it
        can be restored to its original path after the regular
        extract / delete / move sequence. When ``decide_download`` declines,
        ``root`` is removed and ``exit(-1)`` is called.
        """
        url = self.meta_info['url']

        if not decide_download(url):
            print('Stop downloading.')
            shutil.rmtree(self.root)
            exit(-1)

        archive = download_url(url, self.original_root)
        backup = archive + '.tmp'

        # hack to keep zip file: stash a copy before the original is deleted
        shutil.copyfile(archive, backup)
        extract_zip(archive, self.original_root)
        os.unlink(archive)
        shutil.rmtree(self.root)
        extracted_dir = osp.join(self.original_root, self.download_name)
        shutil.move(extracted_dir, self.root)
        # finish hack: put the stashed copy back at the original path
        shutil.move(backup, archive)
示例#16
0
    def pre_process(self):
        """Load the pickled graph, building the pre-processed cache if missing.

        When ``<root>/processed/data_processed`` does not exist, the raw data
        is downloaded if ``<root>/raw/edge.csv.gz`` is absent, parsed with
        ``read_csv_graph_raw`` and pickled (protocol 4). The file is then read
        back with ``pickle.load``.

        Sets ``self.graph``. Calls ``exit(-1)`` when ``decide_download``
        declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if not osp.exists(pre_processed_file_path):
            ### check download
            if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
                url = self.meta_info[self.name]["url"]
                if not decide_download(url):
                    print("Stop download.")
                    exit(-1)

                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # Remove any existing dataset folder first; removal errors
                # (e.g. the folder is absent) are deliberately ignored.
                shutil.rmtree(self.root, ignore_errors=True)
                shutil.move(
                    osp.join(self.original_root, self.download_name),
                    self.root)

            raw_dir = osp.join(self.root, "raw")

            ### pre-process and save
            # The meta info stores the flag as the string "True"/"False".
            add_inverse_edge = self.meta_info[
                self.name]["add_inverse_edge"] == "True"
            graph = read_csv_graph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge)[0]  # only a single graph

            # Context manager guarantees the cache file is flushed and closed
            # before it is read back below.
            with open(pre_processed_file_path, 'wb') as cache_file:
                pickle.dump(graph, cache_file, protocol=4)

        with open(pre_processed_file_path, 'rb') as cache_file:
            self.graph = pickle.load(cache_file)
示例#17
0
    def pre_process(self):
        """Download the raw data and build the PGL graphs and graph labels.

        When ``<root>/processed/pgl_data_processed`` already exists nothing is
        loaded yet (loading is still a TODO), so ``self.graphs`` and
        ``self.labels`` are not set by this method in that case. Otherwise the
        archive at ``meta_info[name]["url"]`` is downloaded and installed at
        ``root``, graphs are read with ``read_csv_graph_pgl`` and labels from
        ``graph-label.csv.gz``; both are stored as NumPy arrays.

        Calls ``exit(-1)`` when ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

        if os.path.exists(pre_processed_file_path):
            # TODO: Load Preprocessed
            return

        ### download
        url = self.meta_info[self.name]["url"]
        if not decide_download(url):
            print("Stop download.")
            exit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Remove any existing dataset folder first; removal errors (e.g. the
        # folder is absent) are deliberately ignored.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(osp.join(self.original_root, self.download_name),
                    self.root)

        ### preprocess
        add_inverse_edge = to_bool(
            self.meta_info[self.name]["add_inverse_edge"])
        graphs = read_csv_graph_pgl(raw_dir,
                                    add_inverse_edge=add_inverse_edge)
        self.graphs = np.array(graphs)
        self.labels = np.array(
            pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                        compression="gzip",
                        header=None).values)
示例#18
0
    def pre_process(self):
        """Download the raw data if needed and build the PGL graph.

        When ``<root>/processed/dgl_data_processed`` already exists nothing is
        loaded yet (reloading is still a TODO), so ``self.graph`` is not set by
        this method in that case. Otherwise the raw data is downloaded if
        ``<root>/raw/edge.csv.gz`` is absent and the graph is read with
        ``read_csv_graph_pgl``.

        Calls ``exit(-1)`` when ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if osp.exists(pre_processed_file_path):
            # TODO: Reload Preprocess files
            return

        ### check download
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            url = self.meta_info[self.name]["url"]
            if not decide_download(url):
                print("Stop download.")
                exit(-1)

            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Remove any existing dataset folder first; removal errors (e.g.
            # the folder is absent) are deliberately ignored.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(
                osp.join(self.original_root, self.download_name),
                self.root)

        raw_dir = osp.join(self.root, "raw")

        ### pre-process and save
        add_inverse_edge = to_bool(
            self.meta_info[self.name]["add_inverse_edge"])
        self.graph = read_csv_graph_pgl(raw_dir,
                                        add_inverse_edge=add_inverse_edge)
示例#19
0
    def pre_process(self):
        """Load the cached graph and node labels, or download and build them.

        If ``<root>/processed/data_processed`` exists it is read with
        ``load_pickle``. Otherwise the raw data is downloaded when the file
        required for the current combination of ``self.binary`` (npz vs. csv)
        and ``self.is_hetero`` is missing from ``<root>/raw``. The graph and
        labels are then read with the reader matching that combination and
        stored with ``dump_pickle``.

        Sets ``self.graph`` and ``self.labels``. Calls ``exit(-1)`` when
        ``decide_download`` declines the download.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if osp.exists(pre_processed_file_path):
            loaded_dict = load_pickle(pre_processed_file_path)
            self.graph, self.labels = loaded_dict['graph'], loaded_dict[
                'labels']
            return

        raw_dir = osp.join(self.root, 'raw')

        ### check download
        if self.binary:
            # npz format
            simple_file, hetero_file = 'data.npz', 'edge_index_dict.npz'
        else:
            # csv file
            simple_file, hetero_file = 'edge.csv.gz', 'triplet-type-list.csv.gz'
        # Only the file matching the graph kind counts as "already downloaded".
        necessary_file = hetero_file if self.is_hetero else simple_file

        if not osp.exists(osp.join(raw_dir, necessary_file)):
            url = self.meta_info['url']
            if not decide_download(url):
                print('Stop download.')
                exit(-1)

            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Remove any existing dataset folder first; removal errors (e.g.
            # the folder is absent) are deliberately ignored.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(
                osp.join(self.original_root, self.download_name),
                self.root)

        ### pre-process and save
        # Flags and file lists are stored as strings in the meta info; the
        # literal 'None' marks an empty list.
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'
        node_files = self.meta_info['additional node files']
        additional_node_files = [] if node_files == 'None' else node_files.split(',')
        edge_files = self.meta_info['additional edge files']
        additional_edge_files = [] if edge_files == 'None' else edge_files.split(',')

        if self.is_hetero:
            if self.binary:
                self.graph = read_binary_heterograph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge)[0]  # only a single graph

                # Copy every array out of the archive, then close it.
                with np.load(osp.join(raw_dir, 'node-label.npz')) as label_file:
                    self.labels = {
                        key: label_file[key] for key in label_file.keys()
                    }
            else:
                self.graph = read_csv_heterograph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files,
                )[0]  # only a single graph
                self.labels = read_node_label_hetero(raw_dir)
        else:
            if self.binary:
                self.graph = read_binary_graph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge)[0]  # only a single graph
                with np.load(osp.join(raw_dir, 'node-label.npz')) as label_file:
                    self.labels = label_file['node_label']
            else:
                self.graph = read_csv_graph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files,
                )[0]  # only a single graph
                self.labels = pd.read_csv(osp.join(raw_dir,
                                                   'node-label.csv.gz'),
                                          compression='gzip',
                                          header=None).values

        print('Saving...')
        dump_pickle({
            'graph': self.graph,
            'labels': self.labels
        }, pre_processed_file_path)
示例#20
0
 def download(self):
     """Download and unpack ``self.url`` into ``root`` and relocate the CSV.

     After extraction the zip file is deleted and
     ``pcqm4m_kddcup2021/raw/data.csv.gz`` is moved to ``raw/data.csv.gz``
     (both relative to ``root``).
     """
     archive = download_url(self.url, self.root)
     extract_zip(archive, self.root)
     os.unlink(archive)

     extracted_csv = osp.join(self.root, 'pcqm4m_kddcup2021/raw/data.csv.gz')
     target_csv = osp.join(self.root, 'raw/data.csv.gz')
     shutil.move(extracted_csv, target_csv)
示例#21
0
    def pre_process(self):
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if self.task_type == 'subtoken prediction':
            target_sequence_file_path = osp.join(processed_dir,
                                                 'target_sequence')

        if os.path.exists(pre_processed_file_path):

            if self.task_type == 'subtoken prediction':
                self.graphs, _ = load_graphs(pre_processed_file_path)
                self.labels = torch.load(target_sequence_file_path)

            else:
                self.graphs, label_dict = load_graphs(pre_processed_file_path)
                self.labels = label_dict['labels']

        else:
            ### check download
            if self.binary:
                # npz format
                has_necessary_file = osp.exists(
                    osp.join(self.root, 'raw', 'data.npz'))
            else:
                # csv file
                has_necessary_file = osp.exists(
                    osp.join(self.root, 'raw', 'edge.csv.gz'))

            ### download
            if not has_necessary_file:
                url = self.meta_info['url']
                if decide_download(url):
                    path = download_url(url, self.original_root)
                    extract_zip(path, self.original_root)
                    os.unlink(path)
                    # delete folder if there exists
                    try:
                        shutil.rmtree(self.root)
                    except:
                        pass
                    shutil.move(
                        osp.join(self.original_root, self.download_name),
                        self.root)
                else:
                    print('Stop download.')
                    exit(-1)

            ### preprocess
            add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

            if self.meta_info['additional node files'] == 'None':
                additional_node_files = []
            else:
                additional_node_files = self.meta_info[
                    'additional node files'].split(',')

            if self.meta_info['additional edge files'] == 'None':
                additional_edge_files = []
            else:
                additional_edge_files = self.meta_info[
                    'additional edge files'].split(',')

            graphs = read_graph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files,
                binary=self.binary)

            if self.task_type == 'subtoken prediction':
                # the downloaded labels are initially joined by ' '
                labels_joined = pd.read_csv(osp.join(raw_dir,
                                                     'graph-label.csv.gz'),
                                            compression='gzip',
                                            header=None).values
                # need to split each element into subtokens
                labels = [
                    str(labels_joined[i][0]).split(' ')
                    for i in range(len(labels_joined))
                ]

                print('Saving...')
                save_graphs(pre_processed_file_path, graphs)
                torch.save(labels, target_sequence_file_path)

                ### load preprocessed files
                self.graphs, _ = load_graphs(pre_processed_file_path)
                self.labels = torch.load(target_sequence_file_path)

            else:
                if self.binary:
                    labels = np.load(osp.join(
                        raw_dir, 'graph-label.npz'))['graph_label']
                else:
                    labels = pd.read_csv(osp.join(raw_dir,
                                                  'graph-label.csv.gz'),
                                         compression='gzip',
                                         header=None).values

                has_nan = np.isnan(labels).any()

                if 'classification' in self.task_type:
                    if has_nan:
                        labels = torch.from_numpy(labels).to(torch.float32)
                    else:
                        labels = torch.from_numpy(labels).to(torch.long)
                else:
                    labels = torch.from_numpy(labels).to(torch.float32)

                print('Saving...')
                save_graphs(pre_processed_file_path,
                            graphs,
                            labels={'labels': labels})

                ### load preprocessed files
                self.graphs, label_dict = load_graphs(pre_processed_file_path)
                self.labels = label_dict['labels']
示例#22
0
    def pre_process(self):
        """Populate ``self.graph``, building the on-disk cache when it is missing.

        If ``<root>/processed/data_processed`` exists it is loaded with
        ``load_pickle``. Otherwise the raw files under ``<root>/raw`` are
        parsed (after downloading and extracting the archive at
        ``self.meta_info['url']`` when the expected raw file is absent) with
        the reader matching ``self.binary`` / ``self.is_hetero``; the first
        graph returned is stored in ``self.graph`` and written with
        ``dump_pickle``.

        Raises:
            SystemExit: with code -1 when ``decide_download`` declines.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        # Fast path: a previous run already cached the parsed graph.
        if osp.exists(pre_processed_file_path):
            self.graph = load_pickle(pre_processed_file_path)
            return

        raw_dir = osp.join(self.root, 'raw')

        ### check download
        # One marker file per (storage format, graph kind) combination.
        if self.binary:
            marker = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
        else:
            marker = ('triplet-type-list.csv.gz'
                      if self.is_hetero else 'edge.csv.gz')

        if not osp.exists(osp.join(raw_dir, marker)):
            url = self.meta_info['url']
            if not decide_download(url):
                print('Stop download.')
                # The ``exit`` builtin is injected by the ``site`` module and
                # is not guaranteed to exist; SystemExit always is.
                raise SystemExit(-1)
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder. Only filesystem
            # errors are ignored, so Ctrl-C is no longer swallowed here.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(osp.join(self.original_root, self.download_name),
                        self.root)

        ### pre-process and save
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        # The meta table stores "no extra files" as the literal string 'None'.
        node_spec = self.meta_info['additional node files']
        additional_node_files = (
            [] if node_spec == 'None' else node_spec.split(','))

        edge_spec = self.meta_info['additional edge files']
        additional_edge_files = (
            [] if edge_spec == 'None' else edge_spec.split(','))

        if self.binary:
            reader = (read_binary_heterograph_raw
                      if self.is_hetero else read_binary_graph_raw)
            graph_list = reader(raw_dir, add_inverse_edge=add_inverse_edge)
        else:
            reader = (read_csv_heterograph_raw
                      if self.is_hetero else read_csv_graph_raw)
            graph_list = reader(raw_dir,
                                add_inverse_edge=add_inverse_edge,
                                additional_node_files=additional_node_files,
                                additional_edge_files=additional_edge_files)

        self.graph = graph_list[0]  # only a single graph

        print('Saving...')
        # Make sure the cache directory exists before writing into it
        # (no-op when the extracted archive already provides it).
        os.makedirs(processed_dir, exist_ok=True)
        dump_pickle(self.graph, pre_processed_file_path)
示例#23
0

if __name__ == "__main__":
    # Example / smoke test: fetch the proteinfunc_v2 node-property archive
    # when it is not on disk yet, parse its raw CSV files and print a summary.
    #
    # Other archives can be exercised the same way, e.g.
    #   https://ogb.stanford.edu/data/graphproppred/csv_mol_download/tox21.zip
    #       -> read_csv_graph_raw('dataset/tox21/raw', add_inverse_edge=True)
    #   https://ogb.stanford.edu/data/linkproppred/ppassoc_v2.zip
    #       -> read_csv_graph_raw('dataset/ppassoc_v2/raw', add_inverse_edge=True)
    dataset_missing = not osp.exists('dataset/proteinfunc_v2')

    if dataset_missing:
        archive = download_url(
            'https://ogb.stanford.edu/data/nodeproppred/proteinfunc_v2.zip',
            'dataset')
        extract_zip(archive, 'dataset')
        # The zip is no longer needed once its contents are extracted.
        os.unlink(archive)

    graph_list = read_csv_graph_raw('dataset/proteinfunc_v2/raw',
                                    add_inverse_edge=True)

    print(len(graph_list))
    print(graph_list[0])
示例#24
0
    def pre_process(self):
        """Populate ``self.graph`` and ``self.labels``, caching them on disk.

        The cache file is ``<root>/processed/dgl_data_processed``. For
        heterogeneous datasets (``self.is_hetero``) it is a pickle of
        ``([graph], label_dict)``; otherwise it is written and read with
        ``save_graphs`` / ``load_graphs`` and the labels live under the
        ``'labels'`` key. When the cache is missing the raw CSV files are
        parsed, downloading the archive first if the expected raw file is
        absent.

        Raises:
            SystemExit: with code -1 when ``decide_download`` declines.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if osp.exists(pre_processed_file_path):
            if self.is_hetero:
                # NOTE(review): pickle must only be used on cache files this
                # code wrote itself; never point it at untrusted data.
                with open(pre_processed_file_path, 'rb') as f:
                    self.graph, self.labels = pickle.load(f)
            else:
                self.graph, label_dict = load_graphs(pre_processed_file_path)
                self.labels = label_dict['labels']
            return

        raw_dir = osp.join(self.root, "raw")
        dataset_meta = self.meta_info[self.name]

        ### check if the downloaded file exists
        marker = ("triplet-type-list.csv.gz"
                  if self.is_hetero else "edge.csv.gz")
        if not osp.exists(osp.join(raw_dir, marker)):
            url = dataset_meta["url"]
            if not decide_download(url):
                print("Stop download.")
                # The ``exit`` builtin is injected by the ``site`` module and
                # is not guaranteed to exist; SystemExit always is.
                raise SystemExit(-1)
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder. Only filesystem
            # errors are ignored, so Ctrl-C is no longer swallowed here.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(osp.join(self.original_root, self.download_name),
                        self.root)

        ### pre-process and save
        add_inverse_edge = dataset_meta["add_inverse_edge"] == "True"

        # The meta table stores "no extra files" as the literal string 'None'.
        node_spec = dataset_meta["additional node files"]
        additional_node_files = (
            [] if node_spec == 'None' else node_spec.split(','))

        edge_spec = dataset_meta["additional edge files"]
        additional_edge_files = (
            [] if edge_spec == 'None' else edge_spec.split(','))

        is_classification = "classification" in self.task_type

        def to_label_tensor(values):
            # Classification labels become int64 unless they contain NaN,
            # which an integer tensor cannot represent; everything else is
            # stored as float32.
            if is_classification and not np.isnan(values).any():
                return torch.from_numpy(values).to(torch.long)
            return torch.from_numpy(values).to(torch.float32)

        # Make sure the cache directory exists before writing into it
        # (no-op when the extracted archive already provides it).
        os.makedirs(processed_dir, exist_ok=True)

        if self.is_hetero:
            graph = read_csv_heterograph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            label_dict = read_node_label_hetero(raw_dir)
            # convert each node type's labels into a torch tensor
            for nodetype in label_dict:
                label_dict[nodetype] = to_label_tensor(label_dict[nodetype])

            with open(pre_processed_file_path, 'wb') as f:
                pickle.dump(([graph], label_dict), f)

            # Reload so the in-memory state matches what a later cache hit
            # would produce.
            with open(pre_processed_file_path, 'rb') as f:
                self.graph, self.labels = pickle.load(f)

        else:
            graph = read_csv_graph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            ### adding prediction target
            node_label = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                     compression="gzip",
                                     header=None).values

            label_dict = {"labels": to_label_tensor(node_label)}

            save_graphs(pre_processed_file_path, graph, label_dict)

            self.graph, label_dict = load_graphs(pre_processed_file_path)
            self.labels = label_dict['labels']
示例#25
0
    def pre_process(self):
        """Populate ``self.graphs`` and ``self.labels``, caching them on disk.

        The cache ``<root>/processed/data_processed`` is a ``torch.save``d
        dict with keys ``'graphs'`` and ``'labels'``. When it is missing the
        raw files under ``<root>/raw`` are parsed (npz when ``self.binary``,
        gzipped CSV otherwise), downloading the archive at
        ``self.meta_info['url']`` first if the expected raw file is absent.
        For ``'subtoken prediction'`` tasks each label is split on spaces
        into a list of sub-tokens.

        Raises:
            SystemExit: with code -1 when ``decide_download`` declines.
        """
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if os.path.exists(pre_processed_file_path):
            # torch.load's second positional parameter is ``map_location``,
            # not a file mode, so the former 'rb' argument was dropped.
            loaded_dict = torch.load(pre_processed_file_path)
            self.graphs = loaded_dict['graphs']
            self.labels = loaded_dict['labels']
            return

        ### check download
        marker = 'data.npz' if self.binary else 'edge.csv.gz'

        ### download
        if not osp.exists(osp.join(raw_dir, marker)):
            url = self.meta_info['url']
            if not decide_download(url):
                print('Stop download.')
                # The ``exit`` builtin is injected by the ``site`` module and
                # is not guaranteed to exist; SystemExit always is.
                raise SystemExit(-1)
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder. Only filesystem
            # errors are ignored, so Ctrl-C is no longer swallowed here.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(osp.join(self.original_root, self.download_name),
                        self.root)

        ### preprocess
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        # The meta table stores "no extra files" as the literal string 'None'.
        node_spec = self.meta_info['additional node files']
        additional_node_files = (
            [] if node_spec == 'None' else node_spec.split(','))

        edge_spec = self.meta_info['additional edge files']
        additional_edge_files = (
            [] if edge_spec == 'None' else edge_spec.split(','))

        if self.binary:
            self.graphs = read_binary_graph_raw(
                raw_dir, add_inverse_edge=add_inverse_edge)
        else:
            self.graphs = read_csv_graph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)

        csv_label_path = osp.join(raw_dir, 'graph-label.csv.gz')
        if self.task_type == 'subtoken prediction':
            labels_joined = pd.read_csv(csv_label_path,
                                        compression='gzip',
                                        header=None).values
            # need to split each element into subtokens
            self.labels = [str(row[0]).split(' ') for row in labels_joined]
        elif self.binary:
            self.labels = np.load(osp.join(
                raw_dir, 'graph-label.npz'))['graph_label']
        else:
            self.labels = pd.read_csv(csv_label_path,
                                      compression='gzip',
                                      header=None).values

        print('Saving...')
        # torch.save does not create missing parent directories.
        os.makedirs(processed_dir, exist_ok=True)
        torch.save({'graphs': self.graphs, 'labels': self.labels},
                   pre_processed_file_path,
                   pickle_protocol=4)
示例#26
0
文件: dataset.py 项目: yzh119/ogb
    def pre_process(self):
        """Populate ``self.graph``, building the ``torch.save`` cache if needed.

        If ``<root>/processed/data_processed`` exists it is loaded with
        ``torch.load``. Otherwise the raw CSV files under ``<root>/raw`` are
        parsed (after downloading the archive at
        ``self.meta_info[self.name]['url']`` when the expected raw file is
        absent) and the first graph returned by the reader is stored and
        saved.

        Raises:
            SystemExit: with code -1 when ``decide_download`` declines.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if osp.exists(pre_processed_file_path):
            # torch.load's second positional parameter is ``map_location``,
            # not a file mode, so the former 'rb' argument was dropped.
            self.graph = torch.load(pre_processed_file_path)
            return

        raw_dir = osp.join(self.root, "raw")
        dataset_meta = self.meta_info[self.name]

        ### check download
        marker = ("triplet-type-list.csv.gz"
                  if self.is_hetero else "edge.csv.gz")
        if not osp.exists(osp.join(raw_dir, marker)):
            url = dataset_meta["url"]
            if not decide_download(url):
                print("Stop download.")
                # The ``exit`` builtin is injected by the ``site`` module and
                # is not guaranteed to exist; SystemExit always is.
                raise SystemExit(-1)
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder. Only filesystem
            # errors are ignored, so Ctrl-C is no longer swallowed here.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(osp.join(self.original_root, self.download_name),
                        self.root)

        ### pre-process and save
        add_inverse_edge = dataset_meta["add_inverse_edge"] == "True"

        # The meta table stores "no extra files" as the literal string 'None'.
        node_spec = dataset_meta["additional node files"]
        additional_node_files = (
            [] if node_spec == 'None' else node_spec.split(','))

        edge_spec = dataset_meta["additional edge files"]
        additional_edge_files = (
            [] if edge_spec == 'None' else edge_spec.split(','))

        reader = (read_csv_heterograph_raw
                  if self.is_hetero else read_csv_graph_raw)
        self.graph = reader(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)[0]  # single graph

        print('Saving...')
        # torch.save does not create missing parent directories.
        os.makedirs(processed_dir, exist_ok=True)
        torch.save(self.graph, pre_processed_file_path, pickle_protocol=4)
示例#27
0
    def pre_process(self):
        """Populate ``self.graph``, building the DGL graph cache if needed.

        The cache ``<root>/processed/dgl_data_processed`` is read with
        ``load_graphs``. When it is missing the raw files under
        ``<root>/raw`` are parsed with ``read_heterograph_dgl`` or
        ``read_graph_dgl`` (chosen by ``self.is_hetero``), downloading the
        archive at ``self.meta_info['url']`` first if the expected raw file
        is absent. The first graph is saved with ``save_graphs`` and then
        reloaded.

        Raises:
            SystemExit: with code -1 when ``decide_download`` declines.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if osp.exists(pre_processed_file_path):
            self.graph, _ = load_graphs(pre_processed_file_path)
            return

        raw_dir = osp.join(self.root, 'raw')

        ### check if the downloaded file exists
        # One marker file per (storage format, graph kind) combination.
        if self.binary:
            marker = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
        else:
            marker = ('triplet-type-list.csv.gz'
                      if self.is_hetero else 'edge.csv.gz')

        if not osp.exists(osp.join(raw_dir, marker)):
            url = self.meta_info['url']
            if not decide_download(url):
                print('Stop download.')
                # The ``exit`` builtin is injected by the ``site`` module and
                # is not guaranteed to exist; SystemExit always is.
                raise SystemExit(-1)
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder. Only filesystem
            # errors are ignored, so Ctrl-C is no longer swallowed here.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(osp.join(self.original_root, self.download_name),
                        self.root)

        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        ### pre-process and save
        # The meta table stores "no extra files" as the literal string 'None'.
        node_spec = self.meta_info['additional node files']
        additional_node_files = (
            [] if node_spec == 'None' else node_spec.split(','))

        edge_spec = self.meta_info['additional edge files']
        additional_edge_files = (
            [] if edge_spec == 'None' else edge_spec.split(','))

        reader = read_heterograph_dgl if self.is_hetero else read_graph_dgl
        graph = reader(raw_dir,
                       add_inverse_edge=add_inverse_edge,
                       additional_node_files=additional_node_files,
                       additional_edge_files=additional_edge_files,
                       binary=self.binary)[0]

        print('Saving...')
        # Make sure the cache directory exists before writing into it
        # (no-op when the extracted archive already provides it).
        os.makedirs(processed_dir, exist_ok=True)
        save_graphs(pre_processed_file_path, graph, {})

        # Reload so the in-memory state matches what a later cache hit
        # would produce.
        self.graph, _ = load_graphs(pre_processed_file_path)
示例#28
0
    def pre_process(self):
        """Populate ``self.graphs`` and ``self.labels``, caching them on disk.

        The cache ``<root>/processed/data_processed`` is a ``torch.save``d
        dict with keys ``'graphs'`` and ``'labels'``. When it is missing the
        archive at ``self.meta_info[self.name]['url']`` is always downloaded
        (this variant does not check for existing raw files), the raw CSV
        files are parsed and the result is saved. For
        ``'subtoken prediction'`` tasks each label is split on spaces into a
        list of sub-tokens.

        Raises:
            SystemExit: with code -1 when ``decide_download`` declines.
        """
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if os.path.exists(pre_processed_file_path):
            # torch.load's second positional parameter is ``map_location``,
            # not a file mode, so the former 'rb' argument was dropped.
            loaded_dict = torch.load(pre_processed_file_path)
            self.graphs = loaded_dict['graphs']
            self.labels = loaded_dict['labels']
            return

        dataset_meta = self.meta_info[self.name]

        ### download
        url = dataset_meta["url"]
        if not decide_download(url):
            print("Stop download.")
            # The ``exit`` builtin is injected by the ``site`` module and is
            # not guaranteed to exist; SystemExit always is.
            raise SystemExit(-1)
        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Best-effort removal of a stale dataset folder. Only filesystem
        # errors are ignored, so Ctrl-C is no longer swallowed here.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(osp.join(self.original_root, self.download_name),
                    self.root)

        ### preprocess
        add_inverse_edge = dataset_meta["add_inverse_edge"] == "True"

        # The meta table stores "no extra files" as the literal string 'None'.
        node_spec = dataset_meta["additional node files"]
        additional_node_files = (
            [] if node_spec == 'None' else node_spec.split(','))

        edge_spec = dataset_meta["additional edge files"]
        additional_edge_files = (
            [] if edge_spec == 'None' else edge_spec.split(','))

        self.graphs = read_csv_graph_raw(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)

        raw_labels = pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                                 compression="gzip",
                                 header=None).values
        if self.task_type == 'subtoken prediction':
            # need to split each element into subtokens
            self.labels = [str(row[0]).split(' ') for row in raw_labels]
        else:
            self.labels = raw_labels

        print('Saving...')
        # torch.save does not create missing parent directories.
        os.makedirs(processed_dir, exist_ok=True)
        torch.save({'graphs': self.graphs, 'labels': self.labels},
                   pre_processed_file_path,
                   pickle_protocol=4)
示例#29
0
    def pre_process(self):
        """Populate ``self.graphs`` and ``self.labels``, caching them on disk.

        Graphs are cached in ``<root>/processed/dgl_data_processed`` via
        ``save_graphs`` / ``load_graphs``. For ``'sequence prediction'``
        tasks the labels (lists of space-separated sub-tokens) are stored
        separately in ``<root>/processed/target_sequence`` with
        ``torch.save``; for every other task they are stored alongside the
        graphs under the ``'labels'`` key. When the graph cache is missing
        the archive at ``self.meta_info[self.name]['url']`` is always
        downloaded (this variant does not check for existing raw files).

        Raises:
            SystemExit: with code -1 when ``decide_download`` declines.
        """
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')
        # Only used for sequence prediction; computing the path is harmless
        # for other task types.
        target_sequence_file_path = osp.join(processed_dir, 'target_sequence')
        is_sequence_task = self.task_type == 'sequence prediction'

        if os.path.exists(pre_processed_file_path):
            if is_sequence_task:
                self.graphs, _ = load_graphs(pre_processed_file_path)
                self.labels = torch.load(target_sequence_file_path)
            else:
                self.graphs, label_dict = load_graphs(pre_processed_file_path)
                self.labels = label_dict['labels']
            return

        dataset_meta = self.meta_info[self.name]

        ### download
        url = dataset_meta["url"]
        if not decide_download(url):
            print("Stop download.")
            # The ``exit`` builtin is injected by the ``site`` module and is
            # not guaranteed to exist; SystemExit always is.
            raise SystemExit(-1)
        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Best-effort removal of a stale dataset folder. Only filesystem
        # errors are ignored, so Ctrl-C is no longer swallowed here.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(osp.join(self.original_root, self.download_name),
                    self.root)

        ### preprocess
        add_inverse_edge = dataset_meta["add_inverse_edge"] == "True"

        # The meta table stores "no extra files" as the literal string 'None'.
        node_spec = dataset_meta["additional node files"]
        additional_node_files = (
            [] if node_spec == 'None' else node_spec.split(','))

        edge_spec = dataset_meta["additional edge files"]
        additional_edge_files = (
            [] if edge_spec == 'None' else edge_spec.split(','))

        graphs = read_csv_graph_dgl(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)

        raw_labels = pd.read_csv(osp.join(raw_dir, "graph-label.csv.gz"),
                                 compression="gzip",
                                 header=None).values

        # Make sure the cache directory exists before writing into it
        # (no-op when the extracted archive already provides it).
        os.makedirs(processed_dir, exist_ok=True)

        if is_sequence_task:
            # the downloaded labels are initially joined by ' ';
            # split each element into subtokens
            labels = [str(row[0]).split(' ') for row in raw_labels]

            print('Saving...')
            save_graphs(pre_processed_file_path, graphs)
            torch.save(labels, target_sequence_file_path)

            ### load preprocessed files
            self.graphs, _ = load_graphs(pre_processed_file_path)
            self.labels = torch.load(target_sequence_file_path)

        else:
            labels = torch.tensor(raw_labels)

            print('Saving...')
            save_graphs(pre_processed_file_path,
                        graphs,
                        labels={'labels': labels})

            ### load preprocessed files
            self.graphs, label_dict = load_graphs(pre_processed_file_path)
            self.labels = label_dict['labels']
示例#30
0
    def pre_process(self):
        """Populate ``self.graph``, building the on-disk cache if needed.

        The cache file is ``<root>/processed/dgl_data_processed``. For
        heterogeneous datasets (``self.is_hetero``) it is a pickle of
        ``[graph]``; otherwise it is written and read with ``save_graphs`` /
        ``load_graphs``. When the cache is missing the raw CSV files under
        ``<root>/raw`` are parsed, downloading the archive at
        ``self.meta_info[self.name]['url']`` first if the expected raw file
        is absent.

        Raises:
            SystemExit: with code -1 when ``decide_download`` declines.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if osp.exists(pre_processed_file_path):
            if self.is_hetero:
                # NOTE(review): pickle must only be used on cache files this
                # code wrote itself; never point it at untrusted data.
                with open(pre_processed_file_path, 'rb') as f:
                    self.graph = pickle.load(f)
            else:
                self.graph, _ = load_graphs(pre_processed_file_path)
            return

        raw_dir = osp.join(self.root, "raw")
        dataset_meta = self.meta_info[self.name]

        ### check if the downloaded file exists
        marker = ("triplet-type-list.csv.gz"
                  if self.is_hetero else "edge.csv.gz")
        if not osp.exists(osp.join(raw_dir, marker)):
            url = dataset_meta["url"]
            if not decide_download(url):
                print("Stop download.")
                # The ``exit`` builtin is injected by the ``site`` module and
                # is not guaranteed to exist; SystemExit always is.
                raise SystemExit(-1)
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder. Only filesystem
            # errors are ignored, so Ctrl-C is no longer swallowed here.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(osp.join(self.original_root, self.download_name),
                        self.root)

        add_inverse_edge = dataset_meta["add_inverse_edge"] == "True"

        ### pre-process and save
        # The meta table stores "no extra files" as the literal string 'None'.
        node_spec = dataset_meta["additional node files"]
        additional_node_files = (
            [] if node_spec == 'None' else node_spec.split(','))

        edge_spec = dataset_meta["additional edge files"]
        additional_edge_files = (
            [] if edge_spec == 'None' else edge_spec.split(','))

        # Make sure the cache directory exists before writing into it
        # (no-op when the extracted archive already provides it).
        os.makedirs(processed_dir, exist_ok=True)

        if self.is_hetero:
            graph = read_csv_heterograph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            with open(pre_processed_file_path, 'wb') as f:
                pickle.dump([graph], f)

            # Reload so the in-memory state matches what a later cache hit
            # would produce.
            with open(pre_processed_file_path, 'rb') as f:
                self.graph = pickle.load(f)

        else:
            graph = read_csv_graph_dgl(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

            print('Saving...')
            save_graphs(pre_processed_file_path, graph, {})

            self.graph, _ = load_graphs(pre_processed_file_path)