Пример #1
0
    def download(self, overwrite=False):
        """Make sure the four input files are present in ``self.output_directory``.

        The expected locations are always recorded on ``self``. If any of them
        is missing, ``download_url`` is called for ``self.dataset_url``, the
        resulting archive is passed to ``extract_file`` (``remove_input=False``)
        and the extracted files are renamed to the recorded locations.

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        out_dir = self.output_directory

        self.input_cites_edge_list_file = out_dir / Path("cites_edge_index.npy")
        self.input_splits_file = out_dir / Path("split_dict.pt")
        self.input_node_feature_file = out_dir / Path("node_feat.npy")
        self.input_node_label_file = out_dir / Path("node_label.npy")

        # (path inside the extracted archive, final location)
        relocations = (
            ("mag240m_kddcup2021/processed/paper___cites___paper/edge_index.npy", self.input_cites_edge_list_file),
            ("mag240m_kddcup2021/split_dict.pt", self.input_splits_file),
            ("mag240m_kddcup2021/processed/paper/node_feat.npy", self.input_node_feature_file),
            ("mag240m_kddcup2021/processed/paper/node_label.npy", self.input_node_label_file),
        )

        # Nothing to do when every target is already in place.
        if all(target.exists() for _, target in relocations):
            return

        archive = download_url(self.dataset_url, out_dir, overwrite)
        extract_file(archive, remove_input=False)

        for member, target in relocations:
            (out_dir / Path(member)).rename(target)
Пример #2
0
    def download(self, overwrite=False):
        """Make sure the train/validation edge files exist in ``self.output_directory``.

        Records the expected edge and feature file locations on ``self``. If
        any of the three edge files is missing, ``download_url`` and
        ``extract_file`` (``remove_input=True``) are called and every entry of
        ``wikikg90m-v2/processed/`` is moved up into the output directory.

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        out_dir = self.output_directory

        self.input_train_edges_file = out_dir / Path("train_hrt.npy")
        self.input_valid_edges_sr_file = out_dir / Path("val_hr.npy")
        self.input_valid_edges_d_file = out_dir / Path("val_t.npy")
        # No test edge file is recorded here; earlier revisions referenced
        # "test-dev_hr.npy" and "test-challenge_hr.npy".

        self.input_node_feature_file = out_dir / Path("entity_feat.npy")
        self.input_rel_feature_file = out_dir / Path("relation_feat.npy")

        # Only the edge files decide whether a download is needed; the two
        # feature files are recorded but not checked.
        required = (
            self.input_train_edges_file,
            self.input_valid_edges_sr_file,
            self.input_valid_edges_d_file,
        )
        if all(path.exists() for path in required):
            return

        archive = download_url(self.dataset_url, out_dir, overwrite)
        extract_file(archive, remove_input=True)

        processed_dir = out_dir / Path("wikikg90m-v2/processed/")
        for extracted in processed_dir.iterdir():
            extracted.rename(out_dir / Path(extracted.name))
Пример #3
0
    def download(self, overwrite=False):
        """Make sure the train/valid/test edge files exist in ``self.output_directory``.

        Records the three expected locations on ``self``. If any is missing,
        ``download_url`` and ``extract_file`` (``remove_input=True``) are
        called, every entry of the ``FB15k`` directory is moved up into the
        output directory, and the then-empty ``FB15k`` directory is removed.

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        out_dir = self.output_directory

        self.input_train_edges_file = out_dir / Path("freebase_mtr100_mte100-train.txt")
        self.input_valid_edges_file = out_dir / Path("freebase_mtr100_mte100-valid.txt")
        self.input_test_edges_file = out_dir / Path("freebase_mtr100_mte100-test.txt")

        required = (
            self.input_train_edges_file,
            self.input_valid_edges_file,
            self.input_test_edges_file,
        )
        if all(path.exists() for path in required):
            return

        archive = download_url(self.dataset_url, out_dir, overwrite)
        extract_file(archive, remove_input=True)

        extracted_dir = out_dir / Path("FB15k")
        for extracted in extracted_dir.iterdir():
            extracted.rename(out_dir / Path(extracted.name))

        # rmdir only succeeds on an empty directory, i.e. after the moves above.
        (out_dir / Path("FB15k")).rmdir()
Пример #4
0
    def download(self, overwrite=False):
        """Make sure ``twitter-2010.txt`` exists in ``self.output_directory``.

        Records the expected location as ``self.input_edges``. If the file is
        missing, ``download_url`` is called for ``self.dataset_url`` and the
        resulting archive is passed to ``extract_file`` (``remove_input=True``).

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        self.input_edges = self.output_directory / Path("twitter-2010.txt")

        # Already present: skip the download entirely.
        if self.input_edges.exists():
            return

        archive = download_url(self.dataset_url, self.output_directory, overwrite)
        extract_file(archive, remove_input=True)
Пример #5
0
    def download(self, overwrite=False):
        """Make sure ``soc-LiveJournal1.txt`` exists in ``self.output_directory``.

        Records the expected location as ``self.input_edges``. If the file is
        missing, ``download_url`` and ``extract_file`` (``remove_input=True``)
        are called, then ``strip_header`` is invoked on the edge file with
        ``num_lines=4``.

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        self.input_edges = self.output_directory / Path("soc-LiveJournal1.txt")

        # Already present: skip download, extraction and header stripping.
        if self.input_edges.exists():
            return

        archive = download_url(self.dataset_url, self.output_directory, overwrite)
        extract_file(archive, remove_input=True)
        strip_header(self.input_edges, num_lines=4)
Пример #6
0
    def download(self, overwrite=False):
        """Make sure the six CSV input files exist in ``self.output_directory``.

        Records the expected edge, feature, label and split file locations on
        ``self``. If any is missing, ``download_url`` and ``extract_file``
        (``remove_input=False``) are called, the three ``products/raw/*.csv.gz``
        files are passed to ``extract_file`` and the resulting CSVs are moved
        to the recorded locations, and the entries of
        ``products/split/sales_ranking`` are extracted and moved up into the
        output directory.

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        out_dir = self.output_directory

        self.input_edge_list_file = out_dir / Path("edge.csv")
        self.input_node_feature_file = out_dir / Path("node-feat.csv")
        self.input_node_label_file = out_dir / Path("node-label.csv")
        self.input_train_nodes_file = out_dir / Path("train.csv")
        self.input_valid_nodes_file = out_dir / Path("valid.csv")
        self.input_test_nodes_file = out_dir / Path("test.csv")

        expected = (
            self.input_edge_list_file,
            self.input_node_feature_file,
            self.input_node_label_file,
            self.input_train_nodes_file,
            self.input_valid_nodes_file,
            self.input_test_nodes_file,
        )
        if all(path.exists() for path in expected):
            return

        archive = download_url(self.dataset_url, out_dir, overwrite)
        extract_file(archive, remove_input=False)

        # (compressed member, extracted member, final location)
        raw_files = (
            ("products/raw/edge.csv.gz", "products/raw/edge.csv", self.input_edge_list_file),
            ("products/raw/node-feat.csv.gz", "products/raw/node-feat.csv", self.input_node_feature_file),
            ("products/raw/node-label.csv.gz", "products/raw/node-label.csv", self.input_node_label_file),
        )
        for compressed, _, _ in raw_files:
            extract_file(out_dir / Path(compressed))
        for _, extracted, target in raw_files:
            (out_dir / Path(extracted)).rename(target)

        # Two separate passes over the split directory: every entry is handed
        # to extract_file before anything is moved, and the second listing
        # picks up whatever the first pass left behind.
        for entry in (out_dir / Path("products/split/sales_ranking")).iterdir():
            extract_file(entry)

        for entry in (out_dir / Path("products/split/sales_ranking")).iterdir():
            entry.rename(out_dir / Path(entry.name))
Пример #7
0
    def download(self, overwrite=False):
        """Make sure ``edge.csv`` exists in ``self.output_directory``.

        Records the expected location as ``self.input_train_edges_file``. If
        the file is missing, ``download_url`` and ``extract_file``
        (``remove_input=False``) are called, ``arxiv/raw/edge.csv.gz`` is
        passed to ``extract_file`` and ``arxiv/raw/edge.csv`` is moved to the
        recorded location.

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        out_dir = self.output_directory
        self.input_train_edges_file = out_dir / Path("edge.csv")

        # Already present: nothing to download or unpack.
        if self.input_train_edges_file.exists():
            return

        archive = download_url(self.dataset_url, out_dir, overwrite)
        extract_file(archive, remove_input=False)

        extract_file(out_dir / Path("arxiv/raw/edge.csv.gz"))

        extracted_edges = out_dir / Path("arxiv/raw/edge.csv")
        extracted_edges.rename(self.input_train_edges_file)
Пример #8
0
    def download(self, overwrite=False):
        """Make sure the ``.npz`` data files and split CSVs exist in ``self.output_directory``.

        Records the expected locations on ``self``. The edge list and the node
        features share one file, ``data.npz``. If any file is missing,
        ``download_url`` and ``extract_file`` (``remove_input=False``) are
        called, the two ``papers100M-bin/raw`` files are moved to the recorded
        locations, and the entries of ``papers100M-bin/split/time`` are
        extracted and moved up into the output directory.

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        out_dir = self.output_directory

        shared_data_file = out_dir / Path("data.npz")
        self.input_edge_list_file = shared_data_file  # key: edge_index
        self.input_node_feature_file = shared_data_file  # key: node_feat
        self.input_node_label_file = out_dir / Path("node-label.npz")
        self.input_train_nodes_file = out_dir / Path("train.csv")
        self.input_valid_nodes_file = out_dir / Path("valid.csv")
        self.input_test_nodes_file = out_dir / Path("test.csv")

        expected = (
            self.input_edge_list_file,
            self.input_node_feature_file,
            self.input_node_label_file,
            self.input_train_nodes_file,
            self.input_valid_nodes_file,
            self.input_test_nodes_file,
        )
        if all(path.exists() for path in expected):
            return

        archive = download_url(self.dataset_url, out_dir, overwrite)
        extract_file(archive, remove_input=False)

        # data.npz is moved once; it serves as both edge list and features.
        raw_moves = (
            ("papers100M-bin/raw/data.npz", self.input_node_feature_file),
            ("papers100M-bin/raw/node-label.npz", self.input_node_label_file),
        )
        for member, target in raw_moves:
            (out_dir / Path(member)).rename(target)

        # Two separate passes over the split directory: every entry is handed
        # to extract_file before anything is moved.
        for entry in (out_dir / Path("papers100M-bin/split/time")).iterdir():
            extract_file(entry)

        for entry in (out_dir / Path("papers100M-bin/split/time")).iterdir():
            entry.rename(out_dir / Path(entry.name))
Пример #9
0
    def download(self, overwrite=False, remap_ids=True):
        """Make sure the train/valid/test ``.pt`` files exist in ``self.output_directory``.

        Records the three expected locations on ``self``. If any is missing,
        ``download_url`` and ``extract_file`` (``remove_input=False``) are
        called and every entry of ``ppassoc/split/throughput`` is moved up
        into the output directory.

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
            remap_ids: Accepted but not used by this method.
        """
        out_dir = self.output_directory

        self.input_train_edges_file = out_dir / Path("train.pt")
        self.input_valid_edges_file = out_dir / Path("valid.pt")
        self.input_test_edges_file = out_dir / Path("test.pt")

        required = (
            self.input_train_edges_file,
            self.input_valid_edges_file,
            self.input_test_edges_file,
        )
        if all(path.exists() for path in required):
            return

        archive = download_url(self.dataset_url, out_dir, overwrite)
        extract_file(archive, remove_input=False)

        split_dir = out_dir / Path("ppassoc/split/throughput")
        for entry in split_dir.iterdir():
            entry.rename(out_dir / Path(entry.name))
Пример #10
0
    def download(self, overwrite=False):
        """Download the Cora archive and convert it into the CSV input files.

        Records the six expected file locations on ``self``. If any is missing,
        ``download_url`` and ``extract_file`` (``remove_input=False``) are
        called, then ``cora/cora.content`` and ``cora/cora.cites`` are read
        from ``self.output_directory`` and converted into:

        * ``train.csv`` / ``valid.csv`` / ``test.csv``: a random 80/10/10
          split of the row indices of ``cora.content`` (shuffled with the
          global NumPy RNG),
        * ``node-feat.csv``: every column except the first and the last,
        * ``node-label.csv``: the last column mapped through ``switch_to_num``,
        * ``edge.csv``: ``cora.cites`` with node ids replaced by row indices.

        All outputs are written to the recorded ``self.input_*`` paths, so the
        files produced are exactly the ones the existence check looks for.
        (The previous version used an undefined ``dataset_dir`` name here.)

        Args:
            overwrite: Forwarded unchanged to ``download_url``.
        """
        out_dir = self.output_directory

        # These are the files we want to have by the end of the download
        self.input_edge_list_file = out_dir / Path("edge.csv")
        self.input_node_feature_file = out_dir / Path("node-feat.csv")
        self.input_node_label_file = out_dir / Path("node-label.csv")
        self.input_train_nodes_file = out_dir / Path("train.csv")
        self.input_valid_nodes_file = out_dir / Path("valid.csv")
        self.input_test_nodes_file = out_dir / Path("test.csv")

        # If every file already exists we don't need to do any processing
        expected = (
            self.input_edge_list_file,
            self.input_node_feature_file,
            self.input_node_label_file,
            self.input_train_nodes_file,
            self.input_valid_nodes_file,
            self.input_test_nodes_file,
        )
        if all(path.exists() for path in expected):
            return

        archive_path = download_url(self.dataset_url, out_dir, overwrite)
        extract_file(archive_path, remove_input=False)

        # Reading and processing the csv
        df = pd.read_csv(out_dir / Path("cora/cora.content"),
                         sep="\t",
                         header=None)
        num_nodes = len(df)
        id_col = df.columns[0]
        feature_cols = df.columns[1:-1]
        label_col = df.columns[-1]

        # Shuffle all row indices and cut them into 80% / 10% / remainder
        indices = np.arange(num_nodes)
        np.random.shuffle(indices)
        train_end = int(0.8 * num_nodes)
        valid_end = train_end + int(0.1 * num_nodes)
        splits = (
            (self.input_train_nodes_file, indices[:train_end]),
            (self.input_valid_nodes_file, indices[train_end:valid_end]),
            (self.input_test_nodes_file, indices[valid_end:]),
        )
        for split_path, split_indices in splits:
            np.savetxt(split_path, split_indices, delimiter=",", fmt="%d")

        # Features
        df[feature_cols].to_csv(index=False,
                                sep=",",
                                path_or_buf=self.input_node_feature_file,
                                header=False)

        # Labels
        labels = df[label_col].apply(switch_to_num)
        labels.to_csv(index=False,
                      sep=",",
                      path_or_buf=self.input_node_label_file,
                      header=False)

        # Edges: map each original node id to its row index in cora.content
        nodes_dict = {node_id: row for row, node_id in df[id_col].items()}
        df_edges = pd.read_csv(out_dir / Path("cora/cora.cites"),
                               sep="\t",
                               header=None)
        df_edges.replace({0: nodes_dict, 1: nodes_dict}, inplace=True)
        df_edges.to_csv(index=False,
                        sep=",",
                        path_or_buf=self.input_edge_list_file,
                        header=False)