示例#1
0
    def load_random_split(self,
                          test_rate=0.1,
                          random=False,
                          n_negative=100,
                          by_user=False,
                          n_test=10):
        """Load the split data generated by random_split.

        The split is looked up under ``<self.processed_path>/random/`` in a
        sub-directory whose name is derived from the parameters. If that
        directory does not exist yet, the split is either downloaded from
        OneDrive (only for the default parameters test_rate=0.1, random=False,
        n_negative=100 and by_user=False) or generated locally with
        ``make_random_split``.

        Args:
            test_rate: percentage of the test data. Note that percentage of the validation data will be the same as
                        test data.
            random: bool. Whether randomly leave one basket as testing.
            n_negative:  Number of negative samples for testing and validation data.
            by_user: bool. Default False.
                    - True: user-based split,
                    - False: global split,
            n_test: int. Default 10. The number of testing and validation copies.
                    If n_test==0, will load the original (no negative items) valid and test datasets.

        Returns:
            train_data (DataFrame): Interaction for training.
            valid_data list(DataFrame): List of interactions for validation
            test_data list(DataFrame): List of interactions for testing
        """
        download_path = os.path.join(self.processed_path, "random")
        # makedirs(exist_ok=True) also creates missing parent directories and
        # does not fail when the directory already exists (unlike the
        # check-then-mkdir pattern, which breaks if the parent is missing or
        # the directory appears between the check and the call).
        os.makedirs(download_path, exist_ok=True)

        parameterized_path = generate_parameterized_path(test_rate=test_rate,
                                                         random=random,
                                                         n_negative=n_negative,
                                                         by_user=by_user)
        processed_random_split_path = os.path.join(download_path,
                                                   parameterized_path)
        if not os.path.exists(processed_random_split_path):
            if (test_rate == 0.1 and random is False and n_negative == 100
                    and by_user is False):
                # default parameters, can be downloaded from Onedrive
                folder = OneDrive(url=self.processed_random_split_url,
                                  path=download_path)
                folder.download()
                # the archive is expected next to the target directory,
                # named "<parameterized_path>.zip"
                un_zip(processed_random_split_path + ".zip", download_path)
            else:
                # non-default parameters: build the split locally
                self.make_random_split(
                    test_rate=test_rate,
                    random=random,
                    n_negative=n_negative,
                    by_user=by_user,
                    n_test=n_test,
                )

        # load data from local storage
        return load_split_data(processed_random_split_path, n_test=n_test)
示例#2
0
def download_file_from_onedrive(url, path):
    """Fetch a shared OneDrive item and store it locally.

    Builds a OneDrive handle for the shared link and triggers its download.

    Args:
        url: the shared link generated by OneDrive.
        path: the local path where the downloaded content is stored.
    """
    OneDrive(url=url, path=path).download()
示例#3
0
    def load_leave_one_out(self,
                           random=False,
                           n_negative=100,
                           n_test=10,
                           download=False):
        """Load the split data generated by leave_one_out.

        The split is looked up under ``<self.processed_path>/leave_one_out/``
        in a sub-directory whose name is derived from the parameters. If that
        directory does not exist yet, the split is downloaded from OneDrive
        when ``download`` is True and the default parameters (random=False,
        n_negative=100) are used; otherwise it is generated locally with
        ``make_leave_one_out``.

        Args:
            random: bool. Whether randomly leave one item as testing.
            n_negative:  Number of negative samples for testing and validation data.
            n_test: int. Default 10. The number of testing and validation copies.
                    If n_test==0, will load the original (no negative items) valid and test datasets.
            download: bool. Default False. Whether a missing split with the
                    default parameters may be downloaded from OneDrive instead
                    of being generated locally.

        Returns:
            train_data (DataFrame): Interaction for training.
            valid_data list(DataFrame): List of interactions for validation
            test_data list(DataFrame): List of interactions for testing
        """
        download_path = os.path.join(self.processed_path, "leave_one_out")
        # makedirs(exist_ok=True) also creates missing parent directories and
        # does not fail when the directory already exists (unlike the
        # check-then-mkdir pattern, which breaks if the parent is missing or
        # the directory appears between the check and the call).
        os.makedirs(download_path, exist_ok=True)

        # leave-one-out splits are always stored with test_rate=0 and
        # by_user=False in the parameterized directory name
        parameterized_path = generate_parameterized_path(test_rate=0,
                                                         random=random,
                                                         n_negative=n_negative,
                                                         by_user=False)
        processed_leave_one_out_path = os.path.join(download_path,
                                                    parameterized_path)

        if not os.path.exists(processed_leave_one_out_path):
            if download and random is False and n_negative == 100:
                # default parameters, can be downloaded from Onedrive
                folder = OneDrive(url=self.processed_leave_one_out_url,
                                  path=download_path)
                folder.download()
                # the archive is expected next to the target directory,
                # named "<parameterized_path>.zip"
                un_zip(processed_leave_one_out_path + ".zip", download_path)
            else:
                # download not requested or non-default parameters:
                # build the split locally
                self.make_leave_one_out(random=random,
                                        n_negative=n_negative,
                                        n_test=n_test)

        # load data from local storage
        return load_split_data(processed_leave_one_out_path, n_test=n_test)
示例#4
0
def download_file(url, store_file_path):
    """Download the raw dataset file.

    Download the dataset with the given url and save it to store_file_path,
    printing progress and a final success/failure message.

    Args:
        url: the url from which the dataset file can be downloaded. Links
            containing "1drv.ms" are fetched through the OneDrive helper.
        store_file_path: the path that stores the downloaded file. For
            OneDrive links only its parent directory is used as the
            download target.

    Returns:
        None.

    Raises:
        requests.HTTPError: if the server answers with an HTTP error status,
            so that an error page is never saved as the dataset file.
    """
    filename = url.split("/")[-1]
    print(f"Start downloading file {filename} with url {url}...")

    if "1drv.ms" in url:
        # allow downloading raw data from Onedrive
        store_dir = os.path.dirname(store_file_path)
        OneDrive(url=url, path=store_dir).download()
        return

    # The context manager releases the streamed connection even when
    # writing to disk fails part-way through.
    with requests.get(url, allow_redirects=True, stream=True) as r:
        # Fail loudly on 4xx/5xx instead of writing the error body to disk.
        r.raise_for_status()
        # Total size in bytes (0 when the server sends no content-length)
        total_size = int(r.headers.get("content-length", 0))
        block_size = 1024

        with tqdm(total=total_size, unit="iB", unit_scale=True) as t, \
                open(store_file_path, "wb") as f:
            for data in r.iter_content(block_size):
                t.update(len(data))
                f.write(data)
            downloaded = t.n

    # The size check is only meaningful when the server announced a size.
    if total_size != 0 and downloaded != total_size:
        print("ERROR, download fail")
    else:
        print(f"Success loading file {filename} to {store_file_path}")