def load_random_split(self, test_rate=0.1, random=False, n_negative=100, by_user=False, n_test=10):
    """Load the split data generated by random_split.

    The split is looked up under ``<self.processed_path>/random/<parameterized
    path>``. If it is not present locally it is either downloaded from
    OneDrive (only for the default parameters ``test_rate=0.1``,
    ``random=False``, ``n_negative=100``, ``by_user=False``) or generated
    with ``self.make_random_split``.

    Args:
        test_rate: percentage of the test data. Note that percentage of the
            validation data will be the same as test data.
        random: bool. Whether randomly leave one basket as testing.
        n_negative: Number of negative samples for testing and validation data.
        by_user: bool. Default False.
            - True: user-based split,
            - False: global split,
        n_test: int. Default 10. The number of testing and validation copies.
            If n_test==0, will load the original (no negative items) valid and
            test datasets.

    Returns:
        train_data (DataFrame): Interaction for training.
        valid_data list(DataFrame): List of interactions for validation
        test_data list(DataFrame): List of interactions for testing
    """
    download_path = os.path.join(self.processed_path, "random")
    # makedirs(exist_ok=True) also creates a missing self.processed_path and
    # avoids the check-then-create race that a bare os.mkdir has.
    os.makedirs(download_path, exist_ok=True)

    parameterized_path = generate_parameterized_path(
        test_rate=test_rate, random=random, n_negative=n_negative, by_user=by_user
    )
    processed_random_split_path = os.path.join(download_path, parameterized_path)

    if not os.path.exists(processed_random_split_path):
        if (
            test_rate == 0.1
            and random is False
            and n_negative == 100
            and by_user is False
        ):
            # default parameters, can be downloaded from Onedrive
            folder = OneDrive(url=self.processed_random_split_url, path=download_path)
            folder.download()
            un_zip(processed_random_split_path + ".zip", download_path)
        else:
            # non-default parameters: build the split locally
            self.make_random_split(
                test_rate=test_rate,
                random=random,
                n_negative=n_negative,
                by_user=by_user,
                n_test=n_test,
            )

    # load data from local storage
    return load_split_data(processed_random_split_path, n_test=n_test)
def download_file_from_onedrive(url, path):
    """Fetch a processed file shared through OneDrive.

    Builds a ``OneDrive`` object for the shared link and triggers its
    download.

    Args:
        url: the shared link generated by OneDrive.
        path: the location handed to ``OneDrive`` as the download target.
    """
    OneDrive(url=url, path=path).download()
def load_leave_one_out(self, random=False, n_negative=100, n_test=10, download=False):
    """Load the split data generated by leave_one_out.

    The split is looked up under ``<self.processed_path>/leave_one_out/
    <parameterized path>``. If it is not present locally it is either
    downloaded from OneDrive (only when ``download`` is true and the default
    parameters ``random=False``, ``n_negative=100`` are used) or generated
    with ``self.make_leave_one_out``.

    Args:
        random: bool. Whether randomly leave one item as testing.
        n_negative: Number of negative samples for testing and validation data.
        n_test: int. Default 10. The number of testing and validation copies.
            If n_test==0, will load the original (no negative items) valid and
            test datasets.
        download: bool. Default False. When true and the default parameters
            are requested, fetch the prebuilt split from OneDrive instead of
            generating it locally.

    Returns:
        train_data (DataFrame): Interaction for training.
        valid_data list(DataFrame): List of interactions for validation
        test_data list(DataFrame): List of interactions for testing
    """
    download_path = os.path.join(self.processed_path, "leave_one_out")
    # makedirs(exist_ok=True) also creates a missing self.processed_path and
    # avoids the check-then-create race that a bare os.mkdir has.
    os.makedirs(download_path, exist_ok=True)

    parameterized_path = generate_parameterized_path(
        test_rate=0, random=random, n_negative=n_negative, by_user=False
    )
    processed_leave_one_out_path = os.path.join(download_path, parameterized_path)

    if not os.path.exists(processed_leave_one_out_path):
        if download and random is False and n_negative == 100:
            # default parameters, can be downloaded from Onedrive
            folder = OneDrive(url=self.processed_leave_one_out_url, path=download_path)
            folder.download()
            un_zip(processed_leave_one_out_path + ".zip", download_path)
        else:
            # otherwise build the split locally
            self.make_leave_one_out(random=random, n_negative=n_negative, n_test=n_test)

    # load data from local storage
    return load_split_data(processed_leave_one_out_path, n_test=n_test)
def download_file(url, store_file_path):
    """Download the raw dataset file.

    Download the dataset with the given url and save it to store_file_path.
    URLs containing ``1drv.ms`` are fetched through ``OneDrive`` into the
    directory of ``store_file_path``; any other URL is streamed with
    ``requests`` while a progress bar is shown.

    Args:
        url: the url from which the dataset file can be downloaded.
        store_file_path: the path that stores the downloaded file.

    Returns:
        None. The outcome of a streamed download is reported on stdout only.
    """
    filename = url.split("/")[-1]
    print(f"Start downloading file {filename} with url {url}...")
    if "1drv.ms" in url:
        # allow downloading raw data from Onedrive; the OneDrive helper is
        # given the containing directory rather than the file path
        store_file_path = os.path.dirname(store_file_path)
        folder = OneDrive(url=url, path=store_file_path)
        folder.download()
    else:
        block_size = 1024
        # The response is used as a context manager so the streamed
        # connection is released even if writing the file fails.
        with requests.get(url, allow_redirects=True, stream=True) as r:
            # Total size in bytes (0 when the server sends no content-length)
            total_size = int(r.headers.get("content-length", 0))
            # tqdm as a context manager guarantees the bar is closed on error.
            with tqdm(total=total_size, unit="iB", unit_scale=True) as t, open(
                store_file_path, "wb"
            ) as f:
                for data in r.iter_content(block_size):
                    t.update(len(data))
                    f.write(data)
                received = t.n
        # Only a known, non-zero expected size can be checked for completeness.
        if total_size != 0 and received != total_size:
            print("ERROR, download fail")
        else:
            print(f"Success loading file {filename} to {store_file_path}")