def _load_from_original_file(self):  # Load data from original

    print("Movielens1MReader: Loading original data")

    zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("Movielens1MReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

    URM_path = dataFile.extract("ml-1m/ratings.dat", path=zipFile_path + "decompressed/")

    URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(URM_path, separator="::")

    print("Movielens1MReader: cleaning temporary files")

    import shutil
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("Movielens1MReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file(self):  # Load data from original

    self.zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
    self.decompressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        self.dataFile = zipfile.ZipFile(self.zip_file_folder + "netflix-prize-data.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("NetflixPrizeReader: Unable to find data zip file.")
        print("NetflixPrizeReader: Automatic download not available, please ensure the ZIP data file is in folder {}.".format(self.zip_file_folder))
        print("NetflixPrizeReader: Data can be downloaded here: {}".format(self.DATASET_URL))

        # If directory does not exist, create
        if not os.path.exists(self.zip_file_folder):
            os.makedirs(self.zip_file_folder)

        raise FileNotFoundError("Automatic download not available.")

    URM_all, item_mapper, user_mapper = self._loadURM()

    print("NetflixPrizeReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file(self):  # Load data from original

    print("YelpReader: Loading original data")

    compressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
    decompressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        compressed_file = tarfile.open(compressed_file_folder + "yelp_dataset.tar", "r")
        compressed_file.extract("yelp_academic_dataset_review.json", path=decompressed_file_folder + "decompressed/")
        compressed_file.close()
    except (FileNotFoundError, tarfile.ReadError, tarfile.ExtractError):
        print("YelpReader: Unable to find or decompress tar file.")
        print("YelpReader: Automatic download not available, please ensure the compressed data file is in folder {}.".format(compressed_file_folder))
        print("YelpReader: Data can be downloaded here: {}".format(self.DATASET_URL))

        # If directory does not exist, create
        if not os.path.exists(compressed_file_folder):
            os.makedirs(compressed_file_folder)

        raise FileNotFoundError("Automatic download not available.")

    URM_path = decompressed_file_folder + "decompressed/yelp_academic_dataset_review.json"

    print("YelpReader: loading URM")
    URM_all_builder = self._loadURM(URM_path, if_new_user="******", if_new_item="add")

    URM_all = URM_all_builder.get_SparseMatrix()
    item_mapper = URM_all_builder.get_column_token_to_id_mapper()
    user_mapper = URM_all_builder.get_row_token_to_id_mapper()

    print("YelpReader: cleaning temporary files")

    import shutil
    shutil.rmtree(decompressed_file_folder + "decompressed/", ignore_errors=True)

    print("YelpReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file(self):  # Load data from original

    print("LastFMHetrec2011Reader: Loading original data")

    folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(folder_path + "hetrec2011-lastfm-2k.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("LastFMHetrec2011Reader: Unable to find or extract data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, folder_path, "hetrec2011-lastfm-2k.zip")
        dataFile = zipfile.ZipFile(folder_path + "hetrec2011-lastfm-2k.zip")

    URM_path = dataFile.extract("user_artists.dat", path=folder_path + "decompressed")
    tags_path = dataFile.extract("user_taggedartists-timestamps.dat", path=folder_path + "decompressed")

    print("LastFMHetrec2011Reader: loading URM")
    URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(URM_path, separator="\t", header=True)

    print("LastFMHetrec2011Reader: loading tags")
    ICM_tags, feature_mapper, _ = self._loadICM_tags(tags_path, item_mapper, header=True, separator='\t', if_new_item="ignore")

    print("LastFMHetrec2011Reader: cleaning temporary files")

    import shutil
    shutil.rmtree(folder_path + "decompressed", ignore_errors=True)

    print("LastFMHetrec2011Reader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                   ICM_dict={"ICM_all": ICM_tags},
                   ICM_mappers_dict={"ICM_all": (item_mapper.copy(), feature_mapper.copy())})
def _load_from_original_file(self):  # Load data from original

    print("EpinionsReader: Loading original data")

    folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    compressed_file_path = folder_path + "ratings_data.txt.bz2"
    decompressed_file_path = folder_path + "ratings_data.txt"

    try:
        open(decompressed_file_path, "r")
    except FileNotFoundError:
        print("EpinionsReader: Unable to find decompressed data file. Decompressing...")

        try:
            compressed_file = bz2.open(compressed_file_path, "rb")
        except Exception:
            print("EpinionsReader: Unable to find or open compressed data file. Downloading...")
            downloadFromURL(self.DATASET_URL, folder_path, "ratings_data.txt.bz2")
            compressed_file = bz2.open(compressed_file_path, "rb")

        decompressed_file = open(decompressed_file_path, "w")
        self._save_BZ2_in_text_file(compressed_file, decompressed_file)
        decompressed_file.close()

    print("EpinionsReader: loading URM")
    URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(decompressed_file_path, separator=" ", header=True)

    print("EpinionsReader: cleaning temporary files")

    import os
    os.remove(decompressed_file_path)

    print("EpinionsReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file(self):  # Load data from original

    print("TVAudienceReader: Loading original data")

    compressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
    decompressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    zipFile_name = "tv-audience-dataset.zip"

    try:
        dataFile = zipfile.ZipFile(compressed_zip_file_folder + zipFile_name)
        interactions_path = dataFile.extract("tv-audience-dataset/tv-audience-dataset.csv",
                                             path=decompressed_zip_file_folder + "decompressed/")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("TVAudienceReader: Unable to find or extract data zip file.")
        print("TVAudienceReader: Automatic download not available, please ensure the ZIP data file is in folder {}.".format(compressed_zip_file_folder))
        print("TVAudienceReader: Data zip file not found or damaged. You may download the data from: {}".format(self.DATASET_URL))

        # If directory does not exist, create
        if not os.path.exists(compressed_zip_file_folder):
            os.makedirs(compressed_zip_file_folder)

        raise FileNotFoundError("Automatic download not available.")

    print("TVAudienceReader: Loading Interactions")
    URM_all, item_mapper, user_mapper = self._load_interactions(interactions_path, if_new_user="******", if_new_item="add")

    print("TVAudienceReader: cleaning temporary files")

    import shutil
    shutil.rmtree(decompressed_zip_file_folder + "decompressed/", ignore_errors=True)

    print("TVAudienceReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file(self):  # Load data from original

    print("BrightkiteReader: Loading original data")

    folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        compressed_file = gzip.open(folder_path + "loc-brightkite_edges.txt.gz", 'rb')
    except FileNotFoundError:
        print("BrightkiteReader: Unable to find or extract data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, folder_path, "loc-brightkite_edges.txt.gz")
        compressed_file = gzip.open(folder_path + "loc-brightkite_edges.txt.gz", 'rb')

    URM_path = folder_path + "loc-brightkite_edges.txt"

    decompressed_file = open(URM_path, "w")
    self._save_GZ_in_text_file(compressed_file, decompressed_file)
    decompressed_file.close()

    print("BrightkiteReader: loading URM")
    URM_all, item_mapper, user_mapper = self._loadURM(URM_path, separator="\t", header=False)

    print("BrightkiteReader: cleaning temporary files")

    import os
    os.remove(URM_path)

    print("BrightkiteReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file_all_amazon_datasets(self, URM_path, metadata_path=None, reviews_path=None):

    print("AmazonReviewDataReader: Loading original data")

    print("AmazonReviewDataReader: loading URM")
    URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder(URM_path, separator=",", header=False)

    urm = {"URM_all": URM_all}
    urm_mappers = {"URM_all": (user_original_ID_to_index, item_original_ID_to_index)}
    icm = {}
    icm_mappers = {}

    if metadata_path is not None:
        print("AmazonReviewDataReader: loading metadata")
        ICM_metadata, feature_mapper, item_mapper = self._loadMetadata(metadata_path, item_original_ID_to_index, if_new_item="ignore")
        ICM_metadata, _, feature_mapper = removeFeatures(ICM_metadata, minOccurrence=5, maxPercOccurrence=0.30,
                                                         reconcile_mapper=feature_mapper)

        icm["ICM_metadata"] = ICM_metadata
        icm_mappers["ICM_metadata"] = (item_mapper.copy(), feature_mapper.copy())

    if reviews_path is not None:
        print("AmazonReviewDataReader: loading reviews")
        ICM_reviews, feature_mapper, item_mapper = self._loadReviews(reviews_path, item_original_ID_to_index, if_new_item="ignore")
        ICM_reviews, _, feature_mapper = removeFeatures(ICM_reviews, minOccurrence=5, maxPercOccurrence=0.30,
                                                        reconcile_mapper=feature_mapper)

        icm["ICM_reviews"] = ICM_reviews
        icm_mappers["ICM_reviews"] = (item_mapper.copy(), feature_mapper.copy())

    if len(icm) > 0:
        ICM_names = list(icm.keys())
        ICM_all, ICM_all_mapper = icm[ICM_names[0]], icm_mappers[ICM_names[0]]
        for key in ICM_names[1:]:
            ICM_all, ICM_all_mapper = self._merge_ICM(ICM_all, icm[key], ICM_all_mapper, icm_mappers[key])

        icm["ICM_all"] = ICM_all
        icm_mappers["ICM_all"] = ICM_all_mapper

    # Clean temp files
    print("AmazonReviewDataReader: cleaning temporary files")

    if metadata_path is not None:
        os.remove(metadata_path)

    if reviews_path is not None:
        os.remove(reviews_path)

    print("AmazonReviewDataReader: loading complete")

    return Dataset(self.get_dataset_name(), URM_dict=urm, URM_mappers_dict=urm_mappers,
                   ICM_dict=icm, ICM_mappers_dict=icm_mappers)
def load_split(self, datareader, save_folder_path=None, postprocessings=None):

    if save_folder_path is None:
        tmp_save_folder_path = datareader.get_complete_default_save_path(postprocessings)
    else:
        tmp_save_folder_path = save_folder_path + os.sep

    try:
        datalist = self._get_dataset_names_in_split()
        for i in datalist:
            if not datareader.all_files_available(tmp_save_folder_path + self.get_name() + os.sep,
                                                  filename_suffix="_{}".format(i)):
                raise Exception

        datasets = []
        for i in datalist:
            urm, urm_mappers, icm, icm_mappers, ucm, ucm_mappers = datareader.load_from_saved_sparse_matrix(
                tmp_save_folder_path + self.get_name() + os.sep, filename_suffix="_{}".format(i))
            datasets.append(Dataset(datareader.get_dataset_name(),
                                    base_folder=datareader.get_default_save_path(),
                                    postprocessings=postprocessings,
                                    URM_dict=urm, URM_mappers_dict=urm_mappers,
                                    ICM_dict=icm, ICM_mappers_dict=icm_mappers,
                                    UCM_dict=ucm, UCM_mappers_dict=ucm_mappers))
        return datasets

    except Exception:
        print("DataSplitter: Preloaded data not found or corrupted, reading from original files...")
        dataset = datareader.load_data(save_folder_path=save_folder_path, postprocessings=postprocessings)
        return self.split(dataset)
def _load_from_original_file(self): print("BookCrossingReader: Ratings are in range 1-10, value -1 refers to an implicit rating") print("BookCrossingReader: ICM contains the author, publisher, year and tokens from the title") print("BookCrossingReader: Loading original data") folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile(folder_path + "BX-CSV-Dump.zip") except (FileNotFoundError, zipfile.BadZipFile): print("BookCrossingReader: Unable to find or extract data zip file. Downloading...") downloadFromURL(self.DATASET_URL, folder_path, "BX-CSV-Dump.zip") dataFile = zipfile.ZipFile(folder_path + "BX-CSV-Dump.zip") URM_path = dataFile.extract("BX-Book-Ratings.csv", path=folder_path + "decompressed") ICM_path = dataFile.extract("BX-Books.csv", path=folder_path + "decompressed") print("BookCrossingReader: loading ICM") ICM_all, feature_mapper, item_mapper = self._loadICM(ICM_path, separator=';', header=True, if_new_item="add") ICM_all, _, feature_mapper = removeFeatures(ICM_all, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=feature_mapper) print("BookCrossingReader: loading URM") URM_all, _, user_mapper = self._loadURM(URM_path, item_mapper, separator=";", header=True, if_new_user="******", if_new_item="ignore") print("BookCrossingReader: cleaning temporary files") import shutil shutil.rmtree(folder_path + "decompressed", ignore_errors=True) print("BookCrossingReader: loading complete") return Dataset(self.get_dataset_name(), URM_dict={"URM_all": URM_all}, URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())}, ICM_dict={"ICM_all": ICM_all}, ICM_mappers_dict={"ICM_all": (item_mapper.copy(), feature_mapper.copy())})
def apply(self, dataset):

    new_URM_dict = {}
    for URM_name in dataset.get_URM_names():
        new_URM_dict[URM_name] = dataset.get_URM(URM_name)
        # Drop ratings below the threshold, then binarize the remaining ones
        mask = np.ones(new_URM_dict[URM_name].data.size, dtype=bool)
        mask[new_URM_dict[URM_name].data >= self.min_rating_threshold] = False
        new_URM_dict[URM_name].data[mask] = 0.0
        new_URM_dict[URM_name].eliminate_zeros()
        new_URM_dict[URM_name].data[:] = 1.0

    return Dataset(dataset.get_name(),
                   base_folder=dataset.get_base_folder(),
                   postprocessings=dataset.get_postprocessings() + [self],
                   URM_dict=new_URM_dict,
                   URM_mappers_dict=dataset.get_URM_mappers_dict(),
                   ICM_dict=dataset.get_ICM_dict(),
                   ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                   UCM_dict=dataset.get_UCM_dict(),
                   UCM_mappers_dict=dataset.get_UCM_mappers_dict())
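# Toy illustration of the thresholding performed in apply() above, on a plain
# scipy matrix so it can run on its own (the framework classes are not needed):
# ratings below the threshold are dropped and the survivors are binarized.
# The matrix values and the threshold are made up for the example.
import numpy as np
import scipy.sparse as sps

urm = sps.csr_matrix(np.array([[5, 2, 0], [0, 3, 4]], dtype=np.float64))
min_rating_threshold = 3

mask = np.ones(urm.data.size, dtype=bool)
mask[urm.data >= min_rating_threshold] = False
urm.data[mask] = 0.0
urm.eliminate_zeros()
urm.data[:] = 1.0
# urm now holds implicit feedback: only the (user, item) pairs rated >= 3 remain.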
def _load_from_original_file(self): print("PinterestReader: Loading original data") zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile(zipFile_path + "pinterest-20.zip") except (FileNotFoundError, zipfile.BadZipFile): print("PinterestReader: Unable to find data zip file.") print("PinterestReader: Automatic download not available, please ensure the compressed data file" " is in folder {}.".format(zipFile_path)) print("PinterestReader: Data can be downloaded here: {}".format( self.DATASET_URL)) URM_train_path = dataFile.extract( "pinterest-20.train.rating.txt", path=zipFile_path + "decompressed/") URM_test_path = dataFile.extract( "pinterest-20.test.rating.txt", path=zipFile_path + "decompressed/") trainMatrix = self.load_rating_file_as_matrix(URM_train_path) testRatings = self.load_rating_file_as_matrix(URM_test_path) from recsys_framework.utils.common import reshapeSparse URM_train = trainMatrix.tocsr() URM_test = testRatings.tocsr() shape = (max(URM_train.shape[0], URM_test.shape[0]), max(URM_train.shape[1], URM_test.shape[1])) URM_train = reshapeSparse(URM_train, shape) URM_test = reshapeSparse(URM_test, shape) mapper_users = {str(i+1): i for i in range(URM_train.shape[0])} mapper_items = {str(i+1): i for i in range(URM_train.shape[1])} return Dataset('Pinterest', URM_dict={"URM_all": URM_train+URM_test}, URM_mappers_dict={"URM_all": (mapper_users.copy(), mapper_items.copy())})
def load_split(self, datareader, save_folder_path=None, postprocessings=None):

    tmp_save_folder_path = save_folder_path
    if tmp_save_folder_path is None:
        tmp_save_folder_path = datareader.get_complete_default_save_path(postprocessings)

    try:
        datalist = self._get_dataset_names_in_split()
        for i in range(self.n_folds):
            for d in datalist:
                if not datareader.all_files_available(tmp_save_folder_path + self.get_name() + os.sep,
                                                      filename_suffix="_{}_{}".format(i, d)):
                    raise Exception

        r = []
        for i in range(self.n_folds):
            datasets = []
            for d in datalist:
                urm, urm_mappers, icm, icm_mappers, ucm, ucm_mappers = datareader.load_from_saved_sparse_matrix(
                    tmp_save_folder_path + self.get_name() + os.sep, filename_suffix="_{}_{}".format(i, d))
                datasets.append(Dataset(datareader.get_dataset_name(),
                                        base_folder=datareader.get_default_save_path(),
                                        postprocessings=postprocessings,
                                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                                        ICM_dict=icm, ICM_mappers_dict=icm_mappers,
                                        UCM_dict=ucm, UCM_mappers_dict=ucm_mappers))

            # With KFold, validation is intrinsic, so we surely have only train and test datasets
            r.append((datasets[0], datasets[1]))

        return r

    except Exception:
        print("DataSplitterKFold: Preloaded data not found or corrupted, reading from original files...")
        dataset = datareader.load_data(save_folder_path=save_folder_path, postprocessings=postprocessings)
        return self.split(dataset)
def _load_from_original_file(self):  # Load data from original

    print("SpotifySkipPredictionReader: Loading original data")

    compressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
    decompressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        compressed_train_set_file = tarfile.open(compressed_file_folder + "20181113_training_set.tar.gz", "r:gz")
    except (FileNotFoundError, tarfile.ReadError, tarfile.ExtractError):
        print("SpotifySkipPredictionReader: Unable to find compressed data file.")
        print("SpotifySkipPredictionReader: Automatic download not available, please ensure the compressed data file is in folder {}.".format(compressed_file_folder))
        print("SpotifySkipPredictionReader: Data can be downloaded here: {}".format(self.DATASET_URL))

        # If directory does not exist, create
        if not os.path.exists(compressed_file_folder):
            os.makedirs(compressed_file_folder)

        raise FileNotFoundError("Automatic download not available.")

    # session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,hist_user_behavior_is_shuffle,hour_of_day,date,premium,context_type,hist_user_behavior_reason_start,hist_user_behavior_reason_end
    URM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=None, on_new_col="add",
                                                    preinitialized_row_mapper=None, on_new_row="add",
                                                    dtype=bool)

    # If directory does not exist, create
    sps_blocks_path = decompressed_file_folder + "sps_blocks/"
    if not os.path.exists(sps_blocks_path):
        os.makedirs(sps_blocks_path)

    next_file = ""
    file_counter = 0
    interaction_counter = 0

    while next_file is not None:

        next_file = compressed_train_set_file.next()

        if file_counter <= 650:
            if next_file.isfile():
                file_counter += 1
                print("Skipping file {}: '{}'".format(file_counter, next_file.path))
            continue

        if next_file is not None and next_file.isfile():
            print("Extracting: '{}'".format(next_file.path))
            compressed_train_set_file.extractall(path=decompressed_file_folder + "decompressed/", members=[next_file])

            decompressed_file_path = decompressed_file_folder + "decompressed/" + next_file.path
            self._load_URM_events(URM_builder, decompressed_file_path)

            file_counter += 1
            print("Loaded {}/660 files, {:.2E} interactions".format(file_counter, interaction_counter + URM_builder.get_nnz()))

            os.remove(decompressed_file_path)

        if file_counter % 50 == 0 or next_file is None:
            URM_all = URM_builder.get_SparseMatrix()

            print("Saving {}".format(sps_blocks_path + "URM_file_{}".format(file_counter)))
            sps.save_npz(sps_blocks_path + "URM_file_{}".format(file_counter), URM_all)

            item_mapper = URM_builder.get_row_token_to_id_mapper()
            user_mapper = URM_builder.get_column_token_to_id_mapper()
            interaction_counter += URM_builder.get_nnz()

            URM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=item_mapper, on_new_col="add",
                                                            preinitialized_row_mapper=user_mapper, on_new_row="add",
                                                            dtype=bool)

    compressed_train_set_file.close()

    print("SpotifySkipPredictionReader: cleaning temporary files")

    import shutil
    shutil.rmtree(decompressed_file_folder + "decompressed/", ignore_errors=True)

    print("SpotifySkipPredictionReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def split(self, dataset):

    super(WarmItemsKFold, self).split(dataset)

    # I can do the kfold of a slice of the initial URM!
    if self.percentage_initial_data_to_split < 1.0:
        h = Holdout(train_perc=self.percentage_initial_data_to_split,
                    test_perc=1 - self.percentage_initial_data_to_split)
        dataset = h.split(dataset)[0]

    folds = []
    URM = dataset.get_URM().tocoo()
    split_belonging = np.random.choice(self.n_folds, URM.data.size, replace=True)

    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        mask = split_belonging == i
        for URM_name in dataset.get_URM_names():
            URM = dataset.get_URM(URM_name).tocoo()
            # Sort nnz values by row and column indices, in order to remain consistent in the splits of different URMs
            row, col, data = zip(*sorted(zip(URM.row, URM.col, URM.data), key=lambda x: (x[0], x[1])))
            urm[URM_name] = sps.csr_matrix((np.array(data)[mask], (np.array(row)[mask], np.array(col)[mask])),
                                           shape=URM.shape)
            urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)

        folds.append(Dataset(dataset.get_name(),
                             base_folder=dataset.get_base_folder(),
                             postprocessings=dataset.get_postprocessings(),
                             URM_dict=urm, URM_mappers_dict=urm_mappers,
                             ICM_dict=dataset.get_ICM_dict(),
                             ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                             UCM_dict=dataset.get_UCM_dict(),
                             UCM_mappers_dict=dataset.get_UCM_mappers_dict()))

    r = []
    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        for URM_name in folds[i].get_URM_names():
            # Keep i-th fold as test and merge the others as train
            urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
            urm_mappers[URM_name] = folds[(i + 1) % self.n_folds].get_URM_mapper(URM_name)
            for j in range(2, self.n_folds):
                urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(URM_name)

        train = Dataset(folds[i].get_name(),
                        base_folder=folds[i].get_base_folder(),
                        postprocessings=folds[i].get_postprocessings(),
                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                        ICM_dict=folds[i].get_ICM_dict(),
                        ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                        UCM_dict=folds[i].get_UCM_dict(),
                        UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

        urm = {}
        test_urm = folds[i].get_URM()
        test_urm.sort_indices()
        mask = test_urm.data <= self.test_rating_threshold
        for URM_name in folds[i].get_URM_names():
            urm[URM_name] = folds[i].get_URM(URM_name)
            urm[URM_name].sort_indices()
            urm[URM_name].data[mask] = 0.0
            urm[URM_name].eliminate_zeros()

        test = Dataset(folds[i].get_name(),
                       base_folder=folds[i].get_base_folder(),
                       postprocessings=folds[i].get_postprocessings(),
                       URM_dict=urm, URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                       ICM_dict=folds[i].get_ICM_dict(),
                       ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                       UCM_dict=folds[i].get_UCM_dict(),
                       UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

        if not self.allow_cold_users:
            users_to_remove = np.arange(train.n_users)[np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
            train.remove_users(users_to_remove)
            test.remove_users(users_to_remove)

        r.append((train, test))

    return r
def _load_from_original_file(self):  # Load data from original

    print("Movielens20MReader: Loading original data")

    zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("Movielens20MReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "ml-20m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")

    genres_path = dataFile.extract("ml-20m/movies.csv", path=zipFile_path + "decompressed/")
    tags_path = dataFile.extract("ml-20m/tags.csv", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("ml-20m/ratings.csv", path=zipFile_path + "decompressed/")

    print("Movielens20MReader: loading genres")
    ICM_genres, genres_mapper, item_mapper = self._loadICM_genres(genres_path, header=True, separator=',', genresSeparator="|")

    print("Movielens20MReader: loading tags")
    ICM_tags, tags_mapper, _ = self._loadICM_tags(tags_path, item_mapper, header=True, separator=',', if_new_item="ignore")

    print("Movielens20MReader: loading URM")
    URM_all, _, user_mapper = self._loadURM(URM_path, item_mapper, separator=",", header=True,
                                            if_new_user="******", if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_genres, ICM_tags, genres_mapper, tags_mapper)

    print("Movielens20MReader: cleaning temporary files")

    import shutil
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("Movielens20MReader: saving URM and ICM")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                   ICM_dict={"ICM_genres": ICM_genres, "ICM_tags": ICM_tags, "ICM_all": ICM_all},
                   ICM_mappers_dict={"ICM_genres": (item_mapper.copy(), genres_mapper.copy()),
                                     "ICM_tags": (item_mapper.copy(), tags_mapper.copy()),
                                     "ICM_all": (item_mapper.copy(), feature_mapper.copy())})
def split(self, dataset):

    super(ColdItemsKFold, self).split(dataset)

    folds = []
    split_belonging = np.random.choice(self.n_folds, dataset.n_items, replace=True)

    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        mask = split_belonging != i
        for URM_name in dataset.get_URM_names():
            URM = dataset.get_URM(URM_name).tocsc(copy=True)
            # Zero out the items that do not belong to this fold
            for j in np.arange(URM.shape[1])[mask].tolist():
                URM.data[URM.indptr[j]:URM.indptr[j + 1]] = 0.0
            URM.eliminate_zeros()
            urm[URM_name] = URM.tocsr()
            urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)

        folds.append(Dataset(dataset.get_name(),
                             base_folder=dataset.get_base_folder(),
                             postprocessings=dataset.get_postprocessings(),
                             URM_dict=urm, URM_mappers_dict=urm_mappers,
                             ICM_dict=dataset.get_ICM_dict(),
                             ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                             UCM_dict=dataset.get_UCM_dict(),
                             UCM_mappers_dict=dataset.get_UCM_mappers_dict()))

    r = []
    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        for URM_name in folds[i].get_URM_names():
            # Keep i-th fold as test and merge the others as train
            urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
            urm_mappers[URM_name] = folds[(i + 1) % self.n_folds].get_URM_mapper(URM_name)
            for j in range(2, self.n_folds):
                urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(URM_name)

        train = Dataset(folds[i].get_name(),
                        base_folder=folds[i].get_base_folder(),
                        postprocessings=folds[i].get_postprocessings(),
                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                        ICM_dict=folds[i].get_ICM_dict(),
                        ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                        UCM_dict=folds[i].get_UCM_dict(),
                        UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

        urm = {}
        test_urm = folds[i].get_URM()
        test_urm.sort_indices()
        mask = test_urm.data <= self.test_rating_threshold
        for URM_name in folds[i].get_URM_names():
            urm[URM_name] = folds[i].get_URM(URM_name)
            urm[URM_name].sort_indices()
            urm[URM_name].data[mask] = 0.0
            urm[URM_name].eliminate_zeros()

        test = Dataset(folds[i].get_name(),
                       base_folder=folds[i].get_base_folder(),
                       postprocessings=folds[i].get_postprocessings(),
                       URM_dict=urm, URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                       ICM_dict=folds[i].get_ICM_dict(),
                       ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                       UCM_dict=folds[i].get_UCM_dict(),
                       UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

        if not self.allow_cold_users:
            users_to_remove = np.arange(train.n_users)[np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
            train.remove_users(users_to_remove)
            test.remove_users(users_to_remove)

        r.append((train, test))

    return r
def _load_from_original_file(self): print("TheMoviesDatasetReader: Loading original data") compressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER decompressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER zipFile_name = "the-movies-dataset.zip" try: dataFile = zipfile.ZipFile(compressed_zip_file_folder + zipFile_name) credits_path = dataFile.extract("credits.csv", path=decompressed_zip_file_folder + "decompressed/") metadata_path = dataFile.extract( "movies_metadata.csv", path=decompressed_zip_file_folder + "decompressed/") movielens_tmdb_id_map_path = dataFile.extract( "links.csv", path=decompressed_zip_file_folder + "decompressed/") URM_path = dataFile.extract("ratings.csv", path=decompressed_zip_file_folder + "decompressed/") except (FileNotFoundError, zipfile.BadZipFile): print( "TheMoviesDatasetReader: Unable to find or extract data zip file." ) print( "TheMoviesDatasetReader: Automatic download not available, please ensure the ZIP data file is in folder {}." .format(compressed_zip_file_folder)) print( "TheMoviesDatasetReader: Data zip file not found or damaged. You may download the data from: {}" .format(self.DATASET_URL)) # If directory does not exist, create if not os.path.exists(compressed_zip_file_folder): os.makedirs(compressed_zip_file_folder) raise FileNotFoundError("Automatic download not available.") self.item_original_ID_to_title = {} self.item_index_to_title = {} print("TheMoviesDatasetReader: Loading ICM_credits") ICM_credits, ICM_credits_mapper, item_mapper = self._loadICM_credits( credits_path, header=True, if_new_item="add") print("TheMoviesDatasetReader: Loading ICM_metadata") ICM_metadata, ICM_metadata_mapper, item_mapper = self._loadICM_metadata( metadata_path, item_mapper, header=True, if_new_item="add") ICM_credits, _, ICM_credits_mapper = removeFeatures( ICM_credits, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=ICM_credits_mapper) ICM_metadata, _, ICM_metadata_mapper = removeFeatures( ICM_metadata, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=ICM_metadata_mapper) n_items = ICM_metadata.shape[0] ICM_credits = reshapeSparse(ICM_credits, (n_items, ICM_credits.shape[1])) # IMPORTANT: ICM uses TMDB indices, URM uses movielens indices # Load index mapper movielens_id_to_tmdb, tmdb_to_movielens_id = self._load_item_id_mapping( movielens_tmdb_id_map_path, header=True) # Modify saved mapper to accept movielens id instead of tmdb item_mapper = self._replace_tmdb_id_with_movielens( tmdb_to_movielens_id, item_mapper) print("TheMoviesDatasetReader: Loading URM") URM_all, _, user_mapper = self._load_URM(URM_path, item_mapper, header=True, separator=",", if_new_user="******", if_new_item="ignore") # Reconcile URM and ICM # Keep only items having ICM entries, remove all the others n_items = ICM_credits.shape[0] URM_all = URM_all[:, 0:n_items] # URM is already clean ICM_all, ICM_all_mapper = self._merge_ICM(ICM_credits, ICM_metadata, ICM_credits_mapper, ICM_metadata_mapper) print("TheMoviesDatasetReader: cleaning temporary files") import shutil shutil.rmtree(decompressed_zip_file_folder + "decompressed/", ignore_errors=True) print("TheMoviesDatasetReader: loading complete") return Dataset(self.get_dataset_name(), URM_dict={"URM_all": URM_all}, URM_mappers_dict={ "URM_all": (user_mapper.copy(), item_mapper.copy()) }, ICM_dict={ "ICM_credits": ICM_credits, "ICM_metadata": ICM_metadata, "ICM_all": ICM_all }, ICM_mappers_dict={ "ICM_credits": (item_mapper.copy(), ICM_credits_mapper.copy()), 
"ICM_metadata": (item_mapper.copy(), ICM_metadata_mapper.copy()), "ICM_all": (item_mapper.copy(), ICM_all_mapper.copy()) })
def _load_from_original_file(self):  # Load data from original

    print("ThirtyMusicReader: Loading original data")

    compressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
    decompressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    credits_path = "entities/albums.idomaar"
    persons_path = "entities/persons.idomaar"
    playlist_path = "entities/playlist.idomaar"
    tags_path = "entities/tags.idomaar"
    tracks_path = "entities/tracks.idomaar"
    users_path = "entities/users.idomaar"

    events_path = "relations/events.idomaar"
    love_path = "relations/love.idomaar"
    sessions_path = "relations/sessions.idomaar"

    try:
        compressed_file = tarfile.open(compressed_file_folder + "ThirtyMusic.tar.gz", "r:gz")
        compressed_file.extract(tracks_path, path=decompressed_file_folder + "decompressed/")
        compressed_file.extract(events_path, path=decompressed_file_folder + "decompressed/")
        compressed_file.close()
    except (FileNotFoundError, tarfile.ReadError, tarfile.ExtractError):
        print("ThirtyMusicReader: Unable to find compressed data file.")
        print("ThirtyMusicReader: Automatic download not available, please ensure the compressed data file is in folder {}.".format(compressed_file_folder))
        print("ThirtyMusicReader: Data can be downloaded here: {}".format(self.DATASET_URL))

        # If directory does not exist, create
        if not os.path.exists(compressed_file_folder):
            os.makedirs(compressed_file_folder)

        raise FileNotFoundError("Automatic download not available.")

    tracks_path = decompressed_file_folder + "decompressed/" + tracks_path
    events_path = decompressed_file_folder + "decompressed/" + events_path

    print("ThirtyMusicReader: loading ICM_tracks")
    ICM_all, feature_mapper, item_mapper = self._load_ICM_tracks(tracks_path, if_new_item="add")

    print("ThirtyMusicReader: loading URM_events")
    URM_all, _, user_mapper = self._load_URM_events(events_path, item_mapper, if_new_user="******", if_new_item="ignore")

    print("ThirtyMusicReader: cleaning temporary files")

    import shutil
    shutil.rmtree(decompressed_file_folder + "decompressed/", ignore_errors=True)

    print("ThirtyMusicReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                   ICM_dict={"ICM_all": ICM_all},
                   ICM_mappers_dict={"ICM_all": (item_mapper.copy(), feature_mapper.copy())})
def _load_from_original_file(self):  # Load data from original

    print("MovielensHetrecReader: Loading original data")

    zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("MovielensHetrecReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "hetrec2011-movielens-2k-v2.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")

    movies_path = dataFile.extract("movies.dat", path=zipFile_path + "decompressed/")
    genres_path = dataFile.extract("movie_genres.dat", path=zipFile_path + "decompressed/")
    directors_path = dataFile.extract("movie_directors.dat", path=zipFile_path + "decompressed/")
    actors_path = dataFile.extract("movie_actors.dat", path=zipFile_path + "decompressed/")
    countries_path = dataFile.extract("movie_countries.dat", path=zipFile_path + "decompressed/")
    locations_path = dataFile.extract("movie_locations.dat", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("user_ratedmovies.dat", path=zipFile_path + "decompressed/")

    print("MovielensHetrecReader: loading years")
    ICM_years, years_mapper, item_mapper = self._load_tsv(movies_path, None, feature_columns=[5],
                                                          header=True, if_new_item="add")

    print("MovielensHetrecReader: loading genres")
    ICM_genres, genres_mapper, _ = self._load_tsv(genres_path, item_mapper, header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_genres, ICM_years, genres_mapper, years_mapper)

    print("MovielensHetrecReader: loading directors")
    ICM_directors, directors_mapper, _ = self._load_tsv(directors_path, item_mapper, header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_directors, feature_mapper, directors_mapper)

    print("MovielensHetrecReader: loading actors")
    ICM_actors, actors_mapper, _ = self._load_tsv(actors_path, item_mapper, header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_actors, feature_mapper, actors_mapper)

    print("MovielensHetrecReader: loading countries")
    ICM_countries, countries_mapper, _ = self._load_tsv(countries_path, item_mapper, header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_countries, feature_mapper, countries_mapper)

    print("MovielensHetrecReader: loading locations")
    ICM_locations, locations_mapper, _ = self._load_tsv(locations_path, item_mapper, feature_columns=[1, 2, 3],
                                                        header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_locations, feature_mapper, locations_mapper)

    print("MovielensHetrecReader: loading URM")
    URM_all, _, user_mapper = self._loadURM(URM_path, item_mapper, separator="\t", header=True,
                                            if_new_user="******", if_new_item="ignore")

    print("MovielensHetrecReader: cleaning temporary files")

    import shutil
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("MovielensHetrecReader: saving URM and ICM")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                   ICM_dict={"ICM_genres": ICM_genres,
                             "ICM_years": ICM_years,
                             "ICM_all": ICM_all,
                             "ICM_directors": ICM_directors,
                             "ICM_actors": ICM_actors,
                             "ICM_countries": ICM_countries,
                             "ICM_locations": ICM_locations},
                   ICM_mappers_dict={"ICM_genres": (item_mapper.copy(), genres_mapper.copy()),
                                     "ICM_years": (item_mapper.copy(), years_mapper.copy()),
                                     "ICM_directors": (item_mapper.copy(), directors_mapper.copy()),
                                     "ICM_actors": (item_mapper.copy(), actors_mapper.copy()),
                                     "ICM_countries": (item_mapper.copy(), countries_mapper.copy()),
                                     "ICM_locations": (item_mapper.copy(), locations_mapper.copy()),
                                     "ICM_all": (item_mapper.copy(), feature_mapper.copy())})
def split(self, dataset):

    super(ColdItemsHoldout, self).split(dataset)

    n_users, n_items = dataset.n_users, dataset.n_items
    URM_train, URM_test, URM_validation = {}, {}, {}

    # Assign each item a priori to train, validation or test
    items_split = np.random.choice(3, n_items, replace=True,
                                   p=[self.train_perc, self.validation_perc, self.test_perc])
    train_items = np.arange(n_items)[items_split == 0]
    validation_items = np.arange(n_items)[items_split == 1]
    test_items = np.arange(n_items)[items_split == 2]

    users_to_remove = []

    for URM_name in dataset.get_URM_names():

        URM = dataset.get_URM(URM_name)
        URM = sps.csr_matrix(URM)

        URM_train_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                    auto_create_col_mapper=False, n_cols=n_items)
        URM_test_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                   auto_create_col_mapper=False, n_cols=n_items)
        if self.with_validation:
            URM_validation_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                             auto_create_col_mapper=False, n_cols=n_items)

        for user_id in range(n_users):

            start_user_position = URM.indptr[user_id]
            end_user_position = URM.indptr[user_id + 1]

            user_interaction_items = URM.indices[start_user_position:end_user_position]
            user_interaction_data = URM.data[start_user_position:end_user_position]

            # Test interactions
            indices = np.in1d(user_interaction_items, test_items, assume_unique=True)
            user_interaction_items_test = user_interaction_items[indices]
            user_interaction_data_test = user_interaction_data[indices]

            # Remove from test interactions below a given threshold
            mask = user_interaction_data_test > self.test_rating_threshold
            user_interaction_items_test = user_interaction_items_test[mask]
            user_interaction_data_test = user_interaction_data_test[mask]

            URM_test_builder.add_data_lists([user_id] * len(user_interaction_data_test),
                                            user_interaction_items_test,
                                            user_interaction_data_test)

            # Validation interactions
            if self.with_validation:
                indices = np.in1d(user_interaction_items, validation_items, assume_unique=True)
                user_interaction_items_validation = user_interaction_items[indices]
                user_interaction_data_validation = user_interaction_data[indices]

                # Remove from validation interactions below a given threshold
                mask = user_interaction_data_validation > self.test_rating_threshold
                user_interaction_items_validation = user_interaction_items_validation[mask]
                user_interaction_data_validation = user_interaction_data_validation[mask]

                URM_validation_builder.add_data_lists([user_id] * len(user_interaction_data_validation),
                                                      user_interaction_items_validation,
                                                      user_interaction_data_validation)

                #if len(user_interaction_items_validation) <= 0:
                #    users_to_remove.append(user_id)

            # Train interactions
            indices = np.in1d(user_interaction_items, train_items, assume_unique=True)
            user_interaction_items_train = user_interaction_items[indices]
            user_interaction_data_train = user_interaction_data[indices]

            URM_train_builder.add_data_lists([user_id] * len(user_interaction_items_train),
                                             user_interaction_items_train,
                                             user_interaction_data_train)

            #if len(user_interaction_items_test) <= 0:
            #    users_to_remove.append(user_id)

            if not self.allow_cold_users and len(user_interaction_items_train) <= 0:
                users_to_remove.append(user_id)

        URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
        URM_test[URM_name] = URM_test_builder.get_SparseMatrix()
        if self.with_validation:
            URM_validation[URM_name] = URM_validation_builder.get_SparseMatrix()

    train = Dataset(dataset.get_name(),
                    base_folder=dataset.get_base_folder(),
                    postprocessings=dataset.get_postprocessings(),
                    URM_dict=URM_train,
                    URM_mappers_dict=dataset.get_URM_mappers_dict(),
                    ICM_dict=dataset.get_ICM_dict(),
                    ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                    UCM_dict=dataset.get_UCM_dict(),
                    UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    train.remove_users(users_to_remove)

    test = Dataset(dataset.get_name(),
                   base_folder=dataset.get_base_folder(),
                   postprocessings=dataset.get_postprocessings(),
                   URM_dict=URM_test,
                   URM_mappers_dict=dataset.get_URM_mappers_dict(),
                   ICM_dict=dataset.get_ICM_dict(),
                   ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                   UCM_dict=dataset.get_UCM_dict(),
                   UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    test.remove_users(users_to_remove)

    if self.with_validation:
        validation = Dataset(dataset.get_name(),
                             base_folder=dataset.get_base_folder(),
                             postprocessings=dataset.get_postprocessings(),
                             URM_dict=URM_validation,
                             URM_mappers_dict=dataset.get_URM_mappers_dict(),
                             ICM_dict=dataset.get_ICM_dict(),
                             ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                             UCM_dict=dataset.get_UCM_dict(),
                             UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        validation.remove_users(users_to_remove)

        return train, test, validation
    else:
        return train, test
def split(self, dataset):

    super(Holdout, self).split(dataset)

    URM = sps.csr_matrix(dataset.get_URM())
    n_users, n_items = dataset.n_users, dataset.n_items

    user_indices = []
    URM_train, URM_test, URM_validation = {}, {}, {}

    # Select a priori how to randomly split every user's interactions
    users_to_remove = []
    for user_id in range(n_users):
        assignment = np.random.choice(3, URM.indptr[user_id + 1] - URM.indptr[user_id], replace=True,
                                      p=[self.train_perc, self.validation_perc, self.test_perc])
        assignments = [assignment == i for i in range(3)]

        #if assignments[2].sum() <= 0:  # No interactions in test
        #    users_to_remove.append(user_id)

        #if self.with_validation and assignments[1].sum() <= 0:  # No interactions in validation
        #    users_to_remove.append(user_id)

        if not self.allow_cold_users and assignments[0].sum() <= 0:  # No interactions in train
            users_to_remove.append(user_id)

        user_indices.append(assignments)

    for URM_name in dataset.get_URM_names():

        URM = dataset.get_URM(URM_name)
        URM = sps.csr_matrix(URM)

        URM_train_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                    auto_create_col_mapper=False, n_cols=n_items)
        URM_test_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                   auto_create_col_mapper=False, n_cols=n_items)
        if self.with_validation:
            URM_validation_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                             auto_create_col_mapper=False, n_cols=n_items)

        users_to_remove_index = 0
        for user_id in range(n_users):

            if users_to_remove_index < len(users_to_remove) and user_id == users_to_remove[users_to_remove_index]:
                users_to_remove_index += 1
                continue

            indices = user_indices[user_id]

            start_user_position = URM.indptr[user_id]
            end_user_position = URM.indptr[user_id + 1]

            user_interaction_items = URM.indices[start_user_position:end_user_position]
            user_interaction_data = URM.data[start_user_position:end_user_position]

            # Test interactions
            user_interaction_items_test = user_interaction_items[indices[2]]
            user_interaction_data_test = user_interaction_data[indices[2]]

            mask = user_interaction_data_test > self.test_rating_threshold
            user_interaction_items_test = user_interaction_items_test[mask]
            user_interaction_data_test = user_interaction_data_test[mask]

            URM_test_builder.add_data_lists([user_id] * len(user_interaction_data_test),
                                            user_interaction_items_test,
                                            user_interaction_data_test)

            # Validation interactions
            if self.with_validation:
                user_interaction_items_validation = user_interaction_items[indices[1]]
                user_interaction_data_validation = user_interaction_data[indices[1]]

                # Remove from validation interactions below a given threshold
                mask = user_interaction_data_validation > self.test_rating_threshold
                user_interaction_items_validation = user_interaction_items_validation[mask]
                user_interaction_data_validation = user_interaction_data_validation[mask]

                URM_validation_builder.add_data_lists([user_id] * len(user_interaction_data_validation),
                                                      user_interaction_items_validation,
                                                      user_interaction_data_validation)

            # Train interactions
            user_interaction_items_train = user_interaction_items[indices[0]]
            user_interaction_data_train = user_interaction_data[indices[0]]

            URM_train_builder.add_data_lists([user_id] * len(user_interaction_items_train),
                                             user_interaction_items_train,
                                             user_interaction_data_train)

        URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
        URM_test[URM_name] = URM_test_builder.get_SparseMatrix()
        if self.with_validation:
            URM_validation[URM_name] = URM_validation_builder.get_SparseMatrix()

    train = Dataset(dataset.get_name(),
                    base_folder=dataset.get_base_folder(),
                    postprocessings=dataset.get_postprocessings(),
                    URM_dict=URM_train,
                    URM_mappers_dict=dataset.get_URM_mappers_dict(),
                    ICM_dict=dataset.get_ICM_dict(),
                    ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                    UCM_dict=dataset.get_UCM_dict(),
                    UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    train.remove_users(users_to_remove)

    test = Dataset(dataset.get_name(),
                   base_folder=dataset.get_base_folder(),
                   postprocessings=dataset.get_postprocessings(),
                   URM_dict=URM_test,
                   URM_mappers_dict=dataset.get_URM_mappers_dict(),
                   ICM_dict=dataset.get_ICM_dict(),
                   ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                   UCM_dict=dataset.get_UCM_dict(),
                   UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    test.remove_users(users_to_remove)

    if self.with_validation:
        validation = Dataset(dataset.get_name(),
                             base_folder=dataset.get_base_folder(),
                             postprocessings=dataset.get_postprocessings(),
                             URM_dict=URM_validation,
                             URM_mappers_dict=dataset.get_URM_mappers_dict(),
                             ICM_dict=dataset.get_ICM_dict(),
                             ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                             UCM_dict=dataset.get_UCM_dict(),
                             UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        validation.remove_users(users_to_remove)

        return train, test, validation
    else:
        return train, test
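# Standalone illustration of the per-interaction assignment used in Holdout.split
# above: each nonzero entry of a user profile is independently routed to train (0),
# validation (1) or test (2) with the configured probabilities.  The percentages
# and the number of interactions are made up for the example.
import numpy as np

train_perc, validation_perc, test_perc = 0.6, 0.2, 0.2
n_user_interactions = 10

assignment = np.random.choice(3, n_user_interactions, replace=True,
                              p=[train_perc, validation_perc, test_perc])
assignments = [assignment == i for i in range(3)]  # one boolean mask per split
print(assignment, [mask.sum() for mask in assignments])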
def _load_from_original_file(self): print("NetflixEnhancedReader: Loading original data") compressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER decompressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile(compressed_zip_file_folder + "NetflixEnhancedData.zip") URM_matfile_path = dataFile.extract( "urm.mat", path=decompressed_zip_file_folder + "decompressed/") titles_matfile_path = dataFile.extract( "titles.mat", path=decompressed_zip_file_folder + "decompressed/") ICM_matfile_path = dataFile.extract( "icm.mat", path=decompressed_zip_file_folder + "decompressed/") except (FileNotFoundError, zipfile.BadZipFile): print( "NetflixPrizeReader: Unable to find or extract data zip file.") print( "NetflixPrizeReader: Automatic download not available, please ensure the ZIP data file is in folder {}." .format(compressed_zip_file_folder)) print("NetflixPrizeReader: Data can be downloaded here: {}".format( self.DATASET_URL)) # If directory does not exist, create if not os.path.exists(compressed_zip_file_folder): os.makedirs(compressed_zip_file_folder) raise FileNotFoundError("Automatic download not available.") URM_matfile = sio.loadmat(URM_matfile_path) URM_all = URM_matfile["urm"] usercache_urm = URM_matfile["usercache_urm"] itemcache_urm = URM_matfile["itemcache_urm"] user_mapper = {} item_mapper = {} for item_id in range(URM_all.shape[1]): item_mapper[item_id] = item_id for user_id in range(URM_all.shape[0]): user_mapper[user_id] = user_id titles_matfile = sio.loadmat(titles_matfile_path) titles_list = titles_matfile["titles"] ICM_matfile = sio.loadmat(ICM_matfile_path) ICM_all = ICM_matfile["icm"] ICM_all = sps.csr_matrix(ICM_all.T) ICM_dictionary = ICM_matfile["dictionary"] itemcache_icm = ICM_matfile["itemcache_icm"] stemTypes = ICM_dictionary["stemTypes"][0][0] stems = ICM_dictionary["stems"][0][0] # Split ICM_tags and ICM_editorial is_tag_mask = np.zeros((len(stems)), dtype=np.bool) ICM_all_mapper, ICM_tags_mapper, ICM_editorial_mapper = {}, {}, {} for current_stem_index in range(len(stems)): current_stem_type = stemTypes[current_stem_index] current_stem_type_string = current_stem_type[0][0] token = stems[current_stem_index][0][0] if token in ICM_all_mapper: print( "Duplicate token {} alredy existent in position {}".format( token, ICM_all_mapper[token])) else: ICM_all_mapper[token] = current_stem_index if "KeywordsArray" in current_stem_type_string: is_tag_mask[current_stem_index] = True ICM_tags_mapper[token] = len(ICM_tags_mapper) else: ICM_editorial_mapper[token] = len(ICM_editorial_mapper) ICM_tags = ICM_all[:, is_tag_mask] is_editorial_mask = np.logical_not(is_tag_mask) ICM_editorial = ICM_all[:, is_editorial_mask] # Remove features taking into account the filtered ICM ICM_all, _, ICM_all_mapper = removeFeatures( ICM_all, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=ICM_all_mapper) ICM_tags, _, ICM_tags_mapper = removeFeatures( ICM_tags, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=ICM_tags_mapper) ICM_editorial, _, ICM_editorial_mapper = removeFeatures( ICM_editorial, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=ICM_editorial_mapper) print("NetflixEnhancedReader: cleaning temporary files") import shutil shutil.rmtree(decompressed_zip_file_folder + "decompressed", ignore_errors=True) print("NetflixEnhancedReader: loading complete") return Dataset(self.get_dataset_name(), URM_dict={"URM_all": URM_all}, URM_mappers_dict={ "URM_all": (user_mapper.copy(), 
item_mapper.copy()) }, ICM_dict={ "ICM_editorial": ICM_editorial, "ICM_tags": ICM_tags, "ICM_all": ICM_all }, ICM_mappers_dict={ "ICM_editorial": (item_mapper.copy(), ICM_editorial_mapper.copy()), "ICM_tags": (item_mapper.copy(), ICM_tags_mapper.copy()), "ICM_all": (item_mapper.copy(), ICM_all_mapper.copy()) })
def split(self, dataset):

    super(LeaveKOut, self).split(dataset)

    URM = sps.csr_matrix(dataset.get_URM())
    URM.sort_indices()

    split_number = 2
    if self.with_validation:
        split_number += 1

    # Min interactions at least self.k_value for each split +1 for train
    min_user_interactions = split_number * (self.k_value - 1) + 1

    users_to_preserve = np.arange(URM.shape[0])
    if not self.allow_cold_users:
        urm_threshold = URM.copy()
        urm_threshold.data[urm_threshold.data <= self.test_rating_threshold] = 0
        urm_threshold.eliminate_zeros()

        user_interactions = np.ediff1d(urm_threshold.tocsr().indptr)
        users_to_preserve = users_to_preserve[user_interactions >= min_user_interactions]

        print("DataSplitterLeaveKOut: Removing {} of {} users because they have less than the {} interactions required for {} splits"
              .format(URM.shape[0] - len(users_to_preserve), URM.shape[0], min_user_interactions, split_number))

    users_to_remove = np.setdiff1d(np.arange(URM.shape[0]), users_to_preserve)
    n_users, n_items = URM.shape

    user_indices = []
    URM_train, URM_test, URM_validation = {}, {}, {}

    # Select a priori how to randomly split every user's interactions
    for user_id in users_to_preserve.tolist():

        user_profile = URM.data[URM.indptr[user_id]:URM.indptr[user_id + 1]] > self.test_rating_threshold
        test_and_val = np.random.permutation(np.arange(URM.indptr[user_id + 1] - URM.indptr[user_id])[user_profile])

        limit = self.k_value
        if self.with_validation:
            limit = self.k_value * 2

        # Train, Test and Validation
        user_indices.append((np.setdiff1d(np.arange(len(user_profile)), test_and_val[:limit]),
                             test_and_val[:self.k_value],
                             test_and_val[self.k_value:limit]))

    for URM_name in dataset.get_URM_names():

        URM = dataset.get_URM(URM_name).tocsr()
        URM.sort_indices()

        URM_train_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                    auto_create_col_mapper=False, n_cols=n_items)
        URM_test_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                   auto_create_col_mapper=False, n_cols=n_items)
        if self.with_validation:
            URM_validation_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                             auto_create_col_mapper=False, n_cols=n_items)

        for i, user_id in enumerate(users_to_preserve.tolist()):

            start_user_position = URM.indptr[user_id]
            end_user_position = URM.indptr[user_id + 1]

            indices = user_indices[i]

            user_interaction_items = URM.indices[start_user_position:end_user_position]
            user_interaction_data = URM.data[start_user_position:end_user_position]

            # Test interactions
            user_interaction_items_test = user_interaction_items[indices[1]]
            user_interaction_data_test = user_interaction_data[indices[1]]

            URM_test_builder.add_data_lists([user_id] * self.k_value,
                                            user_interaction_items_test,
                                            user_interaction_data_test)

            train_start = self.k_value

            # Validation interactions
            if self.with_validation:
                user_interaction_items_validation = user_interaction_items[indices[2]]
                user_interaction_data_validation = user_interaction_data[indices[2]]

                URM_validation_builder.add_data_lists([user_id] * self.k_value,
                                                      user_interaction_items_validation,
                                                      user_interaction_data_validation)

                train_start = self.k_value * 2

            # Train interactions
            user_interaction_items_train = user_interaction_items[indices[0]]
            user_interaction_data_train = user_interaction_data[indices[0]]

            URM_train_builder.add_data_lists([user_id] * len(user_interaction_items_train),
                                             user_interaction_items_train,
                                             user_interaction_data_train)

        URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
        URM_test[URM_name] = URM_test_builder.get_SparseMatrix()
        if self.with_validation:
            URM_validation[URM_name] = URM_validation_builder.get_SparseMatrix()

    train = Dataset(dataset.get_name(),
                    base_folder=dataset.get_base_folder(),
                    postprocessings=dataset.get_postprocessings(),
                    URM_dict=URM_train,
                    URM_mappers_dict=dataset.get_URM_mappers_dict(),
                    ICM_dict=dataset.get_ICM_dict(),
                    ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                    UCM_dict=dataset.get_UCM_dict(),
                    UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    train.remove_users(users_to_remove)

    test = Dataset(dataset.get_name(),
                   base_folder=dataset.get_base_folder(),
                   postprocessings=dataset.get_postprocessings(),
                   URM_dict=URM_test,
                   URM_mappers_dict=dataset.get_URM_mappers_dict(),
                   ICM_dict=dataset.get_ICM_dict(),
                   ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                   UCM_dict=dataset.get_UCM_dict(),
                   UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    test.remove_users(users_to_remove)

    if self.with_validation:
        validation = Dataset(dataset.get_name(),
                             base_folder=dataset.get_base_folder(),
                             postprocessings=dataset.get_postprocessings(),
                             URM_dict=URM_validation,
                             URM_mappers_dict=dataset.get_URM_mappers_dict(),
                             ICM_dict=dataset.get_ICM_dict(),
                             ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                             UCM_dict=dataset.get_UCM_dict(),
                             UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        validation.remove_users(users_to_remove)

        return train, test, validation
    else:
        return train, test
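# Standalone illustration of the per-user index selection used in LeaveKOut.split
# above (with validation enabled): shuffle the eligible positions, take the first
# k as test, the next k as validation, and everything else as train.  The values
# of k_value and n_user_interactions are made up for the example.
import numpy as np

k_value = 2
n_user_interactions = 7

eligible = np.arange(n_user_interactions)      # positions above the rating threshold
test_and_val = np.random.permutation(eligible)
limit = k_value * 2                            # k for test + k for validation

train_idx = np.setdiff1d(np.arange(n_user_interactions), test_and_val[:limit])
test_idx = test_and_val[:k_value]
val_idx = test_and_val[k_value:limit]
print(train_idx, test_idx, val_idx)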