def split_train_validation_percentage_random_holdout(URM_train, train_percentage=0.8):

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

    URM_train = sps.coo_matrix(URM_train)

    train_mask = np.random.rand(URM_train.nnz) <= train_percentage
    validation_mask = np.logical_not(train_mask)

    URM_train_builder.add_data_lists(URM_train.row[train_mask], URM_train.col[train_mask], URM_train.data[train_mask])
    URM_validation_builder.add_data_lists(URM_train.row[validation_mask], URM_train.col[validation_mask], URM_train.data[validation_mask])

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
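# Usage sketch for the random holdout above (illustrative only: the _demo_* helper
# name is not part of the original module, and module-level np/sps imports are assumed):
def _demo_random_holdout():
    import scipy.sparse as sps

    URM_all = sps.random(100, 200, density=0.05, format="csr", random_state=42)

    URM_train, URM_validation = split_train_validation_percentage_random_holdout(URM_all, train_percentage=0.8)

    # Every interaction lands in exactly one of the two splits
    assert URM_train.nnz + URM_validation.nnz == URM_all.nnz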
def split_train_validation_percentage_user_wise(URM_train, train_percentage=0.1, verbose=True):

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # Ensure CSR format, otherwise the indptr/indices slicing below fails
    URM_train = URM_train.tocsr()

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

    user_no_item_train = 0
    user_no_item_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = round(user_profile_length * train_percentage)

        # Always leave at least one interaction for the validation split
        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose:
                print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose:
                print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id] * len(validation_items), validation_items, validation_ratings)

    if user_no_item_train != 0:
        print("Warning split: {} users with 0 train items ({} total users)".format(user_no_item_train, URM_train.shape[0]))
    if user_no_item_validation != 0:
        print("Warning split: {} users with 0 validation items ({} total users)".format(user_no_item_validation, URM_train.shape[0]))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
def split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.1):
    """
    The function splits an URM in two matrices, selecting the number of interactions globally
    :param URM_all:
    :param train_percentage:
    :return:
    """

    assert train_percentage >= 0.0 and train_percentage <= 1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)

    URM_train = sps.coo_matrix(URM_all)

    indices_for_sampling = np.arange(0, URM_all.nnz, dtype=int)
    np.random.shuffle(indices_for_sampling)

    n_train_interactions = round(URM_all.nnz * train_percentage)

    # The shuffled index array is already a random permutation: slice it directly.
    # Indexing it with itself would select a biased, possibly duplicated subset.
    indices_for_train = indices_for_sampling[0:n_train_interactions]
    indices_for_validation = indices_for_sampling[n_train_interactions:]

    URM_train_builder.add_data_lists(URM_train.row[indices_for_train], URM_train.col[indices_for_train], URM_train.data[indices_for_train])
    URM_validation_builder.add_data_lists(URM_train.row[indices_for_validation], URM_train.col[indices_for_validation], URM_train.data[indices_for_validation])

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    URM_validation = sps.csr_matrix(URM_validation)

    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(user_no_item_train, user_no_item_train / num_users * 100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(user_no_item_validation, user_no_item_validation / num_users * 100, num_users))

    return URM_train, URM_validation
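# Standalone sketch of the empty-user check used above (the helper name is
# hypothetical; assumes a CSR matrix): np.ediff1d over indptr yields each row's
# interaction count, so zeros mark users with no items.
def _count_users_with_no_items(URM_csr):
    import numpy as np

    interactions_per_user = np.ediff1d(URM_csr.indptr)
    return int(np.sum(interactions_per_user == 0))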
def split_train_validation_leave_one_out_user_wise(URM_train, verbose=True, at_least_n_train_items=0):

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

    count_train = 0
    count_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        # Hold one interaction out, but only if the user keeps more than
        # at_least_n_train_items interactions in the train split
        n_train_items = user_profile_length
        if n_train_items > at_least_n_train_items:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose:
                print("User {} has 0 train items".format(user_id))
            count_train += 1

        if len(validation_items) == 0:
            if verbose:
                print("User {} has 0 validation items".format(user_id))
            count_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id] * len(validation_items), validation_items, validation_ratings)

    if count_train > 0:
        print("{} users with 0 train items".format(count_train))
    if count_validation > 0:
        print("{} users with 0 validation items".format(count_validation))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
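# Usage sketch for the leave-one-out split above (illustrative only; _demo_* is a
# hypothetical name): each user with more than at_least_n_train_items interactions
# contributes exactly one interaction to the validation split.
def _demo_leave_one_out():
    import scipy.sparse as sps

    URM_all = sps.random(100, 200, density=0.10, format="csr", random_state=42)

    URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_all, verbose=False)

    # At most one held-out item per user
    assert URM_validation.tocsr().nnz <= URM_all.shape[0]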
def __init__(self, path):
    '''
    Constructor
    '''
    trainMatrix = self.load_rating_file_as_matrix(path + ".train.rating")
    testRatings = self.load_rating_file_as_matrix(path + ".test.rating")
    testNegatives = self.load_negative_file(path + ".test.negative")
    assert len(testRatings) == len(testNegatives)

    self.num_users, self.num_items = trainMatrix.shape

    from Base.Recommender_utils import reshapeSparse

    self.URM_train = trainMatrix.tocsr()
    self.URM_test = testRatings.tocsr()

    shape = (max(self.URM_train.shape[0], self.URM_test.shape[0]),
             max(self.URM_train.shape[1], self.URM_test.shape[1]))

    self.URM_train = reshapeSparse(self.URM_train, shape)
    self.URM_test = reshapeSparse(self.URM_test, shape)

    URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=shape[0], n_cols=shape[1])

    for user_index in range(len(testNegatives)):
        user_test_items = testNegatives[user_index]
        URM_test_negatives_builder.add_single_row(user_index, user_test_items, data=1.0)

    self.URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
def test_IncrementalSparseMatrix_add_rows(self):

    import numpy as np

    n_rows = 100
    n_cols = 200

    randomMatrix = sps.random(n_rows, n_cols, density=0.01, format="csr")

    incrementalMatrix = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    for row in range(n_rows):
        row_data = randomMatrix.indices[randomMatrix.indptr[row]:randomMatrix.indptr[row + 1]]
        incrementalMatrix.add_single_row(row, row_data, 5.0)

    randomMatrix.data = np.ones_like(randomMatrix.data) * 5.0

    randomMatrix_incremental = incrementalMatrix.get_SparseMatrix()

    assert sparse_are_equals(randomMatrix, randomMatrix_incremental)
def split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=50, verbose=True,
                                                                 at_least_n_train_items_test=0, at_least_n_train_items_validation=0):
    """
    This function creates a Train, Test, Validation split with negative items sampled.
    The split is performed user-wise, holding one interaction out for validation and one for test
    :param URM_all:
    :param negative_items_per_positive:
    :return:
    """

    URM_all = sps.csr_matrix(URM_all)
    n_rows, n_cols = URM_all.shape

    print('Creation test...')
    URM_train_all, URM_test = split_train_validation_leave_one_out_user_wise(URM_all, at_least_n_train_items=at_least_n_train_items_test, verbose=verbose)

    print('Creation validation...')
    URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train_all, at_least_n_train_items=at_least_n_train_items_validation, verbose=verbose)

    URM_negative_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    all_items = np.arange(0, n_cols, dtype=int)

    for user_index in range(URM_train_all.shape[0]):

        if user_index % 10000 == 0:
            print("split_data_train_validation_test_negative: user {} of {}".format(user_index, URM_all.shape[0]))

        start_pos = URM_all.indptr[user_index]
        end_pos = URM_all.indptr[user_index + 1]

        user_profile = URM_all.indices[start_pos:end_pos]

        # Negatives are the items the user never interacted with in the full URM
        unobserved_index = np.in1d(all_items, user_profile, assume_unique=True, invert=True)

        unobserved_items = all_items[unobserved_index]
        np.random.shuffle(unobserved_items)

        n_test_items = URM_test.indptr[user_index + 1] - URM_test.indptr[user_index]

        num_negative_items = n_test_items * negative_items_per_positive

        if num_negative_items > len(unobserved_items):
            print("split_data_train_validation_test_negative: WARNING number of negatives to sample for user {} is greater than the available negative items {}".format(num_negative_items, len(unobserved_items)))

        num_negative_items = min(num_negative_items, len(unobserved_items))

        URM_negative_builder.add_single_row(user_index, unobserved_items[:num_negative_items], 1.0)

    URM_negative = URM_negative_builder.get_SparseMatrix()

    return URM_train, URM_validation, URM_test, URM_negative
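# The negative sampling step above as a standalone sketch (toy values; the helper
# name is hypothetical): negatives are drawn uniformly from the items the user
# never interacted with in the full URM.
def _demo_sample_negatives():
    import numpy as np

    all_items = np.arange(10)
    user_profile = np.array([1, 4, 7])  # observed items

    unobserved_items = all_items[np.in1d(all_items, user_profile, assume_unique=True, invert=True)]
    np.random.shuffle(unobserved_items)

    negatives = unobserved_items[:5]  # 5 sampled negative items
    assert not np.any(np.in1d(negatives, user_profile))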
def split_data_train_validation_test_negative_user_wise(URM_all, negative_items_per_positive=50):
    """
    This function creates a Train, Test, Validation split with negative items sampled.
    The split is performed user-wise: 20% is test, 80% is train. Train is further divided
    into 90% final train and 10% validation
    :param URM_all:
    :param negative_items_per_positive:
    :return:
    """

    URM_all = sps.csr_matrix(URM_all)
    n_rows, n_cols = URM_all.shape

    URM_train_all, URM_test = split_train_validation_percentage_user_wise(URM_all, train_percentage=0.8)
    URM_train, URM_validation = split_train_validation_percentage_user_wise(URM_train_all, train_percentage=0.9)

    URM_negative_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    all_items = np.arange(0, n_cols, dtype=int)

    for user_index in range(URM_train_all.shape[0]):

        if user_index % 10000 == 0:
            print("split_data_train_validation_test_negative: user {} of {}".format(user_index, URM_all.shape[0]))

        start_pos = URM_all.indptr[user_index]
        end_pos = URM_all.indptr[user_index + 1]

        user_profile = URM_all.indices[start_pos:end_pos]

        unobserved_index = np.in1d(all_items, user_profile, assume_unique=True, invert=True)

        unobserved_items = all_items[unobserved_index]
        np.random.shuffle(unobserved_items)

        n_test_items = URM_test.indptr[user_index + 1] - URM_test.indptr[user_index]

        num_negative_items = n_test_items * negative_items_per_positive

        if num_negative_items > len(unobserved_items):
            print("split_data_train_validation_test_negative: WARNING number of negatives to sample for user {} is greater than the available negative items {}".format(num_negative_items, len(unobserved_items)))

        num_negative_items = min(num_negative_items, len(unobserved_items))

        URM_negative_builder.add_single_row(user_index, unobserved_items[:num_negative_items], 1.0)

    URM_negative = URM_negative_builder.get_SparseMatrix()

    return URM_train, URM_validation, URM_test, URM_negative
def load_CSV_into_SparseBuilder(filePath, header=False, separator="::"):

    matrixBuilder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

    fileHandle = open(filePath, "r")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 1000000 == 0:
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)

            line[-1] = line[-1].replace("\n", "")

            try:
                user_id = line[0]
                item_id = line[1]

                try:
                    value = float(line[2])

                    if value != 0.0:
                        matrixBuilder.add_data_lists([user_id], [item_id], [value])

                except ValueError:
                    print("load_CSV_into_SparseBuilder: Cannot parse as float value '{}'".format(line[2]))

            except IndexError:
                print("load_CSV_into_SparseBuilder: Index out of bound in line '{}'".format(line))

    fileHandle.close()

    return matrixBuilder.get_SparseMatrix(), matrixBuilder.get_column_token_to_id_mapper(), matrixBuilder.get_row_token_to_id_mapper()
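# Usage sketch for the loader above (illustrative only; _demo_* and the file name are
# hypothetical): writes a tiny "user::item::rating" file and loads it back. The function
# returns the URM followed by the item (column) and user (row) token-to-index mappers.
def _demo_load_CSV():
    import os

    with open("_demo_ratings.csv", "w") as f:
        f.write("1::10::5.0\n2::10::3.0\n2::20::4.0\n")

    URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder("_demo_ratings.csv", separator="::")

    assert URM_all.nnz == 3  # one cell per parsed (user, item, rating) row
    os.remove("_demo_ratings.csv")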
def test_IncrementalSparseMatrix_add_lists(self):

    n_rows = 100
    n_cols = 200

    randomMatrix = sps.random(n_rows, n_cols, density=0.01, format="coo")

    incrementalMatrix = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    incrementalMatrix.add_data_lists(randomMatrix.row.copy(),
                                     randomMatrix.col.copy(),
                                     randomMatrix.data.copy())

    randomMatrix_incremental = incrementalMatrix.get_SparseMatrix()

    assert sparse_are_equals(randomMatrix, randomMatrix_incremental)
def split_train_validation_cold_start_user_wise(URM_train, full_train_percentage=0.0, cold_items=1, verbose=True):

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # Ensure CSR format, otherwise the indptr/indices slicing below fails
    URM_train = URM_train.tocsr()

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

    user_no_item_train = 0
    user_no_item_validation = 0

    # If we split twice (train-test and then train-validation) we could get users with no
    # items in the second split. To obtain a good test set with enough non-empty users,
    # sample the random users among those with more than <cold_items> interactions
    nnz_per_row = URM_train.getnnz(axis=1)
    users_enough_items = np.where(nnz_per_row > cold_items)[0]
    users_not_enough_items = np.where(nnz_per_row <= cold_items)[0]

    np.random.shuffle(users_enough_items)

    n_train_users = round(len(users_enough_items) * full_train_percentage)

    print("Users with enough items: {}".format(len(users_enough_items)))
    print("Users without enough items: {}".format(len(users_not_enough_items)))

    # Create the full-train part, without cold start
    for user_id in np.concatenate((users_enough_items[0:n_train_users], users_not_enough_items), axis=0):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        URM_train_builder.add_data_lists([user_id] * user_profile_length, user_profile_items, user_profile_ratings)

    # Create test + train for the cold-start users
    for user_id in users_enough_items[n_train_users:]:

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = min(cold_items, user_profile_length)

        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose:
                print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose:
                print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id] * len(validation_items), validation_items, validation_ratings)

    if user_no_item_train != 0:
        print("Warning split: {} users with 0 train items ({} total users)".format(user_no_item_train, URM_train.shape[0]))
    if user_no_item_validation != 0:
        print("Warning split: {} users with 0 validation items ({} total users)".format(user_no_item_validation, URM_train.shape[0]))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
def __init__(self, pre_splitted_path):

    pre_splitted_path += "data_split/"
    pre_splitted_filename = "splitted_data_"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_{}: Attempting to load pre-splitted data".format(self.DATASET_NAME))

        for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Dataset_{}: Pre-splitted data not found, building new one".format(self.DATASET_NAME))

        compressed_file_folder = "Conferences/IJCAI/ConvNCF_github/Data/"
        decompressed_file_folder = "Data_manager_split_datasets/Gowalla/"

        # compressed_file = tarfile.open(compressed_file_folder + "gowalla.test.negative.gz", "r:gz")
        # compressed_file.extract("yelp.test.negative", path=decompressed_file_folder + "decompressed/")
        # compressed_file.close()
        #
        # compressed_file = tarfile.open(compressed_file_folder + "gowalla.test.rating.gz", "r:gz")
        # compressed_file.extract("yelp.test.rating", path=decompressed_file_folder + "decompressed/")
        # compressed_file.close()
        #
        # compressed_file = tarfile.open(compressed_file_folder + "gowalla.train.rating.gz", "r:gz")
        # compressed_file.extract("yelp.train.rating", path=decompressed_file_folder + "decompressed/")
        # compressed_file.close()

        # if original:
        Dataset_github.load_rating_file_as_list = Dataset_github.load_training_file_as_matrix

        try:
            dataset = Dataset_github(compressed_file_folder + "gowalla")

        except FileNotFoundError as exc:

            print("Dataset_{}: Gowalla files not found, please download them and put them in this folder '{}', url: {}".format(self.DATASET_NAME, compressed_file_folder, self.DATASET_URL))
            print("Dataset_{}: Uncompressed files not found, please manually decompress the *.gz files in this folder: '{}'".format(self.DATASET_NAME, compressed_file_folder))

            raise exc

        URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

        n_users = max(URM_train_original.shape[0], URM_test.shape[0])
        n_items = max(URM_train_original.shape[1], URM_test.shape[1])

        URM_train_original = sps.csr_matrix(URM_train_original, shape=(n_users, n_items))
        URM_test = sps.csr_matrix(URM_test, shape=(n_users, n_items))

        URM_train_original.data = np.ones_like(URM_train_original.data)
        URM_test.data = np.ones_like(URM_test.data)

        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=n_users, n_cols=n_items)

        n_negative_samples = 999
        for user_index in range(len(dataset.testNegatives)):

            user_test_items = dataset.testNegatives[user_index]
            if len(user_test_items) != n_negative_samples:
                print("user id: {} has {} negative items instead of {}".format(user_index, len(user_test_items), n_negative_samples))

            URM_test_negatives_builder.add_single_row(user_index, user_test_items, data=1.0)

        URM_test_negative = URM_test_negatives_builder.get_SparseMatrix().tocsr()
        URM_test_negative.data = np.ones_like(URM_test_negative.data)

        URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train_original.copy(), verbose=False)

        # NOT USED
        # elif not time_split:
        #     # Create from the full dataset with random leave-one-out from the LINKED dataset
        #     # in the article, since the timestamp is not present
        #     data_reader = GowallaGithubReader_DataManager()
        #     loaded_dataset = data_reader.load_data()
        #
        #     URM_all = loaded_dataset.get_URM_all()
        #     URM_all.eliminate_zeros()
        #     URM_all.data = np.ones_like(URM_all.data)
        #
        #     # Call this function twice because the order slightly changes the number of final
        #     # interactions; with this order we get the same number of interactions as in the paper
        #     URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=10)
        #     URM_all = filter_urm(URM_all, user_min_number_ratings=2, item_min_number_ratings=0)
        #
        #     URM_train, URM_validation, URM_test, URM_negative = split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=999,
        #                                                                                                                      at_least_n_train_items_test=0, at_least_n_train_items_validation=0,
        #                                                                                                                      verbose=True)
        #     URM_timestamp = sps.csc_matrix(([], ([], [])), shape=URM_train.shape)
        #
        # else:
        #     # Create from the full dataset with time-wise leave-one-out from the ORIGINAL full dataset
        #     data_reader = GowallaReader_DataManager()
        #     loaded_dataset = data_reader.load_data()
        #
        #     URM_all = loaded_dataset.get_URM_all()
        #
        #     # Call this function twice because the order slightly changes the number of final
        #     # interactions; with this order we get the same number of interactions as in the paper
        #     URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=10)
        #     URM_all = filter_urm(URM_all, user_min_number_ratings=2, item_min_number_ratings=0)
        #
        #     URM_timestamp = URM_all.copy()
        #     URM_all.data = np.ones_like(URM_all.data)
        #
        #     URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=999)
        #     URM_train = URM_train + URM_validation
        #     URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

        self.URM_DICT = {
            "URM_train": URM_train,
            "URM_test": URM_test,
            "URM_validation": URM_validation,
            "URM_test_negative": URM_test_negative,
        }

        save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

    print("{}: Dataset loaded".format(self.DATASET_NAME))

    ut.print_stat_datareader(self)
def split_train_in_two_percentage_user_wise(URM_train, train_percentage=0.1, verbose=False):
    """
    The function splits an URM in two matrices, selecting the number of interactions one user at a time
    :param URM_train:
    :param train_percentage:
    :param verbose:
    :return:
    """

    assert train_percentage >= 0.0 and train_percentage <= 1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # Ensure CSR format, otherwise the indptr/indices slicing below fails
    URM_train = URM_train.tocsr()

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)

    user_no_item_train = 0
    user_no_item_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = round(user_profile_length * train_percentage)

        # Always leave at least one interaction for the validation split
        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose:
                print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose:
                print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id] * len(validation_items), validation_items, validation_ratings)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(user_no_item_train, user_no_item_train / num_users * 100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(user_no_item_validation, user_no_item_validation / num_users * 100, num_users))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
def __init__(self, pre_splitted_path, original=True):

    pre_splitted_path += "data_split/"
    pre_splitted_filename = "splitted_data_"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_{}: Attempting to load pre-splitted data".format(self.DATASET_NAME))

        for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Dataset_{}: Pre-splitted data not found, building new one".format(self.DATASET_NAME))

        if original:

            URM_path = 'Conferences/IJCAI/DMF_original/data_www/Amazon_ratings_Digital_Music_pruned.txt'

            # dataFile = open(URM_path, "r")
            # textData = dataFile.readlines()
            # dataFile.close()
            #
            # u_map = {}
            # discarded = 0
            # for line in tqdm(textData):
            #     line = line.split(' ')
            #     u, i, rating, new_time = int(line[0]), int(line[1]), float(line[2]), int(line[3])
            #
            #     # convert u id and i id into integers starting from 0 and initialize u_map
            #     if u not in u_map:
            #         u_map[u] = {}
            #
            #     if i not in u_map[u]:
            #         u_map[u][i] = [rating, new_time]
            #     else:
            #         # rating already exists, keep the most recent timestamp
            #         discarded += 1
            #         current_time = u_map[u][i][1]
            #         if new_time > current_time:
            #             u_map[u][i] = [rating, new_time]
            #
            # print('Merged {} interactions, kept the most recent timestamps'.format(discarded))
            #
            # UTM_builder = IncrementalSparseMatrix()
            # URM_builder = IncrementalSparseMatrix()
            #
            # for u in u_map:
            #     items, ratings, timestamps = [], [], []
            #     for i in u_map[u]:
            #         items.append(i)
            #         timestamps.append(u_map[u][i][1])
            #         ratings.append(u_map[u][i][0])
            #     UTM_builder.add_data_lists(row_list_to_add=np.full(len(items), int(u)), col_list_to_add=items, data_list_to_add=timestamps)
            #     URM_builder.add_data_lists(row_list_to_add=np.full(len(items), int(u)), col_list_to_add=items, data_list_to_add=ratings)

            URM_rating_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)
            URM_timestamp_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

            # URM_duplicate_assert_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

            df_original = pd.read_csv(filepath_or_buffer=URM_path, sep=" ", header=None,
                                      dtype={0: int, 1: int, 2: float, 3: int})

            df_original.columns = ['userId', 'itemId', 'rating', 'timestamp']

            userId_list = df_original['userId'].values
            itemId_list = df_original['itemId'].values
            rating_list = df_original['rating'].values
            timestamp_list = df_original['timestamp'].values

            URM_rating_builder.add_data_lists(userId_list, itemId_list, rating_list)
            URM_timestamp_builder.add_data_lists(userId_list, itemId_list, timestamp_list)

            # URM_duplicate_assert_builder.add_data_lists(userId_list, itemId_list, np.ones_like(rating_list))
            # URM_duplicate_assert = URM_duplicate_assert_builder.get_SparseMatrix()
            #
            # assert np.all(URM_duplicate_assert.data == 1.0), "Duplicates detected"

            # Check if duplicates exist
            num_unique_user_item_ids = df_original.drop_duplicates(['userId', 'itemId'], keep='first', inplace=False).shape[0]
            assert num_unique_user_item_ids == len(userId_list), "Duplicate (user, item) values found"

            URM_timestamp = URM_timestamp_builder.get_SparseMatrix()
            URM_all = URM_rating_builder.get_SparseMatrix()

            URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=99)

            # We want the validation to be sampled at random, not as the last interaction
            URM_train = URM_train + URM_validation
            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

        else:

            # Create from the full dataset with time-wise leave-one-out from the ORIGINAL full dataset
            data_reader = AmazonMusicReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_from_name("URM_all")
            URM_timestamp = loaded_dataset.get_URM_from_name("URM_timestamp")

            # Call this function twice because the order slightly changes the number of final interactions
            URM_all = filter_urm(URM_all, user_min_number_ratings=1, item_min_number_ratings=5)
            URM_all = filter_urm(URM_all, user_min_number_ratings=20, item_min_number_ratings=1)

            URM_timestamp = filter_urm(URM_timestamp, user_min_number_ratings=1, item_min_number_ratings=5)
            URM_timestamp = filter_urm(URM_timestamp, user_min_number_ratings=20, item_min_number_ratings=1)

            URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=99)

            # We want the validation to be sampled at random, not as the last interaction
            URM_train = URM_train + URM_validation
            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

        self.URM_DICT = {
            "URM_train": URM_train,
            "URM_test": URM_test,
            "URM_validation": URM_validation,
            "URM_test_negative": URM_test_negative,
            "URM_timestamp": URM_timestamp,
        }

        save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

    print("{}: Dataset loaded".format(self.DATASET_NAME))

    print_stat_datareader(self)
def __init__(self):

    test_percentage = 0.2
    validation_percentage = 0.2

    pre_splitted_path = "Data_manager_split_datasets/AmazonInstantVideo/RecSys/SpectralCF_our_interface/"
    pre_splitted_filename = "splitted_data"

    ratings_file_name = "ratings_Amazon_Instant_Video.csv"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_AmazonInstantVideo: Attempting to load pre-splitted data")

        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one")

        folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

        downloadFromURL(self.DATASET_URL, folder_path, ratings_file_name)

        # Read Amazon Instant Video
        df = pd.read_csv(folder_path + ratings_file_name, sep=',', header=None,
                         names=['user', 'item', 'rating', 'timestamp'])[['user', 'item', 'rating']]

        URM_train_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)
        URM_train_builder.add_data_lists(df['user'].values, df['item'].values, df['rating'].values)

        URM_all = URM_train_builder.get_SparseMatrix()

        # Keep only ratings == 5
        URM_all.data = URM_all.data == 5
        URM_all.eliminate_zeros()

        # Keep only users with at least 5 ratings
        URM_all = ut.filter_urm(URM_all, user_min_number_ratings=5, item_min_number_ratings=1)

        # Create train - test - validation
        URM_train_original, self.URM_test = split_train_validation_percentage_user_wise(URM_all, train_percentage=1 - test_percentage, verbose=False)

        self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(URM_train_original, train_percentage=1 - validation_percentage, verbose=False)

        data_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Dataset_AmazonInstantVideo: Dataset loaded")

    ut.print_stat_datareader(self)
def __init__(self, pre_splitted_path, original=True):

    pre_splitted_path += "data_split/"
    pre_splitted_filename = "splitted_data_"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_{}: Attempting to load pre-splitted data".format(self.DATASET_NAME))

        for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Dataset_{}: Pre-splitted data not found, building new one".format(self.DATASET_NAME))

        compressed_file_folder = "Conferences/IJCAI/ConvNCF_github/Data/"
        decompressed_file_folder = "Data_manager_split_datasets/Yelp/"

        # compressed_file = tarfile.open(compressed_file_folder + "yelp.test.negative.gz", "r:gz")
        # compressed_file.extract("yelp.test.negative", path=decompressed_file_folder + "decompressed/")
        # compressed_file.close()
        #
        # compressed_file = tarfile.open(compressed_file_folder + "yelp.test.rating.gz", "r:gz")
        # compressed_file.extract("yelp.test.rating", path=decompressed_file_folder + "decompressed/")
        # compressed_file.close()
        #
        # compressed_file = tarfile.open(compressed_file_folder + "yelp.train.rating.gz", "r:gz")
        # compressed_file.extract("yelp.train.rating", path=decompressed_file_folder + "decompressed/")
        # compressed_file.close()

        # if original:
        Dataset_github.load_rating_file_as_list = Dataset_github.load_training_file_as_matrix

        try:
            dataset = Dataset_github(compressed_file_folder + "yelp")

        except FileNotFoundError as exc:
            print("Dataset_{}: Uncompressed files not found, please manually decompress the *.gz files in this folder: '{}'".format(self.DATASET_NAME, compressed_file_folder))
            raise exc

        URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

        n_users = max(URM_train_original.shape[0], URM_test.shape[0])
        n_items = max(URM_train_original.shape[1], URM_test.shape[1])

        URM_train_original = sps.csr_matrix(URM_train_original, shape=(n_users, n_items))
        URM_test = sps.csr_matrix(URM_test, shape=(n_users, n_items))

        URM_train_original.data = np.ones_like(URM_train_original.data)
        URM_test.data = np.ones_like(URM_test.data)

        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=n_users, n_cols=n_items)

        n_negative_samples = 999
        for user_index in range(len(dataset.testNegatives)):

            user_test_items = dataset.testNegatives[user_index]
            if len(user_test_items) != n_negative_samples:
                print("user id: {} has {} negative items instead of {}".format(user_index, len(user_test_items), n_negative_samples))

            URM_test_negatives_builder.add_single_row(user_index, user_test_items, data=1.0)

        URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
        URM_test_negative.data = np.ones_like(URM_test_negative.data)

        URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train_original.copy(), verbose=False)

        # else:
        #     data_reader = YelpReader_DataManager()
        #     loaded_dataset = data_reader.load_data()
        #
        #     URM_all = loaded_dataset.get_URM_all()
        #     URM_timestamp = URM_all.copy()
        #     URM_all.data = np.ones_like(URM_all.data)
        #
        #     URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=999)
        #     URM_train = URM_train + URM_validation
        #     URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

        shutil.rmtree(decompressed_file_folder + "decompressed/", ignore_errors=True)

        self.URM_DICT = {
            "URM_train": URM_train,
            "URM_test": URM_test,
            "URM_validation": URM_validation,
            "URM_test_negative": URM_test_negative,
        }

        save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

    print("{}: Dataset loaded".format(self.DATASET_NAME))

    ut.print_stat_datareader(self)
def __init__(self, pre_splitted_path, type='original'):

    assert type in ["original", "ours"]

    pre_splitted_path += "data_split/"
    pre_splitted_filename = "splitted_data_"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_{}: Attempting to load pre-splitted data".format(self.DATASET_NAME))

        for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Dataset_{}: Pre-splitted data not found, building new one".format(self.DATASET_NAME))

        from Conferences.IJCAI.CoupledCF_original import LoadTafengDataCnn as DatareaderOriginal

        path = "Conferences/IJCAI/CoupledCF_original/tafeng/"

        n_users, user_attributes_mat = DatareaderOriginal.load_user_attributes(path=path)
        n_items, items_genres_mat = DatareaderOriginal.load_itemGenres_as_matrix(path=path)
        ratings = DatareaderOriginal.load_rating_train_as_matrix(path=path)

        testRatings = DatareaderOriginal.load_rating_file_as_list(path=path)
        testNegatives = DatareaderOriginal.load_negative_file(path=path)

        URM_all = ratings.tocsr()

        UCM_all = sps.csc_matrix(user_attributes_mat)
        UCM_age = UCM_all[:, 0:11].tocsr()
        UCM_region = UCM_all[:, 11:19].tocsr()
        UCM_all = UCM_all.tocsr()

        # col: 0 -> category, 2 -> asset (0-1), 1 -> price (0-1)
        ICM_original = sps.csc_matrix(items_genres_mat)

        # The category could be used as a matrix, not a single column
        ICM_sub_class = ICM_original[:, 0:1].tocsr()

        n_icm_rows = ICM_sub_class.shape[0]

        rows, cols, data = [], [], []

        for idx in range(n_icm_rows):

            # We only have index 0 as column
            data_vect = ICM_sub_class.data[ICM_sub_class.indptr[idx]:ICM_sub_class.indptr[idx + 1]]

            if len(data_vect) == 0:
                # Handle category value 0, which a csr matrix does not store explicitly
                cols.append(int(0))
            else:
                cols.append(int(data_vect[0]))

            rows.append(idx)
            data.append(1.0)

        ICM_sub_class = sps.csr_matrix((data, (rows, cols)))

        ICM_asset = ICM_original[:, 1:2].tocsr()
        ICM_price = ICM_original[:, 2:3].tocsr()

        ICM_original = ICM_original.tocsc()

        ICM_all = sps.hstack((ICM_sub_class, ICM_asset, ICM_price))

        testRatings = np.array(testRatings).T

        URM_test_builder = IncrementalSparseMatrix(n_rows=n_users + 1, n_cols=n_items + 1)
        URM_test_builder.add_data_lists(testRatings[0], testRatings[1], np.ones(len(testRatings[0])))

        URM_test = URM_test_builder.get_SparseMatrix()

        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=n_users + 1, n_cols=n_items + 1)

        # Careful here: testNegatives is indexed from 0 but refers to user index 1
        # (user indices start from 1)
        n_negative_samples = 99
        for index in range(len(testNegatives)):

            user_test_items = testNegatives[index]
            if len(user_test_items) != n_negative_samples:
                print("user id: {} has {} negative items instead of {}".format(index + 1, len(user_test_items), n_negative_samples))

            URM_test_negatives_builder.add_single_row(index + 1, user_test_items, data=1.0)

        URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
        URM_test_negative.data = np.ones_like(URM_test_negative.data)

        if type == 'original':
            # Keep the original test split, only split train and validation
            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_all.copy(), verbose=False)

        else:
            # Redo the split
            URM_full = URM_all + URM_test

            URM_temp, URM_test = split_train_validation_leave_one_out_user_wise(URM_full.copy(), verbose=False)
            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_temp.copy(), verbose=False)

        self.ICM_DICT = {
            "UCM_age": UCM_age,
            "UCM_region": UCM_region,
            "UCM_all": UCM_all,
            "ICM_all": ICM_all,
            "ICM_original": ICM_original,
            "ICM_sub_class": ICM_sub_class,
            "ICM_asset": ICM_asset,
            "ICM_price": ICM_price,
        }

        self.URM_DICT = {
            "URM_train": URM_train,
            "URM_test": URM_test,
            "URM_validation": URM_validation,
            "URM_test_negative": URM_test_negative,
        }

        save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

    print("{}: Dataset loaded".format(self.DATASET_NAME))

    ut.print_stat_datareader(self)
def load_CSV_into_SparseBuilder(filePath, header=False, separator="::", timestamp=False, remove_duplicates=False,
                                custom_user_item_rating_columns=None):

    URM_all_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)
    URM_timestamp_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

    if timestamp:
        dtype = {0: str, 1: str, 2: float, 3: float}
        columns = ['userId', 'itemId', 'interaction', 'timestamp']
    else:
        dtype = {0: str, 1: str, 2: float}
        columns = ['userId', 'itemId', 'interaction']

    df_original = pd.read_csv(filepath_or_buffer=filePath, sep=separator,
                              header=0 if header else None, dtype=dtype,
                              usecols=custom_user_item_rating_columns)

    # If the original file has more columns, keep them but ignore them
    df_original.columns = columns

    user_id_list = df_original['userId'].values
    item_id_list = df_original['itemId'].values
    interaction_list = df_original['interaction'].values

    # Check if duplicates exist
    num_unique_user_item_ids = df_original.drop_duplicates(['userId', 'itemId'], keep='first', inplace=False).shape[0]
    contains_duplicates_flag = num_unique_user_item_ids != len(user_id_list)

    if contains_duplicates_flag:
        if remove_duplicates:
            # Remove duplicates.

            # This way of removing the duplicates, keeping the last timestamp without removing other
            # columns, would be the simplest, but it is so slow as to be unusable on any dataset but ML100k
            # idxs = df_original.groupby(by=['userId', 'itemId'], as_index=False)["timestamp"].idxmax()
            # df_original = df_original.loc[idxs]

            # Alternative faster way:
            # 1 - Sort in ascending order so that the last (biggest) timestamp is in the last position.
            #     Set NaN to be in the first position, to remove them if possible
            # 2 - Then remove duplicates for user-item keeping the last row, which will have the last timestamp

            if timestamp:
                sort_by = ["userId", "itemId", "timestamp"]
            else:
                sort_by = ["userId", "itemId", 'interaction']

            df_original.sort_values(by=sort_by, ascending=True, inplace=True, kind="quicksort", na_position="first")
            df_original.drop_duplicates(["userId", "itemId"], keep='last', inplace=True)

            user_id_list = df_original['userId'].values
            item_id_list = df_original['itemId'].values
            interaction_list = df_original['interaction'].values

            assert num_unique_user_item_ids == len(user_id_list), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

        else:
            # Duplicates exist but removal was not requested: fail loudly
            assert num_unique_user_item_ids == len(user_id_list), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

    URM_all_builder.add_data_lists(user_id_list, item_id_list, interaction_list)

    if timestamp:
        timestamp_list = df_original['timestamp'].values
        URM_timestamp_builder.add_data_lists(user_id_list, item_id_list, timestamp_list)

        return URM_all_builder.get_SparseMatrix(), URM_timestamp_builder.get_SparseMatrix(), \
               URM_all_builder.get_column_token_to_id_mapper(), URM_all_builder.get_row_token_to_id_mapper()

    return URM_all_builder.get_SparseMatrix(), \
           URM_all_builder.get_column_token_to_id_mapper(), URM_all_builder.get_row_token_to_id_mapper()
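# Usage sketch (hypothetical file path; note the return arity differs with the
# timestamp flag: four values with timestamps, three without):
#
#     URM_all, URM_timestamp, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
#         "ml-1m/ratings.dat", separator="::", timestamp=True, remove_duplicates=True)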
def __init__(self, pre_splitted_path, type='original'):

    assert type in ["original", "ours"]

    pre_splitted_path += "data_split/"
    pre_splitted_filename = "splitted_data_"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_{}: Attempting to load pre-splitted data".format(self.DATASET_NAME))

        for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Dataset_{}: Pre-splitted data not found, building new one".format(self.DATASET_NAME))

        from Conferences.IJCAI.CoupledCF_original import LoadMovieDataCnn as DatareaderOriginal

        path = "Conferences/IJCAI/CoupledCF_original/ml-1m/"

        n_users, gender, age, occupation = DatareaderOriginal.load_user_attributes(path=path, split=True)
        n_items, items_genres_mat = DatareaderOriginal.load_itemGenres_as_matrix(path=path)
        ratings = DatareaderOriginal.load_rating_train_as_matrix(path=path)

        testRatings = DatareaderOriginal.load_rating_file_as_list(path=path)
        testNegatives = DatareaderOriginal.load_negative_file(path=path)

        URM_all = ratings.tocsr()

        UCM_gender = gender.tocsr()
        UCM_age = age.tocsr()
        UCM_occupation = occupation.tocsr()
        UCM_all = sps.hstack((UCM_gender, UCM_age, UCM_occupation)).tocsr()

        ICM_all = sps.csr_matrix(items_genres_mat)

        testRatings = np.array(testRatings).T

        URM_test_builder = IncrementalSparseMatrix(n_rows=n_users + 1, n_cols=n_items + 1)
        URM_test_builder.add_data_lists(testRatings[0], testRatings[1], np.ones(len(testRatings[0])))

        URM_test = URM_test_builder.get_SparseMatrix()

        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=n_users + 1, n_cols=n_items + 1)

        # Careful here: testNegatives is indexed from 0 but refers to user index 1
        # (user indices start from 1)
        n_negative_samples = 99
        for index in range(len(testNegatives)):

            user_test_items = testNegatives[index]
            if len(user_test_items) != n_negative_samples:
                print("user id: {} has {} negative items instead of {}".format(index + 1, len(user_test_items), n_negative_samples))

            URM_test_negatives_builder.add_single_row(index + 1, user_test_items, data=1.0)

        URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
        URM_test_negative.data = np.ones_like(URM_test_negative.data)

        if type == 'original':
            # Keep the original test split, only split train and validation
            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_all.copy(), verbose=False)

        else:
            # Redo the split
            URM_full = URM_all + URM_test

            URM_temp, URM_test = split_train_validation_leave_one_out_user_wise(URM_full.copy(), verbose=False)
            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_temp.copy(), verbose=False)

        self.ICM_DICT = {
            "UCM_gender": UCM_gender,
            "UCM_occupation": UCM_occupation,
            "UCM_age": UCM_age,
            "UCM_all": UCM_all,
            "ICM_all": ICM_all,
        }

        self.URM_DICT = {
            "URM_train": URM_train,
            "URM_test": URM_test,
            "URM_validation": URM_validation,
            "URM_test_negative": URM_test_negative,
        }

        save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

    print("{}: Dataset loaded".format(self.DATASET_NAME))

    ut.print_stat_datareader(self)
def split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=100):

    URM_all = sps.csr_matrix(URM_all)
    URM_timestamp = sps.csr_matrix(URM_timestamp)

    n_rows, n_cols = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)
    URM_test_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)
    URM_negative_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    all_items = np.arange(0, n_cols, dtype=int)

    for user_index in range(URM_all.shape[0]):

        if user_index % 10000 == 0:
            print("split_data_on_timestamp: user {} of {}".format(user_index, URM_all.shape[0]))

        start_pos = URM_all.indptr[user_index]
        end_pos = URM_all.indptr[user_index + 1]

        user_profile = URM_all.indices[start_pos:end_pos]
        user_data = URM_all.data[start_pos:end_pos]
        user_sequence = URM_timestamp.data[start_pos:end_pos]

        unobserved_index = np.in1d(all_items, user_profile, assume_unique=True, invert=True)

        unobserved_items = all_items[unobserved_index]
        np.random.shuffle(unobserved_items)

        URM_negative_builder.add_single_row(user_index, unobserved_items[:negative_items_per_positive], 1.0)

        if len(user_profile) >= 3:

            # Test contains the most recent interaction, validation the second most recent
            most_recent_pos = np.argmax(user_sequence)

            venue_index = user_profile[most_recent_pos]
            venue_data = user_data[most_recent_pos]

            URM_test_builder.add_data_lists([user_index], [venue_index], [venue_data])

            user_profile = np.delete(user_profile, most_recent_pos)
            user_data = np.delete(user_data, most_recent_pos)
            user_sequence = np.delete(user_sequence, most_recent_pos)

            most_recent_pos = np.argmax(user_sequence)

            venue_index = user_profile[most_recent_pos]
            venue_data = user_data[most_recent_pos]

            URM_validation_builder.add_data_lists([user_index], [venue_index], [venue_data])

            user_profile = np.delete(user_profile, most_recent_pos)
            user_data = np.delete(user_data, most_recent_pos)
            # user_sequence = np.delete(user_sequence, most_recent_pos)

            URM_train_builder.add_data_lists([user_index] * len(user_profile), user_profile, user_data)

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()
    URM_test = URM_test_builder.get_SparseMatrix()
    URM_negative = URM_negative_builder.get_SparseMatrix()

    return URM_train, URM_validation, URM_test, URM_negative
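# Usage sketch for the timestamp split above (illustrative only; _demo_* is a
# hypothetical name). URM_timestamp must share the sparsity pattern of URM_all,
# with timestamps as its data: the most recent interaction of each user with at
# least 3 interactions goes to test, the second most recent to validation.
def _demo_split_on_timestamp():
    import numpy as np
    import scipy.sparse as sps

    URM_all = sps.random(50, 100, density=0.2, format="csr", random_state=42)

    URM_timestamp = URM_all.copy()
    URM_timestamp.data = np.arange(URM_timestamp.nnz, dtype=np.float64)  # fake increasing timestamps

    URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=99)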
def __init__(self):

    super(PinterestICCVReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/PinterestICCV/WWW/NeuMF_our_interface/"
    pre_splitted_filename = "splitted_data"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_Pinterest: Attempting to load pre-splitted data")

        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Dataset_Pinterest: Pre-splitted data not found, building new one")

        # Ensure file is loaded as matrix
        Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

        dataset = Dataset_github("Conferences/WWW/NeuMF_github/Data/pinterest-20")

        self.URM_train_original, self.URM_test = dataset.trainMatrix, dataset.testRatings

        self.URM_train_original = self.URM_train_original.tocsr()
        self.URM_test = self.URM_test.tocsr()

        from Base.Recommender_utils import reshapeSparse

        shape = (max(self.URM_train_original.shape[0], self.URM_test.shape[0]),
                 max(self.URM_train_original.shape[1], self.URM_test.shape[1]))

        self.URM_train_original = reshapeSparse(self.URM_train_original, shape)
        self.URM_test = reshapeSparse(self.URM_test, shape)

        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=shape[0], n_cols=shape[1])

        for user_index in range(len(dataset.testNegatives)):
            user_test_items = dataset.testNegatives[user_index]
            URM_test_negatives_builder.add_single_row(user_index, user_test_items, data=1.0)

        self.URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()

        self.URM_train, self.URM_validation = split_train_validation_leave_one_out_user_wise(self.URM_train_original.copy())

        data_dict = {
            "URM_train_original": self.URM_train_original,
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_test_negative": self.URM_test_negative,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Dataset_Pinterest: Dataset loaded")

    print("N_items {}, n_users {}".format(self.URM_train.shape[1], self.URM_train.shape[0]))
def __init__(self, pre_splitted_path, type="original"): pre_splitted_path += "data_split/" pre_splitted_filename = "splitted_data_" # If directory does not exist, create if not os.path.exists(pre_splitted_path): os.makedirs(pre_splitted_path) try: print("Dataset_{}: Attempting to load pre-splitted data".format( self.DATASET_NAME)) for attrib_name, attrib_object in load_data_dict_zip( pre_splitted_path, pre_splitted_filename).items(): self.__setattr__(attrib_name, attrib_object) except FileNotFoundError: print("Dataset_{}: Pre-splitted data not found, building new one". format(self.DATASET_NAME)) if type == "original": # Ensure file is loaded as matrix Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix dataset = Dataset_github( "Conferences/IJCAI/DELF_original/Data/ml-1m") URM_train, URM_validation, URM_test, testNegatives = dataset.trainMatrix, dataset.validRatings, \ dataset.testRatings, dataset.testNegatives URM_train = URM_train.tocsr() URM_validation = URM_validation.tocsr() URM_test = URM_test.tocsr() URM_timestamp = "no" from Base.Recommender_utils import reshapeSparse shape = (max(URM_train.shape[0], URM_validation.shape[0], URM_test.shape[0]), max(URM_train.shape[1], URM_validation.shape[1], URM_test.shape[1])) URM_train = reshapeSparse(URM_train, shape) URM_validation = reshapeSparse(URM_validation, shape) URM_test = reshapeSparse(URM_test, shape) URM_test_negatives_builder = IncrementalSparseMatrix( n_rows=shape[0], n_cols=shape[1]) for user_index in range(len(dataset.testNegatives)): user_test_items = dataset.testNegatives[user_index] URM_test_negatives_builder.add_single_row(user_index, user_test_items, data=1.0) URM_test_negative = URM_test_negatives_builder.get_SparseMatrix( ) elif type == "ours": # create from full dataset with leave out one time wise from ORIGINAL full dateset data_reader = Movielens1MReader_DataManager() loaded_dataset = data_reader.load_data() URM_all = loaded_dataset.get_URM_from_name("URM_all") URM_timestamp = loaded_dataset.get_URM_from_name( "URM_timestamp") # make rating implicit URM_all.data = np.ones_like(URM_all.data) URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp( URM_all, URM_timestamp, negative_items_per_positive=99) else: assert False self.URM_DICT = { "URM_train": URM_train, "URM_test": URM_test, "URM_validation": URM_validation, "URM_test_negative": URM_test_negative, "URM_timestamp": URM_timestamp, } save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename) print("{}: Dataset loaded".format(self.DATASET_NAME)) print_stat_datareader(self)
def __init__(self, pre_splitted_path):

    super(Movielens1MReader, self).__init__()

    pre_splitted_path += "data_split/"
    pre_splitted_filename = "splitted_data_"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_Movielens1M: Attempting to load pre-splitted data")

        for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Dataset_Movielens1M: Pre-splitted data not found, building new one")

        # Ensure file is loaded as matrix
        Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

        dataset = Dataset_github("Conferences/WWW/NeuMF_github/Data/ml-1m")

        URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

        URM_train_original = URM_train_original.tocsr()
        URM_test = URM_test.tocsr()

        from Base.Recommender_utils import reshapeSparse

        shape = (max(URM_train_original.shape[0], URM_test.shape[0]),
                 max(URM_train_original.shape[1], URM_test.shape[1]))

        URM_train_original = reshapeSparse(URM_train_original, shape)
        URM_test = reshapeSparse(URM_test, shape)

        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=shape[0], n_cols=shape[1])

        for user_index in range(len(dataset.testNegatives)):
            user_test_items = dataset.testNegatives[user_index]
            URM_test_negatives_builder.add_single_row(user_index, user_test_items, data=1.0)

        URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()

        URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train_original.copy())

        self.URM_DICT = {
            "URM_train_original": URM_train_original,
            "URM_train": URM_train,
            "URM_test": URM_test,
            "URM_test_negative": URM_test_negative,
            "URM_validation": URM_validation,
        }

        save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

    print("Dataset_Movielens1M: Dataset loaded")
def _loadURM(self):

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    numCells = 0
    URM_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

    for current_split in [1, 2, 3, 4]:

        current_split_path = self.dataFile.extract("combined_data_{}.txt".format(current_split),
                                                   path=self.decompressed_zip_file_folder + "decompressed/")

        fileHandle = open(current_split_path, "r")

        print("NetflixPrizeReader: loading split {}".format(current_split))

        currentMovie_id = None

        for line in fileHandle:

            if numCells % 1000000 == 0 and numCells != 0:
                print("Processed {} cells".format(numCells))

            if len(line) > 1:
                line_split = line.split(",")

                # If line has 3 components, it is a 'user_id,rating,date' row
                if len(line_split) == 3 and currentMovie_id != None:

                    user_id = line_split[0]

                    URM_builder.add_data_lists([user_id], [currentMovie_id], [float(line_split[1])])
                    numCells += 1

                # If line has 1 component, it MIGHT be a 'item_id:' row
                elif len(line_split) == 1:
                    line_split = line.split(":")

                    # Confirm it is a 'item_id:' row
                    if len(line_split) == 2:
                        currentMovie_id = line_split[0]
                    else:
                        print("Unexpected row: '{}'".format(line))

                else:
                    print("Unexpected row: '{}'".format(line))

        fileHandle.close()

    print("NetflixPrizeReader: cleaning temporary files")

    shutil.rmtree(self.decompressed_zip_file_folder + "decompressed/", ignore_errors=True)

    return URM_builder.get_SparseMatrix(), URM_builder.get_column_token_to_id_mapper(), URM_builder.get_row_token_to_id_mapper()
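# The parser above expects the Netflix Prize "combined_data_*.txt" layout, where an
# "item_id:" line opens a movie block and each following "user_id,rating,date" line
# is one interaction for that movie, e.g. (illustrative values):
#
#     1:
#     1488844,3,2005-09-06
#     822109,5,2005-05-13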
def __init__(self):
    super(Movielens100KReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/Movielens100K/KDD/MCRec_our_interface/"
    pre_splitted_filename = "splitted_data"

    original_data_path = "Conferences/KDD/MCRec_github/data/"

    # If directory does not exist, create it
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Movielens100KReader: Attempting to load pre-splitted data")

        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:

        print("Movielens100KReader: Pre-splitted data not found, building new one")

        print("Movielens100KReader: loading URM")

        from Conferences.KDD.MCRec_github.code.Dataset import Dataset

        dataset = 'ml-100k'
        dataset = Dataset(original_data_path + dataset)

        URM_train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

        # Dataset adds 1 to user and item ids, remove it to restore 0-based indexing
        URM_train = sps.coo_matrix(URM_train)
        URM_train.row -= 1
        URM_train.col -= 1

        self.URM_train = sps.csr_matrix((np.ones_like(URM_train.data), (URM_train.row, URM_train.col)))

        num_users, num_items = self.URM_train.shape

        # Build sparse matrices from the test lists
        URM_test_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
        URM_test_negative_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

        for user_index in range(len(testRatings)):

            user_id = testRatings[user_index][0]
            current_user_test_items = testRatings[user_index][1:]
            current_user_test_negative_items = testNegatives[user_index]

            current_user_test_items = np.array(current_user_test_items) - 1
            current_user_test_negative_items = np.array(current_user_test_negative_items) - 1

            URM_test_builder.add_single_row(user_id - 1, current_user_test_items, 1.0)
            URM_test_negative_builder.add_single_row(user_id - 1, current_user_test_negative_items, 1.0)

        # The test data apparently contains repeated entries
        self.URM_test = URM_test_builder.get_SparseMatrix()
        self.URM_test_negative = URM_test_negative_builder.get_SparseMatrix()

        # Split 10% of the train data as validation, user-wise
        from Data_manager.split_functions.split_train_validation import split_train_validation_percentage_user_wise

        self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(self.URM_train,
                                                                                          train_percentage=0.9)

        # Load the item features (genres)
        data_reader = Movielens100KReader_DataManager()
        data_reader.load_data()

        zipFile_path = data_reader.DATASET_SPLIT_ROOT_FOLDER + data_reader.DATASET_SUBFOLDER

        dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

        ICM_path = dataFile.extract("ml-100k/u.item", path=zipFile_path + "decompressed/")

        ICM_genre = self._loadICM(ICM_path)
        ICM_genre = ICM_genre.get_SparseMatrix()

        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        self.ICM_dict = {"ICM_genre": ICM_genre}

        data_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
            "URM_test_negative": self.URM_test_negative,
            "ICM_dict": self.ICM_dict,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Movielens100KReader: loading complete")
def split_train_leave_k_out_user_wise(URM, k_out=1, use_validation_set=True, leave_random_out=True):
    """
    The function splits an URM in two (or three, if a validation set is used) matrices,
    selecting the k_out interactions one user at a time
    :param URM:
    :param k_out:
    :param use_validation_set:
    :param leave_random_out:
    :return:
    """

    assert k_out > 0, "k_out must be a value greater than 0, provided was '{}'".format(k_out)

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    URM = sps.csr_matrix(URM)
    n_users, n_items = URM.shape

    URM_train_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                auto_create_col_mapper=False, n_cols=n_items)

    URM_test_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                               auto_create_col_mapper=False, n_cols=n_items)

    if use_validation_set:
        URM_validation_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                         auto_create_col_mapper=False, n_cols=n_items)

    for user_id in range(n_users):

        start_user_position = URM.indptr[user_id]
        end_user_position = URM.indptr[user_id + 1]

        user_profile = URM.indices[start_user_position:end_user_position]

        if leave_random_out:
            # Sample the held-out interactions at random
            indices_to_shuffle = np.arange(len(user_profile), dtype=int)
            np.random.shuffle(indices_to_shuffle)

            user_interaction_items = user_profile[indices_to_shuffle]
            user_interaction_data = URM.data[start_user_position:end_user_position][indices_to_shuffle]

        else:
            # The held-out interactions are taken from the start of the arrays, so sort
            # by descending data value (e.g., timestamp) to put the latest interactions first
            interaction_position = URM.data[start_user_position:end_user_position]
            sort_interaction_index = np.argsort(-interaction_position)

            user_interaction_items = user_profile[sort_interaction_index]
            user_interaction_data = URM.data[start_user_position:end_user_position][sort_interaction_index]

        # Test interactions
        user_interaction_items_test = user_interaction_items[0:k_out]
        user_interaction_data_test = user_interaction_data[0:k_out]

        URM_test_builder.add_data_lists([user_id] * len(user_interaction_items_test),
                                        user_interaction_items_test,
                                        user_interaction_data_test)

        # Validation interactions
        if use_validation_set:
            user_interaction_items_validation = user_interaction_items[k_out:k_out * 2]
            user_interaction_data_validation = user_interaction_data[k_out:k_out * 2]

            # Use the actual number of validation interactions, which may be lower
            # than k_out for users with short profiles
            URM_validation_builder.add_data_lists([user_id] * len(user_interaction_items_validation),
                                                  user_interaction_items_validation,
                                                  user_interaction_data_validation)

        # Train interactions
        user_interaction_items_train = user_interaction_items[k_out * 2:]
        user_interaction_data_train = user_interaction_data[k_out * 2:]

        URM_train_builder.add_data_lists([user_id] * len(user_interaction_items_train),
                                         user_interaction_items_train,
                                         user_interaction_data_train)

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_test = URM_test_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no Train items".format(
            user_no_item_train, user_no_item_train / n_users * 100, n_users))

    if use_validation_set:
        URM_validation = URM_validation_builder.get_SparseMatrix()

        URM_validation = sps.csr_matrix(URM_validation)
        user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

        if user_no_item_validation != 0:
            print("Warning: {} ({:.2f} %) of {} users have no Validation items".format(
                user_no_item_validation, user_no_item_validation / n_users * 100, n_users))

        return URM_train, URM_validation, URM_test

    return URM_train, URM_test
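
# A minimal usage sketch for the splitter above, assuming this module's
# split_train_leave_k_out_user_wise is importable and using a small random implicit
# URM built on the spot. The nnz check at the end holds because every interaction
# lands in exactly one of the three splits.
import numpy as np
import scipy.sparse as sps

# Build a small random implicit URM: 100 users, 50 items, ~10% density
np.random.seed(42)
URM_all = sps.random(100, 50, density=0.1, format="csr")
URM_all.data = np.ones_like(URM_all.data)

# Leave one random interaction per user out for test and one for validation
URM_train, URM_validation, URM_test = split_train_leave_k_out_user_wise(
    URM_all, k_out=1, use_validation_set=True, leave_random_out=True)

# Interaction counts must add up to the original matrix
assert URM_train.nnz + URM_validation.nnz + URM_test.nnz == URM_all.nnz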