def _loadICM_tags(tags_path, header=True, separator=',', if_new_item="ignore",
                  item_original_ID_to_index=None, preinitialized_col_mapper=None):
    # Tags
    from Data_manager.TagPreprocessing import tagFilterAndStemming
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    ICM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=preinitialized_col_mapper,
                                                    on_new_col="add",
                                                    preinitialized_row_mapper=item_original_ID_to_index,
                                                    on_new_row=if_new_item)

    fileHandle = open(tags_path, "r", encoding="latin1")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 100000 == 0:
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)
            line[-1] = line[-1].replace("\n", "")

            # Expected columns: user_id, movie_id, tag, [timestamp]
            movie_id = line[1]
            tagList = line[2]

            # Remove non-alphabetical characters and split on spaces
            tagList = tagFilterAndStemming(tagList)

            # Rows: movie ID
            # Cols: features (tags)
            ICM_builder.add_single_row(movie_id, tagList, data=1.0)

    fileHandle.close()

    return ICM_builder.get_SparseMatrix(), \
           ICM_builder.get_column_token_to_id_mapper(), \
           ICM_builder.get_row_token_to_id_mapper()
def _loadURM_preinitialized_item_id(filePath, header=False, separator="::", if_new_user="add",
                                    if_new_item="ignore", item_original_ID_to_index=None,
                                    user_original_ID_to_index=None):
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    URM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=item_original_ID_to_index,
                                                    on_new_col=if_new_item,
                                                    preinitialized_row_mapper=user_original_ID_to_index,
                                                    on_new_row=if_new_user)

    fileHandle = open(filePath, "r")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 1000000 == 0:
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)
            line[-1] = line[-1].replace("\n", "")

            user_id = line[0]
            item_id = line[1]

            try:
                value = float(line[2])

                if value != 0.0:
                    URM_builder.add_data_lists([user_id], [item_id], [value])

            except ValueError:
                # Skip rows whose rating cannot be parsed as a number
                pass

    fileHandle.close()

    return URM_builder.get_SparseMatrix(), \
           URM_builder.get_column_token_to_id_mapper(), \
           URM_builder.get_row_token_to_id_mapper()
def _loadURM_preinitialized_item_id(URM_path, header=False, separator=",", if_new_user="add",
                                    if_new_item="add", item_original_ID_to_index=None,
                                    user_original_ID_to_index=None):
    import pandas as pd
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    URM_all_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=item_original_ID_to_index,
                                                        on_new_col=if_new_item,
                                                        preinitialized_row_mapper=user_original_ID_to_index,
                                                        on_new_row=if_new_user)

    if header:
        df_original = pd.read_csv(filepath_or_buffer=URM_path, sep=separator, header=0,
                                  usecols=['user', 'item', 'rating'],
                                  dtype={'user': str, 'item': str, 'rating': float})
    else:
        df_original = pd.read_csv(filepath_or_buffer=URM_path, sep=separator, header=None,
                                  dtype={0: str, 1: str, 2: float})
        df_original.columns = ['user', 'item', 'rating']

    # Remove data with non-valid rating
    # df_original.drop(df_original[df_original.rating == 0.0].index, inplace=True)

    user_id_list = df_original['user'].values
    item_id_list = df_original['item'].values
    rating_list = df_original['rating'].values

    URM_all_builder.add_data_lists(user_id_list, item_id_list, rating_list)

    return URM_all_builder.get_SparseMatrix(), \
           URM_all_builder.get_column_token_to_id_mapper(), \
           URM_all_builder.get_row_token_to_id_mapper()
def _loadICM_genres(genres_path, header=True, separator=',', genresSeparator="|"):
    # Genres
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    ICM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=None, on_new_col="add",
                                                    preinitialized_row_mapper=None, on_new_row="add")

    fileHandle = open(genres_path, "r", encoding="latin1")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 1000000 == 0:
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)
            line[-1] = line[-1].replace("\n", "")

            movie_id = line[0]
            title = line[1]

            # In case the title contains commas, it is enclosed in "...",
            # so the genre list will always be the last element
            genreList = line[-1]
            genreList = genreList.split(genresSeparator)

            # Rows: movie ID
            # Cols: features (genres)
            ICM_builder.add_single_row(movie_id, genreList, data=1.0)

    fileHandle.close()

    return ICM_builder.get_SparseMatrix(), \
           ICM_builder.get_column_token_to_id_mapper(), \
           ICM_builder.get_row_token_to_id_mapper()
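# Usage sketch for _loadICM_genres (illustrative only): the file name and the
# example row below are hypothetical, following the MovieLens-style layout the
# parser assumes, i.e. movie_id,title,genre1|genre2|...
#
#   "1,Toy Story (1995),Adventure|Animation|Children"
#
#   ICM_genres, feature_mapper, item_mapper = _loadICM_genres(
#       "movies.csv", header=True, separator=',', genresSeparator="|")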
def _loadURM(self, file_name, header=False, separator=" ",
             item_original_ID_to_index=None, user_original_ID_to_index=None):
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    URM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=item_original_ID_to_index,
                                                    on_new_col="add",
                                                    preinitialized_row_mapper=user_original_ID_to_index,
                                                    on_new_row="add")

    fileHandle = open(file_name, "r")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        if numCells % 100000 == 0 and numCells != 0:
            print("Processed {} cells".format(numCells))

        line = line.split(separator)

        if len(line) > 1:
            # Each row starts with a label: lines labelled '-1' are skipped,
            # lines labelled '1' are added as positive interactions.
            # User and item are "index:value" fields.
            if line[0] == '-1':
                numCells += 1
                continue

            elif line[0] == '1':
                user = int(line[1].split(':')[0])
                item = int(line[2].split(':')[0])
                value = 1.0

            else:
                print('ERROR READING DATASET')
                break

            numCells += 1
            URM_builder.add_data_lists([user], [item], [value])

    fileHandle.close()

    return URM_builder.get_SparseMatrix(), \
           URM_builder.get_column_token_to_id_mapper(), \
           URM_builder.get_row_token_to_id_mapper()
def _loadUCM(UCM_path, header=True, separator=','):
    # User features (gender, age group, occupation, zip code)
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    UCM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=None, on_new_col="add",
                                                    preinitialized_row_mapper=None, on_new_row="add")

    fileHandle = open(UCM_path, "r", encoding="latin1")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 1000000 == 0:
            print("Processed {} rows".format(numCells))

        if len(line) > 1:
            line = line.split(separator)
            line[-1] = line[-1].replace("\n", "")

            # Expected columns: user_id, gender, age_group, occupation, zip_code
            user_id = line[0]

            token_list = []
            token_list.append("gender_" + str(line[1]))
            token_list.append("age_group_" + str(line[2]))
            token_list.append("occupation_" + str(line[3]))
            token_list.append("zip_code_" + str(line[4]))

            # Rows: user ID
            # Cols: features
            UCM_builder.add_single_row(user_id, token_list, data=1.0)

    fileHandle.close()

    return UCM_builder.get_SparseMatrix(), \
           UCM_builder.get_column_token_to_id_mapper(), \
           UCM_builder.get_row_token_to_id_mapper()
def _loadReviews(self, file_path, if_new_item="add"):
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs
    from Data_manager.TagPreprocessing import tagFilterAndStemming

    ICM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=None, on_new_col="add",
                                                    preinitialized_row_mapper=self.item_original_ID_to_index,
                                                    on_new_row=if_new_item)

    parser_reviews = parse_json(file_path)
    numReviewParsed = 0

    for newReview in parser_reviews:
        numReviewParsed += 1
        if numReviewParsed % 20000 == 0:
            print("Processed {} reviews".format(numReviewParsed))

        user_ID = newReview["reviewerID"]
        item_ID = newReview["asin"]

        reviewText = newReview["reviewText"]
        reviewSummary = newReview["summary"]
        tagList = ' '.join([reviewText, reviewSummary])

        # Remove non-alphabetical characters and split on spaces
        tagList = tagFilterAndStemming(tagList)

        ICM_builder.add_single_row(item_ID, tagList, data=1.0)

    return ICM_builder.get_SparseMatrix(), \
           ICM_builder.get_column_token_to_id_mapper(), \
           ICM_builder.get_row_token_to_id_mapper()
def load_CSV_into_SparseBuilder(filePath, header=False, separator="::",
                                timestamp=False, remove_duplicates=False,
                                custom_user_item_rating_columns=None,
                                create_mapper=True,
                                preinitialized_row_mapper=None,
                                preinitialized_col_mapper=None,
                                on_new_col="add", on_new_row="add"):
    """
    The function loads a CSV file into a URM

    :param filePath:
    :param header:          True/False depending on whether the file has a header
    :param separator:
    :param timestamp:       True/False load the timestamp as well
    :param remove_duplicates:   Remove duplicate (user, item) pairs. If the timestamp is provided it keeps the most
                                recent one, otherwise the highest rating or interaction value.
    :param custom_user_item_rating_columns:     Column names for the user_id, item_id and rating value as in the file header
    :param create_mapper:   True to map the IDs into new integer values, False to use the original values
    :param preinitialized_row_mapper:   Dictionary {originalID: matrix index} to translate row IDs into row indices (e.g., userID into user index)
    :param preinitialized_col_mapper:   Dictionary {originalID: matrix index} to translate column IDs into column indices (e.g., itemID into item index)
    :return:
    """
    import pandas as pd
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix, IncrementalSparseMatrix_FilterIDs

    if preinitialized_row_mapper is not None or preinitialized_col_mapper is not None:
        URM_all_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=preinitialized_col_mapper,
                                                            preinitialized_row_mapper=preinitialized_row_mapper,
                                                            on_new_col=on_new_col,
                                                            on_new_row=on_new_row)

        URM_timestamp_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=preinitialized_col_mapper,
                                                                  preinitialized_row_mapper=preinitialized_row_mapper,
                                                                  on_new_col=on_new_col,
                                                                  on_new_row=on_new_row)

    else:
        URM_all_builder = IncrementalSparseMatrix(auto_create_col_mapper=create_mapper,
                                                  auto_create_row_mapper=create_mapper)

        URM_timestamp_builder = IncrementalSparseMatrix(auto_create_col_mapper=create_mapper,
                                                        auto_create_row_mapper=create_mapper)

    if timestamp:
        dtype = {0: str, 1: str, 2: float, 3: float}
        columns = ["userId", "itemId", "interaction", "timestamp"]

    else:
        dtype = {0: str, 1: str, 2: float}
        columns = ["userId", "itemId", "interaction"]

    df_original = pd.read_csv(filepath_or_buffer=filePath, sep=separator,
                              header=0 if header else None,
                              dtype=dtype,
                              usecols=custom_user_item_rating_columns)

    # If the original file has more columns, keep them but ignore them
    df_original.columns = columns

    user_id_list = df_original["userId"].values
    item_id_list = df_original["itemId"].values
    interaction_list = df_original["interaction"].values

    # Check if duplicates exist
    num_unique_user_item_ids = df_original.drop_duplicates(["userId", "itemId"],
                                                           keep="first", inplace=False).shape[0]
    contains_duplicates_flag = num_unique_user_item_ids != len(user_id_list)

    if contains_duplicates_flag:
        if remove_duplicates:
            # Remove duplicates.
            # This way of removing the duplicates, keeping the last timestamp without removing other columns,
            # would be the simplest, but it is so slow as to be unusable on any dataset but ML100k:
            # idxs = df_original.groupby(by=['userId', 'itemId'], as_index=False)["timestamp"].idxmax()
            # df_original = df_original.loc[idxs]

            # Alternative faster way:
            # 1 - Sort in ascending order so that the last (biggest) timestamp is in the last position.
            #     Set NaN to be in the first position, to remove them if possible.
            # 2 - Then remove duplicates for user-item, keeping the last row, which will have the last timestamp.
            if timestamp:
                sort_by = ["userId", "itemId", "timestamp"]
            else:
                sort_by = ["userId", "itemId", "interaction"]

            df_original.sort_values(by=sort_by, ascending=True, inplace=True,
                                    kind="quicksort", na_position="first")

            df_original.drop_duplicates(["userId", "itemId"], keep="last", inplace=True)

            user_id_list = df_original["userId"].values
            item_id_list = df_original["itemId"].values
            interaction_list = df_original["interaction"].values

            assert num_unique_user_item_ids == len(user_id_list), \
                "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

        else:
            assert num_unique_user_item_ids == len(user_id_list), \
                "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

    URM_all_builder.add_data_lists(user_id_list, item_id_list, interaction_list)

    if timestamp:
        timestamp_list = df_original["timestamp"].values
        URM_timestamp_builder.add_data_lists(user_id_list, item_id_list, timestamp_list)

        return (URM_all_builder.get_SparseMatrix(),
                URM_timestamp_builder.get_SparseMatrix(),
                URM_all_builder.get_column_token_to_id_mapper(),
                URM_all_builder.get_row_token_to_id_mapper())

    return (URM_all_builder.get_SparseMatrix(),
            URM_all_builder.get_column_token_to_id_mapper(),
            URM_all_builder.get_row_token_to_id_mapper())
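# Usage sketch for load_CSV_into_SparseBuilder (illustrative only): the file
# name "ratings.dat" and the MovieLens-style "::" separator are assumptions,
# not part of this module.
#
#   URM_all, URM_timestamp, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
#       "ratings.dat", separator="::", timestamp=True, remove_duplicates=True)
#
# With timestamp=False the function returns three values instead of four:
#
#   URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
#       "ratings.csv", header=True, separator=",")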
def generate_Dataset(self, dataset_name, is_implicit):

    assert not self.__Dataset_finalized, \
        "Dataset mappers have already been generated, adding new data is forbidden"

    self.__Dataset_finalized = True

    # Generate ID to index mappers
    self._generate_global_mappers()
    self._generate_ICM_UCM_mappers()

    URM_DICT_sparse = {}
    ICM_DICT_sparse = {}
    UCM_DICT_sparse = {}

    on_new_ID = "ignore"

    for URM_name, URM_dataframe in self.URM_DICT.items():
        URM_sparse_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=self.item_original_ID_to_index,
                                                               preinitialized_row_mapper=self.user_original_ID_to_index,
                                                               on_new_col=on_new_ID,
                                                               on_new_row=on_new_ID)

        URM_sparse_builder.add_data_lists(URM_dataframe["UserID"].values,
                                          URM_dataframe["ItemID"].values,
                                          URM_dataframe["Data"].values)

        URM_DICT_sparse[URM_name] = URM_sparse_builder.get_SparseMatrix()

    for ICM_name, ICM_dataframe in self.ICM_DICT.items():
        feature_ID_to_index = self.ICM_mapper_DICT[ICM_name]

        ICM_sparse_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=feature_ID_to_index,
                                                               preinitialized_row_mapper=self.item_original_ID_to_index,
                                                               on_new_col=on_new_ID,
                                                               on_new_row=on_new_ID)

        ICM_sparse_builder.add_data_lists(ICM_dataframe["ItemID"].values,
                                          ICM_dataframe["FeatureID"].values,
                                          ICM_dataframe["Data"].values)

        ICM_DICT_sparse[ICM_name] = ICM_sparse_builder.get_SparseMatrix()

    for UCM_name, UCM_dataframe in self.UCM_DICT.items():
        feature_ID_to_index = self.UCM_mapper_DICT[UCM_name]

        UCM_sparse_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=feature_ID_to_index,
                                                               preinitialized_row_mapper=self.user_original_ID_to_index,
                                                               on_new_col=on_new_ID,
                                                               on_new_row=on_new_ID)

        UCM_sparse_builder.add_data_lists(UCM_dataframe["UserID"].values,
                                          UCM_dataframe["FeatureID"].values,
                                          UCM_dataframe["Data"].values)

        UCM_DICT_sparse[UCM_name] = UCM_sparse_builder.get_SparseMatrix()

    loaded_dataset = Dataset(dataset_name=dataset_name,
                             URM_dictionary=URM_DICT_sparse,
                             ICM_dictionary=ICM_DICT_sparse,
                             ICM_feature_mapper_dictionary=self.ICM_mapper_DICT,
                             UCM_dictionary=UCM_DICT_sparse,
                             UCM_feature_mapper_dictionary=self.UCM_mapper_DICT,
                             user_original_ID_to_index=self.user_original_ID_to_index,
                             item_original_ID_to_index=self.item_original_ID_to_index,
                             is_implicit=is_implicit)

    return loaded_dataset
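# Usage sketch for generate_Dataset (illustrative only): assumes the builder's
# URM_DICT / ICM_DICT / UCM_DICT dataframes use the column names read above
# ("UserID", "ItemID", "FeatureID", "Data"); the builder variable and dataset
# name are hypothetical examples.
#
#   loaded_dataset = dataset_builder.generate_Dataset("Movielens1M", is_implicit=False)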
def _loadMetadata(self, file_path, if_new_item="ignore"):
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs
    from Data_manager.TagPreprocessing import tagFilterAndStemming
    import itertools

    ICM_builder = IncrementalSparseMatrix_FilterIDs(preinitialized_col_mapper=None, on_new_col="add",
                                                    preinitialized_row_mapper=self.item_original_ID_to_index,
                                                    on_new_row=if_new_item)

    parser_metadata = parse_json(file_path)
    numMetadataParsed = 0

    for newMetadata in parser_metadata:
        numMetadataParsed += 1
        if numMetadataParsed % 20000 == 0:
            print("Processed {}".format(numMetadataParsed))

        item_ID = newMetadata["asin"]

        # The file might contain other fields; restrict to the items in the URM
        tokenList = []

        # item_price = newMetadata["price"]

        if "title" in newMetadata:
            item_name = newMetadata["title"]
            tokenList.append(item_name)

        # Sometimes the brand is not present
        if "brand" in newMetadata:
            item_brand = newMetadata["brand"]
            tokenList.append(item_brand)

        # Categories are a list of lists. Unclear whether only the first element contains data or not
        if "categories" in newMetadata:
            item_categories = newMetadata["categories"]
            item_categories = list(itertools.chain.from_iterable(item_categories))
            tokenList.extend(item_categories)

        if "description" in newMetadata:
            item_description = newMetadata["description"]
            tokenList.append(item_description)

        tokenList = ' '.join(tokenList)

        # Remove non-alphabetical characters and split on spaces
        tokenList = tagFilterAndStemming(tokenList)

        # Remove duplicates
        tokenList = list(set(tokenList))

        ICM_builder.add_single_row(item_ID, tokenList, data=1.0)

    return ICM_builder.get_SparseMatrix(), \
           ICM_builder.get_column_token_to_id_mapper(), \
           ICM_builder.get_row_token_to_id_mapper()