def load_dataset(self):
    """Load the ratings, rebuild the train/test containers and fill them.

    Steps, in order:
      1. Ask the private rate loader for a (train, test) pair of frames,
         passing ``self.rates_file_dir`` and ``self.split_ratio``.
      2. Refresh ``self.user_ids`` / ``self.item_ids`` from the unique
         'User ID' / 'Item ID' values of ``self.data``.
         NOTE(review): ``self.data`` is not assigned in this method;
         presumably it is populated elsewhere (e.g. by the loader) - confirm.
      3. Re-create ``self.train_dataset`` / ``self.test_dataset`` for the
         configured ``self.type``.
      4. Feed each container its frame through ``load_dataset``.
    """
    print('loading ratings from path: ' + self.rates_file_dir)
    train_data_frame, test_data_frame = self.__load_rate(
        self.rates_file_dir, self.split_ratio)

    ratings = self.data
    self.user_ids = ratings['User ID'].drop_duplicates().tolist()
    self.item_ids = ratings['Item ID'].drop_duplicates().tolist()

    # Choose the container class and the two trailing constructor
    # arguments: the ndarray flavour receives counts, the others receive
    # the ID lists themselves.
    kind = self.type
    if kind == DataType.ndarray:
        container = RatingNDArray
        extra = (len(self.user_ids), len(self.item_ids))
    elif kind == DataType.dataframe:
        container = RatingDataFrame
        extra = (self.user_ids, self.item_ids)
    elif kind == DataType.dictionary:
        container = RatingDictionary
        extra = (self.user_ids, self.item_ids)
    else:
        # Unrecognised type: keep whatever containers already exist.
        container = None

    if container is not None:
        prefix = self.name + '_' + kind.name
        self.train_dataset = container(prefix + '_trainset', *extra)
        self.test_dataset = container(prefix + '_testset', *extra)

    self.train_dataset.load_dataset(train_data_frame)
    self.test_dataset.load_dataset(test_data_frame)
    print('Done.')
def __init__(self,
             name='MovieLensDataset100K',
             type=DataType.ndarray,
             split_ratio=0.8,
             dataset_path='../data/MovieLens/ml-100k'):
    """Set up paths, user/item metadata, lookup tables and empty containers.

    Args:
        name: Prefix used when naming the train/test containers.
        type: ``DataType`` member selecting the container implementation.
        split_ratio: Stored on the instance as ``self.split_ratio``; it is
            not used inside this constructor.
        dataset_path: Directory holding 'u.user', 'u.item' and 'u.data'.
    """
    self.name = name
    self.type = type
    self.split_ratio = split_ratio

    self.users_info_path = os.path.join(dataset_path, 'u.user')
    self.items_info_path = os.path.join(dataset_path, 'u.item')
    self.rates_file_path = os.path.join(dataset_path, 'u.data')

    self.users = self.__load_user_info(self.users_info_path)
    self.items = self.__load_item_info(self.items_info_path)

    # Map every occupation to an integer index, following the key order
    # of the groupby's ``groups`` mapping.
    occupation_keys = self.users.groupby(by='Occupation').groups.keys()
    self.occupation2id = {
        label: index for index, label in enumerate(occupation_keys)
    }
    self.gender2id = {'M': 0, 'F': 1}

    self.user_ids = self.users['ID'].drop_duplicates().tolist()
    self.item_ids = self.items['ID'].drop_duplicates().tolist()
    self.user_num = len(self.user_ids)
    self.item_num = len(self.item_ids)
    self.user_occupation_num = len(self.occupation2id)

    # Re-index the metadata frames by ID only after the ID lists have
    # been read from the 'ID' column.
    self.users = self.users.set_index('ID').sort_index()
    self.items = self.items.set_index('ID').sort_index()

    # Choose the container class and its trailing constructor arguments:
    # the ndarray flavour receives counts, the others receive the ID lists.
    kind = self.type
    if kind == DataType.ndarray:
        container = RatingNDArray
        extra = (len(self.user_ids), len(self.item_ids))
    elif kind == DataType.dataframe:
        container = RatingDataFrame
        extra = (self.user_ids, self.item_ids)
    elif kind == DataType.dictionary:
        container = RatingDictionary
        extra = (self.user_ids, self.item_ids)
    else:
        # Unrecognised type: no containers are created.
        container = None

    if container is not None:
        prefix = self.name + '_' + kind.name
        self.train_dataset = container(prefix + '_trainset', *extra)
        self.test_dataset = container(prefix + '_testset', *extra)
def __init__(self,
             name='BookCrossingDataset',
             type=DataType.dictionary,
             split_ratio=0.8,
             dataset_path='G:/dataset/RS/Book-crossing/BX-CSV-Dump'):
    """Set up paths, metadata, the raw ratings frame and empty containers.

    Args:
        name: Prefix used when naming the train/test containers.
        type: ``DataType`` member selecting the container implementation.
        split_ratio: Stored on the instance as ``self.split_ratio``; it is
            not used inside this constructor.
        dataset_path: Directory holding 'BX-Users.csv', 'BX-Books.csv' and
            'BX-Book-Ratings.csv'.
    """
    self.name = name
    self.type = type
    self.split_ratio = split_ratio

    self.users_info_path = os.path.join(dataset_path, 'BX-Users.csv')
    self.items_info_path = os.path.join(dataset_path, 'BX-Books.csv')
    self.rates_file_path = os.path.join(dataset_path, 'BX-Book-Ratings.csv')

    self.users = self.__load_user_info(self.users_info_path)
    self.items = self.__load_item_info(self.items_info_path)

    # The ratings file is ';'-separated; its first row is skipped and
    # replaced by our own column names. IDs are read as strings.
    rating_columns = ['User ID', 'Item ID', 'Rating']
    rating_dtypes = {'User ID': str, 'Item ID': str, 'Rating': np.float16}
    self.data = pd.read_csv(
        self.rates_file_path,
        sep=';',
        skiprows=[0],
        names=rating_columns,
        dtype=rating_dtypes,
        encoding='latin1',
    )

    # ID lists come from the ratings themselves, not from the metadata.
    self.user_ids = self.data['User ID'].drop_duplicates().tolist()
    self.item_ids = self.data['Item ID'].drop_duplicates().tolist()

    # Choose the container class and its trailing constructor arguments:
    # the ndarray flavour receives counts, the others receive the ID lists.
    kind = self.type
    if kind == DataType.ndarray:
        container = RatingNDArray
        extra = (len(self.user_ids), len(self.item_ids))
    elif kind == DataType.dataframe:
        container = RatingDataFrame
        extra = (self.user_ids, self.item_ids)
    elif kind == DataType.dictionary:
        container = RatingDictionary
        extra = (self.user_ids, self.item_ids)
    else:
        # Unrecognised type: no containers are created.
        container = None

    if container is not None:
        prefix = self.name + '_' + kind.name
        self.train_dataset = container(prefix + '_trainset', *extra)
        self.test_dataset = container(prefix + '_testset', *extra)
def __init__(self,
             name='MovieLensDataset100K',
             type=DataType.ndarray,
             split_ratio=0.8,
             dataset_path='G:/dataset/RS/MovieLens/ml-100k'):
    """Set up file paths, user/item metadata and empty rating containers.

    Args:
        name: Prefix used when naming the train/test containers.
        type: ``DataType`` member selecting the container implementation.
        split_ratio: Stored on the instance as ``self.split_ratio``; it is
            not used inside this constructor.
        dataset_path: Directory holding 'u.user', 'u.item' and 'u.data'.
    """
    self.name = name
    self.type = type
    self.split_ratio = split_ratio

    self.users_info_path = os.path.join(dataset_path, 'u.user')
    self.items_info_path = os.path.join(dataset_path, 'u.item')
    self.rates_file_path = os.path.join(dataset_path, 'u.data')

    self.users = self.__load_user_info(self.users_info_path)
    self.items = self.__load_item_info(self.items_info_path)

    # Unique IDs, in the order drop_duplicates keeps them.
    self.user_ids = self.users['ID'].drop_duplicates().tolist()
    self.item_ids = self.items['ID'].drop_duplicates().tolist()

    # Choose the container class and its trailing constructor arguments:
    # the ndarray flavour receives counts, the others receive the ID lists.
    kind = self.type
    if kind == DataType.ndarray:
        container = RatingNDArray
        extra = (len(self.user_ids), len(self.item_ids))
    elif kind == DataType.dataframe:
        container = RatingDataFrame
        extra = (self.user_ids, self.item_ids)
    elif kind == DataType.dictionary:
        container = RatingDictionary
        extra = (self.user_ids, self.item_ids)
    else:
        # Unrecognised type: no containers are created.
        container = None

    if container is not None:
        prefix = self.name + '_' + kind.name
        self.train_dataset = container(prefix + '_trainset', *extra)
        self.test_dataset = container(prefix + '_testset', *extra)
def __init__(self,
             name='MovieLensDataset1M',
             type=DataType.dictionary,
             split_ratio=0.8,
             dataset_path='G:/dataset/RS/MovieLens/ml-1m'):
    """Set up file paths, user/item metadata and empty rating containers.

    Args:
        name: Prefix used when naming the train/test containers.
        type: ``DataType`` member selecting the container implementation.
        split_ratio: Stored on the instance as ``self.split_ratio``; it is
            not used inside this constructor.
        dataset_path: Directory holding 'users.dat', 'movies.dat' and
            'ratings.dat'.
    """
    self.name = name
    self.type = type
    self.split_ratio = split_ratio

    self.users_info_path = os.path.join(dataset_path, 'users.dat')
    self.items_info_path = os.path.join(dataset_path, 'movies.dat')
    self.rates_file_path = os.path.join(dataset_path, 'ratings.dat')

    self.users = self.__load_user_info(self.users_info_path)
    self.items = self.__load_item_info(self.items_info_path)

    # ID spaces are hard-coded as contiguous ranges (users 1..6040,
    # items 1..3952) rather than derived from the loaded frames.
    self.user_ids = list(range(1, 6041))
    self.item_ids = list(range(1, 3953))

    # Choose the container class and its trailing constructor arguments:
    # the ndarray flavour receives counts, the others receive the ID lists.
    kind = self.type
    if kind == DataType.ndarray:
        container = RatingNDArray
        extra = (len(self.user_ids), len(self.item_ids))
    elif kind == DataType.dataframe:
        container = RatingDataFrame
        extra = (self.user_ids, self.item_ids)
    elif kind == DataType.dictionary:
        container = RatingDictionary
        extra = (self.user_ids, self.item_ids)
    else:
        # Unrecognised type: no containers are created.
        container = None

    if container is not None:
        prefix = self.name + '_' + kind.name
        self.train_dataset = container(prefix + '_trainset', *extra)
        self.test_dataset = container(prefix + '_testset', *extra)
def __init__(self,
             name='MovieLensDataset25M',
             type=DataType.dictionary,
             split_ratio=0.8,
             dataset_path='G:/dataset/RS/MovieLens/ml-25m'):
    """Set up file paths, item metadata and empty rating containers.

    No user metadata is loaded here; user IDs are generated from a
    hard-coded user count.

    Args:
        name: Prefix used when naming the train/test containers.
        type: ``DataType`` member selecting the container implementation.
        split_ratio: Stored on the instance as ``self.split_ratio``; it is
            not used inside this constructor.
        dataset_path: Directory holding 'movies.csv' and 'ratings.csv'.
    """
    self.name = name
    self.type = type
    self.split_ratio = split_ratio

    self.items_info_path = os.path.join(dataset_path, 'movies.csv')
    self.rates_file_path = os.path.join(dataset_path, 'ratings.csv')

    # Hard-coded user count; user IDs are taken to be 1..user_num.
    self.user_num = 162541
    self.items = self.__load_item_info(self.items_info_path)

    self.user_ids = list(range(1, self.user_num + 1))
    self.item_ids = self.items['ID'].drop_duplicates().tolist()

    # Choose the container class and its trailing constructor arguments:
    # the ndarray flavour receives counts, the others receive the ID lists.
    kind = self.type
    if kind == DataType.ndarray:
        container = RatingNDArray
        extra = (len(self.user_ids), len(self.item_ids))
    elif kind == DataType.dataframe:
        container = RatingDataFrame
        extra = (self.user_ids, self.item_ids)
    elif kind == DataType.dictionary:
        container = RatingDictionary
        extra = (self.user_ids, self.item_ids)
    else:
        # Unrecognised type: no containers are created.
        container = None

    if container is not None:
        prefix = self.name + '_' + kind.name
        self.train_dataset = container(prefix + '_trainset', *extra)
        self.test_dataset = container(prefix + '_testset', *extra)
def __init__(self,
             name='MovieLensDataset1M',
             type=DataType.dictionary,
             split_ratio=0.8,
             dataset_path='../data/MovieLens/ml-1m'):
    """Set up paths, user/item metadata, lookup tables and empty containers.

    Args:
        name: Prefix used when naming the train/test containers.
        type: ``DataType`` member selecting the container implementation.
        split_ratio: Stored on the instance as ``self.split_ratio``; it is
            not used inside this constructor.
        dataset_path: Directory holding 'users.dat', 'movies.dat' and
            'ratings.dat'.

    Raises:
        KeyError: If a loaded metadata frame has no 'ID' column (raised by
            ``set_index('ID')``).
    """
    self.name = name
    self.type = type
    self.split_ratio = split_ratio

    self.users_info_path = os.path.join(dataset_path, 'users.dat')
    self.items_info_path = os.path.join(dataset_path, 'movies.dat')
    self.rates_file_path = os.path.join(dataset_path, 'ratings.dat')

    self.users = self.__load_user_info(self.users_info_path)
    self.items = self.__load_item_info(self.items_info_path)

    # Fixed categorical encodings.
    self.gender2id = {'M': 0, 'F': 1}
    self.category2id = {
        'Action': 0,
        'Adventure': 1,
        'Animation': 2,
        "Children's": 3,
        'Comedy': 4,
        'Crime': 5,
        'Documentary': 6,
        'Drama': 7,
        'Fantasy': 8,
        'Film-Noir': 9,
        'Horror': 10,
        'Musical': 11,
        'Mystery': 12,
        'Romance': 13,
        'Sci-Fi': 14,
        'Thriller': 15,
        'War': 16,
        'Western': 17,
    }
    self.user_occupation_num = 21

    # Index the metadata frames by ID for direct lookup.
    self.users = self.users.set_index('ID').sort_index()
    self.items = self.items.set_index('ID').sort_index()

    # ID spaces are hard-coded as contiguous ranges (users 1..6040,
    # items 1..3952). The earlier drop_duplicates()-based lists were
    # always overwritten by these ranges, so that dead computation has
    # been removed.
    self.user_ids = list(range(1, 6041))
    self.item_ids = list(range(1, 3953))
    self.user_num = len(self.user_ids)
    self.item_num = len(self.item_ids)

    # Choose the container class and its trailing constructor arguments:
    # the ndarray flavour receives counts, the others receive the ID lists.
    kind = self.type
    if kind == DataType.ndarray:
        container = RatingNDArray
        extra = (self.user_num, self.item_num)
    elif kind == DataType.dataframe:
        container = RatingDataFrame
        extra = (self.user_ids, self.item_ids)
    elif kind == DataType.dictionary:
        container = RatingDictionary
        extra = (self.user_ids, self.item_ids)
    else:
        # Unrecognised type: no containers are created.
        container = None

    if container is not None:
        prefix = self.name + '_' + kind.name
        self.train_dataset = container(prefix + '_trainset', *extra)
        self.test_dataset = container(prefix + '_testset', *extra)