示例#1
0
    def load_dataset(self):
        """Split the ratings and load them into fresh train/test containers.

        The ratings are obtained from ``self.__load_rate`` (given
        ``self.rates_file_dir`` and ``self.split_ratio``), which yields a
        train part and a test part.  ``self.user_ids`` and ``self.item_ids``
        are then refreshed from the distinct ``'User ID'`` / ``'Item ID'``
        values of ``self.data``, new containers matching ``self.type`` are
        created, and each part is handed to its container's ``load_dataset``.

        If ``self.type`` is none of the three handled ``DataType`` members no
        new containers are created and whatever ``self.train_dataset`` /
        ``self.test_dataset`` already hold is used.
        """
        print('loading ratings from path: ' + self.rates_file_dir)
        train_part, test_part = self.__load_rate(self.rates_file_dir,
                                                 self.split_ratio)

        ratings = self.data
        self.user_ids = ratings['User ID'].drop_duplicates().tolist()
        self.item_ids = ratings['Item ID'].drop_duplicates().tolist()

        # Choose the container class and its two size/id arguments once
        # instead of repeating the constructor calls in every branch.
        kind = self.type
        container_cls = None
        if kind == DataType.ndarray:
            # The array container is sized by counts, not by the id lists.
            container_cls = RatingNDArray
            ctor_args = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container_cls = RatingDataFrame
            ctor_args = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container_cls = RatingDictionary
            ctor_args = (self.user_ids, self.item_ids)

        if container_cls is not None:
            prefix = self.name + '_' + kind.name
            self.train_dataset = container_cls(prefix + '_trainset',
                                               *ctor_args)
            self.test_dataset = container_cls(prefix + '_testset', *ctor_args)

        self.train_dataset.load_dataset(train_part)
        self.test_dataset.load_dataset(test_part)
        print('Done.')
    def __init__(self,
                 name='MovieLensDataset100K',
                 type=DataType.ndarray,
                 split_ratio=0.8,
                 dataset_path='../data/MovieLens/ml-100k'):
        """Set up paths, side information and empty rating containers.

        Args:
            name: Label used as the prefix of the container names.
            type: ``DataType`` member selecting the rating container
                (``ndarray``, ``dataframe`` or ``dictionary``).  For any
                other value no containers are created.
            split_ratio: Stored on the instance as ``self.split_ratio``;
                not used further inside this constructor.
            dataset_path: Directory holding ``u.user``, ``u.item`` and
                ``u.data``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'u.user')
        self.items_info_path = os.path.join(dataset_path, 'u.item')
        self.rates_file_path = os.path.join(dataset_path, 'u.data')

        # Presumably both loaders return pandas DataFrames with an 'ID'
        # column (and 'Occupation' for users) -- verify against the loaders.
        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # Number the occupation groups in the order groupby reports them.
        occupation_names = list(
            self.users.groupby(by='Occupation').groups.keys())
        self.occupation2id = {
            occupation: index
            for index, occupation in enumerate(occupation_names)
        }
        self.gender2id = {'M': 0, 'F': 1}

        # The ids must be collected before 'ID' is moved into the index.
        self.user_ids = self.users['ID'].drop_duplicates().tolist()
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        self.user_num = len(self.user_ids)
        self.item_num = len(self.item_ids)
        self.user_occupation_num = len(self.occupation2id)

        self.users = self.users.set_index('ID').sort_index()
        self.items = self.items.set_index('ID').sort_index()

        kind = self.type
        container_cls = None
        if kind == DataType.ndarray:
            # The array container is sized by counts, not by the id lists.
            container_cls = RatingNDArray
            ctor_args = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container_cls = RatingDataFrame
            ctor_args = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container_cls = RatingDictionary
            ctor_args = (self.user_ids, self.item_ids)

        if container_cls is not None:
            prefix = self.name + '_' + kind.name
            self.train_dataset = container_cls(prefix + '_trainset',
                                               *ctor_args)
            self.test_dataset = container_cls(prefix + '_testset', *ctor_args)
    def __init__(self,
                 name='BookCrossingDataset',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/Book-crossing/BX-CSV-Dump'):
        """Read the Book-Crossing files and create empty rating containers.

        Args:
            name: Label used as the prefix of the container names.
            type: ``DataType`` member selecting the rating container
                (``ndarray``, ``dataframe`` or ``dictionary``).  For any
                other value no containers are created.
            split_ratio: Stored on the instance as ``self.split_ratio``;
                not used further inside this constructor.
            dataset_path: Directory holding ``BX-Users.csv``,
                ``BX-Books.csv`` and ``BX-Book-Ratings.csv``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'BX-Users.csv')
        self.items_info_path = os.path.join(dataset_path, 'BX-Books.csv')
        self.rates_file_path = os.path.join(dataset_path,
                                            'BX-Book-Ratings.csv')

        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # The first row of the file is skipped and replaced by these column
        # names; ids are kept as strings, ratings as half-precision floats.
        rating_columns = ['User ID', 'Item ID', 'Rating']
        rating_dtypes = {
            'User ID': str,
            'Item ID': str,
            'Rating': np.float16
        }
        self.data = pd.read_csv(self.rates_file_path,
                                sep=';',
                                skiprows=[0],
                                names=rating_columns,
                                dtype=rating_dtypes,
                                encoding='latin1')

        # Ids come from the rating file itself, not from the user/book files.
        self.user_ids = self.data['User ID'].drop_duplicates().tolist()
        self.item_ids = self.data['Item ID'].drop_duplicates().tolist()

        kind = self.type
        container_cls = None
        if kind == DataType.ndarray:
            # The array container is sized by counts, not by the id lists.
            container_cls = RatingNDArray
            ctor_args = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container_cls = RatingDataFrame
            ctor_args = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container_cls = RatingDictionary
            ctor_args = (self.user_ids, self.item_ids)

        if container_cls is not None:
            prefix = self.name + '_' + kind.name
            self.train_dataset = container_cls(prefix + '_trainset',
                                               *ctor_args)
            self.test_dataset = container_cls(prefix + '_testset', *ctor_args)
示例#4
0
    def __init__(self,
                 name='MovieLensDataset100K',
                 type=DataType.ndarray,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-100k'):
        """Load user/item information and create empty rating containers.

        Args:
            name: Label used as the prefix of the container names.
            type: ``DataType`` member selecting the rating container
                (``ndarray``, ``dataframe`` or ``dictionary``).  For any
                other value no containers are created.
            split_ratio: Stored on the instance as ``self.split_ratio``;
                not used further inside this constructor.
            dataset_path: Directory holding ``u.user``, ``u.item`` and
                ``u.data``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'u.user')
        self.items_info_path = os.path.join(dataset_path, 'u.item')
        self.rates_file_path = os.path.join(dataset_path, 'u.data')

        # Presumably both loaders return pandas DataFrames with an 'ID'
        # column -- verify against the loaders.
        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)
        self.user_ids = self.users['ID'].drop_duplicates().tolist()
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        kind = self.type
        container_cls = None
        if kind == DataType.ndarray:
            # The array container is sized by counts, not by the id lists.
            container_cls = RatingNDArray
            ctor_args = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container_cls = RatingDataFrame
            ctor_args = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container_cls = RatingDictionary
            ctor_args = (self.user_ids, self.item_ids)

        if container_cls is not None:
            prefix = self.name + '_' + kind.name
            self.train_dataset = container_cls(prefix + '_trainset',
                                               *ctor_args)
            self.test_dataset = container_cls(prefix + '_testset', *ctor_args)
示例#5
0
    def __init__(self,
                 name='MovieLensDataset1M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-1m'):
        """Load user/item information and create empty rating containers.

        Args:
            name: Label used as the prefix of the container names.
            type: ``DataType`` member selecting the rating container
                (``ndarray``, ``dataframe`` or ``dictionary``).  For any
                other value no containers are created.
            split_ratio: Stored on the instance as ``self.split_ratio``;
                not used further inside this constructor.
            dataset_path: Directory holding ``users.dat``, ``movies.dat``
                and ``ratings.dat``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'users.dat')
        self.items_info_path = os.path.join(dataset_path, 'movies.dat')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.dat')

        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # Fixed, contiguous id ranges (1..6040 users, 1..3952 items) are
        # used rather than the ids present in the loaded files.
        self.user_ids = list(range(1, 6041))
        self.item_ids = list(range(1, 3953))

        kind = self.type
        container_cls = None
        if kind == DataType.ndarray:
            # The array container is sized by counts, not by the id lists.
            container_cls = RatingNDArray
            ctor_args = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container_cls = RatingDataFrame
            ctor_args = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container_cls = RatingDictionary
            ctor_args = (self.user_ids, self.item_ids)

        if container_cls is not None:
            prefix = self.name + '_' + kind.name
            self.train_dataset = container_cls(prefix + '_trainset',
                                               *ctor_args)
            self.test_dataset = container_cls(prefix + '_testset', *ctor_args)
示例#6
0
    def __init__(self,
                 name='MovieLensDataset25M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-25m'):
        """Load item information and create empty rating containers.

        No user file is read here: user ids are the contiguous range
        ``1..self.user_num`` with ``self.user_num`` fixed at 162541.

        Args:
            name: Label used as the prefix of the container names.
            type: ``DataType`` member selecting the rating container
                (``ndarray``, ``dataframe`` or ``dictionary``).  For any
                other value no containers are created.
            split_ratio: Stored on the instance as ``self.split_ratio``;
                not used further inside this constructor.
            dataset_path: Directory holding ``movies.csv`` and
                ``ratings.csv``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio
        self.user_num = 162541

        self.items_info_path = os.path.join(dataset_path, 'movies.csv')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.csv')

        # Presumably a pandas DataFrame with an 'ID' column -- verify
        # against the loader.
        self.items = self.__load_item_info(self.items_info_path)
        self.user_ids = list(range(1, self.user_num + 1))
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        kind = self.type
        container_cls = None
        if kind == DataType.ndarray:
            # The array container is sized by counts, not by the id lists.
            container_cls = RatingNDArray
            ctor_args = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container_cls = RatingDataFrame
            ctor_args = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container_cls = RatingDictionary
            ctor_args = (self.user_ids, self.item_ids)

        if container_cls is not None:
            prefix = self.name + '_' + kind.name
            self.train_dataset = container_cls(prefix + '_trainset',
                                               *ctor_args)
            self.test_dataset = container_cls(prefix + '_testset', *ctor_args)
    def __init__(self,
                 name='MovieLensDataset1M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='../data/MovieLens/ml-1m'):
        """Load user/item information and create empty rating containers.

        Args:
            name: Label used as the prefix of the container names.
            type: ``DataType`` member selecting the rating container
                (``ndarray``, ``dataframe`` or ``dictionary``).  For any
                other value no containers are created.
            split_ratio: Stored on the instance as ``self.split_ratio``;
                not used further inside this constructor.
            dataset_path: Directory holding ``users.dat``, ``movies.dat``
                and ``ratings.dat``.
        """
        self.name = name
        self.type = type
        self.users_info_path = os.path.join(dataset_path, 'users.dat')
        self.items_info_path = os.path.join(dataset_path, 'movies.dat')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.dat')

        self.split_ratio = split_ratio
        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # Fixed lookup tables for the categorical side information.
        self.gender2id = {'M': 0, 'F': 1}
        self.category2id = {
            'Action': 0,
            'Adventure': 1,
            'Animation': 2,
            "Children's": 3,
            'Comedy': 4,
            'Crime': 5,
            'Documentary': 6,
            'Drama': 7,
            'Fantasy': 8,
            'Film-Noir': 9,
            'Horror': 10,
            'Musical': 11,
            'Mystery': 12,
            'Romance': 13,
            'Sci-Fi': 14,
            'Thriller': 15,
            'War': 16,
            'Western': 17
        }
        self.user_occupation_num = 21

        # Index both tables by 'ID'; a missing 'ID' column fails here.
        self.users = self.users.set_index('ID').sort_index()
        self.items = self.items.set_index('ID').sort_index()

        # Fixed, contiguous id ranges (1..6040 users, 1..3952 items) are
        # used rather than the ids present in the loaded files.  The earlier
        # file-derived id lists were always overwritten by these ranges, so
        # that redundant computation has been dropped.
        self.user_ids = list(range(1, 6041))
        self.item_ids = list(range(1, 3953))

        self.user_num = len(self.user_ids)
        self.item_num = len(self.item_ids)

        kind = self.type
        container_cls = None
        if kind == DataType.ndarray:
            # The array container is sized by counts, not by the id lists.
            container_cls = RatingNDArray
            ctor_args = (self.user_num, self.item_num)
        elif kind == DataType.dataframe:
            container_cls = RatingDataFrame
            ctor_args = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container_cls = RatingDictionary
            ctor_args = (self.user_ids, self.item_ids)

        if container_cls is not None:
            prefix = self.name + '_' + kind.name
            self.train_dataset = container_cls(prefix + '_trainset',
                                               *ctor_args)
            self.test_dataset = container_cls(prefix + '_testset', *ctor_args)