def __init__(self, pre_splitted_path):

        test_percentage = 0.2
        validation_percentage = 0.2

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If the directory does not exist, create it
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print(
                "Dataset_MovielensHetrec2011: Attempting to load pre-splitted data"
            )

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_MovielensHetrec2011: Pre-splitted data not found, building new one"
            )

            data_reader = MovielensHetrec2011Reader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_all()

            # keep only ratings equal to 5 (binarize the URM)
            URM_all.data = URM_all.data == 5
            URM_all.eliminate_zeros()
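            # (the boolean mask marks non-5 entries as False, i.e. 0, and
            # eliminate_zeros() then drops them from the sparse structure;
            # e.g. data [3, 5, 4, 5] -> [False, True, False, True] -> two entries kept)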

            # create train / test / validation splits
            URM_train_original, URM_test = split_train_validation_percentage_user_wise(
                URM_all, train_percentage=1 - test_percentage, verbose=False)

            URM_train, URM_validation = split_train_validation_percentage_user_wise(
                URM_train_original,
                train_percentage=1 - validation_percentage,
                verbose=False)
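            # note: the two user-wise holdouts compose to roughly
            # 64% train / 16% validation / 20% test of the full URM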

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            self.ICM_DICT = {}

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("Dataset_MovielensHetrec2011: Dataset loaded")

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If the directory does not exist, create it
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_FilmTrust: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_FilmTrust: Pre-splitted data not found, building new one"
            )

            data_reader = FilmTrustReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_all()

            URM_all.eliminate_zeros()

            URM_all.data = np.ones_like(URM_all.data)

            URM_train, URM_test = split_train_validation_percentage_random_holdout(
                URM_all, train_percentage=0.8)

            URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                URM_train, train_percentage=0.9)
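            # note: the two random holdouts compose to roughly
            # 72% train / 8% validation / 20% test of the full URM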

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            self.ICM_DICT = {}

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("FilmTrust: Dataset loaded")

        ut.print_stat_datareader(self)
    def __init__(self):

        test_percentage = 0.2
        validation_percentage = 0.2

        pre_splitted_path = "Data_manager_split_datasets/AmazonInstantVideo/RecSys/SpectralCF_our_interface/"
        pre_splitted_filename = "splitted_data"

        ratings_file_name = "ratings_Amazon_Instant_Video.csv"

        # If the directory does not exist, create it
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print(
                "Dataset_AmazonInstantVideo: Attempting to load pre-splitted data"
            )

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one"
            )

            folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

            downloadFromURL(self.DATASET_URL, folder_path, ratings_file_name)

            # read Amazon Instant Video
            df = pd.read_csv(folder_path + ratings_file_name,
                             sep=',',
                             header=None,
                             names=['user', 'item', 'rating',
                                    'timestamp'])[['user', 'item', 'rating']]

            # build the URM from the ratings dataframe
            URM_train_builder = IncrementalSparseMatrix(
                auto_create_col_mapper=True, auto_create_row_mapper=True)
            URM_train_builder.add_data_lists(df['user'].values,
                                             df['item'].values,
                                             df['rating'].values)
            URM_all = URM_train_builder.get_SparseMatrix()

            # keep only ratings equal to 5 (binarize the URM)
            URM_all.data = URM_all.data == 5
            URM_all.eliminate_zeros()

            # keep only users with at least 5 ratings
            URM_all = ut.filter_urm(URM_all,
                                    user_min_number_ratings=5,
                                    item_min_number_ratings=1)

            # create train / test / validation splits

            URM_train_original, self.URM_test = split_train_validation_percentage_user_wise(
                URM_all, train_percentage=1 - test_percentage, verbose=False)

            self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                URM_train_original,
                train_percentage=1 - validation_percentage,
                verbose=False)
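            # note: with both percentages at 0.2 this yields roughly
            # 64% train / 16% validation / 20% test of the filtered URM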

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Dataset_AmazonInstantVideo: Dataset loaded")

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path, original=True):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If the directory does not exist, create it
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            compressed_file_folder = "Conferences/IJCAI/ConvNCF_github/Data/"
            decompressed_file_folder = "Data_manager_split_datasets/Yelp/"

            # compressed_file = tarfile.open(compressed_file_folder + "yelp.test.negative.gz", "r:gz")
            # compressed_file.extract("yelp.test.negative", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "yelp.test.rating.gz", "r:gz")
            # compressed_file.extract("yelp.test.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "yelp.train.rating.gz", "r:gz")
            # compressed_file.extract("yelp.train.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()

            # if original:

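            # monkey-patch the original reader so that the test ratings are
            # loaded as a sparse matrix (testRatings is used as a matrix below)
            # rather than as a list of (user, item) pairs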
            Dataset_github.load_rating_file_as_list = Dataset_github.load_training_file_as_matrix

            try:
                dataset = Dataset_github(compressed_file_folder + "yelp")

            except FileNotFoundError as exc:

                print(
                    "Dataset_{}: Uncompressed files not found, please manually decompress the *.gz files in this folder: '{}'"
                    .format(self.DATASET_NAME, compressed_file_folder))

                raise exc

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            n_users = max(URM_train_original.shape[0], URM_test.shape[0])
            n_items = max(URM_train_original.shape[1], URM_test.shape[1])

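            # pad both URMs to a common (n_users, n_items) shape so that the
            # train and test matrices align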
            URM_train_original = sps.csr_matrix(URM_train_original,
                                                shape=(n_users, n_items))
            URM_test = sps.csr_matrix(URM_test, shape=(n_users, n_items))

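            # binarize to implicit feedback: every stored interaction becomes 1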
            URM_train_original.data = np.ones_like(URM_train_original.data)
            URM_test.data = np.ones_like(URM_test.data)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users, n_cols=n_items)

            n_negative_samples = 999
            for user_index in range(len(dataset.testNegatives)):
                user_test_items = dataset.testNegatives[user_index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead of {}".format(
                            user_index, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

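            # leave-one-out: hold out one interaction per user for validation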
            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy(), verbose=False)

            #
            # else:
            #     data_reader = YelpReader_DataManager()
            #     loaded_dataset = data_reader.load_data()
            #
            #     URM_all = loaded_dataset.get_URM_all()
            #
            #     URM_timestamp = URM_all.copy()
            #
            #     URM_all.data = np.ones_like(URM_all.data)
            #
            #     URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=999)
            #     URM_train = URM_train + URM_validation
            #     URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

            shutil.rmtree(decompressed_file_folder + "decompressed/",
                          ignore_errors=True)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            self.ICM_DICT = {}

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
    def __init__(self,
                 pre_splitted_path,
                 type="original",
                 cold_start=False,
                 cold_items=None):

        assert type in ["original", "ours"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # cold-start mode used by the original authors
        mode = 1

        # path to the pre-existing Movielens1M split
        movielens_splitted_path = "Conferences/RecSys/SpectralCF_github/data/ml-1m/"

        # If the directory does not exist, create it
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_Movielens1M: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Movielens1M: Pre-splitted data not found, building new one"
            )

            if type == "original":
                assert cold_start is False

                # use the SpectralCF class to read data
                data_generator = Data(
                    train_file=movielens_splitted_path + 'train_users.dat',
                    test_file=movielens_splitted_path + 'test_users.dat',
                    batch_size=BATCH_SIZE)

                # convert train into csr
                full_train_matrix = sps.csr_matrix(data_generator.R)
                URM_train_original = full_train_matrix

                # convert the test dict {user: [items]} into a CSR matrix
                test_set = data_generator.test_set
                uids, items = [], []
                for uid in test_set.keys():
                    uids += np.full(len(test_set[uid]), uid).tolist()
                    items += test_set[uid]
                test_matrix = sps.csr_matrix(
                    (np.ones(len(items)), (uids, items)),
                    shape=(full_train_matrix.shape))

                if not cold_start:
                    URM_test = test_matrix

                    # create validation
                    URM_train, URM_validation = split_train_validation_percentage_user_wise(
                        URM_train_original,
                        train_percentage=0.9,
                        verbose=False)

                else:
                    # unreachable: cold_start is asserted to be False when type == "original"
                    raise ValueError(
                        "Dataset_Movielens1M: cold_start is not supported for type 'original'"
                    )

            elif type == "ours":

                data_reader = Movielens1MReader_DataManager()
                loaded_dataset = data_reader.load_data()

                URM_all = loaded_dataset.get_URM_all()

                # keep only ratings equal to 5 (binarize the URM)
                URM_all.data = URM_all.data == 5
                URM_all.eliminate_zeros()

                if not cold_start:
                    URM_train, URM_test = split_train_validation_percentage_user_wise(
                        URM_all, train_percentage=0.8, verbose=False)

                    URM_train, URM_validation = split_train_validation_percentage_user_wise(
                        URM_train, train_percentage=0.9, verbose=False)

                else:

                    if mode == 1:  # the original authors' mode: cold start on the full dataset
                        URM_train, URM_test = split_train_validation_cold_start_user_wise(
                            URM_all,
                            full_train_percentage=0.0,
                            cold_items=cold_items,
                            verbose=False)

                        URM_test, URM_validation = split_train_validation_percentage_user_wise(
                            URM_test, train_percentage=0.9, verbose=False)

                    elif mode == 2:  # cold start only for some users
                        URM_train, URM_test = split_train_validation_cold_start_user_wise(
                            URM_all,
                            full_train_percentage=0.8,
                            cold_items=cold_items,
                            verbose=False)

                        URM_train, URM_validation = split_train_validation_cold_start_user_wise(
                            URM_train,
                            full_train_percentage=0.9,
                            cold_items=cold_items,
                            verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            self.ICM_DICT = {}

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("Dataset_Movielens1M: Dataset loaded")

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path, type='original'):
        assert type in ["original", "ours"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If the directory does not exist, create it
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            from Conferences.IJCAI.CoupledCF_original import LoadMovieDataCnn as DatareaderOriginal
            path = "Conferences/IJCAI/CoupledCF_original/ml-1m/"

            n_users, gender, age, occupation = DatareaderOriginal.load_user_attributes(
                path=path, split=True)
            n_items, items_genres_mat = DatareaderOriginal.load_itemGenres_as_matrix(
                path=path)
            ratings = DatareaderOriginal.load_rating_train_as_matrix(path=path)
            testRatings = DatareaderOriginal.load_rating_file_as_list(
                path=path)
            testNegatives = DatareaderOriginal.load_negative_file(path=path)

            URM_all = ratings.tocsr()

            UCM_gender = gender.tocsr()
            UCM_age = age.tocsr()
            UCM_occupation = occupation.tocsr()
            UCM_all = sps.hstack((UCM_gender, UCM_age, UCM_occupation)).tocsr()

            ICM_all = sps.csr_matrix(items_genres_mat)

            testRatings = np.array(testRatings).T
            URM_test_builder = IncrementalSparseMatrix(n_rows=n_users + 1,
                                                       n_cols=n_items + 1)
            URM_test_builder.add_data_lists(testRatings[0], testRatings[1],
                                            np.ones(len(testRatings[0])))

            URM_test = URM_test_builder.get_SparseMatrix()

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users + 1, n_cols=n_items + 1)

            # careful: the test negatives are indexed from 0 but refer to
            # user index 1 (user indices start from 1)
            n_negative_samples = 99
            for index in range(len(testNegatives)):
                user_test_items = testNegatives[index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead of {}".format(
                            index + 1, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(index + 1,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            if type == 'original':
                # keep the original test split as-is
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_all.copy(), verbose=False)

            else:  # redo the split
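                # merge train + test back into the full URM, then re-split it
                # twice leave-one-out: first a new test set, then a validation set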
                URM_full = URM_all + URM_test
                URM_temp, URM_test = split_train_validation_leave_one_out_user_wise(
                    URM_full.copy(), verbose=False)
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_temp.copy(), verbose=False)

            self.ICM_DICT = {
                "UCM_gender": UCM_gender,
                "UCM_occupation": UCM_occupation,
                "UCM_age": UCM_age,
                "UCM_all": UCM_all,
                "ICM_all": ICM_all,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path, type='original'):
        assert type in ["original", "ours"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If the directory does not exist, create it
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            from Conferences.IJCAI.CoupledCF_original import LoadTafengDataCnn as DatareaderOriginal
            path = "Conferences/IJCAI/CoupledCF_original/tafeng/"

            n_users, user_attributes_mat = DatareaderOriginal.load_user_attributes(
                path=path)
            n_items, items_genres_mat = DatareaderOriginal.load_itemGenres_as_matrix(
                path=path)
            ratings = DatareaderOriginal.load_rating_train_as_matrix(path=path)
            testRatings = DatareaderOriginal.load_rating_file_as_list(
                path=path)
            testNegatives = DatareaderOriginal.load_negative_file(path=path)

            URM_all = ratings.tocsr()

            UCM_all = sps.csc_matrix(user_attributes_mat)
            UCM_age = UCM_all[:, 0:11].tocsr()
            UCM_region = UCM_all[:, 11:19].tocsr()
            UCM_all = UCM_all.tocsr()

            # columns: 0 -> category, 1 -> asset (0-1), 2 -> price (0-1),
            # matching the slices taken below
            ICM_original = sps.csc_matrix(items_genres_mat)

            # expand the category column into a one-hot matrix instead of
            # keeping the raw category id as a single column
            ICM_sub_class = ICM_original[:, 0:1].tocsr()
            n_rows = ICM_sub_class.shape[0]
            rows, cols, data = [], [], []
            for idx in range(n_rows):
                # each row of the slice holds at most one value: the category id
                data_vect = ICM_sub_class.data[
                    ICM_sub_class.indptr[idx]:ICM_sub_class.indptr[idx + 1]]
                if len(data_vect) == 0:
                    # category id 0 is implicit (not stored) in a CSR matrix
                    cols.append(int(0))
                else:
                    cols.append(int(data_vect[0]))
                rows.append(idx)
                data.append(1.0)

            ICM_sub_class = sps.csr_matrix((data, (rows, cols)))
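            # e.g. category ids [2, 0, 5] for three items yield one-hot rows
            # with a single 1.0 in columns 2, 0 and 5 respectively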
            ICM_asset = ICM_original[:, 1:2].tocsr()
            ICM_price = ICM_original[:, 2:3].tocsr()

            ICM_original = ICM_original.tocsc()
            ICM_all = sps.hstack((ICM_sub_class, ICM_asset, ICM_price))

            testRatings = np.array(testRatings).T
            URM_test_builder = IncrementalSparseMatrix(n_rows=n_users + 1,
                                                       n_cols=n_items + 1)
            URM_test_builder.add_data_lists(testRatings[0], testRatings[1],
                                            np.ones(len(testRatings[0])))
            URM_test = URM_test_builder.get_SparseMatrix()

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users + 1, n_cols=n_items + 1)

            # careful: the test negatives are indexed from 0 but refer to
            # user index 1 (user indices start from 1)
            n_negative_samples = 99
            for index in range(len(testNegatives)):
                user_test_items = testNegatives[index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead of {}".format(
                            index + 1, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(index + 1,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            if type == 'original':
                # keep the original test split as-is
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_all.copy(), verbose=False)
            else:  # redo the split
                URM_full = URM_all + URM_test
                URM_temp, URM_test = split_train_validation_leave_one_out_user_wise(
                    URM_full.copy(), verbose=False)
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_temp.copy(), verbose=False)

            self.ICM_DICT = {
                "UCM_age": UCM_age,
                "UCM_region": UCM_region,
                "UCM_all": UCM_all,
                "ICM_all": ICM_all,
                "ICM_original": ICM_original,
                "ICM_sub_class": ICM_sub_class,
                "ICM_asset": ICM_asset,
                "ICM_price": ICM_price,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If the directory does not exist, create it
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            compressed_file_folder = "Conferences/IJCAI/ConvNCF_github/Data/"
            decompressed_file_folder = "Data_manager_split_datasets/Gowalla/"

            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.test.negative.gz", "r:gz")
            # compressed_file.extract("gowalla.test.negative", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.test.rating.gz", "r:gz")
            # compressed_file.extract("gowalla.test.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.train.rating.gz", "r:gz")
            # compressed_file.extract("gowalla.train.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()

            # if original:

            Dataset_github.load_rating_file_as_list = Dataset_github.load_training_file_as_matrix

            try:
                dataset = Dataset_github(compressed_file_folder + "gowalla")

            except FileNotFoundError as exc:

                print(
                    "Dataset_{}: Gowalla files not found, please download them and put them in this folder '{}', url: {}"
                    .format(self.DATASET_NAME, compressed_file_folder,
                            self.DATASET_URL))
                print(
                    "Dataset_{}: Uncompressed files not found, please manually decompress the *.gz files in this folder: '{}'"
                    .format(self.DATASET_NAME, compressed_file_folder))

                raise exc

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            n_users = max(URM_train_original.shape[0], URM_test.shape[0])
            n_items = max(URM_train_original.shape[1], URM_test.shape[1])

            URM_train_original = sps.csr_matrix(URM_train_original,
                                                shape=(n_users, n_items))
            URM_test = sps.csr_matrix(URM_test, shape=(n_users, n_items))

            URM_train_original.data = np.ones_like(URM_train_original.data)
            URM_test.data = np.ones_like(URM_test.data)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users, n_cols=n_items)

            n_negative_samples = 999
            for user_index in range(len(dataset.testNegatives)):
                user_test_items = dataset.testNegatives[user_index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead of {}".format(
                            user_index, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix().tocsr()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy(), verbose=False)

            #
            #
            # # NOT USED
            # # elif not time_split: # create from the full dataset with random leave-one-out from the LINKED dataset in the article, since timestamps are not present
            # #
            # #     data_reader = GowallaGithubReader_DataManager()
            # #     loaded_dataset = data_reader.load_data()
            # #
            # #     URM_all = loaded_dataset.get_URM_all()
            # #
            # #     URM_all.eliminate_zeros()
            # #
            # #     URM_all.data = np.ones_like(URM_all.data)
            # #
            # #     # use this function 2 times because the order can slightly change the number of final interactions
            # #     # with this order we get the same number of interactions as in the paper
            # #     URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=10)
            # #     URM_all = filter_urm(URM_all, user_min_number_ratings=2, item_min_number_ratings=0)
            # #
            # #     URM_train, URM_validation, URM_test, URM_negative = split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=999,
            # #                                                                                                                                          at_least_n_train_items_test=0, at_least_n_train_items_validation=0,
            # #                                                                                                                                          verbose=True)
            # #     URM_timestamp = sps.csc_matrix(([],([],[])), shape=URM_train.shape)
            #
            # else: # create from the full dataset with time-wise leave-one-out from the ORIGINAL full dataset
            #     data_reader = GowallaReader_DataManager()
            #     loaded_dataset = data_reader.load_data()
            #
            #     URM_all = loaded_dataset.get_URM_all()
            #
            #     # use this function 2 times because the order can slightly change the number of final interactions
            #     # with this order we get the same number of interactions as in the paper
            #     URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=10)
            #     URM_all = filter_urm(URM_all, user_min_number_ratings=2, item_min_number_ratings=0)
            #
            #     URM_timestamp = URM_all.copy()
            #     URM_all.data = np.ones_like(URM_all.data)
            #
            #     URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=999)
            #     URM_train = URM_train + URM_validation
            #     URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            self.ICM_DICT = {}

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
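

# A minimal sketch of the load-or-build caching pattern shared by all of the
# constructors above, assuming only the repository's load_data_dict_zip /
# save_data_dict_zip helpers; build_split is a hypothetical callable that
# stands in for the dataset-specific split logic and returns the URM_* dict.
import os


def load_or_build_split(pre_splitted_path, pre_splitted_filename, build_split):
    # If the directory does not exist, create it
    os.makedirs(pre_splitted_path, exist_ok=True)

    try:
        # attempt to load a previously saved split
        return load_data_dict_zip(pre_splitted_path, pre_splitted_filename)

    except FileNotFoundError:
        # build the split from scratch and cache it for the next run
        URM_DICT = build_split()
        save_data_dict_zip(URM_DICT, {}, pre_splitted_path,
                           pre_splitted_filename)
        return URM_DICT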