def __init__(self, pre_splitted_path):
        """Load the FilmTrust dataset, reusing a cached split when available.

        :param pre_splitted_path: base directory; the cached split is read
            from / written to its "data_split/" subfolder.
        """

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_FilmTrust: Attempting to load pre-splitted data")

            # Restore every saved attribute (URM_DICT, ICM_DICT, ...) onto self
            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_FilmTrust: Pre-splitted data not found, building new one"
            )

            data_reader = FilmTrustReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_all()

            URM_all.eliminate_zeros()

            # Binarize ratings: treat every stored interaction as implicit feedback
            URM_all.data = np.ones_like(URM_all.data)

            # 80/20 train-test split, then 90/10 train-validation split
            URM_train, URM_test = split_train_validation_percentage_random_holdout(
                URM_all, train_percentage=0.8)

            URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                URM_train, train_percentage=0.9)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            # BUGFIX: ICM_DICT was never assigned on this code path, so the
            # save call below raised AttributeError. FilmTrust has no ICM here.
            self.ICM_DICT = {}

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("FilmTrust: Dataset loaded")

        ut.print_stat_datareader(self)
# Exemplo n.º 2
# 0
    def __init__(self):
        """Load the Pinterest (ICCV) dataset, reusing a cached split when available.

        Attributes set on success: URM_train_original, URM_train, URM_test,
        URM_test_negative, URM_validation (restored from disk or built anew).
        """
        super(PinterestICCVReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/PinterestICCV/SIGIR/CMN_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("PinterestICCVReader: Attempting to load pre-splitted data")

            # Restore every saved attribute (URM_train, URM_test, ...) onto self
            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "PinterestICCVReader: Pre-splitted data not found, building new one"
            )

            print("PinterestICCVReader: loading URM")

            # Reuse the split published with the NeuMF reference implementation
            dataset = Dataset_NeuralCollaborativeFiltering(
                "Conferences/WWW/NeuMF_github/Data/pinterest-20")

            self.URM_train_original, self.URM_test, self.URM_test_negative = dataset.URM_train, dataset.URM_test, dataset.URM_test_negative

            # Carve an 80/20 validation split out of a copy of the original train data
            self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                self.URM_train_original.copy(), train_percentage=0.8)

            data_dict = {
                "URM_train_original": self.URM_train_original,
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_test_negative": self.URM_test_negative,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("PinterestICCVReader: loading complete")
    def __init__(self,
                 pre_splitted_path,
                 dataset_variant="a",
                 train_interactions=1):
        """Load the CiteULike dataset, reusing a cached split when available.

        :param pre_splitted_path: base directory; the cached split is read
            from / written to its "data_split/" subfolder.
        :param dataset_variant: "a" or "t", selecting the CiteULike variant.
        :param train_interactions: 1, 10 or "all" — which published train
            file to use and how the validation split is carved out.
        :raises AssertionError: if dataset_variant or train_interactions is invalid.
        """
        super(CiteulikeReader, self).__init__()

        assert dataset_variant in [
            "a", "t"
        ], "CiteulikeReader: dataset_variant must be either 'a' or 't'"
        assert train_interactions in [
            1, 10, "all"
        ], "CiteulikeReader: train_interactions must be: 1, 10 or 'all'"

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        original_data_path = "Conferences/KDD/CollaborativeVAE_github/data/citeulike-{}/".format(
            dataset_variant)

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("CiteulikeReader: Attempting to load pre-splitted data")

            # Restore every saved attribute (URM_DICT, ICM_DICT, ...) onto self
            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "CiteulikeReader: Pre-splitted data not found, building new one"
            )

            print("CiteulikeReader: loading URM")

            # "all" reuses the 10-interactions files and merges train+test below
            if train_interactions == "all":
                train_interactions_file_suffix = 10
            else:
                train_interactions_file_suffix = train_interactions

            URM_train_builder = self._load_data_file(
                original_data_path +
                "cf-train-{}-users.dat".format(train_interactions_file_suffix))
            URM_test_builder = self._load_data_file(
                original_data_path +
                "cf-test-{}-users.dat".format(train_interactions_file_suffix))

            URM_test = URM_test_builder.get_SparseMatrix()
            URM_train = URM_train_builder.get_SparseMatrix()

            if dataset_variant == "a":
                ICM_tokens_TFIDF = scipy.io.loadmat(original_data_path +
                                                    "mult_nor.mat")['X']
            else:
                # Variant "t" uses a different file format and is transposed.
                # Use a context manager so the HDF5 file handle is released.
                with h5py.File(original_data_path + "mult_nor.mat") as mat_file:
                    ICM_tokens_TFIDF = sps.csr_matrix(mat_file.get('X')).T

            ICM_tokens_TFIDF = sps.csr_matrix(ICM_tokens_TFIDF)

            # Boolean copy of the ICM: 1 wherever a token occurs
            ICM_tokens_bool = ICM_tokens_TFIDF.copy()
            ICM_tokens_bool.data = np.ones_like(ICM_tokens_bool.data)

            # Pad all matrices to a common shape so user/item indices align
            n_rows = max(URM_test.shape[0], URM_train.shape[0])
            n_cols = max(URM_test.shape[1], URM_train.shape[1],
                         ICM_tokens_TFIDF.shape[0])

            newShape = (n_rows, n_cols)

            URM_test = reshapeSparse(URM_test, newShape)
            URM_train = reshapeSparse(URM_train, newShape)

            if train_interactions == "all":

                # Merge everything, then re-split 80/20 test and 80/20 validation
                URM_train += URM_test

                URM_train, URM_test = split_train_validation_percentage_random_holdout(
                    URM_train, train_percentage=0.8)
                URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            elif train_interactions == 10:
                # If train interactions == 10 the train will NOT contain the validation data
                URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            else:
                # If train interactions == 1 the train WILL contain the validation data
                _, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            self.ICM_DICT = {
                "ICM_tokens_TFIDF": ICM_tokens_TFIDF,
                "ICM_tokens_bool": ICM_tokens_bool,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

            print("CiteulikeReader: loading complete")
# Exemplo n.º 4
# 0
    def __init__(self, dataset_variant="a", train_interactions=1):
        """Load the CiteULike dataset, reusing a cached split when available.

        :param dataset_variant: "a" or "t", selecting the CiteULike variant.
        :param train_interactions: 1, 10 or "all" — which published train
            file to use and how the split is built.
        :raises AssertionError: if dataset_variant or train_interactions is invalid.
        """
        super(CiteulikeReader, self).__init__()

        assert dataset_variant in [
            "a", "t"
        ], "CiteulikeReader: dataset_variant must be either 'a' or 't'"
        assert train_interactions in [
            1, 10, "all"
        ], "CiteulikeReader: train_interactions must be: 1, 10 or 'all'"

        pre_splitted_path = "Data_manager_split_datasets/CiteULike/KDD/CollaborativeVAE_our_interface/"

        pre_splitted_filename = "splitted_data_citeulike-{}-{}-items".format(
            dataset_variant, train_interactions)

        original_data_path = "Conferences/KDD/CollaborativeVAE_github/data/citeulike-{}/".format(
            dataset_variant)

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("CiteulikeReader: Attempting to load pre-splitted data")

            # Restore every saved attribute (URM_train, URM_test, ...) onto self
            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "CiteulikeReader: Pre-splitted data not found, building new one"
            )

            print("CiteulikeReader: loading URM")

            # "all" reuses the 10-interactions files and merges train+test below
            if train_interactions == "all":
                train_interactions_file_suffix = 10
            else:
                train_interactions_file_suffix = train_interactions

            URM_train_builder = self._load_data_file(
                original_data_path +
                "cf-train-{}-users.dat".format(train_interactions_file_suffix))
            URM_test_builder = self._load_data_file(
                original_data_path +
                "cf-test-{}-users.dat".format(train_interactions_file_suffix))

            self.URM_test = URM_test_builder.get_SparseMatrix()
            self.URM_train = URM_train_builder.get_SparseMatrix()

            if dataset_variant == "a":
                self.ICM_title_abstract = scipy.io.loadmat(original_data_path +
                                                           "mult_nor.mat")['X']
            else:
                # Variant "t" uses a different file format and is transposed.
                # Use a context manager so the HDF5 file handle is released.
                with h5py.File(original_data_path + "mult_nor.mat") as mat_file:
                    self.ICM_title_abstract = sps.csr_matrix(
                        mat_file.get('X')).T

            self.ICM_title_abstract = sps.csr_matrix(self.ICM_title_abstract)

            # Pad all matrices to a common shape so user/item indices align
            n_rows = max(self.URM_test.shape[0], self.URM_train.shape[0])
            n_cols = max(self.URM_test.shape[1], self.URM_train.shape[1],
                         self.ICM_title_abstract.shape[0])

            newShape = (n_rows, n_cols)

            self.URM_test = reshapeSparse(self.URM_test, newShape)
            self.URM_train = reshapeSparse(self.URM_train, newShape)

            if train_interactions == "all":

                # Merge everything, then re-split 80/20 test and 80/20 validation
                self.URM_train += self.URM_test

                self.URM_train, self.URM_test = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)
                self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)

            else:

                # Carve an 80/20 validation split out of the published train data
                self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
                "ICM_title_abstract": self.ICM_title_abstract
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("CiteulikeReader: loading complete")