def __init__(self):
    """Load the MovielensHetrec2011 split for SpectralCF, building it on first use.

    Tries to restore a previously saved train/validation/test split from disk;
    if none is found, reads the raw data, keeps only 5-star ratings, performs
    a user-wise percentage split and caches the result.
    """
    test_percentage = 0.2
    validation_percentage = 0.2

    pre_splitted_path = "Data_manager_split_datasets/MovielensHetrec2011/RecSys/SpectralCF_our_interface/"
    pre_splitted_filename = "splitted_data"

    # Make sure the cache directory exists
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_MovielensHetrec2011: Attempting to load pre-splitted data")

        cached_attributes = load_data_dict(pre_splitted_path, pre_splitted_filename)
        for name, value in cached_attributes.items():
            setattr(self, name, value)

    except FileNotFoundError:
        print("Dataset_MovielensHetrec2011: Pre-splitted data not found, building new one")

        data_reader = MovielensHetrec2011Reader_DataManager()
        data_reader.load_data()

        URM_all = data_reader.get_URM_all()

        # Binarize: keep only the 5-star interactions
        URM_all.data = URM_all.data == 5
        URM_all.eliminate_zeros()

        # User-wise holdout: first carve out the test set, then the validation set
        URM_train_and_validation, self.URM_test = split_train_validation_percentage_user_wise(
            URM_all, train_percentage=1 - test_percentage, verbose=False)

        self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
            URM_train_and_validation, train_percentage=1 - validation_percentage, verbose=False)

        data_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Dataset_MovielensHetrec2011: Dataset loaded")

    ut.print_stat_datareader(self)
def __init__(self):
    """Load the Pinterest ICCV split used by CMN, building it on first use.

    Restores a cached split from disk if present; otherwise reuses the split
    shipped with the NeuMF repository and holds out a validation set.
    """
    super(PinterestICCVReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/PinterestICCV/SIGIR/CMN_our_interface/"
    pre_splitted_filename = "splitted_data"

    # Make sure the cache directory exists
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("PinterestICCVReader: Attempting to load pre-splitted data")

        for name, value in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            setattr(self, name, value)

    except FileNotFoundError:
        print("PinterestICCVReader: Pre-splitted data not found, building new one")
        print("PinterestICCVReader: loading URM")

        # NOTE(review): the split is taken directly from the NeuMF repository
        # data rather than being rebuilt from the raw PinterestICCV dataset.
        dataset = Dataset_NeuralCollaborativeFiltering("Conferences/WWW/NeuMF_github/Data/pinterest-20")

        self.URM_train_original = dataset.URM_train
        self.URM_test = dataset.URM_test
        self.URM_test_negative = dataset.URM_test_negative

        # Hold out 20% of the original train data as validation
        self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
            self.URM_train_original.copy(), train_percentage=0.8)

        data_dict = {
            "URM_train_original": self.URM_train_original,
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_test_negative": self.URM_test_negative,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("PinterestICCVReader: loading complete")
def __init__(self):
    """Load the CiteULike-a split used by CMN, building it on first use.

    Restores a cached split from disk if present; otherwise builds the URMs
    from the CMN repository data and holds out a leave-one-out validation set.
    """
    super(CiteULikeReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/CiteULike/SIGIR/CMN_our_interface/"
    pre_splitted_filename = "splitted_data"

    # Make sure the cache directory exists
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("CiteULikeReader: Attempting to load pre-splitted data")

        for name, value in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            setattr(self, name, value)

    except FileNotFoundError:
        print("CiteULikeReader: Pre-splitted data not found, building new one")
        print("CiteULikeReader: loading URM")

        filename = "Conferences/SIGIR/CMN_github/data/citeulike-a.npz"

        self.URM_train_original, self.URM_test, self.URM_test_negative = self.build_sparse_matrix(filename)

        # Leave-one-out validation split taken from the original train data
        self.URM_train, self.URM_validation = split_train_validation_leave_one_out_user_wise(
            self.URM_train_original.copy())

        data_dict = {
            "URM_train_original": self.URM_train_original,
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_test_negative": self.URM_test_negative,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("N_items {}, n_users {}".format(self.URM_train.shape[1], self.URM_train.shape[0]))
    print("CiteULikeReader: Dataset loaded")
def __init__(self):
    """Load the Epinions split used by CMN, building it on first use.

    Restores a cached split from disk if present; otherwise binarizes the raw
    interactions and performs a leave-one-out split with 100 sampled negatives
    per positive test item.
    """
    super(EpinionsReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/Epinions/SIGIR/CMN_our_interface/"
    pre_splitted_filename = "splitted_data"

    # Make sure the cache directory exists
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("EpinionsReader: Attempting to load pre-splitted data")

        cached_attributes = load_data_dict(pre_splitted_path, pre_splitted_filename)
        for name, value in cached_attributes.items():
            setattr(self, name, value)

    except FileNotFoundError:
        print("EpinionsReader: Pre-splitted data not found, building new one")
        print("EpinionsReader: loading URM")

        data_reader = EpinionsReader_DataManager()
        data_reader.load_data()

        URM_all = data_reader.get_URM_all()

        # Implicit feedback: every interaction counts as 1
        URM_all.data = np.ones_like(URM_all.data)

        self.URM_train, self.URM_validation, self.URM_test, self.URM_test_negative = \
            split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=100)

        # Compatibility with the other two datasets
        self.URM_train_original = self.URM_train + self.URM_validation

        data_dict = {
            "URM_train_original": self.URM_train_original,
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_test_negative": self.URM_test_negative,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("EpinionsReader: loading complete")
def __init__(self):
    """Load the Movielens100K split for MCRec, building it on first use.

    Restores a cached split from disk if present; otherwise rebuilds the
    train/test/negative URMs from the MCRec repository data (restoring
    0-based indexing), holds out a validation split, and loads the genre ICM
    from the raw ml-100k archive.
    """
    super(Movielens100KReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/Movielens100K/KDD/MCRec_our_interface/"
    pre_splitted_filename = "splitted_data"
    original_data_path = "Conferences/KDD/MCRec_github/data/"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Movielens100KReader: Attempting to load pre-splitted data")

        # Restore every cached attribute (URM_train, URM_test, ...) onto self
        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:
        print("Movielens100KReader: Pre-splitted data not found, building new one")
        print("Movielens100KReader: loading URM")

        # Local import: the MCRec repository code is only needed on a rebuild
        from Conferences.KDD.MCRec_github.code.Dataset import Dataset

        dataset = 'ml-100k'
        dataset = Dataset(original_data_path + dataset)

        URM_train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

        # Dataset adds 1 to user and item id, removing it to restore 0 indexing
        URM_train = sps.coo_matrix(URM_train)
        URM_train.row -= 1
        URM_train.col -= 1

        # Binarize the train interactions while rebuilding the CSR matrix
        self.URM_train = sps.csr_matrix((np.ones_like(URM_train.data), (URM_train.row, URM_train.col)))

        num_users, num_items = self.URM_train.shape

        # Build sparse matrices from lists
        URM_test_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
        URM_test_negative_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

        # testRatings rows are [user_id, item_id, ...]; testNegatives rows are
        # the sampled negative items for the same user — all still 1-based
        for user_index in range(len(testRatings)):

            user_id = testRatings[user_index][0]

            current_user_test_items = testRatings[user_index][1:]
            current_user_test_negative_items = testNegatives[user_index]

            # Shift item ids and the user id back to 0-based indexing
            current_user_test_items = np.array(current_user_test_items) -1
            current_user_test_negative_items = np.array(current_user_test_negative_items) -1

            URM_test_builder.add_single_row(user_id -1, current_user_test_items, 1.0)
            URM_test_negative_builder.add_single_row(user_id -1, current_user_test_negative_items, 1.0)

        # the test data has repeated data, apparently

        self.URM_test = URM_test_builder.get_SparseMatrix()
        self.URM_test_negative = URM_test_negative_builder.get_SparseMatrix()

        # Split validation from train as 10%
        from Data_manager.split_functions.split_train_validation import split_train_validation_percentage_user_wise

        self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(self.URM_train, train_percentage=0.9)

        # Load features: extract u.item from the raw ml-100k archive to read
        # the genre ICM, then clean up the temporary decompressed folder
        data_reader = Movielens100KReader_DataManager()
        data_reader.load_data()

        zipFile_path = data_reader.DATASET_SPLIT_ROOT_FOLDER + data_reader.DATASET_SUBFOLDER

        dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

        ICM_path = dataFile.extract("ml-100k/u.item", path=zipFile_path + "decompressed/")

        ICM_genre = self._loadICM(ICM_path)
        ICM_genre = ICM_genre.get_SparseMatrix()

        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        self.ICM_dict = {"ICM_genre": ICM_genre}

        data_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
            "URM_test_negative": self.URM_test_negative,
            "ICM_dict": self.ICM_dict,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Movielens100KReader: loading complete")
def __init__(self):
    """Load the Netflix Prize split for Mult-VAE, building it on first use.

    Restores a cached split from disk if present; otherwise binarizes the
    ratings (keeping only ratings >= 4), performs the VAE-CF user-based split
    with 40000 held-out users, pads all splits to a common shape and caches
    the result.
    """
    super(NetflixPrizeReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/NetflixPrize/WWW/MultiVAE_our_interface/"
    pre_splitted_filename = "splitted_data"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("NetflixPrizeReader: Attempting to load pre-splitted data")

        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:
        print("NetflixPrizeReader: Pre-splitted data not found, building new one")

        data_reader = NetflixPrizeReader_DataManager()
        data_reader.load_data()

        URM_all = data_reader.get_URM_all()

        # binarize the data (only keep ratings >= 4)
        URM_all.data = URM_all.data >= 4.0
        URM_all.eliminate_zeros()

        # The VAE-CF split function operates on a (user, item, rating) dataframe
        URM_all = sps.coo_matrix(URM_all)

        dict_for_dataframe = {"userId": URM_all.row,
                              "movieId": URM_all.col,
                              "rating": URM_all.data
                              }

        URM_all_dataframe = pd.DataFrame(data=dict_for_dataframe)

        self.URM_train, self.URM_train_all, self.URM_validation, self.URM_test = split_train_validation_test_VAE_CF(
            URM_all_dataframe, n_heldout_users=40000)

        # The four splits may come back with different shapes: pad every one
        # of them to the common (n_users, n_items) shape
        n_rows = max(self.URM_train.shape[0], self.URM_train_all.shape[0],
                     self.URM_validation.shape[0], self.URM_test.shape[0])
        n_cols = max(self.URM_train.shape[1], self.URM_train_all.shape[1],
                     self.URM_validation.shape[1], self.URM_test.shape[1])

        newShape = (n_rows, n_cols)

        self.URM_test = reshapeSparse(self.URM_test, newShape)
        self.URM_train = reshapeSparse(self.URM_train, newShape)
        self.URM_train_all = reshapeSparse(self.URM_train_all, newShape)
        # BUGFIX: URM_test was reshaped twice while URM_validation was never
        # reshaped, leaving the validation split with an inconsistent shape
        self.URM_validation = reshapeSparse(self.URM_validation, newShape)

        data_dict = {
            "URM_train": self.URM_train,
            "URM_train_all": self.URM_train_all,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("NetflixPrizeReader: Dataset loaded")
def __init__(self):
    """Load the Movielens1M split for NeuMF, building it on first use.

    Restores a cached split from disk if present; otherwise reads the split
    shipped with the NeuMF repository, aligns train/test shapes, builds the
    test-negative URM and holds out a leave-one-out validation set.
    """
    super(Movielens1MReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/Movielens1M/WWW/NeuMF_our_interface/"
    pre_splitted_filename = "splitted_data"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_Movielens1M: Attempting to load pre-splitted data")

        # Restore every cached attribute (URM_train, URM_test, ...) onto self
        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:
        print("Dataset_Movielens1M: Pre-splitted data not found, building new one")

        # Ensure file is loaded as matrix
        # HACK: monkey-patches the NeuMF Dataset class so that its constructor
        # loads the test ratings file as a matrix instead of a list
        Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

        dataset = Dataset_github("Conferences/WWW/NeuMF_github/Data/ml-1m")

        self.URM_train_original, self.URM_test = dataset.trainMatrix, dataset.testRatings

        self.URM_train_original = self.URM_train_original.tocsr()
        self.URM_test = self.URM_test.tocsr()

        # Local import: only needed when rebuilding the split
        from Base.Recommender_utils import reshapeSparse

        # Pad train and test to a common (n_users, n_items) shape
        shape = (max(self.URM_train_original.shape[0], self.URM_test.shape[0]),
                 max(self.URM_train_original.shape[1], self.URM_test.shape[1]))

        self.URM_train_original = reshapeSparse(self.URM_train_original, shape)
        self.URM_test = reshapeSparse(self.URM_test, shape)

        # Build the test-negative URM: one row per user with the sampled
        # negative items flagged with 1.0
        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=shape[0], n_cols=shape[1])

        for user_index in range(len(dataset.testNegatives)):

            user_test_items = dataset.testNegatives[user_index]

            URM_test_negatives_builder.add_single_row(user_index, user_test_items, data=1.0)

        self.URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()

        # Leave-one-out validation split taken from the original train data
        self.URM_train, self.URM_validation = split_train_validation_leave_one_out_user_wise(
            self.URM_train_original.copy())

        data_dict = {
            "URM_train_original": self.URM_train_original,
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_test_negative": self.URM_test_negative,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Dataset_Movielens1M: Dataset loaded")

    print("N_items {}, n_users {}".format(self.URM_train.shape[1], self.URM_train.shape[0]))
def __init__(self, split_type="cold_user"):
    """Load the Movielens20M split for Mult-VAE, building it on first use.

    :param split_type: "cold_user" for the VAE-CF user-based holdout split
        (10000 held-out users), or "warm_user" for a leave-one-out split on
        users with at least 4 interactions
    """
    super(Movielens20MReader, self).__init__()

    assert split_type in ["cold_user", "warm_user"]

    pre_splitted_path = "Data_manager_split_datasets/Movielens20M/WWW/MultiVAE_our_interface/"
    pre_splitted_filename = "splitted_data" + "_" + split_type

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Movielens20MReader: Attempting to load pre-splitted data")

        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:
        print("Movielens20MReader: Pre-splitted data not found, building new one")

        data_reader = Movielens20MReader_DataManager()
        data_reader.load_data()

        URM_all = data_reader.get_URM_all()

        # binarize the data (only keep ratings >= 4)
        URM_all.data = URM_all.data >= 4.0
        URM_all.eliminate_zeros()

        if split_type == "cold_user":

            # The VAE-CF split function operates on a (user, item, rating) dataframe
            URM_all = sps.coo_matrix(URM_all)

            dict_for_dataframe = {"userId": URM_all.row,
                                  "movieId": URM_all.col,
                                  "rating": URM_all.data
                                  }

            URM_all_dataframe = pd.DataFrame(data=dict_for_dataframe)

            self.URM_train, self.URM_train_all, self.URM_validation, self.URM_test = split_train_validation_test_VAE_CF(
                URM_all_dataframe, n_heldout_users=10000)

            # Pad every split to the common (n_users, n_items) shape
            n_rows = max(self.URM_train.shape[0], self.URM_train_all.shape[0],
                         self.URM_validation.shape[0], self.URM_test.shape[0])
            n_cols = max(self.URM_train.shape[1], self.URM_train_all.shape[1],
                         self.URM_validation.shape[1], self.URM_test.shape[1])

            newShape = (n_rows, n_cols)

            self.URM_test = reshapeSparse(self.URM_test, newShape)
            self.URM_train = reshapeSparse(self.URM_train, newShape)
            self.URM_train_all = reshapeSparse(self.URM_train_all, newShape)
            # BUGFIX: URM_test was reshaped twice while URM_validation was
            # never reshaped, leaving it with an inconsistent shape
            self.URM_validation = reshapeSparse(self.URM_validation, newShape)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_train_all": self.URM_train_all,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
            }

        elif split_type == "warm_user":

            # Keep only users with at least 4 interactions
            URM_all = sps.csr_matrix(URM_all)
            users_to_keep = np.ediff1d(URM_all.indptr) >= 4
            URM_all = URM_all[users_to_keep, :]

            # Keep only items with at least 1 interaction
            URM_all = sps.csc_matrix(URM_all)
            items_to_keep = np.ediff1d(URM_all.indptr) >= 1
            URM_all = URM_all[:, items_to_keep]

            URM_all = sps.csr_matrix(URM_all)

            # Negative items are not used for this split, discard them
            self.URM_train, self.URM_validation, self.URM_test, _ = split_train_validation_test_negative_leave_one_out_user_wise(URM_all)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation
            }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Movielens20MReader: Dataset loaded")
def __init__(self):
    """Load the Amazon Instant Video split for SpectralCF, building it on first use.

    Tries to restore a previously saved train/validation/test split from disk;
    if none is found, downloads the ratings CSV, keeps only 5-star ratings of
    users with at least 5 interactions, splits user-wise and caches the result.
    """
    test_percentage = 0.2
    validation_percentage = 0.2

    pre_splitted_path = "Data_manager_split_datasets/AmazonInstantVideo/RecSys/SpectralCF_our_interface/"
    pre_splitted_filename = "splitted_data"

    ratings_file_name = "ratings_Amazon_Instant_Video.csv"

    # Make sure the cache directory exists
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_AmazonInstantVideo: Attempting to load pre-splitted data")

        cached_attributes = load_data_dict(pre_splitted_path, pre_splitted_filename)
        for name, value in cached_attributes.items():
            setattr(self, name, value)

    except FileNotFoundError:
        print("Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one")

        folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

        downloadFromURL(self.DATASET_URL, folder_path, ratings_file_name)

        # Read the raw (user, item, rating) triples; the timestamp is discarded
        df = pd.read_csv(folder_path + ratings_file_name, sep=',', header=None,
                         names=['user', 'item', 'rating', 'timestamp'])[['user', 'item', 'rating']]

        URM_train_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)
        URM_train_builder.add_data_lists(df['user'].values, df['item'].values, df['rating'].values)

        URM_all = URM_train_builder.get_SparseMatrix()

        # Binarize: keep only the 5-star ratings
        URM_all.data = URM_all.data == 5
        URM_all.eliminate_zeros()

        # Keep only users with at least 5 ratings
        URM_all = ut.filter_urm(URM_all, user_min_number_ratings=5, item_min_number_ratings=1)

        # User-wise holdout: first carve out the test set, then the validation set
        URM_train_and_validation, self.URM_test = split_train_validation_percentage_user_wise(
            URM_all, train_percentage=1 - test_percentage, verbose=False)

        self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
            URM_train_and_validation, train_percentage=1 - validation_percentage, verbose=False)

        data_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Dataset_AmazonInstantVideo: Dataset loaded")

    ut.print_stat_datareader(self)
def __init__(self, type="original", cold_start=False, cold_items=None):
    """Load the Movielens1M split for SpectralCF, building it on first use.

    :param type: "original" uses the split shipped with the SpectralCF
        repository; "ours" rebuilds the split from the raw Movielens1M data
    :param cold_start: whether to build an item cold-start split; only
        supported when type="ours"
    :param cold_items: number of cold items, required when cold_start is True
    """
    assert type in ["original", "ours"]

    # path for pre existed movielens1M split
    movielens_splitted_path = "Conferences/RecSys/SpectralCF_github/data/ml-1m/"

    pre_splitted_path = "Data_manager_split_datasets/Movielens1M/RecSys/SpectralCF_our_interface/"

    mode = 1  # their mode in cold start

    if cold_start:
        assert (isinstance(cold_items, int) and cold_items > 0)
        pre_splitted_filename = "splitted_data_{}_cold_start_{}_mode_{}".format(type, cold_items, mode)
    else:
        pre_splitted_filename = "splitted_data_{}".format(type)

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_Movielens1M: Attempting to load pre-splitted data")

        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:
        print("Dataset_Movielens1M: Pre-splitted data not found, building new one")

        if type == "original":
            assert (cold_start is False)

            # use the SpectralCF class to read data
            data_generator = Data(train_file=movielens_splitted_path + 'train_users.dat',
                                  test_file=movielens_splitted_path + 'test_users.dat',
                                  batch_size=BATCH_SIZE)

            # convert train into csr
            full_train_matrix = sps.csr_matrix(data_generator.R)
            URM_train_original = full_train_matrix

            # convert test into csr
            test_set = data_generator.test_set
            uids, items = [], []
            for uid in test_set.keys():
                uids += np.full(len(test_set[uid]), uid).tolist()
                items += test_set[uid]

            test_matrix = sps.csr_matrix((np.ones(len(items)), (uids, items)), shape=(full_train_matrix.shape))

            if not cold_start:
                self.URM_test = test_matrix

                # create validation
                self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                    URM_train_original, train_percentage=0.9, verbose=False)
            else:
                # BUGFIX: this branch used to print 'nothing' and then crash
                # later with an AttributeError (URM_train never set). It is
                # unreachable because of the assert above, but fail fast with
                # a clear message in case the assert is ever removed.
                raise ValueError("Dataset_Movielens1M: cold_start is not supported with type='original'")

        elif type == "ours":

            data_reader = Movielens1MReader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()

            # Binarize: keep only the 5-star ratings
            URM_all.data = URM_all.data == 5
            URM_all.eliminate_zeros()

            if not cold_start:
                URM_train, self.URM_test = split_train_validation_percentage_user_wise(
                    URM_all, train_percentage=0.8, verbose=False)

                self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                    URM_train, train_percentage=0.9, verbose=False)

            else:
                if mode == 1:
                    # their mode, cold start for full dataset
                    self.URM_train, URM_test = split_train_validation_cold_start_user_wise(
                        URM_all, full_train_percentage=0.0, cold_items=cold_items, verbose=False)

                    self.URM_test, self.URM_validation = split_train_validation_percentage_user_wise(
                        URM_test, train_percentage=0.9, verbose=False)

                if mode == 2:
                    # cold start only for some users
                    URM_train, self.URM_test = split_train_validation_cold_start_user_wise(
                        URM_all, full_train_percentage=0.8, cold_items=cold_items, verbose=False)

                    self.URM_train, self.URM_validation = split_train_validation_cold_start_user_wise(
                        URM_train, full_train_percentage=0.9, cold_items=cold_items, verbose=False)

        data_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("Dataset_Movielens1M: Dataset loaded")

    ut.print_stat_datareader(self)
def __init__(self, dataset_variant="a", train_interactions=1):
    """Load the CiteULike split for CollaborativeVAE, building it on first use.

    :param dataset_variant: "a" or "t", selecting the citeulike variant
    :param train_interactions: 1, 10 or "all" train interactions per user;
        "all" merges the 10-interaction train and test files and re-splits
    """
    super(CiteulikeReader, self).__init__()

    assert dataset_variant in [
        "a", "t"
    ], "CiteulikeReader: dataset_variant must be either 'a' or 't'"

    assert train_interactions in [
        1, 10, "all"
    ], "CiteulikeReader: train_interactions must be: 1, 10 or 'all'"

    pre_splitted_path = "Data_manager_split_datasets/CiteULike/KDD/CollaborativeVAE_our_interface/"
    pre_splitted_filename = "splitted_data_citeulike-{}-{}-items".format(dataset_variant, train_interactions)

    original_data_path = "Conferences/KDD/CollaborativeVAE_github/data/citeulike-{}/".format(dataset_variant)

    # Make sure the cache directory exists
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("CiteulikeReader: Attempting to load pre-splitted data")

        for name, value in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            setattr(self, name, value)

    except FileNotFoundError:
        print("CiteulikeReader: Pre-splitted data not found, building new one")
        print("CiteulikeReader: loading URM")

        # The "all" setting reads the 10-interaction files and merges
        # train and test further below
        file_suffix = 10 if train_interactions == "all" else train_interactions

        URM_train_builder = self._load_data_file(
            original_data_path + "cf-train-{}-users.dat".format(file_suffix))
        URM_test_builder = self._load_data_file(
            original_data_path + "cf-test-{}-users.dat".format(file_suffix))

        self.URM_test = URM_test_builder.get_SparseMatrix()
        self.URM_train = URM_train_builder.get_SparseMatrix()

        if dataset_variant == "a":
            self.ICM_title_abstract = scipy.io.loadmat(original_data_path + "mult_nor.mat")['X']
        else:
            # Variant "t" uses a different file format and is transposed
            self.ICM_title_abstract = h5py.File(original_data_path + "mult_nor.mat").get('X')
            self.ICM_title_abstract = sps.csr_matrix(self.ICM_title_abstract).T

        self.ICM_title_abstract = sps.csr_matrix(self.ICM_title_abstract)

        # Pad train and test to a common shape, with at least as many columns
        # as there are items described by the ICM
        target_shape = (max(self.URM_test.shape[0], self.URM_train.shape[0]),
                        max(self.URM_test.shape[1], self.URM_train.shape[1],
                            self.ICM_title_abstract.shape[0]))

        self.URM_test = reshapeSparse(self.URM_test, target_shape)
        self.URM_train = reshapeSparse(self.URM_train, target_shape)

        if train_interactions == "all":
            # Merge all interactions back together, then re-split into
            # train/test and train/validation with random holdouts
            self.URM_train += self.URM_test

            self.URM_train, self.URM_test = split_train_validation_percentage_random_holdout(
                self.URM_train, train_percentage=0.8)
            self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                self.URM_train, train_percentage=0.8)
        else:
            self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                self.URM_train, train_percentage=0.8)

        data_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
            "ICM_title_abstract": self.ICM_title_abstract
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

    print("CiteulikeReader: loading complete")