def split(self, dataset):
    """Build ``n_folds`` (train, test) pairs by partitioning the items.

    Every item (URM column) is randomly assigned to exactly one fold. Fold
    ``i`` keeps only the interactions of the items assigned to it, so the
    items of a test fold never appear in the corresponding train set, which
    is the sum of all the other folds.

    Test interactions whose value in the default URM is
    ``<= self.test_rating_threshold`` are dropped from every test URM.
    When ``self.allow_cold_users`` is false, users with no train
    interactions are removed from both train and test.

    :param dataset: the Dataset to split.
    :return: list of ``n_folds`` tuples ``(train, test)`` of Dataset objects.
    """
    super(ColdItemsKFold, self).split(dataset)

    def build_dataset(source, urm_dict, urm_mappers_dict):
        # New Dataset carrying the given URMs and every other attribute of `source`.
        return Dataset(source.get_name(),
                       base_folder=source.get_base_folder(),
                       postprocessings=source.get_postprocessings(),
                       URM_dict=urm_dict,
                       URM_mappers_dict=urm_mappers_dict,
                       ICM_dict=source.get_ICM_dict(),
                       ICM_mappers_dict=source.get_ICM_mappers_dict(),
                       UCM_dict=source.get_UCM_dict(),
                       UCM_mappers_dict=source.get_UCM_mappers_dict())

    # One fold index per item; the same assignment is used for every URM.
    split_belonging = np.random.choice(self.n_folds, dataset.n_items, replace=True)

    folds = []
    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        items_outside_fold = split_belonging != i
        for URM_name in dataset.get_URM_names():
            URM = dataset.get_URM(URM_name).tocsc(copy=True)
            # In CSC layout np.diff(indptr) is the number of stored entries
            # of each column, so repeating the per-item flag yields a
            # per-entry flag: all entries of foreign items are dropped at once.
            entries_to_drop = np.repeat(items_outside_fold, np.diff(URM.indptr))
            URM.data[entries_to_drop] = 0.0
            URM.eliminate_zeros()
            urm[URM_name] = URM.tocsr()
            urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)
        folds.append(build_dataset(dataset, urm, urm_mappers))

    r = []
    for i in range(self.n_folds):
        # Keep i-th fold as test and merge the others as train
        first_train_fold = folds[(i + 1) % self.n_folds]
        urm = {}
        urm_mappers = {}
        for URM_name in folds[i].get_URM_names():
            # Copy so the train matrix never aliases a fold's own matrix
            # (with n_folds == 2 nothing is added, so no new matrix would
            # otherwise be created).
            merged = first_train_fold.get_URM(URM_name).copy()
            for j in range(2, self.n_folds):
                merged = merged + folds[(i + j) % self.n_folds].get_URM(URM_name)
            urm[URM_name] = merged
            urm_mappers[URM_name] = first_train_fold.get_URM_mapper(URM_name)
        train = build_dataset(folds[i], urm, urm_mappers)

        # The threshold mask is computed on the default URM with sorted
        # indices and applied positionally to every URM, which therefore
        # must share its sparsity pattern.
        # NOTE(review): get_URM may return the stored matrix itself; working
        # on copies keeps fold i intact, so train sets built in later
        # iterations still contain its low ratings.
        reference_urm = folds[i].get_URM().copy()
        reference_urm.sort_indices()
        below_threshold = reference_urm.data <= self.test_rating_threshold

        urm = {}
        for URM_name in folds[i].get_URM_names():
            test_matrix = folds[i].get_URM(URM_name).copy()
            test_matrix.sort_indices()
            test_matrix.data[below_threshold] = 0.0
            test_matrix.eliminate_zeros()
            urm[URM_name] = test_matrix
        test = build_dataset(folds[i], urm, folds[i].get_URM_mappers_dict())

        if not self.allow_cold_users:
            # Consecutive indptr differences of a CSR matrix are the
            # per-user interaction counts.
            interactions_per_user = np.ediff1d(train.get_URM().tocsr().indptr)
            users_to_remove = np.arange(train.n_users)[interactions_per_user <= 0]
            train.remove_users(users_to_remove)
            test.remove_users(users_to_remove)

        r.append((train, test))

    return r
def split(self, dataset):
    """Build ``n_folds`` (train, test) pairs by partitioning the interactions.

    Every stored interaction of the default URM is randomly assigned to
    exactly one fold; the same positional assignment is applied to every
    URM of the dataset (entries ordered by row, then column), so all URMs
    are expected to share the same sparsity pattern. For each fold, the
    fold itself is the test set and the sum of the other folds is the
    train set.

    If ``self.percentage_initial_data_to_split < 1.0``, only the first
    element returned by a Holdout split of ``dataset`` is k-folded.

    Test interactions whose value in the default URM is
    ``<= self.test_rating_threshold`` are dropped from every test URM.
    When ``self.allow_cold_users`` is false, users with no train
    interactions are removed from both train and test.

    :param dataset: the Dataset to split.
    :return: list of ``n_folds`` tuples ``(train, test)`` of Dataset objects.
    """
    super(WarmItemsKFold, self).split(dataset)

    def build_dataset(source, urm_dict, urm_mappers_dict):
        # New Dataset carrying the given URMs and every other attribute of `source`.
        return Dataset(source.get_name(),
                       base_folder=source.get_base_folder(),
                       postprocessings=source.get_postprocessings(),
                       URM_dict=urm_dict,
                       URM_mappers_dict=urm_mappers_dict,
                       ICM_dict=source.get_ICM_dict(),
                       ICM_mappers_dict=source.get_ICM_mappers_dict(),
                       UCM_dict=source.get_UCM_dict(),
                       UCM_mappers_dict=source.get_UCM_mappers_dict())

    # I can do the kfold of a slice of the initial URM!
    if self.percentage_initial_data_to_split < 1.0:
        h = Holdout(train_perc=self.percentage_initial_data_to_split,
                    test_perc=1 - self.percentage_initial_data_to_split)
        dataset = h.split(dataset)[0]

    # One fold index per stored interaction of the default URM.
    n_interactions = dataset.get_URM().tocoo().data.size
    split_belonging = np.random.choice(self.n_folds, n_interactions, replace=True)

    # Sort nnz values by row and column indices, in order to remain
    # consistent in the splits of different URMs. The ordering does not
    # depend on the fold, so it is computed once per URM. np.lexsort is
    # stable and uses its last key as the primary one.
    sorted_entries = {}
    for URM_name in dataset.get_URM_names():
        coo = dataset.get_URM(URM_name).tocoo()
        order = np.lexsort((coo.col, coo.row))
        sorted_entries[URM_name] = (coo.row[order], coo.col[order], coo.data[order], coo.shape)

    folds = []
    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        in_fold = split_belonging == i
        for URM_name, (row, col, data, shape) in sorted_entries.items():
            urm[URM_name] = sps.csr_matrix((data[in_fold], (row[in_fold], col[in_fold])),
                                           shape=shape)
            urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)
        folds.append(build_dataset(dataset, urm, urm_mappers))

    r = []
    for i in range(self.n_folds):
        # Keep i-th fold as test and merge the others as train
        first_train_fold = folds[(i + 1) % self.n_folds]
        urm = {}
        urm_mappers = {}
        for URM_name in folds[i].get_URM_names():
            # Copy so the train matrix never aliases a fold's own matrix
            # (with n_folds == 2 nothing is added, so no new matrix would
            # otherwise be created).
            merged = first_train_fold.get_URM(URM_name).copy()
            for j in range(2, self.n_folds):
                merged = merged + folds[(i + j) % self.n_folds].get_URM(URM_name)
            urm[URM_name] = merged
            urm_mappers[URM_name] = first_train_fold.get_URM_mapper(URM_name)
        train = build_dataset(folds[i], urm, urm_mappers)

        # The threshold mask is computed on the default URM with sorted
        # indices and applied positionally to every URM of the fold.
        # NOTE(review): get_URM may return the stored matrix itself; working
        # on copies keeps fold i intact, so train sets built in later
        # iterations still contain its low ratings.
        reference_urm = folds[i].get_URM().copy()
        reference_urm.sort_indices()
        below_threshold = reference_urm.data <= self.test_rating_threshold

        urm = {}
        for URM_name in folds[i].get_URM_names():
            test_matrix = folds[i].get_URM(URM_name).copy()
            test_matrix.sort_indices()
            test_matrix.data[below_threshold] = 0.0
            test_matrix.eliminate_zeros()
            urm[URM_name] = test_matrix
        test = build_dataset(folds[i], urm, folds[i].get_URM_mappers_dict())

        if not self.allow_cold_users:
            # Consecutive indptr differences of a CSR matrix are the
            # per-user interaction counts.
            interactions_per_user = np.ediff1d(train.get_URM().tocsr().indptr)
            users_to_remove = np.arange(train.n_users)[interactions_per_user <= 0]
            train.remove_users(users_to_remove)
            test.remove_users(users_to_remove)

        r.append((train, test))

    return r