def verify_data_consistency(self):
    """
    Run the sanity checks on the data held by this reader.

    The checks are performed in this order:
      - the reader is initialized (delegated to _assert_is_initialized);
      - the matrix returned by get_URM_all() stores at least one interaction;
      - when is_implicit() is true, every stored value of that matrix equals 1.0;
      - URMs, ICMs, UCMs and the ID mappers agree with each other (delegated to
        assert_URM_ICM_mapper_consistency).

    :raises AssertionError: if the interaction-count or the implicit-data check fails
    """
    self._assert_is_initialized()

    preamble = "{} consistency check: ".format(self.DATASET_NAME)

    URM_all = self.get_URM_all()
    assert URM_all.nnz != 0, preamble + "Number of interactions in URM is 0"

    if self.is_implicit():
        # Only the explicitly stored entries are inspected
        only_unit_values = np.all(URM_all.data == 1.0)
        assert only_unit_values, preamble + "The DataReader is stated to be implicit but the main URM is not"

    # The dataset name is forwarded under the helper's DATA_SPLITTER_NAME keyword
    consistency_arguments = {
        "URM_DICT": self.AVAILABLE_URM,
        "user_original_ID_to_index": self.user_original_ID_to_index,
        "item_original_ID_to_index": self.item_original_ID_to_index,
        "ICM_DICT": self.AVAILABLE_ICM,
        "ICM_MAPPER_DICT": self.AVAILABLE_ICM_feature_mapper,
        "UCM_DICT": self.AVAILABLE_UCM,
        "UCM_MAPPER_DICT": self.AVAILABLE_UCM_feature_mapper,
        "DATA_SPLITTER_NAME": self.DATASET_NAME,
    }
    assert_URM_ICM_mapper_consistency(**consistency_arguments)
def _verify_data_consistency(self):
    """
    Run the sanity checks on the data loaded by this reader.

    The checks are performed in this order:
      - the reader is initialized (delegated to _assert_is_initialized);
      - the matrix returned by get_URM_all() stores at least one interaction;
      - every name returned by get_loaded_ICM_names() is a key of _LOADED_ICM_DICT;
      - every key of _LOADED_ICM_DICT is among the names returned by get_loaded_ICM_names();
      - URMs, ICMs and mappers agree with each other (delegated to
        assert_URM_ICM_mapper_consistency).

    :raises AssertionError: if the interaction-count or one of the ICM-name checks fails
    """
    self._assert_is_initialized()

    preamble = "{} consistency check: ".format(self._get_dataset_name())

    URM_all = self.get_URM_all()
    assert URM_all.nnz != 0, preamble + "Number of interactions in URM is 0"

    expected_ICM_names = list(self.get_loaded_ICM_names())

    # Names that were declared but never ended up in the loaded dictionary
    not_loaded = [ICM_name for ICM_name in expected_ICM_names
                  if ICM_name not in self._LOADED_ICM_DICT]
    assert not not_loaded, \
        preamble + "The DataReader has not loaded all the ICMs it was supposed to load."

    # Loaded matrices that were never declared
    not_expected = [ICM_name for ICM_name in self._LOADED_ICM_DICT
                    if ICM_name not in expected_ICM_names]
    assert not not_expected, \
        preamble + "The DataReader has loaded an ICM which was not supposed to load"

    assert_URM_ICM_mapper_consistency(
        URM_DICT=self.get_loaded_URM_dict(),
        GLOBAL_MAPPER_DICT=self.get_loaded_Global_mappers(),
        ICM_DICT=self.get_loaded_ICM_dict(),
        ICM_MAPPER_DICT=self._LOADED_ICM_MAPPER_DICT,
        DATA_SPLITTER_NAME=self._get_dataset_name(),
    )
def _verify_data_consistency(self):
    """
    Run the sanity checks on the split stored in SPLIT_URM_DICT.

    The checks are performed in this order:
      - the splitter is initialized (delegated to _assert_is_initialized);
      - SPLIT_URM_DICT holds exactly "URM_train", "URM_test" and, when
        use_validation_set is true, "URM_validation";
      - all the URMs share one shape, with a non-zero number of rows and columns;
      - URM_train and URM_test store at least one interaction;
      - every row of URM_test holds exactly k_out_value interactions, and the same
        applies to URM_validation when use_validation_set is true;
      - unless allow_cold_users is true, every row of URM_train holds at least one interaction;
      - the URMs are disjoint (delegated to assert_disjoint_matrices);
      - URMs, ICMs and mappers agree with each other (delegated to
        assert_URM_ICM_mapper_consistency).

    :raises AssertionError: if any of the checks performed here fails
    """
    self._assert_is_initialized()

    preamble = "{} consistency check: ".format(self.DATA_SPLITTER_NAME)

    expected_URM_names = ["URM_train", "URM_test"]
    if self.use_validation_set:
        expected_URM_names.append("URM_validation")

    assert len(self.SPLIT_URM_DICT) == len(expected_URM_names), \
        preamble + "The available URM are not as many as they are supposed to be. URMs are {}, expected URMs are {}".format(
            len(self.SPLIT_URM_DICT), len(expected_URM_names))

    for expected_name in expected_URM_names:
        assert expected_name in self.SPLIT_URM_DICT, preamble + "Not all URMs have been created"

    for available_name in self.SPLIT_URM_DICT.keys():
        assert available_name in expected_URM_names, preamble + "The split contains URMs that should not exist"

    # The first URM met fixes the reference shape every other URM must match
    reference_shape = None

    for URM_object in self.SPLIT_URM_DICT.values():

        if reference_shape is None:
            reference_shape = URM_object.shape
            n_users, n_items = reference_shape

            assert n_users != 0, preamble + "Number of users in URM is 0"
            assert n_items != 0, preamble + "Number of items in URM is 0"

        assert reference_shape == URM_object.shape, preamble + "URM shape is inconsistent"

    def count_interactions_per_user(URM_name):
        # Differences of consecutive CSR row pointers = stored entries per row
        URM = self.SPLIT_URM_DICT[URM_name].copy()
        return np.ediff1d(sps.csr_matrix(URM).indptr)

    assert self.SPLIT_URM_DICT["URM_train"].nnz != 0, preamble + "Number of interactions in URM Train is 0"
    assert self.SPLIT_URM_DICT["URM_test"].nnz != 0, preamble + "Number of interactions in URM Test is 0"

    user_interactions = count_interactions_per_user("URM_test")

    assert np.all(user_interactions == self.k_out_value), \
        preamble + "Not all users have the desired number of interactions in URM_test, {} users out of {}".format(
            (user_interactions != self.k_out_value).sum(), n_users)

    if self.use_validation_set:
        assert self.SPLIT_URM_DICT["URM_validation"].nnz != 0, preamble + "Number of interactions in URM Validation is 0"

        user_interactions = count_interactions_per_user("URM_validation")

        assert np.all(user_interactions == self.k_out_value), \
            preamble + "Not all users have the desired number of interactions in URM_validation, {} users out of {}".format(
                (user_interactions != self.k_out_value).sum(), n_users)

    user_interactions = count_interactions_per_user("URM_train")

    if not self.allow_cold_users:
        assert np.all(user_interactions != 0), \
            preamble + "Cold users exist despite not being allowed as per DataSplitter parameters, {} users out of {}".format(
                (user_interactions == 0).sum(), n_users)

    assert assert_disjoint_matrices(list(self.SPLIT_URM_DICT.values()))

    assert_URM_ICM_mapper_consistency(
        URM_DICT=self.SPLIT_URM_DICT,
        GLOBAL_MAPPER_DICT=self.SPLIT_GLOBAL_MAPPER_DICT,
        ICM_DICT=self.SPLIT_ICM_DICT,
        ICM_MAPPER_DICT=self.SPLIT_ICM_MAPPER_DICT,
        DATA_SPLITTER_NAME=self.DATA_SPLITTER_NAME,
    )
def _verify_data_consistency(self):
    """
    Run the sanity checks on the split stored in SPLIT_URM_DICT.

    The checks are performed in this order:
      - the splitter is initialized (delegated to _assert_is_initialized);
      - SPLIT_URM_DICT holds exactly the URMs listed in _SPLIT_URM_NAME_LIST;
      - all the URMs share one shape, with a non-zero number of rows and columns;
      - URM_train and URM_test store at least one interaction;
      - URM_validation is empty if and only if the quota at position 1 of
        input_split_interaction_quota_list is 0;
      - a warning is printed (nothing is raised) for every URM whose actual
        interaction quota deviates from the input quota by more than 20 %;
      - unless allow_cold_users is true, every row of URM_train holds at least one interaction;
      - the URMs are disjoint (delegated to assert_disjoint_matrices);
      - URMs, ICMs, UCMs and mappers agree with each other (delegated to
        assert_URM_ICM_mapper_consistency).

    :raises AssertionError: if any of the checks performed here fails
    """
    self._assert_is_initialized()

    print_preamble = "{} consistency check: ".format(self.DATA_SPLITTER_NAME)

    assert len(self.SPLIT_URM_DICT) == len(self._SPLIT_URM_NAME_LIST), \
        print_preamble + "The available URM are not as many as they are supposed to be. URMs are {}, expected URMs are {}".format(
            len(self.SPLIT_URM_DICT), len(self._SPLIT_URM_NAME_LIST))

    assert all(URM_name in self.SPLIT_URM_DICT for URM_name in self._SPLIT_URM_NAME_LIST), \
        print_preamble + "Not all URMs have been created"
    assert all(URM_name in self._SPLIT_URM_NAME_LIST for URM_name in self.SPLIT_URM_DICT), \
        print_preamble + "The split contains URMs that should not exist"

    # The first URM met fixes the reference shape every other URM must match
    URM_shape = None

    for URM_object in self.SPLIT_URM_DICT.values():

        if URM_shape is None:
            URM_shape = URM_object.shape
            n_users, n_items = URM_shape

            assert n_users != 0, print_preamble + "Number of users in URM is 0"
            assert n_items != 0, print_preamble + "Number of items in URM is 0"

        assert URM_shape == URM_object.shape, print_preamble + "URM shape is inconsistent"

    assert self.SPLIT_URM_DICT["URM_train"].nnz != 0, print_preamble + "Number of interactions in URM Train is 0"
    assert self.SPLIT_URM_DICT["URM_test"].nnz != 0, print_preamble + "Number of interactions in URM Test is 0"

    # URM_validation must be empty exactly when its input quota is zero.
    # With a user-wise split a very small validation quota may select no item at all;
    # that case is deliberately reported here, because the validation data cannot be built.
    validation_nnz = self.SPLIT_URM_DICT["URM_validation"].nnz
    validation_quota = self.input_split_interaction_quota_list[1]

    assert (validation_nnz == 0 and validation_quota == 0.0) \
        or (validation_nnz != 0 and validation_quota > 0.0), \
        print_preamble + "URM_validation must be empty if and only if its input interaction quota is 0. Interactions are {}, input quota is {}".format(
            validation_nnz, validation_quota)

    # Relative tolerance on the quotas; exceeding it only produces a warning
    quota_oscillation_allowed = 0.2

    for URM_index, URM_name in enumerate(self._SPLIT_URM_NAME_LIST):

        input_quota = self.input_split_interaction_quota_list[URM_index]
        actual_quota = self.actual_split_interaction_quota_list[URM_index]

        max_value_allowed = input_quota * (1 + quota_oscillation_allowed)
        min_value_allowed = input_quota * (1 - quota_oscillation_allowed)

        if actual_quota < min_value_allowed or actual_quota > max_value_allowed:
            print(print_preamble + "The difference between the input interaction quota '{}' and actual interaction quota '{}' of '{}' is higher than {} %".format(
                input_quota, actual_quota, URM_name, quota_oscillation_allowed * 100))

    # Differences of consecutive CSR row pointers = stored entries per row.
    # The matrix is only read, so no defensive copy is needed.
    URM_train = self.SPLIT_URM_DICT["URM_train"]
    user_interactions = np.ediff1d(sps.csr_matrix(URM_train).indptr)

    if not self.allow_cold_users:
        assert np.all(user_interactions != 0), \
            print_preamble + "Cold users exist despite not being allowed as per DataSplitter parameters, {} users out of {}".format(
                (user_interactions == 0).sum(), n_users)

    assert assert_disjoint_matrices(list(self.SPLIT_URM_DICT.values()))

    assert_URM_ICM_mapper_consistency(
        URM_DICT=self.SPLIT_URM_DICT,
        user_original_ID_to_index=self.SPLIT_GLOBAL_MAPPER_DICT["user_original_ID_to_index"],
        item_original_ID_to_index=self.SPLIT_GLOBAL_MAPPER_DICT["item_original_ID_to_index"],
        ICM_DICT=self.SPLIT_ICM_DICT,
        ICM_MAPPER_DICT=self.SPLIT_ICM_MAPPER_DICT,
        UCM_DICT=self.SPLIT_UCM_DICT,
        UCM_MAPPER_DICT=self.SPLIT_UCM_MAPPER_DICT,
        DATA_SPLITTER_NAME=self.DATA_SPLITTER_NAME,
    )