Exemplo n.º 1
0
    def _verify_data_consistency(self) -> None:
        """Assert that the URM split held by this splitter is self-consistent.

        The following conditions are checked, in order:

        * ``SPLIT_URM_DICT`` contains exactly ``URM_train``, ``URM_test`` and,
          when ``use_validation_set`` is true, ``URM_validation``;
        * all URMs share the same shape, with a non-zero number of users
          (rows) and items (columns);
        * ``URM_train`` and every held-out URM store at least one interaction;
        * every row of each held-out URM stores exactly ``k_out_value``
          entries;
        * unless ``allow_cold_users`` is true, every row of ``URM_train``
          stores at least one entry;
        * the checks delegated to ``assert_disjoint_matrices`` and
          ``assert_URM_ICM_mapper_consistency`` pass.

        Raises:
            AssertionError: if any of the conditions above does not hold.

        NOTE: every check is an ``assert`` statement, so none of them runs
        when Python is started with ``-O``.
        """

        self._assert_is_initialized()

        print_preamble = "{} consistency check: ".format(
            self.DATA_SPLITTER_NAME)

        URM_to_load_list = ["URM_train", "URM_test"]

        if self.use_validation_set:
            URM_to_load_list.append("URM_validation")

        assert len(self.SPLIT_URM_DICT) == len(URM_to_load_list), \
            print_preamble + "The available URM are not as many as they are supposed to be. URMs are {}, expected URMs are {}".format(
                len(self.SPLIT_URM_DICT), len(URM_to_load_list))

        assert all(URM_name in self.SPLIT_URM_DICT
                   for URM_name in URM_to_load_list
                   ), print_preamble + "Not all URMs have been created"
        assert all(URM_name in URM_to_load_list
                   for URM_name in self.SPLIT_URM_DICT
                   ), print_preamble + "The split contains URMs that should not exist"

        # The first URM encountered fixes the reference shape, every URM
        # (including the first one) is then compared against it.
        URM_shape = None

        for URM_object in self.SPLIT_URM_DICT.values():

            if URM_shape is None:
                URM_shape = URM_object.shape

                n_users, n_items = URM_shape

                assert n_users != 0, print_preamble + "Number of users in URM is 0"
                assert n_items != 0, print_preamble + "Number of items in URM is 0"

            assert URM_shape == URM_object.shape, print_preamble + "URM shape is inconsistent"

        assert self.SPLIT_URM_DICT["URM_train"].nnz != 0, \
            print_preamble + "Number of interactions in URM Train is 0"

        # Held-out URMs as (dictionary key, label used in the "is 0" message).
        held_out_URM_list = [("URM_test", "URM Test")]

        if self.use_validation_set:
            held_out_URM_list.append(("URM_validation", "URM Validation"))

        for URM_name, URM_label in held_out_URM_list:

            URM = self.SPLIT_URM_DICT[URM_name]

            assert URM.nnz != 0, \
                print_preamble + "Number of interactions in {} is 0".format(URM_label)

            # Differences of consecutive CSR row pointers give the number of
            # stored entries per row. indptr is only read, so no defensive
            # copy of the matrix is needed.
            user_interactions = np.ediff1d(sps.csr_matrix(URM).indptr)

            assert np.all(user_interactions == self.k_out_value), \
                print_preamble + "Not all users have the desired number of interactions in {}, {} users out of {}".format(
                    URM_name, (user_interactions != self.k_out_value).sum(), n_users)

        if not self.allow_cold_users:
            URM = self.SPLIT_URM_DICT["URM_train"]
            user_interactions = np.ediff1d(sps.csr_matrix(URM).indptr)

            assert np.all(user_interactions != 0), \
                print_preamble + "Cold users exist despite not being allowed as per DataSplitter parameters, {} users out of {}".format(
                    (user_interactions == 0).sum(), n_users)

        assert assert_disjoint_matrices(list(self.SPLIT_URM_DICT.values()))

        assert_URM_ICM_mapper_consistency(
            URM_DICT=self.SPLIT_URM_DICT,
            GLOBAL_MAPPER_DICT=self.SPLIT_GLOBAL_MAPPER_DICT,
            ICM_DICT=self.SPLIT_ICM_DICT,
            ICM_MAPPER_DICT=self.SPLIT_ICM_MAPPER_DICT,
            DATA_SPLITTER_NAME=self.DATA_SPLITTER_NAME)
Exemplo n.º 2
0
    def _verify_data_consistency(self) -> None:
        """Assert that the URM split held by this splitter is self-consistent.

        The following conditions are checked, in order:

        * ``SPLIT_URM_DICT`` contains exactly the names listed in
          ``_SPLIT_URM_NAME_LIST``;
        * all URMs share the same shape, with a non-zero number of users
          (rows) and items (columns);
        * ``URM_train`` and ``URM_test`` store at least one interaction;
        * ``URM_validation`` is empty when its input quota is zero and
          non-empty when its input quota is positive;
        * unless ``allow_cold_users`` is true, every row of ``URM_train``
          stores at least one entry;
        * the checks delegated to ``assert_disjoint_matrices`` and
          ``assert_URM_ICM_mapper_consistency`` pass.

        In addition, for every URM the actual interaction quota is compared
        with the requested one; a relative deviation above 20 % is only
        reported with ``print``, it does not raise.

        Raises:
            AssertionError: if any of the asserted conditions does not hold.

        NOTE: every check is an ``assert`` statement, so none of them runs
        when Python is started with ``-O``.
        """

        self._assert_is_initialized()

        print_preamble = "{} consistency check: ".format(
            self.DATA_SPLITTER_NAME)

        assert len(self.SPLIT_URM_DICT) == len(self._SPLIT_URM_NAME_LIST), \
            print_preamble + "The available URM are not as many as they are supposed to be. URMs are {}, expected URMs are {}".format(
                len(self.SPLIT_URM_DICT), len(self._SPLIT_URM_NAME_LIST))

        assert all(URM_name in self.SPLIT_URM_DICT
                   for URM_name in self._SPLIT_URM_NAME_LIST
                   ), print_preamble + "Not all URMs have been created"
        assert all(URM_name in self._SPLIT_URM_NAME_LIST
                   for URM_name in self.SPLIT_URM_DICT
                   ), print_preamble + "The split contains URMs that should not exist"

        # The first URM encountered fixes the reference shape, every URM
        # (including the first one) is then compared against it.
        URM_shape = None

        for URM_object in self.SPLIT_URM_DICT.values():

            if URM_shape is None:
                URM_shape = URM_object.shape

                n_users, n_items = URM_shape

                assert n_users != 0, print_preamble + "Number of users in URM is 0"
                assert n_items != 0, print_preamble + "Number of items in URM is 0"

            assert URM_shape == URM_object.shape, print_preamble + "URM shape is inconsistent"

        assert self.SPLIT_URM_DICT["URM_train"].nnz != 0, \
            print_preamble + "Number of interactions in URM Train is 0"
        assert self.SPLIT_URM_DICT["URM_test"].nnz != 0, \
            print_preamble + "Number of interactions in URM Test is 0"

        # URM_validation must be empty if and only if its input quota is zero.
        # It may create problems on the user-wise split if the validation quota is too small and no items get selected,
        # although in that case it is considered acceptable to be told that the validation data cannot be built.
        validation_nnz = self.SPLIT_URM_DICT["URM_validation"].nnz
        # NOTE(review): index 1 is taken to be the validation quota, as in the original check — confirm against the ordering of _SPLIT_URM_NAME_LIST
        validation_quota = self.input_split_interaction_quota_list[1]

        if validation_quota > 0.0:
            assert validation_nnz != 0, \
                print_preamble + "Number of interactions in Validation is 0"
        else:
            # Only an exactly-zero quota paired with an empty matrix is valid here.
            assert validation_quota == 0.0 and validation_nnz == 0, \
                print_preamble + "Inconsistent validation split: input quota is {} but URM_validation contains {} interactions".format(
                    validation_quota, validation_nnz)

        # Relative tolerance between requested and obtained quota; exceeding it only triggers a printed warning.
        quota_oscillation_allowed = 0.2

        for URM_index, URM_name in enumerate(self._SPLIT_URM_NAME_LIST):

            input_quota = self.input_split_interaction_quota_list[URM_index]
            actual_quota = self.actual_split_interaction_quota_list[URM_index]
            max_value_allowed = input_quota * (1 + quota_oscillation_allowed)
            min_value_allowed = input_quota * (1 - quota_oscillation_allowed)

            if actual_quota < min_value_allowed or actual_quota > max_value_allowed:
                print(
                    print_preamble +
                    "The difference between the input interaction quota '{}' and actual interaction quota '{}' of '{}' is higher than {} %"
                    .format(input_quota, actual_quota, URM_name,
                            quota_oscillation_allowed * 100))

        if not self.allow_cold_users:
            # Differences of consecutive CSR row pointers give the number of
            # stored entries per row. indptr is only read, so no defensive
            # copy of the matrix is needed.
            URM = self.SPLIT_URM_DICT["URM_train"]
            user_interactions = np.ediff1d(sps.csr_matrix(URM).indptr)

            assert np.all(user_interactions != 0), \
                print_preamble + "Cold users exist despite not being allowed as per DataSplitter parameters, {} users out of {}".format(
                    (user_interactions == 0).sum(), n_users)

        assert assert_disjoint_matrices(list(self.SPLIT_URM_DICT.values()))

        assert_URM_ICM_mapper_consistency(
            URM_DICT=self.SPLIT_URM_DICT,
            user_original_ID_to_index=self.
            SPLIT_GLOBAL_MAPPER_DICT["user_original_ID_to_index"],
            item_original_ID_to_index=self.
            SPLIT_GLOBAL_MAPPER_DICT["item_original_ID_to_index"],
            ICM_DICT=self.SPLIT_ICM_DICT,
            ICM_MAPPER_DICT=self.SPLIT_ICM_MAPPER_DICT,
            UCM_DICT=self.SPLIT_UCM_DICT,
            UCM_MAPPER_DICT=self.SPLIT_UCM_MAPPER_DICT,
            DATA_SPLITTER_NAME=self.DATA_SPLITTER_NAME)