Пример #1
0
    def test_add_random_negative_ratings(self):
        """Check the rows that ``add_random_negative_ratings`` adds.

        The rows located after the length of the base dataframe are taken to
        be the added ones. They must be pairwise-distinct (user, item) pairs
        and none of them may have a positive entry in ``URM_train``.
        """
        n_users = 5000
        user_id_array = np.arange(n_users)

        df = get_boosting_base_dataframe(user_id_array,
                                         self.main_rec,
                                         cutoff=self.cutoff)
        label_array, _, _ = get_label_array(df, self.URM_train)
        df['label'] = label_array

        new_df = add_random_negative_ratings(data_frame=df,
                                             URM_train=self.URM_train,
                                             proportion=1)

        # Everything positioned after the original rows is a new sample.
        added_rows = new_df.iloc[len(df):]
        user_ids = added_rows['user_id'].values
        item_ids = added_rows['item_id'].values

        # Compare (user, item) pairs row-wise instead of bit-packing them into
        # one integer: this needs no shift width derived from n_users and
        # avoids mixing signed ids with an unsigned shift amount, which has
        # no common integer dtype under NumPy 2 promotion rules.
        pairs = np.column_stack((user_ids, item_ids))
        assert np.unique(pairs, axis=0).shape[0] == pairs.shape[0]

        # Sparse fancy indexing may return a 2-D result; flatten it to 1-D.
        labels = np.asarray(self.URM_train[user_ids, item_ids]).ravel()
        assert not np.any(labels > 0)
Пример #2
0
    def test_add_UCM_information(self):
        """Check the age and region columns added by ``add_UCM_information``.

        For every row of the enriched dataframe:

        * a user whose demographic age is ``-1`` must have
          ``age_imputed_flag == 1`` and ``age == 5``; any other user must have
          the flag at 0 and ``age`` equal to the demographic value;
        * each ``region_<id>`` column must be 1 exactly when that region is
          stored for the user in ``UCM_region``, and 0 otherwise.
        """
        n_users = 5000
        user_id_array = np.arange(n_users)

        df = get_boosting_base_dataframe(user_id_array,
                                         self.main_rec,
                                         cutoff=self.cutoff)
        label_array, _, _ = get_label_array(df, self.URM_train)
        df['label'] = label_array

        new_df = add_UCM_information(
            df, self.data_reader.get_original_user_id_to_index_mapper(),
            self.path)

        UCM_age = self.data_reader.get_UCM_from_name("UCM_age")
        age_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
            "UCM_age")
        age_demographic = get_user_demographic(UCM_age, age_mapper)

        UCM_region = self.data_reader.get_UCM_from_name("UCM_region")
        region_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
            "UCM_region")
        # Invert the mapper: feature index -> original region id (as int).
        id_to_original_region_mapper = {
            v: int(k)
            for k, v in region_mapper.items()
        }

        # Extract every column once; per-row ``Series.iloc`` lookups inside
        # the loop are far slower than indexing plain arrays.
        users = new_df['user_id'].values
        ages = new_df['age'].values
        age_imputed_flags = new_df['age_imputed_flag'].values
        region_columns = {
            region: new_df["region_{}".format(region)].values
            for region in id_to_original_region_mapper.values()
        }

        for i, user in enumerate(users):
            # Test age
            if age_demographic[user] == -1:
                assert age_imputed_flags[i] == 1
                assert ages[i] == 5  # Imputed value (mode + 1)
            else:
                assert age_imputed_flags[i] == 0
                assert ages[i] == age_demographic[user]

            # Test region: the feature indices stored in the user's row of
            # UCM_region, translated back to original region ids.
            row_start = UCM_region.indptr[user]
            row_end = UCM_region.indptr[user + 1]
            true_regions = {
                id_to_original_region_mapper[true_region]
                for true_region in UCM_region.indices[row_start:row_end]
            }
            for region, column in region_columns.items():
                expected = 1 if region in true_regions else 0
                assert column[i] == expected, "User {} has not correct region {}".format(
                    user, region)
Пример #3
0
    def test_get_label_array(self):
        """Labels from ``get_label_array`` must equal the ``URM_train``
        entries read at the dataframe's (user_id, item_id) pairs."""
        user_count = 5000
        all_users = np.arange(user_count)

        base_df = get_boosting_base_dataframe(all_users,
                                              self.main_rec,
                                              cutoff=self.cutoff)
        computed_labels, _, _ = get_label_array(base_df, self.URM_train)

        # Read the same (user, item) cells straight from the matrix.
        row_ids = base_df['user_id'].values
        col_ids = base_df['item_id'].values
        urm_cells = self.URM_train[row_ids, col_ids]
        expected_labels = np.array(urm_cells.tolist()).flatten()

        assert np.array_equal(expected_labels, computed_labels)
Пример #4
0
    def test_add_ICM_information(self):
        """Smoke test: run ``add_ICM_information`` on a labelled base
        dataframe and print the result. Nothing is asserted."""
        user_count = 5000
        all_users = np.arange(user_count)

        base_df = get_boosting_base_dataframe(all_users,
                                              self.main_rec,
                                              cutoff=self.cutoff)
        labels, _, _ = get_label_array(base_df, self.URM_train)
        base_df['label'] = labels

        enriched_df = add_ICM_information(base_df, self.path)

        # Output is only inspected by eye.
        print(enriched_df)
Пример #5
0
    def test_advanced_subclass_handling(self):
        """Smoke test: run ``advanced_subclass_handling`` on a labelled base
        dataframe and print the result and its columns. Nothing is asserted."""
        user_count = 5000
        all_users = np.arange(user_count)

        base_df = get_boosting_base_dataframe(all_users,
                                              self.main_rec,
                                              cutoff=self.cutoff)
        labels, _, _ = get_label_array(base_df, self.URM_train)
        base_df['label'] = labels

        handled_df = advanced_subclass_handling(data_frame=base_df,
                                                URM_train=self.URM_train)

        # Output is only inspected by eye.
        print(handled_df)
        print(handled_df.columns)
Пример #6
0
    def test_add_UCM_information_age_onehot(self):
        """Check the one-hot age columns of ``add_UCM_information``.

        With ``use_age_onehot=True``, each ``age_<value>`` column of a row
        must be 1 when that age is stored for the user in ``UCM_age``, or when
        the row is flagged as imputed and the value is 5; otherwise it must
        be 0.
        """
        n_users = 5000
        user_id_array = np.arange(n_users)

        df = get_boosting_base_dataframe(user_id_array,
                                         self.main_rec,
                                         cutoff=self.cutoff)
        label_array, _, _ = get_label_array(df, self.URM_train)
        df['label'] = label_array

        new_df = add_UCM_information(
            df,
            self.data_reader.get_original_user_id_to_index_mapper(),
            self.path,
            use_age_onehot=True)

        UCM_age = self.data_reader.get_UCM_from_name("UCM_age")
        age_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
            "UCM_age")
        # Invert the mapper: feature index -> original age value (as int).
        id_to_original_age_mapper = {v: int(k) for k, v in age_mapper.items()}

        # Extract every column once; per-row ``Series.iloc`` lookups inside
        # the loop are far slower than indexing plain arrays.
        users = new_df['user_id'].values
        age_imputed_flags = new_df['age_imputed_flag'].values
        age_columns = {
            original_age: new_df["age_{}".format(original_age)].values
            for original_age in id_to_original_age_mapper.values()
        }

        for i, user in enumerate(users):
            # Test age: feature indices stored in the user's row of UCM_age,
            # translated back to original age values.
            row_start = UCM_age.indptr[user]
            row_end = UCM_age.indptr[user + 1]
            ages_original = {
                id_to_original_age_mapper[age]
                for age in UCM_age.indices[row_start:row_end]
            }
            # 5 is the age value this test expects on rows flagged as imputed.
            imputed = age_imputed_flags[i] == 1

            for original_age, column in age_columns.items():
                age_in_newdf = column[i]

                if original_age in ages_original or (imputed
                                                     and original_age == 5):
                    assert age_in_newdf == 1, "User {} is missing age {}".format(
                        user, original_age)
                else:
                    assert age_in_newdf == 0, "User {} has incorrect age {}".format(
                        user, original_age)
Пример #7
0
    def test_add_item_popularity(self):
        """Check the ``item_pop`` column added by ``add_item_popularity``.

        Every row's ``item_pop`` must equal the number of entries stored in
        that item's column of ``URM_train``.
        """
        n_users = 5000
        user_id_array = np.arange(n_users)

        df = get_boosting_base_dataframe(user_id_array,
                                         self.main_rec,
                                         cutoff=self.cutoff)
        label_array, _, _ = get_label_array(df, self.URM_train)
        df['label'] = label_array

        newdf = add_item_popularity(df, self.URM_train)
        URM_train_csc = self.URM_train.tocsc()

        # In CSC form the entries of column j live in
        # indices[indptr[j]:indptr[j + 1]], so consecutive indptr differences
        # are the stored-entry counts per item. Computing them once replaces
        # a per-row slice whose only use was its length.
        stored_per_item = np.diff(URM_train_csc.indptr)

        items = newdf['item_id'].values
        item_pop = newdf['item_pop'].values
        true_item_pop = stored_per_item[items]

        mismatches = np.flatnonzero(item_pop != true_item_pop)
        assert mismatches.size == 0, (
            "{} rows have a wrong item_pop; first at row {}: item {} "
            "has {} but expected {}".format(
                mismatches.size, mismatches[0], items[mismatches[0]],
                item_pop[mismatches[0]], true_item_pop[mismatches[0]]))
Пример #8
0
    def test_add_user_len_information(self):
        """Check the ``user_act`` column added by ``add_user_len_information``.

        Every row's ``user_act`` must equal the number of entries stored in
        that user's row of ``URM_train``.
        """
        n_users = 5000
        user_id_array = np.arange(n_users)

        df = get_boosting_base_dataframe(user_id_array,
                                         self.main_rec,
                                         cutoff=self.cutoff)
        label_array, _, _ = get_label_array(df, self.URM_train)
        df['label'] = label_array

        newdf = add_user_len_information(df, self.URM_train)

        # URM_train is indexed by user through indptr, as in the original
        # check (presumably CSR - confirm against the fixture). Consecutive
        # indptr differences give the stored-entry count per user, so the
        # whole column can be compared at once.
        stored_per_user = np.diff(self.URM_train.indptr)

        users = newdf['user_id'].values
        user_profile_len = newdf['user_act'].values
        true_user_profile_len = stored_per_user[users]

        mismatches = np.flatnonzero(user_profile_len != true_user_profile_len)
        assert mismatches.size == 0, (
            "{} rows have a wrong user_act; first at row {}: user {} "
            "has {} but expected {}".format(
                mismatches.size, mismatches[0], users[mismatches[0]],
                user_profile_len[mismatches[0]],
                true_user_profile_len[mismatches[0]]))
Пример #9
0
    def test_add_recommender_predictions(self):
        """Check the score column added by ``add_recommender_predictions``.

        The column named after the recommender must hold, for every row, the
        recommender's item score for that (user, item) pair after min-max
        scaling over all computed scores.
        """
        n_users = 5000
        user_id_array = np.arange(n_users)

        df = get_boosting_base_dataframe(user_id_array,
                                         self.main_rec,
                                         cutoff=self.cutoff)
        label_array, _, _ = get_label_array(df, self.URM_train)
        df['label'] = label_array
        df = add_random_negative_ratings(data_frame=df,
                                         URM_train=self.URM_train,
                                         proportion=1)

        # Need to reorder the dataframe in order for the add_recommender_predictions to work
        df = df.sort_values(by="user_id", ascending=True)
        # drop=True discards the old index instead of adding it as a column
        # that then has to be removed again.
        df = df.reset_index(drop=True)

        column_name = self.main_rec.RECOMMENDER_NAME
        new_df = add_recommender_predictions(data_frame=df,
                                             recommender=self.main_rec,
                                             column_name=column_name)

        # Test that all scores are correct
        all_scores = self.main_rec._compute_item_score(user_id_array)
        # Scale all scores together as one feature, then restore the shape.
        # The shape is passed positionally because the ``newshape`` keyword of
        # ``np.reshape`` is deprecated in recent NumPy releases.
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(all_scores.reshape(-1, 1))
        all_scores = scaled.reshape(all_scores.shape)

        users = new_df['user_id'].values
        items = new_df['item_id'].values
        scores = new_df[column_name].values
        expected_scores = all_scores[users, items]

        # Exact equality is kept on purpose: it is what the original check
        # required of each row.
        mismatches = np.flatnonzero(scores != expected_scores)
        assert mismatches.size == 0, (
            "{} rows have a wrong score; first at row {}: (user {}, item {}) "
            "has {} but expected {}".format(
                mismatches.size, mismatches[0], users[mismatches[0]],
                items[mismatches[0]], scores[mismatches[0]],
                expected_scores[mismatches[0]]))
Пример #10
0
# Build ICMs
ICM_all = get_ICM_train(data_reader)

# Build UCMs: do not change the order of ICMs and UCMs
UCM_all = get_UCM_train(data_reader)

# Reading the dataframe
dataframe_path = "../boosting_dataframe/"
train_df = pd.read_csv(dataframe_path + "train_df_20.csv")
valid_df = pd.read_csv(dataframe_path + "valid_df_20.csv")

train_df = _preprocess_dataframe(train_df)
valid_df = _preprocess_dataframe(valid_df)

# Labels for the training dataframe are read against URM_train.
print("Retrieving training labels...", end="")
y_train, non_zero_count, total = get_label_array(data_frame=train_df,
                                                 URM_train=URM_train)
print("Done")

train_df['label'] = y_train
# -

# Labels for the validation dataframe are read against URM_test, so the
# progress message says "validation" rather than repeating "training".
# NOTE(review): the names below (including the "vaid" spelling) are kept
# unchanged because later cells may reference them.
print("Retrieving validation labels...", end="")
y_train_valid, non_zero_count_vaid, total_valid = get_label_array(
    data_frame=valid_df, URM_train=URM_test)
print("Done")
valid_df['label'] = y_train_valid

# ### Pure scores exploration


def plot_score_distribution(column_name):
Пример #11
0
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    # Reading the dataframe
    dataframe_path = "../../resources/boosting_dataframe/"
    train_df = pd.read_csv(dataframe_path + "train_df_100_advanced_lt_20.csv")
    valid_df = pd.read_csv(dataframe_path + "valid_df_30_advanced_lt_20.csv")

    train_df = preprocess_dataframe_after_reading(train_df)
    y_train = train_df['label'].values + 1

    train_df = train_df.drop(columns=["label"], inplace=False)
    valid_df = preprocess_dataframe_after_reading(valid_df)
    valid_df = valid_df.drop(columns=[], inplace=False)

    _, non_zero_count, total = get_label_array(data_frame=train_df,
                                               URM_train=URM_train)
    y_valid, _, _ = get_label_array(data_frame=valid_df, URM_train=URM_test)

    # Setting evaluator
    mapper = data_reader.get_original_user_id_to_index_mapper()
    ignore_users = get_ignore_users(URM_train,
                                    mapper,
                                    lower_threshold=20,
                                    upper_threshold=2**16 - 1,
                                    ignore_non_target_users=True)
    evaluator = EvaluatorHoldout(URM_test,
                                 cutoff_list=[10],
                                 ignore_users=ignore_users)
    total_users = np.arange(URM_train.shape[0])
    mask = np.in1d(total_users, ignore_users, invert=True)
    users_to_validate = total_users[mask]
Пример #12
0
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    # Reading the dataframe
    dataframe_path = "../../resources/boosting_dataframe/"
    train_df = pd.read_csv(dataframe_path + "train_df_100_advanced_lt_20.csv")
    valid_df = pd.read_csv(dataframe_path + "valid_df_30_advanced_lt_20.csv")

    train_df = preprocess_dataframe_after_reading(train_df)
    y_train = train_df['label'].values + 1
    train_df = train_df.drop(columns=["label"], inplace=False)
    valid_df = preprocess_dataframe_after_reading(valid_df)

    print("Retrieving training labels...", end="")
    _, non_zero_count, total = get_label_array(data_frame=train_df,
                                               URM_train=URM_train)

    print("Done")

    # Setting evaluator
    mapper = data_reader.get_original_user_id_to_index_mapper()
    ignore_users = get_ignore_users(URM_train,
                                    mapper,
                                    lower_threshold=20,
                                    upper_threshold=2**16 - 1,
                                    ignore_non_target_users=True)
    evaluator = EvaluatorHoldout(URM_test,
                                 cutoff_list=[10],
                                 ignore_users=ignore_users)
    total_users = np.arange(URM_train.shape[0])
    mask = np.in1d(total_users, ignore_users, invert=True)