def get_ICM_with_fields(reader: New_DataSplitter_leave_k_out):
    """
    It returns ICM_train_all after applying feature engineering, together with the field index of each feature column.

    :param reader: data splitter
    :return: ICM_train_all and the item feature fields array
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_all_dict = reader.get_loaded_ICM_dict()
    ICM_all_dict.pop("ICM_all")
    ICM_all_dict = apply_feature_engineering_ICM(ICM_all_dict,
                                                 URM_train,
                                                 UCM_all_dict,
                                                 ICM_names_to_count=[],
                                                 UCM_names_to_list=[])
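    # Filter out upper outliers in the numerical ICMs: keep only values below
    # Q3 + k * IQR (a Tukey-style upper fence, with k = 0.72 for asset and k = 4 for price)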
    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset":
            lambda x: x < np.quantile(x, q=0.75) + 0.72 *
            (np.quantile(x, q=0.75) - np.quantile(x, q=0.25)),
            "ICM_price":
            lambda x: x < np.quantile(x, q=0.75) + 4 *
            (np.quantile(x, q=0.75) - np.quantile(x, q=0.25))
        })
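    # Transform the skewed numerical ICMs: asset and price are inverted and log-scaled,
    # item popularity is log-scaled; the results are then discretized into bins below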
    ICM_all_dict = apply_transformation_ICM(ICM_all_dict,
                                            ICM_name_to_transform_mapper={
                                                "ICM_asset":
                                                lambda x: np.log1p(1 / x),
                                                "ICM_price":
                                                lambda x: np.log1p(1 / x),
                                                "ICM_item_pop":
                                                np.log1p
                                            })
    ICM_all_dict = apply_discretization_ICM(ICM_all_dict,
                                            ICM_name_to_bins_mapper={
                                                "ICM_asset": 200,
                                                "ICM_price": 200,
                                                "ICM_item_pop": 50
                                            })

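    # Stack all the ICMs horizontally and record, for every resulting column, the index
    # of the ICM it comes from (these field ids can be fed to field-aware models, e.g. FFM)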
    ICM_all = None
    item_feature_fields = None
    for idx, ICM_key_value in enumerate(ICM_all_dict.items()):
        ICM_name, ICM_object = ICM_key_value
        if idx == 0:
            ICM_all = ICM_object
            item_feature_fields = np.full(shape=ICM_object.shape[1],
                                          fill_value=idx)
        else:
            ICM_all = sps.hstack([ICM_all, ICM_object], format="csr")
            item_feature_fields = np.concatenate([
                item_feature_fields,
                np.full(shape=ICM_object.shape[1], fill_value=idx)
            ])
    return ICM_all, item_feature_fields
Example #2
def read_split_load_data(k_out, allow_cold_users, seed):
    root_data_path = os.path.join(get_project_root_path(), "data/")
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(
        data_reader,
        k_out_value=k_out,
        use_validation_set=False,
        allow_cold_users=allow_cold_users,
        force_new_split=True,
        seed=seed)
    data_reader.load_data()
    return data_reader
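

# A minimal usage sketch (not from the original source): wire the two helpers above together.
# The k_out / allow_cold_users / seed values below are hypothetical.
if __name__ == '__main__':
    reader = read_split_load_data(k_out=1, allow_cold_users=False, seed=69420)
    ICM_all, item_feature_fields = get_ICM_with_fields(reader)
    print("ICM_all shape:", ICM_all.shape)
    print("Number of item feature fields:", item_feature_fields.max() + 1)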
Example #3
def get_ICM_train(reader: New_DataSplitter_leave_k_out):
    """
    It returns ICM_train_all after applying feature engineering. This is the preprocessing
    used in the new_best_models file.

    :param reader: data splitter
    :return: ICM_train_all
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_all_dict = reader.get_loaded_ICM_dict()
    ICM_all_dict.pop("ICM_all")
    ICM_all_dict = apply_feature_engineering_ICM(
        ICM_all_dict,
        URM_train,
        UCM_all_dict,
        ICM_names_to_count=["ICM_sub_class"],
        UCM_names_to_list=["UCM_age"])
    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset":
            lambda x: x < np.quantile(x, q=0.75) + 0.72 *
            (np.quantile(x, q=0.75) - np.quantile(x, q=0.25)),
            "ICM_price":
            lambda x: x < np.quantile(x, q=0.75) + 4 *
            (np.quantile(x, q=0.75) - np.quantile(x, q=0.25))
        })
    ICM_all_dict = apply_transformation_ICM(ICM_all_dict,
                                            ICM_name_to_transform_mapper={
                                                "ICM_asset":
                                                lambda x: np.log1p(1 / x),
                                                "ICM_price":
                                                lambda x: np.log1p(1 / x),
                                                "ICM_item_pop":
                                                np.log1p,
                                                "ICM_sub_class_count":
                                                np.log1p,
                                                "ICM_age":
                                                lambda x: x**(1 / 2.5)
                                            })
    ICM_all_dict = apply_discretization_ICM(ICM_all_dict,
                                            ICM_name_to_bins_mapper={
                                                "ICM_asset": 200,
                                                "ICM_price": 200,
                                                "ICM_item_pop": 50,
                                                "ICM_sub_class_count": 50
                                            })
    ICM_all = build_ICM_all_from_dict(ICM_all_dict)
    return ICM_all
Example #4
def get_UCM_train_cold(reader: New_DataSplitter_leave_k_out):
    """
    It returns UCM_all after applying feature engineering. This is the preprocessing used in the new_best_models file.

    :param reader: data splitter
    :return: UCM_all
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict, UCM_name_to_transform_mapper={"UCM_user_act": np.log1p})
    UCM_all_dict = apply_discretization_UCM(
        UCM_all_dict, UCM_name_to_bins_mapper={"UCM_user_act": 50})
    UCM_all = build_UCM_all_from_dict(UCM_all_dict)
    return UCM_all
Example #5
def get_ICM_train_new(reader: New_DataSplitter_leave_k_out):
    """
    It returns ICM_train_all after applying feature engineering, together with the column range covered by each ICM.

    :param reader: data splitter
    :return: ICM_train_all and a mapper from each ICM name to its (start, end) column range
    """
    ICM_all_dict = reader.get_loaded_ICM_dict()
    ICM_all_dict.pop("ICM_all")
    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset":
            lambda x: x < np.quantile(x, q=0.75) + 0.72 *
            (np.quantile(x, q=0.75) - np.quantile(x, q=0.25)),
            "ICM_price":
            lambda x: x < np.quantile(x, q=0.75) + 4 *
            (np.quantile(x, q=0.75) - np.quantile(x, q=0.25))
        })
    # Apply useful transformation
    ICM_all_dict = apply_transformation_ICM(ICM_all_dict,
                                            ICM_name_to_transform_mapper={
                                                "ICM_asset":
                                                lambda x: np.log1p(1 / x),
                                                "ICM_price":
                                                lambda x: np.log1p(1 / x),
                                                "ICM_item_pop":
                                                np.log1p
                                            })
    ICM_all_dict = apply_discretization_ICM(ICM_all_dict,
                                            ICM_name_to_bins_mapper={
                                                "ICM_asset": 200,
                                                "ICM_price": 200,
                                                "ICM_item_pop": 50
                                            })
    # Apply feature weighting
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_price": lambda x: x * 1.8474248499810804,
            "ICM_asset": lambda x: x * 1.2232716972721878,
            "ICM_sub_class": lambda x: x * 1.662671860026709,
            "ICM_item_pop": lambda x: x * 0.886528360392298
        })

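    # Stack all the ICMs horizontally and keep, for every ICM name, the (start, end) range
    # of the columns it occupies in the final matrix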
    ICM_all = None
    item_feature_to_range_mapper = {}
    last_range = 0
    for idx, ICM_key_value in enumerate(ICM_all_dict.items()):
        ICM_name, ICM_object = ICM_key_value
        if idx == 0:
            ICM_all = ICM_object
        else:
            ICM_all = sps.hstack([ICM_all, ICM_object], format="csr")
        item_feature_to_range_mapper[ICM_name] = (last_range, last_range +
                                                  ICM_object.shape[1])
        last_range = last_range + ICM_object.shape[1]
    return ICM_all, item_feature_to_range_mapper
Example #6
def main():
    # Data loading
    root_data_path = os.path.join(get_project_root_path(), "data/")
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(
        data_reader,
        k_out_value=K_OUT,
        use_validation_set=False,
        allow_cold_users=ALLOW_COLD_USERS,
        force_new_split=True,
        seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all, _ = get_ICM_train_new(data_reader)
    UCM_all, _ = get_UCM_train_new(data_reader)

    # Ignoring users
    ignore_users = get_ignore_users(
        URM_train,
        data_reader.get_original_user_id_to_index_mapper(),
        lower_threshold=LOWER_THRESHOLD,
        upper_threshold=UPPER_THRESHOLD,
        ignore_non_target_users=IGNORE_NON_TARGET_USERS)
    evaluator = EvaluatorHoldout(URM_test,
                                 cutoff_list=[CUTOFF],
                                 ignore_users=ignore_users)

    # Model evaluation
    model = get_model(URM_train, ICM_all, UCM_all)
    print(evaluator.evaluateRecommender(model))
def get_UCM_with_fields(reader: New_DataSplitter_leave_k_out):
    """
    It returns UCM_all after applying feature engineering, together with the field index of each feature column.

    :param reader: data splitter
    :return: UCM_all and the user feature fields array
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()
    UCM_all_dict = apply_feature_engineering_UCM(
        UCM_all_dict, URM_train, ICM_dict, ICM_names_to_UCM=["ICM_sub_class"])

    # These feature weightings are useful for UserCBF_CF_Warm
    UCM_all_dict = apply_transformation_UCM(UCM_all_dict,
                                            UCM_name_to_transform_mapper={
                                                "UCM_sub_class":
                                                lambda x: x / 2,
                                                "UCM_user_act": np.log1p
                                            })
    UCM_all_dict = apply_discretization_UCM(
        UCM_all_dict, UCM_name_to_bins_mapper={"UCM_user_act": 50})
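    # Stack all the UCMs horizontally, recording for each column the index of the UCM it comes from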
    UCM_all = None
    user_feature_fields = None
    for idx, UCM_key_value in enumerate(UCM_all_dict.items()):
        UCM_name, UCM_object = UCM_key_value
        if idx == 0:
            UCM_all = UCM_object
            user_feature_fields = np.full(shape=UCM_object.shape[1],
                                          fill_value=idx)
        else:
            UCM_all = sps.hstack([UCM_all, UCM_object], format="csr")
            user_feature_fields = np.concatenate([
                user_feature_fields,
                np.full(shape=UCM_object.shape[1], fill_value=idx)
            ])
    return UCM_all, user_feature_fields
Example #8
def get_UCM_train(reader: New_DataSplitter_leave_k_out):
    """
    It returns UCM_all after applying feature engineering. This is the preprocessing used in the new_best_models file.

    :param reader: data splitter
    :return: UCM_all
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()
    UCM_all_dict = apply_feature_engineering_UCM(
        UCM_all_dict, URM_train, ICM_dict, ICM_names_to_UCM=["ICM_sub_class"])

    # These feature weightings are useful for UserCBF_CF_Warm
    UCM_all_dict = apply_transformation_UCM(UCM_all_dict,
                                            UCM_name_to_transform_mapper={
                                                "UCM_sub_class":
                                                lambda x: x / 2,
                                                "UCM_user_act": np.log1p
                                            })
    UCM_all_dict = apply_discretization_UCM(
        UCM_all_dict, UCM_name_to_bins_mapper={"UCM_user_act": 50})
    UCM_all = build_UCM_all_from_dict(UCM_all_dict)
    return UCM_all
Example #9
from datetime import datetime

from course_lib.Base.Evaluation.Evaluator import EvaluatorHoldout
from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.dataframe_preprocesser import get_preprocessed_dataframe
from src.tuning.holdout_validation.run_parameter_search_advanced_top_pop import run_parameter_search_advanced_top_pop
from src.utils.general_utility_functions import get_split_seed

if __name__ == '__main__':
    # Data loading
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    mapper = data_reader.get_original_user_id_to_index_mapper()
    df = get_preprocessed_dataframe("../../data/", keep_warm_only=True)

    # Setting evaluator
    # warm_users_mask = np.ediff1d(URM_train.tocsr().indptr) > 0
    # warm_users = np.arange(URM_train.shape[0])[warm_users_mask]
    # ignore_users = warm_users
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list)

    # HP tuning
    print("Start tuning...")
Example #10
        output_folder_path = version_path + now + "/"
        output_file_name = output_folder_path + "results.txt"
        try:
            if not os.path.exists(output_folder_path):
                os.mkdir(output_folder_path)
        except FileNotFoundError as e:
            os.makedirs(output_folder_path)

        f = open(output_file_name, "w")
    else:
        f = None

    # Data loading
    root_data_path = "../../data/"
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=1, use_validation_set=False,
                                               force_new_split=True, seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all = get_ICM_train(data_reader)
    UCM_all = get_UCM_train(data_reader)

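    # Build the binned age demographic and the sub-class content mappers from the raw UCM/ICM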
    UCM_age = data_reader.get_UCM_from_name("UCM_age")
    age_feature_to_id_mapper = data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name("UCM_age")
    age_demographic = get_user_demographic(UCM_age, age_feature_to_id_mapper, binned=True)

    ICM_subclass = data_reader.get_ICM_from_name("ICM_sub_class")
    subclass_feature_to_id_mapper = data_reader.dataReader_object.get_ICM_feature_to_index_mapper_from_name(
        "ICM_sub_class")
    subclass_content_dict = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=True)
    subclass_content = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=False)
            UCM_all = sps.hstack([UCM_all, UCM_object], format="csr")
            user_feature_fields = np.concatenate([
                user_feature_fields,
                np.full(shape=UCM_object.shape[1], fill_value=idx)
            ])
    return UCM_all, user_feature_fields


if __name__ == '__main__':
    data_reader = RecSys2019Reader("../../data/")
    data_reader = DataPreprocessingRemoveColdUsersItems(data_reader,
                                                        threshold_items=20,
                                                        threshold_users=25)
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all, item_feature_fields = get_ICM_with_fields(data_reader)
    UCM_all, user_feature_fields = get_UCM_with_fields(data_reader)

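    # Build the field array for a field-aware model: field 0 for user ids, field 1 for item ids,
    # then one field per ICM (shifted by 2) and one field per UCM (shifted past the item fields)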
    user_fields = np.full(shape=URM_train.shape[0], fill_value=0)
    item_fields = np.full(shape=URM_train.shape[1], fill_value=1)
    item_feature_fields = item_feature_fields + 2
    user_feature_fields = user_feature_fields + np.max(item_feature_fields) + 1
    fields = np.concatenate(
        [user_fields, item_fields, item_feature_fields, user_feature_fields])

    positive_URM = URM_train
Example #12
from src.data_management.RecSys2019Reader_utils import merge_UCM
from src.data_management.data_getter import get_warmer_UCM
from src.model.HybridRecommender.HybridRankBasedRecommender import HybridRankBasedRecommender
import numpy as np
from numpy.random import seed  # assumption: seed() below comes from numpy.random; the original import is not shown in this excerpt

from src.model.KNN.UserSimilarityRecommender import UserSimilarityRecommender

SEED = 69420

if __name__ == '__main__':
    seed(SEED)

    # Data loading
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True)
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    URM_all = data_reader.dataReader_object.get_URM_all()
    UCM_age = data_reader.dataReader_object.get_UCM_from_name("UCM_age")
    UCM_region = data_reader.dataReader_object.get_UCM_from_name("UCM_region")
    UCM_all, _ = merge_UCM(UCM_age, UCM_region, {}, {})

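    # Restrict the UCM to warmer users (based on URM_all), then merge URM_train into the UCM
    # so that past interactions act as additional user features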
    UCM_all = get_warmer_UCM(UCM_all, URM_all, threshold_users=3)
    UCM_all, _ = merge_UCM(UCM_all, URM_train, {}, {})

    warm_users_mask = np.ediff1d(URM_train.tocsr().indptr) > 0
    warm_users = np.arange(URM_train.shape[0])[warm_users_mask]

    # Reset seed for hyper-parameter tuning
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.model_management.model_result_reader import best_model_reader
from course_lib.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from src.data_management.DataPreprocessing import DataPreprocessingRemoveColdUsersItems
from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out

# +
SEED = 69420
seed(SEED)

# Data loading

dataset = RecSys2019Reader("../data/")
dataset = DataPreprocessingRemoveColdUsersItems(dataset, threshold_users=3)
dataset = New_DataSplitter_leave_k_out(dataset, k_out_value=3, use_validation_set=False, force_new_split=True)
dataset.load_data()

seed() # reset random seeds for other things
# -

URM_train, URM_test = dataset.get_holdout_split()
ICM_all = dataset.get_ICM_from_name('ICM_all')

best_model_list = best_model_reader("../report/hp_tuning/item_cbf/Nov19_11-23-21_k_out_value_3/")

best_model_list

cosine_best_model = ItemKNNCBFRecommender(ICM_all, URM_train)
cosine_best_model.fit(topK=9, shrink=968, similarity='cosine', normalize=True, feature_weighting='TF-IDF')
Example #14
import os

import numpy as np
import xlearn as xl

from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.data_preprocessing_fm import format_URM_positive_non_compressed, \
    uniform_sampling_strategy, format_URM_positive_user_compressed, \
    format_URM_negative_sampling_user_compressed, mix_URM, add_ICM_info
from src.utils.general_utility_functions import get_split_seed, get_project_root_path

if __name__ == '__main__':
    dataset = RecSys2019Reader("../../data/")
    dataset = New_DataSplitter_leave_k_out(dataset, k_out_value=3, use_validation_set=False, force_new_split=True,
                                           seed=get_split_seed())
    dataset.load_data()
    URM_train, URM_test = dataset.get_holdout_split()
    ICM_all = dataset.get_ICM_from_name("ICM_all")

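    # Build the FM training data: positive interactions compressed per user, negatives drawn with
    # uniform sampling at a 1:1 rate, then mixed and enriched with the item features from ICM_all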
    URM_positive_FM_matrix = format_URM_positive_user_compressed(URM_train)
    URM_negative_FM_matrix = format_URM_negative_sampling_user_compressed(URM_train, negative_rate=1,
                                                                          sampling_function=uniform_sampling_strategy,
                                                                          check_replacement=True)
    URM_FM_matrix = mix_URM(URM_positive_FM_matrix, URM_negative_FM_matrix)[:, :-1]
    URM_FM_matrix = add_ICM_info(URM_FM_matrix, ICM_all, URM_train.shape[0])

    root_path = get_project_root_path()
    fm_data_path = os.path.join(root_path, "resources", "fm_data")

    # Prepare train sparse matrix and labels for dumping to file
Example #15
def get_UCM_train_new(reader: New_DataSplitter_leave_k_out):
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()

    # Preprocess ICM
    ICM_dict.pop("ICM_all")
    ICM_dict = apply_feature_engineering_ICM(
        ICM_dict,
        URM_train,
        UCM_all_dict,
        ICM_names_to_count=["ICM_sub_class"],
        UCM_names_to_list=["UCM_age"])
    ICM_dict = apply_filtering_ICM(
        ICM_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset":
            lambda x: x < np.quantile(x, q=0.75) + 0.72 *
            (np.quantile(x, q=0.75) - np.quantile(x, q=0.25)),
            "ICM_price":
            lambda x: x < np.quantile(x, q=0.75) + 4 *
            (np.quantile(x, q=0.75) - np.quantile(x, q=0.25))
        })
    ICM_dict = apply_transformation_ICM(ICM_dict,
                                        ICM_name_to_transform_mapper={
                                            "ICM_asset":
                                            lambda x: np.log1p(1 / x),
                                            "ICM_price":
                                            lambda x: np.log1p(1 / x),
                                            "ICM_item_pop": np.log1p,
                                            "ICM_sub_class_count": np.log1p,
                                            "ICM_age": lambda x: x**(1 / 2.5)
                                        })
    ICM_dict = apply_discretization_ICM(ICM_dict,
                                        ICM_name_to_bins_mapper={
                                            "ICM_asset": 200,
                                            "ICM_price": 200,
                                            "ICM_item_pop": 50,
                                            "ICM_sub_class_count": 50
                                        })

    # Preprocess UCM
    UCM_all_dict = apply_feature_engineering_UCM(
        UCM_all_dict,
        URM_train,
        ICM_dict,
        ICM_names_to_UCM=["ICM_sub_class", "ICM_item_pop"])
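    # Add an entropy feature over each user's sub-class interactions ("UCM_sub_class_entropy",
    # discretized into 20 bins below)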
    UCM_all_dict = apply_feature_entropy_UCM(
        UCM_all_dict, UCM_names_to_entropy=["UCM_sub_class"])
    # Apply useful transformation
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict, UCM_name_to_transform_mapper={"UCM_user_act": np.log1p})

    UCM_all_dict = apply_discretization_UCM(UCM_all_dict,
                                            UCM_name_to_bins_mapper={
                                                "UCM_user_act": 50,
                                                "UCM_sub_class_entropy": 20
                                            })

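    # Stack all the UCMs horizontally and keep, for every UCM name, the (start, end) column range
    # it occupies in the final matrix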
    UCM_all = None
    user_feature_to_range_mapper = {}
    last_range = 0
    for idx, UCM_key_value in enumerate(UCM_all_dict.items()):
        UCM_name, UCM_object = UCM_key_value
        if idx == 0:
            UCM_all = UCM_object
        else:
            UCM_all = sps.hstack([UCM_all, UCM_object], format="csr")
        user_feature_to_range_mapper[UCM_name] = (last_range, last_range +
                                                  UCM_object.shape[1])
        last_range = last_range + UCM_object.shape[1]
    return UCM_all, user_feature_to_range_mapper