예제 #1
0
def get_ICM_train_new(reader: New_DataSplitter_leave_k_out):
    """
    Build the stacked ICM and record which columns of it belong to which ICM.

    Every ICM loaded by the reader except "ICM_all" is passed through
    apply_filtering_ICM, apply_transformation_ICM, apply_discretization_ICM and a
    final weighting pass of apply_transformation_ICM, then the resulting matrices
    are stacked horizontally in the iteration order of the resulting dictionary.

    :param reader: data splitter
    :return: tuple (ICM_all, item_feature_to_range_mapper). ICM_all is the CSR
        horizontal stack of the processed ICMs, or None when there is no ICM to
        stack. item_feature_to_range_mapper maps each ICM name to the half-open
        column range (start, end) that ICM occupies in ICM_all
    """

    def _below_upper_fence(iqr_factor):
        # Predicate: True where a value is strictly below Q3 + iqr_factor * (Q3 - Q1).
        # Each quantile is computed once per call instead of once per occurrence.
        def _predicate(x):
            first_quartile = np.quantile(x, q=0.25)
            third_quartile = np.quantile(x, q=0.75)
            return x < third_quartile + iqr_factor * (third_quartile -
                                                      first_quartile)

        return _predicate

    def _log1p_of_inverse(x):
        return np.log1p(1 / x)

    def _scaled_by(weight):
        # Feature weighting: multiply the values by a fixed weight.
        return lambda x: x * weight

    ICM_all_dict = reader.get_loaded_ICM_dict()
    # NOTE(review): pop() mutates the dict handed back by the reader and raises
    # KeyError when "ICM_all" is absent - confirm the reader returns a fresh dict
    # that always contains that key.
    ICM_all_dict.pop("ICM_all")
    # NOTE(review): the mappers below reference "ICM_item_pop" and "ICM_sub_class"
    # although this function never calls apply_feature_engineering_ICM - confirm
    # that those ICMs are already provided by the reader.
    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })
    # Apply useful transformation
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": _log1p_of_inverse,
            "ICM_price": _log1p_of_inverse,
            "ICM_item_pop": np.log1p,
        })
    ICM_all_dict = apply_discretization_ICM(
        ICM_all_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
        })
    # Apply feature weighting
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_price": _scaled_by(1.8474248499810804),
            "ICM_asset": _scaled_by(1.2232716972721878),
            "ICM_sub_class": _scaled_by(1.662671860026709),
            "ICM_item_pop": _scaled_by(0.886528360392298),
        })

    # Column bookkeeping: each ICM takes the next shape[1] columns of the stack.
    item_feature_to_range_mapper = {}
    next_column = 0
    for ICM_name, ICM_object in ICM_all_dict.items():
        range_end = next_column + ICM_object.shape[1]
        item_feature_to_range_mapper[ICM_name] = (next_column, range_end)
        next_column = range_end

    # Stack once instead of re-stacking the accumulated matrix at every step;
    # this also yields CSR even when a single ICM is present.
    matrices = list(ICM_all_dict.values())
    ICM_all = sps.hstack(matrices, format="csr") if matrices else None
    return ICM_all, item_feature_to_range_mapper
def get_ICM_with_fields(reader: New_DataSplitter_leave_k_out):
    """
    Build the stacked ICM together with the field index of each of its columns.

    Every ICM loaded by the reader except "ICM_all" is passed through
    apply_feature_engineering_ICM (with empty count/list requests),
    apply_filtering_ICM, apply_transformation_ICM and apply_discretization_ICM,
    then the resulting matrices are stacked horizontally in the iteration order of
    the resulting dictionary.

    :param reader: data splitter
    :return: tuple (ICM_all, item_feature_fields). ICM_all is the CSR horizontal
        stack of the processed ICMs. item_feature_fields is a 1-D array with one
        entry per column of ICM_all holding the 0-based position of the ICM that
        column comes from. Both are None when there is no ICM to stack
    """

    def _below_upper_fence(iqr_factor):
        # Predicate: True where a value is strictly below Q3 + iqr_factor * (Q3 - Q1).
        # Each quantile is computed once per call instead of once per occurrence.
        def _predicate(x):
            first_quartile = np.quantile(x, q=0.25)
            third_quartile = np.quantile(x, q=0.75)
            return x < third_quartile + iqr_factor * (third_quartile -
                                                      first_quartile)

        return _predicate

    def _log1p_of_inverse(x):
        return np.log1p(1 / x)

    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_all_dict = reader.get_loaded_ICM_dict()
    # NOTE(review): pop() mutates the dict handed back by the reader and raises
    # KeyError when "ICM_all" is absent - confirm the reader returns a fresh dict
    # that always contains that key.
    ICM_all_dict.pop("ICM_all")
    ICM_all_dict = apply_feature_engineering_ICM(ICM_all_dict,
                                                 URM_train,
                                                 UCM_all_dict,
                                                 ICM_names_to_count=[],
                                                 UCM_names_to_list=[])
    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": _log1p_of_inverse,
            "ICM_price": _log1p_of_inverse,
            "ICM_item_pop": np.log1p,
        })
    ICM_all_dict = apply_discretization_ICM(
        ICM_all_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
        })

    matrices = list(ICM_all_dict.values())
    if not matrices:
        return None, None

    # Stack and concatenate once instead of growing both results pairwise inside
    # a loop; this also yields CSR even when a single ICM is present.
    ICM_all = sps.hstack(matrices, format="csr")
    item_feature_fields = np.concatenate([
        np.full(shape=matrix.shape[1], fill_value=field_index)
        for field_index, matrix in enumerate(matrices)
    ])
    return ICM_all, item_feature_fields
예제 #3
0
def get_ICM_train(reader: New_DataSplitter_leave_k_out):
    """
    Produce the ICM obtained from build_ICM_all_from_dict after feature engineering.

    According to the original note, this preprocessing is the one used by the
    new_best_models file.

    :param reader: data splitter
    :return: the value returned by build_ICM_all_from_dict for the processed ICMs
    """

    def _upper_fence_filter(iqr_factor):
        # Predicate: True where a value is strictly below
        # Q3 + iqr_factor * (Q3 - Q1).
        def _predicate(x):
            return x < np.quantile(x, q=0.75) + iqr_factor * (
                np.quantile(x, q=0.75) - np.quantile(x, q=0.25))

        return _predicate

    def _log1p_of_inverse(x):
        return np.log1p(1 / x)

    def _root_2_5(x):
        return x**(1 / 2.5)

    URM_train, _ = reader.get_holdout_split()
    ucm_by_name = reader.get_loaded_UCM_dict()
    icm_by_name = reader.get_loaded_ICM_dict()
    # "ICM_all" is removed so that only the other ICMs are processed below.
    icm_by_name.pop("ICM_all")

    icm_by_name = apply_feature_engineering_ICM(
        icm_by_name,
        URM_train,
        ucm_by_name,
        ICM_names_to_count=["ICM_sub_class"],
        UCM_names_to_list=["UCM_age"])

    filters = {
        "ICM_asset": _upper_fence_filter(0.72),
        "ICM_price": _upper_fence_filter(4),
    }
    icm_by_name = apply_filtering_ICM(icm_by_name,
                                      ICM_name_to_filter_mapper=filters)

    transforms = {
        "ICM_asset": _log1p_of_inverse,
        "ICM_price": _log1p_of_inverse,
        "ICM_item_pop": np.log1p,
        "ICM_sub_class_count": np.log1p,
        "ICM_age": _root_2_5,
    }
    icm_by_name = apply_transformation_ICM(
        icm_by_name, ICM_name_to_transform_mapper=transforms)

    bins = {
        "ICM_asset": 200,
        "ICM_price": 200,
        "ICM_item_pop": 50,
        "ICM_sub_class_count": 50,
    }
    icm_by_name = apply_discretization_ICM(icm_by_name,
                                           ICM_name_to_bins_mapper=bins)

    return build_ICM_all_from_dict(icm_by_name)
def get_UCM_with_fields(reader: New_DataSplitter_leave_k_out):
    """
    Build the stacked UCM together with the field index of each of its columns.

    The UCMs loaded by the reader are passed through apply_feature_engineering_UCM,
    apply_transformation_UCM and apply_discretization_UCM, then the resulting
    matrices are stacked horizontally in the iteration order of the resulting
    dictionary.

    :param reader: data splitter
    :return: tuple (UCM_all, user_feature_fields). UCM_all is the CSR horizontal
        stack of the processed UCMs. user_feature_fields is a 1-D array with one
        entry per column of UCM_all holding the 0-based position of the UCM that
        column comes from. Both are None when there is no UCM to stack
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()
    UCM_all_dict = apply_feature_engineering_UCM(
        UCM_all_dict, URM_train, ICM_dict, ICM_names_to_UCM=["ICM_sub_class"])

    # These are useful feature weighting for UserCBF_CF_Warm
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict,
        UCM_name_to_transform_mapper={
            "UCM_sub_class": lambda x: x / 2,
            "UCM_user_act": np.log1p,
        })
    UCM_all_dict = apply_discretization_UCM(
        UCM_all_dict, UCM_name_to_bins_mapper={"UCM_user_act": 50})

    matrices = list(UCM_all_dict.values())
    if not matrices:
        return None, None

    # Stack and concatenate once instead of growing both results pairwise inside
    # a loop; this also yields CSR even when a single UCM is present.
    UCM_all = sps.hstack(matrices, format="csr")
    user_feature_fields = np.concatenate([
        np.full(shape=matrix.shape[1], fill_value=field_index)
        for field_index, matrix in enumerate(matrices)
    ])
    return UCM_all, user_feature_fields
예제 #5
0
def get_UCM_train(reader: New_DataSplitter_leave_k_out):
    """
    Produce the UCM obtained from build_UCM_all_from_dict after feature engineering.

    According to the original note, this preprocessing is the one used by the
    new_best_models file.

    :param reader: data splitter
    :return: the value returned by build_UCM_all_from_dict for the processed UCMs
    """

    def _halve(x):
        return x / 2

    URM_train, _ = reader.get_holdout_split()
    ucm_by_name = reader.get_loaded_UCM_dict()
    icm_by_name = reader.get_loaded_ICM_dict()

    ucm_by_name = apply_feature_engineering_UCM(
        ucm_by_name,
        URM_train,
        icm_by_name,
        ICM_names_to_UCM=["ICM_sub_class"])

    # Feature weighting reported as useful for UserCBF_CF_Warm.
    transforms = {
        "UCM_sub_class": _halve,
        "UCM_user_act": np.log1p,
    }
    ucm_by_name = apply_transformation_UCM(
        ucm_by_name, UCM_name_to_transform_mapper=transforms)

    bins = {"UCM_user_act": 50}
    ucm_by_name = apply_discretization_UCM(ucm_by_name,
                                           UCM_name_to_bins_mapper=bins)

    return build_UCM_all_from_dict(ucm_by_name)
예제 #6
0
def get_UCM_train_new(reader: New_DataSplitter_leave_k_out):
    """
    Build the stacked UCM and record which columns of it belong to which UCM.

    The ICMs loaded by the reader (except "ICM_all") are first preprocessed with
    apply_feature_engineering_ICM, apply_filtering_ICM, apply_transformation_ICM and
    apply_discretization_ICM. The processed ICMs are then handed to
    apply_feature_engineering_UCM, and the UCMs go through apply_feature_entropy_UCM,
    apply_transformation_UCM and apply_discretization_UCM before being stacked
    horizontally in the iteration order of the resulting dictionary.

    :param reader: data splitter
    :return: tuple (UCM_all, user_feature_to_range_mapper). UCM_all is the CSR
        horizontal stack of the processed UCMs, or None when there is no UCM to
        stack. user_feature_to_range_mapper maps each UCM name to the half-open
        column range (start, end) that UCM occupies in UCM_all
    """

    def _below_upper_fence(iqr_factor):
        # Predicate: True where a value is strictly below Q3 + iqr_factor * (Q3 - Q1).
        # Each quantile is computed once per call instead of once per occurrence.
        def _predicate(x):
            first_quartile = np.quantile(x, q=0.25)
            third_quartile = np.quantile(x, q=0.75)
            return x < third_quartile + iqr_factor * (third_quartile -
                                                      first_quartile)

        return _predicate

    def _log1p_of_inverse(x):
        return np.log1p(1 / x)

    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()

    # Preprocess ICM
    # NOTE(review): pop() mutates the dict handed back by the reader and raises
    # KeyError when "ICM_all" is absent - confirm the reader returns a fresh dict
    # that always contains that key.
    ICM_dict.pop("ICM_all")
    ICM_dict = apply_feature_engineering_ICM(
        ICM_dict,
        URM_train,
        UCM_all_dict,
        ICM_names_to_count=["ICM_sub_class"],
        UCM_names_to_list=["UCM_age"])
    ICM_dict = apply_filtering_ICM(
        ICM_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })
    ICM_dict = apply_transformation_ICM(
        ICM_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": _log1p_of_inverse,
            "ICM_price": _log1p_of_inverse,
            "ICM_item_pop": np.log1p,
            "ICM_sub_class_count": np.log1p,
            "ICM_age": lambda x: x**(1 / 2.5),
        })
    ICM_dict = apply_discretization_ICM(
        ICM_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
            "ICM_sub_class_count": 50,
        })

    # Preprocess UCM
    UCM_all_dict = apply_feature_engineering_UCM(
        UCM_all_dict,
        URM_train,
        ICM_dict,
        ICM_names_to_UCM=["ICM_sub_class", "ICM_item_pop"])
    UCM_all_dict = apply_feature_entropy_UCM(
        UCM_all_dict, UCM_names_to_entropy=["UCM_sub_class"])
    # Apply useful transformation
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict, UCM_name_to_transform_mapper={"UCM_user_act": np.log1p})

    UCM_all_dict = apply_discretization_UCM(
        UCM_all_dict,
        UCM_name_to_bins_mapper={
            "UCM_user_act": 50,
            "UCM_sub_class_entropy": 20,
        })

    # Column bookkeeping: each UCM takes the next shape[1] columns of the stack.
    user_feature_to_range_mapper = {}
    next_column = 0
    for UCM_name, UCM_object in UCM_all_dict.items():
        range_end = next_column + UCM_object.shape[1]
        user_feature_to_range_mapper[UCM_name] = (next_column, range_end)
        next_column = range_end

    # Stack once instead of re-stacking the accumulated matrix at every step;
    # this also yields CSR even when a single UCM is present.
    matrices = list(UCM_all_dict.values())
    UCM_all = sps.hstack(matrices, format="csr") if matrices else None
    return UCM_all, user_feature_to_range_mapper