示例#1
0
def eval_routine(DATA_PATH, TEST_PATH, ITEM_FEATURES_PATH, USER_FEATURES_PATH,
                 TEST_SIZE_WEEKS, N_POPULAR_ITEMS, INIT_NUM_RECS, N_FIN_RECS):
    """Run the offline evaluation pipeline and report money-precision@5.

    Loads the train/validation splits, prefilters both down to the most
    popular items, fits the first-level recommender, builds and post-filters
    per-user recommendations, then averages money_precision_at_k over the
    validation users.  When the module-level SAVE_RESULTS flag is truthy the
    final recommendations are also written to 'YN_recs.csv'.

    Returns the mean money-precision@5 score (float).
    """
    train_df = get_raw_data_splits(DATA_PATH, mode=0)
    val_df = get_raw_data_splits(TEST_PATH, mode=0)
    item_features, user_features = preprare_features(ITEM_FEATURES_PATH,
                                                     USER_FEATURES_PATH)

    # Keep only the N most popular items in both splits.
    train_df = prefilter_items(train_df, N_POPULAR_ITEMS)
    val_df = prefilter_items(val_df, N_POPULAR_ITEMS)

    # Lookup tables used by the post-filter and the metric.
    itemid_to_price = get_price_list(train_df, val_df)
    user_bought_history = get_bought_ever_list(train_df)
    item_to_commodity = get_item_commodities_list(item_features)

    recommender = MainRecommender(train_df, itemid_to_price)

    # One row per validation user with the list of items actually bought.
    result_df = val_df.groupby('user_id')['item_id'].unique().reset_index()
    result_df.columns = ['user_id', 'actual']

    # Candidate recommendations from the user's own purchase history.
    result_df['base_rec'] = result_df['user_id'].apply(
        lambda uid: recommender.get_own_recommendations(uid, N=INIT_NUM_RECS))

    # Business-rule post-filtering down to the final N recommendations.
    result_df = postfilter_items(result_df,
                                 recommender.overall_top_purchases,
                                 item_to_commodity,
                                 itemid_to_price,
                                 user_bought_history,
                                 n=N_FIN_RECS)

    res = result_df.apply(
        lambda row: money_precision_at_k(row['result'], row['actual'],
                                         itemid_to_price, k=5),
        axis=1).mean()
    print(f"Result money precision @ 5 metric: {res}")
    if SAVE_RESULTS:
        result_df[['user_id', 'result']].to_csv('YN_recs.csv', index=False)
    return res
    def fit(self, data_train, data_val, item_sub_comm, prices, N=5):
        """Grid-search hyper-parameters for the level-1 recommender.

        Iterates over self.top_n_list x self.weighting_list x the parameter
        grid built from self.model_params, scoring each configuration with
        self.scoring_func on post-filtered recommendations against the
        validation purchases.

        Args:
            data_train: transaction DataFrame used to fit candidate models.
            data_val: transaction DataFrame used for scoring.
            item_sub_comm: item -> sub-commodity mapping for postfilter_items.
            prices: item -> price mapping for postfilter_items.
            N: number of final recommendations kept by the post-filter.

        Returns:
            (best_score, best_params) where best_params is
            [top_n, weighting, param_dict].
            NOTE(review): if self.recommender != 'MainRecommender' the method
            falls through and implicitly returns None — confirm intended.
        """
        param_grid_list = self._get_param_grid(self.model_params)
        # One row per validation user with the items they actually bought.
        result_lvl_1 = data_val.groupby(
            'user_id')['item_id'].unique().reset_index()
        result_lvl_1.columns = ['user_id', 'actual']
        best_params = []
        best_score = 0
        if self.recommender == 'MainRecommender':
            for top_n in self.top_n_list:
                X = prefilter_items(data_train, take_n_popular=top_n)
                for weighting in self.weighting_list:
                    # First parameter set is fitted via the constructor;
                    # the remaining sets below refit only the inner model,
                    # reusing this object's user-item matrix.
                    model = MainRecommender(X,
                                            weighting=weighting,
                                            **param_grid_list[0])
                    result_lvl_1['recs'] = result_lvl_1['user_id'].apply(
                        lambda x: model.get_main_model_recommendations(
                            x, N=top_n))
                    result_lvl_1 = postfilter_items(result_lvl_1,
                                                    'recs',
                                                    item_sub_comm,
                                                    prices,
                                                    N=N)
                    # NOTE(review): scored at k=200 here but k=N in the
                    # inner loop below — confirm this asymmetry is intended.
                    score = result_lvl_1.apply(lambda row: self.scoring_func(
                        row['postfilter_recs'], row['actual'], k=200),
                                               axis=1).mean()
                    if score > best_score:
                        best_score = score
                        best_params = [top_n, weighting, param_grid_list[0]]
                    # Remaining parameter sets: refit on the cached matrix.
                    for param in param_grid_list[1:]:
                        model.model = model.fit(model.user_item_matrix,
                                                **param)
                        result_lvl_1['recs'] = result_lvl_1['user_id'].apply(
                            lambda x: model.get_main_model_recommendations(
                                x, N=top_n))
                        result_lvl_1 = postfilter_items(result_lvl_1,
                                                        'recs',
                                                        item_sub_comm,
                                                        prices,
                                                        N=N)
                        score = result_lvl_1.apply(
                            lambda row: self.scoring_func(
                                row['postfilter_recs'], row['actual'], k=N),
                            axis=1).mean()
                        if score > best_score:
                            best_score = score
                            best_params = [top_n, weighting, param]

            return best_score, best_params
def calc_precision_take_n_popular(data_train, data_test, item_features, take_n_popular):
    """Fit MainRecommender on a prefiltered train set and score precision@5.

    Returns a (scores, per_user_frame) pair: `scores` maps the
    recommendation column name to its mean precision_at_k over test users,
    and `per_user_frame` holds actual purchases plus recommendations per user.
    """
    filtered_train = prefilter_items(data_train,
                                     item_features=item_features,
                                     take_n_popular=take_n_popular)
    model = MainRecommender(filtered_train)

    # One row per test user with the list of items actually bought.
    per_user = data_test.groupby('user_id')['item_id'].unique().reset_index()
    per_user.columns = ['user_id', 'actual']

    # Top-5 recommendations from each user's own purchase history.
    per_user['own_recommendations'] = per_user['user_id'].apply(
        lambda uid: model.get_own_recommendations(uid, N=5))

    scores = {
        'own_recommendations': per_user.apply(
            lambda row: precision_at_k(row['own_recommendations'],
                                       row['actual']),
            axis=1).mean(),
    }
    return scores, per_user
示例#4
0
def get_results(main_data_path, item_features_path, user_features_path,
                val_lvl_1_size_weeks, val_lvl_2_size_weeks, model_lvl_1_path,
                model_lvl_2_path, test_data_path):
    """Produce final two-level recommendations and their money-precision@5.

    Two execution branches:
      * both model paths given -> unpickle the pre-trained level-1/level-2
        models and score them;
      * otherwise -> train a MainRecommender (level 1) and a
        CatBoostClassifier re-ranker (level 2) from scratch.
    Scoring uses `test_data_path` when provided, else the level-2 validation
    split.  Writes 'Final_recommendations.csv' and returns the result frame.
    """
    print('Reading data...')
    data = pd.read_csv(main_data_path)
    item_features = pd.read_csv(item_features_path)
    user_features = pd.read_csv(user_features_path)
    # Normalize column names and align keys with the transaction data.
    item_features.columns = [col.lower() for col in item_features.columns]
    user_features.columns = [col.lower() for col in user_features.columns]

    item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
    user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

    # Time-based splits: the last val_lvl_2 weeks are the final holdout,
    # the preceding val_lvl_1 weeks feed the level-1 validation.
    data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() -
                            (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
    data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() -
                           (val_lvl_1_size_weeks + val_lvl_2_size_weeks))
                          & (data['week_no'] < data['week_no'].max() -
                             (val_lvl_2_size_weeks))]

    # NOTE(review): this copy is overwritten in the training branch and
    # unused in the pre-trained branch — looks dead; confirm.
    data_train_lvl_2 = data_val_lvl_1.copy()
    data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() -
                          val_lvl_2_size_weeks]

    # Average item price = total sales value / total quantity.
    prices = data.groupby('item_id')[['sales_value',
                                      'quantity']].sum().reset_index()
    prices['price'] = prices['sales_value'] / prices['quantity']
    # inf (quantity == 0) becomes 0; NOTE(review): 0/0 -> NaN is not
    # handled here — TODO confirm it cannot occur upstream.
    prices.replace(np.inf, 0, inplace=True)
    prices = dict(zip(prices['item_id'], prices['price']))

    items_sub_comm = dict(
        zip(item_features['item_id'], item_features['sub_commodity_desc']))
    trans = DataTransformer()

    if test_data_path:
        test_data = pd.read_csv(test_data_path)

    if model_lvl_1_path and model_lvl_2_path:
        print('Reading models...')
        # NOTE(review): pickle.load on externally supplied paths executes
        # arbitrary code — only load trusted model files.
        with open(model_lvl_1_path, 'rb') as f:
            model_lvl_1 = pickle.load(f)
        with open(model_lvl_2_path, 'rb') as f:
            model_lvl_2 = pickle.load(f)
        if test_data_path:
            result = get_recommendation_df(test_data, model_lvl_1, model_lvl_2,
                                           items_sub_comm, prices,
                                           item_features, user_features)
        else:
            result = get_recommendation_df(data_val_lvl_2, model_lvl_1,
                                           model_lvl_2, items_sub_comm, prices,
                                           item_features, user_features)
    else:
        print('Prepare data for level 1 model fit...')
        data_train_lvl_1 = prefilter_items(data_train_lvl_1, 6000)
        print('Level 1 model fit...')
        # Use about half the cores (at least one thread).
        num_threads = multiprocessing.cpu_count() // 2 + 1
        model_lvl_1 = MainRecommender(data_train_lvl_1,
                                      weighting=None,
                                      n_factors=100,
                                      regularization=0.01,
                                      iterations=100,
                                      num_threads=num_threads)
        print('Construct level 1 recommendations DataFrame...')
        result_lvl_1 = get_recommendation_lvl_1(data_val_lvl_1, model_lvl_1)
        # Keep 200 candidates per user for the level-2 re-ranker.
        result_lvl_1 = postfilter_items(result_lvl_1,
                                        'lvl_1_recs',
                                        items_sub_comm,
                                        prices,
                                        N=200)
        result_lvl_1.rename(
            columns={'postfilter_lvl_1_recs': 'recommendations'}, inplace=True)
        print('Prepare data for level 2 model fit...')
        data_train_lvl_2 = trans.fit_transform(result_lvl_1,
                                               data_val_lvl_1,
                                               item_features,
                                               user_features,
                                               with_targets=True)
        y = data_train_lvl_2['target']
        X = data_train_lvl_2.drop('target', axis=1)
        cat_features = [
            'department', 'brand', 'commodity_desc', 'sub_commodity_desc',
            'curr_size_of_product', 'age_desc', 'marital_status_code',
            'income_desc', 'homeowner_desc', 'hh_comp_desc',
            'household_size_desc', 'kid_category_desc'
        ]
        # Up-weight the positive class by the negative/positive ratio.
        # NOTE(review): raises ZeroDivisionError if y has no positives.
        class_1_weight = len(y[y == 0]) / len(y[y == 1])
        model_lvl_2 = CatBoostClassifier(n_estimators=300,
                                         max_depth=7,
                                         class_weights=[1, class_1_weight],
                                         cat_features=cat_features)
        print('Level 2 model fit...')
        model_lvl_2.fit(X, y)
        if test_data_path:
            result = get_recommendation_df(test_data, model_lvl_1, model_lvl_2,
                                           items_sub_comm, prices,
                                           item_features, user_features)
        else:
            result = get_recommendation_df(data_val_lvl_2, model_lvl_1,
                                           model_lvl_2, items_sub_comm, prices,
                                           item_features, user_features)

    print('Calculating final metric...')
    money_precision_at_5 = result.apply(
        lambda row: money_precision_at_k(row['postfilter_catboost_recs'], row[
            'actual'], row['recommend_prices']),
        axis=1).mean()
    print(
        f'Money precision@5 for final recommendations:{money_precision_at_5}')
    result.to_csv('Final_recommendations.csv')
    return result
示例#5
0
from src.utils import prefilter_items, postfilter, create_dataset, dataset_processing
from src.recommenders import MainRecommender, Ranking

# Candidate-generation script: load raw retail data, prefilter popular items,
# produce 100 BM25-based candidates per user, then build the level-2 dataset.
# NOTE(review): `pd` (pandas) is used below but not imported in this snippet —
# confirm `import pandas as pd` exists above.
data = pd.read_csv('../raw_data/retail_train.csv')
item_features = pd.read_csv('../raw_data/product.csv')
user_features = pd.read_csv('../raw_data/hh_demographic.csv')

# column processing: lower-case names and align key columns with transactions
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# data filtering: keep the 5000 most popular items
data = prefilter_items(data, item_features=item_features, take_n_popular=5000)

# One row per unique user; the unnamed column from the array becomes user_id.
candidates = pd.DataFrame(data['user_id'].unique())
candidates = candidates.rename(columns={0: 'user_id'})

recommender = MainRecommender(data)

# Recommendations via BM25 weighting: 100 candidates per user
candidates['candidates'] = candidates['user_id'].apply(lambda x: recommender.get_bm25_recommendations(x, N=100))

# Build a dataframe with the target variable and add user and item features
targets = create_dataset(data=data, data_candidates=candidates, users_info=user_features, items_info=item_features)

# Generate new features
targets = dataset_processing(dataset=targets, data=data, items_info=item_features)
示例#6
0
    def __init__(self,
                 data,
                 item_info,
                 weighting=True,
                 first_model_weeks=6,
                 second_model_weeks=3,
                 take_n_popular=7000):
        """Build purchase statistics, fit the ALS models and split the data.

        Args:
            data: transaction DataFrame; code below reads user_id, item_id,
                quantity and week_no columns.
            item_info: item-feature DataFrame forwarded to prefilter_items.
            weighting: when truthy, apply BM25 weighting to the user-item
                matrix before fitting.
            first_model_weeks: level-1 validation window size in weeks.
            second_model_weeks: level-2 validation window size in weeks.
            take_n_popular: number of popular items kept by prefilter_items.
        """
        # Per-(user, item) purchase counts, most frequent first.
        # 999999 — presumably a placeholder/"other" item id inserted by
        # prefiltering; verify against prefilter_items.
        self.top_purchases = data.groupby(
            ['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity',
                                       ascending=False,
                                       inplace=True)
        self.top_purchases = self.top_purchases[
            self.top_purchases['item_id'] != 999999]

        # Global item popularity (list of item ids, most purchased first).
        self.overall_top_purchases = data.groupby(
            'item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity',
                                               ascending=False,
                                               inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist(
        )

        # Unique items ever bought per user.
        # NOTE(review): attribute name "user_buyses" is a typo of "user_buys"
        # but is kept — renaming would break external references.
        self.user_buyses = data.groupby(
            'user_id')['item_id'].unique().reset_index()
        self.user_buyses.columns = ['user_id', 'actual']

        self.user_item_matrix = self._prepare_matrix(data)
        # Bidirectional id mappings between matrix indices and raw ids.
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            # Double transpose — presumably because bm25_weight weights along
            # the other axis; confirm against implicit's bm25_weight docs.
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

        self.item_info = item_info

        self.first_model_weeks = first_model_weeks
        self.second_model_weeks = second_model_weeks

        # Aliases of the same window sizes used by the split logic below.
        self.val_lvl_1_size_weeks = first_model_weeks
        self.val_lvl_2_size_weeks = second_model_weeks

        # Time-based splits: last lvl-2 weeks are the final holdout, the
        # preceding lvl-1 weeks are the level-1 validation window.
        self.data_train_lvl_1 = data[
            data['week_no'] < data['week_no'].max() -
            (self.val_lvl_1_size_weeks + self.val_lvl_2_size_weeks)]
        self.data_val_lvl_1 = data[
            (data['week_no'] >= data['week_no'].max() -
             (self.val_lvl_1_size_weeks + self.val_lvl_2_size_weeks))
            & (data['week_no'] < data['week_no'].max() -
               (self.val_lvl_2_size_weeks))]
        self.data_train_lvl_1 = prefilter_items(self.data_train_lvl_1,
                                                item_features=self.item_info,
                                                take_n_popular=take_n_popular)

        self.data_train_lvl_2 = self.data_val_lvl_1.copy()
        self.data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() -
                                   self.val_lvl_2_size_weeks]

        # Unique items bought per user within the level-1 train split.
        self.user_buyses_lvl_1 = self.data_train_lvl_1.groupby(
            'user_id')['item_id'].unique().reset_index()
        self.user_buyses_lvl_1.columns = ['user_id', 'actual']

        # Precompute 25 ALS candidates for every level-1 user.
        self.users_recommendations_lvl_1 = self.get_als_recommendations_users(
            self.user_buyses_lvl_1["user_id"], N=25)