def hgb_objective_map(params):
    """Hyperopt objective function for HistGradientBoostingRegressor.

    Fits the model on the module-level training data, ranks the
    validation coupons per user by predicted interest, and returns
    ``1 - MAP`` so that hyperopt's minimisation maximises MAP.

    Parameters
    ----------
    params : dict
        Hyperparameter sample from hyperopt (numeric values arrive as
        floats and are cast back to int where required).
    """
    # hyperopt casts as float
    params['max_iter'] = int(params['max_iter'])
    params['max_leaf_nodes'] = int(params['max_leaf_nodes'])

    model = HistGradientBoostingRegressor(**params)
    # NOTE(review): fits on `train` but predicts on `X_valid`; confirm
    # `train` is really the training feature matrix matching `y_train`.
    model.fit(train, y_train)
    preds = model.predict(X_valid)

    # rank coupons per user by predicted interest (highest first)
    df_eval['interest'] = preds
    df_ranked = df_eval.sort_values(['user_id_hash', 'interest'],
                                    ascending=[False, False])
    df_ranked = (df_ranked.groupby('user_id_hash')['coupon_id_hash'].apply(
        list).reset_index())
    recomendations_dict = pd.Series(df_ranked.coupon_id_hash.values,
                                    index=df_ranked.user_id_hash).to_dict()

    actual = []
    pred = []
    for k, _ in recomendations_dict.items():
        actual.append(list(interactions_valid_dict[k]))
        pred.append(list(recomendations_dict[k]))

    result = mapk(actual, pred)
    # BUG FIX: the original printed lgb_objective_map.i (a different
    # objective's counter); use this function's own iteration counter.
    print("INFO: iteration {} MAP {:.3f}".format(hgb_objective_map.i, result))
    hgb_objective_map.i += 1

    return 1 - result
def compute_mapk(interactions_dict, recomendations_dict):
    """Return the mean average precision (MAP@k) of the recommendations.

    Parameters
    ----------
    interactions_dict : dict
        user id -> iterable of item ids the user actually interacted with.
    recomendations_dict : dict
        user id -> ranked iterable of recommended item ids.

    Returns
    -------
    float
        MAP score as computed by the module-level ``mapk`` helper.
    """
    actual = []
    pred = []
    # BUG FIX: the original iterated over the global
    # `recomendations_dict_hot`, silently ignoring the
    # `recomendations_dict` argument passed in.
    for k in recomendations_dict:
        actual.append(list(interactions_dict[k]))
        pred.append(list(recomendations_dict[k]))
    return mapk(actual, pred)
def lgb_objective_map(params):
    """Hyperopt objective function for lightgbm.

    Runs a 3-fold CV with early stopping to pick the number of boosting
    rounds, refits a LGBMRegressor with that many rounds, ranks the
    validation coupons per user by predicted interest and returns
    ``1 - MAP`` so that minimisation maximises MAP.
    """
    # hyperopt hands every numeric hyperparameter over as a float
    params['num_boost_round'] = int(params['num_boost_round'])
    params['num_leaves'] = int(params['num_leaves'])

    # need to be passed as parameter
    params['verbose'] = -1
    params['seed'] = 1

    cv_result = lgb.cv(
        params,
        lgtrain,
        nfold=3,
        metrics='rmse',
        num_boost_round=params['num_boost_round'],
        early_stopping_rounds=20,
        stratified=False,
    )
    # number of rounds the CV actually ran before early stopping
    best_rounds = len(cv_result['rmse-mean'])
    early_stop_dict[lgb_objective_map.i] = best_rounds
    params['num_boost_round'] = best_rounds

    model = lgb.LGBMRegressor(**params)
    model.fit(train, y_train, feature_name=all_cols,
              categorical_feature=cat_cols)

    # rank validation coupons per user by predicted interest, best first
    df_eval['interest'] = model.predict(X_valid)
    ranked = (df_eval
              .sort_values(['user_id_hash', 'interest'],
                           ascending=[False, False])
              .groupby('user_id_hash')['coupon_id_hash']
              .apply(list)
              .reset_index())
    recomendations_dict = pd.Series(ranked.coupon_id_hash.values,
                                    index=ranked.user_id_hash).to_dict()

    actual, pred = [], []
    for user in recomendations_dict:
        actual.append(list(interactions_valid_dict[user]))
        pred.append(list(recomendations_dict[user]))

    result = mapk(actual, pred)
    print("INFO: iteration {} MAP {:.3f}".format(lgb_objective_map.i, result))
    lgb_objective_map.i += 1
    return 1 - result
def xl_objective(params):
    """Hyperopt objective function for an xlearn FFM model.

    Trains on `train_data_file`, writes predictions for the validation
    file to disk, ranks coupons per user by predicted interest, and
    returns ``1 - MAP`` so that minimisation maximises MAP.
    """
    start = time()
    xl_objective.i += 1

    params['task'] = 'reg'
    params['metric'] = 'rmse'
    params['stop_window'] = 3
    # remember hyperopt casts as floats
    params['epoch'] = int(params['epoch'])
    params['k'] = int(params['k'])

    xl_model = xl.create_ffm()
    xl_model.setTrain(train_data_file)
    # xl_model.setValidate(valid_data_file_opt)
    xl_model.setTest(valid_data_file)
    # xl_model.setQuiet()

    xl_model.fit(params, xlmodel_fname_tmp)
    xl_model.predict(xlmodel_fname_tmp, xlpreds_fname_tmp)

    # predictions come back via a plain text file
    df_preds['interest'] = np.loadtxt(xlpreds_fname_tmp)
    ranked = (df_preds
              .sort_values(['user_id_hash', 'interest'],
                           ascending=[False, False])
              .groupby('user_id_hash')['coupon_id_hash']
              .apply(list)
              .reset_index())
    recomendations_dict = pd.Series(ranked.coupon_id_hash.values,
                                    index=ranked.user_id_hash).to_dict()

    actual, pred = [], []
    for user in recomendations_dict:
        actual.append(list(interactions_valid_dict[user]))
        pred.append(list(recomendations_dict[user]))
    score = mapk(actual, pred)

    end = round((time() - start) / 60., 2)
    print("INFO: iteration {} was completed in {} min. Score {:.3f}".format(
        xl_objective.i, end, score))
    return 1 - score
def mapk_similarity(alpha, at_random=False):
    """MAP of a similarity-based recommender on the validation users.

    Builds a per-user feature vector as mean-purchase-vector plus
    ``alpha`` times mean-visit-vector, ranks the validation coupons by
    euclidean distance (closest first) and returns the resulting MAP.

    Parameters
    ----------
    alpha : float
        Weight applied to the visit vector before summing it with the
        purchase vector.
    at_random : bool, default False
        If True, ignore similarities and recommend the coupons in a
        random order per user (random baseline).
    """
    mpv = user_mean_purchase_vector_valid.copy()
    feat_cols = [c for c in mpv.columns if 'id_hash' not in c]

    mvv = user_mean_visit_vector_valid.copy()
    mvv[feat_cols] = alpha * mvv[feat_cols]

    # combine purchase and (weighted) visit vectors per user
    user_vector = (pd.concat([mpv, mvv])
                   .groupby('user_id_hash')
                   .sum()
                   .reset_index())

    user_ids = user_vector.user_id_hash.values
    item_ids = df_coupons_valid_feat_oh.coupon_id_hash.values

    # ensure the same column order
    user_cols = ['user_id_hash'] + [c for c in user_vector.columns
                                    if 'id_hash' not in c]
    item_cols = ['coupon_id_hash'] + [c for c in user_vector.columns
                                      if 'id_hash' not in c]

    user_feat = user_vector[user_cols[1:]].values
    item_feat = df_coupons_valid_feat_oh[item_cols[1:]].values

    # smaller distance = more similar, so argsort ranks best-first
    user_item_sim = euclidean_distances(user_feat, item_feat)
    top_n_idx = np.apply_along_axis(np.argsort, 1, user_item_sim)

    if at_random:
        item_feat_rnd = item_ids.copy()
        recomendations_dict = {}
        for user, idx in zip(user_ids, top_n_idx):
            np.random.shuffle(item_feat_rnd)
            # BUG FIX: store a copy. The original stored the SAME array
            # object for every user, so after the loop all users shared
            # the final shuffle instead of an independent random ranking.
            recomendations_dict[user] = item_feat_rnd.copy()
    else:
        recomendations_dict = {}
        for user, idx in zip(user_ids, top_n_idx):
            recomendations_dict[user] = [item_ids[i] for i in idx]

    actual = []
    pred = []
    for k, _ in recomendations_dict.items():
        actual.append(list(interactions_valid_dict[k]))
        pred.append(list(recomendations_dict[k]))

    return mapk(actual, pred)
'coupon_id_hash': 'unique' }).reset_index()) tmp_valid_dict = pd.Series(df_interactions_valid.coupon_id_hash.values, index=df_interactions_valid.user_id_hash).to_dict() # keep users that have interacted at least with one validation coupon keep_users = [] for user, coupons in tmp_valid_dict.items(): if np.intersect1d(valid_coupon_ids, coupons).size != 0: keep_users.append(user) # out of 6924, we end up with 6071, so not bad interactions_valid_dict = { k: v for k, v in tmp_valid_dict.items() if k in keep_users } coupon_id_rn = valid_coupon_ids.copy() recomendations_dict = {} for user, _ in interactions_valid_dict.items(): np.random.shuffle(coupon_id_rn) recomendations_dict[user] = coupon_id_rn actual = [] pred = [] for k, _ in recomendations_dict.items(): actual.append(list(interactions_valid_dict[k])) pred.append(list(recomendations_dict[k])) print(mapk(actual, pred))
dist, nnidx = dist.ravel(), nnidx.ravel() ranked_dist = np.argsort(dist) ranked_cp_idxs = nnidx[ranked_dist][:50] ranked_cp_ids = [idx_item_dict[i] for i in ranked_cp_idxs] ranked_cp_idxs_valid = [ train_to_valid_most_similar[c] for c in ranked_cp_ids ] return (user, ranked_cp_idxs_valid) start = time() cores = multiprocessing.cpu_count() pool = Pool(cores) all_users = list(interactions_valid_dict.keys()) recommend_coupons = pool.map(build_recommendations, all_users) # recommend_coupons = Parallel(cores)(delayed(build_recommendations)(user) for user,_ in user_items_tuple) print(time() - start) recommendations_dict = {k: v for k, v in recommend_coupons} actual = [] pred = [] for k, _ in recommendations_dict.items(): actual.append(list(interactions_valid_dict[k])) pred.append(list(recommendations_dict[k])) result = mapk(actual, pred) print(result)
elif set_up_name is 'set_up_4': lr_scheduler = MultiStepLR(optimizer, milestones=[3, 8], gamma=0.1) elif set_up_name is 'set_up_5': lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4], gamma=0.1) model.fit(train_loader, criterion, optimizer, n_epochs=n_epochs, eval_loader=eval_loader, lr_scheduler=lr_scheduler) preds = model.predict(test_loader) df_all_interactions['interest'] = preds df_ranked = df_all_interactions.sort_values(['user_id_hash', 'interest'], ascending=[False, False]) df_ranked = (df_ranked.groupby('user_id_hash')['coupon_id_hash'].apply( list).reset_index()) recomendations_dict = pd.Series(df_ranked.coupon_id_hash.values, index=df_ranked.user_id_hash).to_dict() true_valid_interactions = wd_interactions['true_valid_interactions'] actual = [] pred = [] for k, _ in recomendations_dict.items(): actual.append(list(true_valid_interactions[k])) pred.append(list(recomendations_dict[k])) print("Mean Average Precission: {}".format(mapk(actual, pred))) results[set_up_name] = mapk(actual, pred) del (model, optimizer, criterion)