class CosineRecommenderModel(BaseItemItemRecommenderModel):
    def __init__(self, products: np.ndarray, params: dict):
        self.cosine_recommender = CosineRecommender(**params)
        self._product_idx = dict(zip(products, range(len(products))))
        self._idx_product = products.tolist()

    def fit_recommender(self, purchases):
        user_item_matrix = self._create_user_item_matrix_from_purchases(
            purchases)
        logger.debug('Training CosineRecommender ...')
        self.cosine_recommender.fit(user_item_matrix.T)
        return self

    def recommend(self, products_counter):
        user_item_csr_row = self._make_user_item_csr_row(
            values=list(products_counter.values()),
            item_idx=products_counter.keys())
        cosine_preds = self.cosine_recommender.recommend(
            0, user_item_csr_row, N=30, recalculate_user=True,
            filter_already_liked_items=False)
        cosine_preds = [
            self._idx_product[idx] for (idx, score) in cosine_preds
        ]
        return cosine_preds
from implicit.nearest_neighbours import CosineRecommender
from scipy.sparse import csr_matrix


def fit_cosin_recommender(user_item_matrix):
    """Trains a model that recommends products from among the products
    the user has already bought."""
    cosin_recommender = CosineRecommender(K=2, num_threads=0)
    cosin_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())
    return cosin_recommender
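# A minimal, hypothetical usage sketch for fit_cosin_recommender above: the
# user_item matrix and its contents are invented for illustration. Note the
# .T inside fit_cosin_recommender, which matches the older implicit API that
# expected an item-user matrix at fit time.
import numpy as np

user_item = np.array([
    [2, 0, 1],   # user 0: bought item 0 twice, item 2 once
    [0, 3, 0],   # user 1: bought item 1 three times
    [1, 1, 0],   # user 2: bought items 0 and 1 once each
])
model = fit_cosin_recommender(user_item)
# With K=2 only the two nearest neighbours of each item are kept, so
# similar_items returns at most two (item, score) pairs per query
print(model.similar_items(0, 2))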
import logging

import numpy as np
import scipy.sparse as sp
from implicit.nearest_neighbours import CosineRecommender

logger = logging.getLogger(__name__)


class BaseItemItemRecommenderModel:
    def __init__(self, products: np.ndarray, params: dict):
        self.recommender = CosineRecommender(**params)
        self._product_idx = dict(zip(products, range(len(products))))
        self._idx_product = products.tolist()

    def _make_user_item_csr_row(self, values, item_idx):
        row = sp.coo_matrix(
            (values,
             (np.zeros(len(values)),
              [self._product_idx[p] for p in item_idx])),
            shape=(1, len(self._product_idx)))
        return row.tocsr()

    def _create_user_item_matrix_from_purchases(self, purchases):
        clients = purchases.client_id.unique()
        clients_mapper = dict(zip(clients, range(len(clients))))
        user_item_matrix = sp.coo_matrix(
            (purchases.relevance.values,
             (purchases.client_id.map(clients_mapper).values,
              purchases.product_id.map(self._product_idx).values)),
            # pin the shape so rows built by _make_user_item_csr_row line up
            # with the trained model even when some products were never bought
            shape=(len(clients), len(self._product_idx)))
        user_item_matrix = user_item_matrix.tocsr()
        user_item_matrix.eliminate_zeros()
        return user_item_matrix

    def _fit_recommender(self, purchases):
        user_item_matrix = self._create_user_item_matrix_from_purchases(
            purchases)
        logger.debug('Training Recommender Model...')
        self.recommender.fit(user_item_matrix.T)
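# A small smoke test for BaseItemItemRecommenderModel with made-up purchase
# data; the column names (client_id, product_id, relevance) are the ones the
# class itself expects, everything else here is illustrative.
import pandas as pd

products = np.array(['p1', 'p2', 'p3'])
base_model = BaseItemItemRecommenderModel(products, params={'K': 2})
purchases = pd.DataFrame({
    'client_id': ['c1', 'c1', 'c2', 'c2'],
    'product_id': ['p1', 'p2', 'p2', 'p3'],
    'relevance': [1.0, 2.0, 1.0, 1.0],
})
base_model._fit_recommender(purchases)
# A single-user row always spans the full product vocabulary
row = base_model._make_user_item_csr_row(values=[1.0], item_idx=['p1'])
print(row.shape)  # (1, 3)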
import logging
import time

from implicit.nearest_neighbours import CosineRecommender


def calculate_distance_matrix(dataset):
    logging.debug("Calculating similar items matrix. This might take a while")

    # generate a recommender model based off the input params
    model = CosineRecommender()

    # train the model
    logging.debug("calculating distance matrix")
    start = time.time()
    model.fit(dataset)
    similarity_matrix = model.similarity
    logging.debug("trained model '%s' in %s", 'cosine', time.time() - start)
    return similarity_matrix
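# Quick check of calculate_distance_matrix on a toy sparse matrix. The data
# is illustrative; under the older implicit API assumed here, fit receives an
# item-user matrix, so rows are the items being compared.
from scipy.sparse import csr_matrix
import numpy as np

item_user = csr_matrix(np.array([
    [1.0, 0.0, 2.0],   # item 0
    [0.0, 1.0, 1.0],   # item 1
    [1.0, 0.0, 2.0],   # item 2, identical to item 0
]))
similarity = calculate_distance_matrix(item_user)
print(similarity.shape)   # square matrix, one row/column per item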
def calculate_similar_movies(output_filename, model_name="als",
                             min_rating=4.0, variant="20m"):
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)

    log.debug("calculating top movies")
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance
                # 'Graffiti Bridge' has no ratings > 4, meaning we've
                # filtered out all data for it)
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
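# Hypothetical invocation of the function above; get_movielens comes from
# implicit.datasets.movielens and downloads/caches the chosen variant. The
# output path is illustrative.
calculate_similar_movies("similar-movies.tsv", model_name="cosine",
                         min_rating=4.0, variant="20m")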
def __init__(self, products: pd.DataFrame, params_rec: dict,
             params_catboost: dict, catboost_features=CB_FEATURES):
    self.ranker = catboost.CatBoost(params_catboost)
    self._catboost_features = catboost_features
    self._nan_fill_dict = dict()
    self.recommender = CosineRecommender(**params_rec)
    self._product_idx = dict(zip(products.product_id, range(len(products))))
    self._idx_product = products.product_id.tolist()
    self._product_features = {
        row['product_id']: dict(row.drop(index='product_id'))
        for (i, row) in products.iterrows()
    }
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    logging.debug("calculating top movies")
    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over (for instance
            # 'Graffiti Bridge' has no ratings > 4, meaning we've filtered
            # out all data for it)
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    """
    :param input_path: path to the training dataset
    :param output_filename: name of the output file
    :param model_name: model to use
    :param min_rating: minimum rating threshold used for filtering
    :return:
    """
    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    if model_name == "als":
        model = AlternatingLeastSquares()
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender()
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    logging.debug("Training model: %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    logging.debug("calculating top movies")
    user_count = rating_data.groupby("movieId").size()
    movie_lookup = dict((i, m) for i, m in
                        zip(movies_data['movieId'], movies_data['title']))
    to_generate = sorted(list(movies_data['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def calculate_similar_beers(input_path, output_filename, model_name="cosine"):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, beers, m = read_data(input_path)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    logging.debug("calculating top beers")
    user_count = ratings.groupby('beerId').size()
    beer_lookup = dict((i, m) for i, m in zip(beers['beerId'], beers['name']))
    to_generate = sorted(list(beers['beerId']),
                         key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for beerId in to_generate:
            if m.indptr[beerId] == m.indptr[beerId + 1]:
                continue
            beer = beer_lookup[beerId]
            for other, score in model.similar_items(beerId, 11):
                o.write("%s,%s,%s\n" % (beer, beer_lookup[other], score))
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    logging.debug("calculating top movies")
    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def calculate_recommendations(train_filename, test_filename, output_filename,
                              dir, model_name="als", factors=80,
                              regularization=0.8, iterations=10, exact=False,
                              use_native=True, dtype=numpy.float64, cg=False):
    logging.debug("Calculating similar items. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", dir + train_filename)
    start = time.time()
    df, cnts = read_data(dir + train_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based on the input params
    if model_name == "als":
        if exact:
            model = AlternatingLeastSquares(factors=factors,
                                            regularization=regularization,
                                            use_native=use_native,
                                            use_cg=cg,
                                            iterations=iterations,
                                            dtype=dtype)
        else:
            model = AnnoyAlternatingLeastSquares(factors=factors,
                                                 regularization=regularization,
                                                 use_native=use_native,
                                                 use_cg=cg,
                                                 iterations=iterations,
                                                 dtype=dtype)

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        cnts = bm25_weight(cnts, K1=100, B=0.8)
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(cnts)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    test_data = pandas.read_csv(test_filename, sep="\t", usecols=[0, 1, 2],
                                names=['user', 'item', 'cnt'])
    test_data = test_data.groupby(["user", "item"], as_index=False).sum()
    users_test = set(test_data['user'])
    users_train = set(df['user'])

    # position is important for recommendation list and actual list
    dict_actual = {}
    for user in users_test:
        if user not in users_train:
            continue
        matched_df = test_data.loc[test_data["user"] == user]
        # DataFrame.sort was removed from pandas; sort_values replaces it
        matched_df = matched_df.sort_values("cnt", ascending=False)
        dict_actual[user] = list(matched_df["item"])

    user_items = cnts.T.tocsr()

    # recommend items for a user
    dict_recommended = {}  # for computing MAP and MP
    for user in users_test:
        if user not in users_train:
            continue
        recommendations = model.recommend(user, user_items)
        df = pandas.DataFrame(recommendations, columns=["item", "score"])
        dict_recommended[user] = list(df["item"])

    ndcg = NDCG(dict_actual, dict_recommended)
    err = ERR(dict_actual, dict_recommended)
    map = MAP(dict_actual, dict_recommended)
    mp = MP(dict_actual, dict_recommended)

    with open("%siALS_result_%s.txt" % (dir, train_filename), "w") as o:
        o.write("NDCG\tERR\tMAP\tMP\n")
        o.write("%s\t%s\t%s\t%s\n" % (ndcg, err, map, mp))

    return (ndcg, err, map, mp)
train_pairs[user_column] = leusers.fit_transform(train_pairs[user_column])
leservices = LabelEncoder()
train_pairs[item_column] = leservices.fit_transform(train_pairs[item_column])
test_pairs[user_column] = leusers.transform(test_pairs[user_column])
test_pairs[item_column] = leservices.transform(test_pairs[item_column])

n_users = len(leusers.classes_)
n_items = len(leservices.classes_)
sparse_matrix = csr_matrix(
    (np.ones(len(train_pairs)),
     (train_pairs[user_column], train_pairs[item_column])),
    shape=(n_users, n_items)
)

model = CosineRecommender()
model.fit(sparse_matrix.T)

print('saving artifacts')
with open('leservices.pkl', 'wb') as f:
    pickle.dump(leservices, f)
with open('kdf_rec.pkl', 'wb') as f:
    pickle.dump(model, f)
scipy.sparse.save_npz('sparse_kdf.npz', sparse_matrix)

services_df_for_save = services_df[
    services_df.id_clustered.isin(
        leservices.inverse_transform(train_pairs[item_column]))
].reset_index(drop=True)
services_df_for_save = services_df_for_save.drop_duplicates(
    'id_clustered').reset_index(drop=True)
services_df_for_save['id_enc_cluster'] = leservices.transform(
    services_df_for_save.id_clustered)
most_popular_items = Counter(train_df['id_clustered'])
services_df_for_save['popularity'] = services_df_for_save.id_enc_cluster.apply(
    most_popular_items.get)
services_df.to_csv('services.csv', index=False)
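# A sketch of reading the artifacts saved above back in and querying the
# model. The user index is illustrative, and the (item, score) unpacking
# assumes the older implicit recommend API that returns a list of tuples.
import pickle
import scipy.sparse

with open('leservices.pkl', 'rb') as f:
    leservices = pickle.load(f)
with open('kdf_rec.pkl', 'rb') as f:
    model = pickle.load(f)
sparse_matrix = scipy.sparse.load_npz('sparse_kdf.npz').tocsr()

user_idx = 0
recs = model.recommend(user_idx, sparse_matrix, N=10)
print(leservices.inverse_transform([item for item, score in recs]))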
issues_train = issues_train.groupby(
    ['reader_id', 'author'])['record_id'].count().reset_index()
issues_test = issues_test.groupby(
    ['reader_id', 'author'])['record_id'].count().reset_index()

train_matrix = csr_matrix(
    (issues_train['record_id'],
     (issues_train['reader_id'], issues_train['author'])),
    shape=(n_readers, n_items)).astype('float64')
test_matrix = csr_matrix(
    (issues_test['record_id'],
     (issues_test['reader_id'], issues_test['author'])),
    shape=(n_readers, n_items)).astype('float64')

model = CosineRecommender()
model.fit(train_matrix.T)

author_top_items = get_authors_items(issues_prepared)
similar_author_recommender = SimilarAuthorRecommender(model, train_matrix)
author_top_items_recommender = AuthorTopItemsRecommender(
    similar_author_recommender, author_top_items, None)
wrapper = RecommenderWrapper(user_encoder=user_lc,
                             item_encoder=item_lc,
                             model=author_top_items_recommender)
dump_pickle(wrapper, AUTHOR_RECOMMENDER_PATH)
def calculate_similar_movies(input_filename, output_filename,
                             model_name="als", min_rating=4.0, variant='20m'):
    # read in the input data file
    start = time.time()
    # titles, ratings = get_movielens(variant)
    user_item_df = read_user_item_data(input_filename)
    print(user_item_df)
    unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto(
        user_item_df)
    # user_item_df = user_item_df.sort_values(by=['user_index', 'item_index'])
    user_item_ratings = scipy.sparse.csr_matrix(
        (user_item_df['score'],
         (user_item_df['item_index'], user_item_df['user_index'])))
    print(user_item_ratings)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only -- disabled here:
    # ratings.data[ratings.data < min_rating] = 0
    # ratings.eliminate_zeros()
    # ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares(
            factors=128, regularization=0.01, use_native=True,
            iterations=20, calculate_training_loss=True)
        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        # ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_item_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)

    log.debug("calculating top movies")
    k = 10
    iterations = 10000
    similar_df_gen = similar_to_csv(model, k, unique_item, iterations)
    with tqdm.tqdm(total=len(unique_item) // iterations + 1) as progress:
        for similar_df_slice in similar_df_gen:
            similar_df_slice.to_csv(output_filename, mode='a',
                                    header=False, index=False)
            print("finish a batch")
            progress.update(1)
def __init__(self, products: np.ndarray, params: dict):
    self.cosine_recommender = CosineRecommender(**params)
    self._product_idx = dict(zip(products, range(len(products))))
    self._idx_product = products.tolist()
def calculate_similar_artists(input_filename, output_filename,
                              model_name="als", factors=50,
                              regularization=0.01, iterations=15, exact=False,
                              use_native=True, dtype=numpy.float64, cg=False):
    logging.debug("Calculating similar artists. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        if exact:
            model = AlternatingLeastSquares(factors=factors,
                                            regularization=regularization,
                                            use_native=use_native,
                                            use_cg=cg, dtype=dtype,
                                            iterations=iterations)
        else:
            model = AnnoyAlternatingLeastSquares(factors=factors,
                                                 regularization=regularization,
                                                 use_native=use_native,
                                                 use_cg=cg, dtype=dtype,
                                                 iterations=iterations)

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # write out similar artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    # write out as a TSV of artistid, otherartistid, score
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in model.similar_items(artistid, 11):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
def train(alpha_=None, beta_=None):
    logger.info('Running ...')

    # Load all types of interactions, movies catalogue and test users
    transactions = c.data_interim.join('transactions.pkl').load()
    bookmarks = c.data_interim.join('bookmarks.pkl').load()
    ratings = c.data_interim.join('ratings.pkl').load()
    catalogue = c.data_interim.join('catalogue.pkl').load()
    # assumed file name; the original re-loaded 'catalogue.pkl' here, which
    # cannot provide the test_users['users'] column used below
    test_users = c.data_interim.join('test_users.pkl').load()
    logger.info('Data loaded')

    # Train/test split
    transactions_train, transactions_test = train_test_split_on_date(transactions)
    bookmarks_train, bookmarks_test = train_test_split_on_date(bookmarks)
    ratings_train, ratings_test = train_test_split_on_date(ratings)
    logger.info('Train/test split completed')

    # Processing
    ratings_train = to_universal_df_view(ratings_train)
    bookmarks_train = to_universal_df_view(bookmarks_train)
    transactions_train = to_universal_df_view(transactions_train)
    # ... and combining into a single dataframe
    all_interaction_train = combine_interaction_types(
        ratings_train, bookmarks_train, transactions_train)

    # Processing
    ratings = prepare_df_with_interactions(to_universal_df_view(ratings))
    bookmarks = prepare_df_with_interactions(to_universal_df_view(bookmarks))
    transactions = prepare_df_with_interactions(to_universal_df_view(transactions))
    # ... and combining into a single dataframe
    all_interaction = pd.concat([ratings, bookmarks, transactions])
    all_interaction = all_interaction.reset_index(drop=True)
    all_interaction = all_interaction.drop_duplicates(
        subset=['user_id', 'item_id']).reset_index(drop=True)

    unique_items = set(catalogue.element_uid.unique())

    # Creating global csr matrix
    csr, dict_of_users, dict_of_items = create_interaction_matrix(
        all_interaction, unique_items)
    assert np.all([k == v for k, v in dict_of_items.items()])
    logger.info('Csr_matrix with all interactions created')

    interaction_train = prepare_df_with_interactions(transactions_train)
    interaction_test = prepare_df_with_interactions(transactions_test)
    real_int_train_csr, real_int_test_csr = create_interaction_matrices(
        interaction_train, interaction_test, dict_of_users, dict_of_items,
        logger=None)
    logger.info('Data preparation finished')

    # count true labels for validation
    # validation dictionary for train data and test data
    train_true_dict = csr_to_dict(real_int_train_csr)
    test_true_dict = csr_to_dict(real_int_test_csr)
    # and an example with set inside for time optimization during filtering
    train_true_dict_set = {k: set(v) for k, v in train_true_dict.items()}
    # test_true_dict_set = {k: set(v) for k, v in test_true_dict.items()}

    # ------------------------------------------------------------------ #
    # Recency function parameter search
    if (alpha_ is None) and (beta_ is None):
        # Prepare attributes for recency function
        all_interaction_train['time_scaled'] = minmax_scale(all_interaction_train.ts)

        # All operations are performed on the train set
        all_interaction_train = all_interaction_train.merge(
            all_interaction_train.groupby('element_uid').time_scaled.min()
            .reset_index()
            .rename({'time_scaled': 'element_launch_ts'}, axis=1))
        all_interaction_train['seen_ts_since_launch'] = (
            all_interaction_train['time_scaled']
            - all_interaction_train['element_launch_ts'])
        all_interaction_train = all_interaction_train[
            ['element_uid', 'user_uid', 'element_launch_ts',
             'seen_ts_since_launch']]

        # takes parameters to search
        alpha_par = config_recency['grid_params']['alpha']
        beta_par = config_recency['grid_params']['beta']
        alpha_range_params = np.arange(alpha_par['min'], alpha_par['max'],
                                       alpha_par['step'])
        beta_range_params = np.arange(beta_par['min'], beta_par['max'],
                                      beta_par['step'])
        iters = [alpha_range_params, beta_range_params]
        all_variants = list(itertools.product(*iters))
        np.random.shuffle(all_variants)
        logger.info('Starting search ...')

        # -------------------------------------------------------------- #
        for element in all_variants:
            alpha_ = element[0]
            beta_ = element[1]
            interaction_train_ = recency_function(all_interaction_train,
                                                  alpha_, beta_)
            train_csr, test_csr = create_interaction_matrices(
                interaction_train_, interaction_test, dict_of_users,
                dict_of_items, logger=None)
            train_true_dict_set = {k: set(v) for k, v in train_true_dict.items()}

            model = CosineRecommender(K=10200)
            model.fit(train_csr.T, show_progress=False)

            # without filtering in model
            test_predict = {}
            for id_ in tqdm(np.unique(test_csr.nonzero()[0])):
                test_predict[id_] = model.recommend(
                    id_, train_csr, N=300, filter_already_liked_items=False)
            test_predict = {k: [x[0] for x in v]
                            for k, v in test_predict.items()}
            # get rid of movies watched in train
            test_predict = {
                k: [x for x in v if x not in train_true_dict_set.get(k, [])][:20]
                for k, v in tqdm(test_predict.items())}

            mapk = metric.metric(test_true_dict, test_predict)
            logger.info('alpha = {0}, beta = {1}, mnap@20 = {2}'.format(
                alpha_, beta_, mapk))

            # dump mlflow params
            run = mlflow.start_run(experiment_id=0)
            mlflow.set_tag("tag", "Implicit_with_recency")
            mlflow.log_param('lib', 'implicit')
            mlflow.log_param('feedbacks_mode', 'implicit')
            mlflow.log_param('type', 'CF')
            # search-related params
            mlflow.log_param('alpha', alpha_)
            mlflow.log_param('beta', beta_)
            mlflow.log_metric('MNAP_at_20_test', mapk)
            mlflow.end_run()
    # ------------------------------------------------------------------ #
    # Pruning parameter search
    else:
        pruning_range = config_recency['grid_params']['max_len']
        all_interaction_train = all_interaction_train.sort_values(
            by=['user_uid', 'ts'], ascending=False)
        for max_len in range(pruning_range['min'], pruning_range['max'],
                             pruning_range['step']):
            # ---------------------------------------------------------- #
            logger.info('max_len = {}'.format(max_len))
            all_interaction_train_pruned = (
                all_interaction_train.groupby('user_uid')
                .apply(lambda x: x[:max_len])
                .reset_index(drop=True))

            # All operations are performed on the train set
            all_interaction_train_pruned = all_interaction_train_pruned.merge(
                all_interaction_train_pruned.groupby('element_uid')
                .time_scaled.min().reset_index()
                .rename({'time_scaled': 'element_launch_ts'}, axis=1))
            all_interaction_train_pruned['seen_ts_since_launch'] = (
                all_interaction_train_pruned['time_scaled']
                - all_interaction_train_pruned['element_launch_ts'])
            all_interaction_train_pruned = all_interaction_train_pruned[
                ['element_uid', 'user_uid', 'element_launch_ts',
                 'seen_ts_since_launch']]

            inv_test_users = [dict_of_users.get(k, None)
                              for k in test_users['users']]
            inv_test_users = [k for k in inv_test_users if k is not None]
            inv_test_users = set(inv_test_users)
            test_true_dict_50k = {k: v for k, v in test_true_dict.items()
                                  if k in inv_test_users}

            interaction_train_ = recency_function(all_interaction_train_pruned,
                                                  int(alpha_), int(beta_))
            train_csr, test_csr = create_interaction_matrices(
                interaction_train_, interaction_test, dict_of_users,
                dict_of_items, logger=None)

            model = CosineRecommender(K=10200)
            model.fit(train_csr.T, show_progress=False)

            # without filtering in model
            test_predict = {}
            for id_ in tqdm(inv_test_users):
                test_predict[id_] = model.recommend(
                    id_, train_csr, N=300, filter_already_liked_items=False)
            test_predict = {k: [x[0] for x in v]
                            for k, v in test_predict.items()}
            # get rid of movies watched in train
            test_predict = {
                k: [x for x in v if x not in train_true_dict_set.get(k, [])][:20]
                for k, v in tqdm(test_predict.items())}

            mapk = metric.metric(test_true_dict_50k, test_predict)
            logger.info('mapk = {}'.format(mapk))

            # dump mlflow params
            run = mlflow.start_run(experiment_id=1)
            mlflow.set_tag("tag", "Implicit_with_pruning")
            mlflow.log_param('lib', 'implicit')
            mlflow.log_param('feedbacks_mode', 'implicit')
            mlflow.log_param('type', 'CF')
            # search-related params
            mlflow.log_param('max_len', max_len)
            mlflow.log_metric('MNAP_at_20_test', mapk)
            mlflow.end_run()
def calculate_similar_artists(input_filename, output_filename, model="als",
                              factors=50, regularization=0.01, iterations=15,
                              exact=False, trees=20, use_native=True,
                              dtype=numpy.float64, cg=False):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    start = time.time()
    if model == "als":
        logging.debug("weighting matrix by bm25")
        weighted = bm25_weight(plays, K1=100, B=0.8)

        logging.debug("calculating factors")
        artist_factors, user_factors = alternating_least_squares(
            weighted, factors=factors, regularization=regularization,
            iterations=iterations, use_native=use_native, dtype=dtype,
            use_cg=cg)
        logging.debug("calculated factors in %s", time.time() - start)

        if exact:
            calc = TopRelated(artist_factors)
        else:
            calc = ApproximateTopRelated(artist_factors, trees)

        logging.debug("writing top related to %s", output_filename)
        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in calc.get_related(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
    elif model in ("bm25", "tfidf", "cosine", "smoothed_cosine", "ochiai",
                   "overlap"):
        if model == "bm25":
            scorer = BM25Recommender(K1=100, B=0.5)
        elif model == "tfidf":
            scorer = TFIDFRecommender()
        elif model == "cosine":
            scorer = CosineRecommender()
        else:
            raise NotImplementedError("TODO: model %s" % model)

        logging.debug("calculating similar items")
        start = time.time()
        scorer.fit(plays, K=11)
        logging.debug("calculated all_pairs_knn in %s", time.time() - start)

        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in scorer.similar_items(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
class RetailHeroRecommender(BaseItemItemRecommenderModel):
    def __init__(self, products: pd.DataFrame, params_rec: dict,
                 params_catboost: dict, catboost_features=CB_FEATURES):
        self.ranker = catboost.CatBoost(params_catboost)
        self._catboost_features = catboost_features
        self._nan_fill_dict = dict()
        self.recommender = CosineRecommender(**params_rec)
        self._product_idx = dict(zip(products.product_id, range(len(products))))
        self._idx_product = products.product_id.tolist()
        self._product_features = {
            row['product_id']: dict(row.drop(index='product_id'))
            for (i, row) in products.iterrows()
        }

    def _cat_features(self):
        return (
            'gender', 'level_1', 'level_2', 'level_3', 'level_4',
            'product_id', 'is_alcohol', 'brand_id', 'store_id',
            'vendor_id', 'segment_id', 'is_own_trademark',
        )

    def _fillna(self, df):
        for feature, fill_value in self._nan_fill_dict.items():
            df.loc[:, feature] = df.loc[:, feature].fillna(fill_value)
        return df

    def _fit_ranker(self, train, valid=None):
        features = self._catboost_features
        cat_features = self._cat_features()
        cat_inds = [i for i, col in enumerate(features) if col in cat_features]
        for feature in features:
            if feature in cat_features:
                self._nan_fill_dict[feature] = 'unknown'
            else:
                self._nan_fill_dict[feature] = np.nanmedian(train[feature])
        train = self._fillna(train)
        logger.debug(f'Train shape: {train.shape}')
        logger.debug(f'Train target mean: {train.target.mean()}')
        for feature, nuniques in train[features].nunique().to_dict().items():
            logger.debug(f'{feature} has {nuniques} values')
        train_pool = catboost.Pool(data=train[features],
                                   label=train['target'],
                                   weight=train.weight,
                                   cat_features=cat_inds,
                                   group_id=train['client_id'])
        if valid is not None:
            valid = self._fillna(valid)
            val_pool = catboost.Pool(data=valid[features],
                                     label=valid['target'],
                                     weight=valid.weight,
                                     cat_features=cat_inds,
                                     group_id=valid['client_id'])
        else:
            val_pool = None
        logger.debug('Training Ranker Model...')
        self.ranker.fit(train_pool, eval_set=val_pool,
                        early_stopping_rounds=100)

    def train_model(self, train_rec: pd.DataFrame, train_ranker: pd.DataFrame,
                    ranker_labels_dict: dict):
        self._fit_recommender(train_rec)
        cb_feats_df = train_ranker.sort_values(
            by=['client_id', 'transaction_datetime'])
        cb_feats_df = cb_feats_df.drop_duplicates(
            subset=['client_id', 'product_id'], keep='last')

        logger.debug('Preparing Train data for Ranker Model...')
        implicit_preds = []
        for cid, df in train_ranker.groupby('client_id'):
            csr_row = self._make_user_item_csr_row(
                values=df.relevance.values, item_idx=df.product_id.values)
            pred = self.recommender.recommend(
                0, csr_row, N=30, recalculate_user=True,
                filter_already_liked_items=False)
            for (i, (idx, score)) in enumerate(pred):
                implicit_preds.append({
                    'client_id': cid,
                    'score': score,
                    'product_id': self._idx_product[idx],
                    'weight': len(pred) - i,
                    'target': int(self._idx_product[idx] in ranker_labels_dict[cid]),
                })
        logger.debug('Finished preparing Train data')

        len_before = len(implicit_preds)
        implicit_preds = pd.DataFrame(implicit_preds)
        cb_feats_df = pd.merge(implicit_preds, cb_feats_df,
                               on=['client_id', 'product_id'], how='left')
        assert len(cb_feats_df) == len_before, 'Shape after merge is different'

        uniq_clients = cb_feats_df.client_id.unique()
        train = cb_feats_df[cb_feats_df.client_id.isin(uniq_clients[:8000])]
        valid = cb_feats_df[cb_feats_df.client_id.isin(uniq_clients[8000:])]
        self._fit_ranker(train, valid=valid)

    def recommend(self, products_counter: dict, histdata_products: dict):
        user_item_csr_row = self._make_user_item_csr_row(
            values=list(products_counter.values()),
            item_idx=products_counter.keys())
        rec_preds = self.recommender.recommend(
            0, user_item_csr_row, N=30, recalculate_user=True,
            filter_already_liked_items=False)

        data_list = []
        nan_hist_data = dict(
            store_id=np.nan,
            regular_points_received=np.nan,
            express_points_received=np.nan,
        )
        for i, score in rec_preds:
            pid = self._idx_product[i]
            row_dic = dict(
                product_id=pid,
                score=score,
                age=histdata_products['age'],
                gender=histdata_products['gender'],
            )
            row_dic.update(self._product_features[pid])
            row_dic.update(histdata_products.get(pid, nan_hist_data))
            data_list.append(row_dic)

        preds_df = pd.DataFrame(data_list)
        preds_df = self._fillna(preds_df)
        preds_df.loc[:, 'catb_score'] = self.ranker.predict(
            preds_df[self._catboost_features])
        result = preds_df.sort_values(
            by='catb_score', ascending=False).product_id.tolist()
        if len(result) < 30:
            for t_prod in TOP_PRODUCTS:
                if t_prod not in result:
                    result.append(t_prod)
        return result[:30]