def fit_bm25_recommender(user_item_matrix):
    """Train a model that recommends items from among those the user bought.

    The input is wrapped in a CSR matrix, transposed and converted back to
    CSR before it is passed to ``BM25Recommender.fit``.

    Returns:
        The fitted ``BM25Recommender`` instance.
    """
    model = BM25Recommender(K=6, K1=1.2, B=.76, num_threads=0)
    transposed = csr_matrix(user_item_matrix).T.tocsr()
    model.fit(transposed)
    return model
def main():
    """Evaluate a BM25 and an ALS model, blend them and write ``sub.csv``.

    Command-line arguments supply the seed, the data location and the model
    hyper-parameters.  The blended predictions are written with one row per
    user: ``user_id`` plus a space-separated ``predictions`` string.
    """
    args = get_args()
    set_seeds(args.seed)

    data = read_data(root=args.data_path)

    # The evaluation gets its own deep copy of the data.
    data_copy = c.deepcopy(data)
    models = [
        BM25Recommender(K=args.bm25_k),
        AlternatingLeastSquares(factors=args.als_factors,
                                iterations=args.als_iters),
    ]
    result = evaluate(data=data_copy,
                      smoothing=args.smoothing,
                      models_list=models,
                      r=args.rating,
                      N=args.top_k)

    RATES = [5.5, 6]
    TOP_K = 100
    predictions = mix_solutions(result=result,
                                rates=RATES,
                                pictures_num_to_leave=TOP_K)

    # One row per user, one column per prediction position.
    test_users = pd.DataFrame.from_dict(predictions).T.reset_index()
    test_users = test_users.rename(columns={'index': 'user_id'})
    test_users = test_users.sort_values('user_id')

    position_columns = list(range(TOP_K))
    test_users['predictions'] = test_users[position_columns].apply(
        lambda row: ' '.join(str(value) for value in row), axis=1)

    submission = test_users[['user_id', 'predictions']]
    submission.to_csv('sub.csv', index=False)
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Fit a recommender on MovieLens and write similar movies as a TSV.

    Ratings below ``min_rating`` are removed and the rest are set to 1, so
    the data is treated as a binary preference.  Movies are processed in
    descending order of their number of retained ratings; for each movie
    with at least one retained rating, ``model.similar_items(movieid, 11)``
    is written as ``title<TAB>other title<TAB>score`` (UTF-8).

    Raises:
        NotImplementedError: if ``model_name`` is not a supported model.
    """
    load_started = time.time()
    titles, ratings = get_movielens(variant)

    # Drop everything under the threshold, then binarise what is left.
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - load_started)

    # Pick the model class from the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # ALS is the only model trained on a BM25-weighted matrix.
        log.debug("weighting matrix by bm25_weight")
        weighted = bm25_weight(ratings, B=0.9) * 5
        ratings = weighted.tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    log.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - fit_started)

    log.debug("calculating top movies")
    # Stored entries per row of the CSR matrix.
    user_count = np.ediff1d(ratings.indptr)

    def negative_count(index):
        return -user_count[index]

    to_generate = sorted(np.arange(len(titles)), key=negative_count)

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress, \
            codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # A row with no stored entries had all its ratings filtered out
            # (for instance 'Graffiti Bridge' has no ratings > 4).
            row_has_data = ratings.indptr[movieid] != ratings.indptr[movieid + 1]
            if row_has_data:
                title = titles[movieid]
                o.writelines(
                    "%s\t%s\t%s\n" % (title, titles[other], score)
                    for other, score in model.similar_items(movieid, 11))
            progress.update(1)
def get_model(self):
    """Log the initialization and build a BM25 recommender.

    The recommender is constructed from the keyword arguments stored in
    ``self.model_params``.
    """
    self.app_logger.info(msg='Initializing the nearest neighbors model')
    params = self.model_params
    return BM25Recommender(**params)
def calculate_similar_movies(input_path, output_filename, model_name="als", min_rating=4.0):
    """Fit a recommender on the data at ``input_path`` and write similar movies.

    Movies are processed in descending order of their rating count.  A movie
    whose matrix row holds no entries is skipped.  Every other movie gets its
    ``model.similar_items(movieid, 11)`` results written to
    ``output_filename`` as ``title<TAB>other title<TAB>score`` (UTF-8).

    Raises:
        NotImplementedError: if ``model_name`` is not a supported model.
    """
    logging.debug("reading data from %s", input_path)
    load_started = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - load_started)

    # Pick the model class from the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # ALS is the only model trained on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - fit_started)

    logging.debug("calculating top movies")
    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(zip(movies['movieId'], movies['title']))

    def negative_count(movie_id):
        # Movies without any rating sort last.
        return -user_count.get(movie_id, 0)

    to_generate = sorted(movies['movieId'], key=negative_count)

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # Only rows that still hold data are written (for instance
            # 'Graffiti Bridge' has no ratings > 4, so its row is empty).
            if m.indptr[movieid] != m.indptr[movieid + 1]:
                movie = movie_lookup[movieid]
                o.writelines(
                    "%s\t%s\t%s\n" % (movie, movie_lookup[other], score)
                    for other, score in model.similar_items(movieid, 11))
def calculate_similar_movies(input_path, output_filename, model_name="als", min_rating=4.0):
    """Fit a recommender and write the similar movies of every movie.

    :param input_path: path of the training data set
    :param output_filename: name of the output file
    :param model_name: model to use ("als", "tfidf", "cosine" or "bm25")
    :param min_rating: threshold used for filtering, forwarded to
        ``read_data``
    :return: None; rows of ``title<TAB>other title<TAB>score`` are written to
        ``output_filename``
    :raises NotImplementedError: if ``model_name`` is not supported
    """
    # Local import so the block does not rely on a file-level import.
    import codecs

    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    if model_name == "als":
        model = AlternatingLeastSquares()
        # ALS is the only model trained on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender()
    else:
        raise NotImplementedError("TODU: model %s" % model_name)

    m = m.tocsr()
    # Let logging do the formatting, like every other call in this function.
    logging.debug("Training model :%s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    logging.debug("calculating top movies")
    user_count = rating_data.groupby("movieId").size()
    movie_lookup = dict(zip(movies_data['movieId'], movies_data['title']))
    # Most-rated movies first; movies without ratings count as 0.
    to_generate = sorted(movies_data['movieId'],
                         key=lambda x: -user_count.get(x, 0))

    # Titles can contain non-ASCII characters, so the encoding is fixed to
    # UTF-8 instead of depending on the platform default.
    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # An empty matrix row means there is nothing to compare against.
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def calculate_similar_beers(input_path, output_filename, model_name="cosine"):
    """Fit a recommender on the data at ``input_path`` and write similar beers.

    Beers are processed in descending order of their rating count.  A beer
    whose matrix row holds no entries is skipped.  Every other beer gets its
    ``model.similar_items(beerId, 11)`` results written to
    ``output_filename`` as ``name,other name,score`` lines.

    Raises:
        NotImplementedError: if ``model_name`` is not a supported model.
    """
    logging.debug("reading data from %s", input_path)
    load_started = time.time()
    ratings, beers, m = read_data(input_path)
    logging.debug("read data file in %s", time.time() - load_started)

    # Pick the model class from the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # ALS is the only model trained on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - fit_started)

    logging.debug("calculating top beers")
    user_count = ratings.groupby('beerId').size()
    beer_lookup = dict(zip(beers['beerId'], beers['name']))

    def negative_count(beer_id):
        # Beers without any rating sort last.
        return -user_count.get(beer_id, 0)

    to_generate = sorted(beers['beerId'], key=negative_count)

    with open(output_filename, "w") as o:
        for beerId in to_generate:
            # Only rows that still hold data are written.
            if m.indptr[beerId] != m.indptr[beerId + 1]:
                beer = beer_lookup[beerId]
                o.writelines(
                    "%s,%s,%s\n" % (beer, beer_lookup[other], score)
                    for other, score in model.similar_items(beerId, 11))
def load_recommender(item_to_item_model_file: str) -> ItemToItemRecommender:
    """Rebuild a BM25 item-to-item recommender from a saved NumPy archive.

    The archive provides the BM25 hyper-parameters (``model.K``,
    ``model.bm25.K1``, ``model.bm25.B``), the CSR components of the
    similarity matrix (``model.similarity.*``) and the ``user_labels`` /
    ``item_labels`` arrays.

    Args:
        item_to_item_model_file: path of the archive to load.

    Returns:
        An ``ItemToItemRecommender`` wrapping the restored model and labels.
    """
    log.info("Loading item to item bm25 model")

    # The object returned by np.load holds the file open until it is closed,
    # so every array is read inside the with-block and the handle is
    # released on exit.
    with np.load(item_to_item_model_file) as data:
        k = data['model.K'][0]
        k1 = data['model.bm25.K1'][0]
        b = data['model.bm25.B'][0]
        similarity = sparse.csr_matrix(
            (data['model.similarity.data'],
             data['model.similarity.indices'],
             data['model.similarity.indptr']),
            shape=data['model.similarity.shape'])
        user_labels = data['user_labels']
        item_labels = data['item_labels']

    model = BM25Recommender(K=k, K1=k1, B=b)
    # Restore the fitted state directly instead of calling fit().
    model.similarity = similarity
    model.scorer = NearestNeighboursScorer(model.similarity)

    return ItemToItemRecommender(model, user_labels, item_labels)
def calculate_similar_movies(input_path, output_filename, model_name="als", min_rating=4.0):
    """Fit a recommender on the data at ``input_path`` and write similar movies.

    Movies are processed in descending order of their rating count.  For
    each movie, ``model.similar_items(movieid, 11)`` is written to
    ``output_filename`` as ``title<TAB>other title<TAB>score`` (UTF-8).

    Args:
        input_path: location of the ratings data, forwarded to ``read_data``.
        output_filename: path of the TSV file to write.
        model_name: one of "als", "tfidf", "cosine" or "bm25".
        min_rating: rating threshold, forwarded to ``read_data``.

    Raises:
        NotImplementedError: if ``model_name`` is not a supported model.
    """
    # Local import so the block does not rely on a file-level import.
    import codecs

    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # Pick the model class from the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # ALS is the only model trained on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    logging.debug("calculating top movies")
    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(zip(movies['movieId'], movies['title']))
    # Most-rated movies first; movies without ratings count as 0.
    to_generate = sorted(movies['movieId'],
                         key=lambda x: -user_count.get(x, 0))

    # Titles can contain non-ASCII characters, so the encoding is fixed to
    # UTF-8 instead of depending on the platform default.
    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def calculate_recommendations(train_filename, test_filename, output_filename, dir,
                              model_name="als", factors=80, regularization=0.8,
                              iterations=10, exact=False, use_native=True,
                              dtype=numpy.float64, cg=False):
    """Train a recommender and score its recommendations on a test set.

    Args:
        train_filename: training file name, appended to ``dir``.
        test_filename: tab-separated test file with user, item and count in
            its first three columns.
        output_filename: not used by this function; results go to
            ``"<dir>iALS_result_<train_filename>.txt"``.
        dir: prefix prepended to ``train_filename`` and to the result file.
            The name shadows the builtin but is kept for compatibility.
        model_name: one of "als", "tfidf", "cosine" or "bm25".
        factors, regularization, iterations, use_native, dtype, cg:
            forwarded to the ALS model.
        exact: use ``AlternatingLeastSquares`` when true, otherwise
            ``AnnoyAlternatingLeastSquares``.

    Returns:
        The tuple ``(ndcg, err, map, mp)`` that is also written to the
        result file.

    Raises:
        NotImplementedError: if ``model_name`` is not a supported model.
    """
    logging.debug("Calculating similar items. This might take a while")

    train_path = dir + train_filename
    logging.debug("reading data from %s", train_path)
    start = time.time()
    df, cnts = read_data(train_path)
    logging.debug("read data file in %s", time.time() - start)

    # Pick the model class from the requested name.
    if model_name == "als":
        als_class = (AlternatingLeastSquares if exact
                     else AnnoyAlternatingLeastSquares)
        model = als_class(factors=factors, regularization=regularization,
                          use_native=use_native, use_cg=cg,
                          iterations=iterations, dtype=dtype)

        # ALS is the only model trained on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        cnts = bm25_weight(cnts, K1=100, B=0.8)
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(cnts)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    test_data = pandas.read_csv(test_filename, sep="\t", usecols=[0, 1, 2],
                                names=['user', 'item', 'cnt'])
    test_data = test_data.groupby(["user", "item"], as_index=False).sum()
    users_test = set(test_data['user'])
    users_train = set(df['user'])

    # Only users seen in training can be scored.
    evaluated_users = [user for user in users_test if user in users_train]

    # Position matters for both the actual and the recommended list.
    dict_actual = {}
    for user in evaluated_users:
        matched_df = test_data.loc[test_data["user"] == user]
        # DataFrame.sort() no longer exists in pandas; sort_values() returns
        # a sorted copy, which also avoids an in-place sort on a .loc slice.
        matched_df = matched_df.sort_values("cnt", ascending=False)
        dict_actual[user] = list(matched_df["item"])

    user_items = cnts.T.tocsr()

    # Recommended items per user, for computing the ranking metrics.
    dict_recommended = {}
    for user in evaluated_users:
        recommendations = model.recommend(user, user_items)
        recommended_df = pandas.DataFrame(recommendations,
                                          columns=["item", "score"])
        dict_recommended[user] = list(recommended_df["item"])

    ndcg = NDCG(dict_actual, dict_recommended)
    err = ERR(dict_actual, dict_recommended)
    map_score = MAP(dict_actual, dict_recommended)
    mp = MP(dict_actual, dict_recommended)

    with open("%siALS_result_%s.txt" % (dir, train_filename), "w") as o:
        o.write("NDCG\tERR\tMAP\tMP\n")
        o.write("%s\t%s\t%s\t%s\n" % (ndcg, err, map_score, mp))

    return (ndcg, err, map_score, mp)
def calculate_similar_movies(input_filename, output_filename, model_name="als",
                             min_rating=4.0, variant='20m'):
    """Fit a recommender on the scores in ``input_filename`` and write results.

    The interaction matrix is built with ``item_index`` as rows and
    ``user_index`` as columns.  Results come from ``similar_to_csv`` in
    batches, and each batch is appended to ``output_filename`` as CSV
    without header or index.

    Args:
        input_filename: source of the data, forwarded to
            ``read_user_item_data``.
        output_filename: CSV file the batches are appended to.
        model_name: one of "als", "bpr", "lmf", "tfidf", "cosine" or "bm25".
        min_rating: currently unused; the rating filter is disabled.
        variant: currently unused; kept for interface compatibility.

    Raises:
        NotImplementedError: if ``model_name`` is not a supported model.
    """
    start = time.time()
    user_item_df = read_user_item_data(input_filename)
    print(user_item_df)

    _unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto(
        user_item_df)
    user_item_ratings = scipy.sparse.csr_matrix(
        (user_item_df['score'],
         (user_item_df['item_index'], user_item_df['user_index'])))
    print(user_item_ratings)

    # NOTE: dropping ratings below min_rating and binarising the rest is
    # disabled here; the raw scores are used as they are.
    log.info("read data file in %s", time.time() - start)

    # Pick the model class from the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares(
            factors=128, regularization=0.01, use_native=True, iterations=20,
            calculate_training_loss=True)

        # NOTE: the BM25 weighting step is disabled; only this log message
        # is still emitted.
        log.debug("weighting matrix by bm25_weight")
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_item_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    k = 10
    batch_size = 10000
    similar_df_gen = similar_to_csv(model, k, unique_item, batch_size)

    with tqdm.tqdm(total=len(unique_item) // batch_size + 1) as progress:
        for similar_df_slice in similar_df_gen:
            # Write to the path given by the caller; this used to read a
            # global `args.outputfile` and ignore the parameter.
            similar_df_slice.to_csv(output_filename, mode='a', header=False,
                                    index=False)
            print("finsih a batch")
            progress.update(1)
def fit_bm25_recommender(user_item_matrix):
    """Fit and return a ``BM25Recommender`` for the given matrix.

    The matrix is converted to CSR, transposed and converted to CSR again
    before fitting.
    """
    recommender = BM25Recommender(K=6, K1=1.2, B=.76, num_threads=0)
    fit_input = csr_matrix(user_item_matrix).T.tocsr()
    recommender.fit(fit_input)
    return recommender
# regularization_levels = [0.001, 0.01, 0.1, 1, 10, 100, 1000] # num_factors_levels = [10, 50, 100] K1_levels = [10, 20, 50, 100, 200] B_levels = [0, 0.25, 0.5, 0.75, 1] filter_already_liked_items_levels = [True, False] K = 20 runs = [] for K1, B, filter_already_liked_items in itertools.product( K1_levels, B_levels, filter_already_liked_items_levels): print((K1, B, filter_already_liked_items)) start_time = time() model = BM25Recommender(K1, B) model.fit(implicit_matrix) brec = recommenders.MyBM25Recommender(model, implicit_matrix) brecs = brec.recommend_all( userids, K, u2i=u2i, n2i=n2i, i2p=i2p, filter_already_liked_items=filter_already_liked_items, ) print("Computing metrics...") metrics = wr.get_recs_metrics( histories_test, brecs,
def get_model(self):
    """Build a BM25 recommender from the keyword arguments in ``self.model_params``."""
    params = self.model_params
    return BM25Recommender(**params)
def calculate_similar_artists(input_filename, output_filename,
                              model_name="als",
                              factors=50, regularization=0.01,
                              iterations=15,
                              exact=False,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    """Fit a recommender on play counts and write similar artists as a TSV.

    Artists are processed in descending order of ``user_count``.  For each
    one, ``model.similar_items(artistid, 11)`` is written to
    ``output_filename`` as ``artist<TAB>other artist<TAB>score``.

    With ``model_name="als"``, ``exact`` selects ``AlternatingLeastSquares``
    over ``AnnoyAlternatingLeastSquares``, and the matrix is BM25-weighted
    before fitting.

    Raises:
        NotImplementedError: if ``model_name`` is not a supported model.
    """
    logging.debug("Calculating similar artists. This might take a while")

    logging.debug("reading data from %s", input_filename)
    load_started = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - load_started)

    # Pick the model class from the requested name.
    if model_name == "als":
        als_class = (AlternatingLeastSquares if exact
                     else AnnoyAlternatingLeastSquares)
        model = als_class(factors=factors, regularization=regularization,
                          use_native=use_native, use_cg=cg,
                          dtype=dtype, iterations=iterations)

        # ALS is the only model trained on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %s", model_name, time.time() - fit_started)

    # Order the artists for output.
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))

    def negative_count(artist_id):
        return -user_count[artist_id]

    to_generate = sorted(artists, key=negative_count)

    # One TSV row per (artist, similar artist, score) triple.
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            o.writelines(
                "%s\t%s\t%s\n" % (artist, artists[other], score)
                for other, score in model.similar_items(artistid, 11))
def calculate_similar_artists(input_filename, output_filename,
                              model="als",
                              factors=50, regularization=0.01,
                              iterations=15,
                              exact=False, trees=20,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    """Compute similar artists from play counts and write them as a TSV.

    Artists are processed in descending order of ``user_count``.  Each
    output row is ``artist<TAB>other artist<TAB>score``.

    Args:
        input_filename: source of the play data, forwarded to ``read_data``.
        output_filename: path of the TSV file to write.
        model: "als" factorises a BM25-weighted matrix; "bm25", "tfidf" and
            "cosine" fit the matching recommender.
        factors, regularization, iterations, use_native, dtype, cg:
            forwarded to ``alternating_least_squares``.
        exact: with "als", use ``TopRelated`` instead of
            ``ApproximateTopRelated``.
        trees: passed to ``ApproximateTopRelated``.

    Raises:
        NotImplementedError: if ``model`` is not implemented.  Unknown names
            used to return silently without writing any output.
    """
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # Order the artists for output.
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(artists, key=lambda x: -user_count[x])

    start = time.time()
    if model == "als":
        logging.debug("weighting matrix by bm25")
        weighted = bm25_weight(plays, K1=100, B=0.8)

        logging.debug("calculating factors")
        artist_factors, user_factors = alternating_least_squares(
            weighted,
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            use_native=use_native,
            dtype=dtype,
            use_cg=cg)
        logging.debug("calculated factors in %s", time.time() - start)

        if exact:
            calc = TopRelated(artist_factors)
        else:
            calc = ApproximateTopRelated(artist_factors, trees)

        logging.debug("writing top related to %s", output_filename)
        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in calc.get_related(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    elif model in ("bm25", "tfidf", "cosine", "smoothed_cosine", "ochiai", "overlap"):
        if model == "bm25":
            scorer = BM25Recommender(K1=100, B=0.5)
        elif model == "tfidf":
            scorer = TFIDFRecommender()
        elif model == "cosine":
            scorer = CosineRecommender()
        else:
            # Accepted names that have no implementation yet.
            raise NotImplementedError("TODO: model %s" % model)

        logging.debug("calculating similar items")
        start = time.time()
        scorer.fit(plays, K=11)
        logging.debug("calculated all_pairs_knn in %s", time.time() - start)

        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in scorer.similar_items(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    else:
        # Fail loudly instead of returning without writing anything.
        raise NotImplementedError("TODO: model %s" % model)
def _get_model(self):
    """Log the recommender's module name and build a BM25 recommender.

    The recommender is constructed from the keyword arguments stored in
    ``self.model_params``.
    """
    module_name = BM25Recommender.__dict__["__module__"]
    message = "Initializing {} model".format(module_name)
    self.app_logger.info(message)
    return BM25Recommender(**self.model_params)