def test_cg_nan(self):
    # test issue with CG code that was causing NaN values in output:
    # https://github.com/benfred/implicit/issues/19#issuecomment-283164905
    # The matrix deliberately contains all-zero rows/columns, which used to
    # break the conjugate-gradient solver.
    raw = [[0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2],
           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25],
           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333],
           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5],
           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0],
           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
    counts = csr_matrix(raw, dtype=np.float64)
    # exercise both the native (compiled) and pure-python CG code paths
    for use_native in (True, False):
        model = AlternatingLeastSquares(factors=3, regularization=0.01,
                                        dtype=np.float64,
                                        use_native=use_native,
                                        use_cg=True, use_gpu=False)
        model.fit(counts, show_progress=False)
        rows, cols = model.item_factors, model.user_factors
        # a single NaN anywhere in the factors makes the whole sum NaN
        self.assertFalse(np.isnan(np.sum(cols)))
        self.assertFalse(np.isnan(np.sum(rows)))
class ALSRecommender(BaseRecommender):
    """
    implement alternating least squares algorithm
    implementation based on implicit library
    """

    def fit(self, train_df, col_user=cfg.USER_COL, col_item=cfg.ITEM_COL,
            col_rating=cfg.DEFAULT_RATING_COL, factors=100, confidence=5,
            regularization=0.1):
        """
        Trains implicit ALS recommender on train data

        :param train_df: pandas DataFrame with train data
        :param col_user: str column name for user
        :param col_item: str column name for item
        :param col_rating: str column name for ratings
        :param factors: int number of factors to use in ALS model
        :param confidence: int as described in implicit documentation
        :param regularization: float higher values mean stronger regularization
        :return: None
        """
        BaseRecommender.fit(self, train_df, col_user, col_item, col_rating)
        # scale raw ratings into implicit-feedback confidence values
        self.train_df[self.col_rating] = train_df[self.col_rating] * confidence
        self.uii_matrix = self.get_uii_matrix()
        self.als = AlternatingLeastSquares(factors=factors, use_gpu=False,
                                           regularization=regularization)
        # transposed: presumably uii_matrix is user-item and the library
        # expects item-user orientation here — verify against get_uii_matrix
        self.als.fit(self.uii_matrix.T)

    def predict(self, test_df, k=cfg.DEFAULT_K):
        """
        recommend k items for each user in test_df

        :param test_df: pandas DataFrame with test_users and truth recommendations
        :param k: int number of items to recommend
        :return: pandas DataFrame with k recommendations for each user in test_df
        """
        # keep only users seen during training, mapped to their row indices
        test_users_indices = [
            self.users.index(user)
            for user in test_df[self.col_user].values if user in self.users
        ]
        prediction_records = []
        for item in test_users_indices:
            # one record per user: the user id plus its top-k item ids
            doc = {
                self.col_user: self.users[item],
                self.col_item: [
                    self.items[it[0]] for it in self.als.recommend(
                        item, self.uii_matrix, k,
                        filter_already_liked_items=False)
                ]
            }
            prediction_records.append(doc)
        prediction = pd.DataFrame.from_records(prediction_records)
        return prediction
def make_latent_feature(df: pd.DataFrame,
                        index_col: str,
                        value_col: str,
                        n_factors: int,
                        n_iterations: int,
                        sum_col: Optional[str] = None):
    """Build a sparse matrix from *df* and return ALS latent factors.

    When `sum_col` is None the matrix holds counts (make_count_csr),
    otherwise per-pair sums of `sum_col` (make_sum_csr). Returns
    model.user_factors — presumably the factors of the `index_col`
    entities; verify against the csr builders' row orientation.
    """
    if sum_col is None:
        csr = make_count_csr(df, index_col=index_col, value_col=value_col)
    else:
        csr = make_sum_csr(
            df,
            index_col=index_col,
            value_col=value_col,
            col_to_sum=sum_col,
        )
    model = AlternatingLeastSquares(
        factors=n_factors,
        dtype=np.float32,
        iterations=n_iterations,
        regularization=0.1,
        use_gpu=False,  # True if n_factors >= 32 else False,
    )
    # seed NumPy's global RNG so repeated runs give reproducible factors
    np.random.seed(RANDOM_STATE)
    model.fit(csr.T)
    return model.user_factors
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0,
                             variant="20m"):
    """Train the chosen model on MovieLens and write a TSV of each movie's
    most similar movies: (title, similar title, score) per line."""
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()
        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    # number of users per movie (CSR row lengths): most popular first
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4 meaning we've filtered out all data for it.
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
def calculate_similar_event(path, output_filename):
    """Train ALS on event/user interactions and write, for each event, its
    most similar events as "name,other,score" CSV lines.

    NOTE(review): output_filename is passed to hfd5_from_dataframe and then
    re-opened for writing below — confirm that is intentional.
    """
    model = AlternatingLeastSquares()
    a, b = read_event_data(path)
    event, users = hfd5_from_dataframe(a, b, output_filename)
    # binarize: any remaining interaction counts as preference 1
    users.eliminate_zeros()
    users.data = np.ones(len(users.data))
    log.info("Start fitting")
    model.fit(users)
    # interactions per event (CSR row lengths): most popular first
    user_count = np.ediff1d(users.indptr)
    to_generate = sorted(np.arange(len(event)), key=lambda x: -user_count[x])
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf-8") as o:
            for eventid in to_generate:
                # skip events whose row is empty after filtering
                if users.indptr[eventid] != users.indptr[eventid + 1]:
                    name = event[eventid]
                    for other, score in model.similar_items(
                        eventid, int(len(event) * 2 / 3)
                    ):
                        o.write(f"{name},{event[other]},{score}\n")
                progress.update(1)
class ALSEstimator(BaseEstimator, TransformerMixin):
    """scikit-learn style wrapper around implicit's AlternatingLeastSquares.

    predict() returns the dense item x user score matrix; seen entries can
    be masked out with a large negative value.
    """

    def __init__(self, factors=50, regularization=0.01, iterations=10,
                 filter_seen=True):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.filter_seen = filter_seen

    def fit(self, X, y=None):
        """Fit the ALS model on the sparse interaction matrix X."""
        self.model = AlternatingLeastSquares(factors=self.factors,
                                             regularization=self.regularization,
                                             iterations=self.iterations,
                                             dtype=np.float64,
                                             use_native=True,
                                             use_cg=True)
        self.model.fit(X)
        # BUG FIX: was `self.fiter_seen` (typo -> AttributeError at runtime)
        if self.filter_seen:
            self.fit_X = X
        return self

    def predict(self, X, y=None):
        """Return dense predicted scores (item_factors . user_factors.T)."""
        predictions = np.dot(self.model.item_factors,
                             self.model.user_factors.T)
        if self.filter_seen:
            # BUG FIX: was `self.fit_x` (wrong case); fit() stores `fit_X`.
            # Mask already-seen entries so they are never recommended.
            predictions[self.fit_X.nonzero()] = -99
        return predictions
def test_cg_nan2(self):
    # test out Nan appearing in CG code (from https://github.com/benfred/implicit/issues/106)
    # an extremely sparse random matrix used to produce non-finite factors
    Ciu = random(m=100, n=100, density=0.0005, format='coo', dtype=np.float32,
                 random_state=42, data_rvs=None).T.tocsr()
    configs = [{
        'use_native': True, 'use_gpu': False
    }, {
        'use_native': False, 'use_gpu': False
    }]
    # additionally exercise the GPU code path when CUDA is present
    if HAS_CUDA:
        configs.append({'use_gpu': True})
    for options in configs:
        model = AlternatingLeastSquares(factors=32,
                                        regularization=10,
                                        iterations=10,
                                        dtype=np.float32,
                                        **options)
        model.fit(Ciu, show_progress=False)
        # every factor must be finite: no NaN and no +/-inf
        self.assertTrue(np.isfinite(model.item_factors).all())
        self.assertTrue(np.isfinite(model.user_factors).all())
class ALS(Model):
    """Project Model implementation backed by implicit's ALS."""

    def __init__(self):
        """Model initialization."""
        self.model = AlternatingLeastSquares()
        self.trainset = None

    def fit(self, X, y):
        """Train on index pairs (X[:, 0], X[:, 1]) with values y."""
        # Create Coo-Matrix with X and y
        data = coo_matrix((y, (X[:, 0], X[:, 1])))
        self.trainset = data
        # BUG FIX: transpose() returns a NEW matrix; the original code called
        # it and discarded the result, so the model was fitted on the
        # untransposed matrix despite the stated intent of
        # rows:[n_items] ; columns:[n_users].
        self.model.fit(data.transpose())

    def recommend(self, user_id, N=1):
        """Return an int array with the ids of the top-N items for user_id."""
        # array of tuples (item_id, rating)
        n_recomendation = self.model.recommend(
            user_id, self.trainset.tocsr(), N=N)
        # convert array of [tuples] in array of [item_id]
        result = np.zeros(N, dtype=int)
        for pos, recomendation_tuple in enumerate(n_recomendation):
            result[pos] = recomendation_tuple[0]
        return result

    def get_params(self, deep=True):
        # no tunable parameters exposed to the caller
        return dict()
def train_als(train_df, test_df, min_rating=4.0):
    """Train an implicit ALS model on train_df ratings.

    Returns (model, users, items, ratings): users/items map matrix indices
    back to the original category values.
    NOTE(review): test_df is never used in this function — confirm intended.
    """
    # map each user/item to a unique numeric value
    train_df['user_id'] = train_df['user_id'].astype("category")
    train_df['item_id'] = train_df['item_id'].astype("category")
    # rows are items, columns are users
    ratings_csr = coo_matrix((train_df['rating'].astype(np.float32),
                              (train_df['item_id'].cat.codes.copy(),
                               train_df['user_id'].cat.codes.copy()))).tocsr()
    items = np.array(train_df['item_id'].cat.categories)
    users = np.array(train_df['user_id'].cat.categories)
    ratings = ratings_csr

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    model = AlternatingLeastSquares()

    # lets weight these models by bm25weight.
    ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    # train the model
    start = time.time()
    model.fit(ratings)
    print("Training time: {}".format(time.time() - start))
    return model, users, items, ratings
def dump_factors():
    """Flask view: fit ALS on the global `purchases` dict and dump the
    factor matrices and id mappings to CSV files; returns the string 'OK'."""
    numfactors = int(request.args['numfactors'].strip())
    model = AlternatingLeastSquares(factors=numfactors, dtype=np.float32,
                                    use_gpu=False, iterations=30)
    model.approximate_recommend = False
    model.approximate_similar_items = False
    # flatten {user: {product: count}} into a columnar frame
    data = {'userid': [], 'productid': [], 'purchase_count': []}
    for userid in purchases:
        for productid in purchases[userid]:
            data['userid'].append(userid)
            data['productid'].append(productid)
            data['purchase_count'].append(purchases[userid][productid])
    df = pd.DataFrame(data)
    df['userid'] = df['userid'].astype("category")
    df['productid'] = df['productid'].astype("category")
    # category-index <-> original-id lookup tables
    userids = list(df['userid'].cat.categories)
    userids_reverse = dict(zip(userids, list(range(len(userids)))))
    productids = list(df['productid'].cat.categories)
    productids_reverse = dict(zip(productids, list(range(len(productids)))))
    # rows = products, columns = users
    purchases_matrix = coo_matrix((df['purchase_count'].astype(np.float32),
                                   (df['productid'].cat.codes.copy(),
                                    df['userid'].cat.codes.copy())))
    print("Matrix shape: %s, max value: %.2f" %
          (np.shape(purchases_matrix), np.max(purchases_matrix)))
    purchases_matrix = bm25_weight(purchases_matrix, K1=2.0, B=0.25)
    purchases_matrix_T = purchases_matrix.T.tocsr()
    # to support indexing in recommend/similar_items functions
    purchases_matrix = purchases_matrix.tocsr()
    model.fit(purchases_matrix)
    np.savetxt('item_factors.csv', model.item_factors, delimiter=',')
    np.savetxt('user_factors.csv', model.user_factors, delimiter=',')
    # NOTE(review): `recommendation.sub` is called like `re.sub` to strip
    # commas from names — presumably a module alias defined elsewhere; verify
    with open('item_ids.csv', 'w') as f:
        for pid in productids_reverse:
            f.write("%s,%d,%s\n" % (pid, productids_reverse[pid],
                                    recommendation.sub(r',', ' ',
                                                       productnames[pid])))
    with open('user_ids.csv', 'w') as f:
        for uid in userids_reverse:
            f.write("%s,%d,%s\n" % (uid, userids_reverse[uid],
                                    recommendation.sub(r',', ' ',
                                                       usernames[uid])))
    return 'OK\n'
def train_and_predict(train_filepath, test_filepath):
    """Train ALS over playlists (songs + tags share one vocabulary) and
    return, per test playlist, 100 song and 10 tag recommendations."""
    train_df = pd.read_json(train_filepath)
    test_df = pd.read_json(test_filepath)
    tr_songs = train_df.songs.tolist()
    te_songs = test_df.songs.tolist()
    tr_tags = train_df.tags.tolist()
    te_tags = test_df.tags.tolist()
    # joint vocabulary built over train and test playlists
    vocab = Vocabulary(pd.concat([train_df, test_df], ignore_index=True))
    train_data = encode_features(train_df, vocab)
    test_data = encode_features(test_df, vocab)

    # Shuffle train data
    train_data = shuffle(train_data)

    # list of lists -> CSR
    def lil_to_csr(indices, shape):
        # each row is a playlist; every contained id becomes a 1 entry
        data = []
        row_ind = []
        col_ind = []
        for row_idx, row in enumerate(indices):
            for col_idx in row:
                data.append(1)
                row_ind.append(row_idx)
                col_ind.append(col_idx)
        return csr_matrix((data, (row_ind, col_ind)), shape=shape)

    train_csr = lil_to_csr(train_data, (len(train_data), vocab.size))
    test_csr = lil_to_csr(test_data, (len(test_data), vocab.size))
    # test rows first so test playlists occupy rows 0..len(test)-1
    r = scipy.sparse.vstack([test_csr, train_csr])
    r = csr_matrix(r)

    factors = 512
    alpha = 500.0  # confidence scaling applied to the binary matrix
    als_model = ALS(factors=factors, regularization=0.1)
    als_model.fit(r.T * alpha)

    # split the joint item factors into song/tag sub-models that share the
    # same playlist (user) factors
    song_model = ALS(factors=factors)
    tag_model = ALS(factors=factors)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:vocab.num_songs]
    tag_model.item_factors = als_model.item_factors[vocab.num_songs:]

    song_rec_csr = test_csr[:, :vocab.num_songs]
    tag_rec_csr = test_csr[:, vocab.num_songs:]
    song_rec = song_model.recommend_all(song_rec_csr, N=100)
    tag_rec = tag_model.recommend_all(tag_rec_csr, N=10)
    # tag ids are offset by num_songs in the shared vocabulary
    tag_rec += vocab.num_songs
    return [{
        "id": test_playlist_id,
        "songs": list(map(vocab.id_to_song, song_rec[test_row_idx])),
        "tags": list(map(vocab.id_to_tag, tag_rec[test_row_idx])),
    } for test_row_idx, test_playlist_id in enumerate(tqdm(test_df.id))]
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0,
                             variant='20m'):
    """Train the chosen model on MovieLens and write a TSV of each movie's
    most similar movies: (title, similar title, score) per line."""
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()
        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    # number of users per movie (CSR row lengths): most popular first
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4 meaning we've filtered out all data for it.
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
def test_factorize(self):
    # small binary preference matrix that a 6-factor model should be able
    # to reconstruct almost exactly
    counts = csr_matrix(
        [
            [1, 1, 0, 1, 0, 0],
            [0, 1, 1, 1, 0, 0],
            [1, 0, 1, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [0, 0, 1, 1, 0, 1],
            [0, 1, 0, 0, 0, 1],
            [0, 0, 0, 0, 1, 1],
        ],
        dtype=np.float64,
    )
    user_items = counts * 2

    # try all 8 variants of native/python, cg/cholesky, and
    # 64 vs 32 bit factors
    options = [(dtype, cg, native, False)
               for dtype in (np.float32, np.float64)
               for cg in (False, True)
               for native in (False, True)]

    # also try out GPU support if available
    if HAS_CUDA:
        options.append((np.float32, False, False, True))

    for dtype, use_cg, use_native, use_gpu in options:
        try:
            model = AlternatingLeastSquares(
                factors=6,
                regularization=0,
                dtype=dtype,
                use_native=use_native,
                use_cg=use_cg,
                use_gpu=use_gpu,
                random_state=42,
            )
            model.fit(user_items, show_progress=False)
            rows, cols = model.item_factors, model.user_factors
            # GPU factors come back as device arrays; copy to host
            if use_gpu:
                rows, cols = rows.to_numpy(), cols.to_numpy()
        except Exception as e:
            self.fail(msg="failed to factorize matrix. Error=%s"
                          " dtype=%s, cg=%s, native=%s gpu=%s"
                          % (e, dtype, use_cg, use_native, use_gpu))
        # check element-wise reconstruction of the original counts
        reconstructed = rows.dot(cols.T)
        for i in range(counts.shape[0]):
            for j in range(counts.shape[1]):
                self.assertAlmostEqual(
                    counts[i, j],
                    reconstructed[i, j],
                    delta=0.0001,
                    msg="failed to reconstruct row=%s, col=%s,"
                        " value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s"
                        % (i, j, reconstructed[i, j], dtype, use_cg,
                           use_native, use_gpu),
                )
def benchmark_implicit(matrix, factors, reg, iterations):
    """Fit implicit's ALS on `matrix` and return the wall-clock seconds taken."""
    started = time.time()
    als = AlternatingLeastSquares(factors,
                                  regularization=reg,
                                  iterations=iterations,
                                  use_cg=True)
    als.fit(matrix)
    return time.time() - started
class AlsRecommender(OwnRecommender):
    """ALS-trained recommender.

    Input
    -----
    ds: RecommenderDataset
        a prepared RecommenderDataset object
    """

    def fit(self, n_factors=20, regularization=0.001, iterations=15,
            num_threads=4):
        """Train the ALS model on the dataset's sparse interaction matrix."""
        self.model = AlternatingLeastSquares(factors=n_factors,
                                             regularization=regularization,
                                             iterations=iterations,
                                             num_threads=num_threads)
        self.model.fit(self.ds.csr_matrix)
        return self

    def _similarItems(self, userId, N=5):
        """Recommend items similar to the user's top-N purchased items."""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        def _get_similar_item(item_id):
            """Find the item most similar to item_id (excluding itself)."""
            recs = self.model.similar_items(self.ds.itemid_to_id[item_id], N=2)
            if len(recs) > 1:
                # recs[0] is the query item itself; recs[1] is its neighbour
                top_rec = recs[1][0]
                return self.ds.id_to_itemid[top_rec]
            return item_id

        res = [_get_similar_item(item) for item in self.ds.userTop(userId, N)]
        return self.extend(res, N)

    def _similarUsers(self, userId, N=5):
        """Recommend the top item of each of the N most similar users."""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)
        res = []
        similar_users = [rec[0] for rec in
                         self.model.similar_users(
                             self.ds.userid_to_id[userId], N=N + 1)]
        # drop the first entry: it is the query user itself
        similar_users = similar_users[1:]
        for user in similar_users:
            # BUG FIX: the original called userTop(userId, 1) — the *query*
            # user's top item on every iteration — instead of the similar
            # user's. NOTE(review): `user` is the model's internal id;
            # confirm whether userTop expects it mapped back to an external
            # user id via the dataset.
            res.extend(self.ds.userTop(user, 1))
        return self.extend(res, N)

    def items_embedings(self):
        """Item embeddings as a DataFrame with an item_id column."""
        emb = pd.DataFrame(data=self.model.item_factors).add_prefix('itm')
        emb['item_id'] = self.ds.itemids
        return emb

    def users_embedings(self):
        """User embeddings as a DataFrame with a user_id column."""
        emb = pd.DataFrame(data=self.model.user_factors).add_prefix('usr')
        emb['user_id'] = self.ds.userids
        return emb
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    """Train the chosen model and write a TSV of (title, similar title,
    score) triples for each movie's 10 most similar movies."""
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    # CSR is required for the m.indptr row-emptiness checks below
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    # ratings per movie, used to emit popular movies first
    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    # NOTE(review): movieId values are used directly as row indices into
    # m.indptr below — assumes ids align with matrix rows; verify read_data
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
            # no ratings > 4 meaning we've filtered out all data for it.
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def fit(user_item_matrix, factors=20, regularization=0.001, iterations=15):
    """Train an ALS model and return it."""
    als = AlternatingLeastSquares(factors=factors,
                                  regularization=regularization,
                                  iterations=iterations)
    # transpose to the item-user orientation before fitting
    item_user = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(item_user)
    return als
def test_explain(self):
    counts = csr_matrix(
        [
            [1, 1, 0, 1, 0, 0],
            [0, 1, 1, 1, 0, 0],
            [1, 4, 1, 0, 7, 0],
            [1, 1, 0, 0, 0, 0],
            [9, 0, 4, 1, 0, 1],
            [0, 1, 0, 0, 0, 1],
            [0, 0, 2, 0, 1, 1],
        ],
        dtype=np.float64,
    )
    user_items = counts * 2
    item_users = user_items.T

    # use_cg=False: explain() works off the exact (Cholesky-style) user
    # solve — presumably required for the contribution math to line up
    model = AlternatingLeastSquares(
        factors=4,
        regularization=20,
        use_native=False,
        use_cg=False,
        use_gpu=False,
        iterations=100,
        random_state=23,
    )
    model.fit(user_items, show_progress=False)

    userid = 0

    # Assert recommendation is the same if we recompute user vectors
    recs = model.recommend(userid, item_users, N=10)
    recalculated_recs = model.recommend(userid, item_users, N=10,
                                        recalculate_user=True)
    for (item1, score1), (item2, score2) in zip(recs, recalculated_recs):
        self.assertEqual(item1, item2)
        self.assertAlmostEqual(score1, score2, 4)

    # Assert explanation makes sense: contributions sum to the score and
    # come back sorted descending, drawn only from items the user has seen
    top_rec, score = recalculated_recs[0]
    score_explained, contributions, W = model.explain(userid, item_users,
                                                      itemid=top_rec)
    scores = [s for _, s in contributions]
    items = [i for i, _ in contributions]
    self.assertAlmostEqual(score, score_explained, 4)
    self.assertAlmostEqual(score, sum(scores), 4)
    self.assertEqual(scores, sorted(scores, reverse=True),
                     "Scores not in order")
    self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

    # Assert explanation with precomputed user weights is correct
    top_score_explained, top_contributions, W = model.explain(
        userid, item_users, itemid=top_rec, user_weights=W, N=2
    )
    top_scores = [s for _, s in top_contributions]
    top_items = [i for i, _ in top_contributions]

    self.assertEqual(2, len(top_contributions))
    self.assertAlmostEqual(score, top_score_explained, 4)
    self.assertEqual(scores[:2], top_scores)
    self.assertEqual(items[:2], top_items)
def _add_als_recs(self, n_factors=20, regularization=0.001, iterations=20,
                  num_threads=0):
    """Fit an ALS model, store per-user recommendations in
    df_users['als_recommender'], and merge the user/item factor columns
    into df_users / df_items as features."""
    als_model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
    # transpose to item-user orientation before fitting
    als_model.fit(csr_matrix(self.user_item_matrix).T.tocsr())
    self.als_model = als_model
    # top-N recommendations per user; 999999 is presumably the dataset's
    # placeholder "other" item id, excluded from results — verify
    als_recs = lambda i: [
        self.id_to_itemid[rec[0]] for rec in als_model.recommend(
            userid=int(i),
            user_items=csr_matrix(self.user_item_matrix).tocsr(),
            N=self.first_model_rec_limit,
            filter_items=[self.itemid_to_id[999999]],
            recalculate_user=True,
            filter_already_liked_items=False)
    ]
    self.df_users['als_recommender'] = None
    # only users present in the interaction matrix (non-null id) get recs
    self.df_users.loc[~self.df_users['id'].isnull(),
                      'als_recommender'] = self.df_users.loc[
                          ~self.df_users['id'].isnull(), 'id'].map(als_recs)
    # users without recommendations get an empty list instead of None/NaN
    self.df_users['als_recommender'] = self.df_users[
        'als_recommender'].map(lambda val: val
                               if type(val) == type([]) else [])

    # adding embedings to df_users and df_items as features
    als_user_factors = pd.DataFrame(
        self.als_model.user_factors,
        columns=[
            f'als_user_factor_{i}'
            for i in range(self.als_model.user_factors.shape[1])
        ])
    als_user_factors['id'] = als_user_factors.index
    self.df_users = pd.merge(left=self.df_users,
                             right=als_user_factors,
                             on='id',
                             how='left')
    als_item_factors = pd.DataFrame(
        self.als_model.item_factors,
        columns=[
            f'als_item_factor_{i}'
            for i in range(self.als_model.item_factors.shape[1])
        ])
    als_item_factors['id'] = als_item_factors.index
    self.df_items = pd.merge(left=self.df_items,
                             right=als_item_factors,
                             on='id',
                             how='left')
def main(params): """Main function.""" # check for mandatory params if 'reference_repo' not in params: return {'error': 'Mandatory param reference_repo not present'} reference_repo = params['reference_repo'] LOGGER.info('reference_repo %s' % reference_repo) # get data LOGGER.info('read GBQ data') _GC_SVC_ACCOUNT['private_key_id'] = params['GC_SVC_PRIVATE_KEY_ID'] _GC_SVC_ACCOUNT['private_key'] = params['GC_SVC_PRIVATE_KEY'] data = pd.io.gbq.read_gbq(_QUERY, dialect="standard", project_id=_GC_SVC_ACCOUNT['project_id'], private_key=json.dumps(_GC_SVC_ACCOUNT)) # map each repo and user to a unique numeric value data['user'] = data['user'].astype("category") data['repo'] = data['repo'].astype("category") # dictionaries to translate names to ids and vice-versa repos = dict(enumerate(data['repo'].cat.categories)) repo_ids = {r: i for i, r in repos.items()} if reference_repo not in repo_ids: return {"message": "No result. Reference repo not in training set."} # create a sparse matrix of all the users/repos stars = coo_matrix( (np.ones(data.shape[0]), (data['repo'].cat.codes.copy(), data['user'].cat.codes.copy()))) # train model LOGGER.info('training model') model = AlternatingLeastSquares( factors=50, regularization=0.01, dtype=np.float64, # pylint: disable=no-member iterations=50) confidence = 40 model.fit(confidence * stars) similar_ids = model.similar_items(repo_ids[reference_repo]) LOGGER.info('found %d similar repos' % len(similar_ids)) similar_repos = [] for idx in range(1, len(similar_ids)): similar_repos.append(repos[similar_ids[idx][0]]) return { 'reference_repo': reference_repo, 'similar_repos': similar_repos, 'error': '' }
def fit(user_item_matrix):
    """Train ALS on the user-item matrix and return the fitted model."""
    als = AlternatingLeastSquares(factors=100,
                                  regularization=0.01,
                                  iterations=15,
                                  num_threads=4)
    # transpose to the item-user orientation before fitting
    item_user = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(item_user)
    return als
def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15,
        num_threads=4, show_progress=False):
    """Train ALS; the progress bar is suppressed by default."""
    als = AlternatingLeastSquares(factors=n_factors,
                                  regularization=regularization,
                                  iterations=iterations,
                                  num_threads=num_threads)
    # transpose to the item-user orientation before fitting
    item_user = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(item_user, show_progress=show_progress)
    return als
def fit(user_item_matrix, n_factors=20, regularization=0.1, iterations=40,
        num_threads=0):
    """Train ALS on the user-item matrix and return the fitted model."""
    als = AlternatingLeastSquares(factors=n_factors,
                                  regularization=regularization,
                                  iterations=iterations,
                                  num_threads=num_threads)
    # transpose to the item-user orientation before fitting
    item_user = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(item_user)
    return als
def _train_als(hyperparameters, train):
    """Fit an ALS model with the 'factors'/'n_iter' hyperparameters."""
    als = AlternatingLeastSquares(factors=hyperparameters['factors'],
                                  iterations=hyperparameters['n_iter'],
                                  num_threads=nproc)
    als.fit(train)
    return als
def collab_filter(song_id, user_song_df, num_songs=5):
    '''
    song_id = spotify id for individual song
    user_song_df= dataframe with users, songs, playcounts etc

    Returns the rows of user_song_df for the num_songs songs most similar
    to song_id according to an ALS collaborative-filtering model.

    for the time being i am not going to enable filtering by key/tempo as
    not enough songs but in future will do
    '''
    # internal numeric index of the requested song
    song_num = user_song_df[user_song_df.spotify_id ==
                            song_id].song_nums.values[0]
    print(song_num)
    print(type(song_num))
    # NOTE: key/tempo filtering is intentionally disabled for now. The
    # removed (commented-out) implementation filtered the candidate songs
    # by the seed song's key and by a +/- margin tempo window before
    # recommending; re-enable once the catalogue is large enough.
    user_song_refined = user_song_df
    plays = user_song_refined['size']
    user_nums = user_song_refined.user_nums
    song_nums = user_song_refined.song_nums
    # rows = songs, columns = users, values = play counts
    B = coo_matrix((plays, (song_nums, user_nums))).tocsr()
    model = AlternatingLeastSquares(factors=30)
    model.fit(B)
    songs_inds = model.similar_items(song_num, N=num_songs)
    # keep just the item indices from the (index, score) tuples
    songs_inds = [tup[0] for tup in songs_inds]
    return user_song_df[user_song_df.song_nums.isin(songs_inds)]
def calculate_similar_businesses(input_filename, output_filename, model_name="als", factors=50, regularization=0.01, iterations=15, exact=False, trees=20, use_native=True, dtype=numpy.float64, cg=False): logging.debug("Calculating similar businesses. This might take a while") # read in the input data file logging.debug("reading data from %s", input_filename) start = time.time() df, ratings = read_data(input_filename) logging.debug("read data file in %s", time.time() - start) # generate a recommender model based off the input params if exact: model = AlternatingLeastSquares(factors=factors, regularization=regularization, use_native=use_native, use_cg=cg, dtype=dtype) else: model = AnnoyAlternatingLeastSquares(factors=factors, regularization=regularization, use_native=use_native, use_cg=cg, dtype=dtype) # lets weight these models by bm25weight. logging.debug("weighting matrix by bm25_weight") ratings = bm25_weight(ratings, K1=100, B=0.8) # train the model logging.debug("training model %s", model_name) start = time.time() model.fit(ratings) logging.debug("trained model '%s' in %s", model_name, time.time() - start) # write out similar businesses by popularity logging.debug("calculating top businesses") user_count = df.groupby('business').size() businesses = dict(enumerate(df['business'].cat.categories)) to_generate = sorted(list(businesses), key=lambda x: -user_count[x]) # write out as a TSV of businessid, otherbusinessid, score with open(output_filename, "w") as o: for businessid in to_generate: business = businesses[businessid] for other, score in model.similar_items(businessid, 11): o.write("%s\t%s\t%s\n" % (business, businesses[other], score))
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    """
    :param input_path: path to the training dataset
    :param output_filename: name of the output file
    :param model_name: which model to use
    :param min_rating: rating threshold used when filtering the data
    :return:
    """
    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    if model_name == "als":
        model = AlternatingLeastSquares()
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender()
    else:
        # NOTE(review): error message contains a typo ("TODU") — left
        # unchanged here since it is a runtime string
        raise NotImplementedError("TODU: model %s" % model_name)

    # CSR is required for the m.indptr row-emptiness checks below
    m = m.tocsr()
    logging.debug("Training model :%s" % model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    # number of ratings per movie, used to emit popular movies first
    user_count = rating_data.groupby("movieId").size()
    movie_lookup = dict((i, m) for i, m in zip(movies_data['movieId'],
                                               movies_data['title']))
    to_generate = sorted(list(movies_data['movieId']),
                         key=lambda x: -user_count.get(x, 0))
    with open(output_filename, "w") as o:
        for movieid in to_generate:
            # skip movies whose matrix row is empty (all ratings filtered)
            if (m.indptr[movieid] == m.indptr[movieid + 1]):
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def fit(self, user_item_matrix, n_factors, regularization=0.001,
        iterations=50, num_threads=1, use_gpu=False):
    """Train ALS on the user-item matrix and return the fitted model."""
    als = AlternatingLeastSquares(factors=n_factors,
                                  regularization=regularization,
                                  iterations=iterations,
                                  num_threads=num_threads,
                                  use_gpu=use_gpu)
    # transpose to the item-user orientation before fitting
    item_user = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(item_user)
    return als
def fit(user_item_matrix, n_factors=32, regularization=0.001, iterations=15,
        num_threads=16):
    """Train ALS (reporting training loss) and return the fitted model."""
    als = AlternatingLeastSquares(factors=n_factors,
                                  regularization=regularization,
                                  iterations=iterations,
                                  calculate_training_loss=True,
                                  num_threads=num_threads)
    # transpose to the item-user orientation before fitting
    item_user = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(item_user)
    return als
class ALSEngine():
    """Recommendation engine backed by implicit's ALS model.

    BUG FIX: ported to Python 3 — the original used the Python 2 print
    statement and dict.iteritems(), both errors on Python 3 (which the
    rest of this file requires, e.g. f-strings elsewhere).
    """

    def __init__(self, df, behaviors, factors=50, regularization=0.01,
                 iterations=15, exact=False, trees=20, use_native=True,
                 dtype=np.float64, cg=False):
        self.model = AlternatingLeastSquares(factors=factors,
                                             regularization=regularization,
                                             use_native=use_native,
                                             use_cg=cg,
                                             dtype=dtype)
        self.model.fit(behaviors)
        # matrix index <-> external artist id translation tables
        self.indexToRealId = dict(enumerate(df['artist'].cat.categories))
        self.realIdToIndex = dict(
            (v, k) for k, v in self.indexToRealId.items())
        self._item_norms = None
        print("init finish")

    @property
    def item_norms(self):
        # lazily cached L2 norms of the item factors (used by similar_items)
        if self._item_norms is None:
            self._item_norms = np.linalg.norm(self.model.item_factors,
                                              axis=-1)
        return self._item_norms

    def recommend(self, actions):
        """Given {item_id: action} (action == 1 is treated as negative),
        return the first similar item the user has not already acted on."""
        localFactor = None
        for watchedPid, act in actions.items():
            index = self.realIdToIndex[watchedPid]
            if act == 1:
                # negative action: subtract this item's factor
                if localFactor is None:
                    localFactor = self.model.item_factors[index] * -1
                else:
                    localFactor += self.model.item_factors[index] * -1
            else:
                if localFactor is None:
                    localFactor = self.model.item_factors[index]
                else:
                    localFactor += self.model.item_factors[index]
        if localFactor is None:
            # no usable history: fall back to a random known item
            # (list() needed — np.random.choice rejects a dict_keys view)
            return np.random.choice(list(self.realIdToIndex.keys()))
        localFactor /= len(actions)
        indexs = self.similar_items(localFactor, len(actions) + 1)
        # find first one which not in actions
        for index in indexs:
            pid = self.indexToRealId[index]
            if pid not in actions:
                return pid

    def similar_items(self, factor, N):
        """ Return the top N similar items for itemid. """
        # cosine similarity between the query factor and all item factors
        scores = (self.model.item_factors.dot(factor) / self.item_norms /
                  np.linalg.norm(factor, axis=-1))
        best = np.argpartition(scores, -N)[-N:]
        return sorted(best, key=lambda x: -scores[x])
def fit(self, n_factors=300, regularization=0.55, iterations=30, alpha_val=15):
    """Fit implicit ALS on the transposed training URM and keep the factors.

    The ratings are scaled by ``alpha_val`` to form the confidence matrix
    before fitting; the learned user/item factor matrices are stored on
    ``self`` for later scoring.
    """
    item_user = self.URM_train.T
    als = AlternatingLeastSquares(factors=n_factors,
                                  regularization=regularization,
                                  iterations=iterations)
    confidence = (item_user * alpha_val).astype('double')
    als.fit(confidence)
    self.user_factors = als.user_factors
    self.item_factors = als.item_factors
def test_cg_nan2(self):
    # regression: NaNs produced by the CG solver on very sparse input
    # (https://github.com/benfred/implicit/issues/106)
    item_users = random(m=100, n=100, density=0.0005, format='coo',
                        dtype=np.float32, random_state=42,
                        data_rvs=None).T.tocsr()
    variants = [{'use_native': True, 'use_gpu': False},
                {'use_native': False, 'use_gpu': False}]
    if HAS_CUDA:
        variants.append({'use_gpu': True})
    for params in variants:
        model = AlternatingLeastSquares(factors=32, regularization=10,
                                        iterations=10, dtype=np.float32,
                                        **params)
        model.fit(item_users, show_progress=False)
        # every learned factor must be finite (no NaN / inf)
        self.assertTrue(np.isfinite(model.item_factors).all())
        self.assertTrue(np.isfinite(model.user_factors).all())
def test_explain(self):
    # Verify model.explain() decomposes a recommendation score into
    # consistent per-item contributions.
    counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                         [0, 1, 1, 1, 0, 0],
                         [1, 4, 1, 0, 7, 0],
                         [1, 1, 0, 0, 0, 0],
                         [9, 0, 4, 1, 0, 1],
                         [0, 1, 0, 0, 0, 1],
                         [0, 0, 2, 0, 1, 1]], dtype=np.float64)
    user_items = counts * 2
    item_users = user_items.T

    # pure-python Cholesky solver so the fixed seed below makes the
    # factorization deterministic
    model = AlternatingLeastSquares(factors=4,
                                    regularization=20,
                                    use_native=False,
                                    use_cg=False,
                                    iterations=100)
    np.random.seed(23)
    model.fit(user_items, show_progress=False)

    userid = 0

    # Assert recommendation is the same if we recompute user vectors
    recs = model.recommend(userid, item_users, N=10)
    recalculated_recs = model.recommend(userid, item_users, N=10, recalculate_user=True)
    for (item1, score1), (item2, score2) in zip(recs, recalculated_recs):
        self.assertEqual(item1, item2)
        self.assertAlmostEqual(score1, score2, 4)

    # Assert explanation makes sense: contributions should sum to the
    # predicted score, come sorted descending, and involve only items
    # the user has interacted with (row 0 of `counts`)
    top_rec, score = recalculated_recs[0]
    score_explained, contributions, W = model.explain(userid, item_users, itemid=top_rec)
    scores = [s for _, s in contributions]
    items = [i for i, _ in contributions]
    self.assertAlmostEqual(score, score_explained, 4)
    self.assertAlmostEqual(score, sum(scores), 4)
    self.assertEqual(scores, sorted(scores, reverse=True), "Scores not in order")
    self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

    # Assert explanation with precomputed user weights is correct:
    # N=2 must return exactly the two largest contributions from above
    top_score_explained, top_contributions, W = model.explain(
        userid, item_users, itemid=top_rec, user_weights=W, N=2)
    top_scores = [s for _, s in top_contributions]
    top_items = [i for i, _ in top_contributions]

    self.assertEqual(2, len(top_contributions))
    self.assertAlmostEqual(score, top_score_explained, 4)
    self.assertEqual(scores[:2], top_scores)
    self.assertEqual(items[:2], top_items)
def test_factorize(self):
    counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                         [0, 1, 1, 1, 0, 0],
                         [1, 0, 1, 0, 0, 0],
                         [1, 1, 0, 0, 0, 0],
                         [0, 0, 1, 1, 0, 1],
                         [0, 1, 0, 0, 0, 1],
                         [0, 0, 0, 0, 1, 1]], dtype=np.float64)
    user_items = counts * 2

    # all 8 CPU combinations of native/python, cg/cholesky and
    # 32/64-bit factors ...
    variants = [(dtype, cg, native, False)
                for dtype in (np.float32, np.float64)
                for cg in (False, True)
                for native in (False, True)]
    # ... plus a single GPU configuration when available
    if HAS_CUDA:
        variants.append((np.float32, False, False, True))

    for dtype, use_cg, use_native, use_gpu in variants:
        try:
            model = AlternatingLeastSquares(factors=6,
                                            regularization=0,
                                            dtype=dtype,
                                            use_native=use_native,
                                            use_cg=use_cg,
                                            use_gpu=use_gpu)
            np.random.seed(23)
            model.fit(user_items, show_progress=False)
            rows, cols = model.item_factors, model.user_factors
        except Exception as e:
            self.fail(msg="failed to factorize matrix. Error=%s"
                          " dtype=%s, cg=%s, native=%s gpu=%s"
                          % (e, dtype, use_cg, use_native, use_gpu))

        # with zero regularization the factorization should reproduce
        # the input matrix almost exactly
        reconstructed = rows.dot(cols.T)
        for i in range(counts.shape[0]):
            for j in range(counts.shape[1]):
                self.assertAlmostEqual(counts[i, j], reconstructed[i, j],
                                       delta=0.0001,
                                       msg="failed to reconstruct row=%s, col=%s,"
                                           " value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s"
                                           % (i, j, reconstructed[i, j], dtype,
                                              use_cg, use_native, use_gpu))
def benchmark_accuracy(plays):
    """Collect per-iteration training loss for CG (2-4 steps), GPU and Cholesky solvers."""
    output = defaultdict(list)

    def make_callback(model, name):
        # record the loss after every ALS iteration under `name`
        def callback(iteration, elapsed):
            loss = calculate_loss(plays, model.item_factors, model.user_factors, 0)
            print("model %s iteration %i loss %.5f" % (name, iteration, loss))
            output[name].append(loss)
        return callback

    # CG solver with 2, 3 and 4 inner steps
    for steps in [2, 3, 4]:
        model = AlternatingLeastSquares(factors=100, use_native=True,
                                        use_cg=True, regularization=0,
                                        iterations=25)
        model.cg_steps = steps
        model.fit_callback = make_callback(model, 'cg%i' % steps)
        model.fit(plays)

    if has_cuda:
        model = AlternatingLeastSquares(factors=100, use_native=True,
                                        use_gpu=True, regularization=0,
                                        iterations=25)
        model.fit_callback = make_callback(model, 'gpu')
        model.use_gpu = True
        model.fit(plays)

    # exact Cholesky solve as the baseline
    model = AlternatingLeastSquares(factors=100, use_native=True,
                                    use_cg=False, regularization=0,
                                    iterations=25)
    model.fit_callback = make_callback(model, 'cholesky')
    model.fit(plays)

    return output
def benchmark_times(plays, iterations=3):
    """Measure per-iteration training time across factor counts for each solver.

    Returns a dict of lists keyed by solver name (plus 'factors'),
    keeping only the fastest observed iteration per factor count.
    """
    times = defaultdict(lambda: defaultdict(list))

    def make_timer(model, name):
        # append each iteration's elapsed time under (name, factors)
        def callback(iteration, elapsed):
            print(name, model.factors, iteration, elapsed)
            times[name][model.factors].append(elapsed)
        return callback

    output = defaultdict(list)
    for factors in range(32, 257, 32):
        # CG solver with 2, 3 and 4 inner steps
        for steps in [2, 3, 4]:
            model = AlternatingLeastSquares(factors=factors, use_native=True,
                                            use_cg=True, regularization=0,
                                            iterations=iterations)
            model.fit_callback = make_timer(model, 'cg%i' % steps)
            model.cg_steps = steps
            model.fit(plays)

        # exact Cholesky solve
        model = AlternatingLeastSquares(factors=factors, use_native=True,
                                        use_cg=False, regularization=0,
                                        iterations=iterations)
        model.fit_callback = make_timer(model, 'cholesky')
        model.fit(plays)

        if has_cuda:
            model = AlternatingLeastSquares(factors=factors, use_native=True,
                                            use_gpu=True, regularization=0,
                                            iterations=iterations)
            model.fit_callback = make_timer(model, 'gpu')
            model.fit(plays)

        # take the min time for the output
        output['factors'].append(factors)
        for name, stats in times.items():
            output[name].append(min(stats[factors]))
    return output