class ALSRecommender(BaseRecommender):
    """
    Alternating least squares recommender built on top of the
    ``implicit`` library's ``AlternatingLeastSquares`` model.
    """

    def fit(self,
            train_df,
            col_user=cfg.USER_COL,
            col_item=cfg.ITEM_COL,
            col_rating=cfg.DEFAULT_RATING_COL,
            factors=100,
            confidence=5,
            regularization=0.1):
        """
        Trains implicit ALS recommender on train data

        :param train_df: pandas DataFrame with train data
        :param col_user: str column name for user
        :param col_item: str column name for item
        :param col_rating: str column name for ratings
        :param factors: int number of factors to use in ALS model
        :param confidence: int multiplier applied to the rating column
            before the interaction matrix is built
        :param regularization: float higher values mean stronger
            regularization
        :return: None
        """
        BaseRecommender.fit(self, train_df, col_user, col_item, col_rating)
        # Scale the ratings so they act as confidence weights.
        self.train_df[self.col_rating] = train_df[self.col_rating] * confidence
        self.uii_matrix = self.get_uii_matrix()
        self.als = AlternatingLeastSquares(factors=factors,
                                           use_gpu=False,
                                           regularization=regularization)
        # The model is fitted on the transpose of the stored matrix.
        self.als.fit(self.uii_matrix.T)

    def predict(self, test_df, k=cfg.DEFAULT_K):
        """
        recommend k items for each user in test_df

        Users that are not present in ``self.users`` are skipped.

        :param test_df: pandas DataFrame with test_users and truth
            recommendations
        :param k: int number of items to recommend
        :return: pandas DataFrame with k recommendations for each user in
            test_df
        """
        # Build the user -> position lookup once instead of scanning
        # ``self.users`` twice for every test row. ``setdefault`` keeps the
        # first occurrence, matching ``list.index``.
        user_positions = {}
        for position, user in enumerate(self.users):
            user_positions.setdefault(user, position)

        test_users_indices = [
            user_positions[user]
            for user in test_df[self.col_user].values
            if user in user_positions
        ]

        prediction_records = []
        for user_idx in test_users_indices:
            recommendations = self.als.recommend(
                user_idx, self.uii_matrix, k,
                filter_already_liked_items=False)
            prediction_records.append({
                self.col_user: self.users[user_idx],
                # The first element of each recommendation is used as an
                # index into ``self.items``.
                self.col_item: [self.items[rec[0]] for rec in recommendations],
            })

        return pd.DataFrame.from_records(prediction_records)
def train_als(train_df, test_df, min_rating=4.0):
    """Train an ALS model on binarised, BM25-weighted ratings.

    The ``user_id`` and ``item_id`` columns of ``train_df`` are converted to
    categoricals in place. ``test_df`` is accepted but not used.

    :param train_df: DataFrame with ``user_id``, ``item_id`` and ``rating``
    :param test_df: unused
    :param min_rating: ratings below this value are dropped
    :return: tuple ``(model, users, items, ratings)`` where ``users`` and
        ``items`` are the category labels and ``ratings`` is the weighted
        item x user CSR matrix the model was fitted on
    """
    # Map each user/item to a unique numeric code.
    for id_column in ('user_id', 'item_id'):
        train_df[id_column] = train_df[id_column].astype("category")

    user_accessor = train_df['user_id'].cat
    item_accessor = train_df['item_id'].cat

    values = train_df['rating'].astype(np.float32)
    row_codes = item_accessor.codes.copy()
    col_codes = user_accessor.codes.copy()
    ratings = coo_matrix((values, (row_codes, col_codes))).tocsr()

    items = np.array(item_accessor.categories)
    users = np.array(user_accessor.categories)

    # Drop everything below min_rating and keep only a binary preference.
    too_low = ratings.data < min_rating
    ratings.data[too_low] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    model = AlternatingLeastSquares()

    # Weight the binary matrix with BM25 before training.
    weighted = bm25_weight(ratings, B=0.9) * 5
    ratings = weighted.tocsr()

    start = time.time()
    model.fit(ratings)
    elapsed = time.time() - start
    print("Training time: {}".format(elapsed))

    return model, users, items, ratings
def train_and_evaluate(self):
    """Fit a 16-factor ALS model (100 iterations) on the transposed
    training matrix, then run ``self.evaluate()``."""
    self.model = AlternatingLeastSquares(iterations=100, factors=16)
    print('model is going to fit!')
    transposed_train = self.train.transpose()
    self.model.fit(transposed_train)
    print('model is already!')
    self.evaluate()
class ALS(Model):
    """Thin wrapper around ``AlternatingLeastSquares`` that trains from
    (row, column, value) triples and returns recommended item ids."""

    def __init__(self):
        """Model initialization."""
        self.model = AlternatingLeastSquares()
        self.trainset = None

    def fit(self, X, y):
        """Build the interaction matrix from ``X``/``y`` and train the model.

        :param X: 2-D array; column 0 is used as the row index and column 1
            as the column index of the interaction matrix (presumably user
            ids and item ids -- confirm against callers)
        :param y: values stored at those coordinates
        """
        # Create the COO matrix from X and y.
        data = coo_matrix((y, (X[:, 0], X[:, 1])))
        self.trainset = data
        # ``transpose()`` returns a new matrix rather than modifying ``data``
        # in place; the original code discarded the result and trained on
        # the untransposed matrix. Train on rows:[n_items], columns:[n_users]
        # as the original comment intended.
        item_users = data.transpose().tocsr()
        self.model.fit(item_users)

    def recommend(self, user_id, N=1):
        """Return an int array of length ``N`` with recommended item ids.

        If the model yields fewer than ``N`` recommendations, the remaining
        slots keep their initial value ``0``.

        :param user_id: row index of the user in the training matrix
        :param N: number of recommendations requested
        """
        # Each recommendation is indexable; element 0 is the item id.
        recommendations = self.model.recommend(
            user_id, self.trainset.tocsr(), N=N)

        result = np.zeros(N, dtype=int)
        for pos, recommendation in enumerate(recommendations):
            result[pos] = recommendation[0]
        return result

    def get_params(self, deep=True):
        """Return an empty parameter dict (no tunable parameters exposed)."""
        return dict()
def fit(self,
        train_df,
        col_user=cfg.USER_COL,
        col_item=cfg.ITEM_COL,
        col_rating=cfg.DEFAULT_RATING_COL,
        factors=100,
        confidence=5,
        regularization=0.1):
    """
    Fit the implicit ALS model on the training interactions.

    :param train_df: pandas DataFrame with train data
    :param col_user: str column name for user
    :param col_item: str column name for item
    :param col_rating: str column name for ratings
    :param factors: int number of latent factors of the ALS model
    :param confidence: int multiplier applied to the rating column
    :param regularization: float higher values mean stronger regularization
    :return: None
    """
    BaseRecommender.fit(self, train_df, col_user, col_item, col_rating)

    # Scale the ratings so they act as confidence weights.
    rating_col = self.col_rating
    scaled_ratings = train_df[rating_col] * confidence
    self.train_df[rating_col] = scaled_ratings

    self.uii_matrix = self.get_uii_matrix()

    als_model = AlternatingLeastSquares(
        regularization=regularization,
        use_gpu=False,
        factors=factors,
    )
    self.als = als_model
    # The model is fitted on the transpose of the stored matrix.
    als_model.fit(self.uii_matrix.T)
def __init__(self,
             user_df,
             song_df,
             k=100,
             knn_frac=0.5,
             max_overlap=0.2,
             cf_weighting_alpha=1,
             min_songs=5,
             mode='popular'):
    """Store the configuration, index eligible users in a KD-tree and
    create the (untrained) collaborative filtering model.

    Only users whose ``num_songs`` exceeds ``min_songs - 1`` are added to
    the KD-tree, which is built from their ``MUSIC`` column.
    """
    self.user_df, self.song_df = user_df, song_df
    self.cf_weighting_alpha = cf_weighting_alpha
    self.knn_frac = knn_frac
    self.k = k
    self.max_overlap = max_overlap
    self.min_songs = min_songs
    self.mode = mode

    has_enough_songs = user_df['num_songs'] > (min_songs - 1)
    eligible_users = user_df.loc[has_enough_songs]
    music_vectors = eligible_users['MUSIC'].tolist()
    self.kdtree = KDTree(music_vectors)

    # Collaborative filtering model with hard-coded hyper-parameters.
    self.cf_model = AlternatingLeastSquares(
        factors=16,
        dtype=np.float32,
        iterations=2,
        calculate_training_loss=True,
    )
def test_cg_nan(self):
    """Regression test: the conjugate gradient solver must not produce NaN
    factors for this input, with or without the native extension."""
    # test issue with CG code that was causing NaN values in output:
    # https://github.com/benfred/implicit/issues/19#issuecomment-283164905
    raw = [
        [0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    ]
    counts = csr_matrix(raw, dtype=np.float64)

    for native_flag in (True, False):
        als = AlternatingLeastSquares(
            factors=3,
            regularization=0.01,
            dtype=np.float64,
            use_native=native_flag,
            use_cg=True,
            use_gpu=False,
        )
        als.fit(counts, show_progress=False)

        item_factors = als.item_factors
        user_factors = als.user_factors
        # A NaN anywhere in a factor matrix makes its sum NaN.
        for factor_matrix in (user_factors, item_factors):
            self.assertFalse(np.isnan(np.sum(factor_matrix)))
def make_latent_feature(df: pd.DataFrame,
                        index_col: str,
                        value_col: str,
                        n_factors: int,
                        n_iterations: int,
                        sum_col: Optional[str] = None):
    """Factorise an index/value interaction matrix with ALS and return the
    fitted model's ``user_factors``.

    The matrix comes from ``make_sum_csr`` when ``sum_col`` is given and
    from ``make_count_csr`` otherwise; the model is fitted on its transpose.
    NumPy's global RNG is seeded with ``RANDOM_STATE`` right before fitting.
    """
    if sum_col is not None:
        interactions = make_sum_csr(
            df,
            index_col=index_col,
            value_col=value_col,
            col_to_sum=sum_col,
        )
    else:
        interactions = make_count_csr(
            df, index_col=index_col, value_col=value_col)

    als = AlternatingLeastSquares(
        factors=n_factors,
        dtype=np.float32,
        iterations=n_iterations,
        regularization=0.1,
        # GPU is disabled (previously considered for n_factors >= 32).
        use_gpu=False,
    )

    np.random.seed(RANDOM_STATE)
    als.fit(interactions.T)
    return als.user_factors
class ALSEstimator(BaseEstimator, TransformerMixin):
    """scikit-learn style wrapper around ``AlternatingLeastSquares``.

    :param factors: number of latent factors
    :param regularization: regularization strength passed to the model
    :param iterations: number of ALS iterations
    :param filter_seen: when true, entries that were non-zero in the
        training matrix get the score ``-99`` in ``predict``
    """

    def __init__(self,
                 factors=50,
                 regularization=0.01,
                 iterations=10,
                 filter_seen=True):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.filter_seen = filter_seen

    def fit(self, X, y=None):
        """Train the ALS model on ``X`` and return ``self``.

        :param X: sparse interaction matrix handed to the model as-is
        :param y: ignored, present for scikit-learn compatibility
        """
        self.model = AlternatingLeastSquares(
            factors=self.factors,
            regularization=self.regularization,
            iterations=self.iterations,
            dtype=np.float64,
            use_native=True,
            use_cg=True)
        self.model.fit(X)
        # Fixed typo: was ``self.fiter_seen`` (AttributeError on every fit).
        if self.filter_seen:
            # Kept so that ``predict`` can mask already-seen entries.
            self.fit_X = X
        return self

    def predict(self, X, y=None):
        """Return the dense score matrix ``item_factors @ user_factors.T``.

        ``X`` and ``y`` are not used; scores are always computed for the
        matrix seen in ``fit``.
        """
        predictions = np.dot(self.model.item_factors,
                             self.model.user_factors.T)
        if self.filter_seen:
            # Fixed attribute name: ``fit`` stores ``fit_X``, not ``fit_x``.
            predictions[self.fit_X.nonzero()] = -99
        return predictions
def test_cg_nan2(self):
    """Regression test: all factors stay finite on a very sparse random
    matrix for every available backend."""
    # test out Nan appearing in CG code
    # (from https://github.com/benfred/implicit/issues/106)
    sparse_coo = random(m=100,
                        n=100,
                        density=0.0005,
                        format='coo',
                        dtype=np.float32,
                        random_state=42,
                        data_rvs=None)
    Ciu = sparse_coo.T.tocsr()

    # CPU: native extension first, then pure python.
    configs = [{'use_native': native, 'use_gpu': False}
               for native in (True, False)]
    if HAS_CUDA:
        configs.append({'use_gpu': True})

    for backend_options in configs:
        als = AlternatingLeastSquares(factors=32,
                                      regularization=10,
                                      iterations=10,
                                      dtype=np.float32,
                                      **backend_options)
        als.fit(Ciu, show_progress=False)

        self.assertTrue(np.isfinite(als.item_factors).all())
        self.assertTrue(np.isfinite(als.user_factors).all())
def calculate_similar_event(path, output_filename):
    """Train ALS on binarised event data and write similar-event triples.

    For every event that has at least one stored entry, one line
    ``name,other_name,score`` is written to ``output_filename`` per similar
    event returned by the model. Events are processed in descending order
    of their number of stored entries.
    """
    model = AlternatingLeastSquares()

    frame_a, frame_b = read_event_data(path)
    event, users = hfd5_from_dataframe(frame_a, frame_b, output_filename)

    # Treat every remaining interaction as a binary preference.
    users.eliminate_zeros()
    users.data = np.ones(len(users.data))

    log.info("Start fitting")
    model.fit(users)

    # Stored entries per row, used to process the busiest events first.
    user_count = np.ediff1d(users.indptr)
    n_events = len(event)
    to_generate = sorted(np.arange(n_events), key=lambda x: -user_count[x])
    n_similar = int(n_events * 2 / 3)

    with tqdm.tqdm(total=len(to_generate)) as progress, \
            codecs.open(output_filename, "w", "utf-8") as o:
        for eventid in to_generate:
            row_is_empty = users.indptr[eventid] == users.indptr[eventid + 1]
            if not row_is_empty:
                name = event[eventid]
                for other, score in model.similar_items(eventid, n_similar):
                    o.write(f"{name},{event[other]},{score}\n")
            progress.update(1)
def test_factorize(self):
    """Every solver/dtype/backend combination must reconstruct the binary
    count matrix from the learned factors to within 1e-4."""
    counts = csr_matrix(
        [
            [1, 1, 0, 1, 0, 0],
            [0, 1, 1, 1, 0, 0],
            [1, 0, 1, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [0, 0, 1, 1, 0, 1],
            [0, 1, 0, 0, 0, 1],
            [0, 0, 0, 0, 1, 1],
        ],
        dtype=np.float64,
    )
    user_items = counts * 2

    # try all 8 variants of native/python, cg/cholesky, and
    # 64 vs 32 bit factors
    options = []
    for factor_dtype in (np.float32, np.float64):
        for cg_flag in (False, True):
            for native_flag in (False, True):
                options.append((factor_dtype, cg_flag, native_flag, False))

    # also try out GPU support if available
    if HAS_CUDA:
        options.append((np.float32, False, False, True))

    n_rows, n_cols = counts.shape
    for dtype, use_cg, use_native, use_gpu in options:
        variant = (dtype, use_cg, use_native, use_gpu)
        try:
            model = AlternatingLeastSquares(
                factors=6,
                regularization=0,
                dtype=dtype,
                use_native=use_native,
                use_cg=use_cg,
                use_gpu=use_gpu,
                random_state=42,
            )
            model.fit(user_items, show_progress=False)
            rows, cols = model.item_factors, model.user_factors
            if use_gpu:
                # GPU factors are converted to NumPy before the comparison.
                rows, cols = rows.to_numpy(), cols.to_numpy()
        except Exception as e:
            self.fail(msg="failed to factorize matrix. Error=%s"
                      " dtype=%s, cg=%s, native=%s gpu=%s" %
                      ((e,) + variant))

        reconstructed = rows.dot(cols.T)
        for i in range(n_rows):
            for j in range(n_cols):
                self.assertAlmostEqual(
                    counts[i, j],
                    reconstructed[i, j],
                    delta=0.0001,
                    msg="failed to reconstruct row=%s, col=%s,"
                    " value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s" %
                    ((i, j, reconstructed[i, j]) + variant),
                )
def benchmark_implicit(matrix, factors, reg, iterations):
    """Return the wall-clock seconds needed to construct and fit a
    conjugate-gradient ALS model on ``matrix``."""
    started_at = time.time()
    als = AlternatingLeastSquares(factors,
                                  use_cg=True,
                                  iterations=iterations,
                                  regularization=reg)
    als.fit(matrix)
    elapsed = time.time() - started_at
    return elapsed
def _prep_for_fit(self, train_obs, **fit_params):
    """Load the training data, apply ``fit_params`` and create a fresh ALS
    model ready for fitting."""
    # (single-thread MKL/BLAS toggle is intentionally left disabled)
    self._set_data(train_obs)
    self.set_params(**fit_params)

    self.model = als = AlternatingLeastSquares(**self.model_params)
    # cg_steps is not accepted by __init__(), so set it on the instance.
    cg_steps = self.fit_params['cg_steps']
    als.cg_steps = cg_steps

    self._set_implib_train_mat(self.train_mat)
class AlsRecommender(OwnRecommender):
    """Recommender backed by a trained ALS model.

    Input
    -----
    ds: RecommenderDataset
        a prepared RecommenderDataset object
    """

    def fit(self, n_factors=20, regularization=0.001, iterations=15,
            num_threads=4):
        """Train the ALS model on ``self.ds.csr_matrix`` and return self."""
        self.model = AlternatingLeastSquares(factors=n_factors,
                                             regularization=regularization,
                                             iterations=iterations,
                                             num_threads=num_threads)
        self.model.fit(self.ds.csr_matrix)
        return self

    def _similarItems(self, userId, N=5):
        """Recommend items similar to the user's top-N purchased items."""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        def _get_similar_item(item_id):
            """Return the closest item to ``item_id`` other than itself."""
            recs = self.model.similar_items(self.ds.itemid_to_id[item_id],
                                            N=2)
            # The second result is taken (the first is presumably the item
            # itself); fall back to the item when nothing else is returned.
            if len(recs) > 1:
                top_rec = recs[1][0]
                return self.ds.id_to_itemid[top_rec]
            return item_id

        res = [_get_similar_item(item) for item in self.ds.userTop(userId, N)]
        # NOTE(review): the early return uses ``self.ds.extend`` while this
        # one uses ``self.extend`` -- confirm both exist and agree.
        return self.extend(res, N)

    def _similarUsers(self, userId, N=5):
        """Recommend the top item of each of the N most similar users."""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        similar_users = [
            rec[0] for rec in self.model.similar_users(
                self.ds.userid_to_id[userId], N=N + 1)
        ]
        # Skip the first result (presumably the query user itself).
        similar_users = similar_users[1:]

        res = []
        for user in similar_users:
            # Bug fix: the original called ``userTop(userId, 1)`` here, so
            # the loop only repeated the query user's own top item.
            # ``similar_users`` yields internal indices; ``self.ds.userids``
            # is assumed to map them back to external ids, as in
            # ``users_embedings`` -- TODO confirm indexing is positional.
            neighbour_id = self.ds.userids[user]
            res.extend(self.ds.userTop(neighbour_id, 1))
        return self.extend(res, N)

    def items_embedings(self):
        """Return item factors as a DataFrame with an ``item_id`` column."""
        emb = pd.DataFrame(data=self.model.item_factors).add_prefix('itm')
        emb['item_id'] = self.ds.itemids
        return emb

    def users_embedings(self):
        """Return user factors as a DataFrame with a ``user_id`` column."""
        emb = pd.DataFrame(data=self.model.user_factors).add_prefix('usr')
        emb['user_id'] = self.ds.userids
        return emb
def fit(self, n_factors=20, regularization=0.001, iterations=15,
        num_threads=4):
    """Train the ALS model on ``self.ds.csr_matrix`` and return ``self``."""
    hyper_params = dict(
        factors=n_factors,
        regularization=regularization,
        iterations=iterations,
        num_threads=num_threads,
    )
    self.model = AlternatingLeastSquares(**hyper_params)
    self.model.fit(self.ds.csr_matrix)
    return self
def fit(user_item_matrix, factors=20, regularization=0.001, iterations=15):
    """Train an ALS model on the transpose of ``user_item_matrix``.

    :param user_item_matrix: interaction matrix accepted by ``csr_matrix``
    :return: the fitted ``AlternatingLeastSquares`` model
    """
    als = AlternatingLeastSquares(
        iterations=iterations,
        regularization=regularization,
        factors=factors,
    )
    transposed = csr_matrix(user_item_matrix).T
    als.fit(transposed.tocsr())
    return als
def __init__(self, k, reg=1e-4, n_iters=15):
    """Store the hyper-parameters and create the ALS model.

    :param k: number of latent factors
    :param reg: regularization strength
    :param n_iters: number of ALS iterations
    """
    super().__init__()
    self.k, self.reg, self.n_iters = k, reg, n_iters
    self.als = AlternatingLeastSquares(
        k,
        iterations=n_iters,
        regularization=reg,
    )
def _add_als_recs(self, n_factors=20, regularization=0.001, iterations=20,
                  num_threads=0):
    """Train ALS and attach its recommendations and embeddings.

    Side effects:
      * ``self.als_model`` holds the fitted model;
      * ``self.df_users['als_recommender']`` holds a list of recommended
        item ids per user (empty list for rows with a null ``id``);
      * user/item factors are left-merged into ``self.df_users`` /
        ``self.df_items`` on ``id`` as ``als_user_factor_*`` /
        ``als_item_factor_*`` columns.
    """
    als_model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)

    # Build the sparse matrix once. The original rebuilt it from
    # ``self.user_item_matrix`` for every user inside the mapping function.
    user_items = csr_matrix(self.user_item_matrix).tocsr()
    als_model.fit(user_items.T.tocsr())
    self.als_model = als_model

    def als_recs(user_idx):
        """Return recommended item ids for one internal user index."""
        recs = als_model.recommend(
            userid=int(user_idx),
            user_items=user_items,
            N=self.first_model_rec_limit,
            # Item 999999 is always excluded (presumably a placeholder
            # item -- confirm against the data preparation code).
            filter_items=[self.itemid_to_id[999999]],
            recalculate_user=True,
            filter_already_liked_items=False)
        return [self.id_to_itemid[rec[0]] for rec in recs]

    self.df_users['als_recommender'] = None
    has_id = ~self.df_users['id'].isnull()
    self.df_users.loc[has_id, 'als_recommender'] = \
        self.df_users.loc[has_id, 'id'].map(als_recs)
    # Rows without recommendations get an empty list.
    self.df_users['als_recommender'] = self.df_users['als_recommender'].map(
        lambda val: val if isinstance(val, list) else [])

    # adding embeddings to df_users and df_items as features
    user_factors = self.als_model.user_factors
    als_user_factors = pd.DataFrame(
        user_factors,
        columns=[f'als_user_factor_{i}'
                 for i in range(user_factors.shape[1])])
    # The row position doubles as the internal id used for the merge.
    als_user_factors['id'] = als_user_factors.index
    self.df_users = pd.merge(left=self.df_users,
                             right=als_user_factors,
                             on='id',
                             how='left')

    item_factors = self.als_model.item_factors
    als_item_factors = pd.DataFrame(
        item_factors,
        columns=[f'als_item_factor_{i}'
                 for i in range(item_factors.shape[1])])
    als_item_factors['id'] = als_item_factors.index
    self.df_items = pd.merge(left=self.df_items,
                             right=als_item_factors,
                             on='id',
                             how='left')
def main(params):
    """Main function.

    Reads star data from BigQuery, trains an ALS model and returns the
    repositories most similar to ``params['reference_repo']``.

    :param params: dict with ``reference_repo``, ``GC_SVC_PRIVATE_KEY_ID``
        and ``GC_SVC_PRIVATE_KEY``
    :return: dict with ``reference_repo``, ``similar_repos`` and ``error``
        on success; a dict with only ``error`` or ``message`` otherwise
    """
    # check for mandatory params
    if 'reference_repo' not in params:
        return {'error': 'Mandatory param reference_repo not present'}

    reference_repo = params['reference_repo']
    LOGGER.info('reference_repo %s' % reference_repo)

    # get data; the service-account credentials are written into the
    # module-level _GC_SVC_ACCOUNT dict
    LOGGER.info('read GBQ data')
    _GC_SVC_ACCOUNT['private_key_id'] = params['GC_SVC_PRIVATE_KEY_ID']
    _GC_SVC_ACCOUNT['private_key'] = params['GC_SVC_PRIVATE_KEY']
    data = pd.io.gbq.read_gbq(
        _QUERY,
        dialect="standard",
        project_id=_GC_SVC_ACCOUNT['project_id'],
        private_key=json.dumps(_GC_SVC_ACCOUNT),
    )

    # map each repo and user to a unique numeric value
    for column in ('user', 'repo'):
        data[column] = data[column].astype("category")

    # dictionaries to translate names to ids and vice-versa
    repo_categories = data['repo'].cat.categories
    repos = dict(enumerate(repo_categories))
    repo_ids = {name: idx for idx, name in repos.items()}

    if reference_repo not in repo_ids:
        return {"message": "No result. Reference repo not in training set."}

    # create a sparse repo x user matrix with a 1 for every row of data
    row_codes = data['repo'].cat.codes.copy()
    col_codes = data['user'].cat.codes.copy()
    stars = coo_matrix((np.ones(data.shape[0]), (row_codes, col_codes)))

    # train model
    LOGGER.info('training model')
    model = AlternatingLeastSquares(
        factors=50,
        regularization=0.01,
        dtype=np.float64,  # pylint: disable=no-member
        iterations=50)

    confidence = 40
    model.fit(confidence * stars)

    similar_ids = model.similar_items(repo_ids[reference_repo])
    LOGGER.info('found %d similar repos' % len(similar_ids))

    # The first result is skipped (presumably the reference repo itself).
    similar_repos = [repos[entry[0]] for entry in similar_ids[1:]]

    return {
        'reference_repo': reference_repo,
        'similar_repos': similar_repos,
        'error': ''
    }
def fit(user_item_matrix):
    """Train an ALS model with fixed hyper-parameters (100 factors,
    regularization 0.01, 15 iterations, 4 threads) on the transpose of
    ``user_item_matrix`` and return it."""
    fixed_params = {
        'factors': 100,
        'regularization': 0.01,
        'iterations': 15,
        'num_threads': 4,
    }
    als = AlternatingLeastSquares(**fixed_params)
    item_user = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(item_user)
    return als
def fit(user_item_matrix, n_factors=20, regularization=0.1, iterations=40,
        num_threads=0):
    """Train an ALS model on the transpose of ``user_item_matrix``.

    :param user_item_matrix: interaction matrix accepted by ``csr_matrix``
    :param n_factors: number of latent factors
    :param regularization: regularization strength
    :param iterations: number of ALS iterations
    :param num_threads: thread count forwarded to the model
    :return: the fitted model
    """
    trained = AlternatingLeastSquares(
        num_threads=num_threads,
        iterations=iterations,
        regularization=regularization,
        factors=n_factors,
    )
    sparse_user_item = csr_matrix(user_item_matrix)
    trained.fit(sparse_user_item.T.tocsr())
    return trained
def _train_als(hyperparameters, train):
    """Fit and return an ALS model.

    :param hyperparameters: mapping with keys ``'factors'`` and ``'n_iter'``
    :param train: training matrix passed to ``model.fit`` unchanged
    :return: the fitted model

    The thread count comes from the module-level ``nproc``.
    """
    n_factors = hyperparameters['factors']
    n_iterations = hyperparameters['n_iter']
    als = AlternatingLeastSquares(factors=n_factors,
                                  iterations=n_iterations,
                                  num_threads=nproc)
    als.fit(train)
    # Precision@k evaluation on the test/validation splits is not run here.
    return als
def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15,
        num_threads=4, show_progress=False):
    """Train an ALS model on the transpose of ``user_item_matrix``.

    :param user_item_matrix: interaction matrix accepted by ``csr_matrix``
    :param show_progress: forwarded to ``model.fit``
    :return: the fitted model
    """
    als = AlternatingLeastSquares(
        factors=n_factors,
        regularization=regularization,
        iterations=iterations,
        num_threads=num_threads,
    )
    training_matrix = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(training_matrix, show_progress=show_progress)
    return als
def __init__(self, params=None, nunique_feature=None):
    """Create the three ALS models from one shared parameter set.

    :param params: dict of model keyword arguments plus the extra key
        ``"c"``, which is stored on ``self.c`` and not forwarded to the
        models. Defaults to ``{"c": None}``. The dict is never modified.
    :param nunique_feature: stored as-is on the instance
    """
    # The original used a mutable default dict and ran ``del params["c"]``
    # on it: the second default-argument call raised KeyError and any
    # caller-supplied dict lost its "c" entry. Work on a private copy.
    if params is None:
        params = {"c": None}
    self.params = params.copy()

    model_params = dict(params)
    self.c = model_params.pop("c", None)

    self.model = ALS(**model_params)
    self.song_model = ALS(**model_params)
    self.tag_model = ALS(**model_params)
    self.song_rec_csr = None
    self.tag_rec_csr = None
    self.nunique_feature = nunique_feature
def calculate_similar_movies(output_filename,
                             model_name="als",
                             min_rating=4.0,
                             variant="20m"):
    """Train the chosen model on MovieLens and write similar-movie triples.

    For every movie with at least one rating left after filtering, one
    tab-separated line ``title, other_title, score`` is written per result
    of ``model.similar_items(movieid, 11)``.

    :param output_filename: path of the file to write
    :param model_name: one of ``als``, ``bpr``, ``lmf``, ``tfidf``,
        ``cosine``, ``bm25``
    :param min_rating: ratings below this value are discarded
    :param variant: dataset variant passed to ``get_movielens``
    :raises NotImplementedError: for an unknown ``model_name``
    """
    # read in the input data file
    load_started = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    below_threshold = ratings.data < min_rating
    ratings.data[below_threshold] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - load_started)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()
        # ALS is trained on a BM25-weighted version of the matrix.
        log.debug("weighting matrix by bm25_weight")
        weighted = bm25_weight(ratings, B=0.9) * 5
        ratings = weighted.tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name,
              time.time() - fit_started)

    log.debug("calculating top movies")
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress, \
            codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # Movies with no ratings left after filtering are skipped (for
            # instance 'Graffiti Bridge' has no ratings > 4).
            row_start = ratings.indptr[movieid]
            row_end = ratings.indptr[movieid + 1]
            if row_start != row_end:
                title = titles[movieid]
                for other, score in model.similar_items(movieid, 11):
                    o.write("%s\t%s\t%s\n" % (title, titles[other], score))
            progress.update(1)
def collab_filter(song_id, user_song_df, num_songs=5):
    '''
    Return the rows of user_song_df for the songs most similar to song_id.

    song_id = spotify id for individual song
    user_song_df = dataframe with users, songs, playcounts etc
        (columns used: spotify_id, song_nums, user_nums, size)
    num_songs = number of similar songs requested from the model

    Filtering by key/tempo is not enabled yet because there are not enough
    songs; it may be added later.
    '''
    matching_rows = user_song_df[user_song_df.spotify_id == song_id]
    song_num = matching_rows.song_nums.values[0]
    print(song_num)
    print(type(song_num))

    # No key/tempo refinement for now: the full frame is used as-is.
    play_counts = user_song_df['size']
    row_ids = user_song_df.song_nums
    col_ids = user_song_df.user_nums
    song_user_matrix = coo_matrix((play_counts, (row_ids, col_ids))).tocsr()

    model = AlternatingLeastSquares(factors=30)
    model.fit(song_user_matrix)

    similar = model.similar_items(song_num, N=num_songs)
    similar_song_nums = [pair[0] for pair in similar]

    is_similar = user_song_df.song_nums.isin(similar_song_nums)
    return user_song_df[is_similar]
def load_recommender(als_model_file: str,
                     index_file: str,
                     item_feature_file: str = None,
                     **kwargs) -> ImplicitRecommender:
    """Rebuild a recommender from a saved ALS model and optional ANN index.

    :param als_model_file: file readable by ``np.load`` containing
        ``model.item_factors``, ``user_labels``, ``item_labels`` and
        optionally ``model.user_factors``
    :param index_file: ``None`` for a plain recommender, a ``.ann`` file
        for an annoy index or a ``.hnsw`` file for an hnswlib index
    :param item_feature_file: optional pickle with item feature data; only
        used together with an annoy index
    :param kwargs: ``ef`` (default 2000) is forwarded to the hnsw index
    :return: the recommender matching the index type
    :raises RecommenderException: for an unsupported index file extension
    """
    log.info("Loading als model")
    # NOTE(security): allow_pickle=True can execute arbitrary code; only
    # load model files from trusted sources.
    data = np.load(als_model_file, allow_pickle=True)

    item_factors = data['model.item_factors']
    model = AlternatingLeastSquares(factors=item_factors.shape[1])
    model.item_factors = item_factors
    model.YtY  # This will initialize the _YtY instance variable which is used directly in internal methods

    # Bug fix: the original tested for the key 'user_factors' but then read
    # 'model.user_factors'. Test for the key that is actually read.
    if 'model.user_factors' in data:
        model.user_factors = data['model.user_factors']

    user_labels = data['user_labels']
    item_labels = data['item_labels']

    if index_file is None:
        return ImplicitRecommender(model, user_labels, item_labels)

    if index_file.endswith('.ann'):
        import annoy
        log.info("Loading annoy recommendation index")
        max_norm, extra = augment_inner_product_matrix(model.item_factors)
        recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
        recommend_index.load(
            index_file)  # prefault=load_to_memory does not seem to work

        if item_feature_file is None:
            from .annoy import ImplicitAnnoyRecommender
            return ImplicitAnnoyRecommender(model, recommend_index, max_norm,
                                            user_labels, item_labels)

        log.info("Loading item features for recommendation")
        # NOTE(security): pickle.load must only be used on trusted files.
        # The file is now closed deterministically.
        with open(item_feature_file, "rb") as feature_file:
            item_feature_data = pickle.load(feature_file)
        tag_tfidf_transformer = item_feature_data['tag_tfidf_transformer']
        tag_lookup = item_feature_data['tag_lookup']
        item_embedding_weight = item_feature_data['item_embedding_weight']
        from .annoy_item_features import ImplicitAnnoyItemFeatureRecommender
        return ImplicitAnnoyItemFeatureRecommender(
            model, recommend_index, max_norm, user_labels, item_labels,
            tag_tfidf_transformer, tag_lookup, item_embedding_weight)

    if index_file.endswith('.hnsw'):
        import hnswlib
        from .hnsw import ImplicitHNSWRecommender
        log.info("Loading hnsw recommendation index")
        # we build the index in l2 space and load it in inner product space
        # on purpose. This space change gives us 0.96 recall
        l2_recommend_index = hnswlib.Index(space='ip',
                                           dim=model.item_factors.shape[1])
        l2_recommend_index.load_index(index_file)
        l2_recommend_index.set_ef(kwargs.get('ef', 2000))
        return ImplicitHNSWRecommender(model, l2_recommend_index, user_labels,
                                       item_labels)

    # Separator added so the file name is readable in the message.
    raise RecommenderException("Unsupported file type: " + index_file)
def fit(self, user_item_matrix, n_factors, regularization=0.001,
        iterations=50, num_threads=1, use_gpu=False):
    """Train an ALS model on the transpose of ``user_item_matrix``.

    The fitted model is returned; nothing is stored on ``self``.

    :param user_item_matrix: interaction matrix accepted by ``csr_matrix``
    :param n_factors: number of latent factors
    :param use_gpu: forwarded to the model constructor
    :return: the fitted model
    """
    als = AlternatingLeastSquares(
        use_gpu=use_gpu,
        num_threads=num_threads,
        iterations=iterations,
        regularization=regularization,
        factors=n_factors,
    )
    item_user = csr_matrix(user_item_matrix).T
    als.fit(item_user.tocsr())
    return als
def fit(user_item_matrix, n_factors=32, regularization=0.001, iterations=15,
        num_threads=16):
    """Train an ALS model (with training-loss calculation enabled) on the
    transpose of ``user_item_matrix`` and return it."""
    settings = dict(
        factors=n_factors,
        regularization=regularization,
        iterations=iterations,
        calculate_training_loss=True,
        num_threads=num_threads,
    )
    als = AlternatingLeastSquares(**settings)
    training_matrix = csr_matrix(user_item_matrix).T.tocsr()
    als.fit(training_matrix)
    return als
def test_cg_nan2(self):
    """Factors must remain finite on an extremely sparse random matrix."""
    # test out Nan appearing in CG code
    # (from https://github.com/benfred/implicit/issues/106)
    Ciu = random(m=100, n=100, density=0.0005, format='coo',
                 dtype=np.float32, random_state=42,
                 data_rvs=None).T.tocsr()

    native_cpu = {'use_native': True, 'use_gpu': False}
    python_cpu = {'use_native': False, 'use_gpu': False}
    configs = [native_cpu, python_cpu]
    if HAS_CUDA:
        configs.append({'use_gpu': True})

    for options in configs:
        model = AlternatingLeastSquares(factors=32, regularization=10,
                                        iterations=10, dtype=np.float32,
                                        **options)
        model.fit(Ciu, show_progress=False)

        items_finite = np.isfinite(model.item_factors).all()
        self.assertTrue(items_finite)
        users_finite = np.isfinite(model.user_factors).all()
        self.assertTrue(users_finite)
def test_factorize(self):
    """Each solver/dtype/backend variant must reconstruct the binary count
    matrix from its factors to within 1e-4."""
    counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                         [0, 1, 1, 1, 0, 0],
                         [1, 0, 1, 0, 0, 0],
                         [1, 1, 0, 0, 0, 0],
                         [0, 0, 1, 1, 0, 1],
                         [0, 1, 0, 0, 0, 1],
                         [0, 0, 0, 0, 1, 1]], dtype=np.float64)
    user_items = counts * 2

    # try all 8 variants of native/python, cg/cholesky, and
    # 64 vs 32 bit factors
    options = []
    for factor_dtype in (np.float32, np.float64):
        for cg_flag in (False, True):
            options.extend((factor_dtype, cg_flag, native_flag, False)
                           for native_flag in (False, True))

    # also try out GPU support if available
    if HAS_CUDA:
        options.append((np.float32, False, False, True))

    n_rows, n_cols = counts.shape
    for dtype, use_cg, use_native, use_gpu in options:
        variant = (dtype, use_cg, use_native, use_gpu)
        try:
            model = AlternatingLeastSquares(factors=6,
                                            regularization=0,
                                            dtype=dtype,
                                            use_native=use_native,
                                            use_cg=use_cg,
                                            use_gpu=use_gpu)
            # Seed NumPy's global RNG right before fitting.
            np.random.seed(23)
            model.fit(user_items, show_progress=False)
            rows, cols = model.item_factors, model.user_factors
        except Exception as e:
            self.fail(msg="failed to factorize matrix. Error=%s"
                          " dtype=%s, cg=%s, native=%s gpu=%s"
                          % ((e,) + variant))

        reconstructed = rows.dot(cols.T)
        for i in range(n_rows):
            for j in range(n_cols):
                self.assertAlmostEqual(
                    counts[i, j], reconstructed[i, j], delta=0.0001,
                    msg="failed to reconstruct row=%s, col=%s,"
                        " value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s"
                        % ((i, j, reconstructed[i, j]) + variant))
def calculate_similar_movies(output_filename,
                             model_name="als",
                             min_rating=4.0,
                             variant='20m'):
    """Train the chosen model on MovieLens and write similar-movie triples.

    One tab-separated line ``title, other_title, score`` is written per
    result of ``model.similar_items(movieid, 11)`` for each movie that still
    has ratings after filtering.

    :param output_filename: path of the file to write
    :param model_name: one of ``als``, ``bpr``, ``tfidf``, ``cosine``,
        ``bm25``
    :param min_rating: ratings below this value are discarded
    :param variant: dataset variant passed to ``get_movielens``
    :raises NotImplementedError: for an unknown ``model_name``
    """
    # read in the input data file
    read_started = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    low_mask = ratings.data < min_rating
    ratings.data[low_mask] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - read_started)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()
        # ALS is trained on a BM25-weighted version of the matrix.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    train_started = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name,
              time.time() - train_started)

    log.debug("calculating top movies")
    user_count = np.ediff1d(ratings.indptr)
    movie_ids = np.arange(len(titles))
    to_generate = sorted(movie_ids, key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress, \
            codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # Skip movies whose ratings were all filtered out (for instance
            # 'Graffiti Bridge' has no ratings > 4).
            has_ratings = (ratings.indptr[movieid]
                           != ratings.indptr[movieid + 1])
            if has_ratings:
                title = titles[movieid]
                for other, score in model.similar_items(movieid, 11):
                    o.write("%s\t%s\t%s\n" % (title, titles[other], score))
            progress.update(1)
def test_explain(self):
    """Recommendations must match with and without user recalculation, and
    ``explain`` must decompose the top score into per-item contributions."""
    counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                         [0, 1, 1, 1, 0, 0],
                         [1, 4, 1, 0, 7, 0],
                         [1, 1, 0, 0, 0, 0],
                         [9, 0, 4, 1, 0, 1],
                         [0, 1, 0, 0, 0, 1],
                         [0, 0, 2, 0, 1, 1]], dtype=np.float64)
    user_items = counts * 2
    item_users = user_items.T

    model = AlternatingLeastSquares(factors=4,
                                    regularization=20,
                                    use_native=False,
                                    use_cg=False,
                                    iterations=100)
    np.random.seed(23)
    model.fit(user_items, show_progress=False)

    userid = 0

    # Assert recommendation is the same if we recompute user vectors
    recs = model.recommend(userid, item_users, N=10)
    recalculated_recs = model.recommend(userid, item_users, N=10,
                                        recalculate_user=True)
    for stored, recalculated in zip(recs, recalculated_recs):
        item1, score1 = stored
        item2, score2 = recalculated
        self.assertEqual(item1, item2)
        self.assertAlmostEqual(score1, score2, 4)

    # Assert explanation makes sense
    top_rec, score = recalculated_recs[0]
    score_explained, contributions, W = model.explain(
        userid, item_users, itemid=top_rec)
    scores = [contribution for _, contribution in contributions]
    items = [item for item, _ in contributions]
    self.assertAlmostEqual(score, score_explained, 4)
    self.assertAlmostEqual(score, sum(scores), 4)
    self.assertEqual(scores, sorted(scores, reverse=True),
                     "Scores not in order")
    self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

    # Assert explanation with precomputed user weights is correct
    top_score_explained, top_contributions, W = model.explain(
        userid, item_users, itemid=top_rec, user_weights=W, N=2)
    top_scores = [contribution for _, contribution in top_contributions]
    top_items = [item for item, _ in top_contributions]
    self.assertEqual(2, len(top_contributions))
    self.assertAlmostEqual(score, top_score_explained, 4)
    self.assertEqual(scores[:2], top_scores)
    self.assertEqual(items[:2], top_items)
def benchmark_accuracy(plays):
    """Record the per-iteration loss of several ALS solver variants.

    :param plays: interaction matrix used for both fitting and loss
    :return: ``defaultdict(list)`` mapping a variant name (``cg2``, ``cg3``,
        ``cg4``, ``gpu`` when CUDA is available, ``cholesky``) to the list
        of losses reported by that model's ``fit_callback``
    """
    output = defaultdict(list)

    def store_loss(model, name):
        """Build a fit callback that logs and stores the current loss."""
        def inner(iteration, elapsed):
            loss = calculate_loss(plays, model.item_factors,
                                  model.user_factors, 0)
            print("model %s iteration %i loss %.5f" % (name, iteration, loss))
            output[name].append(loss)
        return inner

    # Settings shared by every variant.
    shared = dict(factors=100, use_native=True, regularization=0,
                  iterations=25)

    for steps in [2, 3, 4]:
        cg_model = AlternatingLeastSquares(use_cg=True, **shared)
        cg_model.cg_steps = steps
        cg_model.fit_callback = store_loss(cg_model, 'cg%i' % steps)
        cg_model.fit(plays)

    if has_cuda:
        gpu_model = AlternatingLeastSquares(use_gpu=True, **shared)
        gpu_model.fit_callback = store_loss(gpu_model, 'gpu')
        gpu_model.use_gpu = True
        gpu_model.fit(plays)

    cholesky_model = AlternatingLeastSquares(use_cg=False, **shared)
    cholesky_model.fit_callback = store_loss(cholesky_model, 'cholesky')
    cholesky_model.fit(plays)

    return output
def benchmark_times(plays, iterations=3):
    """Time several ALS solver variants for factor counts 32, 64, ..., 256.

    :param plays: interaction matrix to fit
    :param iterations: ALS iterations per run
    :return: ``defaultdict(list)`` with a ``'factors'`` list and, per
        variant name, the minimum elapsed time reported by the fit callback
        for each factor count
    """
    times = defaultdict(lambda: defaultdict(list))

    def store_time(model, name):
        """Build a fit callback that logs and stores the elapsed time."""
        def inner(iteration, elapsed):
            print(name, model.factors, iteration, elapsed)
            times[name][model.factors].append(elapsed)
        return inner

    output = defaultdict(list)
    for factors in range(32, 257, 32):
        # Settings shared by every variant at this factor count.
        shared = dict(factors=factors, use_native=True, regularization=0,
                      iterations=iterations)

        for steps in [2, 3, 4]:
            cg_model = AlternatingLeastSquares(use_cg=True, **shared)
            cg_model.fit_callback = store_time(cg_model, 'cg%i' % steps)
            cg_model.cg_steps = steps
            cg_model.fit(plays)

        cholesky_model = AlternatingLeastSquares(use_cg=False, **shared)
        cholesky_model.fit_callback = store_time(cholesky_model, 'cholesky')
        cholesky_model.fit(plays)

        if has_cuda:
            gpu_model = AlternatingLeastSquares(use_gpu=True, **shared)
            gpu_model.fit_callback = store_time(gpu_model, 'gpu')
            gpu_model.fit(plays)

        # take the min time for the output
        output['factors'].append(factors)
        for name, stats in times.items():
            fastest = min(stats[factors])
            output[name].append(fastest)

    return output