def _train_lmf(hyperparameters, train):
    """Fit a ``LogisticMatrixFactorization`` model on ``train`` and return it.

    ``hyperparameters`` is indexed with the keys ``'factors'`` and
    ``'n_iter'``; the thread count is taken from the module-level ``nproc``.
    No evaluation is performed here.
    """
    lmf = LogisticMatrixFactorization(
        factors=hyperparameters['factors'],
        iterations=hyperparameters['n_iter'],
        num_threads=nproc,
    )
    lmf.fit(train)
    return lmf
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Train a recommender on MovieLens and write similar-movie rows to a file.

    The ratings returned by ``get_movielens(variant)`` are binarised (entries
    below ``min_rating`` are dropped, the rest become 1.0), a model selected by
    ``model_name`` is fitted on them, and for every movie that still has at
    least one rating the output of ``model.similar_items(movieid, 11)`` is
    written to ``output_filename`` as tab-separated
    ``title, other title, score`` lines.

    Raises:
        NotImplementedError: if ``model_name`` is not one of the known models.
    """
    load_started = time.time()
    titles, ratings = get_movielens(variant)

    # Turn explicit ratings into an implicit, binary-preference dataset:
    # anything under the threshold is removed, everything else becomes 1.
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - load_started)

    # Pick the recommender. ALS is special-cased because it also reweights
    # the ratings matrix; the others are plain constructors.
    if model_name == "als":
        model = AlternatingLeastSquares()
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    else:
        factories = {
            "bpr": lambda: BayesianPersonalizedRanking(),
            "lmf": lambda: LogisticMatrixFactorization(),
            "tfidf": lambda: TFIDFRecommender(),
            "cosine": lambda: CosineRecommender(),
            "bm25": lambda: BM25Recommender(B=0.2),
        }
        make_model = factories.get(model_name)
        if make_model is None:
            raise NotImplementedError("TODO: model %s" % model_name)
        model = make_model()

    log.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - fit_started)

    log.debug("calculating top movies")
    # Number of stored entries per row of the ratings matrix.
    user_count = np.ediff1d(ratings.indptr)

    def by_descending_count(movie_index):
        return -user_count[movie_index]

    to_generate = sorted(np.arange(len(titles)), key=by_descending_count)

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress, \
            codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            row_start = ratings.indptr[movieid]
            row_end = ratings.indptr[movieid + 1]
            # An empty row means every rating for this movie was filtered out
            # above (e.g. 'Graffiti Bridge' has no ratings > 4), so there is
            # nothing to compute similarities from.
            if row_start != row_end:
                title = titles[movieid]
                for other, score in model.similar_items(movieid, 11):
                    o.write("%s\t%s\t%s\n" % (title, titles[other], score))
            # The progress bar counts every movie, skipped or not.
            progress.update(1)
def evaluate_lmf_model(hyperparameters, train, test, validation):
    """Fit an LMF model on ``train`` and score it on ``test`` and ``validation``.

    ``hyperparameters`` is indexed with the keys ``'factors'`` and
    ``'n_iter'``; the thread count is taken from the module-level ``nproc``.

    Returns:
        A ``(test_eval, val_eval)`` tuple of dicts, each holding the
        ``precision_at_k`` result (``K=10``) under the key ``'p@k'``.
    """
    model = LogisticMatrixFactorization(
        factors=hyperparameters['factors'],
        iterations=hyperparameters['n_iter'],
        num_threads=nproc,
    )
    model.fit(train)

    # Both evaluations need the same transposed training matrix; convert it
    # once instead of repeating the transpose + CSR conversion per call.
    train_transposed = train.T.tocsr()

    test_eval = {
        'p@k': precision_at_k(model, train_transposed, test.T.tocsr(), K=10)
    }
    val_eval = {
        'p@k': precision_at_k(model, train_transposed, validation.T.tocsr(), K=10)
    }
    return test_eval, val_eval
class LMF:
    """Convenience wrapper around ``LogisticMatrixFactorization``.

    Holds the underlying model, the transposed interaction matrix built in
    ``fit`` and an optional ``already_liked`` lookup (indexed by user) that is
    used to drop items from caller-supplied candidate lists.
    """

    def __init__(self, factors=200, iterations=100, regularization=1, neg_prop=10, already_liked=None):
        """Create the underlying model; nothing is trained until ``fit``."""
        self.model = LogisticMatrixFactorization(
            factors,
            iterations=iterations,
            regularization=regularization,
            neg_prop=neg_prop,
        )
        self._already_liked = already_liked
        self._fitted = False
        self._user_items = None

    @staticmethod
    def _as_lookup(container):
        """Return ``container`` in a form with fast ``in`` tests when possible.

        Sets, frozensets and dicts are returned untouched. Anything else is
        copied into a set; if that is impossible (unhashable elements or a
        non-iterable container) the original object is returned so membership
        tests behave exactly as they would have on the raw container.
        """
        if isinstance(container, (set, frozenset, dict)):
            return container
        try:
            return set(container)
        except TypeError:
            return container

    def fit(self, data_st, data_item, len_st_set, len_item_set):
        """Build the interaction matrix from index pairs and train the model.

        ``data_item`` / ``data_st`` are parallel sequences of row / column
        indices; every pair gets the value 1 in a matrix of shape
        ``(len_item_set, len_st_set)``. The model is fitted on that matrix and
        its transpose is kept for the later ``recommend`` calls.
        """
        data = csr_matrix(
            ([1] * len(data_st), (data_item, data_st)),
            shape=(len_item_set, len_st_set),
        )
        self._user_items = data.T.tocsr()
        self.model.fit(data, show_progress=False)
        self._fitted = True

    def recommend(self, user, k, selected_items=None):
        """Return up to ``k`` recommendations for ``user``.

        When both ``selected_items`` and the ``already_liked`` lookup are
        available, only the selected items not already liked by the user are
        ranked (an empty selection yields ``[]``). Otherwise the model's own
        ``recommend`` is used with already-liked filtering switched on.

        Raises:
            AssertionError: if the model has not been fitted.
        """
        # Kept as an assert so callers that catch AssertionError keep working.
        assert self._fitted, 'Model is not fitted'

        # NOTE(review): when ``selected_items`` is given but no
        # ``already_liked`` lookup exists, the selection is ignored and the
        # generic recommendation path is taken - confirm this is intended.
        if selected_items is None or self._already_liked is None:
            return self.model.recommend(user, self._user_items, k,
                                        filter_already_liked_items=True)

        # len() rather than truthiness: the selection may be an array-like.
        if len(selected_items) == 0:
            return []

        # Build the lookup once instead of scanning it for every candidate.
        liked = self._as_lookup(self._already_liked[user])
        candidates = [item for item in selected_items if item not in liked]
        return self.model.rank_items(user, self._user_items, candidates)[:k]

    def recommend_item_based(self, k, choice, selected_items=None):
        """Return up to ``k`` ``(item, score)`` pairs similar to ``choice``.

        For each chosen item the model's ``similar_items(item, 5 * k)`` result
        is filtered (chosen items are dropped; if ``selected_items`` is given,
        only its members are kept) and the scores of the surviving items are
        summed across all chosen items. The pairs are returned sorted by
        summed score, highest first.

        Raises:
            AssertionError: if the model has not been fitted or ``choice`` is
                empty.
        """
        assert self._fitted, 'Model is not fitted'
        assert len(choice) != 0, 'Given an empty list of chosen items'

        chosen = self._as_lookup(choice)
        allowed = None if selected_items is None else self._as_lookup(selected_items)

        totals = {}
        for seed in choice:
            # One dict per seed: should an item appear twice in a single
            # similarity list, only its last score counts for that seed.
            per_seed = {}
            for entry in self.model.similar_items(seed, 5 * k):
                item = entry[0]
                if item in chosen:
                    continue
                if allowed is not None and item not in allowed:
                    continue
                per_seed[item] = entry[1]
            for item, score in per_seed.items():
                totals[item] = totals.get(item, 0) + score

        return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)[:k]
def _get_model(self):
    """Return a fresh ``LogisticMatrixFactorization`` with a fixed configuration.

    The instance uses 3 factors, no regularization, the CPU implementation
    and a fixed ``random_state`` of 43.
    """
    settings = dict(factors=3, regularization=0, use_gpu=False, random_state=43)
    return LogisticMatrixFactorization(**settings)
def __init__(self, factors=200, iterations=100, regularization=1, neg_prop=10, already_liked=None):
    """Create the wrapped ``LogisticMatrixFactorization`` and reset state.

    ``already_liked`` is stored as given. The instance starts out unfitted
    and without a stored interaction matrix.
    """
    self.model = LogisticMatrixFactorization(
        factors,
        iterations=iterations,
        regularization=regularization,
        neg_prop=neg_prop,
    )
    # Bookkeeping: not fitted yet, no interaction matrix stored yet.
    self._fitted, self._user_items = False, None
    self._already_liked = already_liked
def calculate_similar_movies(input_filename, output_filename, model_name="als", min_rating=4.0, variant='20m'): # read in the input data file start = time.time() # titles, ratings = get_movielens(variant) user_item_df = read_user_item_data(input_filename) print(user_item_df) unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto( user_item_df) #user_item_df = user_item_df.sort_values(by=['user_index','item_index']) user_item_ratings = scipy.sparse.csr_matrix( (user_item_df['score'], (user_item_df['item_index'], user_item_df['user_index']))) print(user_item_ratings) ''' # remove things < min_rating, and convert to implicit dataset # by considering ratings as a binary preference only ratings.data[ratings.data < min_rating] = 0 ratings.eliminate_zeros() ratings.data = np.ones(len(ratings.data)) ''' log.info("read data file in %s", time.time() - start) # generate a recommender model based off the input params if model_name == "als": model = AlternatingLeastSquares( factors=128, regularization=0.01, use_native=True, iterations=20, calculate_training_loss=True) # lets weight these models by bm25weight. 
log.debug("weighting matrix by bm25_weight") # ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr() elif model_name == "bpr": model = BayesianPersonalizedRanking() elif model_name == "lmf": model = LogisticMatrixFactorization() elif model_name == "tfidf": model = TFIDFRecommender() elif model_name == "cosine": model = CosineRecommender() elif model_name == "bm25": model = BM25Recommender(B=0.2) else: raise NotImplementedError("TODO: model %s" % model_name) # train the model log.debug("training model %s", model_name) start = time.time() model.fit(user_item_ratings) log.debug("trained model '%s' in %s", model_name, time.time() - start) log.debug("calculating top movies") k=10 iterations = 10000 similar_df_gen = similar_to_csv(model, k, unique_item, iterations) with tqdm.tqdm(total=len(unique_item) // iterations + 1) as progress: for similar_df_slice in similar_df_gen: similar_df_slice.to_csv(args.outputfile, mode='a', header=False, index=False) print("finsih a batch") progress.update(1) '''