예제 #1
0
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Train a recommender on MovieLens and write, for every movie, its
    ten most similar titles to ``output_filename`` as TSV rows of
    (title, other title, score).

    Parameters
    ----------
    output_filename : str
        Destination TSV file.
    model_name : str
        One of "als", "bpr", "lmf", "tfidf", "cosine", "bm25".
    min_rating : float
        Ratings below this threshold are discarded before binarizing.
    variant : str
        MovieLens dataset variant passed to ``get_movielens``.
    """
    t0 = time.time()
    titles, ratings = get_movielens(variant)

    # Binarize: drop sub-threshold ratings, keep the rest as 1.0 so the
    # matrix encodes implicit (seen / not seen) preferences only.
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - t0)

    # Build the requested recommender via a dispatch table.
    factories = {
        "als": AlternatingLeastSquares,
        "bpr": BayesianPersonalizedRanking,
        "lmf": LogisticMatrixFactorization,
        "tfidf": TFIDFRecommender,
        "cosine": CosineRecommender,
        "bm25": lambda: BM25Recommender(B=0.2),
    }
    if model_name not in factories:
        raise NotImplementedError("TODO: model %s" % model_name)
    model = factories[model_name]()

    if model_name == "als":
        # ALS trains on a BM25-weighted matrix here.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    log.debug("training model %s", model_name)
    t0 = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - t0)
    log.debug("calculating top movies")

    # Popularity = number of users with a rating in each row; most
    # popular first.
    popularity = np.ediff1d(ratings.indptr)
    ordered = sorted(np.arange(len(titles)), key=lambda idx: -popularity[idx])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(ordered)) as progress:
        with codecs.open(output_filename, "w", "utf8") as out:
            for movie_id in ordered:
                # An empty row means every rating fell below min_rating
                # (e.g. 'Graffiti Bridge'); nothing to report for it.
                if ratings.indptr[movie_id] != ratings.indptr[movie_id + 1]:
                    name = titles[movie_id]
                    for other, score in model.similar_items(movie_id, 11):
                        out.write("%s\t%s\t%s\n" % (name, titles[other], score))
                progress.update(1)
예제 #2
0
파일: caffeine.py 프로젝트: pknn/radii
def calculate_similar_event(path, output_filename):
    """Fit ALS on event/user interaction data read from ``path`` and write
    similar events to ``output_filename`` as CSV rows (name, other, score)."""
    model = AlternatingLeastSquares()

    frame_a, frame_b = read_event_data(path)
    event, users = hfd5_from_dataframe(frame_a, frame_b, output_filename)

    # Treat every remaining interaction as a binary preference.
    users.eliminate_zeros()
    users.data = np.ones(len(users.data))

    log.info("Start fitting")
    model.fit(users)

    # Order events by interaction count, most popular first.
    counts = np.ediff1d(users.indptr)
    ordered = sorted(np.arange(len(event)), key=lambda idx: -counts[idx])

    with tqdm.tqdm(total=len(ordered)) as progress:
        with codecs.open(output_filename, "w", "utf-8") as out:
            for event_id in ordered:
                # Skip events whose matrix row is empty.
                if users.indptr[event_id] != users.indptr[event_id + 1]:
                    name = event[event_id]
                    neighbours = model.similar_items(
                        event_id, int(len(event) * 2 / 3)
                    )
                    for other, score in neighbours:
                        out.write(f"{name},{event[other]},{score}\n")
                progress.update(1)
예제 #3
0
def calculate_similar_movies(output_filename,
                             model_name="als", min_rating=4.0,
                             variant='20m'):
    """Write a TSV of the ten most similar movies for every MovieLens
    title, using the model named by ``model_name``.

    Parameters
    ----------
    output_filename : str
        Destination TSV file.
    model_name : str
        One of "als", "bpr", "tfidf", "cosine", "bm25".
    min_rating : float
        Ratings below this threshold are dropped before binarizing.
    variant : str
        MovieLens dataset variant passed to ``get_movielens``.
    """
    load_started = time.time()
    titles, ratings = get_movielens(variant)

    # Keep only ratings >= min_rating and binarize the remainder so the
    # matrix holds implicit preferences.
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - load_started)

    # Reject unknown model names up front.
    if model_name not in ("als", "bpr", "tfidf", "cosine", "bm25"):
        raise NotImplementedError("TODO: model %s" % model_name)

    if model_name == "als":
        model = AlternatingLeastSquares()
        # ALS trains on a BM25-weighted matrix.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings,  B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    else:  # "bm25"
        model = BM25Recommender(B=0.2)

    log.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - fit_started)
    log.debug("calculating top movies")

    # Most-rated movies come first in the output.
    popularity = np.ediff1d(ratings.indptr)
    ordering = sorted(np.arange(len(titles)), key=lambda idx: -popularity[idx])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(ordering)) as progress:
        with codecs.open(output_filename, "w", "utf8") as out:
            for movie in ordering:
                # Rows emptied by the rating filter are skipped.
                if ratings.indptr[movie] != ratings.indptr[movie + 1]:
                    title = titles[movie]
                    for other, score in model.similar_items(movie, 11):
                        out.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
예제 #4
0
파일: recommenders.py 프로젝트: SergeAA/rs
class AlsRecommender(OwnRecommender):
    """Recommender backed by an ALS model.

    Input
    -----
    ds: RecommenderDataset
        a prepared RecommenderDataset object
    """

    def fit(self, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Train the ALS model on the dataset's sparse matrix and return self."""
        self.model = AlternatingLeastSquares(factors=n_factors,
                                             regularization=regularization,
                                             iterations=iterations,
                                             num_threads=num_threads)
        self.model.fit(self.ds.csr_matrix)

        return self

    def _similarItems(self, userId, N=5):
        """Recommend items similar to the user's top-N purchased items."""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        def _get_similar_item(item_id):
            """Find an item similar to item_id (slot 0 is the item itself)."""
            recs = self.model.similar_items(self.ds.itemid_to_id[item_id], N=2)
            if len(recs) > 1:
                top_rec = recs[1][0]
                return self.ds.id_to_itemid[top_rec]
            return item_id

        res = [_get_similar_item(item) for item in self.ds.userTop(userId, N)]
        return self.extend(res, N)

    def _similarUsers(self, userId, N=5):
        """Recommend top-N items among those bought by similar users."""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        res = []
        similar_users = [rec[0] for rec in self.model.similar_users(self.ds.userid_to_id[userId], N=N+1)]
        similar_users = similar_users[1:]  # drop the query user itself

        for user in similar_users:
            # BUG FIX: previously called userTop(userId, 1), which ignored
            # the similar user entirely and re-queried the source user's
            # own top item on every iteration.
            # NOTE(review): `user` is an internal matrix id from
            # similar_users(); if RecommenderDataset.userTop expects an
            # original user id, translate it first -- confirm against ds.
            res.extend(self.ds.userTop(user, 1))

        return self.extend(res, N)

    def items_embedings(self):
        """Return item latent factors as a DataFrame ('itm*' columns plus item_id)."""
        emb = pd.DataFrame(data=self.model.item_factors).add_prefix('itm')
        emb['item_id'] = self.ds.itemids
        return emb

    def users_embedings(self):
        """Return user latent factors as a DataFrame ('usr*' columns plus user_id)."""
        emb = pd.DataFrame(data=self.model.user_factors).add_prefix('usr')
        emb['user_id'] = self.ds.userids
        return emb
예제 #5
0
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    """Train the chosen model on ratings read from ``input_path`` and dump
    each movie's ten nearest neighbours to ``output_filename`` as TSV rows."""
    logging.debug("reading data from %s", input_path)
    read_start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - read_start)

    # Build the requested recommender.
    if model_name == "als":
        model = AlternatingLeastSquares()
        # ALS trains on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    logging.debug("training model %s", model_name)
    fit_start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - fit_start)
    logging.debug("calculating top movies")

    # Most-rated movies come first in the output.
    user_count = ratings.groupby('movieId').size()
    movie_lookup = {movie_id: title
                    for movie_id, title in zip(movies['movieId'], movies['title'])}
    to_generate = sorted(list(movies['movieId']),
                         key=lambda movie_id: -user_count.get(movie_id, 0))

    with codecs.open(output_filename, "w", "utf8") as out:
        for movieid in to_generate:
            # An empty row means all of this movie's ratings fell below
            # min_rating (for instance 'Graffiti Bridge'); skip it.
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            title = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                out.write("%s\t%s\t%s\n" % (title, movie_lookup[other], score))
예제 #6
0
파일: main.py 프로젝트: luebken/suasor
def main(params):
    """Main function.

    Reads GitHub-star style data from Google BigQuery, trains an implicit
    ALS model on the repo/user matrix and returns repos similar to
    ``params['reference_repo']``.

    Expected keys in ``params``: ``reference_repo`` (mandatory),
    ``GC_SVC_PRIVATE_KEY_ID`` and ``GC_SVC_PRIVATE_KEY`` (service-account
    credentials for the BigQuery read).
    Returns a dict with either an 'error'/'message' key or the
    reference repo plus its similar repos.
    """
    # check for mandatory params
    if 'reference_repo' not in params:
        return {'error': 'Mandatory param reference_repo not present'}

    reference_repo = params['reference_repo']
    LOGGER.info('reference_repo %s' % reference_repo)

    # get data
    # NOTE(review): this mutates the module-level _GC_SVC_ACCOUNT template
    # in place before serializing it for the BigQuery client.
    LOGGER.info('read GBQ data')
    _GC_SVC_ACCOUNT['private_key_id'] = params['GC_SVC_PRIVATE_KEY_ID']
    _GC_SVC_ACCOUNT['private_key'] = params['GC_SVC_PRIVATE_KEY']
    data = pd.io.gbq.read_gbq(_QUERY,
                              dialect="standard",
                              project_id=_GC_SVC_ACCOUNT['project_id'],
                              private_key=json.dumps(_GC_SVC_ACCOUNT))

    # map each repo and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['repo'] = data['repo'].astype("category")

    # dictionaries to translate names to ids and vice-versa
    repos = dict(enumerate(data['repo'].cat.categories))
    repo_ids = {r: i for i, r in repos.items()}

    if reference_repo not in repo_ids:
        return {"message": "No result. Reference repo not in training set."}

    # create a sparse matrix of all the users/repos
    # (rows are repo codes, columns are user codes, all entries 1.0)
    stars = coo_matrix(
        (np.ones(data.shape[0]), (data['repo'].cat.codes.copy(),
                                  data['user'].cat.codes.copy())))

    # train model
    LOGGER.info('training model')
    model = AlternatingLeastSquares(
        factors=50,
        regularization=0.01,
        dtype=np.float64,  # pylint: disable=no-member
        iterations=50)
    # Scale the binary matrix into confidence values before fitting.
    confidence = 40
    model.fit(confidence * stars)

    # similar_items returns (id, score) pairs; index 0 is the repo itself.
    similar_ids = model.similar_items(repo_ids[reference_repo])
    LOGGER.info('found %d similar repos' % len(similar_ids))

    similar_repos = []
    for idx in range(1, len(similar_ids)):
        similar_repos.append(repos[similar_ids[idx][0]])

    return {
        'reference_repo': reference_repo,
        'similar_repos': similar_repos,
        'error': ''
    }
예제 #7
0
def collab_filter(song_id, user_song_df, num_songs=5):
    """Recommend songs similar to ``song_id`` via implicit-feedback ALS.

    Parameters
    ----------
    song_id : str
        Spotify id of the seed song.
    user_song_df : pd.DataFrame
        Users, songs, play counts etc.; must carry ``spotify_id``,
        ``song_nums``, ``user_nums`` and ``size`` (play count) columns.
    num_songs : int
        Number of similar songs to return.

    Returns
    -------
    pd.DataFrame
        Rows of ``user_song_df`` whose ``song_nums`` are among the
        ``num_songs`` songs most similar to the seed song.
    """
    # Map the Spotify id to the internal numeric song index.
    # (FIX: removed leftover debug print() calls.)
    song_num = user_song_df[user_song_df.spotify_id ==
                            song_id].song_nums.values[0]

    # NOTE(review): key/tempo filtering was sketched here but is disabled
    # until enough songs exist; reintroduce by narrowing user_song_df to
    # spotify_ids matching the desired key/tempo before building the matrix.
    user_song_refined = user_song_df

    plays = user_song_refined['size']
    user_nums = user_song_refined.user_nums
    song_nums = user_song_refined.song_nums

    # songs x users sparse play-count matrix.
    item_user = coo_matrix((plays, (song_nums, user_nums))).tocsr()

    model = AlternatingLeastSquares(factors=30)
    model.fit(item_user)
    similar = model.similar_items(song_num, N=num_songs)
    similar_ids = [pair[0] for pair in similar]

    return user_song_df[user_song_df.song_nums.isin(similar_ids)]
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    """
    :param input_path: path to the training dataset
    :param output_filename: name of the output file
    :param model_name: which model to use ("als", "tfidf", "cosine", "bm25")
    :param min_rating: rating threshold used for filtering
    :return: None; writes TSV rows of (title, other title, score)
    """

    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    if model_name == "als":
        model = AlternatingLeastSquares()

        # ALS trains on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender()

    else:
        # FIX: message typo "TODU" -> "TODO".
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    logging.debug("Training model :%s" % model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    # Most-rated movies come first in the output.
    user_count = rating_data.groupby("movieId").size()
    movie_lookup = dict((i, m) for i, m in
                        zip(movies_data['movieId'], movies_data['title']))
    to_generate = sorted(list(movies_data['movieId']), key=lambda x: -user_count.get(x, 0))

    # FIX: write with an explicit encoding so titles are not mangled on
    # platforms whose default locale encoding is not UTF-8.
    with open(output_filename, "w", encoding="utf8") as o:
        for movieid in to_generate:
            # Skip movies whose matrix row is empty (all ratings filtered out).
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]

            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
예제 #9
0
def calculate_similar_businesses(input_filename,
                                 output_filename,
                                 model_name="als",
                                 factors=50,
                                 regularization=0.01,
                                 iterations=15,
                                 exact=False,
                                 trees=20,
                                 use_native=True,
                                 dtype=numpy.float64,
                                 cg=False):
    """Train an ALS model (exact or Annoy-approximated) on business
    ratings and write each business's ten nearest neighbours as TSV rows
    of (businessid, otherbusinessid, score)."""
    logging.debug("Calculating similar businesses. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", input_filename)
    read_start = time.time()
    df, ratings = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - read_start)

    # Exact ALS or the Annoy-backed approximate nearest-neighbour variant.
    model_cls = AlternatingLeastSquares if exact else AnnoyAlternatingLeastSquares
    model = model_cls(factors=factors,
                      regularization=regularization,
                      use_native=use_native,
                      use_cg=cg,
                      dtype=dtype)

    # BM25-weight the ratings matrix before training.
    logging.debug("weighting matrix by bm25_weight")
    ratings = bm25_weight(ratings, K1=100, B=0.8)

    logging.debug("training model %s", model_name)
    fit_start = time.time()
    model.fit(ratings)
    logging.debug("trained model '%s' in %s", model_name, time.time() - fit_start)

    # Order businesses by popularity (review count), most popular first.
    logging.debug("calculating top businesses")
    user_count = df.groupby('business').size()
    businesses = dict(enumerate(df['business'].cat.categories))
    to_generate = sorted(list(businesses), key=lambda b: -user_count[b])

    with open(output_filename, "w") as out:
        for business_id in to_generate:
            name = businesses[business_id]
            for other, score in model.similar_items(business_id, 11):
                out.write("%s\t%s\t%s\n" % (name, businesses[other], score))
예제 #10
0
def calculate_similar_beers(input_path, output_filename, model_name="cosine"):
    """Write a CSV of each beer's ten nearest neighbours under the
    requested similarity model (name, other name, score per row)."""
    logging.debug("reading data from %s", input_path)
    load_time = time.time()
    ratings, beers, m = read_data(input_path)
    logging.debug("read data file in %s", time.time() - load_time)

    # Instantiate the requested recommender.
    if model_name == "als":
        model = AlternatingLeastSquares()
        # ALS trains on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    logging.debug("training model %s", model_name)
    fit_time = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - fit_time)
    logging.debug("calculating top beers")

    # Most-rated beers come first in the output.
    user_count = ratings.groupby('beerId').size()
    beer_lookup = {bid: name for bid, name in zip(beers['beerId'], beers['name'])}
    to_generate = sorted(list(beers['beerId']),
                         key=lambda bid: -user_count.get(bid, 0))

    with open(output_filename, "w") as out:
        for beer_id in to_generate:
            # Skip beers whose matrix row is empty.
            if m.indptr[beer_id] == m.indptr[beer_id + 1]:
                continue
            name = beer_lookup[beer_id]
            for other, score in model.similar_items(beer_id, 11):
                out.write("%s,%s,%s\n" % (name, beer_lookup[other], score))
예제 #11
0
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    """Train the requested model and write each movie's ten nearest
    neighbours to ``output_filename`` as TSV rows (title, other, score).

    :param input_path: path passed through to ``read_data``
    :param output_filename: destination TSV file
    :param model_name: one of "als", "tfidf", "cosine", "bm25"
    :param min_rating: ratings below this are dropped by ``read_data``
    """
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # ALS trains on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # FIX: convert to CSR before fitting, matching the sibling variants of
    # this function; also required for the indptr empty-row check below.
    m = m.tocsr()

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            # FIX: skip movies whose row is empty (all ratings filtered
            # out), as the other variants of this function do.
            # NOTE(review): assumes movieId values directly index matrix
            # rows -- confirm against read_data.
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
예제 #12
0
def implicit(args):
    """Train ALS on the MovieLens ua.base split, report precision@10 on
    ua.test, then print the ids of the 10 items most similar to item 1.

    :param args: namespace providing ``in_dir``, the directory holding
        the ua.base / ua.test files.
    """
    # Build an item x user binary interaction matrix, assigning dense
    # indices to items/users in first-seen order.
    row_dict, col_dict = {}, {}
    rows, cols, data = [], [], []
    for feedback in iter_implicit_feedbacks(
            os.path.join(args.in_dir, 'ua.base')):
        i = row_dict.setdefault(feedback.item_id, len(row_dict))
        j = col_dict.setdefault(feedback.user_id, len(col_dict))
        rows.append(i)
        cols.append(j)
        data.append(1)
    item_user_data = csr_matrix((data, (rows, cols)),
                                shape=(len(row_dict), len(col_dict)))

    model = AlternatingLeastSquares(factors=8)
    model.fit(item_user_data)

    # Evaluation: collect held-out test items per user, skipping any
    # item/user never seen during training.
    user_items = item_user_data.T.tocsr()
    user_items_test = collections.defaultdict(set)
    for feedback in iter_implicit_feedbacks(
            os.path.join(args.in_dir, 'ua.test')):
        try:
            i = row_dict[feedback.item_id]
            j = col_dict[feedback.user_id]
        except KeyError:
            # FIX: dropped the unused "as e" binding; unseen entries are
            # simply ignored.
            continue
        user_items_test[j].add(i)

    # Mean precision@topk over all test users.
    topk = 10
    precision = 0
    for user_index, item_indices in user_items_test.items():
        recommendations = model.recommend(user_index, user_items, topk, True)
        precision += sum(1 if item_index in item_indices else 0
                         for item_index, _ in recommendations) / topk
    precision = precision / len(user_items_test)
    print('precision:', precision)

    # Show the 10 items most similar to dataset item id 1.
    item_id = 1
    item_index = row_dict[item_id]
    index2id = {value: key for key, value in row_dict.items()}
    for _item_index, score in model.similar_items(item_index, 10):
        _item_id = index2id[_item_index]
        print(_item_id)
예제 #13
0
# Demo script: fit ALS on artist/user play counts, then print
# recommendations for one user and artists similar to one artist.
# NOTE(review): `plays`, `cols`, `rows`, `artists`, `users` and
# `artist_id_name` are defined elsewhere (earlier notebook cells?);
# shape=(len(artists), len(users)) implies the first index array (`cols`)
# holds artist codes -- the rows/cols naming looks swapped; verify.
data_sparse = sparse.csr_matrix((plays, (cols, rows)), shape=(len(artists), len(users)))

model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

# Recommendations for the first user.
userid = 0

# recommend() wants a user x item matrix, hence the transpose.
user_items = data_sparse.T.tocsr()
recommendations = model.recommend(userid, user_items)

print(recommendations)

for r in recommendations:
    print(artist_id_name[str(r[0])])

# Artists similar to one fixed artist id.
itemid = 107209
related = model.similar_items(itemid)

print(related)

for a in related:
    print(artist_id_name[str(a[0])])

    # NOTE(review): this bare lookup runs once per loop iteration and
    # discards its result -- looks like leftover notebook scratch; confirm.
    artist_id_name['234786']


    



예제 #14
0
class MainRecommender:
    """Recommendations that can be derived from an ALS model.

    Input
    -----
    data: pd.DataFrame
        Transaction log with at least user_id, item_id and quantity
        columns; item_id 999999 is a catch-all placeholder and is
        excluded from the top-purchase lists.
    weighting: bool
        If True, BM25-weight the user-item matrix before ALS training.
    """
    def __init__(self, data, weighting=True):

        # Each user's top purchases, most frequently bought first.
        self.top_purchases = data.groupby(
            ['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity',
                                       ascending=False,
                                       inplace=True)
        self.top_purchases = self.top_purchases[
            self.top_purchases['item_id'] != 999999]

        # Top purchases over the whole dataset.
        self.overall_top_purchases = data.groupby(
            'item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity',
                                               ascending=False,
                                               inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist(
        )

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            # bm25_weight expects item x user, hence the double transpose.
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    @staticmethod
    def _prepare_matrix(data):
        """Build the user x item interaction-count matrix."""
        user_item_matrix = pd.pivot_table(
            data,
            index='user_id',
            columns='item_id',
            values='quantity',  # other aggregation choices are worth trying
            aggfunc='count',
            fill_value=0)

        user_item_matrix = user_item_matrix.astype(
            float)  # implicit requires a float matrix

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Build the id translation dictionaries (matrix index <-> raw id)."""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    def fit_own_recommender(self, user_item_matrix):
        """Fit a model that recommends only among items the user already bought."""

        self.own_recommender = ItemItemRecommender(K=1, num_threads=4)
        self.own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return self.own_recommender

    def fit_als(self,
                user_item_matrix,
                n_factors=20,
                regularization=0.001,
                iterations=15,
                num_threads=4,
                show_progress=True,
                use_gpu=True):
        """Fit the ALS model on an item x user CSR matrix."""

        self.model_als = AlternatingLeastSquares(factors=n_factors,
                                                 regularization=regularization,
                                                 iterations=iterations,
                                                 use_gpu=use_gpu,
                                                 num_threads=num_threads,
                                                 random_state=0)
        self.model_als.fit(csr_matrix(user_item_matrix).T.tocsr(),
                           show_progress=show_progress)

        return self.model_als

    def _update_dict(self, user_id):
        """Register a previously unseen user in the id dictionaries."""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Find the item most similar to item_id."""
        recs = self.model_als.similar_items(
            self.itemid_to_id[item_id],
            N=2)  # an item is most similar to itself -> ask for 2
        top_rec = recs[1][0]  # take the second (not the query item itself)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """If there are fewer than N recommendations, pad with overall bestsellers."""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model_als, N=5):
        """Recommendations via the standard implicit library APIs."""

        self._update_dict(user_id=user)

        # The placeholder item 999999 is explicitly filtered out.
        recs = model_als.recommend(userid=self.userid_to_id[user],
                                   user_items=csr_matrix(
                                       self.user_item_matrix).tocsr(),
                                   N=N,
                                   filter_already_liked_items=False,
                                   filter_items=[self.itemid_to_id[999999]],
                                   recalculate_user=True)

        res = [self.id_to_itemid[rec[0]] for rec in recs]

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Recommendations from the ALS model."""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model_als=self.model_als, N=N)

    def get_own_recommendations(self, user, N=5):
        """Recommend among the items the user has already bought."""

        self._update_dict(user_id=user)
        return self._get_recommendations(user,
                                         model_als=self.own_recommender,
                                         N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Recommend items similar to the user's top-N purchased items."""

        top_users_purchases = self.top_purchases[self.top_purchases['user_id']
                                                 == user].head(N)

        res = top_users_purchases['item_id'].apply(
            lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Recommend top-N items among those bought by similar users."""

        res = []

        # Find the top-N most similar users.
        similar_users = self.model_als.similar_users(self.userid_to_id[user],
                                                     N=N + 1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]  # drop the query user itself

        for user in similar_users:
            userid = self.id_to_userid[
                user]  #own recommender works with user_ids
            res.extend(self.get_own_recommendations(userid, N=1))

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res
class MainRecommender:
    """Recommendations built on top of implicit's ALS model.

    Input
    -----
    data: pd.DataFrame
        Transaction log with 'user_id', 'item_id' and 'quantity' columns.
    data_product: pd.DataFrame
        Item metadata with 'item_id' and 'brand' columns.
    weighting: bool
        When True, BM25-weight the user-item matrix before fitting ALS.
    """

    def __init__(self, data, data_product, weighting=True):

        # Top purchases of every user (999999 is the placeholder id used for
        # items filtered out during preprocessing).
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Top purchases over the whole dataset.
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = \
            self.prepare_dicts(self.user_item_matrix)

        # Dict {item_id: 0/1}; 1 marks a private-label (CTM) item.
        self.item_id_to_ctm = self.prepare_ctm(data_product)

        # Own recommender is trained on the raw (unweighted) matrix.
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit()

    @staticmethod
    def prepare_matrix(data):
        """Pivot the transaction log into a user x item count matrix."""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',  # other aggregates possible
                                          aggfunc='count',
                                          fill_value=0)

        # implicit requires a float matrix
        user_item_matrix = user_item_matrix.astype(float)

        return user_item_matrix

    @staticmethod
    def prepare_ctm(data_product):
        """Build dict {item_id: 0/1}; 1 marks a private-label (CTM) item."""
        # .copy() avoids pandas' SettingWithCopy warning when adding a column
        # to what would otherwise be a view of data_product.
        ctm_items = data_product[['item_id', 'brand']].copy()
        ctm_items['feature'] = data_product['brand'].isin(['Private'])
        ctm_items = ctm_items.replace(to_replace=[False, True], value=[0, 1])

        return dict(zip(ctm_items['item_id'], ctm_items['feature']))

    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Build the id <-> matrix-index translation dictionaries."""
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Fit a model that recommends only items the user has already bought."""
        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    def fit(self, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Fit the ALS model on the (possibly weighted) user-item matrix.

        Was previously declared @staticmethod while still taking `self` and
        invoked as `self.fit(self)`; it is a regular instance method now.
        """
        self.model = AlternatingLeastSquares(factors=n_factors,
                                             regularization=regularization,
                                             iterations=iterations,
                                             num_threads=num_threads)
        self.model.fit(csr_matrix(self.user_item_matrix).T.tocsr())

        return self.model

    def get_similar_items_recommendation(self, user, filter_ctm=True, N=5):
        """Return ids of the N items most similar to the given id.

        NOTE(review): despite the parameter name, `user` is looked up in
        itemid_to_id, so callers effectively pass an item id — confirm intent.
        `filter_ctm` is currently unused (TODO: apply CTM filtering).
        """
        res = []
        recommends = self.model.similar_items(self.itemid_to_id[user], N)
        for item in recommends:
            res.append(item[0])

        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Return matrix ids of the N users most similar to the given user.

        Rewritten: the original body was tab-indented and syntactically
        invalid (the docstring was not indented under the def).
        """
        res = []
        recommends = self.model.similar_users(self.userid_to_id[user], N)
        for item in recommends:
            res.append(item[0])

        return res
예제 #16
0
def calculate_similar_artists(input_filename,
                              output_filename,
                              model_name="als",
                              factors=50,
                              regularization=0.01,
                              iterations=15,
                              exact=False,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    """Train the requested model on the input data and write a TSV of
    (artist, similar artist, score) rows, 11 neighbours per artist."""
    logging.debug("Calculating similar artists. This might take a while")

    # Load the dataset.
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # Build the recommender selected by model_name.
    if model_name == "als":
        als_cls = AlternatingLeastSquares if exact else AnnoyAlternatingLeastSquares
        model = als_cls(factors=factors,
                        regularization=regularization,
                        use_native=use_native,
                        use_cg=cg,
                        dtype=dtype,
                        iterations=iterations)

        # ALS variants train on a BM25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # Fit the chosen model.
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # Rank artists by popularity so the most-listened come first.
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    # Emit TSV rows: artistid, otherartistid, score.
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in model.similar_items(artistid, 11):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
예제 #17
0
class ImplicitALS:
    """Wrapper around an implicit ALS model built from a ratings DataFrame.

    `df` must carry dense 0..n-1 `user_id`/`item_id` indices and a `rate`
    column; `config` provides alpha, factors, iterations and regularization.
    `orig_df` keeps the raw interactions so already-seen items can be
    filtered out of recommendations.
    """

    def __init__(self, df, config, orig_df):
        # Convert raw rates into confidence values before building matrices.
        df = self._calc_confidence_preference(df, config.alpha)
        self.config = config
        self.orig_df = orig_df

        def check_index_uniformity(index):
            # True when ids form a dense 0..len-1 range, which the
            # csr_matrix construction below relies on.
            return index.min() == 0 and \
                   index.max() == len(index) - 1

        def index_info(index):
            return 'index with min %d max %d count %d items' % (
                index.min(), index.max(), len(index))

        assert check_index_uniformity(
            df.user_id.drop_duplicates()), index_info(
                df.user_id.drop_duplicates())
        assert check_index_uniformity(
            df.item_id.drop_duplicates()), index_info(
                df.item_id.drop_duplicates())

        users = df.user_id.to_list()
        items = df.item_id.to_list()
        rate = df.rate.to_list()
        # iu_mat is item x user; ui_mat is its user x item transpose.
        shape = (len(set(items)), len(set(users)))
        self.iu_mat = csr_matrix((rate, (items, users)), shape=shape)
        self.ui_mat = self.iu_mat.transpose()

        self.model = ALS(factors=config.factors,
                         calculate_training_loss=True,
                         iterations=config.iterations,
                         regularization=config.regularization)
        # Highest user index seen so far; grown by _add_empty_user.
        self.max_uix = max(users)

    def _calc_confidence_preference(self, df, alpha):
        # convert to confidence and preference
        # use split_rate as a threshold for bad and good classes.
        # to enlarge negative effect, use quadratic transform for rate
        split_rate = 6
        eps = 1e-4
        # NOTE(review): get_logp divides by eps before thresholding, so the
        # effective cut is v > split_rate * eps, not v > split_rate —
        # confirm this is intended.
        get_p = lambda v: 1 if v > split_rate else 0
        get_logp = lambda v: log(1 + get_p(v / eps))
        df['rate'] = 1 + alpha * df.rate.apply(get_logp)
        return df

    def _delete_bookmarks(self, recs, seen_items):
        # Since filter_already_liked doesnt work, filter by hand
        for i, rec in enumerate(recs):
            if rec[0] in seen_items:
                recs[i] = None
        recs = list(filter(lambda r: r is not None, recs))
        return recs

    def fit(self):
        # Train ALS on the item-user matrix.
        self.model.fit(self.iu_mat)

    def recommend_user(self, user, k, return_scores=False):
        # Return k recommendations for `user`, excluding items the user has
        # already interacted with (taken from orig_df).
        user_items = self.orig_df[self.orig_df.user_id ==
                                  user].item_id.tolist()

        # filter liked until len(recs) != given k
        base_k = k
        # over-fetch so that filtering seen items still leaves >= base_k
        k = int(min(1.5 * k, k + 0.1 * len(user_items)))
        recs = self.model.recommend(user, self.ui_mat, N=k)
        recs = self._delete_bookmarks(recs, user_items)

        # Double the fetch size until enough unseen items survive filtering.
        while len(recs) < base_k:
            k *= 2
            recs = self.model.recommend(user, self.ui_mat, N=k)
            recs = self._delete_bookmarks(recs, user_items)

        recs = recs[:base_k]
        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def similar_items(self, item, k, return_scores=False):
        # Returns items that are similar to item with given id
        recs = self.model.similar_items(item, k + 1)

        # avoid recommending same item
        recs = self._delete_bookmarks(recs, [item])
        recs = recs[:k]

        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def similar_items_for_user(self, item, user, k, return_scores=False):
        # Returns items that are similar to item with given id and havent
        # been seen by user with given id
        user_items = self.orig_df[self.orig_df.user_id ==
                                  user].item_id.tolist()
        user_items += [item]  # avoid recommending same item

        # filter liked until len(recs) != given k
        base_k = k
        k = int(min(1.5 * k, k + 0.1 * len(user_items)))
        recs = self.model.similar_items(item, k)
        recs = self._delete_bookmarks(recs, user_items)

        while len(recs) < base_k:
            k *= 2
            # NOTE(review): this retry calls recommend(item, k) while the
            # initial fetch used similar_items(item, k) — looks like it
            # should be similar_items here too; verify before relying on it.
            recs = self.model.recommend(item, k)
            recs = self._delete_bookmarks(recs, user_items)

        recs = recs[:base_k]
        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def _add_empty_user(self):
        # Enlarges ui_mat and als model user_factors for 1 extra user
        # Upd wrapper data
        self.max_uix += 1
        old_shape = self.ui_mat.shape
        self.ui_mat.resize((old_shape[0] + 1, old_shape[1]))

        # Upd inner model data
        k = self.model.factors
        # set random weights for new user
        self.model.user_factors = np.vstack(
            (self.model.user_factors, np.random.randn(k)))

    def update_user_data(self, user, user_views):
        # Updates model's data about user and recalculates it
        assert isinstance(user, int)
        assert isinstance(user_views, pd.DataFrame)
        assert len(user_views) > 0
        assert len(user_views.user_id.drop_duplicates()) == 1

        # Drop placeholder items and keep only the latest view per item.
        user_views = user_views[user_views.item_id != -1]
        user_views = user_views.drop_duplicates(
            subset='item_id user_id'.split(), keep='last')
        user_views = self._calc_confidence_preference(user_views,
                                                      self.config.alpha)
        iixs = user_views.item_id.tolist()
        rates = user_views.rate.tolist()

        # Create new user rates csr matrix
        rowscols = ([0 for _ in iixs], iixs)
        size = (1, self.ui_mat.shape[1])
        # Upd wrapper data
        assert user <= self.max_uix
        self.ui_mat[user] = csr_matrix((rates, rowscols), shape=size)

        # Upd inner model data
        k = self.model.factors
        # set random weights for new user
        # NOTE(review): this vstacks an extra factor row even when `user`
        # already exists — the factor matrix grows on every call; confirm.
        self.model.user_factors = np.vstack(
            (self.model.user_factors, np.random.randn(k)))
        # recalculate
        new_user_factors = self.model.recalculate_user(user, self.ui_mat)
        self.model.user_factors[user] = new_user_factors

    def add_user(self, user, user_views=None):
        # Adds user to recommender model. Updates model's matrixes, allows making
        # predictions for new user
        assert isinstance(user, int)

        self._add_empty_user()
        if user_views is None:
            return

        assert isinstance(user_views, pd.DataFrame)
        assert len(user_views) > 0
        assert len(user_views.user_id.drop_duplicates()) == 1

        self.update_user_data(user, user_views)
예제 #18
0
class MainRecommender:
    """Recommender toolbox: data preparation, own/ALS recommenders and
    validation helpers, all sharing a dict of user-item matrices."""

    # Default keyword arguments forwarded to implicit's recommend() calls;
    # methods copy this dict before mutating it, so the shared default is safe.
    own_recomender_defult_param = {'filter_already_liked_items':False, 
                        'filter_items':False, 
                        "recalculate_user":True}
    
    # Default hyperparameters for AlternatingLeastSquares.
    model_als_defult_param ={'factors':50, 'regularization':15, 'iterations':15, 
                             'num_threads':-1,'calculate_training_loss':False}
    
    def __init__(self, data,data_test=None,split_info=None):
        """ data - dataframe with the transaction data
            data_test - validation data; if absent and split_info is given, it is created
            split_info - tuple describing how to build data_test (size, split column);
            only considered when data_test is absent
        """     
        self.top = 5000
        self.data_validation={}
        self.data_validation['status'] = False
        self.user_item_matrix = {'status':False,'matrix':None,'params':None}
        self.own_recommender_is_fit= {'status':False,'params':None}
        self.als_recommender_is_fit= {'status':False,'params':None}
        self.data = data.copy()
        self.full_data_train = data.copy() #keep the full data volume in case predictions over the whole dataset are needed
        self.data_train = data.copy()
        if data_test is not None:
            self.data_test = data_test.copy()
        else:
            self.data_test = None
            if split_info:
                self.data_train,self.data_test = self.train_test_split(test_size_num = split_info[0],split_column =split_info[1])
        if  self.data_test is not None:
            self.data_validation['data'] = self.get_validation_data()
            self.data_validation['status'] = True


 
    def prefiltr_1(self,my_data):
        df = my_data.copy()
        # Keep only the `self.top` most popular items; rename the rest to 999999.
        # NOTE(review): the string below was meant to be the docstring but is
        # not the first statement, so it is a no-op literal.
        """Оставим только top самых популярных товаров остальные переименуем в 999999"""
        popularity = my_data.groupby('item_id')['quantity'].count().reset_index()
        popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        top_5000 = popularity.sort_values('n_sold', ascending=False).head(self.top).item_id.tolist()
        df.loc[~df['item_id'].isin(top_5000), 'item_id'] = 999999 
        return df
    
    
    def prefiltr_2(self,data_train,n=5000):
        """Keep only the n most popular items; drop transactions with all other items"""
        df = data_train.copy()
        popularity = df.groupby('item_id')['quantity'].count().reset_index()
        popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        top_n = popularity.sort_values('n_sold', ascending=False).head(n).item_id.tolist()
        df = df.loc[df['item_id'].isin(top_n)]  
        return df
    
    
    def prefiltr_3(self,data_train,n=5000):
        """Drop transactions with the n least popular items"""
        df = data_train.copy()
        not_popularity = df.groupby('item_id')['quantity'].count().reset_index()
        not_popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        not_top_n = not_popularity.sort_values('n_sold').head(n).item_id.tolist()
        df = df.loc[~df['item_id'].isin(not_top_n)]  
        return df   
    
    
    def prefiltr_4(self,data_train,weeks = 50):
        """Drop transactions with items that have not been bought for more than `weeks` weeks"""
        df = data_train.copy()
        # Keep only items whose latest purchase week is after `weeks`.
        old_item = df.groupby('item_id')['week_no'].max().reset_index()
        old_item = old_item.loc[old_item['week_no']>weeks,'item_id'].tolist()
        df = df.loc[df['item_id'].isin(old_item)]  
        return df
    


  
    def train_test_split(self,test_size_num,split_column):
        # Time-based split: the last `test_size_num` units of split_column go to test.
        data_train = self.data[self.data[split_column] < self.data[split_column].max() - test_size_num]
        data_test = self.data[self.data[split_column] >= self.data[split_column].max() - test_size_num]
        return data_train, data_test
    
    
   
    def get_validation_data(self):
        # One row per test user: actual test items plus their train histories.
        result = self.data_test.groupby('user_id')['item_id'].unique().reset_index()
        users_train = self.data_train.user_id.unique()
        result = result[result.user_id.isin(users_train)]
        result['train'] = result['user_id'].map(self.data_train.groupby('user_id')['item_id'].unique())
        result['full_train'] = result['user_id'].map(self.full_data_train.groupby('user_id')['item_id'].unique())
        result.rename(columns={'item_id':'test'},inplace=True)
        result.reset_index(inplace=True,drop=True)
        return result

 
    def prepare_matrix(self,agg_column,full=None,filtr=None):
        # Build the pivoted user x item matrix, optionally on the full data
        # and after applying the requested prefilters.
        my_data = self.data_train.copy()
        if full:
            my_data = self.full_data_train.copy()
        if  filtr:
            for i in filtr:
                # NOTE(review): dispatch via eval('self.prefiltr_N(my_data)')
                # is only safe for trusted `filtr` values; a dict of bound
                # methods would avoid eval entirely.
                prefiltr = 'self.prefiltr_'+str(i)+'(my_data)'
                my_data = eval(prefiltr)
            
        user_item_matrix = pd.pivot_table(my_data, 
                              index='user_id', columns='item_id', 
                              values=agg_column[0], 
                              aggfunc=agg_column[1], 
                              fill_value=0
                             )
        
        user_item_matrix = user_item_matrix.astype(float) 
        self.prepare_dicts(user_item_matrix)
        self.current_working_data = my_data.copy()

        return user_item_matrix
            


    def prepare_dicts(self,user_item_matrix):
        """Builds the auxiliary id <-> matrix-index dictionaries"""
        
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        self.id_to_itemid = dict(zip(matrix_itemids, itemids))
        self.id_to_userid = dict(zip(matrix_userids, userids))

        self.itemid_to_id = dict(zip(itemids, matrix_itemids))
        self.userid_to_id = dict(zip(userids, matrix_userids))
        
        return  self.id_to_itemid,  self.id_to_userid,  self.itemid_to_id,  self.userid_to_id
    
    
     
    def make_data(self,agg_column,filtr=None,full =False,top = 5000):
        # Build and cache every matrix variant used downstream: raw counts,
        # binarized, and tf-idf / bm25 weighted item-user versions.
        self.top = top
        self.full = full
        uim = self.prepare_matrix(agg_column=agg_column,full=full,filtr=filtr)
        uim_w = uim.copy()
        self.user_item_matrix['uim_matrix_w'] = csr_matrix(uim_w).tocsr()
        uim[uim>0]=1
        self.user_item_matrix['uim_matrix'] = csr_matrix(uim).tocsr()
        
        self.user_item_matrix['ium_matrix_w_tfidf'] = tfidf_weight(csr_matrix(uim_w.T).tocsr())
        self.user_item_matrix['ium_matrix_tfidf'] = tfidf_weight(csr_matrix(uim.T).tocsr())
        self.user_item_matrix['ium_matrix_w_bm25'] = bm25_weight(csr_matrix(uim_w.T).tocsr())
        self.user_item_matrix['ium_matrix_bm25'] = bm25_weight(csr_matrix(uim.T).tocsr())

        self.user_item_matrix['status'] = True
        self.user_item_matrix['params'] = {'agg_column':agg_column,'filtr':filtr,'full':full}
        return self.user_item_matrix
            
        
    # NOTE: deliberately defined without `self` — it is used as the default
    # value of the `metric` parameters below and applied row-wise via df.apply.
    def precision_at_k(x, k=5):
        if len(x['predict']) == 0:
            return 0
        bought_list = np.array(x['test'])
        recommended_list = np.array(x['predict'])[:k]
        flags = np.isin(bought_list, recommended_list)
        precision = flags.sum() / len(recommended_list)


        return precision
        
        
    
    def fit_own_recommender(self,weighting=False):
        """Fits a model that recommends only items the user has already bought"""
        
        assert self.user_item_matrix['status'], 'необходимо сначала выполнить метод make_data(self,agg_column,filtr=None,weighting=None,full =False)'
        ium = self.user_item_matrix['uim_matrix'].T
        if weighting:
            assert (weighting == 'tf_idf' or weighting == 'bm25'), 'необходимо указать weighting: tf_idf или bm25 или None'
            if  weighting == 'tf_idf':
                ium = self.user_item_matrix['ium_matrix_tfidf']
            else:
                ium = self.user_item_matrix['ium_matrix_bm25']   
        self.own_recommender = ItemItemRecommender(K=1, num_threads=-1)
        self.own_recommender.fit(ium)      
        self.own_recommender_is_fit['status'] =True
        self.own_recommender_is_fit['params'] ={'model':'ItemItemRecommender(K=1, num_threads=-1)','weighting':weighting}
        self.own_recommender_is_fit['ium']=ium
        
        return self.own_recommender
    
    
    def predict_own_recommender(self,users,N=5,params=own_recomender_defult_param):
        # Predict N items per user with the own recommender; `params` is
        # copied before mutation so the shared default dict stays intact.
        param = params.copy()
        assert self.own_recommender_is_fit['status'], 'необходимо сначала выполнить метод fit_own_recommender()'
        assert type(users) == list, 'users - должен быть списком'
        uim = self.user_item_matrix['uim_matrix']
        param['user_items'] = uim
        param['N'] = N
        answer = pd.DataFrame()
        answer['user_id']=users
        if param['filter_items']:
            param['filter_items']=[self.itemid_to_id[i] for i in params['filter_items']]
        rec=[]
        for user in users:
            param['userid'] = self.userid_to_id[user]
            rec.append( [self.id_to_itemid[i[0]] for i in self.own_recommender.recommend(**param)])
        answer['result']  = rec
        return answer

    
    
    def validation_own_recommender(self,metric=precision_at_k,N=5,params=own_recomender_defult_param):
        # Mean of `metric` over the validation users for the own recommender.
        assert self.data_validation['status'], 'тестовые данные не созданы'
        assert self.own_recommender_is_fit['status'], 'необходимо сначала выполнить метод fit_own_recommender()'
        df = self.data_validation['data']
        
        users = df['user_id'].to_list()
        
        predict = self.predict_own_recommender(users = users,N=N,params=params)
             
        df['predict'] = predict['result']
        
        return df.apply(metric,axis=1).mean()
            
        
  
    def fit_als(self, params = model_als_defult_param,weighting=False):
        """Fits the ALS model"""
        
        assert self.user_item_matrix['status'], 'необходимо сначала выполнить метод make_data(self,agg_column,filtr=None,weighting=None,full =False)'
        ium = self.user_item_matrix['uim_matrix_w'].T
        if weighting:
            assert (weighting == 'tf_idf' or weighting == 'bm25'), 'необходимо указать weighting: tf_idf или bm25 или None'
            if  weighting == 'tf_idf':
                ium = self.user_item_matrix['ium_matrix_w_tfidf']
            else:
                ium = self.user_item_matrix['ium_matrix_w_bm25']
        
        self.model_als = AlternatingLeastSquares(**params)
        self.model_als.fit(ium)
        self.als_recommender_is_fit['status'] = True
        self.als_recommender_is_fit['params'] = {'model':params,'weighting':weighting}
        self.als_recommender_is_fit['ium'] = ium
        
        return self.model_als
    
    
    def predict_als(self,users,N=5,params=own_recomender_defult_param):
        # Predict N items per user with the trained ALS model.
        param = params.copy()
        assert self.als_recommender_is_fit['status'], 'необходимо сначала выполнить метод fit_als()'
        assert type(users) == list, 'users - должен быть списком'
        uim = self.user_item_matrix['uim_matrix_w']
        param['user_items'] = uim
        param['N'] = N
        answer = pd.DataFrame()
        answer['user_id']=users
        if param['filter_items']:
            param['filter_items']=[self.itemid_to_id[i] for i in params['filter_items']]
        rec=[]
        for user in users:
            param['userid'] = self.userid_to_id[user]
            rec.append( [self.id_to_itemid[i[0]] for i in self.model_als.recommend(**param)])
        answer['result']  = rec
        return answer
    
    
    def validation_als_recommender(self,metric=precision_at_k,N=5,params=own_recomender_defult_param):
        # Mean of `metric` over the validation users for the ALS recommender.
        assert self.data_validation['status'], 'тестовые данные не созданы'
        assert self.als_recommender_is_fit['status'], 'необходимо сначала выполнить метод fit_als()'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.predict_als(users = users,N=N,params=params)
        df['predict'] = predict['result']

        return df.apply(metric,axis=1).mean()  
    
    
    def get_recs(self,user,popularity,not_my=0):
        # For each of the user's popular items, take the not_my-th most
        # similar item (0 = the item itself, 1 = the nearest other item).
        result = []
        for item in popularity[popularity['user_id']==user]['item_id'].to_list():
            recs_ = self.model_als.similar_items(self.itemid_to_id[item], N=3)
            recs = [self.id_to_itemid[i[0]] for i in recs_]
            if 999999 in recs:
                recs.remove(999999)
            result.append(recs[not_my])
        return  result      


    def get_similar_items_recommendation(self, users,not_my=0, N=5):
        
        """Recommend items similar to the user's top-N purchased items.
        not_my=1 to predict purchases of the user's own items (like own_recommender), 0 otherwise"""
        assert  self.als_recommender_is_fit['status'],'Модель als не обучена, используйте fit_als()'
        assert  type(users)==list,'параметр users должен быть list'
        assert  not_my in [0,1],'параметр not_my должен быть равен 0 или 1'
        my_data = self.current_working_data.copy()
        my_data = my_data[my_data['user_id'].isin(users)]    
        popularity = my_data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        popularity.sort_values('quantity', ascending=False, inplace=True)
        popularity = popularity[popularity['item_id'] != 999999]
        popularity =popularity.groupby('user_id').head(N)
        popularity.sort_values(['user_id','quantity'], ascending=False, inplace=True)
        result = pd.DataFrame()
        result['user_id'] = users
        result['similar_recommendation'] = result['user_id'].apply(\
                                            lambda x: self.get_recs(user = x,popularity = popularity,not_my=not_my))

        return result
    
    
    def validation_similar_items_recommendation(self,metric=precision_at_k,N=5,not_my=0):
        # Mean of `metric` for the similar-items recommendation strategy.
        assert self.data_validation['status'], 'тестовые данные не созданы'
        assert self.als_recommender_is_fit['status'], 'необходимо сначала выполнить метод fit_als()'
        assert  not_my in [0,1],'параметр not_my должен быть равен 0 или 1'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.get_similar_items_recommendation(users = users,N=N,not_my=not_my)
        df['predict'] = predict['similar_recommendation']

        return df.apply(metric,axis=1).mean() 
    
    
    
    def get_user(self,user):
        # Most similar *other* user: index 0 is the query user itself.
        users = self.model_als.similar_users(self.userid_to_id[user], N=2)
        
        return  self.id_to_userid[users[1][0]]
    
    
    def get_similar_users_recommendation(self, users, N=5,params=own_recomender_defult_param):
        """Recommend the top-N items bought by similar users"""
        assert  self.als_recommender_is_fit['status'],'Модель als не обучена, используйте fit_als()'
        assert  type(users)==list,'параметр users должен быть list'
        result = pd.DataFrame()
        result['user_id'] = users
        result['simular_user_id'] = result['user_id'].apply(self.get_user)
        result['similar_recommendation'] = self.predict_als(result['simular_user_id'].to_list(),N=5,params=params)['result']

        return result    
            
    def validation_similar_users_recommendation(self,metric=precision_at_k,N=5):
        # Mean of `metric` for the similar-users recommendation strategy.
        assert self.data_validation['status'], 'тестовые данные не созданы'
        assert self.als_recommender_is_fit['status'], 'необходимо сначала выполнить метод fit_als()'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.get_similar_users_recommendation(users = users,N=N)
        df['predict'] = predict['similar_recommendation']

        return df.apply(metric,axis=1).mean()     
예제 #19
0
class MainRecommender:
    """Recommenders built on top of implicit's ALS and LightFM.

    Input
    -----
    data : pd.DataFrame
        Interaction log with columns user_id, item_id, quantity.
    user_features, item_features : pd.DataFrame
        Side features, one-hot encoded for LightFM.
    items_to_filter : list, optional
        Item ids excluded from top-popular lists and recommendations
        (defaults to [999999], the service placeholder id).
    weighting : bool
        If True, bm25-weight the ALS interaction matrix.
    """

    def __init__(self,
                 data,
                 user_features,
                 item_features,
                 items_to_filter=None,
                 weighting=True):

        # None-default avoids the shared-mutable-default-argument pitfall.
        self.items_to_filter = [999999] if items_to_filter is None else items_to_filter

        # Top purchases of every user.
        self.top_purchases = data.groupby(
            ['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity',
                                       ascending=False,
                                       inplace=True)
        # Bug fix: this filter was hard-coded to `!= 999999` and silently
        # ignored items_to_filter; keep it consistent with the overall top.
        self.top_purchases = self.top_purchases[
            ~self.top_purchases['item_id'].isin(self.items_to_filter)]

        # Top purchases over the whole dataset.
        self.overall_top_purchases = data.groupby(
            'item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity',
                                               ascending=False,
                                               inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            ~self.overall_top_purchases['item_id'].isin(
                self.items_to_filter)]  # ~ negates the mask
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist(
        )

        self.user_item_matrix, self.sparse_user_item = self._prepare_matrix(
            data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        # LightFM is trained on the raw (unweighted) matrix, so keep a copy
        # before the optional bm25 weighting below.
        self.user_feat_lightfm_fixed, self.item_feat_lightfm_fixed = self._prepare_user_item_feat_lightfm(
            self.user_item_matrix, user_features, item_features)
        self.user_item_matrix_lightfm = self.user_item_matrix.copy()

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    @staticmethod
    def _prepare_matrix(data):
        """Build the user-item pivot matrix and its sparse counterpart."""
        user_item_matrix = pd.pivot_table(
            data,
            index='user_id',
            columns='item_id',
            values='quantity',  # other aggregations are possible
            aggfunc='count',
            fill_value=0)

        # implicit requires a float matrix.
        user_item_matrix = user_item_matrix.astype(float)

        # Sparse representation (for LightFM).
        sparse_user_item = csr_matrix(user_item_matrix).tocsr()

        return user_item_matrix, sparse_user_item

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Build id <-> matrix-index lookup dicts for users and items."""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def _prepare_user_item_feat_lightfm(user_item_matrix, user_features,
                                        item_features):
        """One-hot encode user/item side features in the format LightFM expects."""

        user_feat = pd.DataFrame(user_item_matrix.index)
        user_feat = user_feat.merge(
            user_features, on='user_id',
            how='left').drop(columns=['homeowner_desc'])
        user_feat.set_index('user_id', inplace=True)

        item_feat = pd.DataFrame(user_item_matrix.columns)
        item_feat = item_feat.merge(
            item_features, on='item_id', how='left').drop(
                columns=['sub_commodity_desc', 'curr_size_of_product'])
        item_feat.set_index('item_id', inplace=True)

        user_feat_lightfm_fixed = pd.get_dummies(
            user_feat, columns=user_feat.columns.tolist())
        item_feat_lightfm_fixed = pd.get_dummies(
            item_feat, columns=item_feat.columns.tolist())

        return user_feat_lightfm_fixed, item_feat_lightfm_fixed

    def fit_own_recommender(self):
        """Fit the recommender that suggests items among a user's own purchases."""

        self.own_recommender = ItemItemRecommender(K=1, num_threads=4)
        self.own_recommender.fit(csr_matrix(self.user_item_matrix).T.tocsr())

        return self.own_recommender

    def fit_als(self,
                n_factors=20,
                regularization=0.001,
                iterations=15,
                num_threads=4,
                show_progress=True,
                use_gpu=True,
                random_state=42):
        """Fit the ALS model on the (item x user) interaction matrix."""

        self.model_als = AlternatingLeastSquares(factors=n_factors,
                                                 regularization=regularization,
                                                 iterations=iterations,
                                                 use_gpu=use_gpu,
                                                 num_threads=num_threads,
                                                 random_state=random_state)
        self.model_als.fit(csr_matrix(self.user_item_matrix).T.tocsr(),
                           show_progress=show_progress)

        return self.model_als

    def fit_lightfm(self,
                    no_components=16,
                    loss='warp',
                    learning_rate=0.05,
                    item_alpha=0.2,
                    user_alpha=0.05,
                    random_state=42,
                    epochs=15):
        """Fit the LightFM model with user/item side features."""

        self.model_lightfm = LightFM(
            no_components=no_components,
            loss=loss,
            learning_rate=learning_rate,
            item_alpha=item_alpha,
            user_alpha=user_alpha,
            random_state=random_state)

        self.model_lightfm.fit(
            (self.sparse_user_item > 0) * 1,  # binary user-item matrix
            sample_weight=coo_matrix(
                self.user_item_matrix_lightfm),  # confidence weights C
            user_features=csr_matrix(
                self.user_feat_lightfm_fixed.values).tocsr(),
            item_features=csr_matrix(
                self.item_feat_lightfm_fixed.values).tocsr(),
            epochs=epochs,
            num_threads=8)

        return self.model_lightfm

    def precision_at_k_lightfm(self, model_lightfm, sparse_user_item, k=5):
        """Precision@k as computed by LightFM's built-in metric."""

        self.precision_res = precision_at_k(
            model_lightfm,
            sparse_user_item,
            user_features=csr_matrix(
                self.user_feat_lightfm_fixed.values).tocsr(),
            item_features=csr_matrix(
                self.item_feat_lightfm_fixed.values).tocsr(),
            k=k)

        return self.precision_res

    def recall_at_k_lightfm(self, model_lightfm, sparse_user_item, k=5):
        """Recall@k as computed by LightFM's built-in metric."""

        self.recall_res = recall_at_k(
            model_lightfm,
            sparse_user_item,
            user_features=csr_matrix(
                self.user_feat_lightfm_fixed.values).tocsr(),
            item_features=csr_matrix(
                self.item_feat_lightfm_fixed.values).tocsr(),
            k=k)

        return self.recall_res

    def _update_dict(self, user_id):
        """Register a previously unseen user in the lookup dicts."""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Return the item most similar to item_id."""

        # N=2: the most similar item is the item itself, so take the second.
        recs = self.model_als.similar_items(
            self.itemid_to_id[item_id],
            N=2)
        top_rec = recs[1][0]
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """If fewer than N recommendations, pad with overall top-popular items."""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Top-N recommendations via the standard implicit API."""

        self._update_dict(user_id=user)

        recs = model.recommend(
            userid=self.userid_to_id[user],
            user_items=csr_matrix(self.user_item_matrix).tocsr(),
            N=N,
            filter_already_liked_items=False,
            filter_items=[
                self.itemid_to_id[item] for item in self.items_to_filter
                if item in self.itemid_to_id.keys()
            ],  # only filter ids that actually exist in the matrix
            recalculate_user=True)

        res = [self.id_to_itemid[rec[0]] for rec in recs]

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Top-N recommendations from the ALS model."""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model_als, N=N)

    def get_lightfm_recommendations(self, user, N=5):
        """Top-N recommendations from the LightFM model."""

        self._update_dict(user_id=user)
        test_item_ids = np.arange(len(self.itemid_to_id))

        scores = self.model_lightfm.predict(
            # LightFM rejects numpy.int64, hence the int() cast.
            user_ids=int(self.userid_to_id[user]),
            item_ids=test_item_ids,
            user_features=csr_matrix(
                self.user_feat_lightfm_fixed.values).tocsr(),
            item_features=csr_matrix(
                self.item_feat_lightfm_fixed.values).tocsr(),
            num_threads=8)
        top_items = np.argsort(-scores)

        # Scores cover every item: convert ids back, then keep the first N.
        res = [self.id_to_itemid[item] for item in top_items][:N]
        # Padding is a no-op here in practice, since LightFM scores all items.
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_own_recommendations(self, user, N=5):
        """Recommend among the items the user already bought."""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Items similar to the user's top-N purchases. Does not filter item_id!"""

        top_users_purchases = self.top_purchases[self.top_purchases['user_id']
                                                 == user].head(N)

        res = top_users_purchases['item_id'].apply(
            lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Top-N items bought by the N most similar users. Does not filter item_id!"""

        res = []

        # N + 1: the most similar user is always the query user itself.
        similar_users = self.model_als.similar_users(self.userid_to_id[user],
                                                     N=N + 1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]  # drop the query user

        # Renamed loop variable: the original shadowed the `user` parameter.
        for similar_user in similar_users:
            # own_recommender works with raw user ids
            userid = self.id_to_userid[similar_user]
            res.extend(self.get_own_recommendations(userid, N=1))

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res
예제 #20
0
class Recommender:
    """Convenience wrapper around implicit's AlternatingLeastSquares.

    Translates raw user/item ids to contiguous matrix codes and back, and
    exposes similarity, recommendation and explanation helpers that return
    pandas DataFrames.
    """

    def __init__(self, factors=50):
        self.model = AlternatingLeastSquares(factors=factors,
                                             regularization=0.01,
                                             dtype=np.float64,
                                             iterations=50)

    def train(self, data):
        """Fit ALS on a dataframe with userid, itemid and confidence columns."""
        user_cats = data.userid.astype("category")
        item_cats = data.itemid.astype("category")

        matrix = coo_matrix((data.confidence.astype('float64'),
                             (item_cats.cat.codes.copy(),
                              user_cats.cat.codes.copy())))
        self.model.fit(matrix)
        self.t_matrix = matrix.T.tocsr()

        # Lookup tables between raw ids and matrix codes, both directions.
        self.userid_to_code = {cat: code for code, cat
                               in enumerate(user_cats.cat.categories)}
        self.itemid_to_code = {cat: code for code, cat
                               in enumerate(item_cats.cat.categories)}
        self.usercode_to_id = {code: cat for code, cat
                               in enumerate(user_cats.cat.categories)}
        self.itemcode_to_id = {code: cat for code, cat
                               in enumerate(item_cats.cat.categories)}

    def similar_items(self, itemid, N=10):
        """Items most similar to *itemid* as a DataFrame (itemid, similarity)."""
        similar = self.model.similar_items(self.itemid_to_code[itemid], N)
        rows = [(self.itemcode_to_id[code], score) for code, score in similar]
        return pd.DataFrame(rows, columns=["itemid", "similarity"])

    def recommendations(self, userid, N=10):
        """Top-N recommendations for *userid* as a DataFrame (itemid, confidence)."""
        recs = self.model.recommend(self.userid_to_code[userid], self.t_matrix, N)
        rows = [(self.itemcode_to_id[code], conf) for code, conf in recs]
        return pd.DataFrame(rows, columns=["itemid", "confidence"])

    def explain(self, userid, itemid):
        """Explain why *itemid* was recommended to *userid*."""
        return self.model.explain(self.userid_to_code[userid], self.t_matrix,
                                  self.itemid_to_code[itemid])

    def confidence(self, userid, itemid):
        """Predicted confidence: dot product of the item and user factors."""
        item_vec = self.model.item_factors[self.itemid_to_code[itemid]]
        user_vec = self.model.user_factors[self.userid_to_code[userid]]
        return item_vec.dot(user_vec)

    def user_factors(self):
        """Latent user factors as a DataFrame with a leading userid column."""
        factors = pd.DataFrame(self.model.user_factors).add_prefix("f")
        factors.insert(0, "userid",
                       factors.index.map(lambda code: self.usercode_to_id[code]))
        return factors

    def item_factors(self):
        """Latent item factors as a DataFrame with a leading itemid column."""
        factors = pd.DataFrame(self.model.item_factors).add_prefix("f")
        factors.insert(0, "itemid",
                       factors.index.map(lambda code: self.itemcode_to_id[code]))
        return factors

    def items_recommendations(self, itemids, N=10):
        """Recommend for a synthetic user whose history is exactly *itemids*."""
        item_codes = [self.itemid_to_code[item] for item in itemids]

        # Single-row user-items matrix describing the synthetic user's history.
        ones = [1 for _ in item_codes]
        row_idx = [0 for _ in item_codes]
        shape = (1, self.model.item_factors.shape[0])
        user_items = coo_matrix((ones, (row_idx, item_codes)),
                                shape=shape).tocsr()

        recs = self.model.recommend(0, user_items, N, recalculate_user=True)
        rows = [(self.itemcode_to_id[code], conf) for code, conf in recs]
        return pd.DataFrame(rows, columns=["itemid", "confidence"])
예제 #21
0
class AlsEstimator(TransformerMixin, BaseEstimator):
    """sklearn-compatible estimator around implicit's ALS recommenders.

    fit() builds the user-item matrix and trains both an ALS model and an
    item-item "own purchases" recommender; predict() returns a per-user
    recommendation list, optionally post-filtered by postfilter_func.
    """

    def __init__(self,
                 recommendations='als',
                 n_rec=5,
                 n_rec_pre=100,
                 n_new=2,
                 n_exp=1,
                 price_lte=7,
                 filter_item_id=-99,
                 filter=True,
                 filter_post=True,
                 postfilter_func=None,
                 factors=50,
                 regularization=0.01,
                 iterations=10,
                 matrix_values='quantity',
                 matrix_aggfunc='count',
                 weighting=True,
                 use_native=True,
                 use_gpu=False):

        self.n_rec = n_rec
        self.n_rec_pre = n_rec_pre
        self.n_new = n_new
        self.n_exp = n_exp
        self.price_lte = price_lte
        self.filter_item_id = filter_item_id
        self.filter = filter
        self.filter_post = filter_post
        self.postfilter_func = postfilter_func

        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.matrix_values = matrix_values
        self.matrix_aggfunc = matrix_aggfunc
        self.recommendations = recommendations
        # Bug fix: was hard-coded to True, silently ignoring the argument.
        self.weighting = weighting

        self.use_native = use_native
        self.use_gpu = use_gpu

    def _reset(self):
        """Drop every attribute computed by fit() so the estimator can be refit."""
        for attr in ('item_info', 'user_history', 'top_purchases',
                     'overall_top_purchases', 'user_item_matrix',
                     'id_to_itemid', 'id_to_userid', 'itemid_to_id',
                     'userid_to_id', '_fit'):
            if hasattr(self, attr):
                delattr(self, attr)

    def _n_rec_effective(self):
        """n_rec_pre when a postfilter will trim the list later, else n_rec."""
        return self.n_rec_pre if self.filter_post else self.n_rec

    @staticmethod
    def _prepare_matrix(data: pd.DataFrame, values: str, aggfunc: str):
        """Build the user-item pivot matrix (float, as implicit requires)."""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id',
                                          columns='item_id',
                                          values=values,
                                          aggfunc=aggfunc,
                                          fill_value=0)

        return user_item_matrix.astype(float)

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Build id <-> matrix-index lookup dicts for users and items."""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    def fit(self, X, y=None):
        """Fit the ALS and own-purchases recommenders on interaction log X."""
        self._reset()
        self.item_info = X.groupby('item_id').agg({
            'price': 'max',
            'SUB_COMMODITY_DESC': 'first'
        })
        self.user_history = pd.DataFrame(
            X.groupby('user_id').item_id.unique().rename('history'))

        # Top purchases per user.
        self.top_purchases = X.groupby(['user_id', 'item_id'
                                        ])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity',
                                       ascending=False,
                                       inplace=True)
        self.top_purchases = self.top_purchases[
            self.top_purchases['item_id'] != self.filter_item_id]

        # Top purchases over the whole dataset.
        self.overall_top_purchases = X.groupby(
            'item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity',
                                               ascending=False,
                                               inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            self.overall_top_purchases['item_id'] != self.filter_item_id]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist(
        )

        self.user_item_matrix = self._prepare_matrix(X, self.matrix_values,
                                                     self.matrix_aggfunc)

        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if self.weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = AlternatingLeastSquares(
            factors=self.factors,
            regularization=self.regularization,
            iterations=self.iterations,
            dtype=np.float32,
            use_native=self.use_native,
            use_gpu=self.use_gpu,
        )
        self.model.fit(csr_matrix(self.user_item_matrix).T.tocsr())

        self.model_own_recommender = ItemItemRecommender(K=1)
        self.model_own_recommender.fit(
            csr_matrix(self.user_item_matrix).T.tocsr())

        self._fit = True
        return self  # sklearn convention: fit() returns the estimator

    def transform(self, X):
        """Reduce X to its unique user ids (indexed by the ids themselves)."""
        if self._fit:
            X = X['user_id'].drop_duplicates()
            X.index = X.values
        return X

    def _update_dict(self, user_id):
        """Register a previously unseen user in the lookup dicts."""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Return the item most similar to item_id."""
        # N=2: the most similar item is the item itself, so take the second.
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)
        return self.id_to_itemid[recs[1][0]]

    def _extend_with_top_popular(self, recommendations, n_rec=None):
        """Pad recommendations with overall top-popular items up to n_rec.

        n_rec defaults to the globally effective count for backward
        compatibility with existing callers.
        """
        if n_rec is None:
            n_rec = self._n_rec_effective()

        if len(recommendations) < n_rec:
            recommendations.extend(self.overall_top_purchases[:n_rec])
            recommendations = recommendations[:n_rec]

        return recommendations

    def _get_recommendations(self, user, model, n_rec):
        """Top-n_rec recommendations for *user* from the given implicit model."""

        self._update_dict(user_id=user)
        try:
            res = [
                self.id_to_itemid[rec[0]] for rec in model.recommend(
                    userid=self.userid_to_id[user],
                    user_items=csr_matrix(self.user_item_matrix).tocsr(),
                    N=n_rec,
                    filter_already_liked_items=False,
                    filter_items=[self.itemid_to_id[self.filter_item_id]],
                    recalculate_user=True)
            ]
        except Exception:
            # Was a bare `except:` + return-in-finally, which swallowed even
            # KeyboardInterrupt; fall back to top-popular on model errors only.
            res = list()

        # Bug fix: padding previously ignored the requested n_rec and used the
        # global setting, which broke callers asking for a different count.
        res = self._extend_with_top_popular(res, n_rec=n_rec)

        assert len(res) == n_rec, 'Количество рекомендаций != {}'.format(
            n_rec)
        return res

    def get_als_recommendations(self, user):
        """ALS recommendations for *user*."""
        self._update_dict(user_id=user)
        # Bug fix: original passed a positional argument after a keyword one
        # (`..., model=self.model, n_rec)`) — a SyntaxError.
        return self._get_recommendations(user,
                                         model=self.model,
                                         n_rec=self._n_rec_effective())

    def get_own_recommendations(self, user):
        """Recommend among the items *user* already bought."""

        self._update_dict(user_id=user)
        # Bug fix: the required n_rec argument was missing entirely.
        return self._get_recommendations(user,
                                         model=self.model_own_recommender,
                                         n_rec=self._n_rec_effective())

    def get_similar_items_recommendations(self, user):
        """Items similar to the user's top purchases."""
        n_rec = self._n_rec_effective()

        top_users_purchases = self.top_purchases[self.top_purchases['user_id']
                                                 == user].head(n_rec)

        res = top_users_purchases['item_id'].apply(
            lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res, n_rec=n_rec)

        assert len(res) == n_rec, 'Количество рекомендаций != {}'.format(n_rec)
        return res

    def get_similar_users_recommendations(self, user):
        """Top items bought by the users most similar to *user*."""

        n_rec = self._n_rec_effective()
        res = []

        # n_rec + 1: the most similar user is always the query user itself.
        similar_users = self.model.similar_users(self.userid_to_id[user],
                                                 N=n_rec + 1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]  # drop the query user

        # Renamed loop variable: the original shadowed the `user` parameter.
        for similar_user in similar_users:
            user_rec = self._get_recommendations(
                similar_user, model=self.model_own_recommender, n_rec=1)
            res.extend(user_rec)

        res = self._extend_with_top_popular(res, n_rec=n_rec)

        assert len(res) == n_rec, 'Количество рекомендаций != {}'.format(n_rec)
        return res

    def predict(self, X):
        """Recommendation lists per user in X, optionally post-filtered."""
        X = self.transform(X)
        recommender = getattr(self,
                              f'get_{self.recommendations}_recommendations')

        rec = X.swifter.progress_bar(False).apply(
            lambda item: recommender(user=item))
        if self.postfilter_func is not None and self.filter_post:
            rec = self.postfilter_func(
                rec,
                item_info=self.item_info,
                user_history=self.user_history,
                n_rec=self.n_rec,
                n_new=self.n_new,
                n_exp=self.n_exp,
                price_lte=self.price_lte,
            )

        assert (rec.swifter.progress_bar(False).apply(len) == self.n_rec).all(
        ), f'The number of recommendations is not equal {self.n_rec}.'

        return rec
예제 #22
0
class Recommender:
    """ALS recommender over a Postgres favorites table.

    Configuration is read from **args (create() merges in lower-cased
    environment variables); favorites are dumped to CSV via psql, loaded
    into a sparse matrix, and used to train implicit's ALS model.
    """

    def __init__(self, **args):
        self.TRAINING_THREADS = int(
            args.get("training_threads", os.cpu_count()))
        self.ALS_FACTORS = args.get("als_factors", 128)
        self.ALS_REGULARIZATION = args.get("als_regularization", 1e-2)
        self.ALS_ITERATIONS = args.get("als_iterations", 15)
        self.MIN_POST_FAVS = args.get("min_post_favs", 5)
        self.MIN_USER_FAVS = args.get("min_user_favs", 50)
        self.MAX_FAVS = args.get("max_favs", 1e12)
        self.FAVS_PATH = args.get("favs_path", "data/favs.csv")
        self.MODEL_PATH = args.get("model_path", "data/recommender.pickle")
        self.DATABASE_URL = args.get("database_url",
                                     "postgresql://localhost/danbooru2")

    @staticmethod
    def create(**args):
        """Build, train and persist a recommender; env vars provide defaults."""
        env = {name.lower(): value for name, value in os.environ.items()}
        args = {**env, **args}  # explicit args win over environment

        recommender = Recommender(**args)
        recommender.dump_favorites()
        recommender.load_favorites()
        recommender.train()
        recommender.save(recommender.MODEL_PATH)

        return recommender

    @staticmethod
    def load(model_path):
        """Load a pickled recommender from disk.

        NOTE(review): pickle.load executes arbitrary code on load — only
        open model files from trusted sources.
        """
        with open(model_path, "rb") as file:
            return pickle.load(file)

    def dump_favorites(self):
        """Export qualifying favorites from Postgres to FAVS_PATH as CSV."""
        # NOTE(review): the query and the shell command are built from
        # configuration values (env/args), not request input; if these ever
        # become user-controlled this must be parameterized instead.
        query = f"""
      SELECT
        post_id,
        user_id
      FROM favorites
      WHERE
        post_id IN (SELECT id FROM posts WHERE fav_count > {self.MIN_POST_FAVS})
        AND user_id IN (SELECT id FROM users WHERE favorite_count > {self.MIN_USER_FAVS})
      ORDER BY post_id DESC
      LIMIT {self.MAX_FAVS}
    """

        self.shell(
            f"psql --no-psqlrc -c '\copy ({query}) TO STDOUT WITH (FORMAT CSV)' {self.DATABASE_URL} > {self.FAVS_PATH}"
        )

    def load_favorites(self):
        """Load the favorites CSV into a sparse post x user matrix."""
        favs_df = pd.read_csv(self.FAVS_PATH,
                              dtype=np.int32,
                              names=["post_id", "user_id"])
        favs_df = favs_df.astype("category")

        self.favorites = csr_matrix(
            (np.ones(favs_df.shape[0]), (favs_df["post_id"].cat.codes.copy(),
                                         favs_df["user_id"].cat.codes.copy())),
            dtype=np.int32)
        # Lookup tables between raw ids and matrix codes.
        self.users_to_id = {
            k: v
            for v, k in enumerate(favs_df["user_id"].cat.categories)
        }
        self.posts_to_id = {
            k: v
            for v, k in enumerate(favs_df["post_id"].cat.categories)
        }
        self.ids_to_post = {k: v for v, k in self.posts_to_id.items()}
        self.empty = csr_matrix(self.favorites.shape)

    def train(self):
        """Train the ALS model and record training time metadata."""
        self.model = AlternatingLeastSquares(
            calculate_training_loss=True,
            dtype=np.float32,
            num_threads=self.TRAINING_THREADS,
            factors=self.ALS_FACTORS,
            regularization=self.ALS_REGULARIZATION,
            iterations=self.ALS_ITERATIONS)

        start = time.monotonic()
        self.model.fit(self.favorites)
        end = time.monotonic()
        dur = int(end - start)

        # Free the training matrix; only the factor matrices are needed now.
        self.favorites = None
        self.trained_at = datetime.utcnow().isoformat()
        self.training_time = "{:02d}:{:02d}:{:02d}".format(
            dur // 3600, (dur % 3600 // 60), dur % 60)

    def recommend_for_user(self, user_id, limit=50):
        """Top posts for a user as (post_id, score) pairs; [] if unknown."""
        # Idiom fix: `user_id not in` instead of `not user_id in`.
        if user_id not in self.users_to_id:
            return []

        uid = self.users_to_id[user_id]
        recommendations = self.model.recommend(uid, self.empty, N=limit)
        recommendations = [(self.ids_to_post[id], float(score))
                           for id, score in recommendations]
        return recommendations

    def recommend_for_post(self, post_id, limit=50):
        """Posts similar to post_id as (post_id, score) pairs; [] if unknown."""
        if post_id not in self.posts_to_id:
            return []

        pid = self.posts_to_id[post_id]
        recommendations = self.model.similar_items(pid, N=limit)
        recommendations = [(self.ids_to_post[id], float(score))
                           for id, score in recommendations]
        return recommendations

    def metrics(self):
        """Summary statistics about the trained model."""
        return {
            "user_count":
            len(self.users_to_id),
            "post_count":
            len(self.posts_to_id),
            "factors":
            self.model.factors,
            # 4 bytes per float32 factor for every user and post.
            "model_size":
            4 * self.model.factors *
            (len(self.users_to_id) + len(self.posts_to_id)),
            "trained_at":
            self.trained_at,
            "training_time":
            self.training_time,
        }

    def save(self, model_path):
        """Pickle the whole recommender to model_path."""
        with open(model_path, "wb") as file:
            pickle.dump(self, file)

    def shell(self, cmd):
        """Run a shell command, streaming output; raises on non-zero exit."""
        subprocess.run(cmd,
                       stdout=sys.stdout,
                       stderr=sys.stderr,
                       shell=True,
                       check=True)