Example #1
class CosineRecommenderModel(BaseItemItemRecommenderModel):
    def __init__(self, products: np.ndarray, params: dict):
        self.cosine_recommender = CosineRecommender(**params)
        self._product_idx = dict(zip(products, range(len(products))))
        self._idx_product = products.tolist()

    def fit_recommender(self, purchases):
        user_item_matrix = self._create_user_item_matrix_from_purchases(
            purchases)
        logger.debug('Training CosineRecommender ...')
        self.cosine_recommender.fit(user_item_matrix.T)
        return self

    def recommend(self, products_counter):
        recs = []
        user_item_csr_row = self._make_user_item_csr_row(
            values=list(products_counter.values()),
            item_idx=products_counter.keys())

        cosine_preds = self.cosine_recommender.recommend(
            0,
            user_item_csr_row,
            N=30,
            recalculate_user=True,
            filter_already_liked_items=False)

        cosine_preds = [
            self._idx_product[idx] for (idx, score) in cosine_preds
        ]

        return cosine_preds
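
A hedged end-to-end sketch of the class above. It inherits the matrix helpers from BaseItemItemRecommenderModel (shown in Example #3), and it assumes numpy/pandas plus an implicit version whose recommend() yields (index, score) tuples, as the snippet's unpacking implies; the toy data is made up:

from collections import Counter

import numpy as np
import pandas as pd

products = np.array([101, 102, 103])
purchases = pd.DataFrame({
    'client_id': [1, 1, 2, 2],
    'product_id': [101, 102, 102, 103],
    'relevance': [1.0, 1.0, 2.0, 1.0],
})

model = CosineRecommenderModel(products, params={'K': 2})
model.fit_recommender(purchases)
# describe a user as a {product_id: purchase count} counter
print(model.recommend(Counter({101: 2, 103: 1})))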
Example #2
def fit_cosin_recommender(user_item_matrix):
    """Trains a model that recommends items from among those the user has bought."""

    cosin_recommender = CosineRecommender(K=2, num_threads=0)
    cosin_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

    return cosin_recommender
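
A minimal usage sketch for the helper above, assuming the imports the snippet omits (from implicit.nearest_neighbours import CosineRecommender and from scipy.sparse import csr_matrix); the toy matrix is made up:

import numpy as np

# toy 3-users x 4-items purchase-count matrix
user_item_matrix = np.array([[1, 0, 2, 0],
                             [0, 1, 1, 0],
                             [1, 0, 0, 3]])

recommender = fit_cosin_recommender(user_item_matrix)
# nearest neighbours of item 0; the return format depends on the implicit version
print(recommender.similar_items(0, 3))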
Example #3
class BaseItemItemRecommenderModel:
    def __init__(self, products: np.ndarray, params: dict):
        self.recommender = CosineRecommender(**params)
        self._product_idx = dict(zip(products, range(len(products))))
        self._idx_product = products.tolist()

    def _make_user_item_csr_row(self, values, item_idx):
        row = sp.coo_matrix(
            (values,
             (np.zeros(len(values)), [self._product_idx[p]
                                      for p in item_idx])),
            shape=(1, len(self._product_idx)))

        return row.tocsr()

    def _create_user_item_matrix_from_purchases(self, purchases):
        clients = purchases.client_id.unique()
        clients_mapper = dict(zip(clients, range(len(clients))))

        user_item_matrix = sp.coo_matrix(
            (purchases.relevance.values,
             (purchases.client_id.map(clients_mapper).values,
              purchases.product_id.map(self._product_idx).values)))

        user_item_matrix = user_item_matrix.tocsr()
        user_item_matrix.eliminate_zeros()
        return user_item_matrix

    def _fit_recommender(self, purchases):
        user_item_matrix = self._create_user_item_matrix_from_purchases(
            purchases)
        logger.debug('Training Recommender Model...')
        self.recommender.fit(user_item_matrix.T)
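
A small sketch of what _make_user_item_csr_row produces, assuming numpy (np), scipy.sparse (sp) and CosineRecommender are imported as the class expects; the product ids are made up:

import numpy as np

products = np.array([10, 20, 30])
model = BaseItemItemRecommenderModel(products, params={'K': 2})
row = model._make_user_item_csr_row(values=[2.0, 1.0], item_idx=[10, 30])
print(row.toarray())  # [[2. 0. 1.]] - one user row over the product vocabulary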
Example #4
def calculate_distance_matrix(dataset):
    logging.debug("Calculating similar items matrix. This might take a while")
    # generate a recommender model based off the input params
    model = CosineRecommender()
    # train the model
    logging.debug("calculating distant matrix")
    start = time.time()
    model.fit(dataset)
    similarity_matrix = model.similarity
    logging.debug("trained model '%s' in %s", 'cosine', time.time() - start)

    return similarity_matrix
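
After fit, implicit's CosineRecommender exposes the learned item-item matrix as the sparse `similarity` attribute (as the function above relies on), so neighbours can be read row-wise. A hedged sketch; the toy items-by-users data and item_id are made up:

from scipy.sparse import csr_matrix

dataset = csr_matrix([[1, 0, 2],
                      [0, 1, 1],
                      [1, 1, 0]])  # toy items x users matrix
similarity = calculate_distance_matrix(dataset)

item_id = 0  # hypothetical item of interest
row = similarity[item_id].tocoo()
neighbours = sorted(zip(row.col, row.data), key=lambda t: -t[1])
print(neighbours)  # (item index, cosine score) pairs, best first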
Example #5
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # let's weight this model by bm25_weight
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4, meaning we've filtered out all data for it)
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
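
A hedged invocation of the function above (it relies on implicit's get_movielens helper and the imports the snippet omits; the output path is illustrative):

calculate_similar_movies('similar-movies.tsv', model_name='cosine', min_rating=4.0, variant='100k')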
Example #6
    def __init__(self,
                 products: pd.DataFrame,
                 params_rec: dict,
                 params_catboost: dict,
                 catboost_features=CB_FEATURES):
        self.ranker = catboost.CatBoost(params_catboost)
        self._catboost_features = catboost_features
        self._nan_fill_dict = dict()

        self.recommender = CosineRecommender(**params_rec)
        self._product_idx = dict(zip(products.product_id,
                                     range(len(products))))

        self._idx_product = products.product_id.tolist()
        self._product_features = {
            row['product_id']: dict(row.drop(index='product_id'))
            for (i, row) in products.iterrows()
        }
Example #7
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # let's weight this model by bm25_weight
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
            # no ratings > 4, meaning we've filtered out all data for it)
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
Example #8
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    """
    :param input_path: path to the training dataset
    :param output_filename: name of the output file
    :param model_name: which model to use
    :param min_rating: rating threshold used for filtering
    :return:
    """

    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    if model_name == "als":
        model = AlternatingLeastSquares()

        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender()

    else:
        raise NotImplementedError("TODU: model %s" % model_name)


    m = m.tocsr()
    logging.debug("Training model :%s" % model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = rating_data.groupby("movieId").size()
    movie_lookup = dict((i, m) for i, m in
                        zip(movies_data['movieId'], movies_data['title']))
    to_generate = sorted(list(movies_data['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]

            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
Example #9
def calculate_similar_beers(input_path, output_filename, model_name="cosine"):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, beers, m = read_data(input_path)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # let's weight this model by bm25_weight
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top beers")

    user_count = ratings.groupby('beerId').size()
    beer_lookup = dict((i, m) for i, m in zip(beers['beerId'], beers['name']))
    to_generate = sorted(list(beers['beerId']),
                         key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for beerId in to_generate:
            if m.indptr[beerId] == m.indptr[beerId + 1]:
                continue
            beer = beer_lookup[beerId]
            for other, score in model.similar_items(beerId, 11):
                o.write("%s,%s,%s\n" % (beer, beer_lookup[other], score))
Example #10
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # let's weight this model by bm25_weight
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
Example #11
def calculate_recommendations(train_filename,
                              test_filename,
                              output_filename,
                              dir,
                              model_name="als",
                              factors=80,
                              regularization=0.8,
                              iterations=10,
                              exact=False,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    logging.debug("Calculating similar items. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", dir + train_filename)
    start = time.time()
    df, cnts = read_data(dir + train_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based on the input params
    if model_name == "als":
        if exact:
            model = AlternatingLeastSquares(factors=factors,
                                            regularization=regularization,
                                            use_native=use_native,
                                            use_cg=cg,
                                            iterations=iterations,
                                            dtype=dtype)
        else:
            model = AnnoyAlternatingLeastSquares(factors=factors,
                                                 regularization=regularization,
                                                 use_native=use_native,
                                                 use_cg=cg,
                                                 iterations=iterations,
                                                 dtype=dtype)

        # let's weight this model by bm25_weight
        logging.debug("weighting matrix by bm25_weight")
        cnts = bm25_weight(cnts, K1=100, B=0.8)

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(cnts)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # evaluate the model against the held-out test data
    test_data = pandas.read_csv(test_filename,
                                sep="\t",
                                usecols=[0, 1, 2],
                                names=['user', 'item', 'cnt'])
    test_data = test_data.groupby(["user", "item"], as_index=False).sum()
    users_test = set(test_data['user'])
    users_train = set(df['user'])

    # position is important for recommendation list and actual list
    dict_actual = {}
    for user in users_test:
        if user not in users_train:
            continue
        matched_df = test_data.loc[test_data["user"] == user]
        matched_df = matched_df.sort_values("cnt", ascending=False)
        dict_actual[user] = list(matched_df["item"])

    user_items = cnts.T.tocsr()
    # print(user_items)
    # recommend items for a user
    dict_recommended = {}  # for computing MAP and MP

    for user in users_test:
        if user not in users_train:
            continue
        # print(user)
        recommendations = model.recommend(user, user_items)
        df = pandas.DataFrame(recommendations, columns=["item", "score"])
        # print(recommendations)
        # print(df["item"])
        dict_recommended[user] = list(df["item"])

    ndcg = NDCG(dict_actual, dict_recommended)

    err = ERR(dict_actual, dict_recommended)

    map = MAP(dict_actual, dict_recommended)

    mp = MP(dict_actual, dict_recommended)

    with open("%siALS_result_%s.txt" % (dir, train_filename), "w") as o:
        o.write("NDCG\tERR\tMAP\tMP\n")
        o.write("%s\t%s\t%s\t%s\n" % (ndcg, err, map, mp))

    return (ndcg, err, map, mp)
Example #12
    train_pairs[user_column] = leusers.fit_transform(train_pairs[user_column])
    leservices = LabelEncoder()
    train_pairs[item_column] = leservices.fit_transform(train_pairs[item_column])

    test_pairs[user_column] = leusers.transform(test_pairs[user_column])
    test_pairs[item_column] = leservices.transform(test_pairs[item_column])

    n_users = len(leusers.classes_)
    n_items = len(leservices.classes_)

    sparse_matrix = csr_matrix(
        (np.ones(len(train_pairs)), (train_pairs[user_column], train_pairs[item_column])),
        shape=(n_users, n_items)
    )

    model = CosineRecommender()
    model.fit(sparse_matrix.T)

    print('saving artifacts')
    with open('leservices.pkl', 'wb') as f:
        pickle.dump(leservices, f)
    with open('kdf_rec.pkl', 'wb') as f:
        pickle.dump(model, f)
    scipy.sparse.save_npz('sparse_kdf.npz', sparse_matrix)
    services_df_for_save = services_df[
        services_df.id_clustered.isin(leservices.inverse_transform(train_pairs[item_column]))].reset_index(drop=True)
    services_df_for_save = services_df_for_save.drop_duplicates('id_clustered').reset_index(drop=True)
    services_df_for_save['id_enc_cluster'] = leservices.transform(services_df_for_save.id_clustered)
    most_popular_items = Counter(train_df['id_clustered'])
    services_df_for_save['popularity'] = services_df_for_save.id_enc_cluster.apply(most_popular_items.get)
    services_df_for_save.to_csv('services.csv', index=False)
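
A hedged sketch of loading the saved artifacts back for serving. The file names come from the snippet above; the user index is illustrative, and recommend()'s exact signature and return type depend on the implicit version:

import pickle

import scipy.sparse

with open('kdf_rec.pkl', 'rb') as f:
    model = pickle.load(f)
with open('leservices.pkl', 'rb') as f:
    leservices = pickle.load(f)
sparse_matrix = scipy.sparse.load_npz('sparse_kdf.npz').tocsr()

user_id = 0  # hypothetical encoded user id
recs = model.recommend(user_id, sparse_matrix[user_id], N=10)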
Example #13
    issues_train = issues_train.groupby(
        ['reader_id', 'author'])['record_id'].count().reset_index()
    issues_test = issues_test.groupby(
        ['reader_id', 'author'])['record_id'].count().reset_index()

    train_matrix = csr_matrix(
        (issues_train['record_id'],
         (issues_train['reader_id'], issues_train['author'])),
        shape=(n_readers, n_items)).astype('float64')

    test_matrix = csr_matrix(
        (issues_test['record_id'],
         (issues_test['reader_id'], issues_test['author'])),
        shape=(n_readers, n_items)).astype('float64')

    model = CosineRecommender()
    model.fit(train_matrix.T)

    author_top_items = get_authors_items(issues_prepared)

    similar_author_recommender = SimilarAuthorRecommender(model, train_matrix)
    author_top_items_recommender = AuthorTopItemsRecommender(
        similar_author_recommender, author_top_items, None)

    wrapper = RecommenderWrapper(user_encoder=user_lc,
                                 item_encoder=item_lc,
                                 model=author_top_items_recommender)

    dump_pickle(wrapper, AUTHOR_RECOMMENDER_PATH)
Example #14
def calculate_similar_movies(input_filename,
                             output_filename,
                             model_name="als", min_rating=4.0,
                             variant='20m'):
    # read in the input data file
    start = time.time()
    # titles, ratings = get_movielens(variant)

    user_item_df = read_user_item_data(input_filename)
    print(user_item_df)
    unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto(
        user_item_df)

    #user_item_df = user_item_df.sort_values(by=['user_index','item_index'])
    user_item_ratings = scipy.sparse.csr_matrix(
        (user_item_df['score'], (user_item_df['item_index'], user_item_df['user_index'])))
    print(user_item_ratings)
    '''
    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    '''

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares(
            factors=128, regularization=0.01, use_native=True, iterations=20, calculate_training_loss=True)

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        # ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_item_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    k = 10
    iterations = 10000
    similar_df_gen = similar_to_csv(model, k, unique_item, iterations)

    with tqdm.tqdm(total=len(unique_item) // iterations + 1) as progress:
        for similar_df_slice in similar_df_gen:
            similar_df_slice.to_csv(output_filename, mode='a', header=False, index=False)
            print("finish a batch")
            progress.update(1)
Example #15
    def __init__(self, products: np.ndarray, params: dict):
        self.cosine_recommender = CosineRecommender(**params)
        self._product_idx = dict(zip(products, range(len(products))))
        self._idx_product = products.tolist()
Example #16
def calculate_similar_artists(input_filename,
                              output_filename,
                              model_name="als",
                              factors=50,
                              regularization=0.01,
                              iterations=15,
                              exact=False,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    logging.debug("Calculating similar artists. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        if exact:
            model = AlternatingLeastSquares(factors=factors,
                                            regularization=regularization,
                                            use_native=use_native,
                                            use_cg=cg,
                                            dtype=dtype,
                                            iterations=iterations)
        else:
            model = AnnoyAlternatingLeastSquares(factors=factors,
                                                 regularization=regularization,
                                                 use_native=use_native,
                                                 use_cg=cg,
                                                 dtype=dtype,
                                                 iterations=iterations)

        # let's weight this model by bm25_weight
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # write out similar artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    # write out as a TSV of artistid, otherartistid, score
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in model.similar_items(artistid, 11):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
Example #17
def train(alpha_=None, beta_=None):

    logger.info('Running ...')

    # Load all types of interactions, movies catalogue and test users
    transactions = c.data_interim.join('transactions.pkl').load()
    bookmarks = c.data_interim.join('bookmarks.pkl').load()
    ratings = c.data_interim.join('ratings.pkl').load()
    catalogue = c.data_interim.join('catalogue.pkl').load()
    test_users = c.data_interim.join('test_users.pkl').load()

    logger.info('Data loaded')

    # Train/test split
    transactions_train, transactions_test = train_test_split_on_date(transactions)
    bookmarks_train, bookmarks_test = train_test_split_on_date(bookmarks)
    ratings_train, ratings_test = train_test_split_on_date(ratings)

    logger.info('Train/test split completed')

    # Processing
    ratings_train = to_universal_df_view(ratings_train)
    bookmarks_train = to_universal_df_view(bookmarks_train)
    transactions_train = to_universal_df_view(transactions_train)
    # ... and combining into a single dataframe
    all_interaction_train = combine_interaction_types(ratings_train, bookmarks_train, transactions_train)

    # Processing
    ratings = prepare_df_with_interactions(to_universal_df_view(ratings))
    bookmarks = prepare_df_with_interactions(to_universal_df_view(bookmarks))
    transactions = prepare_df_with_interactions(to_universal_df_view(transactions))

    # ... and combining into a single dataframe
    all_interaction = pd.concat([ratings, bookmarks, transactions])
    all_interaction = all_interaction.reset_index(drop=True)
    all_interaction = all_interaction.drop_duplicates(subset=['user_id', 'item_id']).reset_index(drop=True)

    unique_items = set(catalogue.element_uid.unique())

    # Creating global csr matrix
    csr, dict_of_users, dict_of_items = create_interaction_matrix(all_interaction, unique_items)
    assert np.all([k == v for k, v in dict_of_items.items()])

    logger.info('Csr_matrix with all interactions created')

    interaction_train = prepare_df_with_interactions(transactions_train)
    interaction_test = prepare_df_with_interactions(transactions_test)
    real_int_train_csr, real_int_test_csr = create_interaction_matrices(interaction_train, interaction_test,
                                                                        dict_of_users, dict_of_items, logger=None)

    logger.info('Data preparation finished')

    # count true labels for validation
    # validation dictionary for train data and test data
    train_true_dict = csr_to_dict(real_int_train_csr)
    test_true_dict = csr_to_dict(real_int_test_csr)
    # and an example with set inside for time optimization during filtering
    train_true_dict_set = {k: set(v) for k, v in train_true_dict.items()}
    # test_true_dict_set = {k: set(v) for k, v in test_true_dict.items()}

    # ----------------------------------------------------------------------------------------- #
    # Recency function parameter search
    if (alpha_ is None) and (beta_ is None):

        # Prepare attributes for recency function
        all_interaction_train['time_scaled'] = minmax_scale(all_interaction_train.ts)
        # All operations are done on the train set
        all_interaction_train = all_interaction_train.merge(
            all_interaction_train.groupby('element_uid').time_scaled.min().reset_index().rename(
                {'time_scaled': 'element_launch_ts'}, axis=1))
        all_interaction_train['seen_ts_since_launch'] = all_interaction_train['time_scaled'] - all_interaction_train[
            'element_launch_ts']

        all_interaction_train = all_interaction_train[
            ['element_uid', 'user_uid', 'element_launch_ts', 'seen_ts_since_launch']]

        # takes parameters to search
        alpha_par = config_recency['grid_params']['alpha']
        beta_par = config_recency['grid_params']['beta']

        alpha_range_params = np.arange(alpha_par['min'], alpha_par['max'], alpha_par['step'])
        beta_range_params = np.arange(beta_par['min'], beta_par['max'], beta_par['step'])

        iters = [alpha_range_params, beta_range_params]
        all_variants = list(itertools.product(*iters))
        np.random.shuffle(all_variants)

        logger.info('Starting search ...')

        # ----------------------------------------------------------------------------------------- #

        for element in all_variants:

            alpha_ = element[0]
            beta_ = element[1]

            interaction_train_ = recency_function(all_interaction_train, alpha_, beta_)
            train_csr, test_csr = create_interaction_matrices(interaction_train_, interaction_test, dict_of_users,
                                                              dict_of_items, logger=None)
            train_true_dict_set = {k: set(v) for k, v in train_true_dict.items()}

            model = CosineRecommender(K=10200)
            model.fit(train_csr.T, show_progress=False)
        
            # without filtering in model
            test_predict = {}
            for id_ in tqdm(np.unique(test_csr.nonzero()[0])):
                test_predict[id_] = model.recommend(id_, train_csr, N=300, filter_already_liked_items=False)
            test_predict = {k: [x[0] for x in v] for k, v in test_predict.items()}
            # get rid of movies watched in train
            test_predict = {k: [x for x in v if x not in train_true_dict_set.get(k, [])][:20]
                            for k, v in tqdm(test_predict.items())}

            mapk = metric.metric(test_true_dict, test_predict)
            logger.info('alpha = {0}, beta = {1}, mnap@20 = {2}'.format(alpha_, beta_, mapk))

            # dump mlflow params
            run = mlflow.start_run(experiment_id=0)
            mlflow.set_tag("tag", "Implicit_with_recency")
            mlflow.log_param('lib', 'implicit')
            mlflow.log_param('feedbacks_mode', 'implicit')
            mlflow.log_param('type', 'CF')
            # search-related params
            mlflow.log_param('alpha', alpha_)
            mlflow.log_param('beta', beta_)
            mlflow.log_metric('MNAP_at_20_test', mapk)
            mlflow.end_run()
        # ----------------------------------------------------------------------------------------- #
    # ----------------------------------------------------------------------------------------- #

    # ----------------------------------------------------------------------------------------- #
    # Pruning parameter search
    else:
        pruning_range = config_recency['grid_params']['max_len']
        
        all_interaction_train = all_interaction_train.sort_values(by=['user_uid', 'ts'], ascending=False)
        for max_len in range(pruning_range['min'], pruning_range['max'], pruning_range['step']):
            # ----------------------------------------------------------------------------------------- #
            logger.info('max_len = {}'.format(max_len))
            all_interaction_train_pruned = all_interaction_train.groupby('user_uid').apply(lambda x: x[:max_len]).reset_index(drop=True)
        
            # All operations are done on the train set
            all_interaction_train_pruned = all_interaction_train_pruned.merge(
                all_interaction_train_pruned.groupby('element_uid').time_scaled.min().reset_index().rename(
                    {'time_scaled': 'element_launch_ts'}, axis=1))
            all_interaction_train_pruned['seen_ts_since_launch'] = (
                all_interaction_train_pruned['time_scaled']
                - all_interaction_train_pruned['element_launch_ts'])
            all_interaction_train_pruned = all_interaction_train_pruned[
                ['element_uid', 'user_uid', 'element_launch_ts', 'seen_ts_since_launch']]

            inv_test_users = [dict_of_users.get(k, None) for k in test_users['users']]
            inv_test_users = [k for k in inv_test_users if k is not None]
            inv_test_users = set(inv_test_users)
            test_true_dict_50k = {k: v for k, v in test_true_dict.items() if k in inv_test_users}
            
            interaction_train_ = recency_function(all_interaction_train_pruned, int(alpha_), int(beta_))
            train_csr, test_csr = create_interaction_matrices(interaction_train_, interaction_test, dict_of_users, dict_of_items, logger=None)
            
            model = CosineRecommender(K=10200)
            model.fit(train_csr.T, show_progress=False)
            
            # without filtering in model
            test_predict = {}
            for id_ in tqdm(inv_test_users):
                test_predict[id_] = model.recommend(id_, train_csr, N=300, filter_already_liked_items=False)
            test_predict = {k: [x[0] for x in v] for k, v in test_predict.items()}
            # get rid of movies watched in train
            test_predict = {k: [x for x in v if x not in train_true_dict_set.get(k, [])][:20] for k, v in tqdm(test_predict.items())}
            mapk = metric.metric(test_true_dict_50k, test_predict)
            logger.info('mapk = {}'.format(mapk))

            # dump mlflow params
            run = mlflow.start_run(experiment_id=1)
            mlflow.set_tag("tag", "Implicit_with_pruning")
            mlflow.log_param('lib', 'implicit')
            mlflow.log_param('feedbacks_mode', 'implicit')
            mlflow.log_param('type', 'CF')
            # search-related params
            mlflow.log_param('max_len', max_len)
            mlflow.log_metric('MNAP_at_20_test', mapk)
            mlflow.end_run()
Example #18
def calculate_similar_artists(input_filename,
                              output_filename,
                              model="als",
                              factors=50,
                              regularization=0.01,
                              iterations=15,
                              exact=False,
                              trees=20,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    start = time.time()

    if model == "als":
        logging.debug("weighting matrix by bm25")
        weighted = bm25_weight(plays, K1=100, B=0.8)

        logging.debug("calculating factors")
        artist_factors, user_factors = alternating_least_squares(
            weighted,
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            use_native=use_native,
            dtype=dtype,
            use_cg=cg)
        logging.debug("calculated factors in %s", time.time() - start)

        if exact:
            calc = TopRelated(artist_factors)
        else:
            calc = ApproximateTopRelated(artist_factors, trees)
        logging.debug("writing top related to %s", output_filename)
        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in calc.get_related(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    elif model in ("bm25", "tfidf", "cosine", "smoothed_cosine", "ochiai",
                   "overlap"):
        if model == "bm25":
            scorer = BM25Recommender(K1=100, B=0.5)

        elif model == "tfidf":
            scorer = TFIDFRecommender()

        elif model == "cosine":
            scorer = CosineRecommender()

        else:
            raise NotImplementedError("TODO: model %s" % model)

        logging.debug("calculating similar items")
        start = time.time()
        scorer.fit(plays)  # K (number of neighbours) is set on the constructors above
        logging.debug("calculated all_pairs_knn in %s", time.time() - start)

        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in scorer.similar_items(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
Example #19
class RetailHeroRecommender(BaseItemItemRecommenderModel):
    def __init__(self,
                 products: pd.DataFrame,
                 params_rec: dict,
                 params_catboost: dict,
                 catboost_features=CB_FEATURES):
        self.ranker = catboost.CatBoost(params_catboost)
        self._catboost_features = catboost_features
        self._nan_fill_dict = dict()

        self.recommender = CosineRecommender(**params_rec)
        self._product_idx = dict(zip(products.product_id,
                                     range(len(products))))

        self._idx_product = products.product_id.tolist()
        self._product_features = {
            row['product_id']: dict(row.drop(index='product_id'))
            for (i, row) in products.iterrows()
        }

    def _cat_features(self):
        return (
            'gender',
            'level_1',
            'level_2',
            'level_3',
            'level_4',
            'product_id',
            'is_alcohol',
            'brand_id',
            'store_id',
            'vendor_id',
            'segment_id',
            'is_own_trademark',
        )

    def _fillna(self, df):
        for feature, fill_value in self._nan_fill_dict.items():
            df.loc[:, feature] = df.loc[:, feature].fillna(fill_value)

        return df

    def _fit_ranker(self, train, valid=None):
        features = self._catboost_features
        cat_features = self._cat_features()
        cat_inds = [i for i, col in enumerate(features) if col in cat_features]

        for feature in features:
            if feature in cat_features:
                self._nan_fill_dict[feature] = 'unknown'
            else:
                self._nan_fill_dict[feature] = np.nanmedian(train[feature])

        train = self._fillna(train)

        logger.debug(f'Train shape: {train.shape}')
        logger.debug(f'Train target mean: {train.target.mean()}')
        for feature, nuniques in train[features].nunique().to_dict().items():
            logger.debug(f'{feature} has {nuniques} values')

        train_pool = catboost.Pool(data=train[features],
                                   label=train['target'],
                                   weight=train.weight,
                                   cat_features=cat_inds,
                                   group_id=train['client_id'])

        if valid is not None:
            valid = self._fillna(valid)
            val_pool = catboost.Pool(data=valid[features],
                                     label=valid['target'],
                                     weight=valid.weight,
                                     cat_features=cat_inds,
                                     group_id=valid['client_id'])

        else:
            val_pool = None

        logger.debug('Training Ranker Model...')
        self.ranker.fit(train_pool,
                        eval_set=val_pool,
                        early_stopping_rounds=100)

    def train_model(self, train_rec: pd.DataFrame, train_ranker: pd.DataFrame,
                    ranker_labels_dict: dict):
        self._fit_recommender(train_rec)

        cb_feats_df = train_ranker.sort_values(
            by=['client_id', 'transaction_datetime'])
        cb_feats_df = cb_feats_df.drop_duplicates(
            subset=['client_id', 'product_id'], keep='last')

        logger.debug('Preparing Train data for Ranker Model...')
        implicit_preds = []
        for cid, df in train_ranker.groupby('client_id'):
            csr_row = self._make_user_item_csr_row(
                values=df.relevance.values, item_idx=df.product_id.values)

            pred = self.recommender.recommend(0,
                                              csr_row,
                                              N=30,
                                              recalculate_user=True,
                                              filter_already_liked_items=False)

            for (i, (idx, score)) in enumerate(pred):
                implicit_preds.append({
                    'client_id': cid,
                    'score': score,
                    'product_id': self._idx_product[idx],
                    'weight': len(pred) - i,
                    'target': int(self._idx_product[idx] in ranker_labels_dict[cid]),
                })

        logger.debug('Finished preparing Train data')

        len_before = len(implicit_preds)
        implicit_preds = pd.DataFrame(implicit_preds)
        cb_feats_df = pd.merge(implicit_preds,
                               cb_feats_df,
                               on=['client_id', 'product_id'],
                               how='left')

        assert len(cb_feats_df) == len_before, 'Shape after merge is different'
        uniq_clients = cb_feats_df.client_id.unique()
        train = cb_feats_df[cb_feats_df.client_id.isin(uniq_clients[:8000])]
        valid = cb_feats_df[cb_feats_df.client_id.isin(uniq_clients[8000:])]

        self._fit_ranker(train, valid=valid)

    def recommend(self, products_counter: dict, histdata_products: dict):
        user_item_csr_row = self._make_user_item_csr_row(
            values=list(products_counter.values()),
            item_idx=products_counter.keys())
        rec_preds = self.recommender.recommend(
            0,
            user_item_csr_row,
            N=30,
            recalculate_user=True,
            filter_already_liked_items=False)

        data_list = []
        nan_hist_data = dict(
            store_id=np.nan,
            regular_points_received=np.nan,
            express_points_received=np.nan,
        )

        for i, score in rec_preds:
            pid = self._idx_product[i]
            row_dic = dict(
                product_id=pid,
                score=score,
                age=histdata_products['age'],
                gender=histdata_products['gender'],
            )

            row_dic.update(self._product_features[pid])
            row_dic.update(histdata_products.get(pid, nan_hist_data))
            data_list.append(row_dic)

        preds_df = pd.DataFrame(data_list)
        preds_df = self._fillna(preds_df)

        preds_df.loc[:, 'catb_score'] = self.ranker.predict(
            preds_df[self._catboost_features])
        result = preds_df.sort_values(by='catb_score',
                                      ascending=False).product_id.tolist()

        if len(result) < 30:
            for t_prod in TOP_PRODUCTS:
                if t_prod not in result:
                    result.append(t_prod)

        return result[:30]
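
A hedged call sketch for an already-trained RetailHeroRecommender instance `model`. products_counter maps product_id to purchase count, and histdata_products mirrors the keys the method reads ('age', 'gender', plus optional per-product history); every value here is made up:

products_counter = {101: 3, 205: 1}
histdata_products = {
    'age': 35,
    'gender': 'F',
    101: dict(store_id=7, regular_points_received=1.5, express_points_received=0.0),
}
top_30 = model.recommend(products_counter, histdata_products)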