Example #1
    def test_explain(self):
        counts = csr_matrix(
            [
                [1, 1, 0, 1, 0, 0],
                [0, 1, 1, 1, 0, 0],
                [1, 4, 1, 0, 7, 0],
                [1, 1, 0, 0, 0, 0],
                [9, 0, 4, 1, 0, 1],
                [0, 1, 0, 0, 0, 1],
                [0, 0, 2, 0, 1, 1],
            ],
            dtype=np.float64,
        )
        user_items = counts * 2
        item_users = user_items.T

        model = AlternatingLeastSquares(
            factors=4,
            regularization=20,
            use_native=False,
            use_cg=False,
            use_gpu=False,
            iterations=100,
            random_state=23,
        )
        model.fit(user_items, show_progress=False)

        userid = 0

        # Assert recommendations are the same if we recompute user vectors
        recs = model.recommend(userid, item_users, N=10)
        recalculated_recs = model.recommend(userid, item_users, N=10, recalculate_user=True)
        for (item1, score1), (item2, score2) in zip(recs, recalculated_recs):
            self.assertEqual(item1, item2)
            self.assertAlmostEqual(score1, score2, 4)

        # Assert explanation makes sense
        top_rec, score = recalculated_recs[0]
        score_explained, contributions, W = model.explain(userid, item_users, itemid=top_rec)
        scores = [s for _, s in contributions]
        items = [i for i, _ in contributions]
        self.assertAlmostEqual(score, score_explained, 4)
        self.assertAlmostEqual(score, sum(scores), 4)
        self.assertEqual(scores, sorted(scores, reverse=True), "Scores not in order")
        self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

        # Assert explanation with precomputed user weights is correct
        top_score_explained, top_contributions, W = model.explain(
            userid, item_users, itemid=top_rec, user_weights=W, N=2
        )
        top_scores = [s for _, s in top_contributions]
        top_items = [i for i, _ in top_contributions]
        self.assertEqual(2, len(top_contributions))
        self.assertAlmostEqual(score, top_score_explained, 4)
        self.assertEqual(scores[:2], top_scores)
        self.assertEqual(items[:2], top_items)
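
A hedged aside on the test above: under this pre-0.5 implicit API, the score that recommend() ranks (and that explain() decomposes into per-item contributions) is the dot product of the learned factor vectors. A minimal, self-contained sketch of that identity (all names below are illustrative, not from the test):

import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

interactions = csr_matrix(np.eye(5, dtype=np.float64))  # toy 5x5 interaction matrix
model = AlternatingLeastSquares(factors=2, iterations=5, use_gpu=False)
model.fit(interactions)

# the raw score behind a (user 0, item 3) recommendation
score = model.user_factors[0].dot(model.item_factors[3])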
Example #2
class ALS(Model):
    def __init__(self):
        """ Model inicialization 
        """
        self.model = AlternatingLeastSquares()
        self.trainset = None

    def fit(self, X, y):
        # build a COO matrix from X (user ids, item ids) and y (ratings)
        data = coo_matrix((y, (X[:, 0], X[:, 1])))
        self.trainset = data
        data = data.transpose()  # rows: [n_items]; columns: [n_users]
        self.model.fit(data)

    def recommend(self, user_id, N=1):
        # array of tuples (item_id, rating)
        n_recommendations = self.model.recommend(user_id, self.trainset.tocsr(), N=N)
        # convert the array of tuples into an array of item_ids
        result = np.zeros(N, dtype=int)
        for pos, (item_id, _) in enumerate(n_recommendations):
            result[pos] = item_id
        return result

    def get_params(self, deep=True):
        return dict()
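
A hedged usage sketch for the wrapper above. The data layout is an assumption read off the coo_matrix call: X carries integer user ids and item ids in its first two columns, y the ratings.

import numpy as np

X = np.array([[0, 0], [0, 1], [1, 1], [1, 2], [2, 0]])  # columns: user_id, item_id
y = np.array([3.0, 1.0, 2.0, 4.0, 5.0])

als = ALS()
als.fit(X, y)
# up to N item ids; slots stay 0 when fewer than N items survive the
# default already-liked filter
print(als.recommend(user_id=0, N=2))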
Example #3
class ALSRecommender(BaseRecommender):
    """
    implement alternating least squares algorithm implementation based on implicit library
    """
    def fit(self,
            train_df,
            col_user=cfg.USER_COL,
            col_item=cfg.ITEM_COL,
            col_rating=cfg.DEFAULT_RATING_COL,
            factors=100,
            confidence=5,
            regularization=0.1):
        """
        Trains implicit ALS recommender on train data
        :param train_df: pandas DataFrame with train data
        :param col_user: str column name for user
        :param col_item: str column name for item
        :param col_rating: str column name for ratings
        :param factors: int number of factors to use in ALS model
        :param confidence: int as described in implicit documentation
        :param regularization: float higher values mean stronger regularization
        :return: None
        """
        BaseRecommender.fit(self, train_df, col_user, col_item, col_rating)
        self.train_df[self.col_rating] = train_df[self.col_rating] * confidence
        self.uii_matrix = self.get_uii_matrix()
        self.als = AlternatingLeastSquares(factors=factors,
                                           use_gpu=False,
                                           regularization=regularization)
        self.als.fit(self.uii_matrix.T)

    def predict(self, test_df, k=cfg.DEFAULT_K):
        """
        recommend k items for each user in test_df
        :param test_df: pandas DataFrame with test_users and truth recommendations
        :param k: int number of items to recommend
        :return: pandas DataFrame with k recommendations for each user in test_df
        """
        test_users_indices = [
            self.users.index(user) for user in test_df[self.col_user].values
            if user in self.users
        ]

        prediction_records = []
        for item in test_users_indices:
            doc = {
                self.col_user:
                self.users[item],
                self.col_item: [
                    self.items[it[0]] for it in self.als.recommend(
                        item,
                        self.uii_matrix,
                        k,
                        filter_already_liked_items=False)
                ]
            }
            prediction_records.append(doc)
        prediction = pd.DataFrame.from_records(prediction_records)

        return prediction
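
A note on the confidence argument above: multiplying the rating column before fitting scales the confidence the solver assigns to observed entries, in the spirit of c_ui = 1 + alpha * r_ui from Hu, Koren & Volinsky's implicit-feedback ALS, with confidence playing the role of alpha. A hedged equivalent using implicit directly (uii_matrix, a users x items sparse matrix, is an assumed stand-in for self.get_uii_matrix()):

alpha = 5
model = AlternatingLeastSquares(factors=100, regularization=0.1, use_gpu=False)
model.fit((alpha * uii_matrix).T)  # same effect as train_df[col_rating] * confidence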
Example #4
    def test_explain(self):
        counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                             [0, 1, 1, 1, 0, 0],
                             [1, 4, 1, 0, 7, 0],
                             [1, 1, 0, 0, 0, 0],
                             [9, 0, 4, 1, 0, 1],
                             [0, 1, 0, 0, 0, 1],
                             [0, 0, 2, 0, 1, 1]], dtype=np.float64)
        user_items = counts * 2
        item_users = user_items.T

        model = AlternatingLeastSquares(factors=4,
                                        regularization=20,
                                        use_native=False,
                                        use_cg=False,
                                        iterations=100)
        np.random.seed(23)
        model.fit(user_items, show_progress=False)

        userid = 0

        # Assert recommendations are the same if we recompute user vectors
        recs = model.recommend(userid, item_users, N=10)
        recalculated_recs = model.recommend(userid, item_users, N=10, recalculate_user=True)
        for (item1, score1), (item2, score2) in zip(recs, recalculated_recs):
            self.assertEqual(item1, item2)
            self.assertAlmostEqual(score1, score2, 4)

        # Assert explanation makes sense
        top_rec, score = recalculated_recs[0]
        score_explained, contributions, W = model.explain(userid, item_users, itemid=top_rec)
        scores = [s for _, s in contributions]
        items = [i for i, _ in contributions]
        self.assertAlmostEqual(score, score_explained, 4)
        self.assertAlmostEqual(score, sum(scores), 4)
        self.assertEqual(scores, sorted(scores, reverse=True), "Scores not in order")
        self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

        # Assert explanation with precomputed user weights is correct
        top_score_explained, top_contributions, W = model.explain(
            userid, item_users, itemid=top_rec, user_weights=W, N=2)
        top_scores = [s for _, s in top_contributions]
        top_items = [i for i, _ in top_contributions]
        self.assertEqual(2, len(top_contributions))
        self.assertAlmostEqual(score, top_score_explained, 4)
        self.assertEqual(scores[:2], top_scores)
        self.assertEqual(items[:2], top_items)
Example #5
    def _add_als_recs(self,
                      n_factors=20,
                      regularization=0.001,
                      iterations=20,
                      num_threads=0):

        als_model = AlternatingLeastSquares(factors=n_factors,
                                            regularization=regularization,
                                            iterations=iterations,
                                            num_threads=num_threads)

        als_model.fit(csr_matrix(self.user_item_matrix).T.tocsr())
        self.als_model = als_model

        als_recs = lambda i: [
            self.id_to_itemid[rec[0]] for rec in als_model.recommend(
                userid=int(i),
                user_items=csr_matrix(self.user_item_matrix).tocsr(),
                N=self.first_model_rec_limit,
                filter_items=[self.itemid_to_id[999999]],
                recalculate_user=True,
                filter_already_liked_items=False)
        ]
        self.df_users['als_recommender'] = None
        self.df_users.loc[~self.df_users['id'].isnull(),
                          'als_recommender'] = self.df_users.loc[
                              ~self.df_users['id'].isnull(),
                              'id'].map(als_recs)
        self.df_users['als_recommender'] = self.df_users[
            'als_recommender'].map(lambda val: val if isinstance(val, list) else [])

        # add the embeddings to df_users and df_items as features
        als_user_factors = pd.DataFrame(
            self.als_model.user_factors,
            columns=[
                f'als_user_factor_{i}'
                for i in range(self.als_model.user_factors.shape[1])
            ])
        als_user_factors['id'] = als_user_factors.index
        self.df_users = pd.merge(left=self.df_users,
                                 right=als_user_factors,
                                 on='id',
                                 how='left')

        als_item_factors = pd.DataFrame(
            self.als_model.item_factors,
            columns=[
                f'als_item_factor_{i}'
                for i in range(self.als_model.item_factors.shape[1])
            ])
        als_item_factors['id'] = als_item_factors.index
        self.df_items = pd.merge(left=self.df_items,
                                 right=als_item_factors,
                                 on='id',
                                 how='left')
Example #6
class WRMF(Recsys):
    def __init__(self, k, reg=1e-4, n_iters=15):
        """"""
        super().__init__()
        self.k = k
        self.reg = reg
        self.n_iters = n_iters
        self.als = AlternatingLeastSquares(
            k, regularization=reg, iterations=n_iters
        )

    def _recommend(self, user, user_item, n, gt=None):
        return np.array([
            itemid for itemid, score
            in self.als.recommend(user, user_item, n)
        ])
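
A hedged usage sketch for WRMF above (the Recsys base class is not shown; user_item is assumed to be a users x items csr_matrix, matching the recommend() call of this implicit version):

from scipy.sparse import csr_matrix

user_item = csr_matrix([[1.0, 0.0, 2.0], [0.0, 3.0, 1.0]])
wrmf = WRMF(k=2, n_iters=5)
wrmf.als.fit(user_item.T.tocsr())  # this implicit version fits on items x users
print(wrmf._recommend(user=0, user_item=user_item, n=2))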
Example #7
def implicit(args):
    row_dict, col_dict = {}, {}
    rows, cols, data = [], [], []
    for feedback in iter_implicit_feedbacks(
            os.path.join(args.in_dir, 'ua.base')):
        i = row_dict.setdefault(feedback.item_id, len(row_dict))
        j = col_dict.setdefault(feedback.user_id, len(col_dict))
        rows.append(i)
        cols.append(j)
        data.append(1)
    item_user_data = csr_matrix((data, (rows, cols)),
                                shape=(len(row_dict), len(col_dict)))

    model = AlternatingLeastSquares(factors=8)
    model.fit(item_user_data)

    # Evaluation
    user_items = item_user_data.T.tocsr()
    user_items_test = collections.defaultdict(set)
    for feedback in iter_implicit_feedbacks(
            os.path.join(args.in_dir, 'ua.test')):
        try:
            i = row_dict[feedback.item_id]
            j = col_dict[feedback.user_id]
        except KeyError:
            continue
        user_items_test[j].add(i)

    topk = 10
    precision = 0
    for user_index, item_indices in user_items_test.items():
        recommendations = model.recommend(user_index, user_items, topk, True)
        precision += sum(1 if item_index in item_indices else 0
                         for item_index, _ in recommendations) / topk
    precision = precision / len(user_items_test)
    print('precision:', precision)

    item_id = 1
    item_index = row_dict[item_id]
    index2id = {value: key for key, value in row_dict.items()}
    for _item_index, score in model.similar_items(item_index, 10):
        _item_id = index2id[_item_index]
        print(_item_id)
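
For reference, the evaluation loop above computes mean precision@10: per test user, the fraction of the topk recommended item indices that fall in the held-out set. The same quantity in one compact, equivalent expression, reusing the variables above:

precision_alt = sum(
    len({i for i, _ in model.recommend(u, user_items, topk, True)} & items) / topk
    for u, items in user_items_test.items()) / len(user_items_test)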
Example #8
    def mixed_(self,
               train_songs_A,
               train_tags_A,
               test_songs_A,
               test_tags_A,
               song_ntop=500,
               tag_ntop=50,
               iteration=20):

        print("MF for song / CF for tag...")

        res = []

        # song
        songs_A = spr.vstack([test_songs_A, train_songs_A])
        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)
        als_model.fit(songs_A.T * 100)

        # tag
        train_tags_A_T = train_tags_A.T.tocsr()  # shape: n_tags x n_train playlists
        tag_val = test_tags_A.dot(train_tags_A_T)

        cand_tag_matrix = tag_val.dot(train_tags_A)

        del tag_val

        for pid in tqdm(range(test_songs_A.shape[0])):

            # song
            song_dirty = self.plylst_test.loc[(self.n_train + pid), "song_dirty"] == 1
            cand_song = als_model.recommend(
                pid,
                test_songs_A,
                N=song_ntop,
                filter_already_liked_items=not song_dirty)

            rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
            rec_song_score = [x[1] for x in cand_song]

            # tag
            tag_row = cand_tag_matrix.getrow(pid).toarray().reshape(-1)
            cand_tag_idx = tag_row.argsort()[-tag_ntop - 5:][::-1]

            tags_already = self.plylst_test.loc[self.n_train + pid, "tags_id"]
            rec_tag_idx = remove_seen(tags_already, cand_tag_idx)[:tag_ntop]

            # scores for the recommended tags
            rec_tag_score = [tag_row[i] for i in rec_tag_idx]

            res.append({
                "id": self.plylst_nid_id[self.n_train + pid],
                "songs": rec_song_idx,
                "tags": [self.tag_tid_id[i] for i in rec_tag_idx],
                "songs_score": rec_song_score,
                "tags_score": rec_tag_score
            })

        return res
Example #9
class ALSpkNN():
    '''
    k = # of neighbours for KNN
    knn_frac = % of KNN recommendations
    max_overlap = maximum % overlap between user and their MUSIC neighbours
    min_songs = only use users with > min_songs in our KNN code
    mode = one of ['popular', 'weighted_random', 'random']
    '''
    def __init__(self,
                 user_df,
                 song_df,
                 k=100,
                 knn_frac=0.5,
                 max_overlap=0.2,
                 cf_weighting_alpha=1,
                 min_songs=5,
                 mode='popular'):

        self.user_df = user_df
        self.song_df = song_df
        self.cf_weighting_alpha = cf_weighting_alpha
        self.knn_frac = knn_frac
        self.k = k
        self.max_overlap = max_overlap
        self.min_songs = min_songs
        self.mode = mode

        user_df_subset = user_df.loc[user_df['num_songs'] > (min_songs - 1)]
        self.kdtree = KDTree(user_df_subset['MUSIC'].tolist())

        # build the collaborative filtering model with hardcoded params
        als_params = {
            'factors': 16,
            'dtype': np.float32,
            'iterations': 2,
            'calculate_training_loss': True
        }
        self.cf_model = AlternatingLeastSquares(**als_params)

    def fit(self, train_csr):
        # don't want to modify the original in case it gets put into other models
        weighted_train_csr = weight_cf_matrix(train_csr,
                                              self.cf_weighting_alpha)
        self.cf_model.fit(weighted_train_csr)

    def calculate_overlap(self, list_1, list_2):
        overlap = len(set(list_1) & set(list_2))
        total = len(set(list_1)) + len(set(list_2))

        return float(overlap) / total

    def get_overlap_list(self, user_sparse_index,
                         closest_user_song_sparse_indices):

        overlap_list = []
        songs = self.user_df.loc[user_sparse_index]['song_sparse_indices']
        for i in range(len(closest_user_song_sparse_indices)):
            overlap_list.append(
                self.calculate_overlap(songs,
                                       closest_user_song_sparse_indices[i]))

        return overlap_list

    # Returns list of song_sparse_indices
    def get_knn_top_m_song_sparse_indices(self, user_sparse_index, m,
                                          songs_from_cf):

        user_MUSIC = self.user_df.loc[user_sparse_index]['MUSIC']
        distances, indices = self.kdtree.query(user_MUSIC, self.k, p=1)

        closest_user_song_sparse_indices = self.user_df.loc[indices][
            'song_sparse_indices'].values

        # calculate overlap for all songlists and delete those without enough overlap
        insufficient_overlap_indices = []

        overlap_list = self.get_overlap_list(user_sparse_index,
                                             closest_user_song_sparse_indices)
        for i in range(len(closest_user_song_sparse_indices)):
            if overlap_list[i] > self.max_overlap:
                insufficient_overlap_indices.append(i)

        # Users with only one or two songs in their listening history will almost
        # always exceed the overlap threshold. This check guards against discarding
        # too many users; 5 is an arbitrary margin.
        if len(insufficient_overlap_indices) + 5 < len(
                closest_user_song_sparse_indices):
            closest_user_song_sparse_indices = np.delete(
                closest_user_song_sparse_indices, insufficient_overlap_indices)
        else:
            # Backup in case the closest neighbours are all too similar to the user:
            # choose random MUSIC users, since similarity of MUSIC scores has
            # become meaningless.
            random_sparse_user_indices = random.sample(
                list(self.user_df.index), m)
            closest_user_song_sparse_indices = self.user_df.loc[
                random_sparse_user_indices]['song_sparse_indices'].values

            print(
                "Choosing random users since not enough users have small enough overlap"
            )

        user_songs = self.user_df.loc[user_sparse_index]['song_sparse_indices']

        # closest_user_song_sparse_indices_flat -> list of song_ids
        closest_user_song_sparse_indices_flat = itertools.chain.from_iterable(
            closest_user_song_sparse_indices)

        filtered_songs = []
        for song in closest_user_song_sparse_indices_flat:
            if song not in (user_songs + songs_from_cf):
                filtered_songs.append(song)

        # song_count_tuples -> format [(song_sparse_index, count)]
        song_count_tuples = Counter(filtered_songs).most_common()
        if len(song_count_tuples) < m:
            print('len(song_count_tuples) < m')

        top_songs = [song_tuple[0] for song_tuple in song_count_tuples]
        if self.mode == 'popular':
            m_songs = top_songs[:m]

        elif self.mode in ['weighted_random', 'random']:
            top_song_probs = None
            if self.mode == 'weighted_random':
                top_song_counts = [
                    song_tuple[1] for song_tuple in song_count_tuples
                ]
                top_song_probs = top_song_counts / np.sum(top_song_counts)

            m_song_count_tuples_indices = np.random.choice(
                len(song_count_tuples),
                p=top_song_probs,
                size=m,
                replace=False)
            m_song_count_tuples = [
                song_count_tuples[idx] for idx in m_song_count_tuples_indices
            ]
            # Although randomly sampled, the songs should still be sorted by popularity to maximize MAP@K
            m_song_count_tuples.sort(key=lambda song_tuple: song_tuple[1],
                                     reverse=True)

            m_songs = [song_tuple[0] for song_tuple in m_song_count_tuples]

        return m_songs

    # Returns [song_sparse_index]
    def recommend(self, user_sparse_index, train_plays_transpose, N):
        # m -> number of songs from KNN recs
        m = int(np.round(self.knn_frac * N))
        # n -> number of songs from CF recs
        n = N - m

        n_songs = []
        if n > 0:
            n_song_tuples = self.cf_model.recommend(
                userid=user_sparse_index,
                user_items=train_plays_transpose,
                N=n)
            n_songs = [song_tuple[0] for song_tuple in n_song_tuples]

        m_songs = []
        if m > 0:
            m_songs = self.get_knn_top_m_song_sparse_indices(
                user_sparse_index=user_sparse_index,
                m=m,
                songs_from_cf=n_songs)

        return n_songs + m_songs
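
How recommend() above splits the budget between the two sources, as a minimal arithmetic sketch (values are illustrative):

import numpy as np

N, knn_frac = 10, 0.5
m = int(np.round(knn_frac * N))  # 5 songs come from the MUSIC KNN neighbours
n = N - m                        # 5 songs come from the implicit ALS model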
Example #10
    def mf_(self,
            train_songs_A,
            train_tags_A,
            test_songs_A,
            test_tags_A,
            song_ntop=500,
            tag_ntop=50,
            iteration=20):

        print(f'MF... iters:{iteration}')
        # best hyperparameters as of 07/11: ratings * 100, song factors 256, tag factors 32, reg 0.1, 20 epochs -> song 56.4%, tag 61.3%

        val_song_res = []
        val_tag_res = []
        test_song_res = []
        test_tag_res = []

        songs_A = spr.vstack([test_songs_A, train_songs_A])
        tags_A = spr.vstack([test_tags_A, train_tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)  # epoch
        als_model.fit(songs_A.T * 100)

        als_model_tag = ALS(factors=32,
                            regularization=0.08,
                            use_gpu=True,
                            iterations=iteration)
        als_model_tag.fit(tags_A.T * 100)

        for id in tqdm(range(self.n_test_song)):  # 18636 rows for songs; 11605 for tags

            # song
            cand_song = als_model.recommend(id,
                                            test_songs_A,
                                            N=song_ntop,
                                            filter_already_liked_items=True)

            rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
            rec_song_score = [x[1] for x in cand_song]

            if (id < self.n_val_song):  # order: train, val, test
                val_song_res.append({
                    "id": self.plylst_nid_id[self.plylst_test_song.index[id]],
                    "songs": rec_song_idx,
                    "songs_score": rec_song_score
                })
            else:
                test_song_res.append({
                    "id": self.plylst_nid_id[self.plylst_test_song.index[id]],
                    "songs": rec_song_idx,
                    "songs_score": rec_song_score
                })

            # tag
            try:
                cand_tag = als_model_tag.recommend(
                    id,
                    test_tags_A,
                    N=tag_ntop,
                    filter_already_liked_items=True)

                rec_tag_idx = [self.tag_tid_id.get(x[0]) for x in cand_tag]
                rec_tag_score = [x[1] for x in cand_tag]

                if (id < self.n_val_song):
                    val_tag_res.append({
                        "id": self.plylst_nid_id[self.plylst_test_tag.index[id]],
                        "tags": rec_tag_idx,
                        "tags_score": rec_tag_score
                    })
                else:
                    test_tag_res.append({
                        "id": self.plylst_nid_id[self.plylst_test_tag.index[id]],
                        "tags": rec_tag_idx,
                        "tags_score": rec_tag_score
                    })

            except IndexError:
                pass

        print("DONE")

        return val_song_res, val_tag_res, test_song_res, test_tag_res
Example #11
def calculate_recommendations(train_filename,
                              test_filename,
                              output_filename,
                              dir,
                              model_name="als",
                              factors=80,
                              regularization=0.8,
                              iterations=10,
                              exact=False,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    logging.debug("Calculating similar items. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", dir + train_filename)
    start = time.time()
    df, cnts = read_data(dir + train_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based on the input params
    if model_name == "als":
        if exact:
            model = AlternatingLeastSquares(factors=factors,
                                            regularization=regularization,
                                            use_native=use_native,
                                            use_cg=cg,
                                            iterations=iterations,
                                            dtype=dtype)
        else:
            model = AnnoyAlternatingLeastSquares(factors=factors,
                                                 regularization=regularization,
                                                 use_native=use_native,
                                                 use_cg=cg,
                                                 iterations=iterations,
                                                 dtype=dtype)

        # let's weight the matrix by bm25_weight
        logging.debug("weighting matrix by bm25_weight")
        cnts = bm25_weight(cnts, K1=100, B=0.8)

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(cnts)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # read in the test data
    test_data = pandas.read_csv(test_filename,
                                sep="\t",
                                usecols=[0, 1, 2],
                                names=['user', 'item', 'cnt'])
    test_data = test_data.groupby(["user", "item"], as_index=False).sum()
    users_test = set(test_data['user'])
    users_train = set(df['user'])

    # position is important for recommendation list and actual list
    dict_actual = {}
    for user in users_test:
        if user not in users_train:
            continue
        matched_df = test_data.loc[test_data["user"] == user]
        matched_df = matched_df.sort_values("cnt", ascending=False)
        dict_actual[user] = list(matched_df["item"])

    user_items = cnts.T.tocsr()
    # print(user_items)
    # recommend items for a user
    dict_recommended = {}  # for computing MAP and MP

    for user in users_test:
        if user not in users_train:
            continue
        # print(user)
        recommendations = model.recommend(user, user_items)
        df = pandas.DataFrame(recommendations, columns=["item", "score"])
        # print(recommendations)
        # print(df["item"])
        dict_recommended[user] = list(df["item"])

    ndcg = NDCG(dict_actual, dict_recommended)

    err = ERR(dict_actual, dict_recommended)

    mean_ap = MAP(dict_actual, dict_recommended)

    mp = MP(dict_actual, dict_recommended)

    with open("%siALS_result_%s.txt" % (dir, train_filename), "w") as o:
        o.write("NDCG\tERR\tMAP\tMP\n")
        o.write("%s\t%s\t%s\t%s\n" % (ndcg, err, mean_ap, mp))

    return (ndcg, err, mean_ap, mp)
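
The metric helpers (NDCG, ERR, MAP, MP) are imported from elsewhere and not shown. For orientation, a hedged sketch of one common mean-average-precision definition over the same dict layout; this is a hypothetical stand-in, not necessarily the project's implementation:

def mean_average_precision(dict_actual, dict_recommended):
    total = 0.0
    for user, truth in dict_actual.items():
        hits, ap = 0, 0.0
        for rank, item in enumerate(dict_recommended.get(user, []), start=1):
            if item in truth:
                hits += 1
                ap += hits / rank
        total += ap / max(len(truth), 1)
    return total / max(len(dict_actual), 1)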
Example #12
class Recommender:
    def __init__(self, factors=50):
        self.model = AlternatingLeastSquares(factors=factors,
                                             regularization=0.01,
                                             dtype=np.float64,
                                             iterations=50)

    def train(self, data):
        userids = data.userid.astype("category")
        itemids = data.itemid.astype("category")

        matrix = coo_matrix((data.confidence.astype('float64'),
                             (itemids.cat.codes.copy(),
                              userids.cat.codes.copy())))
        self.model.fit(matrix)
        self.t_matrix = matrix.T.tocsr()
        self.userid_to_code = {category: code
                               for code, category in enumerate(userids.cat.categories)}
        self.itemid_to_code = {category: code
                               for code, category in enumerate(itemids.cat.categories)}
        self.usercode_to_id = dict(enumerate(userids.cat.categories))
        self.itemcode_to_id = dict(enumerate(itemids.cat.categories))

    def similar_items(self, itemid, N=10):
        item_code = self.itemid_to_code[itemid]
        similar_codes = self.model.similar_items(item_code, N)
        similar_ids = [(self.itemcode_to_id[code], s)
                       for code, s in similar_codes]
        return pd.DataFrame(similar_ids, columns=["itemid", "similarity"])

    def recommendations(self, userid, N=10):
        user_code = self.userid_to_code[userid]
        user_item_codes = self.model.recommend(user_code, self.t_matrix, N)
        user_item_ids = [(self.itemcode_to_id[code], c)
                         for code, c in user_item_codes]
        return pd.DataFrame(user_item_ids, columns=["itemid", "confidence"])

    def explain(self, userid, itemid):
        user_code = self.userid_to_code[userid]
        item_code = self.itemid_to_code[itemid]
        return self.model.explain(user_code, self.t_matrix, item_code)

    def confidence(self, userid, itemid):
        item_code = self.itemid_to_code[itemid]
        user_code = self.userid_to_code[userid]
        item_factor = self.model.item_factors[item_code]
        user_factor = self.model.user_factors[user_code]
        return item_factor.dot(user_factor)

    def user_factors(self):
        factors = pd.DataFrame(self.model.user_factors).add_prefix("f")
        ids = factors.index.map(lambda code: self.usercode_to_id[code])
        factors.insert(0, "userid", ids)
        return factors

    def item_factors(self):
        factors = pd.DataFrame(self.model.item_factors).add_prefix("f")
        ids = factors.index.map(lambda code: self.itemcode_to_id[code])
        factors.insert(0, "itemid", ids)
        return factors

    def items_recommendations(self, itemids, N=10):
        user_code = 0
        item_codes = [self.itemid_to_code[id] for id in itemids]

        data = [1 for _ in item_codes]
        rows = [0 for _ in item_codes]
        shape = (1, self.model.item_factors.shape[0])
        user_items = coo_matrix(
            (data, (rows, item_codes)), shape=shape).tocsr()

        user_item_codes = self.model.recommend(
            user_code, user_items, N, recalculate_user=True)
        user_item_ids = [(self.itemcode_to_id[code], c)
                         for code, c in user_item_codes]
        return pd.DataFrame(user_item_ids, columns=["itemid", "confidence"])
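
A hedged usage sketch for the Recommender class above; the expected input schema (userid, itemid, confidence columns) is read off train():

import pandas as pd

data = pd.DataFrame({
    "userid": ["u1", "u1", "u2", "u2", "u3"],
    "itemid": ["a", "b", "b", "c", "a"],
    "confidence": [1.0, 2.0, 1.0, 4.0, 3.0],
})
rec = Recommender(factors=2)
rec.train(data)
print(rec.recommendations("u1", N=1))  # e.g. item "c", the one u1 hasn't seen
print(rec.similar_items("a", N=2))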
Example #13
class Wrmf:
    def __init__(self, params={"c": None}, nunique_feature=None):
        self.params = params.copy()
        self.c = params["c"]
        del params["c"]
        self.model = ALS(**params)
        self.song_model = ALS(**params)
        self.tag_model = ALS(**params)
        self.song_rec_csr = None
        self.tag_rec_csr = None
        self.nunique_feature = nunique_feature

    def fit(self, X):
        self.model.fit(self.c * X.T)
        self.song_model.user_factors = self.model.user_factors
        self.song_model.item_factors = (
            self.model.item_factors[:self.nunique_feature["songs"]])

        self.tag_model.user_factors = self.model.user_factors
        self.tag_model.item_factors = (
            self.model.item_factors[-self.nunique_feature["tags"]:])

        self.song_rec_csr = X[:, :self.nunique_feature["songs"]]
        self.tag_rec_csr = X[:, -self.nunique_feature["tags"]:]
        # better to keep this in a separate save_model step

        return self

    def predict(self, idx, num_songs, num_tags):
        song_rec_df = pd.DataFrame()
        tag_rec_df = pd.DataFrame()
        for u in idx:
            song_rec = self.song_model.recommend(u,
                                                 self.song_rec_csr,
                                                 N=num_songs)
            song_ids = [id_ for id_, _ in song_rec]
            song_scores = [score for _, score in song_rec]
            song_plylst_ids = np.repeat(u, num_songs)
            song_recommended = pd.DataFrame({
                "plylst_id": song_plylst_ids,
                "song_id": song_ids,
                "score": song_scores
            })
            song_rec_df = pd.concat([song_rec_df, song_recommended])

            tag_rec = self.tag_model.recommend(u, self.tag_rec_csr, N=num_tags)
            tag_ids = [id_ for id_, _ in tag_rec]
            tag_scores = [score for _, score in tag_rec]
            tag_plylst_ids = np.repeat(u, num_tags)
            tag_recommended = pd.DataFrame({
                "plylst_id": tag_plylst_ids,
                "tag": tag_ids,
                "score": tag_scores
            })
            tag_rec_df = pd.concat([tag_rec_df, tag_recommended])

        return song_rec_df, tag_rec_df

    def save_model(self, save_file):
        with open(stage1_config.SAVE_FOLDER + save_file + ".pkl", "wb") as f:
            pkl.dump(self, f)
        with open(stage1_config.SAVE_FOLDER + save_file + "_config.txt",
                  "a") as f:
            f.write(str(self.params))

    def load_model(self, save_file):
        with open(stage1_config.SAVE_FOLDER + save_file + ".pkl", "rb") as f:
            loaded = pkl.load(f)
        return loaded
Example #14
class HHimmlerEnsemble:
    def __init__(self, urm_train, urm_test, icm, parameters=None):

        if parameters is None:
            parameters = {
                "USER_CF": 0.8,
                "USER_BPR": 0.7,
                "ITEM_CF": 1,
                "ITEM_BPR": 0.8,
                "CBF": 0.3,
                "IALS": 1.0,
                "CBF_BPR": 1
            }

        self.ensemble_weights = parameters
        self.train = urm_train.tocsr()
        self.test = urm_test.tocsr()
        self.icm = icm.tocsr()

        self.initialize_components()

    def initialize_components(self):
        self.bpr_mf = BPR_matrix_factorization(factors=200,
                                               regularization=0.00000,
                                               learning_rate=0.01,
                                               iterations=65)
        self.ials_cg_mf = IALS_CG(iterations=15,
                                  calculate_training_loss=True,
                                  factors=500,
                                  use_cg=True,
                                  regularization=1e-3)

    def fit(self):
        self.bpr_mf.fit(self.train.T.tocoo())
        self.ials_cg_mf.fit(40 * self.train.T)
        self.bpr_mf_latent_x = self.bpr_mf.user_factors.copy()
        self.bpr_mf_latent_y = self.bpr_mf.item_factors.copy()
        self.ials_cg_mf_latent_x = self.ials_cg_mf.user_factors.copy()
        self.ials_cg_mf_latent_y = self.ials_cg_mf.item_factors.copy()

    def recommend(self, user_id, combiner, at=10):
        bpr_mf_r = np.dot(self.bpr_mf_latent_x[user_id],
                          self.bpr_mf_latent_y.T).ravel()
        ials_cg_mf_r = np.dot(self.ials_cg_mf_latent_x[user_id],
                              self.ials_cg_mf_latent_y.T).ravel()

        scores = [
            # [bpr_mf_r, self.ensemble_weights["BPR_MF"], "BPR_MF"],
            [ials_cg_mf_r, 1, "IALS_CG"]
        ]

        for r in scores:
            self.filter_seen(user_id, r[0])

        return combiner.combine(scores, at)

    def filter_seen(self, user_id, scores):

        start_pos = int(self.train.indptr[user_id])
        end_pos = int(self.train.indptr[user_id + 1])

        user_profile = self.train.indices[start_pos:end_pos]

        scores[user_profile] = -1000000  # effectively -np.inf
        return scores

    def recommend_batch(self, user_list, combiner, at=10):
        res = np.array([])
        for i in user_list:
            # BPR recommendations are computed but currently unused
            bpr = self.bpr_mf.recommend(user_items=self.train,
                                        userid=i,
                                        N=at,
                                        recalculate_user=False)
            ials = self.ials_cg_mf.recommend(userid=i,
                                             user_items=self.train,
                                             N=10)
            rec_ids = np.array([x[0] for x in ials])
            row = np.concatenate(([i], rec_ids))
            if res.size == 0:
                res = row
            else:
                res = np.vstack([res, row])
        return res

    def get_component_data(self):
        raise NotImplementedError
Example #15
print(plays[:5])
print(len(users))
print(len(artists))

rows = data.user_id.astype(int)
cols = data.artist_id.astype(int)

data_sparse = sparse.csr_matrix((plays, (cols, rows)), shape=(len(artists), len(users)))

model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

userid = 0

user_items = data_sparse.T.tocsr()
recommendations = model.recommend(userid, user_items)

print(recommendations)

for r in recommendations:
    print(artist_id_name[str(r[0])])

itemid = 107209
related = model.similar_items(itemid)

print(related)

for a in related:
    print(artist_id_name[str(a[0])])

Example #16
class MainRecommender:

    own_recommender_default_param = {'filter_already_liked_items': False,
                                     'filter_items': False,
                                     'recalculate_user': True}

    model_als_default_param = {'factors': 50, 'regularization': 15, 'iterations': 15,
                               'num_threads': -1, 'calculate_training_loss': False}
    def __init__(self, data, data_test=None, split_info=None):
        """data - dataframe with the transactions
        data_test - validation data; if absent and split_info is given, it is built from data
        split_info - tuple describing how to build data_test (size, split column);
        only used when data_test is missing
        """
        self.top = 5000
        self.data_validation={}
        self.data_validation['status'] = False
        self.user_item_matrix = {'status':False,'matrix':None,'params':None}
        self.own_recommender_is_fit= {'status':False,'params':None}
        self.als_recommender_is_fit= {'status':False,'params':None}
        self.data = data.copy()
        self.full_data_train = data.copy()  # keep the full training data in case we need to predict on the full dataset
        self.data_train = data.copy()
        if data_test is not None:
            self.data_test = data_test.copy()
        else:
            self.data_test = None
            if split_info:
                self.data_train,self.data_test = self.train_test_split(test_size_num = split_info[0],split_column =split_info[1])
        if self.data_test is not None:
            self.data_validation['data'] = self.get_validation_data()
            self.data_validation['status'] = True

    def prefiltr_1(self, my_data):
        """Keep only the `top` most popular items; relabel all others as 999999"""
        df = my_data.copy()
        popularity = my_data.groupby('item_id')['quantity'].count().reset_index()
        popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        top_5000 = popularity.sort_values('n_sold', ascending=False).head(self.top).item_id.tolist()
        df.loc[~df['item_id'].isin(top_5000), 'item_id'] = 999999
        return df
    
    
    def prefiltr_2(self, data_train, n=5000):
        """Keep only the n most popular items; drop transactions with all other items"""
        df = data_train.copy()
        popularity = df.groupby('item_id')['quantity'].count().reset_index()
        popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        top_n = popularity.sort_values('n_sold', ascending=False).head(n).item_id.tolist()
        df = df.loc[df['item_id'].isin(top_n)]  
        return df
    
    
    def prefiltr_3(self, data_train, n=5000):
        """Drop transactions with the n least popular items"""
        df = data_train.copy()
        not_popularity = df.groupby('item_id')['quantity'].count().reset_index()
        not_popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        not_top_n = not_popularity.sort_values('n_sold').head(n).item_id.tolist()
        df = df.loc[~df['item_id'].isin(not_top_n)]  
        return df   
    
    
    def prefiltr_4(self, data_train, weeks=50):
        """Drop transactions with items that have not been bought for more than `weeks` weeks"""
        df = data_train.copy()
        old_item = df.groupby('item_id')['week_no'].max().reset_index()
        old_item = old_item.loc[old_item['week_no']>weeks,'item_id'].tolist()
        df = df.loc[df['item_id'].isin(old_item)]  
        return df
    


  
    def train_test_split(self,test_size_num,split_column):
        data_train = self.data[self.data[split_column] < self.data[split_column].max() - test_size_num]
        data_test = self.data[self.data[split_column] >= self.data[split_column].max() - test_size_num]
        return data_train, data_test
    
    
   
    def get_validation_data(self):
        result = self.data_test.groupby('user_id')['item_id'].unique().reset_index()
        users_train = self.data_train.user_id.unique()
        result = result[result.user_id.isin(users_train)]
        result['train'] = result['user_id'].map(self.data_train.groupby('user_id')['item_id'].unique())
        result['full_train'] = result['user_id'].map(self.full_data_train.groupby('user_id')['item_id'].unique())
        result.rename(columns={'item_id':'test'},inplace=True)
        result.reset_index(inplace=True,drop=True)
        return result

 
    def prepare_matrix(self, agg_column, full=None, filtr=None):
        my_data = self.data_train.copy()
        if full:
            my_data = self.full_data_train.copy()
        if filtr:
            for i in filtr:
                my_data = getattr(self, 'prefiltr_' + str(i))(my_data)
            
        user_item_matrix = pd.pivot_table(my_data, 
                              index='user_id', columns='item_id', 
                              values=agg_column[0], 
                              aggfunc=agg_column[1], 
                              fill_value=0
                             )
        
        user_item_matrix = user_item_matrix.astype(float) 
        self.prepare_dicts(user_item_matrix)
        self.current_working_data = my_data.copy()

        return user_item_matrix
            


    def prepare_dicts(self,user_item_matrix):
        """Подготавливает вспомогательные словари"""
        
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        self.id_to_itemid = dict(zip(matrix_itemids, itemids))
        self.id_to_userid = dict(zip(matrix_userids, userids))

        self.itemid_to_id = dict(zip(itemids, matrix_itemids))
        self.userid_to_id = dict(zip(userids, matrix_userids))
        
        return  self.id_to_itemid,  self.id_to_userid,  self.itemid_to_id,  self.userid_to_id
    
    
     
    def make_data(self,agg_column,filtr=None,full =False,top = 5000):
        self.top = top
        self.full = full
        uim = self.prepare_matrix(agg_column=agg_column,full=full,filtr=filtr)
        uim_w = uim.copy()
        self.user_item_matrix['uim_matrix_w'] = csr_matrix(uim_w).tocsr()
        uim[uim>0]=1
        self.user_item_matrix['uim_matrix'] = csr_matrix(uim).tocsr()
        
        self.user_item_matrix['ium_matrix_w_tfidf'] = tfidf_weight(csr_matrix(uim_w.T).tocsr())
        self.user_item_matrix['ium_matrix_tfidf'] = tfidf_weight(csr_matrix(uim.T).tocsr())
        self.user_item_matrix['ium_matrix_w_bm25'] = bm25_weight(csr_matrix(uim_w.T).tocsr())
        self.user_item_matrix['ium_matrix_bm25'] = bm25_weight(csr_matrix(uim.T).tocsr())

        self.user_item_matrix['status'] = True
        self.user_item_matrix['params'] = {'agg_column':agg_column,'filtr':filtr,'full':full}
        return self.user_item_matrix
            
        
    def precision_at_k(x, k=5):  # used as a plain function via df.apply, hence no self
        if len(x['predict']) == 0:
            return 0
        bought_list = np.array(x['test'])
        recommended_list = np.array(x['predict'])[:k]
        flags = np.isin(bought_list, recommended_list)
        precision = flags.sum() / len(recommended_list)
        return precision
        
        
    
    def fit_own_recommender(self, weighting=False):
        """Fits a model that recommends, for each user, items the user has already bought"""

        assert self.user_item_matrix['status'], 'run make_data(agg_column, filtr=None, full=False) first'
        ium = self.user_item_matrix['uim_matrix'].T
        if weighting:
            assert weighting in ('tf_idf', 'bm25'), 'weighting must be tf_idf, bm25 or None'
            if weighting == 'tf_idf':
                ium = self.user_item_matrix['ium_matrix_tfidf']
            else:
                ium = self.user_item_matrix['ium_matrix_bm25']
        self.own_recommender = ItemItemRecommender(K=1, num_threads=-1)
        self.own_recommender.fit(ium)
        self.own_recommender_is_fit['status'] = True
        self.own_recommender_is_fit['params'] = {'model': 'ItemItemRecommender(K=1, num_threads=-1)',
                                                 'weighting': weighting}
        self.own_recommender_is_fit['ium'] = ium
        
        return self.own_recommender
    
    
    def predict_own_recommender(self, users, N=5, params=own_recommender_default_param):

        param = params.copy()
        assert self.own_recommender_is_fit['status'], 'run fit_own_recommender() first'
        assert isinstance(users, list), 'users must be a list'
        uim = self.user_item_matrix['uim_matrix']
        param['user_items'] = uim
        param['N'] = N
        answer = pd.DataFrame()
        answer['user_id'] = users
        if param['filter_items']:
            param['filter_items'] = [self.itemid_to_id[i] for i in params['filter_items']]
        rec = []
        for user in users:
            param['userid'] = self.userid_to_id[user]
            rec.append([self.id_to_itemid[i[0]] for i in self.own_recommender.recommend(**param)])
        answer['result'] = rec
        return answer

    
    
    def validation_own_recommender(self, metric=precision_at_k, N=5, params=own_recommender_default_param):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.own_recommender_is_fit['status'], 'run fit_own_recommender() first'
        df = self.data_validation['data']

        users = df['user_id'].to_list()

        predict = self.predict_own_recommender(users=users, N=N, params=params)

        df['predict'] = predict['result']

        return df.apply(metric, axis=1).mean()
            
        
  
    def fit_als(self, params=model_als_default_param, weighting=False):
        """Fits the ALS model"""

        assert self.user_item_matrix['status'], 'run make_data(agg_column, filtr=None, full=False) first'
        ium = self.user_item_matrix['uim_matrix_w'].T
        if weighting:
            assert weighting in ('tf_idf', 'bm25'), 'weighting must be tf_idf, bm25 or None'
            if weighting == 'tf_idf':
                ium = self.user_item_matrix['ium_matrix_w_tfidf']
            else:
                ium = self.user_item_matrix['ium_matrix_w_bm25']

        self.model_als = AlternatingLeastSquares(**params)
        self.model_als.fit(ium)
        self.als_recommender_is_fit['status'] = True
        self.als_recommender_is_fit['params'] = {'model': params, 'weighting': weighting}
        self.als_recommender_is_fit['ium'] = ium

        return self.model_als
    
    
    def predict_als(self, users, N=5, params=own_recommender_default_param):

        param = params.copy()
        assert self.als_recommender_is_fit['status'], 'run fit_als() first'
        assert isinstance(users, list), 'users must be a list'
        uim = self.user_item_matrix['uim_matrix_w']
        param['user_items'] = uim
        param['N'] = N
        answer = pd.DataFrame()
        answer['user_id'] = users
        if param['filter_items']:
            param['filter_items'] = [self.itemid_to_id[i] for i in params['filter_items']]
        rec = []
        for user in users:
            param['userid'] = self.userid_to_id[user]
            rec.append([self.id_to_itemid[i[0]] for i in self.model_als.recommend(**param)])
        answer['result'] = rec
        return answer
    
    
    def validation_als_recommender(self, metric=precision_at_k, N=5, params=own_recommender_default_param):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'run fit_als() first'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.predict_als(users=users, N=N, params=params)
        df['predict'] = predict['result']

        return df.apply(metric, axis=1).mean()
    
    
    def get_recs(self,user,popularity,not_my=0):
        result = []
        for item in popularity[popularity['user_id']==user]['item_id'].to_list():
            recs_ = self.model_als.similar_items(self.itemid_to_id[item], N=3)
            recs = [self.id_to_itemid[i[0]] for i in recs_]
            if 999999 in recs:
                recs.remove(999999)
            result.append(recs[not_my])
        return  result      


    def get_similar_items_recommendation(self, users, not_my=0, N=5):
        """Recommend items similar to the top-N items bought by the user.
        not_my = 1 to recommend the user's own items back (like own_recommender), 0 for the opposite"""
        assert self.als_recommender_is_fit['status'], 'the ALS model is not fitted, run fit_als()'
        assert isinstance(users, list), 'users must be a list'
        assert not_my in [0, 1], 'not_my must be 0 or 1'
        my_data = self.current_working_data.copy()
        my_data = my_data[my_data['user_id'].isin(users)]    
        popularity = my_data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        popularity.sort_values('quantity', ascending=False, inplace=True)
        popularity = popularity[popularity['item_id'] != 999999]
        popularity =popularity.groupby('user_id').head(N)
        popularity.sort_values(['user_id','quantity'], ascending=False, inplace=True)
        result = pd.DataFrame()
        result['user_id'] = users
        result['similar_recommendation'] = result['user_id'].apply(\
                                            lambda x: self.get_recs(user = x,popularity = popularity,not_my=not_my))

        return result
    
    
    def validation_similar_items_recommendation(self, metric=precision_at_k, N=5, not_my=0):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'run fit_als() first'
        assert not_my in [0, 1], 'not_my must be 0 or 1'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.get_similar_items_recommendation(users = users,N=N,not_my=not_my)
        df['predict'] = predict['similar_recommendation']

        return df.apply(metric,axis=1).mean() 
    
    
    
    def get_user(self,user):
        users = self.model_als.similar_users(self.userid_to_id[user], N=2)
        
        return  self.id_to_userid[users[1][0]]
    
    
    def get_similar_users_recommendation(self, users, N=5, params=own_recommender_default_param):
        """Recommend the top-N items from those bought by similar users"""
        assert self.als_recommender_is_fit['status'], 'the ALS model is not fitted, run fit_als()'
        assert isinstance(users, list), 'users must be a list'
        result = pd.DataFrame()
        result['user_id'] = users
        result['similar_user_id'] = result['user_id'].apply(self.get_user)
        result['similar_recommendation'] = self.predict_als(result['similar_user_id'].to_list(), N=N, params=params)['result']

        return result    
            
    def validation_similar_users_recommendation(self, metric=precision_at_k, N=5):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'run fit_als() first'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.get_similar_users_recommendation(users = users,N=N)
        df['predict'] = predict['similar_recommendation']

        return df.apply(metric,axis=1).mean()     
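
A hedged end-to-end sketch for MainRecommender; the column names (user_id, item_id, quantity, week_no) are taken from the methods above, and the tiny frame is purely illustrative:

import pandas as pd

data = pd.DataFrame({
    'user_id':  [1, 1, 2, 2, 3, 3, 1, 2],
    'item_id':  [10, 20, 20, 30, 10, 30, 30, 10],
    'quantity': [1, 2, 1, 1, 3, 1, 1, 2],
    'week_no':  [1, 1, 1, 1, 1, 1, 2, 3],
})
rec = MainRecommender(data, split_info=(1, 'week_no'))  # hold out the last week
rec.make_data(agg_column=('quantity', 'count'))
rec.fit_als(params={'factors': 4, 'regularization': 0.1, 'iterations': 5})
print(rec.predict_als(users=[1], N=2))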
Example #17
class Implicit(object):
    def __init__(self):

        self.model = None
        self.mapped_trainset = None
        self.mapping_dict = None
        self.inv_mapping_dict = None
        self.max_index_of_item = None
        self.max_index_of_user = None

        self.item_users = None
        self.user_items = None

        self.k = 10

        self.param = None
        self.default_param = {
            'factors': 100,
            'regularization': 0.01,
            'iterations': 15,
            'use_native': True,
            'use_cg': True,
            'use_gpu': False,
            'calculate_training_loss': False,
            'num_threads': 0
        }

        self.mean_user_factors = None
        self.mean_item_factors = None

        self.baseline_recommend_items = None
        self.baseline_recommend_scores = None

    def fit_trainset(self, raw_train_dataset):
        trainset = copy.deepcopy(raw_train_dataset)
        #trainset = trainset.drop_duplicates(subset=['user','item'])

        self.mapping_dict, self.inv_mapping_dict = fit_coder(
            trainset, 'user', 'item', 'rating')
        self.mapped_trainset = code(copy.deepcopy(trainset), 'user', 'item',
                                    'rating', self.mapping_dict)

        self.max_index_of_item = len(self.mapped_trainset.item.unique())
        self.max_index_of_user = len(self.mapped_trainset.user.unique())

        row = self.mapped_trainset.item.values
        col = self.mapped_trainset.user.values
        data = self.mapped_trainset.rating.values

        self.item_users = csr_matrix(
            (data, (row, col)),
            shape=(self.max_index_of_item, self.max_index_of_user))
        # apply BM25 confidence weighting on the user-item view, then keep
        # both orientations in sync
        self.user_items = bm25_weight(self.item_users.T.tocsr(), B=0.7).tocsr() * 5
        self.item_users = self.user_items.T.tocsr()

        # #Experiment --------------
        # add_one = self.item_users.toarray() + 1
        # self.item_users = csr_matrix(add_one)
        # self.user_items = self.item_users.T.tocsr()
        # # -------------------------

    def add_fit_trainset(self, new_raw_train_dataset):
        if self.mapped_trainset is None:
            self.fit_trainset(new_raw_train_dataset)
        else:
            new_trainset = copy.deepcopy(new_raw_train_dataset)
            new_train = code(copy.deepcopy(new_trainset), 'user', 'item',
                             'rating', self.mapping_dict)

            ind_item = new_train[new_train.item.isnull()].index
            ind_user = new_train[new_train.user.isnull()].index

            unknown_items = new_trainset.loc[ind_item, 'item'].unique()
            unknown_users = new_trainset.loc[ind_user, 'user'].unique()

            len_new_items = len(unknown_items)
            len_new_users = len(unknown_users)

            new_item_dic = {
                key: value
                for key, value in zip(
                    unknown_items,
                    range(self.max_index_of_item, self.max_index_of_item +
                          len_new_items))
            }
            new_user_dic = {
                key: value
                for key, value in zip(
                    unknown_users,
                    range(self.max_index_of_user, self.max_index_of_user +
                          len_new_users))
            }

            inv_new_item_dic = {
                value: key
                for key, value in zip(
                    unknown_items,
                    range(self.max_index_of_item, self.max_index_of_item +
                          len_new_items))
            }
            inv_new_user_dic = {
                value: key
                for key, value in zip(
                    unknown_users,
                    range(self.max_index_of_user, self.max_index_of_user +
                          len_new_users))
            }

            self.max_index_of_item += len_new_items
            self.max_index_of_user += len_new_users

            self.mapping_dict['item'].update(new_item_dic)
            self.mapping_dict['user'].update(new_user_dic)

            self.inv_mapping_dict['item'].update(inv_new_item_dic)
            self.inv_mapping_dict['user'].update(inv_new_user_dic)

            new_mapped_trainset = code(copy.deepcopy(new_raw_train_dataset),
                                       'user', 'item', 'rating',
                                       self.mapping_dict)

            #self.mapped_trainset = self.mapped_trainset.append(new_trainset, ignore_index=True)

            self.mapped_trainset = pd.concat(
                [self.mapped_trainset, new_mapped_trainset], ignore_index=True)
            self.mapped_trainset = self.mapped_trainset.drop_duplicates(
                subset=['user', 'item'])

            row = self.mapped_trainset.item.values
            col = self.mapped_trainset.user.values
            data = self.mapped_trainset.rating.values

            self.item_users = csr_matrix(
                (data, (row, col)),
                shape=(self.max_index_of_item, self.max_index_of_user))
            self.user_items = self.item_users.T.tocsr()

            if self.model:

                factors_for_new_unknown_users = [list(self.mean_user_factors)
                                                 ] * len_new_users
                if len(factors_for_new_unknown_users) > 0:
                    self.model.user_factors = np.concatenate([
                        self.model.user_factors, factors_for_new_unknown_users
                    ])

                factors_for_new_unknown_items = [list(self.mean_item_factors)
                                                 ] * len_new_items
                if len(factors_for_new_unknown_items) > 0:
                    self.model.item_factors = np.concatenate([
                        self.model.item_factors, factors_for_new_unknown_items
                    ])

                print('factors extended')

    def set_k(self, k):
        self.k = int(k)

    def fit_model(self, dic_param={}, fit_new_model=True):
        if self.item_users is None:
            print('First call fit_trainset()')
        else:

            if not fit_new_model:  # check whether a previous model is available
                if not self.model:
                    fit_new_model = True

            if fit_new_model:
                d = copy.deepcopy(self.default_param)
                d.update(dic_param)
                self.param = d

            else:
                d = copy.deepcopy(self.param)
                d.update(dic_param)
                if d['factors'] != self.param['factors']:
                    print('different number of factors! Previous: ' +
                          str(self.param['factors']) + '; Now: ' +
                          str(d['factors']) + '; fitting a new model')
                    fit_new_model = True

            if not fit_new_model:
                previous_user_factors = self.model.user_factors
                previous_item_factors = self.model.item_factors

            self.model = AlternatingLeastSquares(
                factors=d['factors'],
                regularization=d['regularization'],
                iterations=d['iterations'],
                use_native=d['use_native'],
                use_cg=d['use_cg'],
                use_gpu=d['use_gpu'],
                calculate_training_loss=d['calculate_training_loss'],
                num_threads=d['num_threads'])

            if not fit_new_model:
                # warm-start from the previous model's factors
                self.model.user_factors = previous_user_factors
                self.model.item_factors = previous_item_factors

            self.model.fit(self.item_users)

            self.mean_user_factors = self.model.user_factors.mean(axis=0)
            self.mean_item_factors = self.model.item_factors.mean(axis=0)

            scores = np.dot(self.model.item_factors, self.mean_user_factors)
            items = list(range(self.max_index_of_item))
            result = list(zip(scores, items))
            result.sort(reverse=True)
            recommend = np.array(result)

            self.baseline_recommend_items = [
                self.inv_mapping_dict['item'][int(item)]
                for _, item in recommend
            ]
            self.baseline_recommend_scores = [score for score, _ in recommend]

    def get_user_factors(self):
        real_user_factors = {}
        for i in range(self.max_index_of_user):
            real_user_factors[self.inv_mapping_dict['user'][i]] = list(
                self.model.user_factors[i])
        return real_user_factors

    def get_item_factors(self):
        real_item_factors = {}
        for i in range(self.max_index_of_item):
            real_item_factors[self.inv_mapping_dict['item'][i]] = list(
                self.model.item_factors[i])
        return real_item_factors

    def recommend_for_user(self,
                           user_true_name,
                           filter_already_liked_items=True,
                           return_scores=False,
                           recalculate_user=False):
        if self.mapping_dict is None:
            print('First call fit_trainset()')
            return None
        if self.model is None:
            print('First call fit_model()')
            return None

        if user_true_name in self.mapping_dict['user'].keys():
            user = self.mapping_dict['user'][user_true_name]
            rec = self.model.recommend(
                user,
                self.user_items,
                self.k,
                filter_already_liked_items=filter_already_liked_items,
                recalculate_user=recalculate_user)
            items = [self.inv_mapping_dict['item'][item] for item, _ in rec]
            scores = [score for _, score in rec]
            if return_scores:
                return items, scores
            else:
                return items
        else:
            items = self.baseline_recommend_items[:self.k]
            if return_scores:
                scores = self.baseline_recommend_scores[:self.k]
                return items, scores
            else:
                return items

    def recommend(self,
                  users_list,
                  filter_already_liked_items=True,
                  return_scores=False,
                  recalculate_user=False):
        if self.mapping_dict is None:
            print('First call fit_trainset()')
            return None
        if self.model is None:
            print('First call fit_model()')
            return None

        result_user_items = {}
        result_user_scores = {}

        for user_true_name in tqdm(users_list):
            if return_scores:
                items, scores = self.recommend_for_user(
                    user_true_name, filter_already_liked_items, return_scores,
                    recalculate_user)
                result_user_items[user_true_name] = items
                result_user_scores[user_true_name] = scores
            else:
                items = self.recommend_for_user(user_true_name,
                                                filter_already_liked_items,
                                                return_scores,
                                                recalculate_user)
                result_user_items[user_true_name] = items

        if return_scores:
            return result_user_items, result_user_scores
        else:
            return result_user_items

    def recommend_df(self,
                     users_list,
                     filter_already_liked_items=True,
                     return_scores=False,
                     column_names=['user', 'item', 'rating'],
                     recalculate_user=False):
        if self.mapping_dict is None:
            print('First call fit_trainset()')
            return None
        if self.model is None:
            print('First call fit_model()')
            return None

        result = []

        for user_true_name in tqdm(users_list):
            user_column = [user_true_name] * int(self.k)
            if return_scores:
                items, scores = self.recommend_for_user(
                    user_true_name, filter_already_liked_items, return_scores,
                    recalculate_user)
                res = list(zip(user_column, items, scores))
            else:
                items = self.recommend_for_user(user_true_name,
                                                filter_already_liked_items,
                                                return_scores,
                                                recalculate_user)
                res = list(zip(user_column, items))
            result.extend(res)

        if return_scores:
            return pd.DataFrame(result, columns=column_names[:3])
        else:
            return pd.DataFrame(result, columns=column_names[:2])

    #
    # def rank_for_user(self, user):
    #     if self.max_index_of_user is None:
    #         print('Firstly fit_testset')
    #         return None
    #
    #     list_items = self.testset[self.testset.user == user].item
    #     items_to_rank = list_items[list_items < self.max_index_of_item].values
    #     items_to_end = list_items[list_items >= self.max_index_of_item].values
    #
    #     res = []
    #
    #     if user >= self.max_index_of_user:
    #         list_to_sort = []
    #         for item in items_to_rank:
    #             list_to_sort.append((round(self.item_value_counts[item]*0.001,3),item))
    #
    #         for item in items_to_end:
    #             list_to_sort.append((0,item))
    #
    #         list_to_sort.sort(reverse=True)
    #         res = [(t[1], t[0]) for t in list_to_sort]
    #     else:
    #         res = self.model.rank_items(user, self.user_items,selected_items=items_to_rank)
    #         for item in items_to_end:
    #             res.append((item, 0))
    #     return res
    #
    #
    #
    # def rank(self):
    #     if self.max_index_of_user is None:
    #         print('Firstly fit_testset')
    #         return None
    #
    #     result = pd.DataFrame(columns=['item','rating','user'])
    #
    #     users = list(self.testset.user.unique())
    #     for i in tqdm(range(len(users))):
    #         user = users[i]
    #         res = self.rank_for_user(user)
    #         df = pd.DataFrame(res, columns=['item','rating'])
    #         df['user'] = [user]*len(df)
    #
    #         result = pd.concat([result, df])
    #
    #     result = result[['user','item','rating']]
    #     output = code(copy.deepcopy(result), 'user','item','rating',self.inv_mapping_dict)
    #     output.index = range(len(output))
    #
    #     return output

    def dump_model(self, filename='dumped_file'):
        """
        Saving the model for further using.
        :param filename: str - path and name of file to save.
        :return:
        """
        if (self.model is None) | (self.mapped_trainset is None):
            print('Unable to dump model')
            print('Please firstly fit train dataset and train model')
        else:
            dump_obj = {
                'model': self.model,
                'mapped_trainset': self.mapped_trainset,
                'mapping_dict': self.mapping_dict,
                'inv_mapping_dict': self.inv_mapping_dict,
                'max_index_of_item': self.max_index_of_item,
                'max_index_of_user': self.max_index_of_user,
                'item_users': self.item_users,
                'user_items': self.user_items,
                'k': self.k,
                'mean_user_factors': self.mean_user_factors,
                'mean_item_factors': self.mean_item_factors,
                'baseline_recommend_items': self.baseline_recommend_items,
                'baseline_recommend_scores': self.baseline_recommend_scores,
                'param': self.param
            }
            with open(filename, 'wb') as f:
                pickle.dump(dump_obj, f, protocol=pickle.HIGHEST_PROTOCOL)
            print('Model has successfully been dumped!')

    def load_model(self, filename='dumped_file'):
        """
        Load a ready-to-use, pre-trained model from file.
        :param filename: str - path to the file with the model
        """
        with open(filename, 'rb') as f:
            dump_obj = pickle.load(f)
        self.model = dump_obj['model']
        self.mapped_trainset = dump_obj['mapped_trainset']
        self.mapping_dict = dump_obj['mapping_dict']
        self.inv_mapping_dict = dump_obj['inv_mapping_dict']
        self.max_index_of_item = dump_obj['max_index_of_item']
        self.max_index_of_user = dump_obj['max_index_of_user']
        self.item_users = dump_obj['item_users']
        self.user_items = dump_obj['user_items']
        self.k = dump_obj['k']
        self.mean_user_factors = dump_obj['mean_user_factors']
        self.mean_item_factors = dump_obj['mean_item_factors']
        self.baseline_recommend_items = dump_obj['baseline_recommend_items']
        self.baseline_recommend_scores = dump_obj['baseline_recommend_scores']
        self.param = dump_obj['param']
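
# A minimal usage sketch for this wrapper (an addition, not part of the
# original example). It assumes the fit_coder/code helpers and imports used
# by this example, plus a ratings DataFrame with user/item/rating columns.
if __name__ == '__main__':
    ratings = pd.DataFrame({
        'user': ['u1', 'u1', 'u2', 'u2', 'u3'],
        'item': ['a', 'b', 'b', 'c', 'a'],
        'rating': [5, 3, 4, 2, 1],
    })

    rec = Implicit()
    rec.fit_trainset(ratings)                       # build the mapped csr matrices
    rec.fit_model({'factors': 8, 'iterations': 5})  # small parameters for a demo
    rec.set_k(2)
    print(rec.recommend_for_user('u1'))             # known user -> ALS recommendations
    print(rec.recommend_for_user('unseen'))         # unknown user -> popularity baseline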
Example #18
class ImplicitALS:
    def __init__(self, df, config, orig_df):
        df = self._calc_confidence_preference(df, config.alpha)
        self.config = config
        self.orig_df = orig_df

        def check_index_uniformity(index):
            return index.min() == 0 and \
                   index.max() == len(index) - 1

        def index_info(index):
            return 'index with min %d max %d count %d items' % (
                index.min(), index.max(), len(index))

        assert check_index_uniformity(
            df.user_id.drop_duplicates()), index_info(
                df.user_id.drop_duplicates())
        assert check_index_uniformity(
            df.item_id.drop_duplicates()), index_info(
                df.item_id.drop_duplicates())

        users = df.user_id.to_list()
        items = df.item_id.to_list()
        rate = df.rate.to_list()
        shape = (len(set(items)), len(set(users)))
        self.iu_mat = csr_matrix((rate, (items, users)), shape=shape)
        self.ui_mat = self.iu_mat.transpose()

        self.model = ALS(factors=config.factors,
                         calculate_training_loss=True,
                         iterations=config.iterations,
                         regularization=config.regularization)
        self.max_uix = max(users)

    def _calc_confidence_preference(self, df, alpha):
        # Convert raw rates to confidence/preference values.
        # split_rate is the threshold between the bad and good classes.
        split_rate = 6
        eps = 1e-4
        get_p = lambda v: 1 if v > split_rate else 0
        # note: dividing by eps pushes any positive rate above the threshold,
        # so the log-preference is effectively log(2) for every positive rate
        get_logp = lambda v: log(1 + get_p(v / eps))
        df['rate'] = 1 + alpha * df.rate.apply(get_logp)
        return df

    def _delete_bookmarks(self, recs, seen_items):
        # filter_already_liked_items doesn't work here, so filter by hand
        return [rec for rec in recs if rec[0] not in seen_items]

    def fit(self):
        self.model.fit(self.iu_mat)

    def recommend_user(self, user, k, return_scores=False):
        user_items = self.orig_df[self.orig_df.user_id ==
                                  user].item_id.tolist()

        # filter liked items until len(recs) >= the requested k
        base_k = k
        k = int(min(1.5 * k, k + 0.1 * len(user_items)))
        recs = self.model.recommend(user, self.ui_mat, N=k)
        recs = self._delete_bookmarks(recs, user_items)

        while len(recs) < base_k:
            k *= 2
            recs = self.model.recommend(user, self.ui_mat, N=k)
            recs = self._delete_bookmarks(recs, user_items)

        recs = recs[:base_k]
        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def similar_items(self, item, k, return_scores=False):
        # Returns items that are similar to item with given id
        recs = self.model.similar_items(item, k + 1)

        # avoid recommending same item
        recs = self._delete_bookmarks(recs, [item])
        recs = recs[:k]

        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def similar_items_for_user(self, item, user, k, return_scores=False):
        # Returns items that are similar to the item with the given id and
        # haven't been seen by the user with the given id
        user_items = self.orig_df[self.orig_df.user_id ==
                                  user].item_id.tolist()
        user_items += [item]  # avoid recommending the same item

        # filter liked items until len(recs) >= the requested k
        base_k = k
        k = int(min(1.5 * k, k + 0.1 * len(user_items)))
        recs = self.model.similar_items(item, k)
        recs = self._delete_bookmarks(recs, user_items)

        while len(recs) < base_k:
            k *= 2
            recs = self.model.similar_items(item, k)
            recs = self._delete_bookmarks(recs, user_items)

        recs = recs[:base_k]
        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def _add_empty_user(self):
        # Enlarges ui_mat and the ALS model's user_factors by one extra user
        # Update wrapper data
        self.max_uix += 1
        old_shape = self.ui_mat.shape
        self.ui_mat.resize((old_shape[0] + 1, old_shape[1]))

        # Update inner model data
        k = self.model.factors
        # set random weights for the new user
        self.model.user_factors = np.vstack(
            (self.model.user_factors, np.random.randn(k)))

    def update_user_data(self, user, user_views):
        # Updates model's data about user and recalculates it
        assert isinstance(user, int)
        assert isinstance(user_views, pd.DataFrame)
        assert len(user_views) > 0
        assert len(user_views.user_id.drop_duplicates()) == 1

        user_views = user_views[user_views.item_id != -1]
        user_views = user_views.drop_duplicates(
            subset='item_id user_id'.split(), keep='last')
        user_views = self._calc_confidence_preference(user_views,
                                                      self.config.alpha)
        iixs = user_views.item_id.tolist()
        rates = user_views.rate.tolist()

        # Create new user rates csr matrix
        rowscols = ([0 for _ in iixs], iixs)
        size = (1, self.ui_mat.shape[1])
        # Upd wrapper data
        assert user <= self.max_uix
        self.ui_mat[user] = csr_matrix((rates, rowscols), shape=size)

        # Update inner model data: recalculate this user's factors from the
        # new interaction row (for brand-new users the factor row was already
        # appended by _add_empty_user)
        new_user_factors = self.model.recalculate_user(user, self.ui_mat)
        self.model.user_factors[user] = new_user_factors

    def add_user(self, user, user_views=None):
        # Adds a user to the recommender model. Updates the model's matrices
        # and allows making predictions for the new user
        assert isinstance(user, int)

        self._add_empty_user()
        if user_views is None:
            return

        assert isinstance(user_views, pd.DataFrame)
        assert len(user_views) > 0
        assert len(user_views.user_id.drop_duplicates()) == 1

        self.update_user_data(user, user_views)
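
# A minimal usage sketch for this wrapper (an addition, not part of the
# original example). The SimpleNamespace config and the toy frame below are
# assumptions; user_id/item_id must already be contiguous 0-based indices.
from types import SimpleNamespace

if __name__ == '__main__':
    config = SimpleNamespace(alpha=40, factors=32, iterations=10,
                             regularization=0.01)
    views = pd.DataFrame({
        'user_id': [0, 0, 1, 1, 2],
        'item_id': [0, 1, 1, 2, 0],
        'rate':    [9, 3, 8, 7, 2],
    })

    als = ImplicitALS(views.copy(), config, orig_df=views)
    als.fit()
    print(als.recommend_user(user=2, k=2))
    print(als.similar_items(item=0, k=2))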
Example #19
class Recommender:
    def __init__(self, **args):
        self.TRAINING_THREADS = int(
            args.get("training_threads", os.cpu_count()))
        self.ALS_FACTORS = args.get("als_factors", 128)
        self.ALS_REGULARIZATION = args.get("als_regularization", 1e-2)
        self.ALS_ITERATIONS = args.get("als_iterations", 15)
        self.MIN_POST_FAVS = args.get("min_post_favs", 5)
        self.MIN_USER_FAVS = args.get("min_user_favs", 50)
        self.MAX_FAVS = args.get("max_favs", 1e12)
        self.FAVS_PATH = args.get("favs_path", "data/favs.csv")
        self.MODEL_PATH = args.get("model_path", "data/recommender.pickle")
        self.DATABASE_URL = args.get("database_url",
                                     "postgresql://localhost/danbooru2")

    @staticmethod
    def create(**args):
        env = {name.lower(): value for name, value in os.environ.items()}
        args = {**env, **args}

        recommender = Recommender(**args)
        recommender.dump_favorites()
        recommender.load_favorites()
        recommender.train()
        recommender.save(recommender.MODEL_PATH)

        return recommender

    @staticmethod
    def load(model_path):
        with open(model_path, "rb") as file:
            return pickle.load(file)

    def dump_favorites(self):
        query = f"""
      SELECT
        post_id,
        user_id
      FROM favorites
      WHERE
        post_id IN (SELECT id FROM posts WHERE fav_count > {self.MIN_POST_FAVS})
        AND user_id IN (SELECT id FROM users WHERE favorite_count > {self.MIN_USER_FAVS})
      ORDER BY post_id DESC
      LIMIT {self.MAX_FAVS}
    """

        self.shell(
            f"psql --no-psqlrc -c '\\copy ({query}) TO STDOUT WITH (FORMAT CSV)' {self.DATABASE_URL} > {self.FAVS_PATH}"
        )

    def load_favorites(self):
        favs_df = pd.read_csv(self.FAVS_PATH,
                              dtype=np.int32,
                              names=["post_id", "user_id"])
        favs_df = favs_df.astype("category")

        self.favorites = csr_matrix(
            (np.ones(favs_df.shape[0]), (favs_df["post_id"].cat.codes.copy(),
                                         favs_df["user_id"].cat.codes.copy())),
            dtype=np.int32)
        self.users_to_id = {
            k: v
            for v, k in enumerate(favs_df["user_id"].cat.categories)
        }
        self.posts_to_id = {
            k: v
            for v, k in enumerate(favs_df["post_id"].cat.categories)
        }
        self.ids_to_post = {k: v for v, k in self.posts_to_id.items()}
        self.empty = csr_matrix(self.favorites.shape)

    def train(self):
        self.model = AlternatingLeastSquares(
            calculate_training_loss=True,
            dtype=np.float32,
            num_threads=self.TRAINING_THREADS,
            factors=self.ALS_FACTORS,
            regularization=self.ALS_REGULARIZATION,
            iterations=self.ALS_ITERATIONS)

        start = time.monotonic()
        self.model.fit(self.favorites)
        end = time.monotonic()
        dur = int(end - start)

        self.favorites = None
        self.trained_at = datetime.utcnow().isoformat()
        self.training_time = "{:02d}:{:02d}:{:02d}".format(
            dur // 3600, (dur % 3600 // 60), dur % 60)

    def recommend_for_user(self, user_id, limit=50):
        if user_id not in self.users_to_id:
            return []

        uid = self.users_to_id[user_id]
        recommendations = self.model.recommend(uid, self.empty, N=limit)
        recommendations = [(self.ids_to_post[id], float(score))
                           for id, score in recommendations]
        return recommendations

    def recommend_for_post(self, post_id, limit=50):
        if post_id not in self.posts_to_id:
            return []

        pid = self.posts_to_id[post_id]
        recommendations = self.model.similar_items(pid, N=limit)
        recommendations = [(self.ids_to_post[id], float(score))
                           for id, score in recommendations]
        return recommendations

    def metrics(self):
        return {
            "user_count": len(self.users_to_id),
            "post_count": len(self.posts_to_id),
            "factors": self.model.factors,
            "model_size": 4 * self.model.factors *
                          (len(self.users_to_id) + len(self.posts_to_id)),
            "trained_at": self.trained_at,
            "training_time": self.training_time,
        }

    def save(self, model_path):
        with open(model_path, "wb") as file:
            pickle.dump(self, file)

    def shell(self, cmd):
        subprocess.run(cmd,
                       stdout=sys.stdout,
                       stderr=sys.stderr,
                       shell=True,
                       check=True)
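
# A minimal usage sketch (an addition, not part of the original example).
# Recommender.create() needs a reachable Postgres instance, so the
# database_url and user id below are assumptions for illustration only.
if __name__ == "__main__":
    recommender = Recommender.create(database_url="postgresql://localhost/danbooru2")
    # ...or reuse a previously trained model:
    # recommender = Recommender.load("data/recommender.pickle")
    print(recommender.recommend_for_user(12345, limit=10))
    print(recommender.metrics())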
Example #20
    def mf_(self,
            train_songs_A,
            train_tags_A,
            test_songs_A,
            test_tags_A,
            song_ntop=500,
            tag_ntop=50,
            iteration=20):

        print(f'MF... iters:{iteration}')
        # Best hyperparameters as of 07/11: confidence * 100, song factors 256, tag factors 32, reg 0.1, 20 epochs -> song 56.4%, tag 61.3%

        res = []

        songs_A = spr.vstack([test_songs_A, train_songs_A])
        tags_A = spr.vstack([test_tags_A, train_tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)  # epoch
        als_model.fit(songs_A.T * 100)

        als_model_tag = ALS(factors=32,
                            regularization=0.08,
                            use_gpu=True,
                            iterations=iteration)
        als_model_tag.fit(tags_A.T * 100)

        #rec_song = als_model.recommend_all(train_songs_A,N=500)
        #rec_tag = als_model_tag.recommend_all(train_tags_A,N=50) # list (no score)

        for pid in tqdm(range(test_songs_A.shape[0])):

            if self.plylst_test.loc[(self.n_train + pid), "song_dirty"] == 1:
                cand_song = als_model.recommend(
                    pid,
                    test_songs_A,
                    N=song_ntop + 50,
                    filter_already_liked_items=False)

            else:
                cand_song = als_model.recommend(
                    pid,
                    test_songs_A,
                    N=song_ntop,
                    filter_already_liked_items=True)

            if self.plylst_test.loc[(self.n_train + pid), "tag_dirty"] == 1:
                cand_tag = als_model_tag.recommend(
                    pid,
                    test_tags_A,
                    N=tag_ntop + 5,
                    filter_already_liked_items=True)
                #tags_already = self.orig_test[self.orig_test['id']== self.plylst_nid_id[self.n_train + pid]]['tags']
                #cand_tag = remove_seen(tags_already,cand_tag)[:tag_ntop]

            else:
                cand_tag = als_model_tag.recommend(
                    pid,
                    test_tags_A,
                    N=tag_ntop,
                    filter_already_liked_items=True)

            rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
            rec_song_score = [x[1] for x in cand_song]
            rec_tag_idx = [self.tag_tid_id.get(x[0]) for x in cand_tag]
            rec_tag_score = [x[1] for x in cand_tag]

            res.append({
                "id": self.plylst_nid_id[self.n_train + pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx,
                "songs_score": rec_song_score,
                "tags_score": rec_tag_score
            })

        print("DONE")

        return res
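
# A short follow-up sketch (an addition, not part of the original example):
# the list of dicts returned by mf_ can be serialized for scoring. The
# output path and field selection are assumptions.
import json

def write_results(res, path='results.json'):
    # keep only the fields a typical submission expects
    submission = [{'id': r['id'], 'songs': r['songs'], 'tags': r['tags']}
                  for r in res]
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(submission, f, ensure_ascii=False)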
Example #21
class CollaborativeFiltering(object):
    """
    Collaborative filtering built on implicit's AlternatingLeastSquares.
    """
    def __init__(self,
                 factors=100,
                 regularization=0.01,
                 iterations=15,
                 calculate_training_loss=True,
                 num_threads=0,
                 random_state=42,
                 **kwargs):
        """

        """
        ## Initialize
        self._random_state = random_state
        self.model = AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            calculate_training_loss=calculate_training_loss,
            num_threads=num_threads,
            **kwargs)

    def __repr__(self):
        """
        Return a readable summary of the model configuration.
        """
        desc = f"CollaborativeFiltering(factors={self.model.factors}, regularization={self.model.regularization})"
        return desc

    def fit(self, X, rows=None, columns=None):
        """
        Args:
            X (csr sparse matrix): Rows are Items, Columns are Users
            rows (list): Identifiers for the rows
            columns (list): Identifiers for the columns
        """
        ## Fix Random Seed
        np.random.seed(self._random_state)
        ## Indices
        if rows is None:
            rows = list(range(X.shape[0]))
        if columns is None:
            columns = list(range(X.shape[1]))
        self._item_map = dict((row, r) for r, row in enumerate(rows))
        self._items = rows
        self._user_map = dict((col, c) for c, col in enumerate(columns))
        self._users = columns
        ## Fit
        self.model.fit(X, show_progress=self.model.calculate_training_loss)
        return self

    def get_similar_item(self, item, k_top=10):
        """
        Find similar items to a given item using cosine similarity

        Args:
            item (any): One of the rows in the training data
            k_top (int): Number of top similar items to return
        """
        if item not in self._item_map:
            raise KeyError("Item does not exist")
        ## Compute Cosine Similarity
        item_f = self.model.item_factors[self._item_map[item]]
        item_factors = self.model.item_factors
        sim = item_factors.dot(item_f) / (
            self.model.item_norms *
            self.model.item_norms[self._item_map[item]])
        ## Select Top-k
        best = np.argpartition(sim, -k_top)[-k_top:]
        sim = sorted(zip(best, sim[best]), key=lambda x: -x[1])
        ## Replace Indices with Names
        sim_items = list(map(lambda i: [self._items[i[0]], i[1]], sim))
        sim_items = pd.DataFrame(sim_items, columns=["item", "similarity"])
        return sim_items

    def recommend(self,
                  user_history,
                  filter_liked=False,
                  filter_items=[],
                  k_top=10):
        """
        Args:
            user_history (dict or list of raw items):
            k_top (int): Number of items to recommend
        """
        ## User History
        user_history = Counter(user_history)
        ## Compute User Factor
        user_vector = np.zeros(self.model.item_factors.shape[0])
        for item, count in user_history.items():
            if item not in self._item_map:
                continue
            user_vector[self._item_map[item]] = count
        ## Compute Score
        scores = self.model.recommend(userid=0,
                                      user_items=csr_matrix(user_vector),
                                      N=k_top,
                                      filter_already_liked_items=filter_liked,
                                      filter_items=list(
                                          map(lambda f: self._item_map[f],
                                              filter_items)),
                                      recalculate_user=True)
        ## Replace Indices with Names
        rec_items = list(map(lambda i: [self._items[i[0]], i[1]], scores))
        rec_items = pd.DataFrame(rec_items, columns=["item", "score"])
        return rec_items

    def dump(self, model_file, compress=3):
        """
        Persist the full wrapper (model and mappings) to disk with joblib.
        """
        _ = joblib.dump(self, model_file, compress=compress)
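
# A minimal usage sketch (an addition, not part of the original example).
# The toy matrix is an assumption: rows are items and columns are users,
# matching the fit() docstring, and the older implicit tuple-returning API
# this example targets is assumed.
if __name__ == "__main__":
    X = csr_matrix(np.array([
        [4, 0, 1],   # item "a"
        [0, 5, 2],   # item "b"
        [3, 1, 0],   # item "c"
    ], dtype=np.float32))
    cf = CollaborativeFiltering(factors=2, iterations=5).fit(
        X, rows=["a", "b", "c"], columns=["u1", "u2", "u3"])
    print(cf.get_similar_item("a", k_top=2))
    print(cf.recommend(["a", "a", "c"], k_top=2))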