Example #1
def train_and_predict(train_filepath, test_filepath):
    train_df = pd.read_json(train_filepath)
    test_df = pd.read_json(test_filepath)

    tr_songs = train_df.songs.tolist()
    te_songs = test_df.songs.tolist()
    tr_tags = train_df.tags.tolist()
    te_tags = test_df.tags.tolist()

    vocab = Vocabulary(pd.concat([train_df, test_df], ignore_index=True))

    train_data = encode_features(train_df, vocab)
    test_data = encode_features(test_df, vocab)

    # Shuffle train data
    train_data = shuffle(train_data)

    # list of lists -> CSR
    def lil_to_csr(indices, shape):
        data = []
        row_ind = []
        col_ind = []
        for row_idx, row in enumerate(indices):
            for col_idx in row:
                data.append(1)
                row_ind.append(row_idx)
                col_ind.append(col_idx)
        return csr_matrix((data, (row_ind, col_ind)), shape=shape)

    train_csr = lil_to_csr(train_data, (len(train_data), vocab.size))
    test_csr = lil_to_csr(test_data, (len(test_data), vocab.size))

    r = scipy.sparse.vstack([test_csr, train_csr])
    r = csr_matrix(r)

    factors = 512
    alpha = 500.0
    als_model = ALS(factors=factors, regularization=0.1)
    als_model.fit(r.T * alpha)

    song_model = ALS(factors=factors)
    tag_model = ALS(factors=factors)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:vocab.num_songs]
    tag_model.item_factors = als_model.item_factors[vocab.num_songs:]

    song_rec_csr = test_csr[:, :vocab.num_songs]
    tag_rec_csr = test_csr[:, vocab.num_songs:]

    song_rec = song_model.recommend_all(song_rec_csr, N=100)
    tag_rec = tag_model.recommend_all(tag_rec_csr, N=10)
    tag_rec += vocab.num_songs

    return [{
        "id": test_playlist_id,
        "songs": list(map(vocab.id_to_song, song_rec[test_row_idx])),
        "tags": list(map(vocab.id_to_tag, tag_rec[test_row_idx])),
    } for test_row_idx, test_playlist_id in enumerate(tqdm(test_df.id))]
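A minimal import preamble this snippet appears to assume (Vocabulary and encode_features are project-specific helpers not shown here, and shuffle is taken to be sklearn's utility):

import scipy.sparse
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.utils import shuffle
from implicit.als import AlternatingLeastSquares as ALS
from tqdm import tqdm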
Example #2
def load_recommender(als_model_file: str,
                     index_file: str,
                     item_feature_file: str = None,
                     **kwargs) -> ImplicitRecommender:
    log.info("Loading als model")
    data = np.load(als_model_file, allow_pickle=True)
    model = AlternatingLeastSquares(
        factors=data['model.item_factors'].shape[1])
    model.item_factors = data['model.item_factors']
    model.YtY  # This will initialize the _YtY instance variable which is used directly in internal methods
    if 'user_factors' in data:
        model.user_factors = data['model.user_factors']

    user_labels = data['user_labels']
    item_labels = data['item_labels']

    if index_file is None:
        return ImplicitRecommender(model, user_labels, item_labels)

    elif index_file.endswith('.ann'):
        import annoy
        log.info("Loading annoy recommendation index")
        max_norm, extra = augment_inner_product_matrix(model.item_factors)
        recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
        recommend_index.load(
            index_file)  # prefault=load_to_memory does not seem to work

        if item_feature_file is None:
            from .annoy import ImplicitAnnoyRecommender
            return ImplicitAnnoyRecommender(model, recommend_index, max_norm,
                                            user_labels, item_labels)
        else:
            log.info("Loading item features for recommendation")
            item_feature_data = pickle.load(open(item_feature_file, "rb"))
            tag_tfidf_transformer = item_feature_data['tag_tfidf_transformer']
            tag_lookup = item_feature_data['tag_lookup']
            item_embedding_weight = item_feature_data['item_embedding_weight']
            from .annoy_item_features import ImplicitAnnoyItemFeatureRecommender
            return ImplicitAnnoyItemFeatureRecommender(
                model, recommend_index, max_norm, user_labels, item_labels,
                tag_tfidf_transformer, tag_lookup, item_embedding_weight)
    elif index_file.endswith('.hnsw'):
        import hnswlib
        from .hnsw import ImplicitHNSWRecommender
        log.info("Loading hnsw recommendation index")
        # We build the index in L2 space and load it in inner-product space on purpose;
        # this space change gives us 0.96 recall.
        l2_recommend_index = hnswlib.Index(space='ip',
                                           dim=model.item_factors.shape[1])
        l2_recommend_index.load_index(index_file)
        l2_recommend_index.set_ef(kwargs.get('ef', 2000))
        return ImplicitHNSWRecommender(model, l2_recommend_index, user_labels,
                                       item_labels)
    else:
        raise RecommenderException("Unsupported file type: " + index_file)
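A hypothetical call, assuming the .npz model dump and the Annoy/HNSW index files were built elsewhere (file names and the ef value are illustrative only):

recommender = load_recommender('als_model.npz', 'items.ann')
hnsw_recommender = load_recommender('als_model.npz', 'items.hnsw', ef=1000)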
Example #3
def read_from_file(file_name: str):
    """
    Reads a model from a '.npz' file
    """
    data = np.load(file_name)
    model = AlternatingLeastSquares(
        factors=data['model.item_factors'].shape[1])
    model.item_factors = data['model.item_factors']
    model.YtY  # This will initialize the _YtY instance variable which is used directly in internal methods
    if 'user_factors' in data:
        model.user_factors = data['model.user_factors']
    return Recommender(model=model,
                       user_labels=data['user_labels'],
                       item_labels=data['item_labels'])
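For reference, a sketch of the save step that would produce the keys read_from_file expects; model, user_labels, and item_labels stand in for whatever the training code produced and are not part of this excerpt:

np.savez(file_name,
         **{'model.item_factors': model.item_factors,
            'model.user_factors': model.user_factors,
            'user_labels': user_labels,
            'item_labels': item_labels})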
Example #4
    def _split_collective_model(self, csr_matrix):
        splited_models = []
        splited_csr_matrix = []
        prev_idx = 0
        for num_entity in self.num_entities:
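            # One sub-model per entity type: reuse the shared user factors and slice out this entity's item factors.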
            model = ALS(factors=self.num_factors)
            model.user_factors = self.model.user_factors
            model.item_factors = self.model.item_factors[prev_idx:prev_idx+num_entity]
            splited_models.append(model)

            entity_csr_matrix = csr_matrix[:, prev_idx:prev_idx+num_entity]
            splited_csr_matrix.append(entity_csr_matrix)

            prev_idx = prev_idx + num_entity
        return splited_models, splited_csr_matrix
Example #5
    def _fit_for_tag(self, train, val):
        df = self._t_data.get_preference(train, val)
        t_len = self._t_data.get_tag_length()

        # user x item csr_matrix
        user_item_csr = sparse.csr_matrix(
            (df['preference'].astype(float), (df['user_id'], df['item_id'])))

        t_model = AlternatingLeastSquares(factors=1800)
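        # Scale the preferences by a constant confidence weight (cf. alpha in Example #1) before fitting.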
        t_model.fit(user_item_csr.T * 65)

        # Configure tag only model
        t_model.item_factors = t_model.item_factors[:t_len]

        user_tags_csr = user_item_csr[:, :t_len]
        self._t_best = t_model.recommend_all(user_tags_csr, N=self._t_topk)
Example #6
    def _fit_for_song(self, train, val):
        df = self._s_data.get_preference(train, val)
        s_len = self._s_data.get_song_length()

        # user x item csr_matrix
        user_item_csr = sparse.csr_matrix(
            (df['preference'].astype(float), (df['user_id'], df['item_id'])))

        s_model = AlternatingLeastSquares(factors=2500)
        s_model.fit(user_item_csr.T * 160)

        # Configure song only model
        s_model.item_factors = s_model.item_factors[:s_len]

        user_song_csr = user_item_csr[:, :s_len]
        self._s_best = s_model.recommend_all(user_song_csr, N=self._s_topk)
Example #7
    def multi_mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
                  song_ntop, tag_ntop, iteration):

        print(f'Multi_MF... iters:{iteration}')

        #res = []

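        # Stack test rows on top of train rows: user indices 0..n_test-1 in the fitted model are the test playlists.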
        songs_A = spr.vstack([test_songs_A, train_songs_A])
        tags_A = spr.vstack([test_tags_A, train_tags_A])

        A = spr.hstack([songs_A, tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)  # epoch
        als_model.fit(A.T * 15)

        song_model = ALS(use_gpu=False)
        tag_model = ALS(use_gpu=False)

        song_model.user_factors = als_model.user_factors
        tag_model.user_factors = als_model.user_factors

        song_model.item_factors = als_model.item_factors[:self.n_songs]
        tag_model.item_factors = als_model.item_factors[self.n_songs:]

        # for test
        song_rec_csr = songs_A[:self.n_test, :]
        tag_rec_csr = tags_A[:self.n_test, :]

        cand_song = song_model.recommend_all(song_rec_csr, N=500)
        cand_tag = tag_model.recommend_all(tag_rec_csr, N=50)

        res = [{
            "id": self.plylst_nid_id[self.n_train + id],
            "songs": [self.song_sid_id.get(x) for x in rec_idx[0].tolist()],
            "tags": [self.tag_tid_id.get(x) for x in rec_idx[1].tolist()]
        } for id, rec_idx in enumerate(zip(cand_song, cand_tag))]

        #rec_song = als_model.recommend_all(train_songs_A,N=500)
        #rec_tag = als_model_tag.recommend_all(train_tags_A,N=50) # list (no score)
        '''
        for id in tqdm(range(self.n_test)):

            # song
            cand_song = song_model.recommend(id,song_rec_csr, N=song_ntop, filter_already_liked_items=True)
            rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
            rec_song_score = [x[1] for x in cand_song]

            # tag
            cand_tag = tag_model.recommend(id,tag_rec_csr, N=tag_ntop, filter_already_liked_items=True)
            rec_tag_idx = [self.tag_tid_id.get(x[0]) for x in cand_tag]
            rec_tag_score = [x[1] for x in cand_tag]

            res.append({
                "id": self.plylst_nid_id[self.n_train + id],
                "songs": rec_song_idx, 
                "tags": rec_tag_idx,
                "songs_score":rec_song_score,
                "tags_score":rec_tag_score
            })
        '''

        print("DONE")

        return res
Example #8
            train_task = train.apply_async((queue, ), queue='trainer')
            with open('recommender/ratings.csv', 'a') as f:
                writer = csv.writer(f)
                for row in queue:
                    writer.writerow(row)
            queue = []


if __name__ == '__main__':
    import numpy as np
    import pandas as pd
    import csv
    from scipy.sparse import coo_matrix
    from implicit.als import AlternatingLeastSquares

    user_offset = 69878
    ratings_df = pd.read_csv('recommender/ratings.csv')
    ratings = coo_matrix((ratings_df['rating'], (ratings_df['movie_id'],
                                                 ratings_df['user_id'])))
    model = AlternatingLeastSquares(factors=32,
                                    regularization=0.01,
                                    dtype=np.float32,
                                    iterations=15,
                                    calculate_training_loss=True)
    model.user_factors = np.load('recommender/user_factors.npy')
    model.item_factors = np.load('recommender/item_factors.npy')
    user_items = ratings.T.tocsr()
    queue = []
    train_task = None
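    # 'app' and 'train' are assumed to be a Celery application and task defined elsewhere
    # in this module (apply_async and worker_main are Celery APIs).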
    app.worker_main()
Example #9
    def multi_mf_(self,
                  train_songs_A,
                  train_tags_A,
                  val_songs_A,
                  val_tags_A,
                  test_songs_A,
                  test_tags_A,
                  meta=True,
                  song_ntop=500,
                  tag_ntop=50,
                  iteration=20,
                  score=False):

        print(f'Multi_MF... iters:{iteration}')

        val_res = []
        test_res = []

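        # Stack val rows first, then test, then train: rows 0..n_val-1 are val playlists,
        # and the next n_test rows are test playlists.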
        songs_A = spr.vstack([val_songs_A, test_songs_A, train_songs_A])
        tags_A = spr.vstack([val_tags_A, test_tags_A, train_tags_A])

        print(val_songs_A.shape, test_songs_A.shape, train_songs_A.shape)

        if meta:
            s_meta = self.mkspr_for_meta()
            print(songs_A.shape, tags_A.shape, s_meta.shape)
            A = spr.hstack([songs_A, tags_A, s_meta])
        else:
            A = spr.hstack([songs_A, tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)
        als_model.fit(A.T * 100)

        song_model = ALS(use_gpu=True)
        tag_model = ALS(use_gpu=True)

        song_model.user_factors = als_model.user_factors
        tag_model.user_factors = als_model.user_factors

        song_model.item_factors = als_model.item_factors[:self.n_songs]
        tag_model.item_factors = als_model.item_factors[self.n_songs:]

        # for val
        val_song_rec_csr = songs_A[:self.n_val, :]
        val_tag_rec_csr = tags_A[:self.n_val, :]

        # for test
        test_song_rec_csr = songs_A[self.n_val:self.n_val + self.n_test, :]
        test_tag_rec_csr = tags_A[self.n_val:self.n_val + self.n_test, :]

        if score is True:
            pass

        else:
            # val
            cand_song = song_model.recommend_all(val_song_rec_csr, N=song_ntop)
            cand_tag = tag_model.recommend_all(val_tag_rec_csr, N=tag_ntop)

            val_res = [{
                "id": self.plylst_nid_id[self.n_train + id],
                "songs": [self.song_sid_id.get(x) for x in rec_idx[0].tolist()],
                "tags": [self.tag_tid_id.get(x) for x in rec_idx[1].tolist()]
            } for id, rec_idx in enumerate(zip(cand_song, cand_tag))]

            # test
            cand_song = song_model.recommend_all(test_song_rec_csr,
                                                 N=song_ntop)
            cand_tag = tag_model.recommend_all(test_tag_rec_csr, N=tag_ntop)

            test_res = [{
                "id": self.plylst_nid_id[self.n_train + self.n_val + id],
                "songs": [self.song_sid_id.get(x) for x in rec_idx[0].tolist()],
                "tags": [self.tag_tid_id.get(x) for x in rec_idx[1].tolist()]
            } for id, rec_idx in enumerate(zip(cand_song, cand_tag))]

        return val_res, test_res
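All of the snippets above share one trick: fit a single ALS model on horizontally stacked song and tag columns, then split its item factors into per-entity sub-models that reuse the shared user factors. A minimal self-contained sketch of that pattern on synthetic data (sizes, the confidence weight, and hyperparameters are illustrative, not taken from any example above):

from scipy import sparse
from implicit.als import AlternatingLeastSquares

n_playlists, n_songs, n_tags = 1000, 500, 100

# Synthetic playlist x (songs + tags) interaction matrix.
interactions = sparse.random(n_playlists, n_songs + n_tags, density=0.01,
                             format='csr', random_state=0)
interactions.data[:] = 1.0

# Fit one model on the concatenated columns, scaled by a confidence weight.
# Note: implicit >= 0.5 expects a user-item matrix here; older versions expected
# item-user, which is why the snippets above pass the transpose (r.T, A.T).
model = AlternatingLeastSquares(factors=64, regularization=0.1,
                                iterations=5, use_gpu=False)
model.fit(interactions * 40)

# Split the jointly learned item factors into song-only and tag-only sub-models.
song_model = AlternatingLeastSquares(factors=64, use_gpu=False)
tag_model = AlternatingLeastSquares(factors=64, use_gpu=False)
song_model.user_factors = model.user_factors
tag_model.user_factors = model.user_factors
song_model.item_factors = model.item_factors[:n_songs]
tag_model.item_factors = model.item_factors[n_songs:]
# Song and tag recommendations can then be produced separately, each restricted
# to its own slice of the interaction columns.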