Пример #1
0
def icm():
    """Evaluate a content-based recommender built from the NLP item-content matrix.

    Loads the offline dataset, builds the item-content matrix through ``NLP``
    in ``'tracks'`` mode, derives an item-item Tversky similarity from it and
    scores the test playlists by multiplying their URM rows with that
    similarity. The resulting estimated ratings are passed to the evaluator
    under the name ``'nlp_enriched'``.

    Takes no arguments and returns nothing; progress and the similarity
    timing are printed to stdout.
    """
    datareader = Datareader(mode='offline', only_load=True)
    evaluator = Evaluator(datareader)

    print('NLP...')
    test_playlists = datareader.get_test_pids()

    # The NLP helper is deliberately given an empty stop-word list here.
    nlp = NLP(datareader=datareader, stopwords=[], mode='tracks')
    print('Getting ucm and icm...')
    item_content = bm25_row(nlp.get_icm())

    print('Computing similarity...')
    start = time.time()
    similarity = tversky_similarity(item_content, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)

    urm = datareader.get_urm()

    print('Computing eurm...')
    # Only the test playlists' rows are multiplied, so the result already has
    # one row per test playlist.
    eurm_nlp = dot_product(urm[test_playlists, :], similarity, k=500)
    eurm_nlp = eurm_nlp.tocsr()

    # sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_weighted_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp),
                       name='nlp_enriched')
Пример #2
0
class Top_pop_p(object):
    """Personalised top-popular scores for the ``cat=2`` test playlists.

    Each such playlist is scored from the playlists related to its first
    stored track: either those sharing the track's album (``album``) or those
    containing the track itself (``track``).
    """

    def __init__(self):
        reader_options = dict(verbose=False, mode='online', only_load=True)
        self.dr_on = Datareader(**reader_options)
        self.dr_of = Datareader(**reader_options)
        self.urm_on = self.dr_on.get_urm()
        # NOTE(review): urm_of is read from dr_on, not dr_of; both readers are
        # opened with identical options, so confirm whether this is intended.
        self.urm_of = self.dr_on.get_urm()
        self.urm_col = sps.csc_matrix(self.urm_of)
        track_count = self.urm_of.shape[1]
        self.top_p = np.zeros(track_count)

    def _score_seed_playlists(self, seed_pids, all_pids, playlists_of):
        """Fill one score row per seed playlist and return the test-playlist rows.

        ``playlists_of`` maps a seed track index to the row indices of the
        playlists whose tracks should be counted.
        """
        scores = sps.lil_matrix(self.urm_of.shape)
        # Assumes urm_on is stored as CSR (indptr/indices are read row-wise).
        row_ptr = self.urm_on.indptr
        col_idx = self.urm_on.indices

        for pid in tqdm(seed_pids):
            # First stored track of the playlist acts as the seed.
            seed_track = col_idx[row_ptr[pid]:row_ptr[pid + 1]][0]
            related = playlists_of(seed_track)

            # Per-track occurrence counts over the related playlists.
            counts = self.urm_of[related].sum(axis=0).A1.astype(np.int32)
            # Keep the (up to) 501 highest-count tracks, best first.
            best = counts.argsort()[-501:][::-1]

            scores[pid, best] = counts[best]

        scores = scores.tocsr()[all_pids]
        return eurm_remove_seed(scores, self.dr_on)

    def album(self):
        """Score ``cat=2`` playlists from playlists sharing the seed track's album."""
        seed_pids = self.dr_on.get_test_pids(cat=2)
        all_pids = self.dr_of.get_test_pids()
        album_ucm = self.dr_of.get_ucm_albums().tocsc()
        track_to_album = self.dr_of.get_track_to_album_dict()

        def playlists_with_album(track):
            album_id = track_to_album[track]
            lo = album_ucm.indptr[album_id]
            hi = album_ucm.indptr[album_id + 1]
            return album_ucm.indices[lo:hi]

        return self._score_seed_playlists(seed_pids, all_pids,
                                          playlists_with_album)

    def track(self):
        """Score ``cat=2`` playlists from playlists containing the seed track."""
        seed_pids = self.dr_on.get_test_pids(cat=2)
        all_pids = self.dr_of.get_test_pids()
        by_track = self.urm_col

        def playlists_with_track(track):
            lo = by_track.indptr[track]
            hi = by_track.indptr[track + 1]
            return by_track.indices[lo:hi]

        result = self._score_seed_playlists(seed_pids, all_pids,
                                            playlists_with_track)
        print(result)
        return result.copy()
Пример #3
0
def prova():
    """Scratch experiment around the NLP user-content matrix.

    As written, the function prints the artist-to-tracks dictionary of the
    offline dataset and then calls ``exit()``, so none of the statements
    after that first ``exit()`` are executed. The remaining code is kept as
    the body of the experiment: build the playlist UCM, multiply it by a
    stored user similarity, and (after a second ``exit()``) compute a Tversky
    similarity and evaluate the resulting estimated ratings.
    """

    dr = Datareader(mode='offline', only_load=True)
    print(dr.get_artist_to_tracks_dict())
    # Stops the run here; everything below is currently unreachable.
    exit()

    dr = Datareader(mode='offline', only_load=True, verbose=False)
    test_playlists = dr.get_test_pids()

    # NOTE(review): `stopwords` and `token_weights` are assigned but only
    # referenced by the commented-out scaling further down.
    stopwords = STOP_WORDS
    token_weights = np.array(TOKEN_WEIGHTS)

    nlp = NLP(mode='playlists', datareader=dr, stopwords=STOP_WORDS)
    s = nlp.get_ucm()
    print(s.shape)

    evaluator = Evaluator(dr)

    ucm = nlp.get_ucm()
    sim = sparse.load_npz(ROOT_DIR + '/data/cf_user_similarity.npz')

    print('Computing dot...')
    # Replace the UCM with (user similarity x UCM), then report its density.
    ucm = dot_product(sim, ucm, k=200)
    print('NNZ', ucm.nnz)
    # Second early stop: the similarity/evaluation part below never runs.
    exit()

    urm = dr.get_urm()

    # ucm = ucm.astype(np.float64)
    # inplace_csr_column_scale(ucm, token_weights)

    print('Computing similarity...')
    start = time.time()
    # Compute similarity
    similarity = tversky_similarity(ucm, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)

    print('Computing eurm...')
    # NOTE(review): `start` is reset here but the elapsed time is never printed.
    start = time.time()
    # Compute eurm
    eurm_nlp = dot_product(similarity, urm, k=500)
    eurm_nlp = eurm_nlp.tocsr()
    # Keep only the rows of the test playlists.
    eurm_nlp = eurm_nlp[test_playlists, :]

    #sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_weighted_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp),
                       name='nlp_enriched')
import sys
from scipy import sparse
import numpy as np
import utils.pre_processing as pre
from utils.definitions import *
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.pre_processing import *
from utils.post_processing import *

# Grid search over the blend between the NLP estimated ratings and a
# popularity-based estimate: for every `k` and every blend weight `a`, the two
# L1-normalised matrices are mixed and the result is evaluated offline.
dr = Datareader(mode='offline', only_load=True, verbose=False)
ev = Evaluator(dr)
urm = dr.get_urm(binary=True)
urm_csc = urm.tocsc(copy=True)

sim_nlp = sparse.load_npz(ROOT_DIR + '/data/sim_nlp_lele.npz')

# The NLP matrix does not depend on `k` or `a`, so it is loaded and
# normalised once instead of on every outer iteration.
eurm_nlp = sparse.load_npz(ROOT_DIR + '/data/nlp_fusion_tuned_offline.npz')
eurm_nlp = norm_l1_row(eurm_nlp)

for k in [1, 2, 3, 4, 5]:
    eurm_top = dr.get_eurm_top_pop_filter_cat_1(sim_nlp, k, topk=500)
    eurm_top = norm_l1_row(eurm_top)

    for a in [0.05, 0.10, 0.15, 0.20]:
        # Convex combination: weight `a` on the popularity part.
        eurm = eurm_nlp * (1.0 - a) + eurm_top * a
        rec_list = eurm_to_recommendation_list(eurm, datareader=dr)
        ev.evaluate(rec_list, name='pop_first_k=' + str(k) + '_a=' + str(a))
Пример #5
0
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.post_processing import eurm_remove_seed
from utils.post_processing import eurm_to_recommendation_list
import numpy as np
import scipy.sparse as sps
from tqdm import tqdm
from utils.definitions import *
from utils.post_processing import eurm_remove_seed, append_rec_list

# Module-level setup for a popularity experiment on the offline dataset.
dr = Datareader(verbose=False, mode='offline', only_load=True)

urm = dr.get_urm()
# Column-oriented copy of the URM.
urm_col = sps.csc_matrix(urm)
# One zero-initialised slot per URM column.
top_p = np.zeros(urm.shape[1])
rec = []
# Two empty score matrices with the same shape as the URM.
eurm1 = sps.lil_matrix(urm.shape)
eurm2 = sps.lil_matrix(urm.shape)
print(eurm1.shape)
# Test playlists of category 2, and the full list of test playlists.
pids = dr.get_test_pids(cat=2)
pids_all = dr.get_test_pids()

# TopPop Album
# ucm_album = dr.get_ucm_albums().tocsc()
# album_dic = dr.get_track_to_album_dict()

# TopPop Artist
# NOTE(review): this section is labelled "Artist" but still loads the album
# UCM into `ucm_album`; only the track->artist dictionary is artist-specific.
# Confirm this is intended.
ucm_album = dr.get_ucm_albums().tocsc()
artists_dic = dr.get_track_to_artist_dict()

album_to_tracks = load_obj(name="album_tracks_dict_offline",
    nlp = NLP2(dr,
               stopwords=[],
               norm=True,
               work=True,
               split=True,
               date=False,
               skip_words=True,
               porter=False,
               porter2=True,
               lanca=False,
               lanca2=True)

    ucm_csr = nlp.get_UCM(data1=True).tocsr()
    ucm_csc = ucm_csr.tocsc(copy=True)

    urm_csr = dr.get_urm().tocsr()
    urm_csc = urm_csr.tocsc(copy=True)

    test_playlists = dr.get_test_pids(cat=1)
    # test_playlists.extend(dr.get_test_pids(cat=2))

    rec_list = [[] for x in range(10000)]

    i = 0
    for playlist_id in tqdm(test_playlists):
        tokens = ucm_csr.indices[ucm_csr.indptr[playlist_id]:ucm_csr.
                                 indptr[playlist_id + 1]]
        playlists_with_tokens = []
        for token in tokens:
            playlists_with_tokens.extend(
                ucm_csc.indices[ucm_csc.indptr[token]:ucm_csc.indptr[token +
Пример #7
0
knn = 100
topk = 750

# Identifier built from the run configuration (`mode` is defined earlier in
# the module).
complete_name = "maurizio_" + mode + "__knn=" + str(knn) + "_topk=" + str(topk)

if __name__ == '__main__':

    sim = sps.load_npz(ROOT_DIR +
                       "/similarities/offline-similarity_rp3beta_knn100.npz")

    dr = Datareader(mode=mode, only_load=True)

    ######### MAURIZIO'S FEATURE WEIGHTING
    ICM = dr.get_icm(alid=True)

    # Fit the feature weights against the stored similarity matrix.
    cfw = CFW_D_Similarity_Linalg(URM_train=dr.get_urm(),
                                  ICM=ICM.copy(),
                                  S_matrix_target=sim,
                                  URM_validation=None)

    cfw.fit()

    # Diagonal matrix holding the fitted weights.
    weights = sps.diags(cfw.D_best)

    # The weights get their own file: previously they were written to
    # "ICM_fw_maurizio" and immediately overwritten by the weighted ICM below.
    sps.save_npz("ICM_fw_maurizio_weights", weights)

    ICM_weighted = ICM.dot(weights)

    sps.save_npz("ICM_fw_maurizio", ICM_weighted)

    ######## OURS
Пример #8
0
from recommenders.similarity.s_plus import tversky_similarity
from utils.evaluator import Evaluator
from utils.datareader import Datareader
from utils.post_processing import *
from tqdm import tqdm
from scipy import sparse
import utils.sparse as ut
import pandas as pd
import numpy as np
import sys


# Album-based user similarity experiment on the offline dataset.
datareader = Datareader(mode='offline', only_load=True, verbose=False)
evaluator = Evaluator(datareader)

urm = datareader.get_urm()
ucm_album = datareader.get_ucm_albums()

# Column sums of the album UCM, used as a popularity score per column.
albums_pop = ucm_album.sum(axis=0).A1
# Indices of the 100 columns with the largest sums.
mask = np.argsort(albums_pop)[::-1][:100]
# Presumably zeroes those columns in place (helper from utils.sparse) - confirm.
ut.inplace_set_cols_zero(ucm_album, mask)

ucm_album = bm25_row(ucm_album)

print('Similarity..')
# Similarity of the album UCM against its own transpose.
sim = tversky_similarity(ucm_album, ucm_album.T, shrink=200, alpha=0.1, beta=1, k=800, verbose=1, binary=False)
sim = sim.tocsr()

test_pids = list(datareader.get_test_pids())

# Estimated ratings: similarity multiplied by the URM.
eurm = dot_product(sim, urm, k=750)
        import warnings
        warnings.warn(
            'This function still use the old version of the remove seed, it should be replaced soon by the one in post_processing class'
        )

        self.urm = sps.csr_matrix(self.urm[self.pid])
        tmp = self.urm.tocoo()
        row = tmp.row
        col = tmp.col
        min = self.eurm.tocoo().min()
        self.eurm = sps.lil_matrix(self.eurm)
        self.eurm[row, col] = -1
        self.eurm = sps.csr_matrix(self.eurm)

        return self.eurm


# Script entry point: fit Top_pop, compute its ratings and save them to disk.
if __name__ == '__main__':
    from utils.datareader import Datareader
    # NOTE(review): only_load is the *string* "False", which is truthy in
    # Python; if Datareader tests it as a boolean this behaves like True.
    # Confirm whether True or False was intended.
    dr = Datareader(verbose=False, mode="offline", only_load="False")

    rec = Top_pop()
    rec.fit(dr.get_urm(), dr.get_test_playlists().transpose()[0])
    eurm = rec.compute_rating().tocsr()
    # NOTE(review): the file name says "online" while the reader above is
    # opened in "offline" mode - confirm.
    sps.save_npz("top_pop online.npz", eurm.tocsr())
    # Stops the run here; the evaluation below is currently unreachable.
    exit()
    import utils.evaluator as ev
    from utils.post_processing import eurm_to_recommendation_list
    eva = ev.Evaluator(dr)

    eva.evaluate(eurm_to_recommendation_list(eurm), "cacca TOPTOP")
        # idft = np.power(idft, 0.5)
        # norm_urm = self.urm.multiply(idft.reshape(1,-1)).tocsr()

        # Computer the eURM
        self.eurm = dot_product(norm_urm, self.model, k=top_k)
        self.eurm = sps.csr_matrix(self.eurm)

        if verbose:
            print("time: " + str(int(time.time() - start_time) / 60))

        return self.eurm


# Script entry point: fit TF_collaborative_item on the offline dataset and
# build its model.
if __name__ == '__main__':
    dr = Datareader(verbose=False, mode='offline', only_load=True)
    urm = dr.get_urm(binary=False)
    pid = dr.get_test_pids()
    position_urm = dr.get_position_matrix(position_type='last')
    # Transposed position matrix, converted to CSR via COO.
    pos_urm = position_urm.T.tocoo().tocsr()
    ev = Evaluator(dr)

    # NOTE(review): `topk`, `pos_urm` and `ev` are not used by the statements
    # visible below - presumably consumed later in the script.
    knn = 100
    topk = 750

    rec = TF_collaborative_item()
    # for knn in range(50, 300, 50):
    rec.fit(urm, pid)
    rec.compute_model(verbose=True, knn=knn, power=0.6, save_model=False)
    # rec.model = rec.model.tocsr()
    # rec.model.eliminate_zeros()
    #
class Top_pop_p(object):
    '''
    Class that allows the user to get the personalized top pop built by
    following either the track or the album of each category-2 test playlist.

    Both public methods accept ``mode`` equal to ``"online"`` or
    ``"offline"``; any other value raises ``ValueError``.
    '''
    def __init__(self):
        pass

    def _load(self, mode):
        '''
        Open the datareader for ``mode`` and store it, its URM and the derived
        attributes on the instance.

        The reader and URM are stored as ``dr_on``/``urm_on`` for
        ``"online"`` and ``dr_of``/``urm_of`` for ``"offline"``; ``urm_col``
        and ``top_p`` are refreshed in both cases.

        :param mode: ``"online"`` or ``"offline"``
        :return: tuple ``(datareader, urm)``
        :raises ValueError: if ``mode`` is not one of the two supported values
        '''
        if mode not in ("online", "offline"):
            # Previously an unknown mode fell through both branches and failed
            # later with an UnboundLocalError on `eurm`.
            raise ValueError(
                "mode must be 'online' or 'offline', got %r" % (mode,))

        dr = Datareader(verbose=False, mode=mode, only_load=True)
        urm = dr.get_urm()
        if mode == "online":
            self.dr_on = dr
            self.urm_on = urm
        else:
            self.dr_of = dr
            self.urm_of = urm
        self.urm_col = sps.csc_matrix(urm)
        self.top_p = np.zeros(urm.shape[1])
        return dr, urm

    def _recommend(self, mode, make_lookup):
        '''
        Shared scoring loop for both public methods.

        :param mode: ``"online"`` or ``"offline"``
        :param make_lookup: callable taking the datareader and returning a
            function that maps a seed track index to the row indices of the
            playlists whose tracks are counted
        :return: csr_matrix with one row per test playlist
        '''
        dr, urm = self._load(mode)

        eurm = sps.lil_matrix(urm.shape)
        pids = dr.get_test_pids(cat=2)
        pids_all = dr.get_test_pids()
        playlists_of = make_lookup(dr)

        for row in tqdm(pids):
            # The first stored track of the playlist is used as the seed
            # (assumes the URM is stored as CSR).
            track_ind = urm.indices[urm.indptr[row]:urm.indptr[row + 1]][0]
            playlists = playlists_of(track_ind)

            # Per-track occurrence counts over the selected playlists.
            top = urm[playlists].sum(axis=0).A1.astype(np.int32)
            # Keep the (up to) 501 highest-count tracks, best first.
            track_ind_rec = top.argsort()[-501:][::-1]

            eurm[row, track_ind_rec] = top[track_ind_rec]

        eurm = eurm.tocsr()[pids_all]
        eurm = eurm_remove_seed(eurm, dr)

        return eurm.copy().tocsr()

    def get_top_pop_album(self, mode):
        '''
        :param mode: ``"online"`` or ``"offline"``
        :return: csr_matrix filled with the recommendation for the cat 2 following album
        :raises ValueError: if ``mode`` is not supported
        '''
        def album_lookup(dr):
            ucm_album = dr.get_ucm_albums().tocsc()
            album_dic = dr.get_track_to_album_dict()

            def playlists_of(track_ind):
                # Playlists that contain the album of the seed track.
                album = album_dic[track_ind]
                return ucm_album.indices[ucm_album.indptr[album]:
                                         ucm_album.indptr[album + 1]]

            return playlists_of

        return self._recommend(mode, album_lookup)

    def get_top_pop_track(self, mode):
        '''
        :param mode: ``"online"`` or ``"offline"``
        :return: csr_matrix filled with the recommendation for the cat 2 following track
        :raises ValueError: if ``mode`` is not supported
        '''
        def track_lookup(dr):
            urm_col = self.urm_col

            def playlists_of(track_ind):
                # Playlists that contain the seed track itself.
                return urm_col.indices[urm_col.indptr[track_ind]:
                                       urm_col.indptr[track_ind + 1]]

            return playlists_of

        return self._recommend(mode, track_lookup)
Пример #12
0
import sys
from scipy import sparse
import numpy as np
import utils.pre_processing as pre
from utils.definitions import *
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.pre_processing import *
from utils.post_processing import *
from fast_import import *

# Offline evaluation of CF_UB_BM25 where the model is built on the
# position-aware URM but recommendations are computed on the standard URM.
dr = Datareader(mode='offline', only_load=True, verbose=False)
ev = Evaluator(dr)
urm = dr.get_urm_with_position(1)

urm_std = dr.get_urm()

rec = CF_UB_BM25(urm=urm, datareader=dr, verbose_evaluation=False)
rec.model(alpha=1, beta=0, k=250)
# Swap in the standard URM after the model has been built.
rec.urm = urm_std
rec.fast_recommend()
res = rec.fast_evaluate_eurm()
# Only the second element of the evaluation result is printed.
print(res[1])