Example #1
def test_predict_not_fitted():

    model = LightFM()

    with pytest.raises(ValueError):
        model.predict(np.arange(10), np.arange(10))

    with pytest.raises(ValueError):
        model.predict_rank(1)

    with pytest.raises(ValueError):
        model.get_user_representations()

    with pytest.raises(ValueError):
        model.get_item_representations()

Example #2
def test_get_representations():

    model = LightFM(random_state=SEED)
    model.fit_partial(train, epochs=10)

    num_users, num_items = train.shape

    for (item_features,
         user_features) in ((None, None), ((sp.identity(num_items) +
                                            sp.random(num_items, num_items)),
                                           (sp.identity(num_users) +
                                            sp.random(num_users, num_users)))):

        test_predictions = model.predict(test.row,
                                         test.col,
                                         user_features=user_features,
                                         item_features=item_features)

        item_biases, item_latent = model.get_item_representations(
            item_features)
        user_biases, user_latent = model.get_user_representations(
            user_features)

        assert item_latent.dtype == np.float32
        assert user_latent.dtype == np.float32

        predictions = (
            (user_latent[test.row] * item_latent[test.col]).sum(axis=1) +
            user_biases[test.row] + item_biases[test.col])

        assert np.allclose(test_predictions, predictions, atol=0.000001)
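
The assertion above relies on the identity score(u, i) = user_latent[u] · item_latent[i] + user_biases[u] + item_biases[i]. A minimal self-contained sketch of the same check, assuming nothing but a random toy interaction matrix (illustrative only, not the fixture used in the test):

import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

# Toy implicit-feedback data, purely for illustration.
rng = np.random.RandomState(42)
interactions = sp.coo_matrix(rng.binomial(1, 0.1, size=(50, 100)))

model = LightFM(no_components=8, random_state=42)
model.fit(interactions, epochs=5)

user_biases, user_latent = model.get_user_representations()
item_biases, item_latent = model.get_item_representations()

users, items = interactions.row, interactions.col
manual = ((user_latent[users] * item_latent[items]).sum(axis=1) +
          user_biases[users] + item_biases[items])
assert np.allclose(model.predict(users, items), manual, atol=1e-6)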
Example #3
import pickle

import numpy as np
import pandas as pd
import scipy.sparse
from lightfm import LightFM


class RepresentationLearner:
    def __init__(self, n_components=30):
        self.user_features = None
        self.item_features = None
        self.model = LightFM(no_components=n_components)

    def _merge_user_features(self, new_features):
        pass

    def _merge_item_features(self, new_features):
        pass

    def fit_partial(self,
                    interactions,
                    user_features=None,
                    item_features=None):
        self._merge_user_features(user_features)
        self._merge_item_features(item_features)

        self.model.fit_partial(interactions,
                               user_features=user_features,
                               item_features=item_features)

    def user_representations(self):
        _, user_repr = self.model.get_user_representations()
        return user_repr

    def item_representations(self):
        _, item_repr = self.model.get_item_representations()
        return item_repr

    def save(self, path):
        with open(path, 'wb') as output:
            pickle.dump(self, output)

    @classmethod
    def load(cls, path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    def train(self,
              interaction_path,
              user_features_path=None,
              item_features_path=None):
        def read_fake_data(n_users, n_items, path):
            data = pd.read_csv(path)

            mat = scipy.sparse.lil_matrix((n_users, n_items), dtype=np.int32)

            for _, row in data.iterrows():
                userId, itemId, is_liked = row.iloc[0], row.iloc[1], row.iloc[2]
                mat[userId, itemId] = is_liked

            return mat

        n_users = 10000
        n_items = 10000
        interactions = read_fake_data(n_users, n_items, interaction_path)
        self.fit_partial(interactions)
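
A hedged usage sketch for the wrapper above; the random interaction matrix and the pickle path are illustrative assumptions, not part of the original snippet:

import numpy as np
import scipy.sparse

rng = np.random.RandomState(0)
interactions = scipy.sparse.coo_matrix(rng.binomial(1, 0.05, size=(100, 200)))

learner = RepresentationLearner(n_components=16)
learner.fit_partial(interactions)
learner.save('learner.pkl')                       # pickles the whole wrapper
restored = RepresentationLearner.load('learner.pkl')
assert np.allclose(learner.user_representations(),
                   restored.user_representations())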
Example #4
def main():
    # sparse_training_matrix is the user-item interaction matrix.

    model = LightFM(no_components=30,
                    learning_rate=0.05,
                    loss='bpr',
                    item_pretrain=True,
                    item_pretrain_file='item_embeddings.txt')
    model.fit(sparse_training_matrix, epochs=100)
    URB, URE = model.get_user_representations()  # user biases, user embeddings
    IRB, IRE = model.get_item_representations()  # item biases, item embeddings
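
Note that item_pretrain and item_pretrain_file are not arguments of the released LightFM; they appear to come from a patched fork. On stock LightFM a comparable warm start can be sketched by allocating the internals with a zero-epoch fit and overwriting the embeddings, the same transplant trick used in Example #9 below (the file format of item_embeddings.txt is an assumption here):

import numpy as np
from lightfm import LightFM

model = LightFM(no_components=30, learning_rate=0.05, loss='bpr')
model.fit_partial(sparse_training_matrix, epochs=0)  # allocate internals only

# Assumed: a whitespace-delimited (n_items, no_components) matrix on disk.
pretrained = np.loadtxt('item_embeddings.txt', dtype=np.float32)
model.item_embeddings[:] = pretrained                # shapes must match

model.fit_partial(sparse_training_matrix, epochs=100)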
Example #5
def train(impl_train_data, config, user_ids, item_ids, model_folder, save_res=True):
    # Train the matrix-factorisation model (WARP loss).
    model = LightFM(loss='warp', no_components=config['dims'], learning_rate=config['lr'])
    model = model.fit(impl_train_data, epochs=50, num_threads=8)

    user_biases, user_embeddings = model.get_user_representations()
    item_biases, item_embeddings = model.get_item_representations()
    item_vecs_reg = np.concatenate((item_embeddings, np.reshape(item_biases, (1, -1)).T), axis=1)
    user_vecs_reg = np.concatenate((user_embeddings, np.ones((1, user_biases.shape[0])).T), axis=1)
    print("USER FEAT:", user_vecs_reg.shape)
    print("ITEM FEAT:", item_vecs_reg.shape)
    if save_res:
        save(item_ids, item_vecs_reg, os.path.join(model_folder, 'out_item_features.feats'))
        save(user_ids, user_vecs_reg, os.path.join(model_folder, 'out_user_features.feats'))
    return item_ids, item_vecs_reg, user_ids, user_vecs_reg
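
The asymmetric concatenation is deliberate: the item vectors carry the item biases while the user vectors get a constant 1 in the same position, so a plain dot product yields user·item + item_bias (the user bias is dropped, which does not change per-user rankings). A small sketch of the identity with illustrative shapes:

import numpy as np

user_embeddings = np.random.rand(5, 3).astype(np.float32)
item_embeddings = np.random.rand(7, 3).astype(np.float32)
item_biases = np.random.rand(7).astype(np.float32)

item_vecs = np.concatenate((item_embeddings, item_biases[:, None]), axis=1)
user_vecs = np.concatenate((user_embeddings, np.ones((5, 1))), axis=1)

scores = user_vecs @ item_vecs.T  # (5, 7)
expected = user_embeddings @ item_embeddings.T + item_biases[None, :]
assert np.allclose(scores, expected, atol=1e-6)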
Example #6
def train_mf(impl_train_data, item_ids, user_ids, item_features_file, user_features_file, dims=200, epochs=50, max_sampled=10, lr=0.05):

    model = LightFM(loss='warp', no_components=dims, max_sampled=max_sampled, learning_rate=lr, random_state=42)
    model = model.fit(impl_train_data, epochs=epochs, num_threads=24)

    user_biases, user_embeddings = model.get_user_representations()
    item_biases, item_embeddings = model.get_item_representations()
    item_vec = np.concatenate((item_embeddings, np.reshape(item_biases, (1, -1)).T), axis=1)
    user_vec = np.concatenate((user_embeddings, np.ones((1, user_biases.shape[0])).T), axis=1)

    print("USER FEAT:", user_vec.shape)
    print("ITEM FEAT:", item_vec.shape)
    save(item_ids, item_vec, item_features_file)
    save(user_ids, user_vec, user_features_file)
    return user_vec, item_vec
Example #7
def train_warp(impl_train_data,
               dims,
               user_ids,
               item_ids,
               user_features_file,
               item_features_file,
               save_res=True):
    model = LightFM(loss='warp', no_components=dims, max_sampled=30)
    model = model.fit(impl_train_data, epochs=50, num_threads=8)

    user_biases, user_embeddings = model.get_user_representations()
    item_biases, item_embeddings = model.get_item_representations()
    item_vecs_reg = np.concatenate(
        (item_embeddings, np.reshape(item_biases, (1, -1)).T), axis=1)
    user_vecs_reg = np.concatenate(
        (user_embeddings, np.ones((1, user_biases.shape[0])).T), axis=1)
    #print("USER FEAT:", user_vecs_reg.shape)
    #print("ITEM FEAT:", item_vecs_reg.shape)
    if save_res:
        save(item_ids, item_vecs_reg, item_features_file)
        save(user_ids, user_vecs_reg, user_features_file)
    return item_ids, item_vecs_reg, user_ids, user_vecs_reg
Example #8
# load data
sparse_mat = sparse.load_npz('./user_item_click_sparse_matrix_' + today +
                             '.npz')
sparse_mat_view = sparse.load_npz('./user_item_view_sparse_matrix_' + today +
                                  '.npz')
sparse_prod_fea = sparse.load_npz('./products_feature_sparse_' + today +
                                  '.npz')
prod_fea_concat = concatProductFeature(sparse_prod_fea,
                                       id_weight=1,
                                       sparse_weight=0.01)

model = LightFM(no_components=150, loss='warp', max_sampled=20, random_state=0)
model.fit(sparse_mat, epochs=20, item_features=prod_fea_concat)

item_fea = model.get_item_representations(features=prod_fea_concat)
total_dist = cdist(item_fea[1], item_fea[1], 'cosine')

productID = 3088
similarClickProducts = np.argsort(total_dist[productID - 1])[1:21] + 1

# view results
from DataLoader import DataLoader
import config as cfg
DL = DataLoader(cfg)
DL._loadProductData_stream(today)

DL.productData[DL.productData.id == productID][['_source.name', '_source.tags']]
DL.productData[DL.productData.id.isin(similarClickProducts)][['_source.name', '_source.tags']]
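
cdist materialises the full n_items × n_items distance matrix, which grows quadratically with the catalogue. For a single query product a one-row cosine computation is enough; a sketch under the same 1-indexed product-ID assumption:

import numpy as np

emb = item_fea[1]                        # (n_items, no_components) latent matrix
query = emb[productID - 1]
norms = np.linalg.norm(emb, axis=1)
norms[norms == 0] = 1e-10
cos_sim = emb @ query / (norms * np.linalg.norm(query))
# Skip index 0 (the product itself), map back to 1-indexed IDs.
similarClickProducts = np.argsort(-cos_sim)[1:21] + 1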
Example #9
class LightFMRecommender(BaseFactorizationRecommender):

    default_model_params = {
        'loss': 'warp',
        'learning_schedule': 'adadelta',
        'no_components': 30,
        'max_sampled': 10,
        'item_alpha': 0,
        'user_alpha': 0,
    }

    default_fit_params = {
        'epochs': 100,
        'item_features': None,
        'num_threads': N_CPUS,
        'verbose': True,
    }

    default_external_features_params = dict(add_identity_mat=True)

    def __init__(self,
                 use_sample_weight=False,
                 external_features=None,
                 external_features_params=None,
                 initialiser_model=None,
                 initialiser_scale=0.1,
                 **kwargs):
        self.use_sample_weight = use_sample_weight
        self.external_features = external_features
        self.external_features_params = external_features_params or \
                                        self.default_external_features_params.copy()
        self.initialiser_model = initialiser_model
        self.initialiser_scale = initialiser_scale
        super().__init__(**kwargs)

    def _prep_for_fit(self, train_obs, **fit_params):
        # self.toggle_mkl_blas_1_thread(True)
        # assign all observation data
        self._set_data(train_obs)
        fit_params['sample_weight'] = self.train_mat.tocoo() \
            if self.use_sample_weight else None
        self._set_fit_params(fit_params)
        self._add_external_features()
        # init model and set params
        self.model = LightFM(**self.model_params)
        if self.initialiser_model is not None:
            self._initialise_from_model(train_obs)

    def _initialise_from_model(self, train_obs):
        # fit initialiser model (this is done here to prevent any data leaks from passing fitted models)
        simple_logger.info('Training %s model to initialise LightFM model.' % str(self.initialiser_model))
        self.initialiser_model.fit(train_obs)
        self._reuse_data(self.initialiser_model)
        # have the internals initialised
        self.model.fit_partial(self.train_mat, epochs=0)

        # transplant factors from inititialiser model
        self.model.item_embeddings = self.initialiser_model._get_item_factors()[1]
        self.model.user_embeddings = self.initialiser_model._get_user_factors()[1]

        # scale the factors to be of similar scale
        scale = self.initialiser_scale
        self.model.item_embeddings *= scale / np.mean(np.abs(self.model.item_embeddings))
        self.model.user_embeddings *= scale / np.mean(np.abs(self.model.user_embeddings))


    def _add_external_features(self):
        if self.external_features is not None:
            self.external_features_mat = \
                self.external_features.fit_transform_ids_df_to_mat(
                    items_encoder=self.sparse_mat_builder.iid_encoder,
                    **self.external_features_params)
            simple_logger.info('External item features matrix: %s' %
                               str(self.external_features_mat.shape))

        # add external features if specified
        self.fit_params['item_features'] = self.external_features_mat
        if self.external_features_mat is not None:
            simple_logger.info('Fitting using external features mat: %s'
                               % str(self.external_features_mat.shape))

    def fit(self, train_obs, **fit_params):
        self._prep_for_fit(train_obs, **fit_params)
        self.model.fit_partial(self.train_mat, **self.fit_params)
        return self

    def fit_partial(self, train_obs, epochs=1):
        self._set_epochs(epochs)
        if self.model is None:
            self.fit(train_obs)
        else:
            self.model.fit_partial(self.train_mat)
        return self

    def fit_batches(self, train_obs, train_dfs, epochs_per_batch=None, **fit_params):
        self._prep_for_fit(train_obs)
        for i, df in enumerate(train_dfs):
            batch_train_mat = self.sparse_mat_builder.build_sparse_interaction_matrix(df)

            if epochs_per_batch is not None:
                fit_params['epochs'] = epochs_per_batch

            fit_params['sample_weight'] = batch_train_mat.tocoo() \
                if self.use_sample_weight else None

            self._set_fit_params(fit_params)

            simple_logger.info('Fitting batch %d (%d interactions)' % (i, len(df)))
            self.model.fit_partial(batch_train_mat, **self.fit_params)

    def _set_epochs(self, epochs):
        self.set_params(epochs=epochs)

    def set_params(self, **params):
        params = self._pop_set_params(
            params, ['use_sample_weight', 'external_features', 'external_features_params',
                     'initialiser_model', 'initialiser_scale'])
        super().set_params(**params)

    def _get_item_factors(self, mode=None):

        n_items = len(self.sparse_mat_builder.iid_encoder.classes_)

        biases, representations = self.model.get_item_representations(self.fit_params['item_features'])

        if mode is None:
            pass  # default mode

        elif mode == 'external_features':
            external_features_mat = self.external_features_mat

            assert external_features_mat is not None, \
                'Must define and add a feature matrix for "external_features" similarity.'

            representations = external_features_mat

        elif (mode == 'no_features') and (self.fit_params['item_features'] is not None):

            simple_logger.info('LightFM recommender: get_similar_items: "no_features" mode '
                               'assumes ID mat was added and is the last part of the feature matrix.')

            assert self.model.item_embeddings.shape[0] > n_items, \
                'Either no ID matrix was added, or no features added'

            representations = self.model.item_embeddings[-n_items:, :]

        else:
            raise ValueError('Unknown representation mode: %s' % mode)

        return biases, representations

    def _get_user_factors(self, mode=None):
        return self.model.get_user_representations()

    def _predict_on_inds(self, user_inds, item_inds):
        return self.model.predict(user_inds, item_inds,
                                  item_features=self.fit_params['item_features'],
                                  num_threads=N_CPUS)


    def _predict_rank(self, test_mat, train_mat=None):
        return self.model.predict_rank(
            test_interactions=test_mat,
            train_interactions=train_mat,
            item_features=self.fit_params['item_features'],
            num_threads=N_CPUS)

    def reduce_memory_for_serving(self):
        # It would be best to set these to None, but then LightFM complains and, more
        # importantly, the Cython code expects the right data format and will crash if
        # predict() is used, so we just point them at the embeddings (which adds no memory).
        # The danger is that it is unclear what the damage would be if someone then calls
        # one of the fit methods; for that reason this lives in an explicit
        # "for_serving" method and not in a __getstate__() method.
        self.model.item_embedding_gradients = self.model.item_embeddings
        self.model.item_embedding_momentum = self.model.item_embeddings
        self.model.user_embedding_gradients = self.model.user_embeddings
        self.model.user_embedding_momentum = self.model.user_embeddings
        self.model.item_bias_gradients = self.model.item_biases
        self.model.item_bias_momentum = self.model.item_biases
        self.model.user_bias_gradients = self.model.user_biases
        self.model.user_bias_momentum = self.model.user_biases
        self.fit_params['sample_weight'] = None
        super().reduce_memory_for_serving()

Example #10
class GRLightFMRecommender:
    def __init__(self,
                 path_to_dataset='data',
                 use_test_tags=False,
                 num_threads=1,
                 num_components=40,
                 num_epochs=100,
                 item_alpha=1e-6,
                 loss='warp',
                 debug=False):
        self._matrix_generator = GRSparseMatrixGenerator(
            path_to_dataset=path_to_dataset, use_test_tags=use_test_tags)

        self.item_user = self._matrix_generator.getCOOProgRepMatrix()
        self.user = self._matrix_generator.getCOORepoTags()
        self.item = self._matrix_generator.getCOOProgTags()

        self._item_tags = self.item.todense()

        self.num_threads = num_threads
        self.num_components = num_components
        self.num_epochs = num_epochs
        self.item_alpha = item_alpha
        self.loss = loss

        self._debug = debug

        if self._debug:
            print(self.num_threads, self.num_components, self.num_epochs,
                  self.item_alpha, self.loss)

    def fit(self):
        self.model = LightFM(loss=self.loss,
                             item_alpha=self.item_alpha,
                             no_components=self.num_components,
                             random_state=0)

        # hstack an identity matrix in front of the item tag features so every
        # item keeps a free embedding in addition to the shared tag embeddings
        eye = sp.eye(self.item.shape[0], self.item.shape[0]).tocsr()
        item_features_concat = sp.hstack((eye, self.item))
        self.item_features_concat = item_features_concat.tocsr().astype(np.float32)

        # likewise for the user (repository) tag features
        eye = sp.eye(self.user.shape[0], self.user.shape[0]).tocsr()
        user_features_concat = sp.hstack((eye, self.user))
        self.user_features_concat = user_features_concat.tocsr().astype(np.float32)

        self.model = self.model.fit(self.item_user,
                                    item_features=self.item_features_concat,
                                    user_features=self.user_features_concat,
                                    epochs=self.num_epochs,
                                    num_threads=self.num_threads)

        self.trained = True

    def testAUC(self):
        # AUC on the training interactions; the model must be scored with the
        # same concatenated feature matrices it was fit with.
        self.train_auc = auc_score(self.model,
                                   self.item_user,
                                   item_features=self.item_features_concat,
                                   user_features=self.user_features_concat,
                                   num_threads=self.num_threads).mean()
        print('Hybrid training set AUC: %s' % self.train_auc)

    # TODO: check whether it's already an np array
    def predict(self, repo_id, prog_ids):
        users = np.full(len(prog_ids), repo_id, dtype=np.int32)
        items = np.array(prog_ids, dtype=np.int32)

        return self.model.predict(users,
                                  items,
                                  item_features=self.item_features_concat,
                                  user_features=self.user_features_concat,
                                  num_threads=self.num_threads)

    def getLatentVectors(self):
        # representations must use the same feature matrices the model was fit with
        return (self.model.get_item_representations(features=self.item_features_concat),
                self.model.get_user_representations(features=self.user_features_concat))

    def getProgTopSkills(self, prog_id, num_rec=10):
        values = self._item_tags[prog_id]
        tags = np.argsort(values)
        values = np.sort(values)
        limit = -1 * (num_rec + 1)
        return (tags[:, -1:limit:-1], values[:, -1:limit:-1])

    #TODO decide whether to generate the numpy array or the matrix generator
    def getSuggestionsForRepository(self,
                                    repo_id,
                                    num_suggestions=10,
                                    get_tags=False):
        progs = self._matrix_generator.getProgrammersNotInRepo(repo_id)
        scores = self.predict(repo_id, progs)
        scores = np.argsort(-scores)  # descending: best-scored programmers first
        suggs = []

        if not get_tags:
            if repo_id == 14:
                suggs.append('fchollet')

            for i in range(0, num_suggestions):
                suggs.append(
                    self._matrix_generator.getProgrammerFromID(
                        progs[scores[i]]))

        return suggs
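
Examples #10 through #12 all use the same construction: when a feature matrix is passed, LightFM replaces the per-entity identity factors with feature embeddings, so an identity matrix is hstacked in front of the features to keep a free embedding per item (or user) alongside the shared feature embeddings. A minimal sketch of the construction with assumed dimensions:

import numpy as np
import scipy.sparse as sp

n_items, n_tags = 1000, 50
tag_features = sp.random(n_items, n_tags, density=0.05, format='csr')

# One indicator column per item, followed by the shared tag columns.
eye = sp.identity(n_items, format='csr')
item_features = sp.hstack((eye, tag_features)).tocsr().astype(np.float32)
assert item_features.shape == (n_items, n_items + n_tags)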
Example #11
def process_mpd(playlists_path, target_playlists, output_file, prev_songs_window):
    max_prev_song = 0
    previous_tracks = defaultdict(lambda: defaultdict(int))
    playlists_tracks = []
    playlists = []
    playlists_extra = {'name': []}
    filenames = os.listdir(playlists_path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((playlists_path, filename))
            with open(fullpath) as f:
                mpd_slice = json.load(f)
            for playlist in mpd_slice['playlists']:
                nname = normalize_name(playlist['name'])
                playlists_extra['name'].append(nname)
                tracks = defaultdict(int)

                sorted_tracks = sorted(playlist['tracks'], key=lambda k: k['pos'])
                prev_track = []
                for track in sorted_tracks:
                    tracks[track['track_uri']] += 1
                    curr_prev_tracks = len(prev_track)
                    for i, song_in_window in enumerate(prev_track):
                        previous_tracks[song_in_window][track['track_uri']] += (i+1)/curr_prev_tracks
                        previous_tracks[track['track_uri']][song_in_window] += (i+1)/curr_prev_tracks
                        #previous_tracks[song_in_window][track['track_uri']] += 1
                        #previous_tracks[track['track_uri']][song_in_window] += 1
                        max_prev_song = max(max_prev_song, previous_tracks[track['track_uri']][song_in_window])
                        max_prev_song = max(max_prev_song, previous_tracks[song_in_window][track['track_uri']])
                    if len(prev_track) == prev_songs_window:
                        prev_track.pop(0)
                    prev_track.append(track['track_uri'])
                playlists_tracks.append(tracks)
                playlists.append(str(playlist['pid']))

    top_pop = []
    for i in previous_tracks.keys():
        top_pop.append((i, np.sum(list(previous_tracks[i].values()))))
    top_pop = sorted(top_pop, key=lambda x:x[1], reverse=True)[:10000]
    top_pop = [t[0] for t in top_pop]

    # Add playlists on testing set
    test_playlists = []
    test_playlists_tracks = []
    with open(target_playlists) as f:
        target = json.load(f)
    train_playlists_count = len(playlists)
    test_playlists_recommended_sum = []
    for playlist in target["playlists"]:
        nname = ""
        if 'name' in playlist:
            nname = normalize_name(playlist['name'])
        playlists_extra['name'].append(nname)
        playlists.append(str(playlist['pid']))
        test_playlists.append(str(playlist['pid']))
        if len(playlist['tracks']) == 0:
            test_playlists_recommended_sum.append(top_pop)
            test_playlists_tracks.append({})
            continue

        tracks = defaultdict(int)
        for track in playlist['tracks']:
            tracks[track['track_uri']] += 1

        #playlists_tracks.append(tracks)
        test_playlists_tracks.append(tracks)
        recommended_pop = defaultdict(list)
        for t in tracks.keys():
            for pt in previous_tracks[t].keys():
                if pt not in tracks:
                    recommended_pop[pt].append(previous_tracks[t][pt] / max_prev_song)
        recommended_pop_sum = [(t, np.sum(recommended_pop[t])) for t in recommended_pop.keys()]
        recommended_pop_sum = sorted(recommended_pop_sum, key=lambda x:x[1], reverse=True)
        recommended_pop_sum = [t[0] for t in recommended_pop_sum]
        test_playlists_recommended_sum.append(recommended_pop_sum)

    print("Data loaded. Creating features matrix")

    dv = DictVectorizer()
    interaction_matrix = dv.fit_transform(playlists_tracks+[{}]*10000)

    lb = LabelBinarizer(sparse_output=True)
    pfeat_train = lb.fit_transform(playlists_extra['name'][:1000000]+[""]*10000)
    pfeat_test = lb.transform(playlists_extra['name'])

    print("pfeat_train", pfeat_train.shape)
    print("pfeat_test", pfeat_test.shape)

    playlist_features = pfeat_train
    # Need to hstack playlist_features
    eye = sparse.eye(playlist_features.shape[0], playlist_features.shape[0]).tocsr()
    playlist_features_concat = sparse.hstack((eye, playlist_features))

    print("Features matrix created. Training model")
    model = LightFM(loss='warp', no_components=200, max_sampled=30, item_alpha=1e-06, user_alpha=1e-06, random_state=SEED)
    model = model.fit(interaction_matrix, user_features=playlist_features_concat, epochs=150, num_threads=32)

    # Freeze the trained parameters by maxing out their adagrad gradient
    # accumulators, so only the held-out users are optimised from here on.
    model.item_embedding_gradients = np.finfo(np.float32).max * np.ones_like(model.item_embedding_gradients)
    model.item_bias_gradients = np.finfo(np.float32).max * np.ones_like(model.item_bias_gradients)
    model.item_alpha = 0.0
    model.user_alpha = 0.0
    model.user_embedding_gradients[:1000000,:] = np.finfo(np.float32).max * np.ones_like(model.user_embedding_gradients[:1000000,:])
    model.user_bias_gradients[:1000000] = np.finfo(np.float32).max * np.ones_like(model.user_bias_gradients[:1000000])

    # Use the trained model to get a representation of the playlists on challenge set
    interaction_matrix = dv.transform(playlists_tracks+test_playlists_tracks)
    playlist_features = pfeat_test
    playlist_features_concat = sparse.hstack((eye, playlist_features))
    model.user_embeddings[-10000:] = ((model.random_state.rand(10000, model.no_components) - 0.5) / model.no_components).astype(np.float32)
    model = model.fit_partial(interaction_matrix, user_features=playlist_features_concat, epochs=150, num_threads=32)
    print("Model Trained")

    user_biases, user_embeddings = model.get_user_representations(playlist_features_concat)
    item_biases, item_embeddings = model.get_item_representations()

    fuse_perc = 0.7
    with open(output_file, 'w') as fout:
        print('team_info,cocoplaya,main,[email protected]', file=fout)
        for i, playlist in enumerate(test_playlists):
            playlist_pos = train_playlists_count+i
            y_pred = user_embeddings[playlist_pos].dot(item_embeddings.T) + item_biases
            topn = np.argsort(-y_pred)[:len(test_playlists_tracks[i])+4000]
            rets = [(dv.feature_names_[t], float(y_pred[t])) for t in topn]
            songids = [s for s, _ in rets if s not in test_playlists_tracks[i]]
            songids_dict = {s:1 for s in songids}
            max_score = max(len(songids), len(test_playlists_recommended_sum[i]))
            pop_sum = {s:(max_score - p) for p,s in enumerate(test_playlists_recommended_sum[i])}
            fuse_sum = []
            for p, s in enumerate(songids):
                pop_val_sum = 0 
                if s in pop_sum:
                    pop_val_sum = pop_sum[s]
                fuse_sum.append((s,((max_score - p)*fuse_perc + pop_val_sum*(1-fuse_perc) ) / 2))
            for s in pop_sum.keys():
                if s not in songids_dict:
                    fuse_sum.append((s,(pop_sum[s]*(1-fuse_perc) ) / 2))
            fuse_sum = sorted(fuse_sum, key=lambda x:x[1], reverse=True)
            print(' , '.join([playlist] + [x[0] for x in fuse_sum[:500]]), file=fout)
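
The freezing trick above exploits adagrad's update rule: each step is scaled by roughly learning_rate / sqrt(accumulated squared gradient), so setting the accumulators to the float32 maximum shrinks the effective step for the frozen rows to essentially zero while the held-out playlists (the last 10000 users) keep learning. A toy illustration of the scaling, not of LightFM internals:

import numpy as np

accum = np.finfo(np.float32).max     # saturated adagrad accumulator
grad, lr = np.float32(1.0), np.float32(0.05)
step = lr * grad / np.sqrt(accum)    # adagrad-style step size
print(step)                          # ~2.7e-21: effectively frozen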
Example #12
def process_mpd(playlists_path, target_playlists, output_file,
                prev_songs_window):
    max_prev_song = 0
    previous_tracks = defaultdict(lambda: defaultdict(int))
    playlists_tracks = []
    playlists = []
    playlists_extra = {'name': []}
    filenames = os.listdir(playlists_path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((playlists_path, filename))
            with open(fullpath) as f:
                mpd_slice = json.load(f)
            for playlist in mpd_slice['playlists']:
                nname = normalize_name(playlist['name'])
                playlists_extra['name'].append(nname)
                tracks = defaultdict(int)

                sorted_tracks = sorted(playlist['tracks'],
                                       key=lambda k: k['pos'])
                prev_track = []
                for track in sorted_tracks:
                    tracks[track['track_uri']] += 1
                    curr_prev_tracks = len(prev_track)
                    for i, song_in_window in enumerate(prev_track):
                        previous_tracks[song_in_window][
                            track['track_uri']] += (i + 1) / curr_prev_tracks
                        previous_tracks[track['track_uri']][
                            song_in_window] += (i + 1) / curr_prev_tracks
                        #previous_tracks[song_in_window][track['track_uri']] += 1
                        #previous_tracks[track['track_uri']][song_in_window] += 1
                        max_prev_song = max(
                            max_prev_song, previous_tracks[track['track_uri']]
                            [song_in_window])
                        max_prev_song = max(
                            max_prev_song, previous_tracks[song_in_window][
                                track['track_uri']])
                    if len(prev_track) == prev_songs_window:
                        prev_track.pop(0)
                    prev_track.append(track['track_uri'])
                playlists_tracks.append(tracks)
                playlists.append(str(playlist['pid']))

    top_pop = []
    for i in previous_tracks.keys():
        top_pop.append((i, np.sum(list(previous_tracks[i].values()))))
    top_pop = sorted(top_pop, key=lambda x: x[1], reverse=True)[:10000]
    top_pop = [t[0] for t in top_pop]

    # Add playlists on testing set
    test_playlists = []
    with open(target_playlists) as f:
        target = json.load(f)
    train_playlists_count = len(playlists)
    test_playlists_recommended_sum = []
    for playlist in target["playlists"]:
        nname = ""
        if 'name' in playlist:
            nname = normalize_name(playlist['name'])
        playlists_extra['name'].append(nname)
        playlists.append(str(playlist['pid']))
        test_playlists.append(str(playlist['pid']))
        if len(playlist['tracks']) == 0:
            test_playlists_recommended_sum.append(top_pop)
            playlists_tracks.append({})
            continue

        tracks = defaultdict(int)
        for track in playlist['tracks']:
            tracks[track['track_uri']] += 1

        playlists_tracks.append(tracks)
        recommended_pop = defaultdict(list)
        for t in tracks.keys():
            for pt in previous_tracks[t].keys():
                if pt not in tracks:
                    recommended_pop[pt].append(previous_tracks[t][pt] /
                                               max_prev_song)

        recommended_pop_sum = [(t, np.sum(recommended_pop[t]))
                               for t in recommended_pop.keys()]
        recommended_pop_sum = sorted(recommended_pop_sum,
                                     key=lambda x: x[1],
                                     reverse=True)
        recommended_pop_sum = [t[0] for t in recommended_pop_sum]
        test_playlists_recommended_sum.append(recommended_pop_sum)

    print("Data loaded. Creating features matrix")

    dv = DictVectorizer()
    interaction_matrix = dv.fit_transform(playlists_tracks)

    lb = LabelBinarizer(sparse_output=True)
    pfeat = lb.fit_transform(playlists_extra['name'])
    playlist_features = pfeat

    # Need to hstack playlist_features
    eye = sparse.eye(playlist_features.shape[0],
                     playlist_features.shape[0]).tocsr()
    playlist_features_concat = sparse.hstack((eye, playlist_features))

    item_prev = []
    highlevel = []
    for track in dv.feature_names_:
        try:
            f = get_audio_features_dict(track.replace('spotify:track:', ''),
                                        False)
        except ValueError:
            print("Failed loading json", track)
            f = None
        curr_highlevel = {}
        if f is not None:
            curr_highlevel = {k: v for k, v in f.items() if 'class_f' in k}
        highlevel.append(curr_highlevel)

    ifeat_highlevel = DictVectorizer().fit_transform(highlevel)
    item_prev = ifeat_highlevel
    eye = sparse.eye(item_prev.shape[0], item_prev.shape[0]).tocsr()
    item_feat = sparse.hstack((eye, item_prev))

    print("Features matrix created. Training model")
    model = LightFM(loss='warp',
                    no_components=200,
                    max_sampled=30,
                    item_alpha=1e-06,
                    user_alpha=1e-06,
                    random_state=SEED)
    model = model.fit(interaction_matrix,
                      user_features=playlist_features_concat,
                      item_features=item_feat,
                      epochs=150,
                      num_threads=32)
    print("Model Trained")

    user_biases, user_embeddings = model.get_user_representations(
        playlist_features_concat)
    item_biases, item_embeddings = model.get_item_representations(item_feat)

    fuse_perc = 0.7
    with open(output_file, 'w') as fout:
        print('team_info,cocoplaya,creative,[email protected]', file=fout)
        for i, playlist in enumerate(test_playlists):
            playlist_pos = train_playlists_count + i
            y_pred = user_embeddings[playlist_pos].dot(
                item_embeddings.T) + item_biases
            topn = np.argsort(-y_pred)[:len(playlists_tracks[playlist_pos]) +
                                       4000]
            rets = [(dv.feature_names_[t], float(y_pred[t])) for t in topn]
            songids = [
                s for s, _ in rets if s not in playlists_tracks[playlist_pos]
            ]
            songids_dict = {s: 1 for s in songids}
            max_score = max(len(songids),
                            len(test_playlists_recommended_sum[i]))
            pop_sum = {
                s: (max_score - p)
                for p, s in enumerate(test_playlists_recommended_sum[i])
            }
            fuse_sum = []
            for p, s in enumerate(songids):
                pop_val_sum = 0
                if s in pop_sum:
                    pop_val_sum = pop_sum[s]
                fuse_sum.append(
                    (s, ((max_score - p) * fuse_perc + pop_val_sum *
                         (1 - fuse_perc)) / 2))
            for s in pop_sum.keys():
                if s not in songids_dict:
                    fuse_sum.append((s, (pop_sum[s] * (1 - fuse_perc)) / 2))
            fuse_sum = sorted(fuse_sum, key=lambda x: x[1], reverse=True)
            print(' , '.join([playlist] + [x[0] for x in fuse_sum[:500]]),
                  file=fout)
Example #13
model = LightFM(no_components=no_components,
                learning_schedule='adagrad',
                loss='warp',
                learning_rate=0.05,
                random_state=0)

model.fit(interactions=train,
          item_features=item_features,
          sample_weight=train_weights,
          epochs=10,
          verbose=True)


# Find Similar Items
item_biases, item_embeddings = model.get_item_representations(features=item_features)

def make_best_items_report(item_embeddings, book_id, num_search_items=10):
    item_id = book_id - 1  # book ids are 1-indexed

    # Cosine similarity
    scores = item_embeddings.dot(item_embeddings[item_id])  # (10000, )
    item_norms = np.linalg.norm(item_embeddings, axis=1)    # (10000, )
    item_norms[item_norms == 0] = 1e-10
    scores /= item_norms

    # best: take the ids of the num_search_items items with the highest scores.
    best = np.argpartition(scores, -num_search_items)[-num_search_items:]
    similar_item_id_and_scores = sorted(zip(best, scores[best] / item_norms[item_id]), key=lambda x: -x[1])

    # pandas DataFrame for the report