Example No. 1
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import auc_score, recall_at_k
from sklearn.metrics import dcg_score  # assumed source of dcg_score


def main():
    movielens = fetch_movielens()

    train = movielens['train']
    test = movielens['test']
    print(train.shape)
    print(test.shape)

    model = LightFM(learning_rate=0.05, loss='bpr')
    model.fit(train, epochs=5)

    k = 10
    train_recall = recall_at_k(model, train, k=k).mean()
    test_recall = recall_at_k(model, test, k=k).mean()
    print(f'recall_at_{k}(train): {train_recall}')
    print(f'recall_at_{k}(test) : {test_recall}')

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()
    print(f'auc_score(train): {train_auc}')
    print(f'auc_score(test) : {test_auc}')

    # predict_rank returns 0-based ranks (lower is better), while dcg_score
    # treats higher scores as better, so scoring raw ranks inverts the
    # ordering; see the note after this example.
    y_train_preds = model.predict_rank(train)
    y_test_preds = model.predict_rank(test)
    train_dcg = dcg_score(train.toarray(), y_train_preds.toarray())
    test_dcg = dcg_score(test.toarray(), y_test_preds.toarray())
    print(f'dcg_score(train): {train_dcg}')
    print(f'dcg_score(test) : {test_dcg}')

    print('DONE')

    return 0
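Note: predict_rank returns 0-based ranks where 0 marks the best item, while scikit-learn's dcg_score treats higher values as more relevant, so passing raw ranks inverts the ordering. A minimal sketch of flipping ranks into relevance-style scores first (ranks_to_scores is an illustrative helper, not part of LightFM):

import numpy as np

def ranks_to_scores(ranks):
    # Map 0-based ranks (0 = best) to descending scores so that
    # dcg_score sees larger values for better-ranked items.
    n_items = ranks.shape[1]
    return n_items - 1 - ranks

print(ranks_to_scores(np.array([[0, 2, 1]])))  # -> [[2 0 1]]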
Example No. 2
import numpy as np
import scipy.sparse as sp

from lightfm import LightFM


def test_matrix_types():

    mattypes = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix)

    dtypes = (np.int32, np.int64, np.float32, np.float64)

    no_users, no_items = (10, 100)
    no_features = 20

    for mattype in mattypes:
        for dtype in dtypes:
            train = mattype((no_users, no_items), dtype=dtype)
            weights = train.tocoo()

            user_features = mattype((no_users, no_features), dtype=dtype)
            item_features = mattype((no_items, no_features), dtype=dtype)

            model = LightFM()
            model.fit_partial(train,
                              sample_weight=weights,
                              user_features=user_features,
                              item_features=item_features)

            model.predict(np.random.randint(0, no_users, 10).astype(np.int32),
                          np.random.randint(0, no_items, 10).astype(np.int32),
                          user_features=user_features,
                          item_features=item_features)

            model.predict_rank(train,
                               user_features=user_features,
                               item_features=item_features)
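LightFM expects sample_weight as a COO matrix whose row and col arrays match the interactions, which is why the test converts with train.tocoo(). A minimal sketch of fitting with real, non-empty weights under the same shapes (the values are arbitrary):

import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

rows, cols = [0, 1, 2], [0, 5, 9]
train = sp.coo_matrix((np.ones(3, dtype=np.float32), (rows, cols)),
                      shape=(10, 100))
weights = sp.coo_matrix((np.array([0.5, 1.0, 2.0], dtype=np.float32),
                         (rows, cols)),
                        shape=(10, 100))
LightFM().fit(train, sample_weight=weights)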
Example No. 3
import numpy as np
import pytest
import scipy.sparse as sp

from lightfm import LightFM


def test_predict_ranks():

    no_users, no_items = (10, 100)

    train = sp.coo_matrix((no_users,
                           no_items),
                          dtype=np.float32)

    model = LightFM()
    model.fit_partial(train)

    # Compute ranks for all items
    rank_input = sp.csr_matrix(np.ones((no_users, no_items)))
    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == no_items - 1)

    for row in range(no_users):
        assert np.all(np.sort(ranks[row]) == np.arange(no_items))

    # Make sure this is true also when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_rank(sp.csr_matrix((5, 5)), num_threads=2)
Example No. 4
import numpy as np
import pytest

from lightfm import LightFM


def test_predict_not_fitted():

    model = LightFM()

    with pytest.raises(ValueError):
        model.predict(np.arange(10), np.arange(10))

    with pytest.raises(ValueError):
        model.predict_rank(1)

    with pytest.raises(ValueError):
        model.get_user_representations()

    with pytest.raises(ValueError):
        model.get_item_representations()
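For contrast, a sketch of the smallest fit that makes these calls legal (the 5x5 identity interactions are arbitrary):

import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

model = LightFM()
model.fit(sp.coo_matrix(np.eye(5, dtype=np.float32)))
model.predict(np.arange(5, dtype=np.int32),
              np.arange(5, dtype=np.int32))  # no longer raises
model.get_user_representations()             # returns (biases, embeddings)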
Example No. 5
import numpy as np
import pytest
import scipy.sparse as sp

from lightfm import LightFM


def test_predict_ranks():

    no_users, no_items = (10, 100)

    # ~1% random nonzero interactions, seeded for reproducibility
    train = sp.rand(no_users, no_items, format='csr', random_state=42)

    model = LightFM()
    model.fit_partial(train)

    # Compute ranks for all items
    rank_input = sp.csr_matrix(np.ones((no_users, no_items)))
    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == no_items - 1)

    for row in range(no_users):
        assert np.all(np.sort(ranks[row]) == np.arange(no_items))

    # Train set exclusions. All ranks should be zero
    # if train interactions is dense.
    ranks = model.predict_rank(rank_input,
                               train_interactions=rank_input).todense()
    assert np.all(ranks == 0)

    # Max rank should be num_items - 1 - number of positives
    # in train in that row
    ranks = model.predict_rank(rank_input,
                               train_interactions=train).todense()
    assert np.all(np.squeeze(np.array(ranks.max(axis=1))) ==
                  no_items - 1 - np.squeeze(np.array(train.getnnz(axis=1))))

    # Make sure ranks are computed pessimistically when
    # there are ties (that is, equal predictions for every
    # item will assign maximum rank to each).
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == no_items - 1)
    assert np.all(ranks.max(axis=1) == no_items - 1)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_rank(sp.csr_matrix((5, 5)), num_threads=2)
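Note that Examples No. 3 and No. 6 expect tied items to share rank 0 (optimistic) while this example expects no_items - 1 (pessimistic); the convention differs across LightFM versions. A minimal NumPy sketch of the pessimistic convention, in which every tied item receives the worst rank in its group:

import numpy as np

def pessimistic_ranks(scores):
    # 0-based rank = number of items scoring >= this item, minus one;
    # with all-equal scores every item maps to n - 1.
    scores = np.asarray(scores)
    return (scores[None, :] >= scores[:, None]).sum(axis=1) - 1

print(pessimistic_ranks([3.0, 1.0, 3.0]))  # -> [1 2 1]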
Example No. 6
import numpy as np
import pytest
import scipy.sparse as sp

from lightfm import LightFM


def test_predict_ranks():

    no_users, no_items = (10, 100)

    # ~1% random nonzero interactions
    train = sp.rand(no_users, no_items, format='csr')

    model = LightFM()
    model.fit_partial(train)

    # Compute ranks for all items
    rank_input = sp.csr_matrix(np.ones((no_users, no_items)))
    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == no_items - 1)

    for row in range(no_users):
        assert np.all(np.sort(ranks[row]) == np.arange(no_items))

    # Train set exclusions. All ranks should be zero
    # if train interactions is dense.
    ranks = model.predict_rank(rank_input,
                               train_interactions=rank_input).todense()
    assert np.all(ranks == 0)

    # Max rank should be num_items - 1 - number of positives
    # in train in that row
    ranks = model.predict_rank(rank_input,
                               train_interactions=train).todense()
    assert np.all(np.squeeze(np.array(ranks.max(axis=1))) ==
                  no_items - 1 - np.squeeze(np.array(train.getnnz(axis=1))))

    # Make sure invariants hold when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_rank(sp.csr_matrix((5, 5)), num_threads=2)
Example No. 7
import numpy as np
import pytest
import scipy.sparse as sp

from lightfm import LightFM

# Note: model.predict_score is not part of released LightFM; this test
# appears to target a patched build that adds it alongside predict_rank.


def test_predict_scores(num_threads=2):

    no_users, no_items = (10, 100)

    # ~1% random nonzero interactions
    train = sp.rand(no_users, no_items, format='csr')

    model = LightFM()
    model.fit_partial(train)

    # Compute scores and check that the results match model.predict
    predict_input = sp.csr_matrix(np.ones((no_users, no_items)))
    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()
    for uid in range(no_users):
        scores_arr = model.predict(np.repeat(uid, no_items),
                                   np.arange(no_items))
        score_slice = np.array(scores)[uid, :]
        assert np.array_equal(score_slice, scores_arr)

    # check if precompute and parallelization work correctly
    scores_serial = model.predict_score(predict_input,
                                        num_threads=1).todense()
    scores_no_prec = model.predict_score(predict_input,
                                         num_threads=num_threads,
                                         precompute_representations=False
                                         ).todense()
    scores_ser_no_prec = model.predict_score(predict_input,
                                             num_threads=1,
                                             precompute_representations=False
                                             ).todense()
    assert np.array_equal(scores, scores_serial)
    assert np.array_equal(scores, scores_no_prec)
    assert np.array_equal(scores, scores_ser_no_prec)

    # Compute ranks and compare them with ranks derived from the scores
    ranks = model.predict_rank(predict_input,
                               num_threads=num_threads).todense()

    def rank_scores(s):
        # ranks from scores as in http://stackoverflow.com/a/14672797/5251962
        u, v = np.unique(s, return_inverse=True)
        return len(s) - 1 - (np.cumsum(np.bincount(v)) - 1)[v]

    check_ranks = np.apply_along_axis(rank_scores, 1, scores)
    assert np.array_equal(ranks, check_ranks)

    # Train set exclusions. All scores should be zero
    # if train interactions is dense.
    scores = model.predict_score(predict_input,
                                 train_interactions=predict_input).todense()
    assert np.all(scores == 0)

    # Make sure invariants hold when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()

    assert np.all(scores.min(axis=1) == 0)
    assert np.all(scores.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_score(sp.csr_matrix((5, 5)), num_threads=num_threads)
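The rank_scores helper computes optimistic competition ranks: each item's rank is the number of items with a strictly higher score, so ties share the best rank in their group. An equivalent, arguably clearer formulation with scipy.stats.rankdata (a sketch):

import numpy as np
from scipy.stats import rankdata

def rank_scores_alt(s):
    # method='min' assigns every tie the best rank of its group;
    # negating s turns ascending rankdata ranks into descending ones.
    return (rankdata(-np.asarray(s), method='min') - 1).astype(int)

print(rank_scores_alt([3.0, 1.0, 3.0]))  # -> [0 2 0]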
Example No. 8
import numpy as np

from lightfm import LightFM

# N_CPUS, simple_logger, and BaseFactorizationRecommender come from the
# surrounding project and are assumed to be importable here.


class LightFMRecommender(BaseFactorizationRecommender):

    default_model_params = {
        'loss': 'warp',
        'learning_schedule': 'adadelta',
        'no_components': 30,
        'max_sampled': 10,
        'item_alpha': 0,
        'user_alpha': 0,
    }

    default_fit_params = {
        'epochs': 100,
        'item_features': None,
        'num_threads': N_CPUS,
        'verbose': True,
    }

    default_external_features_params = dict(add_identity_mat=True)

    def __init__(self,
                 use_sample_weight=False,
                 external_features=None,
                 external_features_params=None,
                 initialiser_model=None,
                 initialiser_scale=0.1,
                 **kwargs):
        self.use_sample_weight = use_sample_weight
        self.external_features = external_features
        self.external_features_params = external_features_params or \
                                        self.default_external_features_params.copy()
        self.initialiser_model = initialiser_model
        self.initialiser_scale = initialiser_scale
        super().__init__(**kwargs)

    def _prep_for_fit(self, train_obs, **fit_params):
        # self.toggle_mkl_blas_1_thread(True)
        # assign all observation data
        self._set_data(train_obs)
        fit_params['sample_weight'] = self.train_mat.tocoo() \
            if self.use_sample_weight else None
        self._set_fit_params(fit_params)
        self._add_external_features()
        # init model and set params
        self.model = LightFM(**self.model_params)
        if self.initialiser_model is not None:
            self._initialise_from_model(train_obs)

    def _initialise_from_model(self, train_obs):
        # fit initialiser model (this is done here to prevent any data leaks from passing fitted models)
        simple_logger.info('Training %s model to initialise LightFM model.' % str(self.initialiser_model))
        self.initialiser_model.fit(train_obs)
        self._reuse_data(self.initialiser_model)
        # have the internals initialised
        self.model.fit_partial(self.train_mat, epochs=0)

        # transplant factors from the initialiser model
        self.model.item_embeddings = self.initialiser_model._get_item_factors()[1]
        self.model.user_embeddings = self.initialiser_model._get_user_factors()[1]

        # scale the factors to be of similar scale
        scale = self.initialiser_scale
        self.model.item_embeddings *= scale / np.mean(np.abs(self.model.item_embeddings))
        self.model.user_embeddings *= scale / np.mean(np.abs(self.model.user_embeddings))


    def _add_external_features(self):
        if self.external_features is not None:
            self.external_features_mat = self.external_features.\
                fit_transform_ids_df_to_mat(
                    items_encoder=self.sparse_mat_builder.iid_encoder,
                    **self.external_features_params)
            simple_logger.info('External item features matrix: %s' %
                            str(self.external_features_mat.shape))

        # add external features if specified; note that external_features_mat
        # is assumed to be initialised elsewhere (e.g. by the base class) when
        # self.external_features is None
        self.fit_params['item_features'] = self.external_features_mat
        if self.external_features_mat is not None:
            simple_logger.info('Fitting using external features mat: %s'
                               % str(self.external_features_mat.shape))

    def fit(self, train_obs, **fit_params):
        self._prep_for_fit(train_obs, **fit_params)
        self.model.fit_partial(self.train_mat, **self.fit_params)
        return self

    def fit_partial(self, train_obs, epochs=1):
        self._set_epochs(epochs)
        if self.model is None:
            self.fit(train_obs)
        else:
            self.model.fit_partial(self.train_mat)
        return self

    def fit_batches(self, train_obs, train_dfs, epochs_per_batch=None, **fit_params):
        self._prep_for_fit(train_obs)
        for i, df in enumerate(train_dfs):
            batch_train_mat = self.sparse_mat_builder.build_sparse_interaction_matrix(df)

            if epochs_per_batch is not None:
                fit_params['epochs'] = epochs_per_batch

            fit_params['sample_weight'] = batch_train_mat.tocoo() \
                if self.use_sample_weight else None

            self._set_fit_params(fit_params)

            simple_logger.info('Fitting batch %d (%d interactions)' % (i, len(df)))
            self.model.fit_partial(batch_train_mat, **self.fit_params)

    def _set_epochs(self, epochs):
        self.set_params(epochs=epochs)

    def set_params(self, **params):
        params = self._pop_set_params(
            params, ['use_sample_weight', 'external_features', 'external_features_params',
                     'initialiser_model', 'initialiser_scale'])
        super().set_params(**params)

    def _get_item_factors(self, mode=None):

        n_items = len(self.sparse_mat_builder.iid_encoder.classes_)

        biases, representations = self.model.get_item_representations(self.fit_params['item_features'])

        if mode is None:
            pass  # default mode

        elif mode == 'external_features':
            external_features_mat = self.external_features_mat

            assert external_features_mat is not None, \
                'Must define and add a feature matrix for "external_features" similarity.'

            representations = external_features_mat

        elif (mode == 'no_features') and (self.fit_params['item_features'] is not None):

            simple_logger.info('LightFM recommender: get_similar_items: "no_features" mode '
                               'assumes ID mat was added and is the last part of the feature matrix.')

            assert self.model.item_embeddings.shape[0] > n_items, \
                'Either no ID matrix was added, or no features added'

            representations = self.model.item_embeddings[-n_items:, :]

        else:
            raise ValueError('Unknown representation mode: %s' % mode)

        return biases, representations

    def _get_user_factors(self, mode=None):
        return self.model.get_user_representations()

    def _predict_on_inds(self, user_inds, item_inds):
        return self.model.predict(user_inds, item_inds,
                                  item_features=self.fit_params['item_features'],
                                  num_threads=N_CPUS)


    def _predict_rank(self, test_mat, train_mat=None):
        return self.model.predict_rank(
            test_interactions=test_mat,
            train_interactions=train_mat,
            item_features=self.fit_params['item_features'],
            num_threads=N_CPUS)

    def reduce_memory_for_serving(self):
        # It would be best to set these to None, but then LightFM complains, and
        # more importantly the Cython code expects the right data format and will
        # crash if predict() is called, so we just point them at the embeddings
        # (which adds no memory). The risk is unknown damage if a fit method is
        # called afterwards; hence an explicit "for_serving" method rather than
        # a __getstate__() hook.
        self.model.item_embedding_gradients = self.model.item_embeddings
        self.model.item_embedding_momentum = self.model.item_embeddings
        self.model.user_embedding_gradients = self.model.user_embeddings
        self.model.user_embedding_momentum = self.model.user_embeddings
        self.model.item_bias_gradients = self.model.item_biases
        self.model.item_bias_momentum = self.model.item_biases
        self.model.user_bias_gradients = self.model.user_biases
        self.model.user_bias_momentum = self.model.user_biases
        self.fit_params['sample_weight'] = None
        super().reduce_memory_for_serving()
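A hypothetical usage sketch (train_obs, test_mat, and the base-class machinery all come from the surrounding project; the calls below only illustrate the intended flow):

rec = LightFMRecommender(use_sample_weight=True)
rec.fit(train_obs, epochs=20)  # extra kwargs are forwarded as fit params
ranks = rec._predict_rank(test_mat, train_mat=rec.train_mat)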
Example No. 9
    print("Splitting the data into train/test set...\n")
    train, test = cross_validation.random_train_test_split(user_items_train)
    # print(train,test)
    # print(train.shape(),test.shape())

    model1 = LightFM(learning_rate=0.05, loss='bpr')
    model2 = LightFM(learning_rate=0.05, loss='warp')

    print("Fitting models of BPR & WARP ranking losses...\n")
    model1.fit(train, epochs=10)
    model2.fit(train, epochs=10)
    #ranks = model.predict(user_items_train,num_threads=1)
    #print(ranks)

    res = model1.predict_rank(test)
    print(res)
    print("Evaluating methods...\n")

    train_recall1_10 = recall_at_k(model1, train, k=10).mean()
    test_recall1_10 = recall_at_k(model1, test, k=10).mean()

    train_recall1_20 = recall_at_k(model1, train, k=20).mean()
    test_recall1_20 = recall_at_k(model1, test, k=20).mean()

    #train_mrr1 = reciprocal_rank(model1, train).mean()
    #train_mrr_20 = reciprocal_rank(model1, user_items_train).mean()
    #train_mrr2 = reciprocal_rank(model2, user_items_train).mean()

    train_recall2_10 = recall_at_k(model2, train, k=10).mean()
    test_recall2_10 = recall_at_k(model2, test, k=10).mean()
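The fragment computes recall metrics but never reports them; a small sketch of the comparison printout it appears to be building toward (same variables and indentation as above):

    print(f"BPR  recall@10 train/test: {train_recall1_10:.4f} / {test_recall1_10:.4f}")
    print(f"BPR  recall@20 train/test: {train_recall1_20:.4f} / {test_recall1_20:.4f}")
    print(f"WARP recall@10 train/test: {train_recall2_10:.4f} / {test_recall2_10:.4f}")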
Example No. 10
import numpy as np
import pytest
import scipy.sparse as sp

from lightfm import LightFM

# As in Example No. 7, model.predict_score targets a patched LightFM build.


def test_predict_scores(num_threads=2):

    no_users, no_items = (10, 100)

    # ~1% random nonzero interactions
    train = sp.rand(no_users, no_items, format='csr')

    model = LightFM()
    model.fit_partial(train)

    # Compute scores and check that the results match model.predict
    predict_input = sp.csr_matrix(np.ones((no_users, no_items)))
    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()
    for uid in range(no_users):
        scores_arr = model.predict(np.repeat(uid, no_items),
                                   np.arange(no_items))
        score_slice = np.array(scores)[uid, :]
        assert np.array_equal(score_slice, scores_arr)

    # check if precompute and parallelization work correctly
    scores_serial = model.predict_score(predict_input, num_threads=1).todense()
    scores_no_prec = model.predict_score(
        predict_input,
        num_threads=num_threads,
        precompute_representations=False).todense()
    scores_ser_no_prec = model.predict_score(
        predict_input, num_threads=1,
        precompute_representations=False).todense()
    assert np.array_equal(scores, scores_serial)
    assert np.array_equal(scores, scores_no_prec)
    assert np.array_equal(scores, scores_ser_no_prec)

    # Compute ranks and compare them with ranks derived from the scores
    ranks = model.predict_rank(predict_input,
                               num_threads=num_threads).todense()

    def rank_scores(s):
        # ranks from scores as in http://stackoverflow.com/a/14672797/5251962
        u, v = np.unique(s, return_inverse=True)
        return len(s) - 1 - (np.cumsum(np.bincount(v)) - 1)[v]

    check_ranks = np.apply_along_axis(rank_scores, 1, scores)
    assert np.array_equal(ranks, check_ranks)

    # Train set exclusions. All scores should be zero
    # if train interactions is dense.
    scores = model.predict_score(predict_input,
                                 train_interactions=predict_input).todense()
    assert np.all(scores == 0)

    # Make sure invariants hold when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()

    assert np.all(scores.min(axis=1) == 0)
    assert np.all(scores.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_score(sp.csr_matrix((5, 5)), num_threads=num_threads)
Example No. 11
import numpy as np
from scipy import sparse

from lightfm import LightFM
from lightfm.data import Dataset

# get_user_features() and get_data() are project-specific helpers assumed
# to be importable in this context.


def train_model():

    # user features
    user_features, user_feature_names = get_user_features()

    # create data
    data_ws = Dataset(user_identity_features=True)  # warm start

    # create map between user_id, post_id, user_features and internal indices
    data_ws.fit((x['user_id'] for x in get_data()),
                (x['post_id'] for x in get_data()),
                user_features=user_features)

    #---------------------------
    # Building the interactions matrix
    #---------------------------
    # create interaction matrix to optimize
    (interactions_ws, weights_ws) = data_ws.build_interactions(
        ((x['user_id'], x['post_id']) for x in get_data()))
    print(repr(interactions_ws))

    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = \
        data_ws.mapping()

    #---------------------------
    # train model
    #---------------------------
    # initialize model
    model_warp_ws = LightFM(learning_rate=0.05,
                            loss='warp',
                            no_components=len(user_feature_names))

    # train model
    model_warp_ws.fit(interactions_ws, user_features=user_features, epochs=30)

    #---------------------------
    # make predictions
    #---------------------------
    # make predictions for all users
    prediction_ws = model_warp_ws.predict_rank(interactions_ws,
                                               user_features=user_features)

    # create an identity matrix representing the user features of each
    # hypothetical single-feature user
    user_features_identity = sparse.csr_matrix(
        np.identity(len(user_feature_names)))

    # make prediction for hypothetical user
    prediction_hypo = []

    for user_irt in range(len(user_feature_names)):

        # calculate prediction scores for the hypothetical user whose only
        # feature is user_irt (user_ids indexes into user_features_identity;
        # the original code passed user_ids=0 on every iteration, which made
        # the loop a no-op)
        prediction_score = model_warp_ws.predict(
            user_ids=user_irt,
            item_ids=np.array(list(item_id_map.values())),
            user_features=user_features_identity)

        # pair each score with its external post_id (iterating item_id_map
        # yields its keys, i.e. the original post ids)
        prediction_zipped = zip(prediction_score, item_id_map)

        # sort by prediction score
        prediction_sorted = sorted(prediction_zipped,
                                   key=lambda x: x[0],
                                   reverse=True)

        # add to list of hypothetical users
        prediction_hypo.append(prediction_sorted)

    return prediction_hypo, prediction_ws, user_id_map, item_id_map, user_feature_names
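Each entry of prediction_hypo is already a list of (score, post_id) pairs sorted by descending score, so reporting the top posts per hypothetical single-feature user is straightforward (a sketch using the return values above):

prediction_hypo, _, _, _, feature_names = train_model()
for feature, preds in zip(feature_names, prediction_hypo):
    top_posts = [post_id for _, post_id in preds[:5]]
    print(f'{feature}: {top_posts}')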