Exemplo n.º 1
0
def evaluation():
    """Train a WARP LightFM model on MovieLens and print its accuracy.

    The model is fitted for 30 epochs on the ``'train'`` split returned by
    ``fetch_movielens``. Mean precision@10 and mean AUC are then printed for
    both the train and the test split.
    """
    print("\nStarting evaluation our model...")

    model = LightFM(loss='warp')

    # Load the dataset once and reuse it for both splits.
    movielens = fetch_movielens()
    train = movielens['train']
    test = movielens['test']

    model.fit_partial(train, epochs=30, num_threads=2)

    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10).mean()

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()

    print('Precision: train %.2f, test %.2f.' %
          (train_precision, test_precision))
    print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
Exemplo n.º 2
0
def test_movielens_genre_accuracy():
    """Check ROC AUC of a model fitted with genre-only item features."""

    movielens = fetch_movielens(indicator_features=False,
                                genre_features=True)
    item_features = movielens['item_features']

    # The feature matrix must have fewer columns than rows.
    assert item_features.shape[1] < item_features.shape[0]

    model = LightFM(random_state=SEED)
    model.fit_partial(train, item_features=item_features, epochs=10)

    scores_on_train = model.predict(
        train.row, train.col, item_features=item_features)
    scores_on_test = model.predict(
        test.row, test.col, item_features=item_features)

    train_auc = roc_auc_score(train.data, scores_on_train)
    assert train_auc > 0.75

    test_auc = roc_auc_score(test.data, scores_on_test)
    assert test_auc > 0.69
Exemplo n.º 3
0
def test_movielens_genre_accuracy():
    """Check ROC AUC of a model fitted with item metadata features only."""

    item_features = movielens_data.get_movielens_item_metadata(use_item_ids=False)

    # The feature matrix must have fewer columns than rows.
    assert item_features.shape[1] < item_features.shape[0]

    model = LightFM()
    model.fit_partial(train, item_features=item_features, epochs=10)

    scores_on_train = model.predict(
        train.row, train.col, item_features=item_features)
    scores_on_test = model.predict(
        test.row, test.col, item_features=item_features)

    train_auc = roc_auc_score(train.data, scores_on_train)
    assert train_auc > 0.75

    test_auc = roc_auc_score(test.data, scores_on_test)
    assert test_auc > 0.69
def main():
    """Feed per-stage click-log ids into a LightFM ``Dataset`` and log its shape.

    For each stage from 0 through ``current_stage`` the train and test click
    CSVs are read, the train user and item id columns are passed to
    ``dataset.fit_partial``, and the resulting interactions shape is logged.
    """
    current_stage = 6
    # NOTE(review): `model` and `click_test` are not used in the visible part
    # of this function -- confirm whether the remainder was cut off.
    model = LightFM(no_components=30)
    dataset = Dataset()
    column_names = ["user_id", "item_id", "time"]

    for stage in range(current_stage + 1):
        train_csv = train_path + "/underexpose_train_click-{}.csv".format(stage)
        click_train = pd.read_csv(train_csv, header=None, names=column_names)

        test_csv = test_path + "/underexpose_test_click-{}.csv".format(stage)
        click_test = pd.read_csv(test_csv, header=None, names=column_names)

        dataset.fit_partial(click_train["user_id"], click_train["item_id"])
        shape = dataset.interactions_shape()
        num_users, num_items = shape
        log('Num users: {}, num_items {}.'.format(num_users, num_items))
Exemplo n.º 5
0
def graph_accuracies_cutoff(data):
    """Plot test-split precision@k and AUC against the ``max_sampled`` cutoff.

    One WARP model is trained for 5 epochs on ``data["train"]`` for every
    cutoff in ``range(1, MAX_CUTOFF)`` and scored on ``data["test"]``.
    """
    print("\nTraining models with different sampling cutoffs and recording their accuracies...")

    cutoffs = range(1, MAX_CUTOFF)

    # one entry per cutoff, in iteration order
    precisions = []
    aucs = []

    for cutoff in cutoffs:
        candidate = LightFM(loss="warp", max_sampled=cutoff)
        fitted = candidate.fit(data["train"], epochs=5)
        mean_precision = precision_at_k(fitted, data["test"], k=PRECISION_K).mean()
        precisions.append(mean_precision)
        mean_auc = auc_score(fitted, data["test"]).mean()
        aucs.append(mean_auc)
    print("Done!")

    # plot the graph
    plot_accuracies(cutoffs, precisions, aucs,
                    "cutoff", "magnitude of the accuracy metric",
                    ["precisions@10", "AUROC"], 3, "accuracies_cutoff.png")
Exemplo n.º 6
0
def graph_accuracies_epochs(data):
    """Plot test-split precision@k and AUC against the number of training epochs.

    A single WARP model is trained incrementally on ``data["train"]``. After
    ``e`` epochs (``e`` in ``range(MAX_EPOCHS)``) it is scored on
    ``data["test"]``, and the two curves are passed to ``plot_accuracies``.
    """
    print("\nTraining models with varying epochs from 0 to %d and recording their accuracies..." % MAX_EPOCHS)

    # one entry per epoch count, in iteration order
    precisions = []
    aucs = []
    # setup the model
    test_model = LightFM(loss="warp")
    for e in range(MAX_EPOCHS):
        if e == 0:
            # Zero epochs: only initialises the parameters, which gives the
            # untrained baseline for the first point of the curve.
            test_model.fit(data["train"], epochs=0)
        else:
            # Resume from the current state instead of retraining e epochs
            # from scratch, so total work is linear in MAX_EPOCHS.
            test_model.fit_partial(data["train"], epochs=1)
        precisions.append(precision_at_k(test_model, data["test"], k=PRECISION_K).mean())
        aucs.append(auc_score(test_model, data["test"]).mean())
    print("Done!")

    x_axis = np.arange(MAX_EPOCHS)
    # plot the graph
    plot_accuracies(x_axis, precisions, aucs, "number of epochs", "magnitude of the accuracy metric",
                    ["precisions@10", "AUROC"], 2, "accuracies_epochs.png")
Exemplo n.º 7
0
def test_warp_precision_max_sampled():
    """A WARP epoch with ``max_sampled`` forced to 0 must leave AUC near chance."""

    model = LightFM(loss='warp',
                    learning_rate=0.05,
                    max_sampled=1,
                    random_state=SEED)

    # Overriding max_sampled after construction is intended to turn the
    # training pass below into a no-op.
    model.max_sampled = 0
    model.fit_partial(train, epochs=1)

    metrics = _get_metrics(model, train, test)
    train_precision, test_precision, full_train_auc, full_test_auc = metrics

    # Neither split should score better than random.
    assert full_train_auc < 0.55
    assert full_test_auc < 0.55
Exemplo n.º 8
0
def graph_accuracies_step_size(data):
    """Plot test-split precision@k and AUC against the initial learning rate.

    One WARP model is trained for 5 epochs on ``data["train"]`` for every
    step size in ``np.arange(0.1, MAX_STEP + STEP_INCREMENT, STEP_INCREMENT)``
    and scored on ``data["test"]``.
    """
    print("\nTraining models with an epoch of 5 at different step sizes and recording their accuracies...")

    # one entry per step size, in iteration order
    precisions = []
    aucs = []

    step_sizes = np.arange(0.1, MAX_STEP + STEP_INCREMENT, STEP_INCREMENT)
    for step in step_sizes:
        candidate = LightFM(loss="warp", learning_rate=step)
        fitted = candidate.fit(data["train"], epochs=5)
        mean_precision = precision_at_k(fitted, data["test"], k=PRECISION_K).mean()
        precisions.append(mean_precision)
        mean_auc = auc_score(fitted, data["test"]).mean()
        aucs.append(mean_auc)
    print("Done!")

    # plot the graph
    plot_accuracies(step_sizes, precisions, aucs,
                    "initial step size", "magnitude of the accuracy metric",
                    ["precisions@10", "AUROC"], 2, "accuracies_step_size.png")
Exemplo n.º 9
0
def test_movielens_accuracy_sample_weights():
    """Check that scaled sample weights with a rescaled learning rate keep accuracy.

    Scaling the weights down and the learning rate up by the same amount
    should result in roughly the same accuracy, so the full train AUC for each
    loss must still exceed its expected score.
    """

    scale = 0.5
    weights = train.copy()
    weights.data = np.ones(train.getnnz(), dtype=np.float32) * scale

    for loss, exp_score in (('logistic', 0.74), ('bpr', 0.84), ('warp', 0.89)):
        model = LightFM(loss=loss, random_state=SEED)
        # Actually apply the compensation; a bare `a * b` expression would
        # compute the value and discard it.
        model.learning_rate *= 1.0 / scale

        model.fit_partial(train, sample_weight=weights, epochs=10)

        (train_precision, test_precision, full_train_auc,
         full_test_auc) = _get_metrics(model, train, test)

        assert full_train_auc > exp_score
Exemplo n.º 10
0
def test_user_supplied_features_accuracy():
    """Check ROC AUC when both user and item feature matrices are supplied."""

    model = LightFM(random_state=SEED)
    model.fit_partial(train,
                      epochs=10,
                      user_features=train_user_features,
                      item_features=train_item_features)

    # Each split is scored with its own feature matrices.
    scores_on_train = model.predict(
        train.row, train.col,
        user_features=train_user_features,
        item_features=train_item_features)
    scores_on_test = model.predict(
        test.row, test.col,
        user_features=test_user_features,
        item_features=test_item_features)

    train_auc = roc_auc_score(train.data, scores_on_train)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, scores_on_test)
    assert test_auc > 0.76
Exemplo n.º 11
0
def test_full_batch_predict_wo_features():
    """Batch prediction without feature matrices returns ``top_k`` items per user."""
    top_k = 5
    no_components = 2
    dataset = RandomDataset(density=1.0)

    model = LightFM(no_components=no_components)
    model.fit_partial(dataset.train)
    requested_users = [0, 1, 2]

    # Single process: one chunk (id 0) holding all item ids.
    item_chunks = {0: dataset.item_ids}
    model.batch_setup(item_chunks)
    recoms = model.batch_predict(user_ids=requested_users,
                                 chunk_id=0,
                                 top_k=top_k)

    for uid in requested_users:
        assert uid in recoms
        assert len(recoms[uid][0]) == top_k
Exemplo n.º 12
0
def test_movielens_excessive_regularization():
    """Heavily regularised models must score poorly for every loss."""

    losses = ('logistic', 'warp', 'bpr', 'warp-kos')

    for loss_name in losses:
        # With alpha = 1.0 the model should perform poorly; this also checks
        # that regularization does not accumulate until it reaches infinity.
        model = LightFM(loss=loss_name,
                        no_components=10,
                        item_alpha=1.0,
                        user_alpha=1.0,
                        random_state=SEED)
        model.fit_partial(train, epochs=10, num_threads=4)

        scores_on_train = model.predict(train.row, train.col)
        scores_on_test = model.predict(test.row, test.col)

        train_auc = roc_auc_score(train.data, scores_on_train)
        assert train_auc < 0.65

        test_auc = roc_auc_score(test.data, scores_on_test)
        assert test_auc < 0.65
Exemplo n.º 13
0
def test_full_batch_predict():
    """Batch prediction gives matching results with one and with two processes."""
    top_k = 5
    no_components = 2
    dataset = RandomDataset()

    model = LightFM(no_components=no_components)
    model.fit_partial(dataset.train,
                      user_features=dataset.user_features,
                      item_features=dataset.item_features)
    requested_users = [0, 1, 2]
    chunks = {0: dataset.item_ids}

    # Arguments shared by both batch_setup calls; only n_process differs.
    setup_kwargs = dict(item_chunks=chunks,
                        user_features=dataset.user_features,
                        item_features=dataset.item_features)

    # Single process
    model.batch_setup(n_process=1, **setup_kwargs)
    single_process_recoms = model.batch_predict(user_ids=requested_users,
                                                chunk_id=0,
                                                top_k=top_k)
    for uid in requested_users:
        assert uid in single_process_recoms
        assert len(single_process_recoms[uid][0]) == top_k
    model.batch_cleanup()

    # Multiple processes
    model.batch_setup(n_process=2, **setup_kwargs)
    multi_process_recoms = model.batch_predict(user_ids=requested_users,
                                               chunk_id=0,
                                               top_k=top_k)
    for uid in requested_users:
        assert uid in multi_process_recoms
        assert_array_almost_equal(multi_process_recoms[uid],
                                  single_process_recoms[uid])
Exemplo n.º 14
0
def test_warp_precision_adadelta_multithreaded():
    """WARP with the adadelta schedule on 4 threads must reach the accuracy floors."""

    model = LightFM(loss='warp',
                    learning_schedule='adadelta',
                    rho=0.95,
                    epsilon=0.000001)
    model.fit_partial(train, epochs=10, num_threads=4)

    # Compute all four metrics first, then check them.
    precision_on_train = precision_at_k(model, train, 10)
    precision_on_test = precision_at_k(model, test, 10)
    auc_on_train = full_auc(model, train)
    auc_on_test = full_auc(model, test)

    assert precision_on_train > 0.45
    assert precision_on_test > 0.07

    assert auc_on_train > 0.94
    assert auc_on_test > 0.9
Exemplo n.º 15
0
def test_input_dtypes():
    """Fitting and predicting must work for every dtype listed in ``dtypes``."""
    no_features = 20
    no_users, no_items = 10, 100

    for dtype in dtypes:
        # Empty sparse matrices of the dtype under test.
        interactions = sp.coo_matrix((no_users, no_items), dtype=dtype)
        user_features = sp.coo_matrix((no_users, no_features), dtype=dtype)
        item_features = sp.coo_matrix((no_items, no_features), dtype=dtype)

        model = LightFM()
        model.fit_partial(interactions,
                          user_features=user_features,
                          item_features=item_features)

        sampled_users = np.random.randint(0, no_users, 10).astype(np.int32)
        sampled_items = np.random.randint(0, no_items, 10).astype(np.int32)
        model.predict(sampled_users,
                      sampled_items,
                      user_features=user_features,
                      item_features=item_features)
Exemplo n.º 16
0
def test_feature_inference_fails():
    """``predict`` without features must reject ids beyond what ``fit`` saw.

    The model is fitted with explicit feature matrices; predicting without
    them, for ids equal to ``no_features``, is expected to raise
    ``ValueError``.
    """
    no_features = 20
    no_users, no_items = 10, 100

    interactions = sp.coo_matrix((no_users, no_items), dtype=np.int32)
    user_features = sp.csr_matrix((no_users, no_features), dtype=np.int32)
    item_features = sp.csr_matrix((no_items, no_features), dtype=np.int32)

    model = LightFM()
    model.fit_partial(interactions,
                      user_features=user_features,
                      item_features=item_features)

    bad_user_ids = np.array([no_features], dtype=np.int32)
    bad_item_ids = np.array([no_features], dtype=np.int32)
    with pytest.raises(ValueError):
        model.predict(bad_user_ids, bad_item_ids)
Exemplo n.º 17
0
 def objective(params):
     """Return the negated mean AUC of a LightFM model built from ``params``.

     ``params`` unpacks to ``(epochs, learning_rate, no_components)``. The
     value is negated so that a minimiser maximises AUC. Results within 0.01
     of -1 or below -1 are replaced by 0.0.
     """
     epochs, learning_rate, no_components = params

     model = LightFM(loss=loss,
                     random_state=random_state,
                     learning_rate=learning_rate,
                     no_components=no_components)
     model.fit(train, epochs=epochs, num_threads=4, verbose=True)

     per_user_auc = auc_score(model, test, num_threads=4)
     # Negate because the optimiser minimises the objective.
     negated_mean_auc = -np.mean(per_user_auc)

     # Guard against numerically suspicious (near-perfect) scores.
     if np.abs(negated_mean_auc + 1) < 0.01 or negated_mean_auc < -1.0:
         return 0.0
     return negated_mean_auc
Exemplo n.º 18
0
def objective(params):
    """Return the negated mean score of a LightFM model built from ``params``.

    ``params`` unpacks to ``(epochs, learning_rate, no_components, item_alpha,
    scale)``; the user penalty is ``item_alpha * scale``. The score comes from
    ``function_to_optimize`` and is negated so that a minimiser maximises it.
    Results within 0.01 of -1 or below -1 are replaced by 0.0.
    """
    epochs, learning_rate, no_components, item_alpha, scale = params  # 'k_os'

    user_alpha = item_alpha * scale

    model = LightFM(loss=loss,
                    random_state=2019,
                    learning_rate=learning_rate,
                    no_components=no_components,
                    user_alpha=user_alpha,
                    item_alpha=item_alpha)
    model.fit(train,
              item_features=item_features,
              epochs=epochs,
              num_threads=threads,
              verbose=True)

    per_user_scores = function_to_optimize(model, test,
                                           item_features=item_features,
                                           num_threads=threads)
    # Negate because the optimiser minimises the objective.
    negated_mean = -np.mean(per_user_scores)

    # Guard against numerically suspicious (near-perfect) scores.
    if np.abs(negated_mean + 1) < 0.01 or negated_mean < -1.0:
        return 0.0
    return negated_mean
Exemplo n.º 19
0
def trainTheModel():
    """Fit a WARP LightFM model on MovieLens using its item features.

    Returns:
        A tuple ``(model, user_features, item_features, item_labels,
        item_feature_labels)``. ``user_features`` is always ``None``; the
        last two entries come straight from the ``fetch_movielens`` result.
    """
    movielens = fetch_movielens()

    train = movielens['train']
    test = movielens['test']

    user_features = None
    item_features = movielens['item_features']

    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit_partial(train, item_features=item_features, epochs=10)

    # NOTE(review): the metrics below are computed but neither reported nor
    # returned -- confirm whether they should be logged or dropped.
    train_precision = precision_at_k(
        model, train, item_features=item_features, k=10).mean()
    test_precision = precision_at_k(
        model, test, item_features=item_features, k=10).mean()

    # Evaluate with the same item features the model was fitted with, as the
    # precision calls above already do.
    train_auc = auc_score(model, train, item_features=item_features).mean()
    test_auc = auc_score(model, test, item_features=item_features).mean()

    return model, user_features, item_features, movielens['item_labels'], movielens['item_feature_labels']
Exemplo n.º 20
0
def test_not_enough_features_fails():
    """Fitting must fail when feature matrices have one row too few."""

    no_features = 20
    no_users, no_items = (10, 100)

    interactions = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    # Both feature matrices are one row short of the interaction matrix.
    short_user_features = sp.csr_matrix((no_users - 1, no_features),
                                        dtype=np.int32)
    short_item_features = sp.csr_matrix((no_items - 1, no_features),
                                        dtype=np.int32)

    model = LightFM()
    with pytest.raises(Exception):
        model.fit_partial(interactions,
                          user_features=short_user_features,
                          item_features=short_item_features)
def lightfm_train(train, num_components, num_epochs):
    """Fit a WARP LightFM model on ``train`` and return it.

    ``num_components`` is passed as ``no_components`` and ``num_epochs`` as
    the ``epochs`` argument of ``fit``. Training uses a single thread and an
    item L2 penalty of 1e-6.
    """
    item_penalty = 1e-6  # Recommended by LightFM
    thread_count = 1

    estimator = LightFM(loss='warp',
                        item_alpha=item_penalty,
                        no_components=num_components)

    # The object returned by fit() is what gets handed back to the caller.
    return estimator.fit(train, epochs=num_epochs, num_threads=thread_count)
    def build(self):
        """Train a WARP LightFM model on the ratings CSV and pickle it.

        Reads ``self.source_file`` with pandas and builds a COO interaction
        matrix from its ``rating``, ``user_id`` and ``item_id`` columns. The
        model is fitted for 30 epochs on 2 threads, one sample prediction is
        printed, and the model is written to ``lightfm.p`` in the current
        working directory.

        Returns:
            The fitted LightFM model.
        """
        print('start', datetime.datetime.now())
        df = pd.read_csv(self.source_file)
        # The matrix shape is inferred from the largest user/item ids.
        train = sp.coo_matrix((df['rating'], (df['user_id'], df['item_id'])))

        # Instantiate and train the model
        model = LightFM(loss='warp')
        model.fit(train, epochs=30, num_threads=2)

        # Smoke-check: score user 3 against item 2.
        prediction = model.predict(np.array([3]), np.array([2]))
        print(prediction)

        # Use a context manager so the file handle is closed (and flushed)
        # even if pickling fails.
        with open('lightfm.p', 'wb') as model_file:
            pickle.dump(model, model_file)
        return model
Exemplo n.º 23
0
    def _train(self, verbose=True):
        """Build a LightFM model from the instance settings and fit it on ``self.URM``.

        The fitted model is stored in ``self.model``. When ``verbose`` is
        true, a start message and the elapsed wall-clock time are printed.
        """
        started_at = time.time()

        if verbose:
            print("LightFM training started!")

        # Loss, penalties, schedule and dimensionality all come from the
        # instance attributes.
        self.model = LightFM(no_components=self.num_components,
                             loss=self.loss,
                             learning_schedule=self.learning_schedule,
                             item_alpha=self.item_alpha,
                             user_alpha=self.user_alpha)

        # Train for self.epochs epochs on self.threads threads.
        fitted = self.model.fit(self.URM,
                                epochs=self.epochs,
                                num_threads=self.threads)
        self.model = fitted

        if verbose:
            elapsed = time.time() - started_at
            print("LightFM training model fitted in {:.2f} seconds".format(elapsed))
Exemplo n.º 24
0
def test_matrix_types():
    """Fit, predict and predict_rank must accept every sparse format and dtype."""

    matrix_classes = (sp.coo_matrix,
                      sp.lil_matrix,
                      sp.csr_matrix,
                      sp.csc_matrix)
    value_types = (np.int32,
                   np.int64,
                   np.float32,
                   np.float64)

    no_features = 20
    no_users, no_items = (10, 100)

    for make_matrix in matrix_classes:
        for value_type in value_types:
            # Empty matrices in the format/dtype combination under test.
            interactions = make_matrix((no_users, no_items), dtype=value_type)
            user_features = make_matrix((no_users, no_features), dtype=value_type)
            item_features = make_matrix((no_items, no_features), dtype=value_type)

            model = LightFM()
            model.fit_partial(interactions,
                              user_features=user_features,
                              item_features=item_features)

            sampled_users = np.random.randint(0, no_users, 10).astype(np.int32)
            sampled_items = np.random.randint(0, no_items, 10).astype(np.int32)
            model.predict(sampled_users,
                          sampled_items,
                          user_features=user_features,
                          item_features=item_features)

            model.predict_rank(interactions,
                               user_features=user_features,
                               item_features=item_features)
 def init_model(self,
                no_components=10,
                k=5,
                n=10,
                learning_schedule='adagrad',
                loss='logistic',
                learning_rate=0.05,
                rho=0.95,
                epsilon=1e-06,
                item_alpha=0.0,
                user_alpha=0.0,
                max_sampled=10,
                random_state=None):
     """
     Create the LightFM model to be evaluated and store it in ``self.model``.

     Every argument is forwarded unchanged to the ``LightFM`` constructor.

     :param no_components: (int, optional) dimensionality of the feature latent embeddings.
     :param k: (int, optional) for k-OS training, the k-th positive example is selected from the
            n positive examples sampled for every user.
     :param n: (int, optional) for k-OS training, maximum number of positives sampled for each update.
     :param learning_schedule: (string, optional) one of ('adagrad', 'adadelta').
     :param loss: (string, optional) the loss function, one of ('logistic', 'bpr', 'warp', 'warp-kos').
     :param learning_rate: (float, optional) initial learning rate for the adagrad learning schedule.
     :param rho: (float, optional) moving average coefficient for the adadelta learning schedule.
     :param epsilon: (float, optional) conditioning parameter for the adadelta learning schedule.
     :param item_alpha: (float, optional) L2 penalty on item features.
     :param user_alpha: (float, optional) L2 penalty on user features.
     :param max_sampled: (int, optional) maximum number of negative samples used during WARP fitting.
     :param random_state: (int seed, RandomState instance, or None)
     """
     settings = dict(no_components=no_components,
                     k=k,
                     n=n,
                     learning_schedule=learning_schedule,
                     loss=loss,
                     learning_rate=learning_rate,
                     rho=rho,
                     epsilon=epsilon,
                     item_alpha=item_alpha,
                     user_alpha=user_alpha,
                     max_sampled=max_sampled,
                     random_state=random_state)
     self.model = LightFM(**settings)
Exemplo n.º 26
0
    def __init__(
        self,
        URM_train,
        ICM_train,
        no_components=1024,
        k=5,
        n=10,
        learning_schedule="adagrad",
        loss="logistic",
        learning_rate=0.05,
        rho=0.95,
        epsilon=1e-06,
        item_alpha=0.0,
        user_alpha=0.0,
        max_sampled=10,
        random_state=None,
    ):
        """Store copies of the input matrices and build the LightFM model.

        ``URM_train`` is passed to the parent initialiser. Copies of
        ``URM_train`` and ``ICM_train`` are passed through
        ``check_matrix(..., "csr")`` and kept on the instance. All remaining
        arguments are forwarded unchanged to the ``LightFM`` constructor.
        """
        super(LightFMRecommender, self).__init__(URM_train)

        # Copy first so the caller's matrices are not the objects stored here.
        urm_copy = URM_train.copy()
        self.URM_train = check_matrix(urm_copy, "csr")
        icm_copy = ICM_train.copy()
        self.ICM_train = check_matrix(icm_copy, "csr")

        # Disabled index-remapping experiment, kept for reference:
        # ICM_train_dense = pd.DataFrame(self.ICM_train.todense())
        # ICM_train_dense.index = ICM_train_dense.index.map(lambda x: item_mapper[str(x)])
        # self.ICM_train = sps.csr_matrix(ICM_train_dense.values)

        model_options = dict(
            no_components=no_components,
            k=k,
            n=n,
            learning_schedule=learning_schedule,
            loss=loss,
            learning_rate=learning_rate,
            rho=rho,
            epsilon=epsilon,
            item_alpha=item_alpha,
            user_alpha=user_alpha,
            max_sampled=max_sampled,
            random_state=random_state,
        )
        self.model = LightFM(**model_options)
Exemplo n.º 27
0
def test_logistic_precision():
    """A default LightFM model trained for 10 epochs must reach the accuracy floors."""

    model = LightFM()
    model.fit_partial(train, epochs=10)

    # Compute all four metrics first, then check them.
    precision_on_train = precision_at_k(model, train, 10)
    precision_on_test = precision_at_k(model, test, 10)
    auc_on_train = full_auc(model, train)
    auc_on_test = full_auc(model, test)

    assert precision_on_train > 0.3
    assert precision_on_test > 0.03

    assert auc_on_train > 0.79
    assert auc_on_test > 0.74
Exemplo n.º 28
0
def test_warp_precision():
    """A WARP model trained for 10 epochs must reach the accuracy floors."""

    model = LightFM(loss='warp',
                    learning_rate=0.05,
                    random_state=SEED)
    model.fit_partial(train, epochs=10)

    metrics = _get_metrics(model, train, test)
    train_precision, test_precision, full_train_auc, full_test_auc = metrics

    assert train_precision > 0.45
    assert test_precision > 0.07

    assert full_train_auc > 0.94
    assert full_test_auc > 0.9
 def train(self, X, y, lemmapos_list):
     """Fit a WARP LightFM classifier and store it in ``self.clf``.

     X: list of vectors, one initial representation per sentence (more
        precisely, per predicate with context); used as the user features.
     y: list of frame IDs labelling those representations; converted to an
        interaction matrix by ``self.createInteractionMatrix``.
     lemmapos_list: not used by this method.
     """
     # MODEL
     self.clf = LightFM(no_components=self.num_components,
                        learning_schedule='adagrad',
                        loss='warp',
                        learning_rate=0.05,
                        epsilon=1e-06,
                        item_alpha=0.0,
                        user_alpha=1e-6,
                        max_sampled=self.max_sampled,
                        random_state=None)

     # DATA
     # Interaction matrix of size (num sentences in y) x (num frames), with 1
     # marking the frame label of each predicate in its context sentence.
     interaction_matrix = self.createInteractionMatrix(y)

     # FIT
     fitted = self.clf.fit(interactions=interaction_matrix,
                           user_features=X,
                           item_features=None,
                           sample_weight=None,
                           epochs=self.num_epochs,
                           num_threads=2,
                           verbose=True)
     self.clf = fitted
Exemplo n.º 30
0
def runMF(interactions,
          n_components=30,
          loss='warp',
          k=15,
          epoch=30,
          n_jobs=4):
    """
    Run the matrix-factorization algorithm.
    Required Input -
        - interactions = dataset created by create_interaction_matrix
        - n_components = number of embeddings used to describe items and users
        - loss = loss function; other options are logistic, bpr
        - k = forwarded to LightFM as its `k` argument
        - epoch = number of epochs to run
        - n_jobs = number of cores used for execution
    Expected Output  -
        Model - Trained model
    """
    # LightFM is fitted on a CSR matrix built from the underlying values.
    interaction_matrix = sparse.csr_matrix(interactions.values)

    estimator = LightFM(no_components=n_components, loss=loss, k=k)
    estimator.fit(interaction_matrix, epochs=epoch, num_threads=n_jobs)
    return estimator