def test_basic_fetching_stackexchange():

    test_fractions = (0.2, 0.5, 0.6)

    for test_fraction in test_fractions:
        data = fetch_stackexchange(
            "crossvalidated",
            min_training_interactions=0,
            test_set_fraction=test_fraction,
        )

        train = data["train"]
        test = data["test"]

        assert isinstance(train, sp.coo_matrix)
        assert isinstance(test, sp.coo_matrix)

        assert train.shape == test.shape

        frac = float(test.getnnz()) / (train.getnnz() + test.getnnz())
        assert abs(frac - test_fraction) < 0.01

    for dataset in ("crossvalidated", "stackoverflow"):

        data = fetch_stackexchange(
            dataset,
            min_training_interactions=0,
            indicator_features=True,
            tag_features=False,
        )
        assert isinstance(data["item_features"], sp.csr_matrix)
        assert (data["item_features"].shape[0] ==
                data["item_features"].shape[1] == data["train"].shape[1])

        data = fetch_stackexchange(
            dataset,
            min_training_interactions=0,
            indicator_features=False,
            tag_features=True,
        )
        assert isinstance(data["item_features"], sp.csr_matrix)
        assert data["item_features"].shape[0] > data["item_features"].shape[1]

        data = fetch_stackexchange(
            dataset,
            min_training_interactions=0,
            indicator_features=True,
            tag_features=True,
        )
        assert isinstance(data["item_features"], sp.csr_matrix)
        assert data["item_features"].shape[0] < data["item_features"].shape[1]

        if dataset == "crossvalidated":
            assert data["train"].shape == (9431, 72360)
        else:
            assert data["train"].shape == (1349835, 11280896)

        assert np.all(data["train"].data == 1.0)
        assert np.all(data["test"].data == 1.0)
示例#2
0
def test_basic_fetching_stackexchange():

    test_fractions = (0.2, 0.5, 0.6)

    for test_fraction in test_fractions:
        data = fetch_stackexchange('crossvalidated',
                                   min_training_interactions=0,
                                   test_set_fraction=test_fraction)

        train = data['train']
        test = data['test']

        assert isinstance(train, sp.coo_matrix)
        assert isinstance(test, sp.coo_matrix)

        assert train.shape == test.shape

        frac = float(test.getnnz()) / (train.getnnz() + test.getnnz())
        assert abs(frac - test_fraction) < 0.01

    for dataset in ('crossvalidated', 'stackoverflow'):

        data = fetch_stackexchange(dataset,
                                   min_training_interactions=0,
                                   indicator_features=True, tag_features=False)
        assert isinstance(data['item_features'], sp.csr_matrix)
        assert (data['item_features'].shape[0] == data['item_features'].shape[1] ==
                data['train'].shape[1])

        data = fetch_stackexchange(dataset,
                                   min_training_interactions=0,
                                   indicator_features=False, tag_features=True)
        assert isinstance(data['item_features'], sp.csr_matrix)
        assert data['item_features'].shape[0] > data['item_features'].shape[1]

        data = fetch_stackexchange(dataset,
                                   min_training_interactions=0,
                                   indicator_features=True, tag_features=True)
        assert isinstance(data['item_features'], sp.csr_matrix)
        assert data['item_features'].shape[0] < data['item_features'].shape[1]

        if dataset == 'crossvalidated':
            assert data['train'].shape == (9431, 72360)
        else:
            assert data['train'].shape == (1349835, 11280896)

        assert np.all(data['train'].data == 1.0)
        assert np.all(data['test'].data == 1.0)
示例#3
0
def fetch_active_users():
    log.info("Fetching active users")
    data = fetch_stackexchange("crossvalidated",
                               test_set_fraction=0.1,
                               indicator_features=False,
                               tag_features=True)
    train = data["train"]
    return list(reversed(range(train.shape[0])))
示例#4
0
def validate_item_features(ctx, data_home):

    data = fetch_stackexchange('crossvalidated',
                               test_set_fraction=0.1,
                               indicator_features=False,
                               tag_features=True, data_home=data_home)

    train = data['train']
    test = data['test']

    # Set the number of threads; you can increase this
    # ify you have more physical cores available.
    NUM_COMPONENTS = 30
    NUM_EPOCHS = 3
    ITEM_ALPHA = 1e-6

    # Let's fit a WARP model: these generally have the best performance.
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    # Run 3 epochs and time it.
    model = model.fit(train, epochs=NUM_EPOCHS)

    train_auc = auc_score(model, train).mean()
    print('Collaborative filtering train AUC: %s' % train_auc)

    test_auc = auc_score(model, test, train_interactions=train).mean()
    print('Collaborative filtering test AUC: %s' % test_auc)

    # Set biases to zero
    model.item_biases *= 0.0

    test_auc = auc_score(model, test, train_interactions=train).mean()
    print('Collaborative filtering test AUC: %s' % test_auc)

    item_features = data['item_features']
    tag_labels = data['item_feature_labels']

    print('There are %s distinct tags, with values like %s.' % (item_features.shape[1], tag_labels[:3].tolist()))

    # Define a new model instance
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    # Fit the hybrid model. Note that this time, we pass
    # in the item features matrix.
    model = model.fit(train,
                      item_features=item_features,
                      epochs=NUM_EPOCHS)

    # Don't forget the pass in the item features again!
    train_auc = auc_score(model,  train, item_features=item_features).mean()
    print('Hybrid training set AUC: %s' % train_auc)

    test_auc = auc_score(model,  test, train_interactions=train, item_features=item_features).mean()
    print('Hybrid test set AUC: %s' % test_auc)
示例#5
0
文件: data.py 项目: asovchar/bachelor
def fetch_dataset():
    log.info("Fetching dataset")
    data = fetch_stackexchange("crossvalidated",
                               test_set_fraction=0.1,
                               indicator_features=False,
                               tag_features=True)
    train = data["train"]
    test = data["test"]
    item_features = data["item_features"]
    item_feature_labels = data["item_feature_labels"]

    # Remove duplicated train values from test set
    t1 = set(zip(*train.nonzero()))
    t2 = set(zip(*test.nonzero()))
    test = test.tocsr()
    for idx in t1 & t2:
        test[idx] = 0
    test = test.tocoo()

    return train, test, item_features, item_feature_labels
import numpy as np

from lightfm.datasets import fetch_stackexchange

data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

train = data['train']
test = data['test']

import matplotlib.pylab as pl
import scipy.sparse as sps
import scipy.io
import matplotlib.pylab as plt
import scipy.sparse as sps

exit()

print('The dataset has %s users and %s items, '
      'with %s interactions in the test and %s interactions in the training set.'
      % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

# Import the model
from lightfm import LightFM

# Set the number of threads; you can increase this
# ify you have more physical cores available.
NUM_THREADS = 2
示例#7
0
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm.datasets import fetch_stackexchange
from lightfm import LightFM

#CHALLENGE part 1 of 3 - write your own fetch and format method for a different recommendation
#dataset. Here a good few https://gist.github.com/entaroadun/1653794
#And take a look at the fetch_movielens method to see what it's doing
#

data_stack = fetch_stackexchange('crossvalidated', min_training_interactions=3)
print(repr(data_stack['train']))
print(repr(data_stack['test']))

#fetch data and format it
data = fetch_movielens(min_rating=4.0)

#print training and testing data
print(repr(data['train']))
print(repr(data['test']))
print("*****************************")
print(data)
print("*****************************")
print(data_stack)
print("*****************************")

# #CHALLENGE part 2 of 3 - use 3 different loss functions (so 3 different models), compare results, print results for
# #the best one. - Available loss functions are warp, logistic, bpr, and warp-kos.

#create model
model = LightFM(loss='warp')
import numpy as np 
# import the model
from lightfm import LightFM 
from lightfm.datasets import fetch_stackexchange
from lightfm.evaluation import auc_score

data = fetch_stackexchange('crossvalidated', test_set_fraction=0.1, indicator_features=False, tag_features=True)

train = data['train']
test = data['test']

print(repr(train))
print(repr(test))

print('The dataset has %s users and %s items, '
      'with %s interactions in the test and %s interactions in the training set.'
      # getnnz() --> gets the count of explicitly-stored values (i.e. non-zero values)
      % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

# set the number of threads; can increase this
# if more physical cores are available. However, MacOS systems 
# use a default value of 1 thread if OpenMP is not supported
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Try to fit a WARP model - this is generally the model with the best performance
model = LightFM(loss='warp', item_alpha=ITEM_ALPHA, no_components = NUM_COMPONENTS)

# run 3 epochs and time it
def hybrid_model():
    """
    implements hybrid model using
    interaction data, as well as
    item features
    :return:
    """
    # Set the number of threads; you can increase this
    # if you have more physical cores available.
    NUM_THREADS = 2
    NUM_COMPONENTS = 30
    NUM_EPOCHS = 3
    ITEM_ALPHA = 1e-6
    data = fetch_stackexchange('crossvalidated',
                               test_set_fraction=0.1,
                               indicator_features=False,
                               tag_features=True)

    train = data['train'].tocsr().tocoo()
    test = data['test'].tocsr().tocoo()
    print(
        'The dataset has %s users and %s items, '
        'with %s interactions in the test and %s interactions in the training set.'
        % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

    # Define a new model instance
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    #fit the collaborative filtering model
    model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)
    # Compute and print the AUC score
    train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
    print('Collaborative filtering train AUC: %s' % train_auc)
    # We pass in the train interactions to exclude them from predictions.
    # This is to simulate a recommender system where we do not
    # re-recommend things the user has already interacted with in the train
    # set.

    #suppress the error of train/test overlap
    LightFM._check_test_train_intersections = lambda x, y, z: True

    test_auc = auc_score(model,
                         test,
                         train_interactions=train,
                         num_threads=NUM_THREADS).mean()
    print('Collaborative filtering test AUC: %s' % test_auc)
    """
    The fact that we score them lower than other items (AUC < 0.5) is due to estimated per-item biases, 
    which can be confirmed by setting them to zero and re-evaluating the model.
    """
    model.item_biases *= 0.0

    test_auc = auc_score(model,
                         test,
                         train_interactions=train,
                         num_threads=NUM_THREADS).mean()
    print('Collaborative filtering test AUC after using biases: %s' % test_auc)

    # Fit the hybrid model. Note that this time, we pass
    # in the item features matrix.
    item_features = data['item_features']
    tag_labels = data['item_feature_labels']

    print('There are %s distinct tags, with values like %s.' %
          (item_features.shape[1], tag_labels[:3].tolist()))

    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)
    model = model.fit(train,
                      item_features=item_features,
                      epochs=NUM_EPOCHS,
                      num_threads=NUM_THREADS)

    # Don't forget the pass in the item features again!
    train_auc = auc_score(model,
                          train,
                          item_features=item_features,
                          num_threads=NUM_THREADS).mean()

    print('Hybrid training set AUC: %s' % train_auc)
    test_auc = auc_score(model,
                         test,
                         train_interactions=train,
                         item_features=item_features,
                         num_threads=NUM_THREADS).mean()
    print('Hybrid test set AUC: %s' % test_auc)

    #find similar tags
    for tag in (u'bayesian', u'regression', u'survival'):
        tag_id = tag_labels.tolist().index(tag)
        print(
            'Most similar tags for %s: %s' %
            (tag_labels[tag_id], tag_labels[get_similar_tags(model, tag_id)]))
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm.datasets import fetch_stackexchange
from lightfm import LightFM

#Datos traidos desde lightfm
data = fetch_movielens(min_rating=4.0)
dataStack = fetch_stackexchange(dataset='crossvalidated',
                                min_training_interactions=1)

#Datos de pelicula
print(repr(data['train']))
print(repr(data['test']))

#Datos de StackOverflow
print(repr(dataStack['train']))
print(repr(dataStack['test']))

#El parametro loss indica al algoritmode ML si el resultado es
#correcto o erroneo, indicando que entre mas grande el valor, indica que esta mas equivocado
#WRAP= Pérdida de pareja ponderada de rango aproximado
model = LightFM(loss='warp')
modelStack = LightFM(loss='warp')

#Entrenamos el modelo, epoch= las veces que la NN va a iterar sobre los datos
model.fit(data['train'], epochs=30, num_threads=2)
modelStack.fit(dataStack['train'], epochs=30, num_threads=2)

print(dataStack['item_features'])