import numpy as np
import scipy.sparse as sp

from lightfm.datasets import fetch_stackexchange


def test_basic_fetching_stackexchange():

    test_fractions = (0.2, 0.5, 0.6)

    for test_fraction in test_fractions:
        data = fetch_stackexchange(
            "crossvalidated",
            min_training_interactions=0,
            test_set_fraction=test_fraction,
        )

        train = data["train"]
        test = data["test"]

        assert isinstance(train, sp.coo_matrix)
        assert isinstance(test, sp.coo_matrix)

        assert train.shape == test.shape

        frac = float(test.getnnz()) / (train.getnnz() + test.getnnz())
        assert abs(frac - test_fraction) < 0.01

    for dataset in ("crossvalidated", "stackoverflow"):

        data = fetch_stackexchange(
            dataset,
            min_training_interactions=0,
            indicator_features=True,
            tag_features=False,
        )

        assert isinstance(data["item_features"], sp.csr_matrix)
        # Indicator features only: one feature per item, so the matrix is square.
        assert (
            data["item_features"].shape[0]
            == data["item_features"].shape[1]
            == data["train"].shape[1]
        )

        data = fetch_stackexchange(
            dataset,
            min_training_interactions=0,
            indicator_features=False,
            tag_features=True,
        )

        assert isinstance(data["item_features"], sp.csr_matrix)
        # Tag features only: there are fewer distinct tags than items.
        assert data["item_features"].shape[0] > data["item_features"].shape[1]

        data = fetch_stackexchange(
            dataset,
            min_training_interactions=0,
            indicator_features=True,
            tag_features=True,
        )

        assert isinstance(data["item_features"], sp.csr_matrix)
        # Indicator plus tag features: more feature columns than items.
        assert data["item_features"].shape[0] < data["item_features"].shape[1]

        if dataset == "crossvalidated":
            assert data["train"].shape == (9431, 72360)
        else:
            assert data["train"].shape == (1349835, 11280896)

        assert np.all(data["train"].data == 1.0)
        assert np.all(data["test"].data == 1.0)
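# A quick interactive check of the split fraction verified by the test above
# (a minimal sketch, not part of the test suite; it assumes the dataset
# download succeeds and reuses the real fetch_stackexchange API).
from lightfm.datasets import fetch_stackexchange

check = fetch_stackexchange("crossvalidated",
                            min_training_interactions=0,
                            test_set_fraction=0.2)
observed = float(check["test"].getnnz()) / (check["train"].getnnz() + check["test"].getnnz())
print("Requested test fraction 0.2, observed %.3f" % observed)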
import logging

from lightfm.datasets import fetch_stackexchange

log = logging.getLogger(__name__)


def fetch_active_users():
    log.info("Fetching active users")

    data = fetch_stackexchange(
        "crossvalidated",
        test_set_fraction=0.1,
        indicator_features=False,
        tag_features=True,
    )
    train = data["train"]

    # Return user indices in reverse order (highest index first).
    return list(reversed(range(train.shape[0])))
from lightfm import LightFM
from lightfm.datasets import fetch_stackexchange
from lightfm.evaluation import auc_score


def validate_item_features(ctx, data_home):
    data = fetch_stackexchange('crossvalidated',
                               test_set_fraction=0.1,
                               indicator_features=False,
                               tag_features=True,
                               data_home=data_home)

    train = data['train']
    test = data['test']

    # Model hyperparameters.
    NUM_COMPONENTS = 30
    NUM_EPOCHS = 3
    ITEM_ALPHA = 1e-6

    # Let's fit a WARP model: these generally have the best performance.
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    # Run 3 epochs of training.
    model = model.fit(train, epochs=NUM_EPOCHS)

    train_auc = auc_score(model, train).mean()
    print('Collaborative filtering train AUC: %s' % train_auc)

    test_auc = auc_score(model, test, train_interactions=train).mean()
    print('Collaborative filtering test AUC: %s' % test_auc)

    # Set the item biases to zero and re-evaluate.
    model.item_biases *= 0.0

    test_auc = auc_score(model, test, train_interactions=train).mean()
    print('Collaborative filtering test AUC: %s' % test_auc)

    item_features = data['item_features']
    tag_labels = data['item_feature_labels']

    print('There are %s distinct tags, with values like %s.'
          % (item_features.shape[1], tag_labels[:3].tolist()))

    # Define a new model instance.
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    # Fit the hybrid model. Note that this time, we pass
    # in the item features matrix.
    model = model.fit(train,
                      item_features=item_features,
                      epochs=NUM_EPOCHS)

    # Don't forget to pass in the item features again!
    train_auc = auc_score(model, train, item_features=item_features).mean()
    print('Hybrid training set AUC: %s' % train_auc)

    test_auc = auc_score(model, test,
                         train_interactions=train,
                         item_features=item_features).mean()
    print('Hybrid test set AUC: %s' % test_auc)
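# Optional follow-up (a sketch, not part of validate_item_features above): AUC
# can be complemented with precision@k, which also lives in lightfm.evaluation.
# The names below (model, train, test, item_features) assume the hybrid model
# fitted inside the function above; k=10 is an arbitrary cutoff.
from lightfm.evaluation import precision_at_k

test_precision = precision_at_k(model, test,
                                train_interactions=train,
                                item_features=item_features,
                                k=10).mean()
print('Hybrid test precision@10: %s' % test_precision)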
import logging

from lightfm.datasets import fetch_stackexchange

log = logging.getLogger(__name__)


def fetch_dataset():
    log.info("Fetching dataset")

    data = fetch_stackexchange(
        "crossvalidated",
        test_set_fraction=0.1,
        indicator_features=False,
        tag_features=True,
    )

    train = data["train"]
    test = data["test"]
    item_features = data["item_features"]
    item_feature_labels = data["item_feature_labels"]

    # Remove interactions duplicated between train and test from the test set.
    t1 = set(zip(*train.nonzero()))
    t2 = set(zip(*test.nonzero()))

    test = test.tocsr()
    for idx in t1 & t2:
        test[idx] = 0
    test = test.tocoo()

    return train, test, item_features, item_feature_labels
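# Usage sketch for the helper above (illustrative only): fit a WARP model on the
# de-duplicated split and report test AUC. LightFM and auc_score are the real
# lightfm APIs; the hyperparameter values here are assumptions for the example.
from lightfm import LightFM
from lightfm.evaluation import auc_score

train, test, item_features, item_feature_labels = fetch_dataset()

model = LightFM(loss="warp", no_components=30, item_alpha=1e-6)
model.fit(train, item_features=item_features, epochs=3)

test_auc = auc_score(model, test,
                     train_interactions=train,
                     item_features=item_features).mean()
print("Test AUC: %s" % test_auc)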
import numpy as np
import scipy.io
import scipy.sparse as sps
import matplotlib.pylab as plt

from lightfm.datasets import fetch_stackexchange

data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

train = data['train']
test = data['test']

print('The dataset has %s users and %s items, '
      'with %s interactions in the test and %s interactions in the training set.'
      % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

# Import the model
from lightfm import LightFM

# Set the number of threads; you can increase this
# if you have more physical cores available.
NUM_THREADS = 2
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm.datasets import fetch_stackexchange
from lightfm import LightFM

# CHALLENGE part 1 of 3 - write your own fetch and format method for a different
# recommendation dataset. Here are a good few: https://gist.github.com/entaroadun/1653794
# And take a look at the fetch_movielens method to see what it's doing.

data_stack = fetch_stackexchange('crossvalidated', min_training_interactions=3)
print(repr(data_stack['train']))
print(repr(data_stack['test']))

# Fetch data and format it
data = fetch_movielens(min_rating=4.0)

# Print training and testing data
print(repr(data['train']))
print(repr(data['test']))

print("*****************************")
print(data)
print("*****************************")
print(data_stack)
print("*****************************")

# CHALLENGE part 2 of 3 - use 3 different loss functions (so 3 different models),
# compare results, and print results for the best one. Available loss functions
# are warp, logistic, bpr, and warp-kos; see the sketch below.

# Create model
model = LightFM(loss='warp')
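# A minimal sketch of CHALLENGE part 2 (an illustration, not the author's
# solution): fit one model per loss function on the MovieLens split defined
# above and compare precision@10 from lightfm.evaluation.
from lightfm.evaluation import precision_at_k

results = {}
for loss in ('warp', 'logistic', 'bpr'):
    candidate = LightFM(loss=loss)
    candidate.fit(data['train'], epochs=30, num_threads=2)
    results[loss] = precision_at_k(candidate, data['test'], k=10).mean()
    print('%s precision@10: %s' % (loss, results[loss]))

best = max(results, key=results.get)
print('Best loss function: %s (precision@10 = %s)' % (best, results[best]))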
import numpy as np

# Import the model
from lightfm import LightFM
from lightfm.datasets import fetch_stackexchange
from lightfm.evaluation import auc_score

data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

train = data['train']
test = data['test']

print(repr(train))
print(repr(test))

# getnnz() gets the count of explicitly-stored values (i.e. non-zero values).
print('The dataset has %s users and %s items, '
      'with %s interactions in the test and %s interactions in the training set.'
      % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

# Set the number of threads; you can increase this if more physical cores are
# available. MacOS systems default to 1 thread if OpenMP is not supported.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Try to fit a WARP model - this is generally the model with the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
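# The snippet above is truncated at the "time it" comment; a minimal sketch of
# the timed fit it describes (assuming the model, train/test matrices, and
# constants defined above; time.time() stands in for a notebook %time magic).
import time

start = time.time()
model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)
print('Fitting took %.1f seconds' % (time.time() - start))

train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)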
from lightfm import LightFM
from lightfm.datasets import fetch_stackexchange
from lightfm.evaluation import auc_score


def hybrid_model():
    """
    Implements a hybrid model using interaction data as well as item features.
    :return:
    """
    # Set the number of threads; you can increase this
    # if you have more physical cores available.
    NUM_THREADS = 2
    NUM_COMPONENTS = 30
    NUM_EPOCHS = 3
    ITEM_ALPHA = 1e-6

    data = fetch_stackexchange('crossvalidated',
                               test_set_fraction=0.1,
                               indicator_features=False,
                               tag_features=True)

    train = data['train'].tocsr().tocoo()
    test = data['test'].tocsr().tocoo()

    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

    # Define a new model instance
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    # Fit the collaborative filtering model
    model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

    # Compute and print the AUC score
    train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
    print('Collaborative filtering train AUC: %s' % train_auc)

    # We pass in the train interactions to exclude them from predictions.
    # This is to simulate a recommender system where we do not
    # re-recommend things the user has already interacted with in the train
    # set.

    # Suppress the error raised for train/test overlap
    LightFM._check_test_train_intersections = lambda x, y, z: True

    test_auc = auc_score(model, test,
                         train_interactions=train,
                         num_threads=NUM_THREADS).mean()
    print('Collaborative filtering test AUC: %s' % test_auc)

    # The fact that we score them lower than other items (AUC < 0.5) is due to
    # estimated per-item biases, which can be confirmed by setting them to zero
    # and re-evaluating the model.
    model.item_biases *= 0.0

    test_auc = auc_score(model, test,
                         train_interactions=train,
                         num_threads=NUM_THREADS).mean()
    print('Collaborative filtering test AUC after zeroing item biases: %s' % test_auc)

    # Fit the hybrid model. Note that this time, we pass
    # in the item features matrix.
    item_features = data['item_features']
    tag_labels = data['item_feature_labels']

    print('There are %s distinct tags, with values like %s.'
          % (item_features.shape[1], tag_labels[:3].tolist()))

    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    model = model.fit(train,
                      item_features=item_features,
                      epochs=NUM_EPOCHS,
                      num_threads=NUM_THREADS)

    # Don't forget to pass in the item features again!
    train_auc = auc_score(model, train,
                          item_features=item_features,
                          num_threads=NUM_THREADS).mean()
    print('Hybrid training set AUC: %s' % train_auc)

    test_auc = auc_score(model, test,
                         train_interactions=train,
                         item_features=item_features,
                         num_threads=NUM_THREADS).mean()
    print('Hybrid test set AUC: %s' % test_auc)

    # Find similar tags
    for tag in (u'bayesian', u'regression', u'survival'):
        tag_id = tag_labels.tolist().index(tag)
        print('Most similar tags for %s: %s'
              % (tag_labels[tag_id], tag_labels[get_similar_tags(model, tag_id)]))
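# get_similar_tags is called above but not defined in this snippet. A plausible
# implementation (a sketch based on cosine similarity between the model's item
# embeddings, in the spirit of the LightFM documentation example) could be:
import numpy as np


def get_similar_tags(model, tag_id):
    # With tag_features=True and indicator_features=False, each row of
    # item_embeddings corresponds to a tag; normalize rows to unit length.
    tag_embeddings = (model.item_embeddings.T
                      / np.linalg.norm(model.item_embeddings, axis=1)).T

    query_embedding = tag_embeddings[tag_id]
    similarity = np.dot(tag_embeddings, query_embedding)

    # Skip the query tag itself and return the next three most similar tags.
    most_similar = np.argsort(-similarity)[1:4]
    return most_similar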
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm.datasets import fetch_stackexchange
from lightfm import LightFM

# Datasets fetched via lightfm
data = fetch_movielens(min_rating=4.0)
dataStack = fetch_stackexchange(dataset='crossvalidated', min_training_interactions=1)

# MovieLens data
print(repr(data['train']))
print(repr(data['test']))

# Stack Exchange (CrossValidated) data
print(repr(dataStack['train']))
print(repr(dataStack['test']))

# The loss parameter tells the ML algorithm how wrong a prediction is:
# the larger the value, the more incorrect the prediction.
# WARP = Weighted Approximate-Rank Pairwise loss
model = LightFM(loss='warp')
modelStack = LightFM(loss='warp')

# Train the models; epochs = the number of times the model iterates over the data
model.fit(data['train'], epochs=30, num_threads=2)
modelStack.fit(dataStack['train'], epochs=30, num_threads=2)

print(dataStack['item_features'])
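# A sketch of producing recommendations from the fitted MovieLens model above
# (model.predict and the 'item_labels' key are part of the real lightfm API;
# user id 3 and the top-3 cutoff are arbitrary choices for illustration).
n_users, n_items = data['train'].shape

user_id = 3
scores = model.predict(user_id, np.arange(n_items))
top_items = data['item_labels'][np.argsort(-scores)]

print('Top recommendations for user %s: %s' % (user_id, top_items[:3].tolist()))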