def test_movielens_accuracy_sample_weights_grad_accumulation():
    # Set weights to zero for all odd-numbered users
    # and check that they have not accumulated any
    # gradient updates.
    weights = train.copy()
    weights.data = np.ones(train.getnnz(), dtype=np.float32)
    even_users = weights.row % 2 == 0
    weights.data *= even_users

    even_idx = np.arange(train.shape[0]) % 2 == 0
    odd_idx = np.arange(train.shape[0]) % 2 != 0

    for loss in ('logistic', 'bpr', 'warp'):
        model = LightFM(loss=loss, random_state=SEED)
        model.fit_partial(train, sample_weight=weights, epochs=1)

        assert np.allclose(model.user_embedding_gradients[odd_idx], 1.0)
        assert np.allclose(model.user_bias_gradients[odd_idx], 1.0)
        assert not np.allclose(model.user_embedding_gradients[even_idx], 1.0)
        assert not np.allclose(model.user_bias_gradients[even_idx], 1.0)
def test_auc_score():
    no_users, no_items = (10, 100)
    train, test = _generate_data(no_users, no_items)

    model = LightFM(loss='bpr')
    model.fit_partial(train)

    auc = evaluation.auc_score(model, test, num_threads=2)
    expected_auc = np.array(_auc(model, test))

    assert auc.shape == expected_auc.shape
    assert np.abs(auc.mean() - expected_auc.mean()) < 0.01
    assert len(auc) == (test.getnnz(axis=1) > 0).sum()
    assert len(evaluation.auc_score(model, train,
                                    preserve_rows=True)) == test.shape[0]

    # Omitting train interactions
    auc = evaluation.auc_score(model, test,
                               train_interactions=train,
                               num_threads=2)
    expected_auc = np.array(_auc(model, test, train))
    assert np.abs(auc.mean() - expected_auc.mean()) < 0.01
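# `_auc` is a helper defined elsewhere in the test module. A minimal
# sketch of a per-user reference implementation it plausibly matches;
# the exact masking of train positives is an assumption:
def _auc(model, ground_truth, train=None):
    ground_truth = ground_truth.tocsr()
    train = train.tocsr() if train is not None else None
    no_users, no_items = ground_truth.shape

    scores = []
    for user_id in range(no_users):
        positives = ground_truth[user_id].indices
        if len(positives) == 0:
            # Users without test interactions are skipped,
            # matching the length assertion above.
            continue
        predictions = model.predict(np.repeat(user_id, no_items),
                                    np.arange(no_items))
        labels = np.zeros(no_items)
        labels[positives] = 1
        if train is not None:
            # Exclude train positives so they do not count as negatives
            mask = np.ones(no_items, dtype=bool)
            mask[train[user_id].indices] = False
            mask[positives] = True
            predictions, labels = predictions[mask], labels[mask]
        scores.append(roc_auc_score(labels, predictions))

    return scores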
def test_input_dtypes():
    dtypes = (np.int32, np.int64, np.float32, np.float64)

    no_users, no_items = (10, 100)
    no_features = 20

    for dtype in dtypes:
        train = sp.coo_matrix((no_users, no_items), dtype=dtype)
        user_features = sp.coo_matrix((no_users, no_features), dtype=dtype)
        item_features = sp.coo_matrix((no_items, no_features), dtype=dtype)

        model = LightFM()
        model.fit_partial(train,
                          user_features=user_features,
                          item_features=item_features)

        model.predict(np.random.randint(0, no_users, 10).astype(np.int32),
                      np.random.randint(0, no_items, 10).astype(np.int32),
                      user_features=user_features,
                      item_features=item_features)
def test_matrix_types():
    mattypes = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix)
    dtypes = (np.int32, np.int64, np.float32, np.float64)

    no_users, no_items = (10, 100)
    no_features = 20

    for mattype in mattypes:
        for dtype in dtypes:
            train = mattype((no_users, no_items), dtype=dtype)
            user_features = mattype((no_users, no_features), dtype=dtype)
            item_features = mattype((no_items, no_features), dtype=dtype)

            model = LightFM()
            model.fit_partial(train,
                              user_features=user_features,
                              item_features=item_features)

            model.predict(
                np.random.randint(0, no_users, 10).astype(np.int32),
                np.random.randint(0, no_items, 10).astype(np.int32),
                user_features=user_features,
                item_features=item_features,
            )
def test_warp_kos_precision():
    # Remove all negative examples
    training = train.copy()
    training.data[training.data < 1] = 0
    training = training.tocsr()
    training.eliminate_zeros()

    model = LightFM(learning_rate=0.05, k=5,
                    loss='warp-kos',
                    random_state=SEED)
    model.fit_partial(training, epochs=10)

    (train_precision, test_precision,
     full_train_auc, full_test_auc) = _get_metrics(model, train, test)

    assert train_precision > 0.44
    assert test_precision > 0.06
    assert full_train_auc > 0.9
    assert full_test_auc > 0.87
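# `_get_metrics` is defined outside this section. A plausible sketch,
# assuming it wraps precision_at_k and auc_score from lightfm.evaluation
# after dropping negative interactions:
def _get_metrics(model, train_set, test_set):
    from lightfm.evaluation import auc_score, precision_at_k

    train_set = train_set.tocsr()
    test_set = test_set.tocsr()

    # Keep only positive interactions for ranking metrics
    train_set.data[train_set.data < 0] = 0.0
    test_set.data[test_set.data < 0] = 0.0
    train_set.eliminate_zeros()
    test_set.eliminate_zeros()

    return (precision_at_k(model, train_set).mean(),
            precision_at_k(model, test_set).mean(),
            auc_score(model, train_set).mean(),
            auc_score(model, test_set).mean())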
def test_feature_inference_fails():
    # On predict, if we use feature inference and supply ids higher
    # than the number of features supplied to fit, we should complain.
    no_users, no_items = (10, 100)
    no_features = 20

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)
    user_features = sp.csr_matrix((no_users, no_features), dtype=np.int32)
    item_features = sp.csr_matrix((no_items, no_features), dtype=np.int32)
    model = LightFM()
    model.fit_partial(train,
                      user_features=user_features,
                      item_features=item_features)

    with pytest.raises(AssertionError):
        model.predict(np.array([no_features], dtype=np.int32),
                      np.array([no_features], dtype=np.int32))
def test_warp_kos_precision():
    # Remove all negative examples
    training = train.copy()
    training.data[training.data < 1] = 0
    training = training.tocsr()
    training.eliminate_zeros()

    model = LightFM(learning_rate=0.05, k=5, loss='warp-kos')
    model.fit_partial(training, epochs=10)

    train_precision = precision_at_k(model, training, 10)
    test_precision = precision_at_k(model, test, 10)

    full_train_auc = full_auc(model, training)
    full_test_auc = full_auc(model, test)

    assert train_precision > 0.44
    assert test_precision > 0.06
    assert full_train_auc > 0.9
    assert full_test_auc > 0.87
def test_movielens_accuracy_sample_weights():
    # Scaling weights down and learning rate up
    # by the same amount should result in
    # roughly the same accuracy
    scale = 1e-01
    weights = train.copy()
    weights.data = np.ones(train.getnnz(), dtype=np.float32) * scale

    for (loss, exp_score) in (('logistic', 0.74),
                              ('bpr', 0.84),
                              ('warp', 0.89)):
        model = LightFM(loss=loss, random_state=SEED)
        # Compensate for the down-scaled weights by scaling
        # the learning rate up by the same factor.
        model.learning_rate *= 1.0 / scale
        model.fit_partial(train, sample_weight=weights, epochs=10)

        (train_precision, test_precision,
         full_train_auc, full_test_auc) = _get_metrics(model, train, test)

        assert full_train_auc > exp_score
def test_warp_precision_adadelta_multithreaded():
    model = LightFM(learning_schedule='adadelta',
                    rho=0.95,
                    epsilon=0.000001,
                    loss='warp')
    model.fit_partial(train, epochs=10, num_threads=4)

    train_precision = precision_at_k(model, train, 10)
    test_precision = precision_at_k(model, test, 10)

    full_train_auc = full_auc(model, train)
    full_test_auc = full_auc(model, test)

    assert train_precision > 0.45
    assert test_precision > 0.07
    assert full_train_auc > 0.94
    assert full_test_auc > 0.9
def test_training_schedules():
    model = LightFM(no_components=10,
                    learning_schedule='adagrad',
                    random_state=SEED)
    model.fit_partial(train, epochs=0)

    assert (model.item_embedding_gradients == 1).all()
    assert (model.item_embedding_momentum == 0).all()
    assert (model.item_bias_gradients == 1).all()
    assert (model.item_bias_momentum == 0).all()
    assert (model.user_embedding_gradients == 1).all()
    assert (model.user_embedding_momentum == 0).all()
    assert (model.user_bias_gradients == 1).all()
    assert (model.user_bias_momentum == 0).all()

    model.fit_partial(train, epochs=1)

    assert (model.item_embedding_gradients > 1).any()
    assert (model.item_embedding_momentum == 0).all()
    assert (model.item_bias_gradients > 1).any()
    assert (model.item_bias_momentum == 0).all()
    assert (model.user_embedding_gradients > 1).any()
    assert (model.user_embedding_momentum == 0).all()
    assert (model.user_bias_gradients > 1).any()
    assert (model.user_bias_momentum == 0).all()

    model = LightFM(no_components=10,
                    learning_schedule='adadelta',
                    random_state=SEED)
    model.fit_partial(train, epochs=0)

    assert (model.item_embedding_gradients == 0).all()
    assert (model.item_embedding_momentum == 0).all()
    assert (model.item_bias_gradients == 0).all()
    assert (model.item_bias_momentum == 0).all()
    assert (model.user_embedding_gradients == 0).all()
    assert (model.user_embedding_momentum == 0).all()
    assert (model.user_bias_gradients == 0).all()
    assert (model.user_bias_momentum == 0).all()

    model.fit_partial(train, epochs=1)

    assert (model.item_embedding_gradients > 0).any()
    assert (model.item_embedding_momentum > 0).any()
    assert (model.item_bias_gradients > 0).any()
    assert (model.item_bias_momentum > 0).any()
    assert (model.user_embedding_gradients > 0).any()
    assert (model.user_embedding_momentum > 0).any()
    assert (model.user_bias_gradients > 0).any()
    assert (model.user_bias_momentum > 0).any()
def test_empty_matrix():
    no_users, no_items = (10, 100)

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    model = LightFM()
    model.fit_partial(train)
def test_return_self():
    no_users, no_items = (10, 100)

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    model = LightFM()
    assert model.fit_partial(train) is model
    assert model.fit(train) is model
def fit_lightfm_model():
    """ Fit the LightFM model.

    Returns d_user_pred, list_user, list_coupon:
        list_coupon = list of test coupons
        list_user = list of user IDs
        d_user_pred : key = user, value = predicted ranking
            of the coupons in list_coupon
    """
    # Load data
    Mui_train = spi.mmread("../Data/Data_translated/biclass_user_item_train_mtrx.mtx")
    uf = spi.mmread("../Data/Data_translated/user_feat_mtrx.mtx")
    itrf = spi.mmread("../Data/Data_translated/train_item_feat_mtrx.mtx")
    itef = spi.mmread("../Data/Data_translated/test_item_feat_mtrx.mtx")

    # Print shapes as a check
    print("user_features shape: %s,\nitem train features shape: %s,\n"
          "item test features shape: %s" % (uf.shape, itrf.shape, itef.shape))

    # Load test coupon and user lists
    cplte = pd.read_csv("../Data/Data_translated/coupon_list_test_translated.csv")
    ulist = pd.read_csv("../Data/Data_translated/user_list_translated.csv")
    list_coupon = cplte["COUPON_ID_hash"].values
    list_user = ulist["USER_ID_hash"].values

    # Build model
    no_comp, lr, ep = 10, 0.01, 5
    model = LightFM(no_components=no_comp, learning_rate=lr, loss='warp')
    model.fit_partial(Mui_train,
                      user_features=uf,
                      item_features=itrf,
                      epochs=ep,
                      num_threads=4,
                      verbose=True)

    test = sps.csr_matrix((len(list_user), len(list_coupon)), dtype=np.int32)
    no_users, no_items = test.shape
    pid_array = np.arange(no_items, dtype=np.int32)

    # Create and initialise dict to store predictions
    d_user_pred = {}
    for user in list_user:
        d_user_pred[user] = []

    # Loop over users and compute predictions
    for user_id, row in enumerate(test):
        sys.stdout.write("\rProcessing user " + str(user_id) + "/ " + str(len(list_user)))
        sys.stdout.flush()
        uid_array = np.empty(no_items, dtype=np.int32)
        uid_array.fill(user_id)
        predictions = model.predict(uid_array, pid_array,
                                    user_features=uf,
                                    item_features=itef,
                                    num_threads=4)
        user = str(list_user[user_id])
        # Apply MinMaxScaler for blending later on
        # (scalers expect a 2D array, hence the reshape)
        MMS = MinMaxScaler()
        pred = MMS.fit_transform(np.ravel(predictions).reshape(-1, 1)).ravel()
        d_user_pred[user] = pred

    # Pickle the predictions for future use
    d_pred = {"list_coupon": list_coupon.tolist(), "d_user_pred": d_user_pred}
    with open("../Data/Data_translated/d_pred_lightfm.pickle", "wb") as f:
        pickle.dump(d_pred, f, protocol=pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon
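# Hypothetical driver for the function above: fit the model and
# inspect one user's highest-ranked coupons.
if __name__ == "__main__":
    d_user_pred, list_user, list_coupon = fit_lightfm_model()

    first_user = str(list_user[0])
    top_idx = np.argsort(-d_user_pred[first_user])[:10]
    print("Top coupons for user %s: %s" % (first_user, list_coupon[top_idx]))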
def test_state_reset():
    model = LightFM()
    model.fit(train, epochs=1)

    assert np.mean(model.user_embedding_gradients) > 1.0

    model.fit(train, epochs=0)
    assert np.all(model.user_embedding_gradients == 1.0)
def test_not_enough_features_fails():
    no_users, no_items = (10, 100)
    no_features = 20

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    user_features = sp.csr_matrix((no_users - 1, no_features), dtype=np.int32)
    item_features = sp.csr_matrix((no_items - 1, no_features), dtype=np.int32)
    model = LightFM()
    with pytest.raises(Exception):
        model.fit_partial(train,
                          user_features=user_features,
                          item_features=item_features)
def test_warp_stability():
    learning_rates = (0.05, 0.1, 0.5)

    for lrate in learning_rates:
        model = LightFM(learning_rate=lrate, loss='warp')
        model.fit_partial(train, epochs=10)

        assert not np.isnan(model.user_embeddings).any()
        assert not np.isnan(model.item_embeddings).any()
def test_predict():
    no_users, no_items = (10, 100)

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    model = LightFM()
    model.fit_partial(train)

    for uid in range(no_users):
        scores_arr = model.predict(np.repeat(uid, no_items),
                                   np.arange(no_items))
        scores_int = model.predict(uid, np.arange(no_items))
        assert np.allclose(scores_arr, scores_int)
def test_movielens_accuracy_fit():
    model = LightFM(random_state=SEED)
    model.fit(train, epochs=10)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_predict(num_threads=2):
    no_users, no_items = (10, 100)

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    model = LightFM()
    model.fit_partial(train)

    for uid in range(no_users):
        scores_arr = model.predict(np.repeat(uid, no_items),
                                   np.arange(no_items))
        scores_int = model.predict(uid, np.arange(no_items))
        assert np.allclose(scores_arr, scores_int)

        scores_parallel = model.predict(np.repeat(uid, no_items),
                                        np.arange(no_items),
                                        num_threads=num_threads)
        assert np.allclose(scores_parallel, scores_arr)

        scores_no_prec = model.predict(np.repeat(uid, no_items),
                                       np.arange(no_items),
                                       num_threads=num_threads,
                                       precompute_representations=False)
        assert np.allclose(scores_parallel, scores_no_prec)

        scores_no_prec_serial = model.predict(np.repeat(uid, no_items),
                                              np.arange(no_items),
                                              num_threads=1,
                                              precompute_representations=False)
        assert np.allclose(scores_parallel, scores_no_prec_serial)
def test_movielens_accuracy_resume():
    model = LightFM(random_state=SEED)

    for _ in range(10):
        model.fit_partial(train, epochs=1)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_movielens_accuracy_pickle():
    model = LightFM()
    model.fit(train, epochs=10)

    model = pickle.loads(pickle.dumps(model))

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_regularization():
    # Let's regularize
    model = LightFM(no_components=50,
                    item_alpha=0.0001,
                    user_alpha=0.0001)
    model.fit_partial(train, epochs=30)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.80
    assert roc_auc_score(test.data, test_predictions) > 0.75
def test_overfitting():
    # Let's massively overfit
    model = LightFM(no_components=50)
    model.fit_partial(train, epochs=30)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)
    overfit_train = roc_auc_score(train.data, train_predictions)
    overfit_test = roc_auc_score(test.data, test_predictions)

    assert overfit_train > 0.99
    assert overfit_test < 0.75
def test_movielens_excessive_regularization():
    # Should perform poorly with high regularization
    model = LightFM(no_components=10,
                    item_alpha=1.0,
                    user_alpha=1.0)
    model.fit_partial(train, epochs=10)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) < 0.6
    assert roc_auc_score(test.data, test_predictions) < 0.6
def test_zeros_negative_accuracy():
    # Should get the same accuracy when zeros are used to
    # denote negative interactions
    train.data[train.data == -1] = 0
    model = LightFM()
    model.fit_partial(train, epochs=10)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_sklearn_api():
    model = LightFM()
    params = model.get_params()
    model2 = LightFM(**params)
    params2 = model2.get_params()
    assert params == params2
    model.set_params(**params)
    params['invalid_param'] = 666
    with pytest.raises(ValueError):
        model.set_params(**params)
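# Because LightFM implements get_params/set_params, it should also
# compose with scikit-learn utilities built on that contract. A small
# sketch, assuming scikit-learn is available; this test is not part
# of the original suite:
def test_sklearn_clone():
    from sklearn.base import clone

    model = LightFM(loss='warp', no_components=20)
    model_clone = clone(model)

    # clone builds a fresh, unfitted estimator with identical parameters
    assert model_clone.get_params() == model.get_params()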
def test_logistic_precision():
    model = LightFM(random_state=SEED)
    model.fit_partial(train, epochs=10)

    (train_precision, test_precision,
     full_train_auc, full_test_auc) = _get_metrics(model, train, test)

    assert train_precision > 0.3
    assert test_precision > 0.03
    assert full_train_auc > 0.79
    assert full_test_auc > 0.73
def test_random_state_advanced():
    # Check that using the random state
    # to seed rand_r in Cython advances
    # the random generator state.
    model = LightFM(learning_rate=0.05,
                    loss='warp',
                    random_state=SEED)
    model.fit_partial(train, epochs=1)

    rng_state = model.random_state.get_state()[1].copy()

    model.fit_partial(train, epochs=1)

    assert not np.all(rng_state == model.random_state.get_state()[1])
def test_movielens_accuracy_pickle():
    model = LightFM(random_state=SEED)
    model.fit(train, epochs=10)

    model = pickle.loads(pickle.dumps(model))

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_coo_with_duplicate_entries():
    # Calling .tocsr on a COO matrix with duplicate entries
    # changes its data arrays in-place, leading to out-of-bounds
    # array accesses in the WARP code.
    # Reported in https://github.com/lyst/lightfm/issues/117.
    rows, cols = 1000, 100
    mat = sp.random(rows, cols)
    mat.data[:] = 1

    # Duplicate entries in the COO matrix
    mat.data = np.concatenate((mat.data, mat.data[:1000]))
    mat.row = np.concatenate((mat.row, mat.row[:1000]))
    mat.col = np.concatenate((mat.col, mat.col[:1000]))

    for loss in ('warp', 'bpr', 'warp-kos'):
        model = LightFM(loss=loss)
        model.fit(mat)
def test_random_state_fixing():
    model = LightFM(learning_rate=0.05,
                    loss='warp',
                    random_state=SEED)
    model.fit_partial(train, epochs=2)

    model_2 = LightFM(learning_rate=0.05,
                      loss='warp',
                      random_state=SEED)
    model_2.fit_partial(train, epochs=2)

    assert np.all(model.user_embeddings == model_2.user_embeddings)
    assert np.all(model.item_embeddings == model_2.item_embeddings)
def test_movielens_accuracy():
    model = LightFM()
    model.fit_partial(train, epochs=10)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_overfitting():
    # Let's massively overfit
    model = LightFM(no_components=50, random_state=SEED)
    model.fit_partial(train, epochs=30)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)
    overfit_train = roc_auc_score(train.data, train_predictions)
    overfit_test = roc_auc_score(test.data, test_predictions)

    assert overfit_train > 0.99
    assert overfit_test < 0.75
def test_zeros_negative_accuracy():
    # Should get the same accuracy when zeros are used to
    # denote negative interactions
    train.data[train.data == -1] = 0
    model = LightFM(random_state=SEED)
    model.fit_partial(train, epochs=10)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_matrix_types():
    mattypes = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix)
    dtypes = (np.int32, np.int64, np.float32, np.float64)

    no_users, no_items = (10, 100)
    no_features = 20

    for mattype in mattypes:
        for dtype in dtypes:
            train = mattype((no_users, no_items), dtype=dtype)
            user_features = mattype((no_users, no_features), dtype=dtype)
            item_features = mattype((no_items, no_features), dtype=dtype)

            model = LightFM()
            model.fit_partial(train,
                              user_features=user_features,
                              item_features=item_features)

            model.predict(np.random.randint(0, no_users, 10).astype(np.int32),
                          np.random.randint(0, no_items, 10).astype(np.int32),
                          user_features=user_features,
                          item_features=item_features)

            model.predict_rank(train,
                               user_features=user_features,
                               item_features=item_features)
def test_precision_at_k_with_ties():
    no_users, no_items = (10, 100)
    train, test = _generate_data(no_users, no_items)

    model = LightFM(loss="bpr")
    model.fit_partial(train)

    # Make all predictions zero
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    k = 10
    precision = evaluation.precision_at_k(model, test, k=k)

    # Pessimistic precision with all ties
    assert precision.mean() == 0.0
def test_full_batch_predict_wo_features():
    no_components = 2
    top_k = 5
    ds = RandomDataset(density=1.0)

    model = LightFM(no_components=no_components)
    model.fit_partial(ds.train)

    user_ids = [0, 1, 2]

    # Single process
    model.batch_setup({0: ds.item_ids})
    recoms = model.batch_predict(
        user_ids=user_ids,
        chunk_id=0,
        top_k=top_k,
    )
    for user_id in user_ids:
        assert user_id in recoms
        assert len(recoms[user_id][0]) == top_k
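# RandomDataset is a fixture defined outside this section. A rough
# sketch of the shape the batch-predict tests above and below expect;
# the default sizes here are assumptions:
class RandomDataset(object):

    def __init__(self, no_users=30, no_items=20, density=0.5):
        self.no_users = no_users
        self.no_items = no_items
        self.item_ids = np.arange(no_items, dtype=np.int32)

        # Random binary interaction matrix
        self.train = sp.random(no_users, no_items,
                               density=density, format='coo')
        self.train.data[:] = 1

        # Identity features reduce to a pure collaborative model
        self.user_features = sp.identity(no_users, format='csr')
        self.item_features = sp.identity(no_items, format='csr')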
def main():
    current_stage = 6
    model = LightFM(no_components=30)
    dataset = Dataset()

    for c in range(0, current_stage + 1):
        click_train = pd.read_csv(
            train_path + "/underexpose_train_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        click_test = pd.read_csv(
            test_path + "/underexpose_test_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        dataset.fit_partial(click_train["user_id"], click_train["item_id"])

    num_users, num_items = dataset.interactions_shape()
    log('Num users: {}, num_items {}.'.format(num_users, num_items))
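    # Hypothetical continuation, not in the original snippet: build the
    # interaction matrix from the clicks loaded above and fit the model.
    # Note this sketch only uses the final stage's click_train frame.
    interactions, _ = dataset.build_interactions(
        (row.user_id, row.item_id) for row in click_train.itertuples()
    )
    model.fit(interactions, epochs=10, num_threads=4)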
def test_random_state_advanced():
    # Check that using the random state
    # to seed rand_r in Cython advances
    # the random generator state.
    model = LightFM(learning_rate=0.05,
                    loss='warp',
                    random_state=SEED)
    model.fit_partial(train, epochs=1)

    rng_state = model.rng.get_state()[1].copy()

    model.fit_partial(train, epochs=1)

    assert not np.all(rng_state == model.rng.get_state()[1])
def test_regularization():
    # Let's regularize
    model = LightFM(no_components=50,
                    item_alpha=0.0001,
                    user_alpha=0.0001,
                    random_state=SEED)
    model.fit_partial(train, epochs=30)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.80
    assert roc_auc_score(test.data, test_predictions) > 0.75
def test_movielens_excessive_regularization():
    # Should perform poorly with high regularization
    model = LightFM(no_components=10,
                    item_alpha=1.0,
                    user_alpha=1.0,
                    random_state=SEED)
    model.fit_partial(train, epochs=10)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) < 0.6
    assert roc_auc_score(test.data, test_predictions) < 0.6
def test_full_batch_predict():
    no_components = 2
    top_k = 5
    ds = RandomDataset()

    model = LightFM(no_components=no_components)
    model.fit_partial(ds.train,
                      user_features=ds.user_features,
                      item_features=ds.item_features)

    user_ids = [0, 1, 2]
    chunks = {0: ds.item_ids}

    # Single process
    model.batch_setup(item_chunks=chunks,
                      user_features=ds.user_features,
                      item_features=ds.item_features,
                      n_process=1)
    recoms = model.batch_predict(
        user_ids=user_ids,
        chunk_id=0,
        top_k=top_k,
    )
    for user_id in user_ids:
        assert user_id in recoms
        assert len(recoms[user_id][0]) == top_k

    initial_recoms = recoms
    model.batch_cleanup()

    # Multiple processes
    model.batch_setup(item_chunks=chunks,
                      user_features=ds.user_features,
                      item_features=ds.item_features,
                      n_process=2)
    recoms = model.batch_predict(
        user_ids=user_ids,
        chunk_id=0,
        top_k=top_k,
    )
    for user_id in user_ids:
        assert user_id in recoms
        assert_array_almost_equal(recoms[user_id], initial_recoms[user_id])
def test_feature_inference_fails():
    # On predict, if we use feature inference and supply ids higher
    # than the number of features supplied to fit, we should complain.
    no_users, no_items = 10, 100
    no_features = 20

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)
    user_features = sp.csr_matrix((no_users, no_features), dtype=np.int32)
    item_features = sp.csr_matrix((no_items, no_features), dtype=np.int32)
    model = LightFM()
    model.fit_partial(train,
                      user_features=user_features,
                      item_features=item_features)

    with pytest.raises(ValueError):
        model.predict(np.array([no_features], dtype=np.int32),
                      np.array([no_features], dtype=np.int32))
def test_input_dtypes():
    dtypes = (np.int32, np.int64, np.float32, np.float64)

    no_users, no_items = 10, 100
    no_features = 20

    for dtype in dtypes:
        train = sp.coo_matrix((no_users, no_items), dtype=dtype)
        user_features = sp.coo_matrix((no_users, no_features), dtype=dtype)
        item_features = sp.coo_matrix((no_items, no_features), dtype=dtype)

        model = LightFM()
        model.fit_partial(train,
                          user_features=user_features,
                          item_features=item_features)

        model.predict(
            np.random.randint(0, no_users, 10).astype(np.int32),
            np.random.randint(0, no_items, 10).astype(np.int32),
            user_features=user_features,
            item_features=item_features,
        )
def obtener_modelo_gui(self, lista_param):
    """ Method obtener_modelo_gui.

    Builds the chosen model from the parameters passed in.
    This method is only used in the web interface.

    Parameters
    ----------

    lista_param: list
        list containing the parameters chosen by the user
        to create the model.
    """
    global modelo

    # Store the parameters in named variables for readability
    no_components = lista_param[0]
    k = lista_param[1]
    n = lista_param[2]
    learning_schedule = lista_param[3]
    loss = lista_param[4]
    learning_rate = lista_param[5]
    rho = lista_param[6]
    epsilon = lista_param[7]
    item_alpha = lista_param[8]
    user_alpha = lista_param[9]
    max_sampled = lista_param[10]

    # Instantiate the model with the parameters above
    modelo = LightFM(no_components=no_components, k=k, n=n,
                     learning_schedule=learning_schedule, loss=loss,
                     learning_rate=learning_rate, rho=rho, epsilon=epsilon,
                     item_alpha=item_alpha, user_alpha=user_alpha,
                     max_sampled=max_sampled)
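# Hypothetical parameter list in the positional order the method
# expects; the values shown are LightFM's constructor defaults,
# except for the WARP loss:
lista_param = [10,         # no_components
               5,          # k
               10,         # n
               'adagrad',  # learning_schedule
               'warp',     # loss
               0.05,       # learning_rate
               0.95,       # rho
               1e-06,      # epsilon
               0.0,        # item_alpha
               0.0,        # user_alpha
               10]         # max_sampled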
def test_predict_ranks():
    no_users, no_items = 10, 100

    train = sp.rand(no_users, no_items, format='csr', random_state=42)

    model = LightFM()
    model.fit_partial(train)

    # Compute ranks for all items
    rank_input = sp.csr_matrix(np.ones((no_users, no_items)))
    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == no_items - 1)

    for row in range(no_users):
        assert np.all(np.sort(ranks[row]) == np.arange(no_items))

    # Train set exclusions. All ranks should be zero
    # if train interactions is dense.
    ranks = model.predict_rank(rank_input,
                               train_interactions=rank_input).todense()
    assert np.all(ranks == 0)

    # Max rank should be num_items - 1 - number of positives
    # in train in that row
    ranks = model.predict_rank(rank_input,
                               train_interactions=train).todense()
    assert np.all(
        np.squeeze(np.array(ranks.max(axis=1))) ==
        no_items - 1 - np.squeeze(np.array(train.getnnz(axis=1))))

    # Make sure ranks are computed pessimistically when
    # there are ties (that is, equal predictions for every
    # item will assign maximum rank to each).
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 99)
    assert np.all(ranks.max(axis=1) == 99)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_rank(sp.csr_matrix((5, 5)), num_threads=2)
def evaluate_ground_truth(self, model: LightFM, test: np.ndarray) -> pd.DataFrame:
    """Evaluate a recommender by the ground truth preference labels."""
    results = {}
    users = test[:, 0].astype(int)
    items = test[:, 1].astype(int)
    cv = np.zeros(test.shape[0]) if "ml" in self.data else test[:, 2]
    cvr = test[:, -1] if "ml" in self.data else np.zeros(test.shape[0])
    ct = np.zeros(test.shape[0]) if "ml" in self.data else test[:, 3]

    for _k in self.k:
        for metric in self.metrics:
            results[f"{metric}@{_k}"] = []

    for user in set(users):
        indices = users == user
        items_for_current_user = items[indices]
        cvr_for_current_user = cvr[indices]
        ct_for_current_user = ct[indices]
        cv_for_current_user = cv[indices]

        # Predict a ranking score for each of the user's items
        scores = model.predict(user_ids=int(user),
                               item_ids=items_for_current_user)

        # Calculate ranking metrics
        for _k in self.k:
            for metric, metric_func in self.metrics.items():
                results[f"{metric}@{_k}"].append(
                    metric_func(
                        cv=cv_for_current_user,
                        ct=ct_for_current_user,
                        cv_hat=cvr_for_current_user,
                        score=scores,
                        k=_k,
                    ))

    # Aggregate results
    gt_results = pd.DataFrame(index=results.keys())
    gt_results["gt"] = list(map(np.mean, list(results.values())))

    return gt_results
def test_batch_predict_user_recs_per_user():
    no_components = 2
    ds = RandomDataset()

    model = LightFM(no_components=no_components)
    model.fit_partial(ds.train,
                      user_features=ds.user_features,
                      item_features=ds.item_features)
    model.batch_setup(
        item_chunks={0: ds.item_ids},
        user_features=ds.user_features,
        item_features=ds.item_features,
    )
    for uid in range(ds.no_users):
        rec_item_ids, rec_scores = model.predict_for_user(
            user_id=uid,
            top_k=5,
            item_ids=ds.item_ids,
        )
        assert len(rec_scores) == 5
        # Scores should come back sorted in descending order
        assert_array_almost_equal(rec_scores, -1 * np.sort(-1 * rec_scores))
def test_hogwild_accuracy():
    # Should get comparable accuracy with 2 threads
    model = LightFM()
    model.fit_partial(train, epochs=10, num_threads=2)

    train_predictions = model.predict(train.row, train.col, num_threads=2)
    test_predictions = model.predict(test.row, test.col, num_threads=2)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_zero_weights_accuracy():
    # When zero weights are used, accuracy
    # should be no better than random.
    weights = train.copy()
    weights.data = np.zeros(train.getnnz(), dtype=np.float32)

    for loss in ('logistic', 'bpr', 'warp'):
        model = LightFM(loss=loss, random_state=SEED)
        model.fit_partial(train, sample_weight=weights, epochs=10)

        train_predictions = model.predict(train.row, train.col)
        test_predictions = model.predict(test.row, test.col)

        assert 0.45 < roc_auc_score(train.data, train_predictions) < 0.55
        assert 0.45 < roc_auc_score(test.data, test_predictions) < 0.55
def test_regression_full_batch_predict():
    no_components = 2
    np.random.seed(42)
    ds = RandomDataset(no_items=5, density=1)

    model = LightFM(no_components=no_components)
    model.fit(ds.train,
              user_features=ds.user_features,
              item_features=ds.item_features)

    # Set non-zero biases
    model.item_biases += 0.2
    model.user_biases += 0.5

    user_ids = [0, 1, 2]
    model.batch_setup(item_chunks={0: ds.item_ids},
                      item_features=ds.item_features,
                      user_features=ds.user_features)
    recoms = model.batch_predict(
        user_ids=user_ids,
        chunk_id=0,
        top_k=0,  # Score all items
    )

    n_nonzero = 0
    for user_id in user_ids:
        scores = model.predict(
            user_ids=user_id,
            item_ids=ds.item_ids,
            item_features=ds.item_features,
            user_features=ds.user_features,
            num_threads=1,
        )
        if sum(scores) != 0:
            n_nonzero += 1
        assert_array_almost_equal(recoms[user_id][1], scores)

    # At least one user must have non-zero scores for the
    # comparison above to be meaningful.
    assert n_nonzero != 0
def test_user_supplied_features_accuracy():
    model = LightFM()
    model.fit_partial(train,
                      user_features=train_user_features,
                      item_features=train_item_features,
                      epochs=10)

    train_predictions = model.predict(train.row, train.col,
                                      user_features=train_user_features,
                                      item_features=train_item_features)
    test_predictions = model.predict(test.row, test.col,
                                     user_features=test_user_features,
                                     item_features=test_item_features)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_movielens_genre_accuracy():
    item_features = movielens_data.get_movielens_item_metadata(
        use_item_ids=False)

    assert item_features.shape[1] < item_features.shape[0]

    model = LightFM()
    model.fit_partial(train, item_features=item_features, epochs=10)

    train_predictions = model.predict(train.row, train.col,
                                      item_features=item_features)
    test_predictions = model.predict(test.row, test.col,
                                     item_features=item_features)

    assert roc_auc_score(train.data, train_predictions) > 0.75
    assert roc_auc_score(test.data, test_predictions) > 0.69
def test_movielens_excessive_regularization():
    for loss in ('logistic', 'warp', 'bpr', 'warp-kos'):
        # Should perform poorly with high regularization.
        # Check that regularization does not accumulate
        # until it reaches infinity.
        model = LightFM(no_components=10,
                        item_alpha=1.0,
                        user_alpha=1.0,
                        loss=loss,
                        random_state=SEED)
        model.fit_partial(train, epochs=10, num_threads=4)

        train_predictions = model.predict(train.row, train.col)
        test_predictions = model.predict(test.row, test.col)

        assert roc_auc_score(train.data, train_predictions) < 0.65
        assert roc_auc_score(test.data, test_predictions) < 0.65
def test_movielens_genre_accuracy():
    item_features = fetch_movielens(indicator_features=False,
                                    genre_features=True)['item_features']

    assert item_features.shape[1] < item_features.shape[0]

    model = LightFM(random_state=SEED)
    model.fit_partial(train, item_features=item_features, epochs=10)

    train_predictions = model.predict(train.row, train.col,
                                      item_features=item_features)
    test_predictions = model.predict(test.row, test.col,
                                     item_features=item_features)

    assert roc_auc_score(train.data, train_predictions) > 0.75
    assert roc_auc_score(test.data, test_predictions) > 0.69
def validate(ctx, data_home):
    # Matrix creation validation
    df = load_movielens(data_home)
    dic = fetch_movielens(data_home, download_if_missing=True)
    train_o = dic['train']
    test_o = dic['test']

    train_df = df[df['is_train']]
    test_df = df[~df['is_train']]
    shape = (df.user_id.unique().shape[0], df.item_id.unique().shape[0])
    train_t = to_sparse_matrix(train_df.user_id.values,
                               train_df.item_id.values,
                               train_df.rating.values, shape)
    test_t = to_sparse_matrix(test_df.user_id.values,
                              test_df.item_id.values,
                              test_df.rating.values, shape)

    assert train_o.shape == train_t.shape
    assert np.array_equal(test_o.diagonal(), test_t.diagonal())

    model = LightFM(loss='warp')
    model.fit(train_o, epochs=10)
    train_precision, test_precision, train_auc, test_auc = \
        evaluate_model(model, train_o, test_o)

    model.fit(train_t, epochs=10)
    train_precision_t, test_precision_t, train_auc_t, test_auc_t = \
        evaluate_model(model, train_t, test_t)

    assert abs(train_precision - train_precision_t) < 2
    assert abs(test_precision - test_precision_t) < 2
    assert abs(train_auc - train_auc_t) < 2
    assert abs(test_auc - test_auc_t) < 2

    clf = LightWrapper(loss='warp', shape=shape)
    clf.fit(train_df[['user_id', 'item_id']].values, train_df.rating.values)
    train_precision_t, test_precision_t, train_auc_t, test_auc_t = \
        clf.evaluate(test_df[['user_id', 'item_id']].values,
                     test_df.rating.values)

    assert abs(train_precision - train_precision_t) < 2
    assert abs(test_precision - test_precision_t) < 2
    assert abs(train_auc - train_auc_t) < 2
    assert abs(test_auc - test_auc_t) < 2

    random_search(clf,
                  df[['user_id', 'item_id', 'rating']].values,
                  [[train_df.index.values, test_df.index.values]],
                  param_dist={"epochs": [10], "learning_rate": [0.005]})