Пример #1
0
def test_bloom(compression_ratio, expected_rmse):
    """Fit an explicit model with Bloom-compressed embeddings; check test RMSE."""

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def _bloom_layer(num_entities):
        # Both user and item layers share dim/compression/hash settings.
        return BloomEmbedding(num_entities,
                              32,
                              compression_ratio=compression_ratio,
                              num_hash_functions=2)

    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=_bloom_layer(interactions.num_users),
                          item_embedding_layer=_bloom_layer(interactions.num_items))

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-5,
                                       representation=network,
                                       use_cuda=CUDA)

    model.fit(train)
    print(model)

    rmse = rmse_score(model, test)
    print(rmse)

    assert rmse - EPSILON < expected_rmse
Пример #2
0
def train_spotlight_models(train, test, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, is_save = False):
    """Grid-train ExplicitFactorizationModel over all hyper-parameter combos.

    Parameters
    ----------
    train, test, dataset_testing : spotlight.interactions.Interactions
        Training, evaluation and final-prediction interaction sets.
    embedding_dims, n_iters, batch_sizes, l2s, learning_rates : iterables
        Hyper-parameter grids; one model is trained per combination.
    is_save : bool, optional
        When True, each trained model is saved under ``models/``.

    Returns
    -------
    tuple
        ``(preds_train_trains, preds_train_tests, preds_tests,
        train_rmses, test_rmses)`` — per-model prediction lists for the
        three datasets, plus train/test RMSEs as numpy arrays.
    """
    from itertools import product

    # Accumulate RMSEs in plain lists (np.append copies the whole array on
    # every call, which is quadratic); convert to numpy arrays on return so
    # the return types match the previous behavior.
    train_rmse_list = []
    test_rmse_list = []
    # Per-model predictions on train, test and dataset_testing.
    preds_train_trains = []
    preds_train_tests = []
    preds_tests = []

    # itertools.product walks the grid in the same order as the original
    # nested loops: embedding_dim, n_iter, batch_size, l2, learning_rate.
    for embedding_dim, n_iter, batch_size, l2, learning_rate in product(
            embedding_dims, n_iters, batch_sizes, l2s, learning_rates):
        # Use the GPU when available, otherwise fall back to the CPU.
        model = ExplicitFactorizationModel(loss='regression',
                                           embedding_dim=embedding_dim,  # latent dimensionality
                                           n_iter=n_iter,  # number of epochs of training
                                           batch_size=batch_size,  # minibatch size
                                           l2=l2,  # strength of L2 regularization
                                           learning_rate=learning_rate,
                                           use_cuda=torch.cuda.is_available())

        # Announce which combination is being trained.
        print("embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
        model.fit(train, verbose=True)

        # Evaluate on both splits.
        train_rmse = rmse_score(model, train)
        test_rmse = rmse_score(model, test)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)
        print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))

        # Optionally persist the trained model to disk.
        if is_save:
            torch.save(model, "models/embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))

        # Store predictions for all three datasets.
        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))

    # RMSEs returned as numpy arrays, exactly as before.
    return (preds_train_trains, preds_train_tests, preds_tests,
            np.array(train_rmse_list), np.array(test_rmse_list))
Пример #3
0
def best_params_spotlight(losses,
                          n_iters,
                          batch_sizes,
                          l2s,
                          learning_rates,
                          embedding_dims,
                          train_data,
                          t=None):
    """Grid-search ExplicitFactorizationModel hyper-parameters.

    For every combination a model is trained on an internal 42-seeded split
    of ``train_data`` and its validation RMSE is recorded, together with the
    parameters that produced it.

    Parameters
    ----------
    losses, n_iters, batch_sizes, l2s, learning_rates, embedding_dims :
        Iterables forming the hyper-parameter grid.
    train_data : spotlight interactions dataset to split and train on.
    t : Timer, optional
        Timer used between combinations; created lazily when omitted
        (avoids the shared-mutable-default pitfall of ``t=Timer()``).

    Returns
    -------
    dict
        Maps each observed RMSE to the parameter dict (including the fitted
        model) that produced it.
    """
    from itertools import product

    if t is None:
        t = Timer()

    rmses = dict()
    t.start()
    # Same traversal order as the original six nested loops.
    for loss, n_iter, batch_size, l2, learning_rate, embedding_dim in product(
            losses, n_iters, batch_sizes, l2s, learning_rates, embedding_dims):
        # NOTE(review): the grid iterates over `losses` but the model is
        # hard-coded to loss='regression', as in the original — confirm intent.
        model = ExplicitFactorizationModel(
            loss='regression',
            embedding_dim=embedding_dim,  # latent dimensionality
            n_iter=n_iter,  # number of epochs of training
            batch_size=batch_size,  # minibatch size
            l2=l2,  # strength of L2 regularization
            learning_rate=learning_rate,
            use_cuda=torch.cuda.is_available())

        # Build a FRESH dict per combination. The original reused one
        # mutable `params` dict for every entry, so all values stored in
        # `rmses` aliased the same (last) parameter set.
        params = {'loss': loss,
                  'n_iter': n_iter,
                  'batch_size': batch_size,
                  'l2': l2,
                  'learning_rate': learning_rate,
                  'embedding_dim': embedding_dim,
                  'model': model}

        train_tr_data, test_tr_data = random_train_test_split(
            train_data,
            random_state=np.random.RandomState(42))

        model.fit(train_tr_data, verbose=True)

        rmse = rmse_score(model, test_tr_data)
        rmses[rmse] = params

        print(
            "-----------Time: {}, Loss: {}, n_iter: {}, l2: {}, batch_size: {}, learning_rate: {}, embedding_dim: {}, rmse: {}-------------\n\n"
            .format(t.stop(), loss, n_iter, l2, batch_size,
                    learning_rate, embedding_dim, rmse))
        # restart timer
        t.start()

    # Previously the collected results were discarded; return them so
    # callers can actually pick the best parameters.
    return rmses
Пример #4
0
def train(user_ids, item_ids, ratings, num_dimensions, verbose):
    """Fit a logistic explicit factorization model; return the user embeddings.

    Parameters
    ----------
    user_ids, item_ids : sequences of int — interaction pairs.
    ratings : sequence of float — interaction ratings/labels.
    num_dimensions : int — latent embedding dimensionality.
    verbose : bool — forwarded to ``model.fit``.

    Returns
    -------
    numpy.ndarray
        The learned user-embedding weight matrix, detached and moved to CPU.
    """
    dataset = Interactions(np.array(user_ids, dtype=np.int32),
                           np.array(item_ids, dtype=np.int32),
                           ratings=np.array(ratings, dtype=np.float32))

    # `False if x else True` is just the negated comparison; `device` is a
    # module-level torch device defined elsewhere in this file.
    is_cuda_available = device.type != 'cpu'

    m = ExplicitFactorizationModel(loss='logistic',
                                   use_cuda=is_cuda_available,
                                   embedding_dim=num_dimensions)
    m.fit(dataset, verbose=verbose)

    # Pull the trained embedding weights back to the CPU as a numpy array.
    user_embeddings = m._net.user_embeddings.weight.detach().cpu().numpy()

    return user_embeddings
Пример #5
0
def test_explicit_serialization(data):
    """Round-trip an explicit model through serialization; RMSE must not change."""

    train, test = data

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=3,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-5,
                                       use_cuda=CUDA)
    model.fit(train)

    # Score before and after reloading; the values must match exactly.
    before = rmse_score(model, test)
    after = rmse_score(_reload(model), test)

    assert before == after
Пример #6
0
def build_model(data, loss, embedding_dim, n_iter, batch_size, l2,
                learning_rate, **kwargs):
    """Train one explicit factorization model on a seeded split; return test RMSE."""
    factorizer = ExplicitFactorizationModel(
        loss=loss,
        embedding_dim=embedding_dim,  # latent dimensionality
        n_iter=n_iter,  # training epochs
        batch_size=batch_size,  # minibatch size
        l2=l2,  # L2 regularization strength
        learning_rate=learning_rate,
        use_cuda=torch.cuda.is_available())

    # Fixed seed keeps the train/test split reproducible across calls.
    rng = np.random.RandomState(42)
    train_split, test_split = random_train_test_split(data, random_state=rng)

    factorizer.fit(train_split, verbose=True)
    return rmse_score(factorizer, test_split)
Пример #7
0
def test_poisson():
    """A Poisson-loss explicit model should reach RMSE < 1.0 on MovieLens 100K."""

    dataset = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(dataset, random_state=RANDOM_STATE)

    poisson_model = ExplicitFactorizationModel(loss='poisson',
                                               n_iter=10,
                                               batch_size=1024,
                                               learning_rate=1e-3,
                                               l2=1e-6)
    poisson_model.fit(train)

    assert rmse_score(poisson_model, test) < 1.0
Пример #8
0
def test_poisson():
    """Verify the Poisson loss fits MovieLens 100K below 1.0 test RMSE."""

    data = movielens.get_movielens_dataset('100K')

    train_set, test_set = random_train_test_split(data,
                                                  random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='poisson',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train_set)

    rmse = rmse_score(model, test_set)

    assert rmse < 1.0
Пример #9
0
def test_check_input():
    """fit() must reject interactions whose ids exceed the fitted range."""
    dataset = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(dataset,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=1,  # a single epoch suffices
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    # Introduce a user id larger than anything seen during the first fit,
    # making the data incompatible with the trained model.
    train.user_ids[0] = train.user_ids.max() + 1
    with pytest.raises(ValueError):
        model.fit(train)
Пример #10
0
def train_spotlight_models_using_all_data(train, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, verbose=True):
    """Grid-train ExplicitFactorizationModel on the full training set.

    Every hyper-parameter combination is trained on ``train``, its
    predictions on ``dataset_testing`` are collected, and the trained model
    is saved under ``models_all_data/``.

    Parameters
    ----------
    train, dataset_testing : spotlight.interactions.Interactions
        Full training set and the set to predict on.
    embedding_dims, n_iters, batch_sizes, l2s, learning_rates : iterables
        Hyper-parameter grids; one model is trained per combination.
    verbose : bool, optional
        When True, print the current combination and fit progress.

    Returns
    -------
    list
        Per-model prediction arrays for ``dataset_testing``.
    """
    from itertools import product

    # Predictions on dataset_testing, one entry per trained model.
    preds_tests = []

    # itertools.product walks the grid in the same order as the original
    # nested loops: embedding_dim, n_iter, batch_size, l2, learning_rate.
    for embedding_dim, n_iter, batch_size, l2, learning_rate in product(
            embedding_dims, n_iters, batch_sizes, l2s, learning_rates):
        # Use the GPU when available, otherwise the CPU.
        model = ExplicitFactorizationModel(loss='regression',
                                           embedding_dim=embedding_dim,  # latent dimensionality
                                           n_iter=n_iter,  # number of epochs of training
                                           batch_size=batch_size,  # minibatch size
                                           l2=l2,  # strength of L2 regularization
                                           learning_rate=learning_rate,
                                           use_cuda=torch.cuda.is_available())

        if verbose:
            print("embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))

        # Fit on the full training data and record predictions.
        model.fit(train, verbose=verbose)
        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))

        # Persist the trained model to disk.
        torch.save(model, "models_all_data/embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))

    return preds_tests
Пример #11
0
def test_logistic():
    """Logistic-loss explicit model on binarized (-1/+1) MovieLens ratings."""

    interactions = movielens.get_movielens_dataset('100K')

    # Binarize ratings (>3 positive), then map {0, 1} -> {-1, 1} in one step.
    interactions.ratings = (interactions.ratings > 3).astype(np.float32) * 2 - 1

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='logistic',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    assert rmse_score(model, test) - EPSILON < 1.05
    def obtener_modelos(self):
        """
        Fetch, train and persist the chosen model.

        Trains the model selected via ``self.opcion_modelo`` (1 = explicit
        factorization, 2 = implicit factorization, otherwise a pooling
        sequence model) with default hyper-parameters, then saves it via
        ``guardar_modelos_dl``. Only used by the text interface.
        """
        
        # Fitted model and training data are shared module-level state.
        global train, modelo
        
        # Pick the model, train it with default parameters and save it.
        if self.opcion_modelo == 1:
            modelo = ExplicitFactorizationModel(loss='logistic', use_cuda=torch.cuda.is_available())
            modelo.fit(train, verbose=True)
            guardar_modelos_dl(modelo, 'el modelo de factorización explícito')
        elif self.opcion_modelo == 2:
            modelo = ImplicitFactorizationModel(loss='bpr', use_cuda=torch.cuda.is_available())
            modelo.fit(train, verbose=True)
            guardar_modelos_dl(modelo, 'el modelo de factorización implícito')
        else:
            # NOTE(review): the saved label says "secuencia explícito" but the
            # model is an ImplicitSequenceModel — wording looks inconsistent;
            # runtime string kept byte-identical.
            modelo = ImplicitSequenceModel(loss='bpr',  representation='pooling', use_cuda=torch.cuda.is_available())
            modelo.fit(train, verbose=True)
            guardar_modelos_dl(modelo, 'el modelo de secuencia explícito')
Пример #13
0
from spotlight.interactions import Interactions

# Build a Spotlight Interactions dataset from the `lite` frame's columns
# (user/item/rating/time); `lite` is defined earlier, outside this chunk.
user_ids = np.array(lite['user']).astype(np.int32)
item_ids = np.array(lite['item']).astype(np.int32)
ratings = np.array(lite['rating']).astype(np.float32)
times = np.array(lite['time']).astype(np.int32)
dataset = Interactions(user_ids, item_ids, ratings, times)

# Prepare train test
train, test = user_based_train_test_split(dataset)
# train, test = random_train_test_split(dataset)

# Test baseline
model = ExplicitFactorizationModel(n_iter=20)
model.fit(train, verbose=True)
print('RMSE', rmse_score(model, test))

from scipy.sparse import coo_matrix

# Sparse user x item rating matrix (CSR), consumed by the sequence model's fit.
ratings = coo_matrix((dataset.ratings, (dataset.user_ids, dataset.item_ids)),
                     shape=(dataset.num_users, dataset.num_items)).tocsr()

# Fixed-length sequence views of the splits; SEQ_LEN is defined elsewhere.
train_seq = train.to_sequence(SEQ_LEN)
test_seq = test.to_sequence(SEQ_LEN)

# NOTE(review): ExplicitSequenceModel taking a ratings matrix in fit() is not
# a stock Spotlight API — presumably a project-local class; verify the import.
model = ExplicitSequenceModel(n_iter=30, representation='lstm', batch_size=1)
model.fit(train_seq, ratings, verbose=True)

# Inspect the first sequence's user ids.
SEQ_ID = 0
user_batch = train_seq.user_ids[SEQ_ID]
Пример #14
0
# Mean of the reciprocal ranks computed earlier (outside this chunk).
MRR_fra_rating_5 = RR_fra_rating_5.mean()
print("MRR of fraction of 5* ratings: ", MRR_fra_rating_5)

"""####  So, the best model of Question 1 is Fraction of 5* ratings(fra_rating_5)

## Question 2
"""

from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.factorization.implicit import ImplicitFactorizationModel

#ExplicitFactorizationModel
emodel = ExplicitFactorizationModel(n_iter=10,
                                    embedding_dim=32, 
                                    use_cuda=False)
emodel.fit(exp_train, verbose=True)
# Score all items, then report mean reciprocal rank on the validation set.
score_emodel = scoreAll(emodel)
print(calc_reciprank(exp_validation, score_emodel, train=exp_train).mean())


#ImplicitFactorizationModel
imodel = ImplicitFactorizationModel(n_iter=10,
                                    loss='bpr',
                                    embedding_dim=32, 
                                    use_cuda=False)
imodel.fit(exp_train, verbose=True)
score_imodel_32_on_exp = scoreAll(imodel)
print(calc_reciprank(exp_validation, score_imodel_32_on_exp, train=exp_train).mean())

#ImplicitFactorizationModel is more effective
#tune the number of latent factors
# NOTE(review): item_ids/ratings are rebuilt here from `df` (presumably the
# beer-review frame); `user_ids` must already exist from earlier code — confirm.
item_ids = np.array(df['beer'])
ratings = np.array(df['rating']).astype('float32')


#Explicit Factorization Model
explicit_interactions = Interactions(user_ids, item_ids, ratings)
explicit_interactions.tocoo().todense().shape


explicit_model = ExplicitFactorizationModel(loss='regression',
                                   embedding_dim=32,
                                   n_iter=10,
                                   batch_size=250,
                                   learning_rate=0.01)

explicit_model.fit(explicit_interactions)


# Dense users x items matrix, used below for per-user prediction lookups.
user_df = pd.DataFrame(explicit_interactions.tocoo().todense())



#Spotlight Model

#This function uses the Spotlight model to make recommendations for a given user
def spotlight_predictions(user_id):
    # The user's row of the dense interaction matrix, as a one-row frame.
    app_user = pd.DataFrame(user_df.iloc[user_id]).T
    # NOTE(review): app_user_preds (top-20 highest-scored beers) is built but
    # never returned — a `return` statement appears to be missing, or the
    # function was truncated at the chunk boundary. Confirm against the source.
    app_user_preds = pd.DataFrame({
        'beer': beer_encoder.classes_,
        'value': explicit_model.predict(np.array(app_user)), #needs to be passed as an array
        }).sort_values('value').tail(20)
Пример #16
0
    embedding_dim=5,  # latent dimensionality
    n_iter=10,  # number of epochs of training
    batch_size=256,  # minibatch size
    l2=1e-9,  # strength of L2 regularization
    learning_rate=2e-2,
    use_cuda=torch.cuda.is_available())
# model = ImplicitFactorizationModel(loss='bpr',
#                                    embedding_dim=128,  # latent dimensionality
#                                    n_iter=10,  # number of epochs of training
#                                    batch_size=256,  # minibatch size
#                                    l2=1e-9,  # strength of L2 regularization
#                                    learning_rate=1e-2,
#                                    use_cuda=torch.cuda.is_available())
from spotlight.cross_validation import random_train_test_split

# Seeded 80/20 split for reproducibility.
train, test = random_train_test_split(dataset,
                                      random_state=np.random.RandomState(42))

print('Split into \n {} and \n {}.'.format(train, test))
model.fit(train, verbose=True)
# Persist the fitted model.
torch.save(model, 'spotlight.model')

from spotlight.evaluation import rmse_score

train_rmse = rmse_score(model, train)
test_rmse = rmse_score(model, test)

print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
predictions = model.predict(test.user_ids, test.item_ids)
# Fraction of predictions whose sign (via the 0.5 threshold) matches the
# label sign — effectively binary accuracy; assumes ratings encode a
# positive/negative label. TODO confirm the rating encoding.
print(((predictions > 0.5) == (test.ratings > 0)).sum() / len(predictions))
Пример #17
0
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

# Minimal end-to-end example: one training epoch on MovieLens 100K.
movielens_100k = get_movielens_dataset(variant='100K')

train, test = random_train_test_split(movielens_100k)

explicit_model = ExplicitFactorizationModel(n_iter=1)
explicit_model.fit(train)

rmse = rmse_score(explicit_model, test)

print(rmse)
Пример #18
0
## Build interactions object (building torch tensors underneath)
log.info("Building interactions object")
interactions = Interactions(
    item_ids=ratings_df.movie_int.astype(np.int32).values,
    user_ids=ratings_df.user_int.astype(np.int32).values,
    num_items=len(ratings_df.movie_int.unique()),
    num_users=len(ratings_df.user_int.unique()),
    ratings=ratings_df.vote.astype(np.float32).values)

## Build Explicit Matrix Factorization Model
# We use logistic loss since the interaction rating is binary (-1, 1)
log.info(
    "Training the recommendation engine model using Explicit Matrix Factorization"
)
model = ExplicitFactorizationModel(loss='logistic', n_iter=10)
model.fit(interactions)

# Prepare to get predictions out for each user
full_movies = movie_ind.movie_int.unique()
recommendations = []
# Convert datetime to string to ensure serialization success
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
batch_count = 0

# Iterate users; `device` appears to be user_ind's index (a device id) —
# TODO confirm against where user_ind is built.
for device, user_row in user_ind.iterrows():
    # Get list of all movies this user voted on
    log.info("Generating recommendations for user {}".format(device))
    user = user_row.user_int
    user_votes = ratings_df[ratings_df.user_int == user].movie_int.unique()
    # Calculate difference in the two lists - rate those movies only
    m = np.setdiff1d(full_movies, user_votes)
Пример #19
0
def trainModelUntilOverfit(dataset, modelSteps, modelIterations,
                           numberDataSplits, embedding_dim, learning_rate):
    """Train an explicit factorization model step by step until it overfits.

    The dataset is split 80/20 into train/test. The training set may be
    further partitioned into ``numberDataSplits`` chunks that are fitted in
    round-robin order; after each fit the train/test RMSE is recorded and
    ``stopTraining`` decides whether to stop early.

    Parameters
    ----------
    dataset : spotlight interactions dataset with num_users/num_items.
    modelSteps : int — passes over the whole training set.
    modelIterations : int — epochs per ``model.fit`` call.
    numberDataSplits : int — chunks to split the training set into
        (1 disables splitting; values < 1 abort the loop).
    embedding_dim : int — latent dimensionality.
    learning_rate : float — optimizer learning rate.

    Returns
    -------
    tuple
        ``(model, rmseResults)`` where ``rmseResults[i] = [train_rmse,
        test_rmse]`` for every completed step.
    """
    numUsers = dataset.num_users
    numMovies = dataset.num_items
    train, test = random_train_test_split(dataset, 0.2)

    print('Split into \n {} and \n {}.'.format(train, test))

    # Fixed seed so model initialization is reproducible.
    seed = np.random.RandomState(seed=55555)
    model = ExplicitFactorizationModel(n_iter=modelIterations,
                                       embedding_dim=embedding_dim,
                                       learning_rate=learning_rate,
                                       random_state=seed)

    rmseResults = np.empty((modelSteps * numberDataSplits, 2))

    if (numberDataSplits > 1):
        arraySplits = dataSplit(train, numberDataSplits)
        print("Data set split into", len(arraySplits), "*", (arraySplits[1]))

    # Each model step fits the entire dataset
    arrayOfSteps = []
    splitCounter = 0  # index of the current chunk within arraySplits
    fullStepCounter = 0  # increases each time the entire data set has been visited
    for i in range(modelSteps * numberDataSplits):
        print("\nStarting step", fullStepCounter)
        print("Data split", splitCounter)
        if (numberDataSplits == 1):
            model.fit(train, verbose=True)
        elif (numberDataSplits > 1):
            print(arraySplits[splitCounter])
            model.fit(arraySplits[splitCounter], verbose=True)
        else:
            print("Invalid number of data splits")
            break

        # Predictions for any user are made for all items,
        # matrix has shape (num_users, num_items).
        modelPredict = np.empty((numUsers, numMovies))
        for userIndex in range(numUsers):
            modelPredict[userIndex, :] = model.predict(userIndex)

        # We take the transpose for tsne formatting (should be more rows than columns)
        # NOTE(review): modelPredict is never used after this point — kept in
        # case a later t-SNE step consumes it; confirm before removing.
        modelPredict = modelPredict.T

        # Measure the model's effectiveness. (The original computed the test
        # RMSE twice — once into an unused variable; each score is now
        # computed exactly once.)
        rmseTrain = rmse_score(model, train)
        rmseTest = rmse_score(model, test)
        print("RMSE TEST:", rmseTest, "\n")
        rmseResults[i, :] = [rmseTrain, rmseTest]
        arrayOfSteps += [i]

        if (stopTraining(rmseResults, arrayOfSteps)):
            # Trim the pre-allocated buffer to the steps actually completed.
            rmseResults = rmseResults[:len(arrayOfSteps)]
            break

        if (numberDataSplits > 1):
            splitCounter += 1
            if (splitCounter >= len(arraySplits)):
                splitCounter = 0
                fullStepCounter += 1

    return (model, rmseResults)