def train_initial_model():
    """Train an implicit factorization model on MovieLens 100K and persist it.

    Fits an ``ImplicitFactorizationModel`` on a fixed-seed random split,
    pickles the model to ``models/filmclub.model`` and the dataset to
    ``data/dataset.pkl``, then prints train/test RMSE.

    NOTE(review): ``dataset.num_users`` is inflated to 1,000,000 before
    saving — presumably so unseen user ids can be scored later; confirm.
    """
    dataset = get_movielens_dataset(variant='100K')
    train, test = random_train_test_split(
        dataset, random_state=np.random.RandomState(42))

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       embedding_dim=128,   # latent dimensionality
                                       n_iter=10,           # number of epochs of training
                                       batch_size=1024,     # minibatch size
                                       l2=1e-9,             # strength of L2 regularization
                                       learning_rate=1e-3,
                                       use_cuda=torch.cuda.is_available())
    print('Fitting the model')
    model.fit(train, verbose=True)
    print(type(model))

    # FIX: use context managers so the handles are closed even if pickling
    # raises (the original opened/closed the files manually).
    with open('models/filmclub.model', 'wb') as model_file:
        pickle.dump(model, model_file)

    dataset.num_users = 1000000
    with open('data/dataset.pkl', 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)

    train_rmse = rmse_score(model, train)
    test_rmse = rmse_score(model, test)
    print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
def train_spotlight_models(train, test, dataset_testing, embedding_dims, n_iters,
                           batch_sizes, l2s, learning_rates, is_save=False):
    """
    Grid-train ``ExplicitFactorizationModel`` over every hyper-parameter combo.

    Takes train, test, dataset_testing datasets as spotlight.interactions.
    One model is trained per combination drawn from embedding_dims, n_iters,
    batch_sizes, l2s, learning_rates.

    Returns predictions for the train, test and dataset_testing datasets as
    well as RMSE arrays for train and test.
    """
    import itertools  # local import keeps this fix self-contained

    # RMSE accumulators for the train and test sets.
    train_rmses = np.array([])
    test_rmses = np.array([])
    # Per-model predictions on train, test and dataset_testing.
    preds_train_trains = []
    preds_train_tests = []
    preds_tests = []

    # IMPROVED: itertools.product replaces the original five-level nested
    # loops; iteration order is identical (rightmost factor varies fastest).
    for embedding_dim, n_iter, batch_size, l2, learning_rate in itertools.product(
            embedding_dims, n_iters, batch_sizes, l2s, learning_rates):
        # Use the GPU if torch.cuda.is_available() returns True, else the CPU.
        model = ExplicitFactorizationModel(loss='regression',
                                           embedding_dim=embedding_dim,  # latent dimensionality
                                           n_iter=n_iter,                # number of epochs of training
                                           batch_size=batch_size,        # minibatch size
                                           l2=l2,                        # strength of L2 regularization
                                           learning_rate=learning_rate,
                                           use_cuda=torch.cuda.is_available())
        # Report which model is being trained.
        print("embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(
            embedding_dim, n_iter, batch_size, l2, learning_rate))
        model.fit(train, verbose=True)

        # Evaluate and record RMSE on the train and test sets.
        train_rmse = rmse_score(model, train)
        test_rmse = rmse_score(model, test)
        train_rmses = np.append(train_rmses, train_rmse)
        test_rmses = np.append(test_rmses, test_rmse)
        print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))

        # Optionally persist the fitted model to disk.
        if is_save:
            torch.save(model, "models/embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(
                embedding_dim, n_iter, batch_size, l2, learning_rate))

        # Predictions on train, test and dataset_testing datasets.
        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
        preds_tests.append(model.predict(dataset_testing.user_ids,
                                         dataset_testing.item_ids))

    # Predictions on train/test/dataset_testing plus RMSE arrays.
    return preds_train_trains, preds_train_tests, preds_tests, train_rmses, test_rmses
def fit(self, num_epochs, report_int=1):
    """Run MCMC sampling steps, periodically reporting train/test RMSE.

    Parameters
    ----------
    num_epochs : int
        Number of ``step_mcmc`` iterations to run.
    report_int : int, optional
        Print RMSE every ``report_int`` steps.

    Returns
    -------
    (float, float)
        Train and test RMSE of the *final* model state.
    """
    rmse_train = rmse_test = None
    for t in range(num_epochs):
        self.step_mcmc()
        ## REPORTING ###
        if t % report_int == 0:
            rmse_train = rmse_score(self, self.train)
            rmse_test = rmse_score(self, self.test)
            print(
                f'step: {t} \t rmse train: {rmse_train:.2f}, test: {rmse_test:.2f}'
            )
    # FIX: the original raised UnboundLocalError when num_epochs == 0 and
    # returned RMSE from the *last reporting step* (stale) when the final
    # step was not a multiple of report_int. Recompute for the final state.
    if num_epochs == 0 or (num_epochs - 1) % report_int != 0:
        rmse_train = rmse_score(self, self.train)
        rmse_test = rmse_score(self, self.test)
    return rmse_train, rmse_test
def test_bloom(compression_ratio, expected_rmse):
    """Bloom-compressed embeddings should reach the expected test RMSE."""
    data = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(data, random_state=RANDOM_STATE)

    # Both embedding layers share the same compression settings.
    embed_kwargs = dict(compression_ratio=compression_ratio,
                        num_hash_functions=2)
    user_layer = BloomEmbedding(data.num_users, 32, **embed_kwargs)
    item_layer = BloomEmbedding(data.num_items, 32, **embed_kwargs)
    net = BilinearNet(data.num_users,
                      data.num_items,
                      user_embedding_layer=user_layer,
                      item_embedding_layer=item_layer)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-5,
                                       representation=net,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    rmse = rmse_score(model, test)
    print(rmse)
    assert rmse - EPSILON < expected_rmse
def test_explicit_serialization(data):
    """Round-tripping a fitted explicit model must not change its RMSE."""
    train, test = data

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=3,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-5,
                                       use_cuda=CUDA)
    model.fit(train)

    # Score before and after a save/load round trip.
    score_before = rmse_score(model, test)
    score_after = rmse_score(_reload(model), test)
    assert score_before == score_after
def best_params_spotlight(losses, n_iters, batch_sizes, l2s, learning_rates,
                          embedding_dims, train_data, t=None):
    """Grid-search ExplicitFactorizationModel hyper-parameters.

    Trains one model per combination on a fixed-seed split of ``train_data``
    and records the validation RMSE for each.

    Parameters
    ----------
    losses, n_iters, batch_sizes, l2s, learning_rates, embedding_dims : iterable
        Candidate values for each hyper-parameter.
    train_data : spotlight Interactions
        Data to split into train/validation.
    t : Timer, optional
        Timer used for per-run timing; a fresh one is created when omitted.
        (FIX: the original used the mutable default ``t=Timer()``, which is
        evaluated once at definition time and shared across calls.)

    Returns
    -------
    dict
        Maps validation RMSE -> the parameter dict (including the fitted
        model) of that run. (FIX: the original never returned its results.)
    """
    if t is None:
        t = Timer()
    rmses = dict()
    t.start()
    for loss in losses:
        for n_iter in n_iters:
            for batch_size in batch_sizes:
                for l2 in l2s:
                    for learning_rate in learning_rates:
                        for embedding_dim in embedding_dims:
                            model = ExplicitFactorizationModel(
                                # FIX: was hard-coded to 'regression',
                                # silently ignoring the `losses` loop.
                                loss=loss,
                                embedding_dim=embedding_dim,  # latent dimensionality
                                n_iter=n_iter,                # number of epochs of training
                                batch_size=batch_size,        # minibatch size
                                l2=l2,                        # strength of L2 regularization
                                learning_rate=learning_rate,
                                use_cuda=torch.cuda.is_available())
                            # FIX: build a FRESH dict per run. The original
                            # mutated a single shared dict, so every entry of
                            # `rmses` ended up pointing at the last run's params.
                            params = {'loss': loss,
                                      'n_iter': n_iter,
                                      'batch_size': batch_size,
                                      'l2': l2,
                                      'learning_rate': learning_rate,
                                      'embedding_dim': embedding_dim,
                                      'model': model}
                            train_tr_data, test_tr_data = random_train_test_split(
                                train_data,
                                random_state=np.random.RandomState(42))
                            model.fit(train_tr_data, verbose=True)
                            rmse = rmse_score(model, test_tr_data)
                            rmses[rmse] = params
                            print(
                                "-----------Time: {}, Loss: {}, n_iter: {}, l2: {}, batch_size: {}, learning_rate: {}, embedding_dim: {}, rmse: {}-------------\n\n"
                                .format(t.stop(), loss, n_iter, l2, batch_size,
                                        learning_rate, embedding_dim, rmse))
                            # restart timer
                            t.start()
    return rmses
def load_spotlight_models(train, test, dataset_testing, verbose=False):
    """
    Load every pretrained spotlight model found under ``models/``.

    Uses train and test to compute RMSE per model, and generates predictions
    for train, test and dataset_testing.

    Returns the prediction lists for the three datasets followed by the
    train and test RMSE arrays.
    """
    # Per-model predictions on train, test and dataset_testing.
    preds_train_trains, preds_train_tests, preds_tests = [], [], []
    # Per-model RMSE on train and test.
    train_rmses = np.array([])
    test_rmses = np.array([])

    for model_path in glob.glob("models/*"):
        if verbose:
            print(model_path)
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # model files from a trusted source.
        model = torch.load(model_path)

        train_rmses = np.append(train_rmses, rmse_score(model, train))
        test_rmses = np.append(test_rmses, rmse_score(model, test))

        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
        preds_tests.append(model.predict(dataset_testing.user_ids,
                                         dataset_testing.item_ids))

    return preds_train_trains, preds_train_tests, preds_tests, train_rmses, test_rmses
def resultados_factorizacion_explicito(self):
    """Compute and print the metrics of the explicit factorization model.

    Only used by the text interface. Reads the module-level globals
    ``train``, ``test`` and ``modelo`` (the fitted model).
    """
    global train, test, modelo
    # Compute the metrics against the held-out test set.
    rmse = rmse_score(modelo, test)
    mrr = mrr_score(modelo, test, train=train).mean()
    precision, recall = precision_recall_score(modelo, test, train=train, k=10)
    # Print the metrics via the shared results printer.
    imprimir_resultados_dl(mrr, precision.mean(), recall.mean(), rmse)
def build_model(data, loss, embedding_dim, n_iter, batch_size, l2, learning_rate,
                **kwargs):
    """Fit an ExplicitFactorizationModel on a fixed-seed split of ``data``.

    Returns the RMSE on the held-out test portion.
    """
    model = ExplicitFactorizationModel(
        loss=loss,
        embedding_dim=embedding_dim,  # latent dimensionality
        n_iter=n_iter,                # number of epochs of training
        batch_size=batch_size,        # minibatch size
        l2=l2,                        # strength of L2 regularization
        learning_rate=learning_rate,
        use_cuda=torch.cuda.is_available())

    train_split, test_split = random_train_test_split(
        data, random_state=np.random.RandomState(42))
    model.fit(train_split, verbose=True)

    return rmse_score(model, test_split)
def test_poisson():
    """Poisson-loss explicit factorization should reach RMSE below 1.0."""
    data = movielens.get_movielens_dataset('100K')
    train_set, test_set = random_train_test_split(data,
                                                  random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='poisson',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train_set)

    assert rmse_score(model, test_set) < 1.0
def test_poisson():
    # NOTE(review): this re-defines test_poisson; it shadows the earlier
    # identical definition, so test runners only ever see this one —
    # consider deleting one of the two.
    """Poisson-loss model trained on MovieLens 100K must score RMSE < 1.0."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    poisson_model = ExplicitFactorizationModel(loss='poisson',
                                               n_iter=10,
                                               batch_size=1024,
                                               learning_rate=1e-3,
                                               l2=1e-6)
    poisson_model.fit(train)

    rmse = rmse_score(poisson_model, test)
    assert rmse < 1.0
def obtener_metricas_gui(self):
    """Compute the metrics of the chosen model for the web interface.

    Reads the module-level globals ``train``, ``test`` and ``modelo``;
    ``self.opcion_modelo`` selects which metric set applies
    (1: explicit model with RMSE; 2: implicit; otherwise: sequence model).

    Returns
    -------
    metricas_devueltas: dict
        dictionary with the model's metrics, formatted to four decimals
    """
    global train, test, modelo

    def _fmt(valor):
        # All metrics are displayed with four decimal places.
        return format(valor, '.4f')

    # Compute the metrics for the selected model type.
    # (FIX: removed the dead local `metricas = dict()` that was never used.)
    if self.opcion_modelo == 1:
        rmse = rmse_score(modelo, test)
        mrr = mrr_score(modelo, test, train=train).mean()
        precision, recall = precision_recall_score(modelo, test, train=train,
                                                   k=10)
        metricas_devueltas = {"RMSE": _fmt(rmse),
                              "MRR": _fmt(mrr),
                              "Precisión k": _fmt(precision.mean()),
                              "Recall k": _fmt(recall.mean())}
    elif self.opcion_modelo == 2:
        mrr = mrr_score(modelo, test, train=train).mean()
        precision, recall = precision_recall_score(modelo, test, train=train,
                                                   k=10)
        metricas_devueltas = {"MRR": _fmt(mrr),
                              "Precisión k": _fmt(precision.mean()),
                              "Recall k": _fmt(recall.mean())}
    else:
        mrr = sequence_mrr_score(modelo, test).mean()
        metricas_devueltas = {"MRR": _fmt(mrr)}

    # The persisted form wraps every value in a one-element list (CSV rows);
    # deriving it here removes the duplicated literal dicts of the original.
    metricas_a_guardar = {k: [v] for k, v in metricas_devueltas.items()}

    # Persist the metrics to a .csv file.
    guardar_resultados(metricas_a_guardar)
    return metricas_devueltas
def test_logistic():
    """Logistic-loss factorization on binarized ratings stays under RMSE 1.05."""
    data = movielens.get_movielens_dataset('100K')

    # Binarize (rating > 3) and map {0, 1} -> {-1, 1} for the logistic loss.
    data.ratings = (data.ratings > 3).astype(np.float32) * 2 - 1

    train_set, test_set = random_train_test_split(data,
                                                  random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='logistic',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train_set)

    assert rmse_score(model, test_set) - EPSILON < 1.05
from spotlight.interactions import Interactions

# Build a spotlight Interactions dataset from `lite`.
# NOTE(review): `lite` is defined elsewhere; presumably a table with
# 'user' / 'item' / 'rating' / 'time' columns — confirm against the loader.
user_ids = np.array(lite['user']).astype(np.int32)
item_ids = np.array(lite['item']).astype(np.int32)
ratings = np.array(lite['rating']).astype(np.float32)
times = np.array(lite['time']).astype(np.int32)
dataset = Interactions(user_ids, item_ids, ratings, times)

# Prepare train test
train, test = user_based_train_test_split(dataset)
# train, test = random_train_test_split(dataset)

# Test baseline
model = ExplicitFactorizationModel(n_iter=20)
model.fit(train, verbose=True)
print('RMSE', rmse_score(model, test))

from scipy.sparse import coo_matrix

# Full user x item ratings matrix in CSR form, passed to the sequence
# model's fit below.
ratings = coo_matrix((dataset.ratings,
                      (dataset.user_ids, dataset.item_ids)),
                     shape=(dataset.num_users, dataset.num_items)).tocsr()

# Convert the interaction splits to fixed-length sequences (SEQ_LEN is
# defined elsewhere in the file).
train_seq = train.to_sequence(SEQ_LEN)
test_seq = test.to_sequence(SEQ_LEN)

model = ExplicitSequenceModel(n_iter=30, representation='lstm', batch_size=1)
model.fit(train_seq, ratings, verbose=True)

# Pick one training sequence to inspect.
SEQ_ID = 0
user_batch = train_seq.user_ids[SEQ_ID]
item_batch = train_seq.sequences[SEQ_ID]
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

# Minimal end-to-end example: one training epoch of explicit matrix
# factorization on MovieLens 100K, then report held-out RMSE.
dataset = get_movielens_dataset(variant='100K')

train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=1)
model.fit(train)

rmse = rmse_score(model, test)
print(rmse)
def trainModelUntilOverfit(dataset, modelSteps, modelIterations, numberDataSplits,
                           embedding_dim, learning_rate):
    """Train an explicit factorization model step by step until overfitting.

    The data may be trained in one piece or split into ``numberDataSplits``
    chunks that are fitted in rotation; ``stopTraining`` decides when to halt.

    Returns
    -------
    (model, rmseResults)
        The fitted model and an array where row i is [train RMSE, test RMSE]
        after step i.
    """
    numUsers = dataset.num_users
    numMovies = dataset.num_items
    train, test = random_train_test_split(dataset, 0.2)
    print('Split into \n {} and \n {}.'.format(train, test))
    # Fixed seed so repeated runs are comparable.
    seed = np.random.RandomState(seed=55555)

    model = ExplicitFactorizationModel(n_iter=modelIterations,
                                       embedding_dim=embedding_dim,
                                       learning_rate=learning_rate,
                                       random_state=seed)
    rmseResults = np.empty((modelSteps * numberDataSplits, 2))
    indexPreviousClosest = ["0"]

    if (numberDataSplits > 1):
        arraySplits = dataSplit(train, numberDataSplits)
        print("Data set split into", len(arraySplits), "*", (arraySplits[1]))

    # Each model step fits the entire dataset
    arrayOfSteps = []
    splitCounter = 0
    fullStepCounter = 0  # increases each time the entire data set has been visited
    currentStep = 0      # increases at every split of the data set (does not reset)
    for i in range(modelSteps * numberDataSplits):
        print("\nStarting step", fullStepCounter)
        print("Data split", splitCounter)
        if (numberDataSplits == 1):
            model.fit(train, verbose=True)
        elif (numberDataSplits > 1):
            print(arraySplits[splitCounter])
            model.fit(arraySplits[splitCounter], verbose=True)
        else:
            print("Invalid number of data splits")
            break

        # Predictions for any user are made for all items,
        # matrix has shape (numUsers, numMovies).
        # NOTE(review): modelPredict appears unused below — presumably kept
        # for the t-SNE analysis mentioned; confirm before removing.
        modelPredict = np.empty((numUsers, numMovies))
        for userIndex in range(numUsers):
            modelPredict[userIndex, :] = model.predict(userIndex)
        # We take the transpose for tsne formatting (should be more rows
        # than columns).
        modelPredict = modelPredict.T

        # Measure the model's effectiveness (how good predictions are).
        # FIX: the original evaluated rmse_score(model, test) an extra,
        # unused time here ("rmse = ..."); the redundant pass was removed.
        rmseTrain = rmse_score(model, train)
        rmseTest = rmse_score(model, test)
        print("RMSE TEST:", rmseTest, "\n")
        rmseResults[i, :] = [rmseTrain, rmseTest]
        arrayOfSteps += [i]

        # Stop early once the stopping criterion fires; trim unused rows.
        if (stopTraining(rmseResults, arrayOfSteps)):
            rmseResults = rmseResults[:len(arrayOfSteps)]
            break

        if (numberDataSplits > 1):
            splitCounter += 1
            if (splitCounter >= len(arraySplits)):
                splitCounter = 0
                fullStepCounter += 1
        currentStep += 1

    return (model, rmseResults)
embedding_dim=5, # latent dimensionality n_iter=10, # number of epochs of training batch_size=256, # minibatch size l2=1e-9, # strength of L2 regularization learning_rate=2e-2, use_cuda=torch.cuda.is_available()) # model = ImplicitFactorizationModel(loss='bpr', # embedding_dim=128, # latent dimensionality # n_iter=10, # number of epochs of training # batch_size=256, # minibatch size # l2=1e-9, # strength of L2 regularization # learning_rate=1e-2, # use_cuda=torch.cuda.is_available()) from spotlight.cross_validation import random_train_test_split train, test = random_train_test_split(dataset, random_state=np.random.RandomState(42)) print('Split into \n {} and \n {}.'.format(train, test)) model.fit(train, verbose=True) torch.save(model, 'spotlight.model') from spotlight.evaluation import rmse_score train_rmse = rmse_score(model, train) test_rmse = rmse_score(model, test) print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse)) predictions = model.predict(test.user_ids, test.item_ids) print(((predictions > 0.5) == (test.ratings > 0)).sum() / len(predictions))