def test_bpr_bloom(compression_ratio, expected_mrr):
    """Train a BPR model with Bloom-compressed embeddings and check MRR."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    # Compressed embedding layers shared by the bilinear representation.
    user_layer = BloomEmbedding(interactions.num_users, 32,
                                compression_ratio=compression_ratio,
                                num_hash_functions=2)
    item_layer = BloomEmbedding(interactions.num_items, 32,
                                compression_ratio=compression_ratio,
                                num_hash_functions=2)
    representation = BilinearNet(interactions.num_users,
                                 interactions.num_items,
                                 user_embedding_layer=user_layer,
                                 item_embedding_layer=item_layer)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       representation=representation,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    mrr = mrr_score(model, test, train=train).mean()
    assert mrr > expected_mrr
def test_implicit_serialization(data):
    """Serialising and reloading the model must not change its MRR."""
    train, test = data

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=3,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    before = mrr_score(model, test, train=train).mean()
    after = mrr_score(_reload(model), test, train=train).mean()

    assert before == after
def evaluate_model(model, train, test, validation):
    """Fit *model* on *train* and return (test MRR, validation MRR, fit time)."""
    t0 = time.time()
    model.fit(train, verbose=True)
    elapsed = time.time() - t0

    print('Elapsed {}'.format(elapsed))
    print(model)

    if hasattr(test, 'sequences'):
        # Sequence datasets need the sequence-aware scorer.
        test_mrr = sequence_mrr_score(model, test)
        val_mrr = sequence_mrr_score(model, validation)
    else:
        test_mrr = mrr_score(model, test)
        # Validation MRR is computed over the union of test + validation.
        val_mrr = mrr_score(model, test.tocsr() + validation.tocsr())

    return test_mrr, val_mrr, elapsed
def run(self, filtering, loss, k):
    """Train and evaluate a Spotlight implicit-factorization model.

    Parameters
    ----------
    filtering : str
        Name of the filtering scheme, appended to the model name.
    loss : str
        Spotlight loss identifier (e.g. 'bpr').
    k : int
        Cut-off for precision/recall.
    """
    # Reset root logging handlers so repeated runs do not duplicate output.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    self.filter = filtering
    self.loss = loss
    # Idiomatic join (was the non-idiomatic str.join('_', ...)).
    self.model_name = '_'.join((self.model_name, self.filter, self.loss))
    self.logger(self.model_name)
    logger = logging.getLogger()

    NUM_EPOCHS = 5
    logger.info("Training Spotlight Model, Loss: {}".format(self.loss))

    df_interactions, df_timestamps = self.df[[
        'user_id', 'tag_id', 'count'
    ]], self.df['timestamp']
    interactions = self.build_interactions_object(df_interactions,
                                                  df_timestamps)
    train, test = spotlight_random_train_test_split(interactions)
    logger.info(
        'The dataset has %s users and %s items with %s interactions in the test and %s interactions in the '
        'training set.' % (train.num_users, train.num_items,
                           test.tocoo().getnnz(), train.tocoo().getnnz()))

    model = ImplicitFactorizationModel(
        n_iter=NUM_EPOCHS,
        loss=self.loss,
        random_state=RANDOM_STATE,
        use_cuda=True,
        embedding_dim=64,  # latent dimensionality
        batch_size=128,  # minibatch size
        l2=1e-9,  # strength of L2 regularization
        learning_rate=1e-3,
    )

    logger.info("Begin fitting {0} model for {1} epochs...".format(
        self.loss, NUM_EPOCHS))
    model.fit(train, verbose=True)

    precrec = precision_recall_score(model=model, train=train, test=test,
                                     k=k)
    mrr = mrr_score(model=model, train=train, test=test).mean()
    precision = np.mean(precrec[0])
    recall = np.mean(precrec[1])
    # Bug fix: guard against ZeroDivisionError when both precision and
    # recall are zero (previously this raised).
    if precision + recall > 0:
        fmeasure = 2 * ((precision * recall) / (precision + recall))
    else:
        fmeasure = 0.0

    logger.info("Precision@{0}: {1}".format(k, precision))
    logger.info("Recall@{0}: {1}".format(k, recall))
    logger.info("F-Measure: {}".format(fmeasure))
    logger.info("MRR: {}".format(mrr))

    self.model_name = 'spot'
def obtener_metricas_gui(self):
    """Compute the evaluation metrics for the chosen model.

    Only used by the web interface.

    Returns
    -------
    metricas_devueltas: dict
        Dictionary of formatted metrics for display.
    """
    global train, test, modelo

    # Compute the metrics for the selected model type.
    # (Removed an unused `metricas = dict()` local.)
    if self.opcion_modelo == 1:
        rmse = rmse_score(modelo, test)
        mrr = mrr_score(modelo, test, train=train).mean()
        precision, recall = precision_recall_score(modelo, test,
                                                   train=train, k=10)
        metricas_devueltas = {"RMSE": format(rmse, '.4f'),
                              "MRR": format(mrr, '.4f'),
                              "Precisión k": format(precision.mean(), '.4f'),
                              "Recall k": format(recall.mean(), '.4f')}
    elif self.opcion_modelo == 2:
        mrr = mrr_score(modelo, test, train=train).mean()
        precision, recall = precision_recall_score(modelo, test,
                                                   train=train, k=10)
        metricas_devueltas = {"MRR": format(mrr, '.4f'),
                              "Precisión k": format(precision.mean(), '.4f'),
                              "Recall k": format(recall.mean(), '.4f')}
    else:
        mrr = sequence_mrr_score(modelo, test).mean()
        metricas_devueltas = {"MRR": format(mrr, '.4f')}

    # The .csv writer expects each value wrapped in a list; derive it from
    # the returned dict instead of duplicating every format(...) call.
    metricas_a_guardar = {clave: [valor]
                          for clave, valor in metricas_devueltas.items()}
    guardar_resultados(metricas_a_guardar)

    return metricas_devueltas
def evaluation(self, model, interactions: tuple):
    """Evaluate a trained model by MRR and precision/recall at k.

    Args:
        model (Arbitrary): A Spotlight model, can be of different types.
        interactions (tuple): (spotlight.interactions.Interactions,
            spotlight.interactions.Interactions), i.e. (train, test).

    Returns:
        dict: Nested dictionary with precision, recall, f1 and mrr
        computed on the test set.
    """
    logger = logging.getLogger()
    train, test = interactions
    logger.info("Beginning model evaluation...")

    # Sequence models need the sequence-aware MRR scorer.
    is_sequence_model = self._models in ('S_POOL', 'S_CNN', 'S_LSTM')
    if is_sequence_model:
        mrr = sequence_mrr_score(model, test).mean()
    else:
        mrr = mrr_score(model, test).mean()
    logger.info('MRR {:.8f}'.format(mrr))

    # NOTE(review): sequence_precision_recall_score is applied to every
    # model type, including non-sequence ones — confirm this is intended.
    k = 3
    prec, rec = sequence_precision_recall_score(
        model=model,
        test=test,
        k=k,
    )
    logger.info('Precision@{k} {:.8f}'.format(prec.mean(), k=k))
    logger.info('Recall@{k} {:.8f}'.format(rec.mean(), k=k))

    precision = prec.mean()
    recall = rec.mean()
    return {
        'test': {
            'precision': precision,
            'recall': recall,
            'f1': 2 * ((precision * recall) / (precision + recall)),
            'mrr': mrr,
        },
    }
def resultados_factorizacion_implicito(self):
    """Compute and print metrics for the implicit factorization model.

    Only used by the text interface.
    """
    global train, test, modelo

    # MRR and precision/recall@10 on the held-out test set.
    mrr = mrr_score(modelo, test, train=train).mean()
    precision, recall = precision_recall_score(modelo, test,
                                               train=train, k=10)

    imprimir_resultados_dl(mrr, precision.mean(), recall.mean())
def test_adaptive_hinge():
    """Adaptive-hinge model on MovieLens 100K must clear an MRR of 0.07."""
    dataset = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(dataset,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6)
    model.fit(train)

    assert mrr_score(model, test, train=train).mean() > 0.07
def test_adaptive_hinge():
    """Smoke-test the adaptive hinge loss: mean MRR must exceed 0.07."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='adaptive_hinge', n_iter=10,
                                       batch_size=1024, learning_rate=1e-2,
                                       l2=1e-6)
    model.fit(train)
    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.07
def test_bpr():
    """BPR loss on MovieLens 100K should reach MRR > 0.07 (within EPSILON)."""
    dataset = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(dataset,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)
    mrr = mrr_score(model, test, train=train).mean()

    assert mrr + EPSILON > 0.07
def test_bpr_custom_optimizer():
    """A user-supplied Adagrad optimizer factory should train BPR fine."""
    dataset = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(dataset,
                                          random_state=RANDOM_STATE)

    def adagrad_optimizer(model_params, lr=1e-2, weight_decay=1e-6):
        # Factory handed to the model in place of the default optimizer.
        return torch.optim.Adagrad(model_params,
                                   lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=adagrad_optimizer)
    model.fit(train)

    assert mrr_score(model, test, train=train).mean() > 0.06
def test_bpr_custom_optimizer():
    """BPR trained with a custom Adagrad optimizer must reach MRR > 0.06."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def adagrad_optimizer(model_params, lr=1e-2, weight_decay=1e-6):
        """Build an Adagrad optimizer over the model's parameters."""
        return torch.optim.Adagrad(model_params, lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr', n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=adagrad_optimizer)
    model.fit(train)
    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.06
# fit models
model.fit(train.to_sequence(), verbose=True)
preserving_25_percent_model.fit(preserving_25_percent_train.to_sequence(), verbose=True)
preserving_50_percent_model.fit(preserving_50_percent_train.to_sequence(), verbose=True)
preserving_75_percent_model.fit(preserving_75_percent_train.to_sequence(), verbose=True)

import torch

# Persist the fitted models for later reuse.
torch.save(preserving_25_percent_model, './preserving_25_percent_model.model')
torch.save(preserving_50_percent_model, './preserving_50_percent_model.model')
torch.save(preserving_75_percent_model, './preserving_75_percent_model.model')

# result evaluation
from spotlight.evaluation import mrr_score

train_mrrs = mrr_score(model, train)
preserving_25_train_mrrs = mrr_score(preserving_25_percent_model, preserving_25_percent_train)
preserving_50_train_mrrs = mrr_score(preserving_50_percent_model, preserving_50_percent_train)
preserving_75_train_mrrs = mrr_score(preserving_75_percent_model, preserving_75_percent_train)

test_mrrs = mrr_score(model, test)
preserving_25_test_mrrs = mrr_score(preserving_25_percent_model, test)
preserving_50_test_mrrs = mrr_score(preserving_50_percent_model, test)
preserving_75_test_mrrs = mrr_score(preserving_75_percent_model, test)

print('For 100% preserving items')
print('Train MRRS {:.3f}, test MRRS {:.3f}'.format(train_mrrs.sum(), test_mrrs.sum()))
print('For 25% preserving items')
print('Train MRRS {:.3f}, test MRRS {:.3f}'.format(preserving_25_train_mrrs.sum(), preserving_25_test_mrrs.sum()))
print('For 50% preserving items')
print('Train MRRS {:.3f}, test MRRS {:.3f}'.format(preserving_50_train_mrrs.sum(), preserving_50_test_mrrs.sum()))
# Bug fix: the 75% scores were computed (and the model saved) but never
# reported, unlike the 100%/25%/50% cases.
print('For 75% preserving items')
print('Train MRRS {:.3f}, test MRRS {:.3f}'.format(preserving_75_train_mrrs.sum(), preserving_75_test_mrrs.sum()))
def objective(hyper):
    """Hyperopt objective: train a factorization model, return -val MRR.

    Parameters
    ----------
    hyper : dict
        Sampled hyperparameter assignment; hyper['model']['type'] selects
        between LSH-based embeddings and standard scaled embeddings.

    Returns
    -------
    dict
        Hyperopt result record: loss (negative validation MRR), status,
        validation/test MRR, elapsed time and the hyperparameters used.
    """
    print(hyper)

    # Bug fix: time.clock() was removed in Python 3.8;
    # time.perf_counter() is the documented replacement for interval timing.
    start = time.perf_counter()

    if hyper['model']['type'] == 'lsh':
        num_hashes = int(hyper['model']['num_hash_functions'])
        num_layers = int(hyper['model']['num_layers'])
        nonlinearity = hyper['model']['nonlinearity']
        residual = hyper['model']['residual']
        embed = hyper['model']['embed']
        gated = hyper['model']['gated']

        item_embeddings = LSHEmbedding(train.num_items,
                                       int(hyper['embedding_dim']),
                                       embed=embed,
                                       gated=gated,
                                       residual_connections=residual,
                                       nonlinearity=nonlinearity,
                                       num_layers=num_layers,
                                       num_hash_functions=num_hashes)
        # Fit item hashes on the transposed interaction matrix.
        item_embeddings.fit(train.tocsr().T)

        user_embeddings = LSHEmbedding(train.num_users,
                                       int(hyper['embedding_dim']),
                                       embed=embed,
                                       gated=gated,
                                       residual_connections=residual,
                                       nonlinearity=nonlinearity,
                                       num_layers=num_layers,
                                       num_hash_functions=num_hashes)
        user_embeddings.fit(train.tocsr())
    else:
        user_embeddings = ScaledEmbedding(train.num_users,
                                          int(hyper['embedding_dim']),
                                          padding_idx=0)
        item_embeddings = ScaledEmbedding(train.num_items,
                                          int(hyper['embedding_dim']),
                                          padding_idx=0)

    network = BilinearNet(train.num_users,
                          train.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ImplicitFactorizationModel(
        loss=hyper['loss'],
        n_iter=int(hyper['n_iter']),
        batch_size=int(hyper['batch_size']),
        learning_rate=hyper['learning_rate'],
        embedding_dim=int(hyper['embedding_dim']),
        l2=hyper['l2'],
        representation=network,
        use_cuda=CUDA,
        random_state=random_state)

    model.fit(train, verbose=True)
    elapsed = time.perf_counter() - start
    print(model)

    validation_mrr = mrr_score(model, validation, train=train).mean()
    # When scoring the test set, also exclude validation interactions.
    test_mrr = mrr_score(model, test,
                         train=train.tocsr() + validation.tocsr()).mean()
    print('MRR {} {}'.format(validation_mrr, test_mrr))

    return {
        'loss': -validation_mrr,
        'status': STATUS_OK,
        'validation_mrr': validation_mrr,
        'test_mrr': test_mrr,
        'elapsed': elapsed,
        'hyper': hyper
    }
def _evaluate(model, test, train):
    """Return the mean MRR of *model* on *test*, excluding *train* items."""
    return mrr_score(model, test, train=train).mean()
def objective(hyper):
    """Hyperopt objective for representation-comparison experiments.

    Builds the representation named by hyper['model']['type'], trains an
    ImplicitFactorizationModel, and returns the hyperopt result record
    with negative validation MRR as the loss. Failed or NaN runs are
    reported with STATUS_FAIL.
    """
    print(hyper)

    # Bug fix: time.clock() was removed in Python 3.8;
    # time.perf_counter() is the documented replacement for interval timing.
    start = time.perf_counter()

    h = hyper['model']
    cls = ImplicitFactorizationModel

    if h['type'] == 'bilinear':
        representation = BilinearNet(train.num_users,
                                     train.num_items,
                                     embedding_dim=int(h['embedding_dim']))
    elif h['type'] == 'mixture':
        representation = MixtureNet(train.num_users,
                                    train.num_items,
                                    num_components=int(
                                        h['num_components']),
                                    embedding_dim=int(h['embedding_dim']))
    elif h['type'] == 'mixture_init':
        representation = MixtureNet(train.num_users,
                                    train.num_items,
                                    projection_scale=h['projection_scale'],
                                    num_components=int(
                                        h['num_components']),
                                    embedding_dim=int(h['embedding_dim']))
    elif h['type'] == 'nonlinear_mixture':
        representation = NonlinearMixtureNet(
            train.num_users,
            train.num_items,
            num_components=int(h['num_components']),
            embedding_dim=int(h['embedding_dim']))
    elif h['type'] == 'embedding_mixture':
        representation = EmbeddingMixtureNet(
            train.num_users,
            train.num_items,
            num_components=int(h['num_components']),
            embedding_dim=int(h['embedding_dim']))
    else:
        raise ValueError('Unknown model type')

    model = cls(batch_size=int(h['batch_size']),
                loss=h['loss'],
                learning_rate=h['learning_rate'],
                l2=h['l2'],
                n_iter=int(h['n_iter']),
                representation=representation,
                use_cuda=CUDA,
                random_state=np.random.RandomState(42))

    try:
        model.fit(train, verbose=True)
    except ValueError:
        # Training blew up; report a failed trial rather than crashing
        # the whole hyperopt sweep.
        elapsed = time.perf_counter() - start
        return {
            'loss': 0.0,
            'status': STATUS_FAIL,
            'validation_mrr': 0.0,
            'test_mrr': 0.0,
            'elapsed': elapsed,
            'hyper': h
        }

    elapsed = time.perf_counter() - start
    print(model)

    # Exclude the other held-out split as well when scoring each split.
    validation_mrr = mrr_score(
        model,
        validation,
        train=(train.tocsr() + test.tocsr())).mean()
    test_mrr = mrr_score(
        model,
        test,
        train=(train.tocsr() + validation.tocsr())).mean()
    print('MRR {} {}'.format(validation_mrr, test_mrr))

    # A NaN MRR means the model degenerated; mark the trial as failed.
    if np.isnan(validation_mrr):
        status = STATUS_FAIL
    else:
        status = STATUS_OK

    return {
        'loss': -validation_mrr,
        'status': status,
        'validation_mrr': validation_mrr,
        'test_mrr': test_mrr,
        'elapsed': elapsed,
        'hyper': h
    }
random_state=np.random.RandomState(42)) return model if __name__ == '__main__': random_state = np.random.RandomState(42) train, validation, test = load_data(random_state) # objective = get_objective(train, validation, test) # space = hyperparameter_space() # max_evals = 5 # for iteration in range(1, max_evals): # print('Iteration {}'.format(iteration)) # trials = optimize(objective, # space, # trials_fname='factorization_trials.pickle', # max_evals=iteration) model = build_factorization_model(train, random_state) model.fit(train, verbose=True) print(model) mrr = mrr_score(model, test, train=train).mean() print('MRR {}'.format(mrr))
timestamps=timeStamps) if name == "test": dataset_test = dataset elif name == "train": dataset_train = dataset if model_mode.lower() == "ifm": model = ImplicitFactorizationModel(n_iter=n_iter) if model_mode.lower() == "efm": model = ExplicitFactorizationModel(n_iter=n_iter) if model_mode.lower() == "cnn": net = CNNNet(num_items=int(foods_items)) model = ImplicitSequenceModel(n_iter=n_iter, use_cuda=torch.cuda.is_available(), representation=net) model.fit(dataset_train) with open(save_file, 'wb') as f: pickle.dump(model, f, pickle.HIGHEST_PROTOCOL) if model_mode.lower() == "cnn": mrr = sequence_mrr_score(model, dataset_test) else: mrr = mrr_score(model, dataset_test) print("mrr = ", len(mrr)) print("mean mrr = ", sum(mrr) / len(mrr)) rank = 1 / (sum(mrr) / len(mrr)) print("average rank = ", rank)