cores = 8 # Load and pre-process the data test_utility_mat = scipy.sparse.load_npz("data/mat_bin_train.npz") test_eval_utility_mat = scipy.sparse.load_npz("data/mat_bin_test.npz") test_utility_mat.sort_indices() test_utility_mat = test_utility_mat.astype(np.float64) test_eval_utility_mat.sort_indices() test_eval_utility_mat = test_eval_utility_mat.astype(np.float64) n_users, n_items = test_utility_mat.shape # Create the model slim = RecModel.Slim(num_items=n_items, num_users=n_users) # Train the model start = time.time() slim.train(X=test_utility_mat, alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter, tolerance=tol, cores=4, verbose=1) print(f"Execution took {(time.time() - start) / 60} minutes") # Evaluate the model start = time.time() recall = slim.eval_topn(test_eval_utility_mat,
def eval_Slim(params, cfg, train_mat, eval_mat, experiment):
    """Hyperopt objective: train a SLIM model and return its negated recall.

    Trains ``RecModel.Slim`` on ``train_mat`` with the hyper-parameters in
    ``params`` (keys ``alpha``, ``l1_ratio``, ``max_iter``, ``tol``), logs
    parameters and metrics to an MLflow run under ``experiment``, and returns
    a Hyperopt result dict whose ``loss`` is the negative
    Recall@top_n_performances[0] (Hyperopt minimizes, so lower is better).

    Args:
        params: dict of hyper-parameters sampled by Hyperopt.
        cfg: config object; ``cfg.model`` supplies seed, verbosity, cores,
            rand_sampled and the top_n_performances cutoffs.
        train_mat / eval_mat: sparse utility matrices (copied before use so
            the caller's matrices stay untouched).
        experiment: MLflow experiment id for this run.

    Returns:
        dict with ``loss``, ``status`` and ``eval_time`` for Hyperopt.
        NOTE(review): ``hp.STATUS_OK`` assumes the ``hp`` import exposes
        ``STATUS_OK`` (i.e. ``hp`` is the hyperopt package, not the
        ``hyperopt.hp`` submodule) — confirm against the file's imports.
    """
    print(experiment)
    with mlflow.start_run(experiment_id=experiment):
        # Persist the full model config alongside this trial's parameters.
        utils.config_helpers.log_config(dict(cfg.model))
        n_users, n_items = train_mat.shape
        np.random.seed(seed=cfg.model.seed)

        # Record the sampled hyper-parameters for this trial.
        for key in ('alpha', 'l1_ratio', 'max_iter', 'tol'):
            mlflow.log_param(key, params[key])
        log.info(
            f"Testing alpha: {params['alpha']}, l1_ratio: {params['l1_ratio']}, max_iter: {params['max_iter']} and tol: {params['tol']}"
        )

        # Train on a copy so the caller's matrix is not mutated.
        start = time.time()
        slim = RecModel.Slim(num_items=n_items, num_users=n_users)
        slim.train(
            X=train_mat.copy(),
            alpha=params['alpha'],
            l1_ratio=params['l1_ratio'],
            max_iter=params['max_iter'],
            tolerance=params['tol'],
            cores=1,
            verbose=int(cfg.model.verbose),
        )
        mlflow.log_metric("Runtime", int(round(time.time() - start, 0)))

        # Top-n evaluation on the held-out matrix.
        perf_all = slim.eval_topn(
            eval_mat.copy(),
            rand_sampled=int(cfg.model.rand_sampled),
            topn=np.array(cfg.model.top_n_performances, dtype=np.int32),
            random_state=int(cfg.model.seed),
            cores=int(cfg.model.cores),
        )

        # Log one recall metric per requested cutoff.
        for cutoff in cfg.model.top_n_performances:
            mlflow.log_metric(
                f"recallAT{cutoff}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cutoff}"])
        mlflow.log_metric('MAE_train', slim.eval_prec(train_mat.copy()))
        mlflow.log_metric('MAE_eval', slim.eval_prec(eval_mat.copy()))

        # We always optimize on the first top-n cutoff; ideally that is also
        # the smallest and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]
        log.info(
            f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}"
        )
        # Hyperopt minimizes loss, so negate the recall.
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}