Example #1
# Assumed imports (the original snippet omits them)
import time

import numpy as np
import scipy.sparse

import RecModel

cores = 8

# Hyperparameters used below; placeholder values, the original snippet
# defines them elsewhere.
alpha = 0.1
l1_ratio = 0.5
max_iter = 100
tol = 1e-4

# Load and pre-process the data
test_utility_mat = scipy.sparse.load_npz("data/mat_bin_train.npz")
test_eval_utility_mat = scipy.sparse.load_npz("data/mat_bin_test.npz")

test_utility_mat.sort_indices()
test_utility_mat = test_utility_mat.astype(np.float64)

test_eval_utility_mat.sort_indices()
test_eval_utility_mat = test_eval_utility_mat.astype(np.float64)

n_users, n_items = test_utility_mat.shape

# Create the model
slim = RecModel.Slim(num_items=n_items, num_users=n_users)

# Train the model
start = time.time()
slim.train(X=test_utility_mat,
           alpha=alpha,
           l1_ratio=l1_ratio,
           max_iter=max_iter,
           tolerance=tol,
           cores=cores,
           verbose=1)
print(f"Execution took {(time.time() - start) / 60} minutes")

# Evaluate the model
start = time.time()
# The call below is cut off in the original example; the remaining arguments
# are placeholders that follow the signature used in Example #2.
recall = slim.eval_topn(test_eval_utility_mat,
                        rand_sampled=1000,
                        topn=np.array([4, 10, 20, 50], dtype=np.int32),
                        random_state=1993,
                        cores=cores)
print(f"Evaluation took {(time.time() - start) / 60} minutes")
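
As in Example #2 below, the dictionary returned by eval_topn is keyed by
"Recall@{n}". A minimal sketch for reading the individual values, assuming
the placeholder cutoffs passed above:

# Print recall at each cutoff; the cutoffs must match the `topn` array above.
for n in (4, 10, 20, 50):
    print(f"Recall@{n}: {recall[f'Recall@{n}']:.4f}")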
Example #2
# Assumed imports (the original example omits them); `utils` is a
# project-specific module providing config_helpers.log_config.
import logging
import time

import hyperopt as hp  # aliased so that hp.STATUS_OK below resolves
import mlflow
import numpy as np

import RecModel
import utils

log = logging.getLogger(__name__)  # stand-in for the project's logger


def eval_Slim(params, cfg, train_mat, eval_mat, experiment):
    # This function is what Hyperopt is going to optimize (minimize 'loss' value)
    print(experiment)
    with mlflow.start_run(experiment_id=experiment):

        # Log the config
        utils.config_helpers.log_config(dict(cfg.model))

        n_users, n_items = train_mat.shape
        np.random.seed(seed=cfg.model.seed)

        # Log relevant parameters for this run.
        mlflow.log_param("alpha", params['alpha'])
        mlflow.log_param("l1_ratio", params['l1_ratio'])
        mlflow.log_param("max_iter", params['max_iter'])
        mlflow.log_param("tol", params['tol'])

        # Log this run
        log.info(
            f"Testing  alpha: {params['alpha']},  l1_ratio: {params['l1_ratio']}, max_iter: {params['max_iter']} and tol: {params['tol']}"
        )

        start = time.time()
        # Create model
        slim = RecModel.Slim(num_items=n_items, num_users=n_users)

        # Train Model
        slim.train(X=train_mat.copy(),
                   alpha=params['alpha'],
                   l1_ratio=params['l1_ratio'],
                   max_iter=params['max_iter'],
                   tolerance=params['tol'],
                   cores=1,
                   verbose=int(cfg.model.verbose))

        # Log run-time
        mlflow.log_metric("Runtime", int(round(time.time() - start, 0)))

        # Evaluate model
        perf_all = slim.eval_topn(eval_mat.copy(),
                                  rand_sampled=int(cfg.model.rand_sampled),
                                  topn=np.array(cfg.model.top_n_performances,
                                                dtype=np.int32),
                                  random_state=int(cfg.model.seed),
                                  cores=int(cfg.model.cores))

        # Log the performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(
                f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])
        mlflow.log_metric('MAE_train', slim.eval_prec(train_mat.copy()))
        mlflow.log_metric('MAE_eval', slim.eval_prec(eval_mat.copy()))

        # We always use the first top-n performance; ideally that is also the smallest cutoff and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]

        log.info(
            f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}"
        )
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}
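
A minimal sketch (not part of the original source) of how eval_Slim could be
plugged into Hyperopt's fmin; the search-space bounds, max_evals, and the
already-loaded cfg, train_mat, eval_mat and experiment objects are assumptions.

from functools import partial

from hyperopt import Trials, fmin, tpe
from hyperopt import hp as hp_space

# Hypothetical search space over the four hyperparameters logged above.
search_space = {
    'alpha': hp_space.loguniform('alpha', np.log(1e-4), np.log(1.0)),
    'l1_ratio': hp_space.uniform('l1_ratio', 0.0, 1.0),
    'max_iter': hp_space.choice('max_iter', [10, 50, 100]),
    'tol': hp_space.loguniform('tol', np.log(1e-5), np.log(1e-2)),
}

# fmin minimizes the 'loss' returned by eval_Slim, i.e. it maximizes recall.
trials = Trials()
best = fmin(fn=partial(eval_Slim, cfg=cfg, train_mat=train_mat,
                       eval_mat=eval_mat, experiment=experiment),
            space=search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
print(best)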