Example #1
# Imports (added so the example runs stand-alone; RecModel is the
# project-specific recommender package these examples come from)
import time

import numpy as np
import scipy.sparse

import RecModel

# Load data
train_mat = scipy.sparse.load_npz("data/mat_count_train.npz")
eval_mat = scipy.sparse.load_npz("data/mat_bin_test.npz")

# Optimal hyperparameters (damping is only used by the 'PR' eval method,
# so it stays NaN for 'k_step')
alpha = 1.191168
damping = np.nan
eval_method = 'k_step'
l1_ratio = 4.423678
max_iter = 11
phi = 0.997003
steps = 3
tol = 0.05244

# Define and train the model
rec = RecModel.Recwalk(num_items=train_mat.shape[1],
                       num_users=train_mat.shape[0],
                       eval_method=eval_method,
                       k_steps=steps,
                       damping=damping)
rec.train(train_mat=train_mat.copy(),
          phi=phi,
          alpha=alpha,
          l1_ratio=l1_ratio,
          max_iter=max_iter,
          tolerance=tol,
          cores=8,
          verbose=1)

# Evaluate the model
start = time.time()
recall = rec.eval_topn(eval_mat.copy(),
                       rand_sampled=1000,
                       topn=np.array([4, 10, 20, 50], dtype=np.int32),
                       random_state=1993,
                       cores=8)

# Print out the evaluation scores and the evaluation time
print(f"Recall was {recall} (evaluated in {time.time() - start:.1f}s).")

# Compute the top-4 coverage of the model and persist the per-item counts.
count_vec = RecModel.test_coverage(rec, eval_mat, 4)
np.save('data/count_vec_recwalk.npy', count_vec)  # np.save appends '.npy' to bare names anyway
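
For context, a small follow-up sketch (assuming, based on its use above, that test_coverage returns a per-item recommendation count vector) showing how the saved vector could be reloaded and summarized as catalog coverage:

count_vec = np.load('data/count_vec_recwalk.npy')
# Share of catalog items recommended at least once in the top-4 lists
catalog_coverage = np.mean(count_vec > 0)
print(f"Catalog coverage at top-4: {catalog_coverage:.2%}")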
    
Example #2

# Assumed module-level context for this example: time, numpy as np, mlflow,
# RecModel, hyperopt's STATUS_OK, and the project helpers unfold_config,
# utils.config_helpers, and log (a logging.Logger).
def eval_recwalk(params, cfg, train_mat_bin, train_mat_count, eval_mat,
                 experiment):
    # This is the objective Hyperopt minimizes: the negated top-n recall is
    # returned as 'loss'.
    print(experiment)  # (debug) show which mlflow experiment we log to
    with mlflow.start_run(experiment_id=experiment):

        # Flatten the nested search-space config into a flat parameter dict.
        params = unfold_config(params)

        # Log the Hydra model config (cfg.model)
        utils.config_helpers.log_config(dict(cfg.model))

        # Log the params in mlflow
        utils.config_helpers.log_config(params)

        n_users, n_items = train_mat_bin.shape
        np.random.seed(seed=cfg.model.seed)

        # Log this run's hyperparameters
        log.info(f"Hyperparameters for this run are {params}")

        # Only the 'PR' (PageRank) eval method uses a damping factor.
        if params['eval_method'] == 'PR':
            recwalk = RecModel.Recwalk(num_items=n_items,
                                       num_users=n_users,
                                       eval_method=params['eval_method'],
                                       k_steps=params['steps'],
                                       damping=params['damping'])
        else:
            recwalk = RecModel.Recwalk(num_items=n_items,
                                       num_users=n_users,
                                       eval_method=params['eval_method'],
                                       k_steps=params['steps'],
                                       damping=None)

        start = time.time()
        # Train on the count or the binary interaction matrix, per the sampled params.
        if params['train_mat'] == 'count':
            recwalk.train(train_mat_count.copy(),
                          phi=params['phi'],
                          alpha=params['alpha'],
                          l1_ratio=params['l1_ratio'],
                          max_iter=params['max_iter'],
                          tolerance=params['tol'],
                          cores=cfg.model.cores,
                          verbose=cfg.model.verbose)
        else:
            recwalk.train(train_mat_bin.copy(),
                          phi=params['phi'],
                          alpha=params['alpha'],
                          l1_ratio=params['l1_ratio'],
                          max_iter=params['max_iter'],
                          tolerance=params['tol'],
                          cores=cfg.model.cores,
                          verbose=cfg.model.verbose)

        # Log the training time
        mlflow.log_metric("training_time", int(round(time.time() - start, 0)))

        start = time.time()
        perf_all = recwalk.eval_topn(test_mat=eval_mat.copy(),
                                     topn=np.array(
                                         cfg.model.top_n_performances,
                                         dtype=np.int32),
                                     rand_sampled=int(cfg.model.rand_sampled),
                                     cores=int(cfg.model.cores),
                                     random_state=int(cfg.model.seed))
        mlflow.log_metric("Topn_evaluation_time",
                          int(round(time.time() - start, 0)))

        # Log the topn performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(
                f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])

        # We always optimize against the first top-n performance; ideally that
        # is also the smallest cutoff and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]
        log.info(
            f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}"
        )
        loss = -rel_topn_perf
        # STATUS_OK lives at hyperopt's top level (not in hyperopt.hp).
        return {'loss': loss, 'status': STATUS_OK, 'eval_time': time.time()}
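
For reference, a minimal sketch (not taken from the original source) of how this objective could be wired into Hyperopt's fmin. The search-space bounds, the nested 'model' key that unfold_config is assumed to flatten, and the functools.partial wiring are illustrative assumptions:

from functools import partial

import numpy as np
from hyperopt import Trials, fmin, hp, tpe

# Hypothetical nested search space; unfold_config(params) is assumed to
# flatten this nesting into the flat dict the objective indexes into.
space = {
    'model': {
        'alpha': hp.loguniform('alpha', np.log(1e-2), np.log(10.0)),
        'l1_ratio': hp.loguniform('l1_ratio', np.log(1e-2), np.log(10.0)),
        'phi': hp.uniform('phi', 0.9, 1.0),
        'steps': hp.choice('steps', [1, 2, 3]),
        'max_iter': hp.choice('max_iter', [5, 10, 20, 50]),
        'tol': hp.loguniform('tol', np.log(1e-4), np.log(1e-1)),
        'eval_method': hp.choice('eval_method', ['k_step', 'PR']),
        'damping': hp.uniform('damping', 0.1, 0.9),
        'train_mat': hp.choice('train_mat', ['count', 'bin']),
    }
}

# Bind the fixed arguments so fmin only has to supply the sampled params.
objective = partial(eval_recwalk, cfg=cfg, train_mat_bin=train_mat_bin,
                    train_mat_count=train_mat_count, eval_mat=eval_mat,
                    experiment=experiment)

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=100, trials=trials)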