Example No. 1
def nb_test_coverage():
    train_mat = scipy.sparse.load_npz("data/mat_bin_train_test.npz")
    test_mat = scipy.sparse.load_npz("data/mat_bin_validate_test.npz")
    n_users, n_items = train_mat.shape

    test_neighbor = RecModel.Neighborhood(num_items=n_items,
                                          num_users=n_users,
                                          nb_size=50)
    test_neighbor.train(train_mat.copy(), 'cosine', cores=8)
    perf = test_neighbor.eval_topn(test_mat=test_mat.copy(),
                                   rand_sampled=1000,
                                   topn=np.array([4, 10, 20, 50],
                                                 dtype=np.int32),
                                   random_state=1993,
                                   cores=7)
    print(f"The performance is {perf}")

    # Do the coverage evaluation and report how long it took.
    start = time.time()
    coverage = test_coverage(test_neighbor, train_mat, 10)
    print(f"Coverage evaluation took {time.time() - start} seconds")
    print(coverage[:10])
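For orientation, the sketch below shows one hypothetical way a coverage counter such as test_coverage could work: count how often each item appears in the users' top-n recommendation lists. The rank_fn interface is an assumption for illustration only, not RecModel's actual API.

# Hypothetical coverage counter (illustration only, not RecModel.test_coverage).
import numpy as np

def coverage_counts(rank_fn, n_users, n_items, topn):
    """rank_fn(user) is assumed to return item indices ordered by predicted score."""
    counts = np.zeros(n_items, dtype=np.int64)
    for user in range(n_users):
        for item in rank_fn(user)[:topn]:
            counts[item] += 1
    return counts  # e.g. catalog coverage: (counts > 0).sum() / n_items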
Example No. 2
cores = 8

# Hyperparameters (placeholder values; fill these in from your own tuning run)
alpha = 0.1
l1_ratio = 0.5
max_iter = 30
tol = 0.01

# Load and pre-process the data
test_utility_mat = scipy.sparse.load_npz("data/mat_bin_train.npz")
test_eval_utility_mat = scipy.sparse.load_npz("data/mat_bin_test.npz")

test_utility_mat.sort_indices()
test_utility_mat = test_utility_mat.astype(np.float64)

test_eval_utility_mat.sort_indices()
test_eval_utility_mat = test_eval_utility_mat.astype(np.float64)

n_users, n_items = test_utility_mat.shape

# Create the model
slim = RecModel.Slim(num_items=n_items, num_users=n_users)

# Train the model
start = time.time()
slim.train(X=test_utility_mat,
           alpha=alpha,
           l1_ratio=l1_ratio,
           max_iter=max_iter,
           tolerance=tol,
           cores=cores,
           verbose=1)
print(f"Execution took {(time.time() - start) / 60} minutes")

# Evaluate the model
start = time.time()
recall = slim.eval_topn(test_eval_utility_mat,
                        rand_sampled=1000,
                        topn=np.array([4, 10, 20, 50], dtype=np.int32),
                        random_state=1993,
                        cores=cores)  # call completed to mirror the eval_topn usage in the other examples
print(f"Evaluation took {(time.time() - start) / 60} minutes")
print(f"Recall was {recall}.")
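For background, SLIM learns a sparse item-to-item weight matrix by solving one elastic-net regression per item. The sketch below uses scikit-learn's ElasticNet as a stand-in to show the idea; it is not RecModel's optimized implementation, and the hyperparameter values are illustrative.

# Rough per-item elastic-net sketch of SLIM-style training (illustration only).
import numpy as np
import scipy.sparse
from sklearn.linear_model import ElasticNet

def slim_sketch(X, alpha=0.1, l1_ratio=0.5, max_iter=30):
    X = scipy.sparse.csc_matrix(X)
    n_items = X.shape[1]
    W = np.zeros((n_items, n_items))
    for j in range(n_items):
        target = np.asarray(X[:, j].todense()).ravel()
        # Exclude item j itself so it cannot trivially predict itself.
        mask = np.ones(n_items, dtype=bool)
        mask[j] = False
        reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, positive=True,
                         fit_intercept=False, max_iter=max_iter)
        reg.fit(X[:, mask], target)
        W[mask, j] = reg.coef_
    return W  # predicted scores are X @ W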
Example No. 3
def eval_MF(params, cfg, train_mat_bin, train_mat_count, eval_mat, experiment):
    # This function is what Hyperopt is going to optimize (minimize 'loss' value)
    print(experiment)
    with mlflow.start_run(experiment_id=experiment):

        # flatten the config.
        params = unfold_config(params)

        # Log the config in hydra
        utils.config_helpers.log_config(dict(cfg.model))

        # Log the params in mlflow
        utils.config_helpers.log_config(params)

        n_users, n_items = train_mat_bin.shape
        np.random.seed(seed=cfg.model.seed)

        # Create model and train and evaluate it.
        if params['weighted'] == 'weighted':
            if params['bias'] == 1:
                MF = RecModel.WMF(num_items=n_items,
                                  num_users=n_users,
                                  dim=params['dim'],
                                  gamma=params['gamma'],
                                  weighted=True,
                                  bias=True,
                                  seed=int(cfg.model.seed))
            elif params['bias'] == 0:
                MF = RecModel.WMF(num_items=n_items,
                                  num_users=n_users,
                                  dim=params['dim'],
                                  gamma=params['gamma'],
                                  weighted=True,
                                  bias=False,
                                  seed=int(cfg.model.seed))
        elif params['weighted'] == 'non_weighted':
            MF = RecModel.WMF(num_items=n_items,
                              num_users=n_users,
                              dim=params['dim'],
                              gamma=params['gamma'],
                              weighted=False,
                              bias=False,
                              seed=int(cfg.model.seed))

        start = time.time()
        if params['weighted'] == 'non_weighted':
            if params['mat'] == 'count':
                MF.train(utility_mat=train_mat_count.copy(),
                         iterations=int(cfg.model.iterations),
                         verbose=int(cfg.model.verbose),
                         eval_mat=eval_mat.copy(),
                         cores=int(cfg.model.cores),
                         alpha=params['alpha'],
                         stopping_rounds=int(cfg.model.stopping_rounds),
                         dtype='float32',
                         min_improvement=float(cfg.model.min_improvement),
                         pre_process_count=params['pre_process'],
                         beta=params['beta'],
                         preprocess_mat=params['pre_process'] != "None")

            elif params['mat'] == 'bin':
                MF.train(utility_mat=train_mat_bin.copy(),
                         iterations=int(cfg.model.iterations),
                         verbose=int(cfg.model.verbose),
                         eval_mat=eval_mat.copy(),
                         cores=int(cfg.model.cores),
                         alpha=params['alpha'],
                         stopping_rounds=int(cfg.model.stopping_rounds),
                         dtype='float32',
                         min_improvement=float(cfg.model.min_improvement),
                         pre_process_count=params['pre_process'],
                         beta=params['beta'],
                         preprocess_mat=False)
            else:
                raise ValueError(
                    f"mat can only be one of ['count', 'bin'] not {params['mat']}"
                )

        elif params['weighted'] == 'weighted':
            MF.train(utility_mat=train_mat_bin.copy(),
                     count_mat=train_mat_count.copy(),
                     iterations=int(cfg.model.iterations),
                     verbose=int(cfg.model.verbose),
                     eval_mat=eval_mat.copy(),
                     cores=int(cfg.model.cores),
                     alpha=params['alpha'],
                     stopping_rounds=int(cfg.model.stopping_rounds),
                     dtype='float32',
                     min_improvement=float(cfg.model.min_improvement),
                     pre_process_count=params['pre_process'],
                     beta=params['beta'],
                     preprocess_mat=False)

        else:
            raise ValueError(
                f"weighted can only be one of ['weighted', 'non_weighted'] not {params['weighted']}"
            )

        # Log the training time
        mlflow.log_metric("training_time", int(round(time.time() - start, 0)))

        start = time.time()
        perf_all = MF.eval_topn(test_mat=eval_mat.copy(),
                                topn=np.array(cfg.model.top_n_performances,
                                              dtype=np.int32),
                                rand_sampled=int(cfg.model.rand_sampled),
                                cores=int(cfg.model.cores),
                                random_state=int(cfg.model.seed))
        mlflow.log_metric("Topn_evaluation_time",
                          int(round(time.time() - start, 0)))

        mse_train = MF.eval_prec(utility_mat=train_mat_count.copy())
        mse_test = MF.eval_prec(utility_mat=eval_mat.copy())

        # Log the topn performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(
                f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])

        # Log the accuracy
        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("mse_test", mse_test)

        # We always use the first top-n performance; ideally that is also the smallest and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]
        log.info(
            f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}"
        )
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}
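The function above only defines the objective that Hyperopt minimizes. Below is a hedged sketch of how such an objective could be wired into fmin; the search space, value ranges, and max_evals are purely illustrative (the real space must match whatever structure unfold_config expects), the data matrices, cfg, and experiment are assumed to exist in the calling scope, and hyperopt is imported under the alias hp, as the objectives here appear to assume.

# Illustrative Hyperopt wiring for an objective like eval_MF (not the project's actual tuning script).
from functools import partial
import hyperopt as hp

space = {
    'dim': hp.hp.choice('dim', [16, 32, 64, 128]),
    'gamma': hp.hp.loguniform('gamma', -6, 0),
    'alpha': hp.hp.loguniform('alpha', 0, 6),
    'beta': hp.hp.uniform('beta', 0.0, 1.0),
    'weighted': hp.hp.choice('weighted', ['weighted', 'non_weighted']),
    'bias': hp.hp.choice('bias', [0, 1]),
    'mat': hp.hp.choice('mat', ['count', 'bin']),
    'pre_process': hp.hp.choice('pre_process', ['None', 'log']),
}

objective = partial(eval_MF, cfg=cfg, train_mat_bin=train_mat_bin,
                    train_mat_count=train_mat_count, eval_mat=eval_mat,
                    experiment=experiment)

best = hp.fmin(fn=objective, space=space, algo=hp.tpe.suggest,
               max_evals=50, trials=hp.Trials())
print(best)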
Example No. 4
# Load data
train_mat = scipy.sparse.load_npz("data/mat_count_train.npz")
eval_mat = scipy.sparse.load_npz("data/mat_bin_test.npz")

# Optimal hyperparameters
alpha = 1.191168
damping = np.nan
eval_method = 'k_step'
l1_ratio = 4.423678
max_iter = 11
phi = 0.997003
steps = 3
tol = 0.05244

# Define and train the model
rec = RecModel.Recwalk(num_items=train_mat.shape[1], num_users=train_mat.shape[0], eval_method=eval_method, k_steps=steps, damping=damping)
rec.train(train_mat=train_mat.copy(), phi=phi, alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter, tolerance=tol, cores=8, verbose=1)

# Evaluate the model
start = time.time()
recall = rec.eval_topn(eval_mat.copy(), rand_sampled=1000, topn=np.array([4, 10, 20, 50], dtype=np.int32), random_state=1993, cores=8)

# Print out evaluation scores
print(f"Recall was {recall}.")

# Compute the coverage of the model.
count_vec = RecModel.test_coverage(rec, eval_mat, 4)
np.save('data/count_vec_recwalk.npy', count_vec)
    
Example No. 5
import RecModel
import numpy as np
import scipy.sparse
import multiprocessing
import time

train_mat = scipy.sparse.load_npz("data/mat_bin_train.npz")
eval_mat = scipy.sparse.load_npz("data/mat_bin_test.npz")

# Optimal hyperparameters
cores = 8
alpha = 1338.409547
verbose = 1

# Define the model
ease = RecModel.Ease(num_items=train_mat.shape[1], num_users=train_mat.shape[0])

# Train the model
start = time.time()
ease.train(train_mat.copy(), alpha=alpha, verbose=verbose, cores=cores)
print(f"fitted ease in  {time.time() - start} seconds")

# Print out the performance
print(ease.eval_topn(test_mat=eval_mat.copy(), topn=np.array([4, 10, 20, 50], dtype=np.int32), rand_sampled=1000, cores=cores))

# Compute the coverage
count_vec = RecModel.test_coverage(ease, eval_mat, 4)

# Write out the coverage for later analysis
np.save('data/count_vec_ease.npy', count_vec)
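As background, EASE admits a closed-form solution (Steck, 2019): invert the L2-regularized item-item Gram matrix and rescale its columns. The sketch below shows that textbook form; it is assumed, not guaranteed, to match what Ease.train computes internally, and it is only practical for modest item counts because of the dense inverse.

# Textbook closed-form EASE solution (illustration; not necessarily RecModel's implementation).
import numpy as np
import scipy.sparse

def ease_closed_form(X, alpha=100.0):
    X = scipy.sparse.csr_matrix(X)
    G = (X.T @ X).toarray().astype(np.float64)  # item-item Gram matrix
    G[np.diag_indices_from(G)] += alpha         # L2 regularization on the diagonal
    P = np.linalg.inv(G)
    B = -P / np.diag(P)                         # B[i, j] = -P[i, j] / P[j, j]
    np.fill_diagonal(B, 0.0)                    # self-similarity is constrained to zero
    return B                                    # predicted scores are X @ B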
Example No. 6
def eval_recwalk(params, cfg, train_mat_bin, train_mat_count, eval_mat,
                 experiment):
    # This function is what Hyperopt is going to optimize (minimize 'loss' value)
    print(experiment)
    with mlflow.start_run(experiment_id=experiment):

        # flatten the config.
        params = unfold_config(params)

        # Log the config in hydra
        utils.config_helpers.log_config(dict(cfg.model))

        # Log the params in mlflow
        utils.config_helpers.log_config(params)

        n_users, n_items = train_mat_bin.shape
        np.random.seed(seed=cfg.model.seed)

        # Log this run
        log.info(f"Hyper parameter for this run are {params}")

        if params['eval_method'] == 'PR':
            recwalk = RecModel.Recwalk(num_items=n_items,
                                       num_users=n_users,
                                       eval_method=params['eval_method'],
                                       k_steps=params['steps'],
                                       damping=params['damping'])
        else:
            recwalk = RecModel.Recwalk(num_items=n_items,
                                       num_users=n_users,
                                       eval_method=params['eval_method'],
                                       k_steps=params['steps'],
                                       damping=None)

        start = time.time()
        if params['train_mat'] == 'count':
            recwalk.train(train_mat_count.copy(),
                          phi=params['phi'],
                          alpha=params['alpha'],
                          l1_ratio=params['l1_ratio'],
                          max_iter=params['max_iter'],
                          tolerance=params['tol'],
                          cores=cfg.model.cores,
                          verbose=cfg.model.verbose)
        else:
            recwalk.train(train_mat_bin.copy(),
                          phi=params['phi'],
                          alpha=params['alpha'],
                          l1_ratio=params['l1_ratio'],
                          max_iter=params['max_iter'],
                          tolerance=params['tol'],
                          cores=cfg.model.cores,
                          verbose=cfg.model.verbose)

        # Log the training time
        mlflow.log_metric("training_time", int(round(time.time() - start, 0)))

        start = time.time()
        perf_all = recwalk.eval_topn(test_mat=eval_mat.copy(),
                                     topn=np.array(
                                         cfg.model.top_n_performances,
                                         dtype=np.int32),
                                     rand_sampled=int(cfg.model.rand_sampled),
                                     cores=int(cfg.model.cores),
                                     random_state=int(cfg.model.seed))
        mlflow.log_metric("Topn_evaluation_time",
                          int(round(time.time() - start, 0)))

        # Log the topn performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(
                f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])

        # We always use the first top-n performance; ideally that is also the smallest and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]
        log.info(
            f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}"
        )
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}
Example No. 7
def eval_neighborhood(params, cfg, train_mat_bin, train_mat_count, eval_mat, experiment):
    # This function is what Hyperopt is going to optimize (minimize 'loss' value)
    with mlflow.start_run(experiment_id=experiment):

        # flatten the config.
        params = unfold_config(params)

        # Make the neighborhood size to integer.
        params['neighborhood_size'] = max(int(params['neighborhood_size']), 1)
        try:
            params['p'] = int(params['p'])
        except KeyError:
            # give it some dummy value, will be provided if needed.
            params['p'] = 1

        # Log the config
        utils.config_helpers.log_config(dict(cfg.model))        

        n_users, n_items = train_mat_bin.shape

        # Log relevant parameters for this run.
        print("Testing the following hyper parmaters!")
        for key, val in dict(params).items():
            mlflow.log_param(key, val)
            if int(cfg.model.verbose) > 0:
                print(f"{key}: {val}")
        
        # Select the correct matrix to train.
        if params['matrix'] == 'count':
            train_mat = train_mat_count.copy()

        elif params['matrix'] == 'binary':
            train_mat = train_mat_bin.copy()

        else:
            raise ValueError(f"matrix can only take values 'count' or 'binary' and not {params['matrix']}")
        
        # Create the Neighborhood model
        neighborhood_model = RecModel.Neighborhood(num_items=n_items, num_users=n_users, nb_size=params['neighborhood_size'])

        print(f"start training!")
        start = time.time()
        neighborhood_model.train(X=train_mat.copy(), similarity_function=params['similarity_function'], cores=int(cfg.model.cores), p=params['p'])

        if params['matrix'] == 'count':
            neighborhood_model.weights_only=False
        elif params['matrix'] == 'binary':
            neighborhood_model.weights_only=True
        else:
            raise ValueError(f"matrix can only take values 'count' or 'binary' and not {params['matrix']}")

        # Log run-time
        mlflow.log_metric("Runtime", int(round(time.time() - start, 0)))

        # Evaluate model
        perf_all = neighborhood_model.eval_topn(test_mat=eval_mat.copy(), topn=np.array(cfg.model.top_n_performances, dtype=np.int32), rand_sampled=int(cfg.model.rand_sampled), random_state=int(cfg.model.random_state), cores=int(cfg.model.cores))
        
        # Log the performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}", perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])
        
        # We always use the first top-n performance; ideally that is also the smallest and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]

        log.info(f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}.")
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}
Example No. 8
# Assign an optimizer
if weight_decay == 'decay':
    test.set_optimizer(
        torch.optim.AdamW(test.parameters(),
                          lr=learning_rate,
                          weight_decay=weight_decay_rate))
elif weight_decay == 'no_decay':
    test.set_optimizer(torch.optim.Adam(test.parameters(), lr=learning_rate))
else:
    raise ValueError(
        f"'{weight_decay}' is not a valid value for the weight decay")

# Train the model
test.train(X_train=train_mat.copy(),
           X_validate=eval_mat.copy(),
           batch_size=batch_size,
           epochs=epochs,
           verbose=verbose)

# Evaluate the model
top_n_on_test = test.eval_topn(test_mat=test_mat.copy(),
                               batch_size=batch_size,
                               topn=np.array([4, 10, 20, 50]),
                               rand_sampled=1000,
                               random_state=None)
print(f"topn on test: {top_n_on_test}")

# Compute the coverage
count_vec = RecModel.test_coverage(test, test_mat, 4)
np.save('data/count_vec_vae.npy', count_vec)
Example No. 9
import RecModel
import numpy as np
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import time

# Optimal hyperparameters
distance_function = 'jaccard'
neighborhood_size = 2

# Load data
train_mat = scipy.sparse.load_npz("data/mat_bin_train.npz")
test_mat = scipy.sparse.load_npz("data/mat_bin_test.npz")
n_users, n_items = train_mat.shape

# Train the model
test_neighbor = RecModel.Neighborhood(num_items=n_items, num_users=n_users, nb_size=neighborhood_size)
start = time.time()
test_neighbor.train(train_mat.copy(), distance_function, cores=8)
print(f"Training took {time.time() - start} seconds")

# Compute the performance and print it
perf = test_neighbor.eval_topn(test_mat=test_mat.copy(), rand_sampled=1000, topn=np.array([4, 10, 20, 50], dtype=np.int32), random_state=1993, cores=7)
print(f"Perf was: {perf}")

# Compute the coverage and print it!
count_vec = RecModel.test_coverage(test_neighbor, test_mat, 4)
np.save('data/count_vec_neighbor.npy', count_vec)
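Since this neighborhood model is trained with 'jaccard' as its similarity function, here is a minimal reference sketch of Jaccard similarity between two binary item vectors, using the standard definition; RecModel's internal computation may differ in detail.

# Standard Jaccard similarity between two binary vectors (reference only).
import numpy as np

def jaccard_similarity(a, b):
    a, b = np.asarray(a, dtype=bool), np.asarray(b, dtype=bool)
    union = np.logical_or(a, b).sum()
    return np.logical_and(a, b).sum() / union if union else 0.0

print(jaccard_similarity([1, 0, 1, 1], [1, 1, 0, 1]))  # 2 shared of 4 total -> 0.5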
Example No. 10
def eval_VAE(params, cfg, train_mat_bin, train_mat_count, eval_mat,
             experiment):
    # This function is what Hyperopt is going to optimize (minimize 'loss' value)
    with mlflow.start_run(experiment_id=experiment):

        # flatten the config.
        params = unfold_config(params)

        # Scale the learning rate down; too high a learning rate leads to NaN loss.
        params['learning_rate'] /= 250

        # Number of epochs should be integer and at least 1.
        params['n_epochs'] = 1 + int(params['n_epochs'])

        # Log the config
        utils.config_helpers.log_config(dict(cfg.model))

        n_users, n_items = train_mat_bin.shape

        # Some simple pre-processing steps for the params
        params['k'] = max(int(params['k']), 1)
        params['dense_layers_encoder_mu'] = max(
            int(params['dense_layers_encoder_mu']), 1)
        params['dense_layers_encoder_sigma'] = max(
            int(params['dense_layers_encoder_sigma']), 1)
        params['dense_layers_decoder'] = max(
            int(params['dense_layers_decoder']), 1)

        # Log relevant parameters for this run.
        print("Testing the following hyper parmaters!")
        for key, val in dict(params).items():
            mlflow.log_param(key, val)
            if int(cfg.model.verbose) > 0:
                print(f"{key}: {val}")

        # Select the correct matrix to train.
        if params['mat'] == 'count':
            train_mat = train_mat_count.copy()
        elif params['mat'] == 'bin':
            train_mat = train_mat_bin.copy()
        else:
            raise ValueError(
                f"mat can only take values 'count' or 'bin' and not {params['mat']}"
            )

        # Create the Mult_VAE model
        vae = RecModel.Mult_VAE(
            k=int(params['k']),
            num_items=train_mat.shape[1],
            dense_layers_encoder_mu=[int(params['dense_layers_encoder_mu'])],
            dense_layers_encoder_sigma=[
                int(params['dense_layers_encoder_sigma'])
            ],
            dense_layers_decoder=[int(params['dense_layers_decoder'])],
            batch_norm_encoder_mu=params['batch_norm_encoder_mu'],
            batch_norm_encoder_sigma=params['batch_norm_encoder_sigma'],
            batch_norm_decoder=params['batch_norm_decoder'],
            dropout_rate_decoder=params['dropout_rate_decoder'],
            dropout_rate_encoder_mu=params['dropout_rate_encoder_mu'],
            dropout_rate_encoder_sigma=params['dropout_rate_encoder_sigma'],
            dropout_rate_sparse_encoder_mu=params[
                'dropout_rate_sparse_encoder_mu'],
            dropout_rate_sparse_encoder_sigma=params[
                'dropout_rate_sparse_encoder_sigma'],
            beta=params['beta'])
        # Set optimizer
        if params['weight_decay'] == 'decay':
            vae.set_optimizer(
                torch.optim.AdamW(vae.parameters(),
                                  lr=params['learning_rate'],
                                  weight_decay=params['weight_decay_scale']))
        elif params['weight_decay'] == 'no_decay':
            vae.set_optimizer(
                torch.optim.Adam(vae.parameters(), lr=params['learning_rate']))
        else:
            raise ValueError(
                f"'{params['weigth_decay']}' is not a valid value for weight_decay'"
            )

        print(f"start training!")
        start = time.time()
        epochs = vae.train(X_train=train_mat.copy(),
                           X_validate=eval_mat.copy(),
                           batch_size=int(cfg.model.batch_size),
                           epochs=int(params['n_epochs']),
                           verbose=int(cfg.model.verbose))

        # Log run-time
        mlflow.log_metric("Runtime", int(round(time.time() - start, 0)))
        mlflow.log_metric("epochs_training", epochs + 1)

        # Evaluate model
        perf_all = vae.eval_topn(test_mat=eval_mat.copy(),
                                 batch_size=int(cfg.model.batch_size),
                                 topn=np.array(cfg.model.top_n_performances),
                                 rand_sampled=int(cfg.model.rand_sampled),
                                 random_state=None)

        # Log the performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(
                f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])

        # We always use the first top-n performance; ideally that is also the smallest and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]

        log.info(
            f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf} and model ran for {epochs + 1} epochs"
        )
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}
Example No. 11
def eval_Slim(params, cfg, train_mat, eval_mat, experiment):
    # This function is what Hyperopt is going to optimize (minimize 'loss' value)
    print(experiment)
    with mlflow.start_run(experiment_id=experiment):

        # Log the config
        utils.config_helpers.log_config(dict(cfg.model))

        n_users, n_items = train_mat.shape
        np.random.seed(seed=cfg.model.seed)

        # Log relevant parameters for this run.
        mlflow.log_param("alpha", params['alpha'])
        mlflow.log_param("l1_ratio", params['l1_ratio'])
        mlflow.log_param("max_iter", params['max_iter'])
        mlflow.log_param("tol", params['tol'])

        # Log this run
        log.info(
            f"Testing alpha: {params['alpha']}, l1_ratio: {params['l1_ratio']}, max_iter: {params['max_iter']} and tol: {params['tol']}"
        )

        start = time.time()
        # Create model
        slim = RecModel.Slim(num_items=n_items, num_users=n_users)

        # Train Model
        slim.train(X=train_mat.copy(),
                   alpha=params['alpha'],
                   l1_ratio=params['l1_ratio'],
                   max_iter=params['max_iter'],
                   tolerance=params['tol'],
                   cores=1,
                   verbose=int(cfg.model.verbose))

        # Log run-time
        mlflow.log_metric("Runtime", int(round(time.time() - start, 0)))

        # Evaluate model
        perf_all = slim.eval_topn(eval_mat.copy(),
                                  rand_sampled=int(cfg.model.rand_sampled),
                                  topn=np.array(cfg.model.top_n_performances,
                                                dtype=np.int32),
                                  random_state=int(cfg.model.seed),
                                  cores=int(cfg.model.cores))

        # Log the performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(
                f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])
        mlflow.log_metric('MAE_train', slim.eval_prec(train_mat.copy()))
        mlflow.log_metric('MAE_eval', slim.eval_prec(eval_mat.copy()))

        # We always use the first top-n performance; ideally that is also the smallest and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]

        log.info(
            f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}"
        )
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}
Example No. 12
min_improvement = 0.01

alpha = 21.951598
beta = 4.014181
preprocess = 'log'
rand_sampled = 1000

# Copy the matrices.
train_mat_save = train_mat.copy()
eval_mat_save = eval_mat.copy()
count_mat_save = count_mat.copy()

# Create the class object
test_MF = RecModel.WMF(num_items=train_mat.shape[1],
                       num_users=train_mat.shape[0],
                       dim=dim,
                       gamma=gamma,
                       weighted=weighted,
                       bias=bias)

# Train the model
iter_run = test_MF.train(utility_mat=train_mat.copy(),
                         count_mat=count_mat.copy(),
                         iterations=iterations,
                         verbose=verbose,
                         eval_mat=eval_mat.copy(),
                         cores=cores,
                         alpha=alpha,
                         stopping_rounds=stopping_rounds,
                         dtype='float32',
                         min_improvement=min_improvement,
                         pre_process_count=preprocess,
                         beta=beta)  # call completed with `beta` defined above, mirroring the WMF.train usage in the earlier objective

Example No. 13
import RecModel
import numpy as np
import scipy.sparse
import multiprocessing

eval_mat = scipy.sparse.load_npz("data/mat_bin_test.npz")

# Optimal hyperparameters (fill in from the mlflow run)
rand_sampled = 1000
cores = 1

# Define the variable
test_naive_baseline = RecModel.NaiveBaseline(eval_mat.shape[0])

# Compute the performance and write it out
perf_all = test_naive_baseline.eval_topn(test_mat=eval_mat,
                                         rand_sampled=1000,
                                         topn=np.array([4, 10, 20, 50],
                                                       dtype=np.int32),
                                         random_state=1993)
print(f"The recalls are {perf_all}")

# Compute the coverage
count_vec = RecModel.test_coverage(test_naive_baseline, eval_mat, 4)

# Compute and print the catalog coverage
print((count_vec > 0.0).sum() / len(count_vec))
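For background on the Recall@k numbers that eval_topn reports throughout these examples, below is an illustrative sketch of a sampled Recall@k protocol: each user's held-out items are ranked against rand_sampled randomly drawn candidate items and counted as hits when they land in the top k. The score_fn interface and the exact sampling scheme are assumptions for illustration, not RecModel's implementation.

# Illustrative sampled Recall@k (reference only; not RecModel's eval_topn).
import numpy as np
import scipy.sparse

def sampled_recall_at_k(score_fn, test_mat, k=4, rand_sampled=1000, seed=1993):
    """score_fn(user, candidate_items) is assumed to return one score per candidate."""
    rng = np.random.default_rng(seed)
    test_csr = scipy.sparse.csr_matrix(test_mat)
    n_users, n_items = test_csr.shape
    hits, total = 0, 0
    for user in range(n_users):
        positives = test_csr.indices[test_csr.indptr[user]:test_csr.indptr[user + 1]]
        if len(positives) == 0:
            continue
        negatives = rng.choice(n_items, size=rand_sampled, replace=False)
        candidates = np.concatenate([positives, negatives])
        scores = score_fn(user, candidates)
        top_k = candidates[np.argsort(-scores)[:k]]
        hits += np.isin(positives, top_k).sum()
        total += len(positives)
    return hits / total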
Example No. 14
def eval_Ease(params, cfg, train_mat_bin, train_mat_count, eval_mat, experiment):
    # This function is what Hyperopt is going to optimize (minimize 'loss' value)
    print(experiment)
    with mlflow.start_run(experiment_id=experiment):

        # Log the config
        utils.config_helpers.log_config(dict(cfg.model))        

        n_users, n_items = train_mat_bin.shape
        np.random.seed(seed=cfg.model.seed)

        # Log relevant parameters for this run.
        mlflow.log_param("alpha", params['alpha'])
        mlflow.log_param("mat", params['mat'])

        # Log this run
        log.info(f"Testing  alpha: {params['alpha']}, and mat: {params['mat']}")
        
        start = time.time()       

        # Create model
        ease = RecModel.Ease(num_items=n_items, num_users=n_users)
        print(f"start training!, number of cores {int(cfg.model.cores)}")
        
        if params['mat'] == 'count':
            ease.train(train_mat_count.copy(), alpha=params['alpha'], verbose=int(cfg.model.verbose), cores=int(cfg.model.cores))
        elif params['mat'] == 'bin':
            ease.train(train_mat_bin.copy(), alpha=params['alpha'], verbose=int(cfg.model.verbose), cores=int(cfg.model.cores))
        else:
            raise ValueError(f"mat can only take values 'count' or 'bin' and not {params['mat']}")
        
        print('trained model')
        # Log run-time
        mlflow.log_metric("Runtime", int(round(time.time() - start, 0)))

        # Evaluate model
        perf_all = ease.eval_topn(test_mat=eval_mat.copy(),
                                  topn=np.array(cfg.model.top_n_performances, dtype=np.int32),
                                  rand_sampled=int(cfg.model.rand_sampled),
                                  cores=int(cfg.model.cores),
                                  random_state=int(cfg.model.seed))
        print('estimated performance')

        # Log the performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}", perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])

        print('logged recall')
        if params['mat'] == 'count':
            mlflow.log_metric('MSE_train', ease.eval_prec(utility_mat=train_mat_count.copy()))
        elif params['mat'] == 'bin':
            mlflow.log_metric('MSE_train', ease.eval_prec(utility_mat=train_mat_bin.copy()))
        else:
            raise ValueError(f"mat can only take values 'count' or 'bin' and not {params['mat']}")

        print('estimated mse')

        # We always use the first top-n performance; ideally that is also the smallest and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]
        print('extracted performance')        
        log.info(f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}")
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}