def run_simulations(self, num_sims: int = 5) -> None:
    """Run `num_sims` training simulations of MF-IPS and save aggregated metrics.

    For each seed the datasets are re-split, a fresh TF graph and session are
    built, and the model is trained with or without asymmetric tri-training
    depending on `self.model_name` (the '-at' suffix selects tri-training).
    Per-seed MAE/MSE/nDCG@3 are written to
    `../logs/{data}/{model_name}/results.csv`.

    Parameters
    ----------
    num_sims : int
        Number of independent simulations (seeds 0..num_sims-1).
    """
    results_mse = []
    results_mae = []
    results_ndcg = []
    # start running simulations
    start = time.time()
    for seed in np.arange(num_sims):
        train, val, test, num_users, num_items = \
            preprocess_datasets(data=self.data, seed=seed)

        ops.reset_default_graph()
        tf.set_random_seed(seed)
        sess = tf.Session()
        try:
            if '-at' not in self.model_name:
                model = MFIPS(num_users=num_users, num_items=num_items,
                              dim=self.dim, lam=self.lam, eta=self.eta)
                _, mse, mae, u_emb, i_emb, i_bias = train_mfips(
                    sess, model=model, data=self.data,
                    train=train, val=val, test=test,
                    max_iters=self.max_iters, batch_size=self.batch_size,
                    model_name=self.model_name, seed=seed)
            else:
                # asymmetric tri-training needs three independent MF-IPS models
                model = MFIPS(num_users=num_users, num_items=num_items,
                              dim=self.dim, lam=self.lam, eta=self.eta, num=0)
                model1 = MFIPS(num_users=num_users, num_items=num_items,
                               dim=self.dim, lam=self.lam, eta=self.eta, num=1)
                model2 = MFIPS(num_users=num_users, num_items=num_items,
                               dim=self.dim, lam=self.lam, eta=self.eta, num=2)
                _, mse, mae, u_emb, i_emb, i_bias = train_mfips_with_at(
                    sess, model=model, mfips1=model1, mfips2=model2,
                    data=self.data, train=train, val=val, test=test,
                    epsilon=self.epsilon, pre_iters=self.pre_iters,
                    post_iters=self.post_iters, post_steps=self.post_steps,
                    batch_size=self.batch_size, model_name=self.model_name,
                    seed=seed)
        finally:
            # release TF resources between simulations; the original leaked
            # one open session per seed
            sess.close()

        results_mae.append(mae)
        results_mse.append(mse)
        ndcg = aoa_evaluator(user_embed=u_emb, item_embed=i_emb,
                             item_bias=i_bias, test=test)
        results_ndcg.append(ndcg)
        print(f'#{seed+1} {self.model_name}: '
              f'{np.round((time.time() - start) / 60, 2)} min')

    # aggregate and save the final results
    result_path = Path(f'../logs/{self.data}/{self.model_name}')
    result_path.mkdir(parents=True, exist_ok=True)
    # keyword `axis=1`: the positional form was deprecated and removed in
    # modern pandas
    pd.concat([pd.DataFrame(results_mae, columns=['MAE']),
               pd.DataFrame(results_mse, columns=['MSE']),
               pd.DataFrame(results_ndcg, columns=['nDCG@3'])], axis=1) \
        .to_csv(str(result_path / 'results.csv'))
def summarize_data_statistics() -> None:
    """Save dataset statistics with Tex Table Format.

    Writes one row per dataset (users, items, ratings, sparsity, average
    ratings of train/test, KL divergence between the two rating
    distributions) to `../paper_results/data_stat.csv`, '&'-separated so the
    rows paste directly into a LaTeX table.
    """
    stat_data_list = []
    Path('../paper_results').mkdir(exist_ok=True)
    for data in datasets:
        train, _, test, num_users, num_items = preprocess_datasets(data=data)
        num_data = train.shape[0]
        # builtin round(): the original called `.round(4)` on the quotient,
        # which fails with AttributeError when the operands are plain Python
        # ints (float has no .round method). Rounding order is preserved:
        # round the observed fraction first, then scale to percent.
        sparsity = f'{100 * round(num_data / (num_users * num_items), 4)}%'
        avg_train = train[:, 2].mean().round(3)
        avg_test = test[:, 2].mean().round(3)
        kl = calc_kl_div(train, test)
        stat_data = DataFrame(
            data=[num_users, num_items, num_data, sparsity,
                  avg_train, avg_test, kl],
            index=stats_idx, columns=[data]).T
        stat_data_list.append(stat_data)
    pd.concat(stat_data_list).to_csv('../paper_results/data_stat.csv', sep='&')
def __call__(self, trial: Trial) -> float:
    """Calculate an objective value for one Optuna trial.

    Fixed training settings come from `../config.yaml`; `dim`, `lam` (and
    `epsilon` for the '-at' variants) are sampled by the trial. Returns the
    validation score produced by the training routine.
    """
    train, val, test, num_users, num_items = \
        preprocess_datasets(data=self.data, seed=12345)
    # sample a set of hyperparameters; `with` closes the config file
    # (the original left the handle open)
    with open('../config.yaml', 'r') as f:
        config = yaml.safe_load(f)
    eta = config['eta']
    max_iters = config['max_iters']
    batch_size = config['batch_size']
    pre_iters = config['pre_iters']
    post_iters = config['post_iters']
    post_steps = config['post_steps']
    dim = trial.suggest_discrete_uniform('dim', 5, 50, 5)
    lam = trial.suggest_loguniform('lam', 1e-6, 1)
    if '-at' in self.model_name:
        # epsilon is only consumed by the tri-training branch below
        epsilon = trial.suggest_loguniform('epsilon', 1e-3, 1)

    ops.reset_default_graph()
    tf.set_random_seed(12345)
    sess = tf.Session()
    try:
        if '-at' not in self.model_name:
            model = MFIPS(num_users=num_users, num_items=num_items,
                          dim=dim, lam=lam, eta=eta)
            score, _, _, _, _, _ = train_mfips(
                sess, model=model, data=self.data,
                train=train, val=val, test=test,
                max_iters=max_iters, batch_size=batch_size,
                model_name=self.model_name)
        else:
            # asymmetric tri-training needs three independent MF-IPS models
            model = MFIPS(num_users=num_users, num_items=num_items,
                          dim=dim, lam=lam, eta=eta, num=0)
            model1 = MFIPS(num_users=num_users, num_items=num_items,
                           dim=dim, lam=lam, eta=eta, num=1)
            model2 = MFIPS(num_users=num_users, num_items=num_items,
                           dim=dim, lam=lam, eta=eta, num=2)
            score, _, _, _, _, _ = train_mfips_with_at(
                sess, model=model, mfips1=model1, mfips2=model2,
                data=self.data, train=train, val=val, test=test,
                epsilon=epsilon, pre_iters=pre_iters,
                post_iters=post_iters, post_steps=post_steps,
                batch_size=batch_size, model_name=self.model_name)
    finally:
        # the original leaked one open session per trial
        sess.close()
    return score
def run_simulations(self, num_sims: int = 5) -> None:
    """Train MF models over `num_sims` simulations and save all metrics.

    For each seed the datasets are re-split, a fresh TF graph and session are
    built, and one of three training routines is chosen from
    `self.model_name`: '-without_ipw' (naive MF), plain IPW, or IPW with
    asymmetric tri-training ('-at'). MAE/MSE plus nDCG/Recall/MAP at
    {1, 3, 5} are written to `../logs/{data}/{model_name}/results.csv`.
    """
    result_path = Path(f'../logs/{self.data}/{self.model_name}')
    # Create the directory BEFORE attaching the FileHandler below, which
    # opens simulations.log immediately; the original only ran mkdir at the
    # very end, so a fresh checkout crashed on the first call.
    result_path.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(__name__)  # Create a custom logger
    # Attach handlers only once: re-adding them on every call duplicates
    # every log line.
    if not logger.handlers:
        c_handler = logging.StreamHandler()
        f_handler = logging.FileHandler(result_path / 'simulations.log',
                                        mode='w')
        message_format = logging.Formatter('%(message)s')
        c_handler.setFormatter(message_format)
        f_handler.setFormatter(message_format)
        logger.addHandler(c_handler)
        logger.addHandler(f_handler)
    # NOTE(review): messages below use logger.debug(); they are only emitted
    # if the logger/root level is configured to DEBUG elsewhere -- confirm.

    ranking_metrics = ['nDCG@1', 'nDCG@3', 'nDCG@5',
                       'Recall@1', 'Recall@3', 'Recall@5',
                       'MAP@1', 'MAP@3', 'MAP@5']
    # insertion order fixes the CSV column order: MAE, MSE, then the
    # ranking metrics above
    results = {metric: [] for metric in ['MAE', 'MSE'] + ranking_metrics}

    # start running simulations
    start = time.time()
    for seed in np.arange(num_sims):
        train, val, test, num_users, num_items = \
            preprocess_datasets(data=self.data, seed=seed)

        ops.reset_default_graph()
        tf.set_random_seed(seed)
        sess = tf.Session()
        try:
            if '-without_ipw' in self.model_name:
                logger.debug('*** Without IPW ***')
                model = MFMODEL(num_users=num_users, num_items=num_items,
                                dim=self.dim, lam=self.lam, eta=self.eta)
                _, mse, mae, u_emb, i_emb, u_bias, i_bias, g_bias = \
                    train_mfmodel_without_ipw(
                        sess, model=model, data=self.data,
                        train=train, val=val, test=test,
                        max_iters=self.max_iters, batch_size=self.batch_size,
                        model_name=self.model_name, seed=seed)
            elif '-at' not in self.model_name:
                logger.debug(
                    '*** With IPW and without Asymmetric Tri-training ***')
                model = MFMODEL(num_users=num_users, num_items=num_items,
                                dim=self.dim, lam=self.lam, eta=self.eta)
                _, mse, mae, u_emb, i_emb, u_bias, i_bias, g_bias = \
                    train_mfmodel(
                        sess, model=model, data=self.data,
                        train=train, val=val, test=test,
                        max_iters=self.max_iters, batch_size=self.batch_size,
                        model_name=self.model_name, seed=seed)
            else:
                logger.debug('*** With IPW and Asymmetric Tri-training ***')
                # tri-training needs three independent models
                model = MFMODEL(num_users=num_users, num_items=num_items,
                                dim=self.dim, lam=self.lam, eta=self.eta,
                                num=0)
                model1 = MFMODEL(num_users=num_users, num_items=num_items,
                                 dim=self.dim, lam=self.lam, eta=self.eta,
                                 num=1)
                model2 = MFMODEL(num_users=num_users, num_items=num_items,
                                 dim=self.dim, lam=self.lam, eta=self.eta,
                                 num=2)
                _, mse, mae, u_emb, i_emb, u_bias, i_bias, g_bias = \
                    train_mfmodel_with_at(
                        sess, model=model, mfmodel1=model1, mfmodel2=model2,
                        data=self.data, train=train, val=val, test=test,
                        epsilon=self.epsilon, pre_iters=self.pre_iters,
                        post_iters=self.post_iters,
                        post_steps=self.post_steps,
                        batch_size=self.batch_size,
                        model_name=self.model_name, seed=seed)
        finally:
            # release TF resources between simulations; the original leaked
            # one open session per seed
            sess.close()

        # After building a model, summarize metrics
        results['MAE'].append(mae)
        results['MSE'].append(mse)
        ranking_results_dic = aoa_evaluator_all_biases(
            user_embed=u_emb, item_embed=i_emb, user_bias=u_bias,
            item_bias=i_bias, global_bias=g_bias, test=test,
            model_name=self.model_name)
        for metric in ranking_metrics:
            results[metric].append(ranking_results_dic[metric])
        logger.debug(
            f'#{seed+1} {self.model_name}: '
            f'{np.round((time.time() - start) / 60, 2)} min')

    # Aggregate and save the final results; keyword `axis=1` because the
    # positional form was deprecated and removed in modern pandas.
    pd.concat([pd.DataFrame(values, columns=[metric])
               for metric, values in results.items()],
              axis=1).to_csv(str(result_path / 'results.csv'))
def __call__(self, trial: Trial) -> float:
    """Calculate an objective value for one Optuna trial.

    Fixed training settings come from `../config.yaml`; `dim`, `lam` (and
    `epsilon` for the '-at' variants) are sampled by the trial. Returns the
    validation score produced by the chosen training routine.
    """
    log_dir = Path(f'../logs/{self.data}/{self.model_name}')
    # FileHandler opens simulations.log immediately, so make sure the
    # directory exists first.
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(__name__)  # Create a custom logger
    # Attach handlers only once. The original re-added them on every trial,
    # duplicating log lines AND truncating the log (mode='w') each call.
    if not logger.handlers:
        c_handler = logging.StreamHandler()
        f_handler = logging.FileHandler(log_dir / 'simulations.log',
                                        mode='w')
        message_format = logging.Formatter('%(message)s')
        c_handler.setFormatter(message_format)
        f_handler.setFormatter(message_format)
        logger.addHandler(c_handler)
        logger.addHandler(f_handler)

    # NOTE(review): `rand_seed_val` is a module-level name defined outside
    # this block -- confirm it is set before the study runs.
    train, val, test, num_users, num_items = \
        preprocess_datasets(data=self.data, seed=rand_seed_val)
    # sample a set of hyperparameters; `with` closes the config file
    # (the original left the handle open)
    with open('../config.yaml', 'r') as f:
        config = yaml.safe_load(f)
    eta = config['eta']
    max_iters = config['max_iters']
    batch_size = config['batch_size']
    pre_iters = config['pre_iters']
    post_iters = config['post_iters']
    post_steps = config['post_steps']
    dim = trial.suggest_discrete_uniform('dim', 5, 50, 5)
    lam = trial.suggest_loguniform('lam', 1e-6, 1)
    if '-at' in self.model_name:
        # epsilon is only consumed by the tri-training branch below
        epsilon = trial.suggest_loguniform('epsilon', 1e-3, 1)

    ops.reset_default_graph()
    tf.set_random_seed(rand_seed_val)
    sess = tf.Session()
    try:
        if '-without_ipw' in self.model_name:
            logger.debug('*** Without IPW ***')
            model = MFMODEL(num_users=num_users, num_items=num_items,
                            dim=dim, lam=lam, eta=eta)
            score, _, _, _, _, _, _, _ = train_mfmodel_without_ipw(
                sess, model=model, data=self.data,
                train=train, val=val, test=test,
                max_iters=max_iters, batch_size=batch_size,
                model_name=self.model_name)
        elif '-at' not in self.model_name:
            logger.debug(
                '*** With IPW and without Asymmetric Tri-training ***')
            model = MFMODEL(num_users=num_users, num_items=num_items,
                            dim=dim, lam=lam, eta=eta)
            score, _, _, _, _, _, _, _ = train_mfmodel(
                sess, model=model, data=self.data,
                train=train, val=val, test=test,
                max_iters=max_iters, batch_size=batch_size,
                model_name=self.model_name)
        else:
            logger.debug('*** With IPW and Asymmetric Tri-training ***')
            # tri-training needs three independent models
            model = MFMODEL(num_users=num_users, num_items=num_items,
                            dim=dim, lam=lam, eta=eta, num=0)
            model1 = MFMODEL(num_users=num_users, num_items=num_items,
                             dim=dim, lam=lam, eta=eta, num=1)
            model2 = MFMODEL(num_users=num_users, num_items=num_items,
                             dim=dim, lam=lam, eta=eta, num=2)
            score, _, _, _, _, _, _, _ = train_mfmodel_with_at(
                sess, model=model, mfmodel1=model1, mfmodel2=model2,
                data=self.data, train=train, val=val, test=test,
                epsilon=epsilon, pre_iters=pre_iters,
                post_iters=post_iters, post_steps=post_steps,
                batch_size=batch_size, model_name=self.model_name)
    finally:
        # the original leaked one open session per trial
        sess.close()
    return score
""" Code for summarizing experimental results for the paper "Asymmetric Tri-training for Debiasing Missing-Not-At-Random Rating Feedback". """ import argparse import yaml from utils.preprocessor import preprocess_datasets from utils.result_tools import (summarize_data_statistics, summarize_experimental_results) parser = argparse.ArgumentParser() parser.add_argument('--datasets', '-d', type=str, nargs='*', required=True) if __name__ == "__main__": args = parser.parse_args() num_sims = yaml.safe_load(open('../config.yaml', 'rb'))['num_sims'] summarize_data_statistics() for data in args.datasets: train, _, test, _, _ = preprocess_datasets(data=data) summarize_experimental_results(data=data)