data.read_data()
print('Time passed for reading data: %d seconds' % (time.time() - start))

# Read the pretrained ranking models and rank the training split with each of them.
pretrain_models = prtr.read_many_models(args.model_file, data)
n_models = pretrain_models.shape[0]
print('Read %d models.' % n_models)

(_, train_inv_rankings) = rnk.many_models_data_split_rank_and_invert(
                                pretrain_models, data.train)
print('Finished ranking.')

# Position-based click model: P(click) = P(observed at rank) * P(click | relevance),
# with the observation probability zeroed beyond the cutoff rank (eta fixed to 1.).
rel_prob_f = clk.get_relevance_click_model(click_model)
obs_prob = clk.inverse_rank_prob(
               np.arange(data.train.max_query_size(), dtype=np.float64), 1.)
if cutoff:
  obs_prob[cutoff:] = 0.

# Expected number of clicks per training query (CTR) for every pretrained ranker.
train_rel_prob = rel_prob_f(data.train.label_vector)
ranker_obs_prob = obs_prob[train_inv_rankings]
CTR_per_ranker = np.sum(train_rel_prob[None, :] * ranker_obs_prob,
                        axis=1, dtype=np.float64) / float(data.train.num_queries())

print(CTR_per_ranker)
print('Writing results to %s' % args.output_path)
with open(args.output_path, 'w') as f:
  json.dump(list(CTR_per_ranker), f)
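# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the script above): under the position-based
# click model assumed here, the expected number of clicks on one query is
#   sum_d P(observed at rank of d) * P(click | relevance of d),
# which is exactly what the CTR computation above averages over all training
# queries. The 1/rank observation probabilities, the toy ranking, and the
# relevance probabilities below are made up for illustration only.
import numpy as np

toy_obs_prob = 1. / (np.arange(5, dtype=np.float64) + 1.)  # toy P(observe | rank), top rank first
toy_inv_ranking = np.array([2, 0, 4, 1, 3])                # rank position assigned to each document
toy_rel_prob = np.array([0.1, 0.9, 0.1, 0.5, 0.1])         # toy P(click | observed) per document

toy_expected_clicks = np.sum(toy_obs_prob[toy_inv_ranking] * toy_rel_prob)
print('Expected clicks on the toy query: %.3f' % toy_expected_clicks)
# ---------------------------------------------------------------------------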
# Select the two models that form the requested ranker pair (1-based pair index).
chosen_models = np.array([(args.ranker_pair-1)*2, (args.ranker_pair-1)*2+1])
pretrain_models = pretrain_models[chosen_models, :]
n_models = pretrain_models.shape[0]

(test_rankings, test_inv_rankings) = rnk.many_models_data_split_rank_and_invert(
                                          pretrain_models, data.test)

# Position-based click model: observation probability per rank, zeroed beyond the cutoff.
rel_prob_f = clk.get_relevance_click_model(click_model)
obs_prob = clk.inverse_rank_prob(
               np.arange(max(
                   data.train.max_query_size(),
                   data.validation.max_query_size(),
                   data.test.max_query_size(),
                 ), dtype=np.float64),
               eta)
if cutoff:
  obs_prob[cutoff:] = 0.

# Expected clicks per query (CTR) on the test split for each of the two rankers.
test_rel_prob = rel_prob_f(data.test.label_vector)
ranker_test_ctr = np.zeros(n_models, dtype=np.float64)
for i in range(n_models):
  ranker_test_ctr[i] = np.mean(obs_prob[test_inv_rankings[i]]*test_rel_prob)
  ranker_test_ctr[i] *= data.test.num_docs()/float(data.test.num_queries())

(models_train_rankings,
 models_train_inv_rankings) = rnk.many_models_data_split_rank_and_invert(
                                  pretrain_models,
                    default=0.00)
args = parser.parse_args()

num_select_queries = args.num_queries

# Load a single fold of the dataset.
data = dataset.get_dataset_from_json_info(
                  args.dataset,
                  args.dataset_info_path,
                )
data = data.get_data_folds()[0]
start = time.time()
data.read_data()
print('Time passed for reading data: %d seconds' % (time.time() - start))

# Position-based click model: observation probability per rank, zeroed beyond the cutoff.
obs_prob = clk.inverse_rank_prob(
    np.arange(data.validation.max_query_size(), dtype=np.float64),
    args.eta)
if args.cutoff:
  obs_prob[args.cutoff:] = 0
rel_prob_f = clk.get_relevance_click_model(args.click_model)

def calc_true_loss(ranking_model, data_split):
  # Negative expected number of clicks per query for a linear ranking model
  # under the position-based click model defined above.
  all_docs = data_split.feature_matrix
  all_scores = np.dot(all_docs, ranking_model)
  _, inv_rankings = rnk.data_split_rank_and_invert(all_scores, data_split)
  click_prob = obs_prob[inv_rankings] * rel_prob_f(data_split.label_vector)
  result = np.mean(click_prob, dtype=np.float64)
  result *= data_split.num_docs() / float(data_split.num_queries())
  return -result
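# ---------------------------------------------------------------------------
# Hedged usage sketch: calc_true_loss can be evaluated on the validation split,
# the split obs_prob was sized for above. `example_model` is a made-up random
# linear ranker used only for illustration; it is not produced anywhere in this
# script.
example_model = np.random.uniform(size=data.validation.feature_matrix.shape[1])
example_loss = calc_true_loss(example_model, data.validation)
print('Negative expected clicks per query for the random ranker: %.4f' % example_loss)
# ---------------------------------------------------------------------------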