# Example #1
# Read the dataset split into memory and report how long it took.
data.read_data()
print('Time past for reading data: %d seconds' % (time.time() - start))

# One pretrained ranking model per row.
pretrain_models = prtr.read_many_models(args.model_file, data)
n_models = pretrain_models.shape[0]
print('Read %d models.' % n_models)

# Rank the training split with every model; only the inverted rankings
# (document -> rank position) are needed for the observation lookup below.
_, train_inv_rankings = rnk.many_models_data_split_rank_and_invert(
    pretrain_models, data.train)
print('Finished ranking.')

# Position-based observation probabilities with eta = 1, one entry per
# possible rank, zeroed beyond the cutoff rank when a cutoff is given.
obs_prob = clk.inverse_rank_prob(
    np.arange(data.train.max_query_size(), dtype=np.float64), 1.)
rel_prob_f = clk.get_relevance_click_model(click_model)
if cutoff:
    obs_prob[cutoff:] = 0.

# Expected CTR per model: P(click) = P(observed at rank) * P(relevant),
# summed over all documents and averaged over queries.
train_rel_prob = rel_prob_f(data.train.label_vector)
ranker_obs_prob = obs_prob[train_inv_rankings]
per_query_norm = float(data.train.num_queries())
CTR_per_ranker = np.sum(train_rel_prob[None, :] * ranker_obs_prob,
                        axis=1, dtype=np.float64) / per_query_norm

print(CTR_per_ranker)

print('Writing results to %s' % args.output_path)
with open(args.output_path, 'w') as f:
    json.dump(list(CTR_per_ranker), f)
# Example #2
# Select the two models that make up the requested ranker pair
# (pair k covers model rows 2k-2 and 2k-1).
pair_start = (args.ranker_pair - 1) * 2
chosen_models = np.array([pair_start, pair_start + 1])
pretrain_models = pretrain_models[chosen_models, :]
n_models = pretrain_models.shape[0]

(test_rankings,
 test_inv_rankings) = rnk.many_models_data_split_rank_and_invert(
    pretrain_models, data.test)

rel_prob_f = clk.get_relevance_click_model(click_model)
# Observation probabilities long enough for the largest query in any split,
# zeroed beyond the cutoff rank when a cutoff is given.
longest_query = max(data.train.max_query_size(),
                    data.validation.max_query_size(),
                    data.test.max_query_size())
obs_prob = clk.inverse_rank_prob(
    np.arange(longest_query, dtype=np.float64), eta)
if cutoff:
  obs_prob[cutoff:] = 0.

# Expected CTR of each selected model on the test split: mean click
# probability per document, rescaled to an average per query.
test_rel_prob = rel_prob_f(data.test.label_vector)
ranker_test_ctr = np.zeros(n_models, dtype=np.float64)
test_scale = data.test.num_docs()/float(data.test.num_queries())
for i in range(n_models):
  ranker_test_ctr[i] = np.mean(obs_prob[test_inv_rankings[i]]*test_rel_prob)
  ranker_test_ctr[i] *= test_scale

(models_train_rankings,
 models_train_inv_rankings) = rnk.many_models_data_split_rank_and_invert(
                                        pretrain_models,
                    default=0.00)
args = parser.parse_args()
num_select_queries = args.num_queries

# Resolve the dataset from its JSON info file and take the first fold.
data = dataset.get_dataset_from_json_info(
    args.dataset,
    args.dataset_info_path,
)
data = data.get_data_folds()[0]

# Read the fold into memory and report how long it took.
start = time.time()
data.read_data()
print('Time past for reading data: %d seconds' % (time.time() - start))

# Click-model globals read by calc_true_loss below: relevance click
# probabilities, and position-based observation probabilities zeroed
# beyond the cutoff rank when a cutoff is given.
rel_prob_f = clk.get_relevance_click_model(args.click_model)
obs_prob = clk.inverse_rank_prob(
    np.arange(data.validation.max_query_size(), dtype=np.float64), args.eta)
if args.cutoff:
    obs_prob[args.cutoff:] = 0

def calc_true_loss(ranking_model, data_split):
    """Return the negative expected CTR of `ranking_model` on `data_split`.

    Scores every document with the model, ranks per query, and combines
    the position-based observation probabilities (module global
    `obs_prob`) with the relevance click probabilities (module global
    `rel_prob_f`).  Negated so that it can be minimized as a loss.
    """
    scores = np.dot(data_split.feature_matrix, ranking_model)
    _, inverted = rnk.data_split_rank_and_invert(scores, data_split)
    # P(click) = P(observed at rank) * P(relevant), per document.
    click_probs = obs_prob[inverted] * rel_prob_f(data_split.label_vector)
    # Mean over documents, rescaled to an average per query.
    per_query = np.mean(click_probs, dtype=np.float64)
    per_query *= data_split.num_docs() / float(data_split.num_queries())
    return -per_query