Example #1
import shutil

from matchzoo import load_data_pack


def test_save_load(data_pack):
    dirpath = '.tmpdir'
    data_pack.save(dirpath)
    dp = load_data_pack(dirpath)
    # Both the original pack and the reloaded copy keep their two rows.
    assert len(data_pack) == 2
    assert len(dp) == 2
    shutil.rmtree(dirpath)
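
The test assumes a `data_pack` pytest fixture with two relation rows. A minimal sketch of such a fixture, built with `mz.pack` from a DataFrame (the column values here are made up):

import pandas as pd
import pytest

import matchzoo as mz


@pytest.fixture
def data_pack():
    # Two text pairs, matching the `len(...) == 2` assertions above.
    df = pd.DataFrame({
        'text_left': ['how are you', 'hello world'],
        'text_right': ['fine, thanks', 'hi there'],
        'label': [1, 0],
    })
    return mz.pack(df)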
Example #2
import shutil

import pytest

from matchzoo import load_data_pack


def test_save_load(data_pack):
    dirpath = '.tmpdir'
    data_pack.save(dirpath)
    dp = load_data_pack(dirpath)
    # Saving into an existing directory must fail rather than overwrite.
    with pytest.raises(FileExistsError):
        data_pack.save(dirpath)
    assert len(data_pack) == 2
    assert len(dp) == 2
    shutil.rmtree(dirpath)
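
Example #2 pins down that `DataPack.save` refuses to overwrite an existing directory. A hypothetical helper (the name `save_overwrite` is ours) for callers that do want to re-save:

import shutil
from pathlib import Path


def save_overwrite(pack, dirpath):
    # DataPack.save raises FileExistsError on an existing directory,
    # so clear any previous dump before writing the new one.
    if Path(dirpath).exists():
        shutil.rmtree(dirpath)
    pack.save(dirpath)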
Example #3
    def __init__(self,
                 dataset="robust04",
                 rerank_hits=1000,
                 shuffle=False,
                 debug_mode=False):
        self.dataset = dataset
        self.dataset_path = DATA_FOLDER.joinpath(dataset)
        # Load the training pack and the top-k candidate pack to re-rank.
        self.pack = mz.load_data_pack(self.dataset_path.joinpath("train"))
        self.rerank_pack = mz.load_data_pack(
            self.dataset_path.joinpath("rerank.%d" % rerank_hits))
        self.topic_path = self.dataset_path.joinpath("topics")
        self.qrel_path = self.dataset_path.joinpath("qrels")
        self.debug_mode = debug_mode
        if shuffle:
            self.pack = self.pack.shuffle()
            self.rerank_pack = self.rerank_pack.shuffle()
        # Debug mode works on small shuffled slices to keep runs fast.
        if debug_mode:
            self.pack = self.pack.shuffle()[:1000]
            self.rerank_pack = self.rerank_pack.shuffle()[:500]

        self.rerank_pack_processed = None
        self.pack_processed = None
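
The constructor above comes from an unnamed experiment class. A hypothetical instantiation (the class name `RobustExperiment` is ours; the directory layout follows the joinpath calls above):

# Expects DATA_FOLDER/robust04/{train, rerank.1000, topics, qrels}.
exp = RobustExperiment(dataset="robust04", rerank_hits=1000,
                       shuffle=True, debug_mode=True)
# With debug_mode on, the packs are cut to 1000 train / 500 rerank rows.
print(len(exp.pack), len(exp.rerank_pack))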
Example #4

def write_predictions_in_test_score_file(test_set_predictions,
                                         item_query_ids,
                                         best=False):
    # Map each prediction row back to its query id; score[0] is the
    # single relevance score for that item.
    result = {}
    for idx, score in enumerate(test_set_predictions):
        result[item_query_ids[idx]] = score[0]
    post_fix = "_last_model"
    if best:
        post_fix = "_best_model"
    Utils().write_dict_to_csv_with_a_row_for_each_key(
        result, "features" + post_fix + ".csv")


item_query_ids = generate_test_sample_index()

cikm_train_data = mz.load_data_pack(TRAIN_FILE)
cikm_test_data = mz.load_data_pack(TEST_FILE)
cikm_validation_data = mz.load_data_pack(VALID_FILE)
cikm_train_data.shuffle(inplace=True)

preprocessor = mz.preprocessors.CDSSMPreprocessor(fixed_length_left=20,
                                                  fixed_length_right=20)
preprocessor.fit(cikm_train_data)

print(preprocessor.context)
train_processed = preprocessor.transform(cikm_train_data)
test_processed = preprocessor.transform(cikm_test_data)
validation_processed = preprocessor.transform(cikm_validation_data)

print(
    "------------------------ data transformation done ------------------------")
Example #5
model.params['task'] = ranking_task
model.params['mask_value'] = -1
model.params['embedding_output_dim'] = glove_embedding.output_dim
model.params['mlp_num_layers'] = 2
model.params['mlp_num_units'] = 5
model.params['mlp_num_fan_out'] = 1
model.params['mlp_activation_func'] = 'tanh'
model.params['optimizer'] = 'adadelta'
model.build()
model.compile()
model.backend.summary()

print("model params set")

train_pack_processed = mz.load_data_pack(path + "robust_train_fold_" + fold)

dev_pack_processed = mz.load_data_pack(path + "robust_dev_fold_" + fold)

print("datapacks OK")

embedding_matrix = glove_embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index'])
# Normalize the word embeddings for fast histogram generation.
l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]

model.load_embedding_matrix(embedding_matrix)

print("embedding matrix loaded")