def test_save_load(data_pack):
    """Round-trip a DataPack through ``save``/``load_data_pack``.

    Saves the fixture pack to a temp directory, reloads it, and checks
    both the original and the reloaded pack still hold 2 entries.

    :param data_pack: pytest fixture providing a 2-entry DataPack.
    """
    dirpath = '.tmpdir'
    data_pack.save(dirpath)
    try:
        dp = load_data_pack(dirpath)
        assert len(data_pack) == 2
        assert len(dp) == 2
    finally:
        # Clean up even when an assertion fails; otherwise the leftover
        # '.tmpdir' makes the next run's save() blow up on an existing dir.
        shutil.rmtree(dirpath)
def test_save_load(data_pack):
    """Round-trip a DataPack and verify double-save is rejected.

    Saves the fixture pack, reloads it, asserts that saving again to the
    same directory raises ``FileExistsError``, and checks both packs
    still hold 2 entries.

    :param data_pack: pytest fixture providing a 2-entry DataPack.
    """
    dirpath = '.tmpdir'
    data_pack.save(dirpath)
    try:
        dp = load_data_pack(dirpath)
        # Saving onto an existing directory must fail loudly.
        with pytest.raises(FileExistsError):
            data_pack.save(dirpath)
        assert len(data_pack) == 2
        assert len(dp) == 2
    finally:
        # Clean up even when an assertion fails; otherwise the leftover
        # '.tmpdir' makes the next run's save() blow up on an existing dir.
        shutil.rmtree(dirpath)
def __init__(self, dataset="robust04", rerank_hits=1000, shuffle=False, debug_mode=False):
    """Load the train and rerank DataPacks for *dataset*.

    :param dataset: dataset folder name under ``DATA_FOLDER``.
    :param rerank_hits: selects the ``rerank.<n>`` pack to load.
    :param shuffle: shuffle both packs after loading.
    :param debug_mode: shuffle and truncate the packs to small samples
        (1000 train / 500 rerank rows) for quick experiments.
    """
    self.dataset = dataset
    root = DATA_FOLDER.joinpath(dataset)
    self.dataset_path = root
    self.topic_path = root.joinpath("topics")
    self.qrel_path = root.joinpath("qrels")
    self.debug_mode = debug_mode
    self.pack = mz.load_data_pack(root.joinpath("train"))
    self.rerank_pack = mz.load_data_pack(
        root.joinpath("rerank.%d" % rerank_hits))
    if shuffle:
        self.pack = self.pack.shuffle()
        self.rerank_pack = self.rerank_pack.shuffle()
    if debug_mode:
        self.pack = self.pack.shuffle()[:1000]
        self.rerank_pack = self.rerank_pack.shuffle()[:500]
    # Filled in later by a preprocessing step.
    self.pack_processed = None
    self.rerank_pack_processed = None
def write_predictions_in_test_score_file(test_set_predictions, item_query_ids, best=False):
    """Map each prediction score to its item/query id and dump to a CSV.

    :param test_set_predictions: per-sample prediction scores; each entry is
        indexable, and only element 0 is kept — presumably shape (1,) per
        sample, TODO confirm.
    :param item_query_ids: ids aligned positionally with the predictions.
    :param best: if True write ``features_best_model.csv``,
        otherwise ``features_last_model.csv``.
    """
    result = {}
    # NOTE(review): loop variable `id` shadows the builtin of the same name.
    for id, score in enumerate(test_set_predictions):
        result[item_query_ids[id]] = score[0]
    post_fix = "_last_model"
    if best:
        post_fix = "_best_model"
    Utils().write_dict_to_csv_with_a_row_for_each_key(
        result, "features" + post_fix + ".csv")


# --- Script: load the CIKM datapacks, fit a CDSSM preprocessor on the
# --- training split, and transform all three splits.
item_query_ids = generate_test_sample_index()
cikm_train_data = mz.load_data_pack(TRAIN_FILE)
cikm_test_data = mz.load_data_pack(TEST_FILE)
cikm_validation_data = mz.load_data_pack(VALID_FILE)
# Shuffle training data in place before fitting the preprocessor.
cikm_train_data.shuffle(inplace=True)
preprocessor = mz.preprocessors.CDSSMPreprocessor(fixed_length_left=20,
                                                  fixed_length_right=20)
preprocessor.fit(cikm_train_data)
print(preprocessor.context)
train_processed = preprocessor.transform(cikm_train_data)
test_processed = preprocessor.transform(cikm_test_data)
validation_processed = preprocessor.transform(cikm_validation_data)
# NOTE(review): this call is left open on purpose — its closing paren is on
# a continuation line beyond this span.
print(
    "------------------------ data transformation done ------------------------"
]]
# --- Script: configure and build the model, load the per-fold datapacks,
# --- and install an L2-normalized GloVe embedding matrix.
model.params['task'] = ranking_task
model.params['mask_value'] = -1
model.params['embedding_output_dim'] = glove_embedding.output_dim
model.params['mlp_num_layers'] = 2
model.params['mlp_num_units'] = 5
model.params['mlp_num_fan_out'] = 1
model.params['mlp_activation_func'] = 'tanh'
model.params['optimizer'] = 'adadelta'
model.build()
model.compile()
model.backend.summary()
print("model params set")
# Load the preprocessed train/dev packs for this cross-validation fold.
train_pack_processed = mz.load_data_pack(path + "robust_train_fold_" + fold)
dev_pack_processed = mz.load_data_pack(path + "robust_dev_fold_" + fold)
print("datapacks OK")
embedding_matrix = glove_embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index'])
# L2-normalize each embedding row (per the original note: for fast
# histogram generation).
l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
model.load_embedding_matrix(embedding_matrix)
print("embedding matrix loaded")