def __init__(self, dataset_id):
    self.__dataset_id = dataset_id
    data_spec = specification_handler.get_specification(self.SPECIFACATION_SUBDIRECTORY, dataset_id)
    self.file_path_all_data = data_spec["file_path"]
    self.encoding = data_spec["encoding"]
    self.n_documents_in_subset = data_spec["n_documents_in_subset"]
    # Deterministically sample a subset of document indices from the full set.
    self.subset_indices = self.generate_subset_indices(
        data_spec["n_documents_in_set"],
        self.n_documents_in_subset,
        data_spec["seed"],
    )
    self.dataset_files_directory = data_spec["directory_path"]
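# generate_subset_indices is defined on the class but not shown in this
# section. A minimal sketch of what it could look like, written here as a
# free function and assuming it draws a reproducible random sample of
# document indices (only the signature and seeding are implied by the call
# above; the body is an assumption):
import random

def generate_subset_indices(n_documents_in_set, n_documents_in_subset, seed):
    # Seed a local RNG so the same subset is drawn on every run.
    rng = random.Random(seed)
    return sorted(rng.sample(range(n_documents_in_set), n_documents_in_subset))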
def prepare_keywords(experiment_spec, keyword_cache_directory, index_cache_directory):
    keyword_spec = experiment_spec["keywords"]
    keyword_method = keyword_spec["keyword_generate_algorithm"]

    # Manually specified keywords are read straight from their specification
    # file and cached; no generation step is needed.
    if keyword_method == "manual":
        keywords = specification_handler.get_specification("keyword_setups", keyword_spec["setup_id"])
        keyword_setup_id = keyword_spec["setup_id"]
        keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id, experiment_spec["training_dataset"])
        cache.write(keyword_cache_directory, keyword_id, keywords)
        print("Manual keywords now stored in cache")
        return

    keyword_setup_id = keyword_spec["setup_id"]
    keyword_seed_id = keyword_spec["seed_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id, experiment_spec["training_dataset"])

    # Reuse previously generated keywords if they are already cached.
    if cache.in_cache(keyword_cache_directory, keyword_id):
        print("Keywords stored in cache: " + keyword_id)
        keywords = cache.load(keyword_cache_directory, keyword_id)
        keyword_factory.print_keyword_setup_to_json(keyword_id, keywords)
        return

    # Otherwise, build a specification for generating a new keyword setup.
    create_new_keywords_spec = {}
    # Copy the training-dataset spec so the caller's experiment_spec is not mutated.
    training_data_spec = dict(experiment_spec["training_dataset"])
    create_new_keywords_spec["seed_id"] = keyword_spec["seed_id"]
    create_new_keywords_spec["training_dataset"] = training_data_spec
    create_new_keywords_spec["training_dataset"]["index_directory"] = index_cache_directory
    create_new_keywords_spec["training_dataset"]["index_id"] = get_all_index_indices(training_data_spec)
    given_reference_words = specification_handler.get_specification(__KEYWORD_SEEDS_DIRECTORY_SPECIFICATION, keyword_seed_id)
    given_reference_words = seed_words_to_index_terms(given_reference_words)
    create_new_keywords_spec["given_reference_words"] = given_reference_words
    create_new_keywords_spec["keyword_generate_algorithm"] = keyword_spec["keyword_generate_algorithm"]
    create_new_keywords_spec["parameters"] = keyword_spec["parameters"]
    create_new_keywords_spec["reference_word_filter"] = keyword_spec["reference_word_filter"]
    keywords = keyword_factory.get_keywords(create_new_keywords_spec, keyword_id)
    cache.write(keyword_cache_directory, keyword_id, keywords)
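# prepare_keywords leans on the cache helper to memoise keyword setups. That
# module is not part of this section; a plausible minimal version, assuming
# each id maps to a JSON file in the cache directory (the file layout and
# ".json" suffix are assumptions):
import json
import os

def in_cache(cache_directory, item_id):
    # An id counts as cached when its JSON file exists on disk.
    return os.path.isfile(os.path.join(cache_directory, item_id + ".json"))

def load(cache_directory, item_id):
    with open(os.path.join(cache_directory, item_id + ".json"), encoding="utf-8") as f:
        return json.load(f)

def write(cache_directory, item_id, data):
    os.makedirs(cache_directory, exist_ok=True)
    with open(os.path.join(cache_directory, item_id + ".json"), "w", encoding="utf-8") as f:
        json.dump(data, f)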
def do_evaluation(experiment_spec, evaluation_scale):
    experiment_id = experiment_spec["id"]
    test_dataset_id = dataset_id_handler.get_test_data_id(experiment_spec)
    categorization = cache.load(__CATEGORIZATIONS_CACHE, experiment_id)
    gold_standard_categorization = cache.load(__GOLD_STANDARD_CATEGORIZATION_CACHE, test_dataset_id)
    category_hierarchy = specification_handler.get_specification(__CATEGORY_HIEARACHY_SPECIFICATION, test_dataset_id)

    # Restrict evaluation to the categories selected by the test filters.
    all_categories = list(gold_standard_categorization.keys())
    test_data_filter_spec = experiment_spec["test_dataset"]["test_category_filters"]
    test_categories = test_data_filters.get_test_categories(test_data_filter_spec, all_categories)
    print("Evaluating", len(test_categories), "test categories:", test_categories)

    evaluation = evaluater.get_evaluation(
        test_categories,
        categorization,
        gold_standard_categorization,
        category_hierarchy,
        evaluation_scale,
    )
    print_to_json(evaluation)

    # Write the evaluation and its derived precision/recall analyses to CSV.
    evaluation_levels = evaluater.get_evaluation_levels(evaluation_scale)
    result_analyser.print_evaluation_to_csv(experiment_id, evaluation, evaluation_levels)
    pr_matrix = result_analyser.calculate_precision_recall_matrix(evaluation, evaluation_levels)
    result_analyser.print_precision_recall_matrix_csv(experiment_id, pr_matrix, evaluation_levels)
    best_pr_matches = result_analyser.calculate_p_r_match_for_categories(evaluation, evaluation_levels)
    result_analyser.print_p_r_match_to_csv(experiment_id, best_pr_matches)
    general_recall = result_analyser.calculate_general_recall_at_precissions_levels(evaluation, evaluation_levels)
    result_analyser.print_general_recall_at_precissions_levels(experiment_id, general_recall, evaluation_levels)

    # Load the seed words and report how many documents they alone retrieve.
    keyword_spec = experiment_spec["keywords"]
    keyword_seed_id = keyword_spec["seed_id"]
    seed_words = specification_handler.get_specification(__KEYWORD_SEEDS_DIRECTORY_SPECIFICATION, keyword_seed_id)
    seed_words = seed_words_to_index_terms(seed_words)
    index_indices = get_all_index_indices(experiment_spec["training_dataset"])
    posting_lists_id = keyword_setup_id_generator.get_no_filtered_keywords_id(
        keyword_seed_id, experiment_spec["training_dataset"]
    )
    min_doc_seed, max_doc_seed = result_analyser.get_n_documents_with_given_seed(
        seed_words, posting_lists_id, __INDEX_DIRECTORY_CACHE, index_indices
    )
    result_analyser.print_n_documents_with_seed(experiment_id, min_doc_seed, max_doc_seed)
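# The evaluater and result_analyser internals are out of scope here. As a
# rough illustration of the per-category scores this evaluation step is built
# on, a sketch of precision and recall against a gold standard (the function
# below is hypothetical, not part of result_analyser):
def precision_recall(predicted_docs, gold_docs):
    # Both arguments are collections of document ids for a single category.
    predicted, gold = set(predicted_docs), set(gold_docs)
    true_positives = len(predicted & gold)
    precision = true_positives / len(predicted) if predicted else 0.0
    recall = true_positives / len(gold) if gold else 0.0
    return precision, recall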
def run_experiment(experiment_id, evaluation_scale):
    experiment_directory = "experiments"
    experiment_spec = specification_handler.get_specification(experiment_directory, experiment_id)
    prepare_experiment_resources(experiment_spec)
    do_categorization(experiment_spec)
    do_evaluation(experiment_spec, evaluation_scale)
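# A typical entry point, assuming the module is run as a script (the
# experiment id and evaluation scale below are example values, not taken from
# the repository):
if __name__ == "__main__":
    # Run one experiment end to end: prepare resources, categorize, evaluate.
    run_experiment("experiment_01", evaluation_scale=10)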
def __init__(self, dataset_id):
    self.dataset_id = dataset_id
    data_spec = specification_handler.get_specification(self.SPECIFACATION_SUBDIRECTORY, dataset_id)
    self.data_path = data_spec["directory_path"]
    self.encoding = data_spec["encoding"]
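# For reference, a dataset specification shaped like the following would
# satisfy the two constructors above (the field names mirror the keys they
# read; the values are made up):
example_data_spec = {
    "file_path": "data/corpus/all_documents.txt",
    "directory_path": "data/corpus/",
    "encoding": "utf-8",
    "n_documents_in_set": 20000,
    "n_documents_in_subset": 5000,
    "seed": 42,
}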
def __load_manual_created_keyword_setup(keyword_setup_id):
    keywords = specification_handler.get_specification("keyword_setups", keyword_setup_id)
    return keywords
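# specification_handler.get_specification is used throughout this module but
# defined elsewhere. A minimal sketch, assuming specifications are JSON files
# named after their id inside a subdirectory (the base directory and ".json"
# extension are assumptions):
import json
import os

SPECIFICATIONS_BASE_DIRECTORY = "specifications"  # assumed location

def get_specification(subdirectory, specification_id):
    # Load <base>/<subdirectory>/<specification_id>.json into a dict.
    path = os.path.join(SPECIFICATIONS_BASE_DIRECTORY, subdirectory,
                        specification_id + ".json")
    with open(path, encoding="utf-8") as f:
        return json.load(f)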