def __init__(self, dataset_id):
    self.__dataset_id = dataset_id
    data_spec = specification_handler.get_specification(self.SPECIFACATION_SUBDIRECTORY, dataset_id)
    self.file_path_all_data = data_spec["file_path"]
    self.encoding = data_spec["encoding"]
    self.n_documents_in_subset = data_spec["n_documents_in_subset"]
    # Draw a reproducible subset of document indices from the full dataset.
    self.subset_indices = self.generate_subset_indices(data_spec["n_documents_in_set"],
                                                       self.n_documents_in_subset,
                                                       data_spec["seed"])
    self.dataset_files_directory = data_spec["directory_path"]
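
# Illustrative sketch, not the repository's actual implementation: the
# generate_subset_indices call above could be backed by a seeded sampler
# along these lines (the standalone name and signature are assumptions).
import random

def generate_subset_indices_sketch(n_documents_in_set, n_documents_in_subset, seed):
    # Sample document indices without replacement, reproducibly via the seed.
    rng = random.Random(seed)
    return sorted(rng.sample(range(n_documents_in_set), n_documents_in_subset))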
def prepare_keywords(experiment_spec, keyword_cache_directory, index_cache_directory):
    keyword_spec = experiment_spec["keywords"]
    keyword_setup_id = keyword_spec["setup_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id, experiment_spec["training_dataset"])

    # Manually curated keyword setups are read from their specification and cached as-is.
    keyword_method = keyword_spec["keyword_generate_algorithm"]
    if keyword_method == "manual":
        keywords = specification_handler.get_specification("keyword_setups", keyword_setup_id)
        cache.write(keyword_cache_directory, keyword_id, keywords)
        print("Manual keywords now stored in cache")
        return

    keyword_seed_id = keyword_spec["seed_id"]
    if cache.in_cache(keyword_cache_directory, keyword_id):
        print("Keywords already stored in cache: " + keyword_id)
        keywords = cache.load(keyword_cache_directory, keyword_id)
        keyword_factory.print_keyword_setup_to_json(keyword_id, keywords)
        return

    create_new_keywords_spec = {}

    # Note: the index fields below are written into experiment_spec["training_dataset"] in place.
    training_data_spec = experiment_spec["training_dataset"]
    create_new_keywords_spec["seed_id"] = keyword_seed_id
    create_new_keywords_spec["training_dataset"] = training_data_spec
    create_new_keywords_spec["training_dataset"]["index_directory"] = index_cache_directory
    create_new_keywords_spec["training_dataset"]["index_id"] = get_all_index_indices(training_data_spec)

    # Resolve the configured seed words to index terms before keyword generation.
    given_reference_words = specification_handler.get_specification(__KEYWORD_SEEDS_DIRECTORY_SPECIFICATION, keyword_seed_id)
    given_reference_words = seed_words_to_index_terms(given_reference_words)

    create_new_keywords_spec["given_reference_words"] = given_reference_words
    create_new_keywords_spec["keyword_generate_algorithm"] = keyword_spec["keyword_generate_algorithm"]
    create_new_keywords_spec["parameters"] = keyword_spec["parameters"]
    create_new_keywords_spec["reference_word_filter"] = keyword_spec["reference_word_filter"]
    keywords = keyword_factory.get_keywords(create_new_keywords_spec, keyword_id)
    cache.write(keyword_cache_directory, keyword_id, keywords)
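
# Illustrative usage of prepare_keywords; every field value below is an
# assumption for demonstration, not taken from the repository. The keys match
# those read above from experiment_spec["keywords"] and experiment_spec["training_dataset"].
example_experiment_spec = {
    "training_dataset": {"id": "example_training_set"},
    "keywords": {
        "keyword_generate_algorithm": "manual",  # "manual" short-circuits to the setup specification
        "setup_id": "example_setup",
        "seed_id": "example_seed",
        "parameters": {},
        "reference_word_filter": {},
    },
}
# prepare_keywords(example_experiment_spec, "cache/keywords", "cache/indices")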
def do_evaluation(experiment_spec, evaluation_scale):
    experiment_id = experiment_spec["id"]
    test_dataset_id = dataset_id_handler.get_test_data_id(experiment_spec)
    categorization = cache.load(__CATEGORIZATIONS_CACHE, experiment_id)
    gold_standard_categorization = cache.load(__GOLD_STANDARD_CATEGORIZATION_CACHE, test_dataset_id)
    category_hierarchy = specification_handler.get_specification(__CATEGORY_HIEARACHY_SPECIFICATION, test_dataset_id)

    # Restrict the evaluation to the categories selected by the test filters.
    all_categories = list(gold_standard_categorization.keys())
    test_data_filter_spec = experiment_spec["test_dataset"]["test_category_filters"]
    test_categories = test_data_filters.get_test_categories(test_data_filter_spec, all_categories)
    print("Evaluating " + str(len(test_categories)) + " test categories: " + str(test_categories))

    evaluation = evaluater.get_evaluation(
        test_categories, categorization, gold_standard_categorization, category_hierarchy, evaluation_scale
    )
    print_to_json(evaluation)
    # Export per-level evaluation results and precision/recall analyses as CSV.
    evaluation_levels = evaluater.get_evaluation_levels(evaluation_scale)
    result_analyser.print_evaluation_to_csv(experiment_id, evaluation, evaluation_levels)
    pr_matrix = result_analyser.calculate_precision_recall_matrix(evaluation, evaluation_levels)
    result_analyser.print_precision_recall_matrix_csv(experiment_id, pr_matrix, evaluation_levels)
    best_pr_matches = result_analyser.calculate_p_r_match_for_categories(evaluation, evaluation_levels)
    result_analyser.print_p_r_match_to_csv(experiment_id, best_pr_matches)
    general_recall = result_analyser.calculate_general_recall_at_precissions_levels(evaluation, evaluation_levels)
    result_analyser.print_general_recall_at_precissions_levels(experiment_id, general_recall, evaluation_levels)

    # Load the seed words and report how many documents contain them.
    keyword_spec = experiment_spec["keywords"]
    keyword_seed_id = keyword_spec["seed_id"]
    seed_words = specification_handler.get_specification(__KEYWORD_SEEDS_DIRECTORY_SPECIFICATION, keyword_seed_id)
    seed_words = seed_words_to_index_terms(seed_words)
    index_indices = get_all_index_indices(experiment_spec["training_dataset"])
    posting_lists_id = keyword_setup_id_generator.get_no_filtered_keywords_id(
        keyword_seed_id, experiment_spec["training_dataset"]
    )
    min_doc_seed, max_doc_seed = result_analyser.get_n_documents_with_given_seed(
        seed_words, posting_lists_id, __INDEX_DIRECTORY_CACHE, index_indices
    )
    result_analyser.print_n_documents_with_seed(experiment_id, min_doc_seed, max_doc_seed)
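
# The evaluater and result_analyser internals are not shown here; as a rough
# sketch of the underlying measure, per-category precision and recall over
# predicted vs. gold-standard document sets could be computed like this
# (function name and signature are assumptions).
def precision_recall_sketch(predicted_docs, gold_docs):
    predicted, gold = set(predicted_docs), set(gold_docs)
    true_positives = len(predicted & gold)
    precision = true_positives / len(predicted) if predicted else 0.0
    recall = true_positives / len(gold) if gold else 0.0
    return precision, recall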
def run_experiment(experiment_id, evaluation_scale):
    # Full pipeline: prepare resources, categorize, then evaluate.
    experiment_directory = "experiments"
    experiment_spec = specification_handler.get_specification(experiment_directory, experiment_id)
    prepare_experiment_resources(experiment_spec)
    do_categorization(experiment_spec)
    do_evaluation(experiment_spec, evaluation_scale)
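
# Example invocation (the experiment id and scale value are illustrative
# assumptions, not values from the repository):
# run_experiment("example_experiment", evaluation_scale=10)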
def __init__(self, dataset_id):
    self.dataset_id = dataset_id
    data_spec = specification_handler.get_specification(self.SPECIFACATION_SUBDIRECTORY, dataset_id)
    self.data_path = data_spec["directory_path"]
    self.encoding = data_spec["encoding"]
def __load_manual_created_keyword_setup(keyword_setup_id):
    keywords = specification_handler.get_specification("keyword_setups", keyword_setup_id)
    return keywords
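
# Illustrative shape of a manual keyword setup as returned above; the mapping
# of category name to keyword list is an assumption for demonstration, not the
# repository's actual specification format.
example_keyword_setup = {
    "sports": ["football", "league", "tournament"],
    "economy": ["inflation", "market", "stocks"],
}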