Пример #1
0
def do_one_fold(
    fold_index: int,
    test_key_set: Set[Constant],
    # fd: FoldData
):
    """Build, prune, convert and evaluate a tree for one cross-validation fold.

    The examples are split into a training collection and a test set, a tree
    is built on the training collection and pruned, the tree is converted to
    a program, and that program is used to classify the test set. Along the
    way a ``.tree``, ``.program`` and ``.statistics`` file are written for
    this fold, and the fold's node counts, execution time and accuracy are
    appended to the corresponding lists on ``fd``.

    NOTE(review): the ``fd`` parameter is commented out in the signature, so
    ``fd`` is resolved as a free name from an enclosing/module scope --
    confirm that such a name exists where this function lives.

    :param fold_index: zero-based fold index; shown to the user as
        ``fold_index + 1`` and used unchanged in the output file names.
    :param test_key_set: handed to
        ``split_examples_into_training_and_test_sets`` together with
        ``fd.all_key_sets`` to determine the split.
    :return: ``None``; results are reported through ``fd``, stdout and files.
    """
    # Human-readable (one-based) fold number, reused in every progress line.
    fold_nb = str(fold_index + 1)

    print('\n===========================')
    print('=== start FOLD ' + fold_nb + ' of ' + str(fd.nb_folds))
    print('===========================')

    split = split_examples_into_training_and_test_sets(
        fd.all_key_sets, test_key_set,
        fd.examples_collection_usable_for_training,
        fd.examples_usable_for_testing)
    training_example_collection, test_examples = split

    print('\ttotal nb of labeled examples: '
          + str(fd.total_nb_of_labeled_examples))
    nb_of_training_ex = len(training_example_collection.example_wrappers_sp)
    nb_of_test_ex = len(test_examples)
    print('\tnb of TRAINING ex: ' + str(nb_of_training_ex))
    print('\tnb of TEST ex: ' + str(nb_of_test_ex))

    # The timed section covers tree building up to and including
    # classification of the test set.
    start_time = time.time()

    # ---- train the model on the training part of this fold ----
    print('\t=== start building tree for fold ' + fold_nb)

    unpruned_tree = build_tree(
        fd.internal_ex_format,
        fd.treebuilder_type,
        fd.parsed_settings.language,
        fd.possible_labels,
        training_example_collection,
        prediction_goal=fd.prediction_goal,
        full_background_knowledge_sp=fd.full_background_knowledge_sp,
        debug_printing_tree_building=fd.debug_printing_tree_building,
        engine=fd.engine)
    tree = prune_tree(
        unpruned_tree,
        debug_printing_tree_pruning=fd.debug_printing_tree_pruning)

    nb_of_nodes = tree.get_nb_of_nodes()
    nb_inner_nodes = tree.get_nb_of_inner_nodes()
    fd.total_nb_of_nodes_per_fold.append(nb_of_nodes)
    fd.nb_of_inner_node_per_fold.append(nb_inner_nodes)

    # All output files of this fold share the same stem; note that the file
    # names use the zero-based index, unlike the progress messages.
    fold_file_stem = (fd.dir_output_files + fd.fname_prefix_fold + '_fold'
                      + str(fold_index))
    write_out_tree(fold_file_stem + ".tree", tree)

    print('\t=== end building tree for fold ' + fold_nb)

    # ---- convert the pruned tree into a program ----
    print('\t=== start converting tree to program for fold ' + fold_nb)
    program = convert_tree_to_program(
        fd.kb_format,
        fd.treebuilder_type,
        tree,
        fd.parsed_settings.language,
        debug_printing=fd.debug_printing_program_conversion,
        prediction_goal=fd.prediction_goal,
        index_of_label_var=fd.index_of_label_var)
    write_out_program(fold_file_stem + ".program", program)
    print('\t=== end converting tree to program for fold ' + fold_nb)

    # ---- evaluate the model on the test part of this fold ----
    print('\t=== start classifying test set' + fold_nb)
    classifier = get_keys_classifier(
        fd.internal_ex_format,
        program,
        fd.prediction_goal,
        fd.index_of_label_var,
        fd.stripped_background_knowledge,
        debug_printing=fd.debug_printing_get_classifier,
        engine=fd.engine)

    statistics_handler = do_labeled_examples_get_correctly_classified(
        classifier, test_examples, fd.possible_labels,
        fd.debug_printing_classification
    )  # type: ClassificationStatisticsHandler

    # Elapsed wall-clock time of this fold, in seconds.
    elapsed_time = time.time() - start_time
    fd.execution_time_per_fold.append(elapsed_time)

    accuracy, _ = statistics_handler.get_accuracy()
    fd.accuracies_folds.append(accuracy)

    statistics_fname = fold_file_stem + ".statistics"
    statistics_handler.write_out_statistics_to_file(statistics_fname)

    # The node-count lines are identical in the file and on stdout.
    nodes_line = "total nb of nodes: " + str(nb_of_nodes)
    inner_nodes_line = "nb of internal nodes: " + str(nb_inner_nodes)

    # Append the fold summary below what the statistics handler just wrote.
    with open(statistics_fname, 'a') as f:
        f.writelines([
            '\n\nnb of TRAINING ex: ' + str(nb_of_training_ex) + "\n",
            'nb of TEST ex: ' + str(nb_of_test_ex) + "\n\n",
            nodes_line + "\n",
            inner_nodes_line + "\n\n",
            "execution time of fold: " + str(elapsed_time) + " seconds\n",
        ])
    print(nodes_line)
    print(inner_nodes_line)
    print("execution time of fold: ", elapsed_time, "seconds")

    print('\t=== end classifying test set' + fold_nb)
    print('\t=== end FOLD ' + fold_nb + ' of ' + str(fd.nb_folds) + '\n')
Пример #2
0
def run_program(settings: ProgramSettings):
    """Learn a pruned tree from the files named by ``settings`` and convert it to a program.

    The labeled-examples, settings and background-knowledge file names are
    derived from ``settings.filename_prefix`` plus a module-level suffix. The
    settings file is parsed with the parser matching ``settings.kb_format``,
    the examples are preprocessed according to that format, and a tree is
    built, pruned and converted to a program.

    As written, the converted program is only bound to a local name and the
    function returns ``None``.

    :param settings: run configuration; the attributes read here are
        ``filename_prefix``, ``debug_parsing``, ``kb_format``,
        ``internal_examples_format`` and ``treebuilder_type``.
    :raises NotImplementedError: if ``settings.kb_format`` is ``None``.
    :raises KnowledgeBaseFormatException: if ``settings.kb_format`` is
        neither ``KnowledgeBaseFormat.MODELS`` nor ``KnowledgeBaseFormat.KEYS``.
    """
    # All input files share one prefix and differ only in their suffix.
    prefix = settings.filename_prefix
    fname_labeled_examples = prefix + kb_suffix
    fname_settings = prefix + s_suffix
    # The background-knowledge file name is passed on to the preprocessing
    # step; it is not parsed here.
    fname_background_knowledge = prefix + bg_suffix

    # The parsing debug flag is reused for tree building and conversion.
    debug_printing = settings.debug_parsing

    kb_format = settings.kb_format
    if kb_format is None:
        raise NotImplementedError(
            'Automatic recognition of input format is not yet supported.')

    # ---- settings file ----
    parser = SettingsParserMapper.get_settings_parser(kb_format)
    parsed_settings = parser.parse(fname_settings)

    # ---- examples, preprocessed according to the knowledge base format ----
    internal_format = settings.internal_examples_format
    if kb_format is KnowledgeBaseFormat.MODELS:
        possible_labels = parsed_settings.possible_labels
        models_result = preprocessing_examples_models(
            fname_labeled_examples, parsed_settings,
            internal_format, fname_background_knowledge)
        training_examples_collection, background_knowledge_wrapper = models_result
        # No prediction goal / label variable index is set for MODELS.
        prediction_goal = None
        index_of_label_var = None
    elif kb_format is KnowledgeBaseFormat.KEYS:
        keys_result = preprocessing_examples_keys(
            fname_labeled_examples, parsed_settings, internal_format,
            fname_background_knowledge, filter_out_unlabeled_examples=False)
        (training_examples_collection,
         prediction_goal,
         index_of_label_var,
         possible_labels,
         background_knowledge_wrapper) = keys_result
    else:
        raise KnowledgeBaseFormatException(
            'Only the input formats Models and Key are supported.')

    engine = DefaultEngine()
    engine.unknown = 1

    full_background_knowledge_sp = (
        background_knowledge_wrapper
        .get_full_background_knowledge_simple_program())

    # ---- build and prune the tree ----
    unpruned_tree = build_tree(
        internal_format,
        settings.treebuilder_type,
        parsed_settings.language,
        possible_labels,
        training_examples_collection,
        prediction_goal=prediction_goal,
        full_background_knowledge_sp=full_background_knowledge_sp,
        debug_printing_tree_building=debug_printing,
        engine=engine)
    tree = prune_tree(unpruned_tree)

    # ---- convert the tree to a program ----
    program = convert_tree_to_program(
        kb_format,
        settings.treebuilder_type,
        tree,
        parsed_settings.language,
        debug_printing=debug_printing,
        prediction_goal=prediction_goal,
        index_of_label_var=index_of_label_var)
Пример #3
0
def _mean_var_std(values):
    """Return ``(mean, variance, standard deviation)`` of ``values``.

    Relies on the ``mean``, ``variance`` and ``sqrt`` callables already in
    scope in this module; the mean is computed first and passed on to
    ``variance`` as its second argument.
    """
    mean_value = mean(values)
    var_value = variance(values, mean_value)
    return mean_value, var_value, sqrt(var_value)


def do_all_examples(fd: FoldData):
    """Learn a final tree on all training-usable examples and report the overall statistics.

    A tree is built on ``fd.examples_collection_usable_for_training``,
    pruned, written out, converted to a program, and that program is used to
    classify the labeled examples of the same collection. The statistics of
    this tree, together with mean / variance / standard deviation of the
    values accumulated on ``fd`` (accuracies, node counts, execution times)
    and a 90% confidence interval on the accuracies, are appended to the
    ``.statistics`` file and printed to stdout.

    :param fd: shared fold data; its per-fold lists are read for the summary
        and the node-count lists are additionally appended to (see note in
        the body).
    :return: ``None``; results are reported through ``fd``, stdout and files.
    """
    print('\n=======================================')
    print('=== FINALLY, learn tree on all examples')
    print('========================================')
    print('\ttotal nb of labeled examples: ' +
          str(fd.total_nb_of_labeled_examples))

    print('\t=== start building tree for ALL examples')

    # The timed section covers tree building up to and including
    # classification of all labeled examples.
    start_time = time.time()

    # TRAIN MODEL using all examples usable for training
    tree = build_tree(
        fd.internal_ex_format,
        fd.treebuilder_type,
        fd.parsed_settings.language,
        fd.possible_labels,
        fd.examples_collection_usable_for_training,
        prediction_goal=fd.prediction_goal,
        full_background_knowledge_sp=fd.full_background_knowledge_sp,
        debug_printing_tree_building=fd.debug_printing_tree_building,
        engine=fd.engine)

    tree = prune_tree(
        tree, debug_printing_tree_pruning=fd.debug_printing_tree_pruning)
    nb_of_nodes = tree.get_nb_of_nodes()
    nb_inner_nodes = tree.get_nb_of_inner_nodes()
    # NOTE(review): the node counts of this full tree are appended to the
    # per-fold lists, so the per-fold mean/var/std computed below include
    # this tree as well -- confirm that this is intended.
    fd.total_nb_of_nodes_per_fold.append(nb_of_nodes)
    fd.nb_of_inner_node_per_fold.append(nb_inner_nodes)

    # write out tree
    tree_fname = fd.dir_output_files + fd.fname_prefix_fold + ".tree"
    write_out_tree(tree_fname, tree)

    print('=== end building tree for ALL examples')

    print('=== start converting tree to program for ALL examples')
    program = convert_tree_to_program(
        fd.kb_format,
        fd.treebuilder_type,
        tree,
        fd.parsed_settings.language,
        debug_printing=fd.debug_printing_program_conversion,
        prediction_goal=fd.prediction_goal,
        index_of_label_var=fd.index_of_label_var)
    program_fname = fd.dir_output_files + fd.fname_prefix_fold + ".program"
    write_out_program(program_fname, program)

    print('=== end converting tree to program for ALL examples')

    all_examples = (
        fd.examples_collection_usable_for_training.get_labeled_examples())

    print('\t=== start classifying total set')
    # EVALUATE MODEL on the same labeled examples it was trained on
    classifier = get_keys_classifier(
        fd.internal_ex_format,
        program,
        fd.prediction_goal,
        fd.index_of_label_var,
        fd.stripped_background_knowledge,
        debug_printing=fd.debug_printing_get_classifier,
        engine=fd.engine)

    statistics_handler = do_labeled_examples_get_correctly_classified(
        classifier, all_examples, fd.possible_labels,
        fd.debug_printing_classification
    )  # type: ClassificationStatisticsHandler

    # Elapsed wall-clock time for building + verifying the tree, in seconds.
    elapsed_time = time.time() - start_time

    # get_accuracy() returns a pair; only its first element is the accuracy
    # that is reported, both in the file and on stdout.
    accuracy, _ = statistics_handler.get_accuracy()

    statistics_fname = fd.dir_output_files + fd.fname_prefix_fold + ".statistics"
    statistics_handler.write_out_statistics_to_file(statistics_fname)

    (mean_accuracy_of_folds,
     var_accuracy_of_folds,
     std_accuracy_of_folds) = _mean_var_std(fd.accuracies_folds)

    confidence = 0.9
    mean_acc, conf_left, conf_right, diff_from_mean = mean_confidence_interval(
        fd.accuracies_folds, confidence)

    (mean_total_nb_of_nodes,
     var_total_nb_of_nodes,
     std_total_nb_of_nodes) = _mean_var_std(fd.total_nb_of_nodes_per_fold)

    (mean_nb_of_inner_nodes,
     var_nb_of_inner_nodes,
     std_nb_of_inner_nodes) = _mean_var_std(fd.nb_of_inner_node_per_fold)

    total_execution_time_of_cross_validation = sum(fd.execution_time_per_fold)

    # Append the summary below what the statistics handler just wrote.
    with open(statistics_fname, 'a') as f:
        f.write("\n\ntotal nb of examples (labeled + unlabeled): " +
                str(fd.total_nb_of_examples) + "\n")
        f.write("total nb of LABELED examples: " +
                str(fd.total_nb_of_labeled_examples) + "\n\n")

        f.write("list of accuracies per fold:\n")
        f.write("\t" + str(fd.accuracies_folds) + "\n")
        f.write("mean accuracy: " + str(mean_accuracy_of_folds) + "\n")
        f.write("var accuracy: " + str(var_accuracy_of_folds) + "\n")
        f.write("std accuracy: " + str(std_accuracy_of_folds) + "\n")
        f.write("accuracy of total tree: " + str(accuracy) + "\n\n")
        f.write("accuracy " + str(confidence * 100) +
                "% confidence interval: [" + str(conf_left) + "," +
                str(conf_right) + "]\n")
        f.write("\taccuracy " + str(confidence * 100) +
                "% confidence interval around mean: " + str(mean_acc) +
                " +- " + str(diff_from_mean) + "\n\n")

        f.write("total nb of nodes in total tree: " + str(nb_of_nodes) + "\n")
        f.write("nb of internal nodes in total tree: " + str(nb_inner_nodes) +
                "\n\n")

        f.write("list of total nb of nodes per fold:\n")
        f.write("\t" + str(fd.total_nb_of_nodes_per_fold) + "\n")
        f.write("mean total nb of nodes: " + str(mean_total_nb_of_nodes) +
                "\n")
        f.write("var total nb of nodes: " + str(var_total_nb_of_nodes) + "\n")
        f.write("std total nb of nodes: " + str(std_total_nb_of_nodes) +
                "\n\n")

        f.write("list of nb of internal nodes per fold:\n")
        f.write("\t" + str(fd.nb_of_inner_node_per_fold) + "\n")
        f.write("mean nb of internal nodes: " + str(mean_nb_of_inner_nodes) +
                "\n")
        f.write("var nb of internal nodes: " + str(var_nb_of_inner_nodes) +
                "\n")
        f.write("std nb of internal nodes: " + str(std_nb_of_inner_nodes) +
                "\n\n")

        f.write("execution times of folds:\n")
        f.write("\t" + str(fd.execution_time_per_fold) + "\n")
        f.write("total time cross  (sum folds): " +
                str(total_execution_time_of_cross_validation) + " seconds\n")
        f.write("time total tree building + verifying: " + str(elapsed_time) +
                " seconds\n")

    # Console report; mirrors the file summary.
    print("total nb of nodes in total tree: " + str(nb_of_nodes))
    print("nb of internal nodes in total tree: " + str(nb_inner_nodes))
    print()
    print("list of accuracies per fold:")
    print("\t" + str(fd.accuracies_folds))
    print("mean accuracy: " + str(mean_accuracy_of_folds))
    print("var accuracy: " + str(var_accuracy_of_folds))
    print("std accuracy " + str(std_accuracy_of_folds))
    # Print the accuracy value itself (as the file does), not the whole
    # pair returned by get_accuracy().
    print("accuracy of total tree: " + str(accuracy))
    print()
    print("accuracy " + str(confidence * 100) + "% confidence interval: [" +
          str(conf_left) + "," + str(conf_right) + "]")
    print("\taccuracy " + str(confidence * 100) +
          "% confidence interval around mean: " + str(mean_acc) + " +- " +
          str(diff_from_mean))
    print()
    print("total nb of nodes in total tree: " + str(nb_of_nodes))
    print("nb of internal nodes in total tree: " + str(nb_inner_nodes))
    print()
    print("list of total nb of nodes per fold:")
    print("\t" + str(fd.total_nb_of_nodes_per_fold))
    print("mean total nb of nodes: " + str(mean_total_nb_of_nodes))
    print("var total nb of nodes: " + str(var_total_nb_of_nodes))
    print("std total nb of nodes: " + str(std_total_nb_of_nodes))
    print()
    print("list of nb of internal nodes per fold:")
    print("\t" + str(fd.nb_of_inner_node_per_fold))
    print("mean nb of internal nodes: " + str(mean_nb_of_inner_nodes))
    print("var nb of internal nodes: " + str(var_nb_of_inner_nodes))
    print("std nb of internal nodes: " + str(std_nb_of_inner_nodes))
    print()
    print("execution times of folds:")
    print("\t" + str(fd.execution_time_per_fold))
    print("total time cross  (sum folds):",
          total_execution_time_of_cross_validation, "seconds")
    print("time total tree building + verifying:", elapsed_time, "seconds")

    print('\t=== end classifying total set')