Example #1
def verify(decision_tree: DecisionTree, test_examples, debug_printing=False):
    possible_labels = decision_tree.tree.labels

    statistics_handler = ClassificationStatisticsHandler(possible_labels)

    actual_labels = []
    predicted_labels = []

    for test_ex in test_examples:
        actual_label = test_ex.label
        predicted_label = decision_tree.predict(test_ex)

        # keep both as plain strings so they can be compared pairwise
        actual_labels.append(str(actual_label))
        predicted_labels.append(str(predicted_label))

        statistics_handler.update_statistics(actual_label, predicted_label)

    if debug_printing:
        # print("sklearn confusion matrix:")
        # print(conf_matrix)
        # print("pretty print:")
        # print_cm(conf_matrix, labels=possible_labels_str)
        print("===  MODEL VERIFICATION STATISTICS ===")

        print(statistics_handler.get_accuracy()[1])

        # precision = precision_score(actual_labels, predicted_labels)
        # recall = recall_score(actual_labels, predicted_labels)
        # print('precision:')
        # print('\t' + str(precision))
        # print('recall:')
        # print('\t' + str(recall))

        print(statistics_handler.get_classification_report_str())
        print(statistics_handler.get_nb_of_examples_str_verbose() + '\n')
        print(statistics_handler.get_confusion_matrix_str())


    return statistics_handler
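
For readers without the project code, the bookkeeping that ClassificationStatisticsHandler performs above can be approximated with the standard library alone. The sketch below is a hypothetical stand-in (sketch_statistics is our name, not part of the code base), assuming labels are compared as plain strings:

from collections import Counter

def sketch_statistics(actual_labels, predicted_labels):
    # One Counter key per (actual, predicted) pair, i.e. one confusion-matrix cell.
    confusion = Counter(zip(actual_labels, predicted_labels))
    nb_correct = sum(n for (actual, predicted), n in confusion.items()
                     if actual == predicted)
    accuracy = nb_correct / len(actual_labels)
    return accuracy, confusion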
Example #2
# =================================================================================================================

examples = default_handler.get_transformed_example_list(training_examples_collection)

# =================================================================================================================


print('=== START tree building ===')


tree_builder = default_handler.get_default_decision_tree_builder(language, prediction_goal)
decision_tree = DecisionTree()

start_time = time.time()
decision_tree.fit(examples=examples, tree_builder=tree_builder)
end_time = time.time()
run_time_sec = end_time - start_time
run_time_ms = 1000.0 * run_time_sec
print("run time (ms):", run_time_ms)

print(decision_tree)


test_examples = []
for ex_wr_sp in training_examples_collection.get_example_wrappers_sp():
    example_clause = build_clause(ex_wr_sp, training=False)
    example = Example(data=example_clause, label=ex_wr_sp.label)
    test_examples.append(example)
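
Assuming the verify helper from Example #1 is importable here (its module path is not shown in this file), the rebuilt test list can then be evaluated directly; get_accuracy() appears to return a (value, pretty-printed string) pair:

statistics_handler = verify(decision_tree, test_examples, debug_printing=True)
accuracy, accuracy_str = statistics_handler.get_accuracy()
print(accuracy_str)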
Example #3
def do_one_fold(fold_index: int, test_key_set: Set[Constant], fd: FoldData):
    print('\n===========================')
    print('=== start FOLD ' + str(fold_index + 1) + ' of ' + str(fd.nb_folds))
    print('===========================')

    training_example_collection, test_examples = split_examples_into_training_and_test_sets(
        fd.all_key_sets, test_key_set,
        fd.examples_collection_usable_for_training,
        fd.examples_usable_for_testing)
    print('\ttotal nb of labeled examples: ' +
          str(fd.total_nb_of_labeled_examples))
    nb_of_training_ex = len(training_example_collection.example_wrappers_sp)
    nb_of_test_ex = len(test_examples)
    print('\tnb of TRAINING ex: ' + str(nb_of_training_ex))
    print('\tnb of TEST ex: ' + str(nb_of_test_ex))

    # ===========================
    start_time = time.time()

    # ==============================================================================================================
    print('\t=== start building tree for fold ' + str(fold_index + 1))

    # TRAIN MODEL using training set
    tree_builder = get_default_decision_tree_builder(
        language, prediction_goal)  # type: TreeBuilder
    decision_tree = DecisionTree()
    # NOTE: training_examples_fold is assumed to hold the transformed training
    # examples of this fold (cf. get_transformed_example_list in the other examples)
    decision_tree.fit(examples=training_examples_fold,
                      tree_builder=tree_builder)

    tree = prune_tree(
        decision_tree, debug_printing_tree_pruning=fd.debug_printing_tree_pruning)
    nb_of_nodes = tree.get_nb_of_nodes()
    nb_inner_nodes = tree.get_nb_of_inner_nodes()
    fd.total_nb_of_nodes_per_fold.append(nb_of_nodes)
    fd.nb_of_inner_node_per_fold.append(nb_inner_nodes)

    # write out tree
    tree_fname = fd.dir_output_files + fd.fname_prefix_fold + '_fold' + str(
        fold_index) + ".tree"
    write_out_tree(tree_fname, tree)

    print('\t=== end building tree for fold ' + str(fold_index + 1))

    # ==============================================================================================================

    print('\t=== start converting tree to program for fold ' +
          str(fold_index + 1))
    program = convert_tree_to_program(
        fd.kb_format,
        fd.treebuilder_type,
        tree,
        fd.parsed_settings.language,
        debug_printing=fd.debug_printing_program_conversion,
        prediction_goal=fd.prediction_goal,
        index_of_label_var=fd.index_of_label_var)
    program_fname = fd.dir_output_files + fd.fname_prefix_fold + '_fold' + str(
        fold_index) + ".program"
    write_out_program(program_fname, program)

    print('\t=== end converting tree to program for fold ' +
          str(fold_index + 1))

    # ==============================================================================================================

    print('\t=== start classifying test set ' + str(fold_index + 1))
    # EVALUATE MODEL using test set
    classifier = get_keys_classifier(
        fd.internal_ex_format,
        program,
        fd.prediction_goal,
        fd.index_of_label_var,
        fd.stripped_background_knowledge,
        debug_printing=fd.debug_printing_get_classifier,
        engine=fd.engine)

    statistics_handler = do_labeled_examples_get_correctly_classified(
        classifier, test_examples, fd.possible_labels,
        fd.debug_printing_classification)  # type: ClassificationStatisticsHandler

    # ===================
    end_time = time.time()
    # time in seconds
    elapsed_time = end_time - start_time
    fd.execution_time_per_fold.append(elapsed_time)

    accuracy, _ = statistics_handler.get_accuracy()
    fd.accuracies_folds.append(accuracy)

    statistics_fname = fd.dir_output_files + fd.fname_prefix_fold + '_fold' + str(
        fold_index) + ".statistics"
    statistics_handler.write_out_statistics_to_file(statistics_fname)

    with open(statistics_fname, 'a') as f:
        f.write('\n\nnb of TRAINING ex: ' + str(nb_of_training_ex) + "\n")
        f.write('nb of TEST ex: ' + str(nb_of_test_ex) + "\n\n")

        f.write("total nb of nodes: " + str(nb_of_nodes) + "\n")
        f.write("nb of internal nodes: " + str(nb_inner_nodes) + "\n\n")
        f.write("execution time of fold: " + str(elapsed_time) + " seconds\n")
    print("total nb of nodes: " + str(nb_of_nodes))
    print("nb of internal nodes: " + str(nb_inner_nodes))
    print("execution time of fold: ", elapsed_time, "seconds")

    print('\t=== end classifying test set ' + str(fold_index + 1))
    print('\t=== end FOLD ' + str(fold_index + 1) + ' of ' + str(fd.nb_folds) +
          '\n')
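
do_one_fold is written to be driven by an outer cross-validation loop. A minimal hypothetical driver is sketched below; it assumes fd.all_key_sets holds one test key set per fold (as the call to split_examples_into_training_and_test_sets suggests) and that the usual imports are in scope:

import statistics

def do_all_folds(fd: FoldData):
    # Run every fold, then aggregate the per-fold accuracies collected by do_one_fold.
    for fold_index, test_key_set in enumerate(fd.all_key_sets):
        do_one_fold(fold_index, test_key_set, fd)
    mean_accuracy = statistics.mean(fd.accuracies_folds)
    print('mean accuracy over ' + str(fd.nb_folds) + ' folds: ' + str(mean_accuracy))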
Example #4
    examples = default_handler.get_transformed_example_list(
        training_examples_collection)

    # =================================================================================================================

    run_time_list = []

    for _ in range(10):
        print('=== START tree building ===')

        tree_builder = default_handler.get_default_decision_tree_builder(
            language, prediction_goal)  # type: TreeBuilder
        decision_tree = DecisionTree()
        start_time = time.time()
        decision_tree.fit(examples=examples, tree_builder=tree_builder)
        end_time = time.time()
        run_time_sec = end_time - start_time
        run_time_ms = 1000.0 * run_time_sec
        run_time_list.append(run_time_ms)
        print("run time (ms):", run_time_ms)

        print('=== END tree building ===\n')

    average_run_time_ms = statistics.mean(run_time_list)
    average_run_time_list.append((name, average_run_time_ms))

    print("average tree build time (ms):", average_run_time_ms)
    print(decision_tree)
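
The time-ten-runs-and-average pattern above can be factored into a small helper. This is a stdlib-only sketch; time_repeated_ms is our name, not part of the code base:

import statistics
import time

def time_repeated_ms(build_fn, n_runs=10):
    # Call build_fn n_runs times and return the mean wall-clock time in milliseconds.
    run_times_ms = []
    for _ in range(n_runs):
        start = time.time()
        build_fn()
        run_times_ms.append(1000.0 * (time.time() - start))
    return statistics.mean(run_times_ms)

# e.g.:
# average_run_time_ms = time_repeated_ms(
#     lambda: DecisionTree().fit(examples=examples, tree_builder=tree_builder))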
Example #5
def run_experiment(file_name_data: FileNameData, fold_info_controller: FoldInfoController,
                   default_handler: DefaultHandler,
                   hide_printouts: bool = False,
                   filter_out_unlabeled_examples=False,
                   debug_printing_options=DebugPrintingOptions()):
    # -- create output directory
    if not os.path.exists(file_name_data.output_dir):
        os.makedirs(file_name_data.output_dir)

    print("start", file_name_data.test_name)
    save_stdout = sys.stdout
    if hide_printouts:
        sys.stdout = open(os.devnull, "w")

    experiment = Experiment()
    experiment.preprocess_examples_and_background_knowledge(file_name_data, filter_out_unlabeled_examples,
                                                            debug_printing_options)

    fold_example_splitter = FoldExampleSplitter(fold_info_controller)
    for fold_info, training_examples_collection, test_examples in fold_example_splitter.fold_split_generator(
            experiment):  # type: FoldInfo, ExampleCollection, List[ExampleWrapper]
        print("fold: ", fold_info.index)
        training_examples = default_handler.get_transformed_example_list(training_examples_collection)

        tree_builder = default_handler.get_default_decision_tree_builder(experiment.language,
                                                                         experiment.prediction_goal)  # type: TreeBuilder
        decision_tree = DecisionTree()
        start_build_time = time.time()
        decision_tree.fit(examples=training_examples, tree_builder=tree_builder)

        if debug_printing_options.tree_building:
            print("unpruned:")
            print(decision_tree)
        decision_tree.prune(prune_leaf_nodes_with_same_label)
        end_build_time = time.time()

        build_time_sec = end_build_time - start_build_time
        build_time_ms = 1000.0 * build_time_sec
        fold_info.dt_build_time_ms = build_time_ms
        if debug_printing_options.tree_building or debug_printing_options.tree_pruning:
            print("build time (ms):", build_time_ms)
            print("pruned")
            print(decision_tree)

        # write out tree
        tree_fname = os.path.join(file_name_data.output_dir,
                                  default_handler.back_end_name + "_" + fold_info_controller.fname_prefix_fold + '_fold' + str(
                                      fold_info.index) + ".tree")
        write_out_tree(tree_fname, decision_tree)

        start_test_example_transformation = time.time()
        test_examples_reformed = default_handler.get_transformed_test_example_list(test_examples)
        end_test_example_transformation = time.time()

        statistics_handler = verify(decision_tree, test_examples_reformed)
        accuracy, _ = statistics_handler.get_accuracy()
        if debug_printing_options.get_classifier:
            print("accuracy:", accuracy)

        # ===================
        end_time = time.time()
        # elapsed time in seconds, excluding the time spent transforming the test examples
        elapsed_time = end_time - start_build_time - (
            end_test_example_transformation - start_test_example_transformation)
        elapsed_time_ms = 1000.0 * elapsed_time
        fold_info.execution_time_ms = elapsed_time_ms

        fold_info.accuracy = accuracy

        statistics_fname = os.path.join(file_name_data.output_dir,
                                        default_handler.back_end_name + "_" + fold_info_controller.fname_prefix_fold + '_fold'
                                        + str(fold_info.index) + ".statistics")

        statistics_handler.write_out_statistics_to_file(statistics_fname)

        with open(statistics_fname, 'a') as f:
            f.write('\n\nnb of TRAINING ex: ' + str(len(training_examples)) + "\n")
            f.write('nb of TEST ex: ' + str(len(test_examples)) + "\n\n")
            nb_nodes = decision_tree.get_nb_of_nodes()
            fold_info.n_nodes = nb_nodes
            f.write("total nb of nodes: " + str(nb_nodes) + "\n")
            nb_inner_nodes = decision_tree.get_nb_of_inner_nodes()
            fold_info.n_inner_nodes = nb_inner_nodes
            f.write("nb of internal nodes: " + str(nb_inner_nodes) + "\n\n")
            f.write("execution time of fold (ms): " + str(elapsed_time_ms) + "\n")

        # ------------------------------------------

        # --- DESTRUCTION (necessary for Django) ---
        decision_tree.destruct()

        for ex in training_examples:
            ex.destruct()

    mean_accuracy_of_folds = statistics.mean(
        [fold_info.accuracy for (_index, fold_info) in fold_info_controller.fold_infos.items()])

    dt_build_times = [fold_info.dt_build_time_ms for (_index, fold_info) in fold_info_controller.fold_infos.items()]
    mean_decision_tree_build_time = statistics.mean(dt_build_times)

    fold_execution_times_ms = [
        fold_info.execution_time_ms for (_index, fold_info) in fold_info_controller.fold_infos.items()
    ]
    total_execution_time_ms_of_cross_validation = sum(fold_execution_times_ms)
    mean_execution_time_ms_of_folds = statistics.mean(fold_execution_times_ms)

    folds_total_nb_of_nodes = [
        fold_info.n_nodes for (_index, fold_info) in fold_info_controller.fold_infos.items()
    ]
    mean_total_nb_of_nodes = statistics.mean(folds_total_nb_of_nodes)

    fold_nb_of_inner_nodes = [
        fold_info.n_inner_nodes for (_index, fold_info) in fold_info_controller.fold_infos.items()
    ]
    mean_nb_of_inner_nodes = statistics.mean(fold_nb_of_inner_nodes)

    if debug_printing_options.debug_printing_classification:
        print("mean decision tree build time (ms):", mean_decision_tree_build_time)
        print("total time cross  (sum folds): " + str(total_execution_time_ms_of_cross_validation) + "\n")

    statistics_fname = os.path.join(file_name_data.output_dir,
                                    default_handler.back_end_name + "_" + fold_info_controller.fname_prefix_fold + ".statistics")
    with open(statistics_fname, 'w') as f:
        f.write("mean accuracy: " + str(mean_accuracy_of_folds) + "\n")
        f.write("mean decision tree build time (ms):" + str(mean_decision_tree_build_time) + "\n")
        f.write("mean fold execution time (ms):" + str(mean_execution_time_ms_of_folds) + "\n")

        f.write("mean total nb of nodes:" + str(mean_total_nb_of_nodes) + "\n")
        f.write("mean nb of inner nodes:" + str(mean_nb_of_inner_nodes) + "\n")

        f.write("total time cross  (sum folds) (ms): " + str(total_execution_time_ms_of_cross_validation) + "\n")

    if hide_printouts:
        sys.stdout.close()  # close the devnull handle opened above
        sys.stdout = save_stdout
    print("finished", file_name_data.test_name)