Example #1
0
 def setUp(self):
     """
     Load the train-dataset config, build the training Dataset (numeric
     attributes enabled) and load the test-dataset config.
     """
     train_dir = os.path.join('.', 'data', 'train_dataset2')
     self.config = dataset.load_config(train_dir)
     cfg = self.config
     self.data = dataset.Dataset(cfg["filepath"],
                                 cfg["key attrib index"],
                                 cfg["class attrib index"],
                                 cfg["split char"],
                                 cfg["missing value string"],
                                 load_numeric=True)
     test_dir = os.path.join('.', 'data', 'test_dataset2')
     self.test_config = dataset.load_config(test_dir)
Example #2
0
 def setUp(self):
     """
     Load the dataset config and build a Dataset with numeric attributes
     disabled.
     """
     cfg = dataset.load_config(
         os.path.join('.', 'data', 'train_dataset1'))
     self.config = cfg
     self.data = dataset.Dataset(cfg["filepath"],
                                 cfg["key attrib index"],
                                 cfg["class attrib index"],
                                 cfg["split char"],
                                 cfg["missing value string"],
                                 load_numeric=False)
 def setUp(self):
     """
     Load the dataset config, build the Dataset (numeric attributes
     enabled) and create a DecisionTree using the Gini gain criterion.
     """
     self.criterion = criteria.GiniGain
     config_dir = os.path.join('.', 'data', 'train_dataset2')
     self.config = dataset.load_config(config_dir)
     cfg = self.config
     self.data = dataset.Dataset(cfg["filepath"],
                                 cfg["key attrib index"],
                                 cfg["class attrib index"],
                                 cfg["split char"],
                                 cfg["missing value string"],
                                 load_numeric=True)
     self.decision_tree = decision_tree.DecisionTree(self.criterion)
 def setUp(self):
     """
     Load the dataset config, build the Dataset and create a depth-0
     TreeNode over all samples.

     NOTE(review): the original docstring said "without numeric
     attributes", but the Dataset is built with load_numeric=True; the
     docstring now matches the code — confirm which was intended.
     """
     self.config = dataset.load_config(
         os.path.join('.', 'data', 'train_dataset1'))
     self.data = dataset.Dataset(self.config["filepath"],
                                 self.config["key attrib index"],
                                 self.config["class attrib index"],
                                 self.config["split char"],
                                 self.config["missing value string"],
                                 load_numeric=True)
     # Root node over every sample; max_depth_remaining=0 means this node
     # is a leaf and will not be split further.
     self.tree_node = decision_tree.TreeNode(
         self.data,
         list(range(self.data.num_samples)),
         self.data.valid_nominal_attribute,
         self.data.valid_numeric_attribute,
         max_depth_remaining=0,
         min_samples_per_node=1,
         use_stop_conditions=False,
         max_p_value_chi_sq=None)
 def setUp(self):
     """
     Load the dataset config, build a Dataset without numeric attributes
     and train a one-level decision tree on every sample.
     """
     # NOTE(review): function-scope import kept to preserve behavior;
     # prefer a module-level `import criteria`.
     import criteria
     self.criterion = criteria.GiniGain
     cfg = dataset.load_config(
         os.path.join('.', 'data', 'train_dataset1'))
     self.config = cfg
     self.data = dataset.Dataset(cfg["filepath"],
                                 cfg["key attrib index"],
                                 cfg["class attrib index"],
                                 cfg["split char"],
                                 cfg["missing value string"],
                                 load_numeric=False)
     self.decision_tree = decision_tree.DecisionTree(self.criterion)
     all_samples = list(range(self.data.num_samples))
     self.decision_tree.train(self.data,
                              all_samples,
                              max_depth=1,
                              min_samples_per_node=1,
                              use_stop_conditions=False,
                              max_p_value_chi_sq=None)
    def test_test_from_csv(self):
        """
        Tests DecisionTree's test_from_csv().

        Classifies every sample of the CSV pointed to by the loaded config
        and checks the classifications, the correct-classification counts,
        the total costs and the unknown-value bookkeeping arrays.
        """
        # NOTE(review): loads the *train* dataset config as the test set;
        # presumably intentional (self-classification check) — confirm.
        test_config = dataset.load_config(
            os.path.join('.', 'data', 'train_dataset1'))
        # Local names fixed from the original "unkown" misspelling.
        (classifications, num_correct_classifications,
         num_correct_classifications_wo_unknown, total_cost,
         total_cost_wo_unknown, classified_with_unknown_value_array,
         num_unknown,
         unknown_value_attrib_index_array) = self.decision_tree.test_from_csv(
             test_config["filepath"], test_config["key attrib index"],
             test_config["class attrib index"], test_config["split char"],
             test_config["missing value string"])

        self.assertEqual(classifications, [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0])
        self.assertEqual(num_correct_classifications, 11)
        self.assertEqual(num_correct_classifications_wo_unknown, 11)
        self.assertEqual(total_cost, 1.0)
        self.assertEqual(total_cost_wo_unknown, 1.0)
        self.assertEqual(classified_with_unknown_value_array, [False] * 12)
        self.assertEqual(num_unknown, 0)
        self.assertEqual(unknown_value_attrib_index_array, [None] * 12)
Example #7
0
def main(experiment_config):
    """Sets the configurations according to `experiment_config` and runs them.

    Writes raw results to 'raw_output.csv' inside the configured output
    folder, configures the module-level pruning globals on
    `decision_tree`, and runs every (dataset, min-samples-allowed,
    criterion) combination.
    """
    raw_output_filepath = os.path.join(experiment_config["output folder"],
                                       'raw_output.csv')
    with open(raw_output_filepath, 'w') as fout:
        init_raw_output_csv(fout, output_split_char=',')
        criteria_list = get_criteria(experiment_config["criteria"])
        # Default seed index is 1 when not configured.
        starting_seed = experiment_config.get("starting seed index", 1)
        max_p_value_chi_sq = _set_pruning_parameters(experiment_config)
        datasets_configs = _load_datasets_configs(experiment_config)
        min_samples_options = experiment_config["prunning parameters"][
            "min num samples allowed"]

        if experiment_config["load one dataset at a time"]:
            # NOTE(review): despite the flag name, this branch calls
            # dataset.load_all_datasets; presumably it yields datasets
            # lazily — confirm against its implementation.
            datasets = dataset.load_all_datasets(
                datasets_configs, experiment_config["use numeric attributes"])
            for ((dataset_name, curr_dataset),
                 min_num_samples_allowed) in itertools.product(
                     datasets, min_samples_options):
                _run_all_criteria(criteria_list, dataset_name, curr_dataset,
                                  experiment_config, min_num_samples_allowed,
                                  starting_seed, max_p_value_chi_sq, fout)
        else:
            for (dataset_config, min_num_samples_allowed) in itertools.product(
                    datasets_configs, min_samples_options):
                # Dataset is (re)built for every min-samples value, matching
                # the original behavior.
                curr_dataset = dataset.Dataset(
                    dataset_config["filepath"],
                    dataset_config["key attrib index"],
                    dataset_config["class attrib index"],
                    dataset_config["split char"],
                    dataset_config["missing value string"],
                    experiment_config["use numeric attributes"])
                _run_all_criteria(criteria_list,
                                  dataset_config["dataset name"], curr_dataset,
                                  experiment_config, min_num_samples_allowed,
                                  starting_seed, max_p_value_chi_sq, fout)


def _set_pruning_parameters(experiment_config):
    """Configure the decision_tree module-level pruning globals.

    Returns the chi-squared p-value threshold to use (None when the
    chi-squared test is disabled).
    """
    prunning_params = experiment_config["prunning parameters"]
    if prunning_params["use chi-sq test"]:
        max_p_value_chi_sq = prunning_params["max chi-sq p-value"]
        decision_tree.MIN_SAMPLES_IN_SECOND_MOST_FREQUENT_VALUE = prunning_params[
            "second most freq value min samples"]
    else:
        max_p_value_chi_sq = None
        decision_tree.MIN_SAMPLES_IN_SECOND_MOST_FREQUENT_VALUE = None

    decision_tree.USE_MIN_SAMPLES_SECOND_LARGEST_CLASS = prunning_params[
        "use second most freq class min samples"]
    if decision_tree.USE_MIN_SAMPLES_SECOND_LARGEST_CLASS:
        decision_tree.MIN_SAMPLES_SECOND_LARGEST_CLASS = prunning_params[
            "second most freq class min samples"]
    else:
        decision_tree.MIN_SAMPLES_SECOND_LARGEST_CLASS = None
    return max_p_value_chi_sq


def _load_datasets_configs(experiment_config):
    """Return the list of dataset configs selected by `experiment_config`.

    Either every config under the datasets basepath (sorted by dataset
    name) or only the explicitly listed dataset folders.
    """
    if experiment_config["use all datasets"]:
        datasets_configs = dataset.load_all_configs(
            experiment_config["datasets basepath"])
        datasets_configs.sort(key=lambda config: config["dataset name"])
    else:
        datasets_folders = [
            os.path.join(experiment_config["datasets basepath"], folderpath)
            for folderpath in experiment_config["datasets folders"]
        ]
        datasets_configs = [
            dataset.load_config(folderpath)
            for folderpath in datasets_folders
        ]
    return datasets_configs


def _run_all_criteria(criteria_list, dataset_name, curr_dataset,
                      experiment_config, min_num_samples_allowed,
                      starting_seed, max_p_value_chi_sq, fout):
    """Run every criterion on one dataset, printing a banner per criterion."""
    for criterion in criteria_list:
        print('-' * 100)
        print(criterion.name)
        print()
        run(dataset_name,
            curr_dataset,
            experiment_config["num training samples"],
            criterion,
            min_num_samples_allowed=min_num_samples_allowed,
            max_depth=experiment_config["max depth"],
            num_trials=experiment_config["num trials"],
            starting_seed=starting_seed,
            use_numeric_attributes=experiment_config[
                "use numeric attributes"],
            use_chi_sq_test=experiment_config[
                "prunning parameters"]["use chi-sq test"],
            max_p_value_chi_sq=max_p_value_chi_sq,
            output_file_descriptor=fout,
            output_split_char=',')
Example #8
0
 def setUp(self):
     """
     Load the config of a dataset that has no valid attributes.
     """
     config_dir = os.path.join(
         '.', 'data', 'dataset_without_valid_attributes')
     self.config = dataset.load_config(config_dir)
Example #9
0
 def test_loading_correct_config(self):
     """
     A well-formed config should load as a dict.
     """
     config_path = os.path.join('.', 'data', 'train_dataset1')
     loaded = dataset.load_config(config_path)
     self.assertIsInstance(loaded, dict)
Example #10
0
 def test_missing_missing_value_str(self):
     """
     A config lacking missing_value_str should load as None.
     """
     config_path = os.path.join('.', 'data', 'wrong_configs', 'config_5')
     self.assertIsNone(dataset.load_config(config_path))
Example #11
0
 def test_missing_split_char(self):
     """
     A config lacking split_char should load as None.
     """
     config_path = os.path.join('.', 'data', 'wrong_configs', 'config_4')
     self.assertIsNone(dataset.load_config(config_path))
Example #12
0
 def test_missing_class_index(self):
     """
     A config lacking class_attrib_index should load as None.
     """
     config_path = os.path.join('.', 'data', 'wrong_configs', 'config_3')
     self.assertIsNone(dataset.load_config(config_path))
Example #13
0
 def test_missing_name(self):
     """
     A config lacking dataset_name should load as None.
     """
     config_path = os.path.join('.', 'data', 'wrong_configs', 'config_1')
     self.assertIsNone(dataset.load_config(config_path))
Example #14
0
 def test_nonexistent_file(self):
     """
     A path with no config file should load as None.
     """
     result = dataset.load_config('.')
     self.assertIsNone(result)
Example #15
0
 def setUp(self):
     """
     Load the dataset config.
     """
     config_dir = os.path.join('.', 'data', 'train_dataset2')
     self.config = dataset.load_config(config_dir)