def setUp(self):
    """ Loads the train_dataset2 config, builds its Dataset (with numeric
    attributes) and loads the matching test_dataset2 config.
    """
    train_config_path = os.path.join('.', 'data', 'train_dataset2')
    self.config = dataset.load_config(train_config_path)
    self.data = dataset.Dataset(self.config["filepath"],
                                self.config["key attrib index"],
                                self.config["class attrib index"],
                                self.config["split char"],
                                self.config["missing value string"],
                                load_numeric=True)
    self.test_config = dataset.load_config(
        os.path.join('.', 'data', 'test_dataset2'))
def setUp(self):
    """ Loads the train_dataset1 config and builds its Dataset without
    numeric attributes.
    """
    cfg = dataset.load_config(os.path.join('.', 'data', 'train_dataset1'))
    self.config = cfg
    self.data = dataset.Dataset(cfg["filepath"],
                                cfg["key attrib index"],
                                cfg["class attrib index"],
                                cfg["split char"],
                                cfg["missing value string"],
                                load_numeric=False)
def setUp(self):
    """ Loads the train_dataset2 config and Dataset (with numeric
    attributes) and creates a GiniGain-based DecisionTree.
    """
    self.criterion = criteria.GiniGain
    cfg = dataset.load_config(os.path.join('.', 'data', 'train_dataset2'))
    self.config = cfg
    self.data = dataset.Dataset(cfg["filepath"],
                                cfg["key attrib index"],
                                cfg["class attrib index"],
                                cfg["split char"],
                                cfg["missing value string"],
                                load_numeric=True)
    self.decision_tree = decision_tree.DecisionTree(self.criterion)
def setUp(self):
    """ Loads the train_dataset1 config and Dataset (load_numeric=True)
    and builds a leaf TreeNode over every sample (max_depth_remaining=0).
    """
    cfg = dataset.load_config(os.path.join('.', 'data', 'train_dataset1'))
    self.config = cfg
    self.data = dataset.Dataset(cfg["filepath"],
                                cfg["key attrib index"],
                                cfg["class attrib index"],
                                cfg["split char"],
                                cfg["missing value string"],
                                load_numeric=True)
    all_sample_indices = list(range(self.data.num_samples))
    self.tree_node = decision_tree.TreeNode(
        self.data,
        all_sample_indices,
        self.data.valid_nominal_attribute,
        self.data.valid_numeric_attribute,
        max_depth_remaining=0,
        min_samples_per_node=1,
        use_stop_conditions=False,
        max_p_value_chi_sq=None)
def setUp(self):
    """ Loads the train_dataset1 config and Dataset without numeric
    attributes, then trains a depth-1 GiniGain decision tree on all samples.
    """
    import criteria
    self.criterion = criteria.GiniGain
    cfg = dataset.load_config(os.path.join('.', 'data', 'train_dataset1'))
    self.config = cfg
    self.data = dataset.Dataset(cfg["filepath"],
                                cfg["key attrib index"],
                                cfg["class attrib index"],
                                cfg["split char"],
                                cfg["missing value string"],
                                load_numeric=False)
    self.decision_tree = decision_tree.DecisionTree(self.criterion)
    all_sample_indices = list(range(self.data.num_samples))
    self.decision_tree.train(self.data,
                             all_sample_indices,
                             max_depth=1,
                             min_samples_per_node=1,
                             use_stop_conditions=False,
                             max_p_value_chi_sq=None)
def test_test_from_csv(self):
    """ Tests DecisionTree's test_from_csv() against the train_dataset1
    CSV, checking classifications, counts and costs.
    """
    test_config = dataset.load_config(
        os.path.join('.', 'data', 'train_dataset1'))
    results = self.decision_tree.test_from_csv(
        test_config["filepath"],
        test_config["key attrib index"],
        test_config["class attrib index"],
        test_config["split char"],
        test_config["missing value string"])
    # Unpack in the exact order test_from_csv returns its statistics.
    (classifications,
     num_correct,
     num_correct_wo_unknown,
     total_cost,
     total_cost_wo_unknown,
     classified_with_unknown_value_array,
     num_unknown,
     unknown_value_attrib_index_array) = results
    num_samples = 12
    self.assertEqual(classifications,
                     [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0])
    self.assertEqual(num_correct, 11)
    self.assertEqual(num_correct_wo_unknown, 11)
    self.assertEqual(total_cost, 1.0)
    self.assertEqual(total_cost_wo_unknown, 1.0)
    self.assertEqual(classified_with_unknown_value_array,
                     [False] * num_samples)
    self.assertEqual(num_unknown, 0)
    self.assertEqual(unknown_value_attrib_index_array,
                     [None] * num_samples)
def main(experiment_config):
    """Sets the configurations according to `experiment_config` and runs them.
    """
    # All per-trial results are appended to a single raw CSV in the
    # configured output folder.
    raw_output_filepath = os.path.join(experiment_config["output folder"],
                                       'raw_output.csv')
    with open(raw_output_filepath, 'w') as fout:
        init_raw_output_csv(fout, output_split_char=',')
        criteria_list = get_criteria(experiment_config["criteria"])

        # Optional config key: defaults the starting random seed index to 1.
        if "starting seed index" not in experiment_config:
            starting_seed = 1
        else:
            starting_seed = experiment_config["starting seed index"]

        # Pruning setup: the chi-square test and its companion
        # second-most-frequent-value threshold are configured together via
        # module-level globals on decision_tree.
        if experiment_config["prunning parameters"]["use chi-sq test"]:
            max_p_value_chi_sq = experiment_config["prunning parameters"][
                "max chi-sq p-value"]
            decision_tree.MIN_SAMPLES_IN_SECOND_MOST_FREQUENT_VALUE = experiment_config[
                "prunning parameters"]["second most freq value min samples"]
        else:
            max_p_value_chi_sq = None
            decision_tree.MIN_SAMPLES_IN_SECOND_MOST_FREQUENT_VALUE = None

        decision_tree.USE_MIN_SAMPLES_SECOND_LARGEST_CLASS = experiment_config[
            "prunning parameters"]["use second most freq class min samples"]
        if decision_tree.USE_MIN_SAMPLES_SECOND_LARGEST_CLASS:
            decision_tree.MIN_SAMPLES_SECOND_LARGEST_CLASS = experiment_config[
                "prunning parameters"]["second most freq class min samples"]
        else:
            decision_tree.MIN_SAMPLES_SECOND_LARGEST_CLASS = None

        # Dataset selection: either every config under the basepath (sorted
        # by name for a deterministic run order) or an explicit folder list.
        if experiment_config["use all datasets"]:
            datasets_configs = dataset.load_all_configs(
                experiment_config["datasets basepath"])
            datasets_configs.sort(key=lambda config: config["dataset name"])
        else:
            datasets_folders = [
                os.path.join(experiment_config["datasets basepath"],
                             folderpath)
                for folderpath in experiment_config["datasets folders"]]
            datasets_configs = [dataset.load_config(folderpath)
                                for folderpath in datasets_folders]

        # NOTE(review): this branch loads ALL datasets up-front when
        # "load one dataset at a time" is truthy, while the else-branch
        # constructs each Dataset inside the loop. The condition's polarity
        # looks inverted relative to the key's name — confirm against the
        # config documentation before relying on it.
        if experiment_config["load one dataset at a time"]:
            datasets = dataset.load_all_datasets(
                datasets_configs,
                experiment_config["use numeric attributes"])
            for ((dataset_name, curr_dataset),
                 min_num_samples_allowed) in itertools.product(
                     datasets,
                     experiment_config["prunning parameters"][
                         "min num samples allowed"]):
                for criterion in criteria_list:
                    print('-' * 100)
                    print(criterion.name)
                    print()
                    run(dataset_name,
                        curr_dataset,
                        experiment_config["num training samples"],
                        criterion,
                        min_num_samples_allowed=min_num_samples_allowed,
                        max_depth=experiment_config["max depth"],
                        num_trials=experiment_config["num trials"],
                        starting_seed=starting_seed,
                        use_numeric_attributes=experiment_config[
                            "use numeric attributes"],
                        use_chi_sq_test=experiment_config[
                            "prunning parameters"]["use chi-sq test"],
                        max_p_value_chi_sq=max_p_value_chi_sq,
                        output_file_descriptor=fout,
                        output_split_char=',')
        else:
            for (dataset_config,
                 min_num_samples_allowed) in itertools.product(
                     datasets_configs,
                     experiment_config["prunning parameters"][
                         "min num samples allowed"]):
                # Build each Dataset on demand, one per config.
                curr_dataset = dataset.Dataset(
                    dataset_config["filepath"],
                    dataset_config["key attrib index"],
                    dataset_config["class attrib index"],
                    dataset_config["split char"],
                    dataset_config["missing value string"],
                    experiment_config["use numeric attributes"])
                for criterion in criteria_list:
                    print('-' * 100)
                    print(criterion.name)
                    print()
                    run(dataset_config["dataset name"],
                        curr_dataset,
                        experiment_config["num training samples"],
                        criterion,
                        min_num_samples_allowed=min_num_samples_allowed,
                        max_depth=experiment_config["max depth"],
                        num_trials=experiment_config["num trials"],
                        starting_seed=starting_seed,
                        use_numeric_attributes=experiment_config[
                            "use numeric attributes"],
                        use_chi_sq_test=experiment_config[
                            "prunning parameters"]["use chi-sq test"],
                        max_p_value_chi_sq=max_p_value_chi_sq,
                        output_file_descriptor=fout,
                        output_split_char=',')
def setUp(self):
    """ Loads the config of the dataset that has no valid attributes. """
    config_path = os.path.join('.', 'data',
                               'dataset_without_valid_attributes')
    self.config = dataset.load_config(config_path)
def test_loading_correct_config(self):
    """ A well-formed config directory must load as a dict. """
    config_path = os.path.join('.', 'data', 'train_dataset1')
    loaded_config = dataset.load_config(config_path)
    self.assertIsInstance(loaded_config, dict)
def test_missing_missing_value_str(self):
    """ A config lacking missing_value_str must load as None. """
    config_path = os.path.join('.', 'data', 'wrong_configs', 'config_5')
    self.assertIsNone(dataset.load_config(config_path))
def test_missing_split_char(self):
    """ A config lacking split_char must load as None. """
    config_path = os.path.join('.', 'data', 'wrong_configs', 'config_4')
    self.assertIsNone(dataset.load_config(config_path))
def test_missing_class_index(self):
    """ A config lacking class_attrib_index must load as None. """
    config_path = os.path.join('.', 'data', 'wrong_configs', 'config_3')
    self.assertIsNone(dataset.load_config(config_path))
def test_missing_name(self):
    """ A config lacking dataset_name must load as None. """
    config_path = os.path.join('.', 'data', 'wrong_configs', 'config_1')
    self.assertIsNone(dataset.load_config(config_path))
def test_nonexistent_file(self):
    """ A path containing no config file must load as None. """
    path_without_config = '.'
    self.assertIsNone(dataset.load_config(path_without_config))
def setUp(self):
    """ Loads the train_dataset2 config. """
    config_path = os.path.join('.', 'data', 'train_dataset2')
    self.config = dataset.load_config(config_path)