def test_forward_select(self):
    """Forward selection on the minigolf weather data keeps only 'outlook'.

    The selection is run on the loaded datasets, after which the training,
    test and gold data are each expected to carry a single attribute.  The
    accuracies behind that outcome are then recomputed on deep copies taken
    before the selection ran.
    """
    base_path = datasetsDir(self) + 'minigolf' + SEP + 'weather'
    _training = training(base_path)
    _attributes, _klass = metadata(base_path)
    _test = test(base_path)
    _gold = gold(base_path)

    # Copies taken before the selection, used for the independent
    # re-computation further down.
    pristine_training = copy.deepcopy(_training)
    pristine_attributes = copy.deepcopy(_attributes)

    feat_sel = fs.FeatureSelection(_training, _attributes, _klass, _test,
                                   _gold, ['1R', '4', '0.1'])
    feat_sel.forward_selection()

    self.assertEqual(1, len(_attributes))
    self.assertEqual('outlook', _attributes[0].name)
    for dataset in (_training, _test, _gold):
        self.verify_number_of_attributes(dataset, 1)

    # Verification, first level: every attribute scored on its own.
    cv_datasets = pristine_training.cross_validation_datasets(4)
    single_accuracies = {}
    for attribute in pristine_attributes:
        single_accuracies[
            attribute.name] = feat_sel.avg_accuracy_by_cross_validation(
                cv_datasets, 4, attr.Attributes([attribute]))

    # Full-precision values: windy 0.41666666666666663,
    # outlook 0.79166666666666663, temperature 0.41666666666666663,
    # humidity 0.54166666666666663
    expected_singles = (
        ('windy', 0.4166666),
        ('outlook', 0.79166666),
        ('temperature', 0.4166666),
        ('humidity', 0.5416666),
    )
    for name, expected in expected_singles:
        self.assertAlmostEqual(expected, single_accuracies[name], 6)

    # Second level: 'outlook' is selected, so pair it with each of the
    # remaining attributes.
    pair_accuracies = {}
    for candidate in pristine_attributes:
        if candidate.name == 'outlook':
            outlook = candidate
    pristine_attributes.remove(outlook)
    for attribute in pristine_attributes:
        pair_accuracies[(
            'outlook',
            attribute.name)] = feat_sel.avg_accuracy_by_cross_validation(
                cv_datasets, 4, attr.Attributes([outlook, attribute]))

    # Full-precision values: (outlook, humidity) 0.79166666666666663,
    # (outlook, temperature) 0.79166666666666663,
    # (outlook, windy) 0.54166666666666663
    expected_pairs = (
        (('outlook', 'humidity'), 0.7916666),
        (('outlook', 'temperature'), 0.7916666),
        (('outlook', 'windy'), 0.5416666),
    )
    for pair, expected in expected_pairs:
        self.assertAlmostEqual(expected, pair_accuracies[pair], 6)
def test_attributes_are_equal(self):
    """Attributes are equal only when their attribute definitions match.

    Two collections built from identical definitions must compare equal,
    while a collection whose second attribute differs must not.
    """
    attrs = a.Attributes([
        a.Attribute('band', ['dual', 'tri', 'quad'], 0),
        a.Attribute('size', ['big', 'small', 'medium'], 1),
    ])
    same = a.Attributes([
        a.Attribute('band', ['dual', 'tri', 'quad'], 0),
        a.Attribute('size', ['big', 'small', 'medium'], 1),
    ])
    self.assertEqual(attrs, same, 'they should be the same')

    other = a.Attributes([
        a.Attribute('band', ['dual', 'tri', 'quad'], 0),
        a.Attribute('pda', ['y', 'n'], 1),
    ])
    # Compare against the local ``attrs`` built above.  This method never
    # assigns ``self.attrs``, so referring to it either raises
    # AttributeError or checks some unrelated fixture instead of the
    # collection this test is about.
    self.assertNotEqual(attrs, other, 'shouldnt be the same')
def test_empty_freq_dists(self):
    """empty_freq_dists has one entry per attribute.

    The entry looked up by an attribute is expected to have the same
    length as that attribute's list of values.
    """
    first = a.Attribute("first", ['a', 'b', 'c'], 0)
    second = a.Attribute("second", ['d', 'e'], 1)

    freq_dists = a.Attributes([first, second]).empty_freq_dists()

    self.assertEqual(2, len(freq_dists))
    for attribute, expected_size in ((first, 3), (second, 2)):
        self.assertEqual(expected_size, len(freq_dists[attribute]))
def metadata(self, file_path):
    """Read attribute definitions and class values for ``file_path``.

    The lines are fetched through ``__get_lines`` using ``self.NAMES``.
    The processed first line, split on commas, gives the class values.
    Every line that is non-empty after processing and that ``NameItem``
    reports as an attribute becomes an ``a.Attribute``; attributes are
    numbered consecutively from 0 in the order they appear.

    Returns a tuple ``(a.Attributes, klass_values)``.
    """
    lines = self.__get_lines(file_path, self.NAMES)
    klass_values = item.NameItem(lines[0]).processed().split(',')

    attributes = []
    next_index = 0
    for line in lines:
        name_item = item.NameItem(line)
        text = name_item.processed()
        # Skip blank lines and anything that is not an attribute line.
        if len(text) == 0 or not name_item.isAttribute():
            continue
        attributes.append(
            a.Attribute(self.get_name(text), self.get_values(text),
                        next_index))
        next_index += 1

    return (a.Attributes(attributes), klass_values)
def __select_attributes(self, max, selected, others, delta):
    """Greedily move attributes from ``others`` into ``selected``.

    In each round every attribute in ``others`` is tentatively appended to
    ``selected`` and the combination is scored with
    ``avg_accuracy_by_cross_validation``.  The best-scoring attribute (the
    first one on ties) is kept if its accuracy beats the previous best by
    at least ``delta``; otherwise the search stops.

    ``max`` is the accuracy to beat in the first round.  ``selected`` and
    ``others`` are both modified in place.

    Returns ``selected``.
    """
    accuracy_to_beat = max
    while not (others is None or len(others) == 0):
        fold = self.get_fold()
        datasets = self.training.cross_validation_datasets(fold)

        best_accuracy, best_attribute = -1, None
        for candidate in others:
            # Try the candidate together with what is already selected,
            # then take it back out again.
            selected.append(candidate)
            accuracy = self.avg_accuracy_by_cross_validation(
                datasets, fold, attr.Attributes(selected))
            if accuracy > best_accuracy:
                best_accuracy, best_attribute = accuracy, candidate
            selected.remove(candidate)

        if best_accuracy - accuracy_to_beat < delta:
            # The best candidate does not improve enough; stop growing.
            break

        selected.append(best_attribute)
        others.remove(best_attribute)
        accuracy_to_beat = best_accuracy

    return selected
def __eliminate_attributes(self, max, selected, delta):
    """Greedily drop attributes from ``selected`` while accuracy improves.

    In each round every attribute is removed in turn and the remaining
    ones are scored with ``avg_accuracy_by_cross_validation``.  The
    best-scoring subset (the first one on ties) replaces the current
    selection if its accuracy beats the previous best by at least
    ``delta``; otherwise the search stops.  Nothing is eliminated once one
    or zero attributes remain.

    ``max`` is the accuracy to beat in the first round.

    Returns the ``selected`` list itself when no elimination took place,
    otherwise a new list holding the surviving attributes.
    """
    accuracy_to_beat = max
    current = selected
    while not (current is None or len(current) <= 1):
        fold = self.get_fold()
        datasets = self.training.cross_validation_datasets(fold)

        best_accuracy, best_subset = -1, None
        # Iterate over a snapshot because ``current`` is modified inside
        # the loop.
        for candidate in current[:]:
            current.remove(candidate)
            accuracy = self.avg_accuracy_by_cross_validation(
                datasets, fold, attr.Attributes(current))
            if accuracy > best_accuracy:
                best_accuracy, best_subset = accuracy, current[:]
            current.append(candidate)

        if best_accuracy - accuracy_to_beat < delta:
            # Dropping any single attribute does not improve enough.
            break

        accuracy_to_beat, current = best_accuracy, best_subset

    return current
def test_backward_select(self):
    """Backward elimination on the minigolf weather data keeps three attributes.

    The elimination is run on the loaded datasets, after which the
    training, test and gold data are each expected to carry three
    attributes.  The accuracies behind that outcome are then recomputed on
    deep copies taken before the elimination ran.
    """
    base_path = datasetsDir(self) + 'minigolf' + SEP + 'weather'
    _training = training(base_path)
    _attributes, _klass = metadata(base_path)
    _test = test(base_path)
    _gold = gold(base_path)

    # Copies taken before the elimination, used for the independent
    # re-computation further down.
    pristine_training = copy.deepcopy(_training)
    pristine_attributes = copy.deepcopy(_attributes)

    feat_sel = fs.FeatureSelection(_training, _attributes, _klass, _test,
                                   _gold, ['1R', '4', '0.1'])
    feat_sel.backward_elimination()

    self.assertEqual(3, len(_attributes))
    for dataset in (_training, _test, _gold):
        self.verify_number_of_attributes(dataset, 3)

    # Verification, level 0: accuracy with every attribute present.
    baseline = feat_sel.avg_accuracy_by_cross_validation(
        pristine_training.cross_validation_datasets(4), 4,
        pristine_attributes)
    self.assertAlmostEqual(0.5416666, baseline, 6)

    # Level 1: leave out one attribute at a time.
    cv_datasets = pristine_training.cross_validation_datasets(4)
    triple_accuracies = {}
    for left_out in pristine_attributes:
        remaining = pristine_attributes[:]
        remaining.remove(left_out)
        accuracy = feat_sel.avg_accuracy_by_cross_validation(
            cv_datasets, 4, attr.Attributes(remaining))
        triple = (remaining[0].name, remaining[1].name, remaining[2].name)
        triple_accuracies[triple] = accuracy

    # Full-precision values:
    #   (outlook, humidity, windy)       0.54166666666666663
    #   (outlook, temperature, windy)    0.54166666666666663
    #   (temperature, humidity, windy)   0.29166666666666663
    #   (outlook, temperature, humidity) 0.79166666666666663
    expected_triples = (
        (('outlook', 'humidity', 'windy'), 0.5416666),
        (('outlook', 'temperature', 'windy'), 0.5416666),
        (('temperature', 'humidity', 'windy'), 0.2916666),
        (('outlook', 'temperature', 'humidity'), 0.7916666),
    )
    for triple, expected in expected_triples:
        self.assertAlmostEqual(expected, triple_accuracies[triple], 6)

    # Level 2: (outlook, temperature, humidity) is selected, so drop
    # 'windy' and leave out one of the remaining attributes at a time.
    pair_accuracies = {}
    for candidate in pristine_attributes:
        if candidate.name == 'windy':
            windy = candidate
    pristine_attributes.remove(windy)
    for left_out in pristine_attributes:
        remaining = pristine_attributes[:]
        remaining.remove(left_out)
        accuracy = feat_sel.avg_accuracy_by_cross_validation(
            cv_datasets, 4, attr.Attributes(remaining))
        pair = (remaining[0].name, remaining[1].name)
        pair_accuracies[pair] = accuracy

    expected_pairs = (
        (('outlook', 'humidity'), 0.7916666),
        (('outlook', 'temperature'), 0.7916666),
        (('temperature', 'humidity'), 0.4166666),
    )
    for pair, expected in expected_pairs:
        self.assertAlmostEqual(expected, pair_accuracies[pair], 6)
def test_to_string(self):
    """str() of an Attributes collection lists each attribute with its values and index."""
    first = a.Attribute("first", ['a', 'b', 'c'], 0)
    second = a.Attribute("second", ['d', 'e'], 1)

    rendered = str(a.Attributes([first, second]))

    self.assertEqual('[first:[a,b,c] index:0, second:[d,e] index:1]',
                     rendered)