def __init__(self, training_set):
    """
    Builds a new decision tree classifier.

    The ID3 tree is grown eagerly here, at construction time, and a
    plotter is prepared for later visualization via plot().

    Args:
        training_set: model.DataSet
            The training data to use when building the decision tree.
    """
    self.training_set = training_set
    # Grow the tree once up front; both the classifier and the plotter
    # share this same structure.
    tree = id3.build_tree(training_set)
    self._tree = tree
    self._plotter = MatplotlibAnnotationTreePlotter(tree)
else: ret += bprint_aux(child_node, index + 2) return ret import pandas as pd has_ids = True has_header = True has_labels = True delimiter = "," header = 0 if has_header else None id_col = 0 if has_ids else None # dataframe = pd.read_csv("./test/datasets/play_tennis.data", index_col=id_col, header=header,delimiter=delimiter) dataframe = pd.read_csv("hope.csv", index_col=id_col, header=header, delimiter=delimiter) labels = dataframe.pop(dataframe.columns[-1]) if has_labels else None dataset = model.DataSet(dataframe, labels=labels) tree = id3.build_tree(dataset) printbonito = bprint(tree) print(printbonito) txt = open("output.txt", mode="w", encoding="UTF-8") txt.write(printbonito) txt.close() tp = MatplotlibAnnotationTreePlotter(tree) tp.plot()
class DecisionTree(AbstractClassifier):
    """
    Decision tree classifier.

    Builds a tree which is like a flow chart. It allows a decision to be
    reached by checking the values for various features and following the
    appropriate branches until a destination is reached. In addition to
    being useful as a classifier, the structure of the decision tree can
    lend insight into the data.

    NOTE(review): this class appears twice in the file; the two copies
    should be deduplicated.
    """

    def __init__(self, training_set):
        """
        Constructs a new decision tree.

        Args:
            training_set: model.DataSet
                The training data to use when building the decision tree.
        """
        self.training_set = training_set
        self._tree = id3.build_tree(training_set)
        self._plotter = MatplotlibAnnotationTreePlotter(self._tree)

    def _classify(self, sample):
        """
        Predicts a sample's classification by walking the tree from the
        root, following the branch matching the sample's value for each
        decision node's feature, until a leaf is reached.

        Args:
            sample: The sample or observation to be classified.

        Returns:
            The sample's classification.
        """
        current = self._tree.get_root_node()
        while True:
            if current.is_leaf():
                return current.get_value()
            # The feature lookup is deliberately outside the try: a sample
            # missing the feature entirely should raise, not be guessed at.
            branch_value = sample[current.get_value()]
            try:
                current = current.get_child(branch_value)
            except KeyError:
                # Feature value never seen during training.
                return self._handle_value_not_trained_for()

    def _handle_value_not_trained_for(self):
        """
        Handles the case where a sample has a value for a feature which
        was not seen in the training set and therefore is not accounted
        for in the tree.

        Current strategy is to just return the most common label in the
        training data set. It might be better to narrow this down to the
        most common among samples that would reach the node at which the
        unrecognized value was found.

        Returns:
            label: The best guess at the label.
        """
        labels = self.training_set.get_labels()
        return collection_utils.get_most_common(labels)

    def plot(self):
        """
        Generates a plot of the decision tree to visualize its structure.

        Returns:
            void
        """
        self._plotter.plot()
class DecisionTree(AbstractClassifier):
    """
    Decision tree classifier.

    Builds a tree which is like a flow chart. It allows a decision to be
    reached by checking the values for various features and following the
    appropriate branches until a destination is reached. In addition to
    being useful as a classifier, the structure of the decision tree can
    lend insight into the data.

    NOTE(review): duplicate of an identical class earlier in this file;
    one of the two copies should be removed.
    """

    def __init__(self, training_set):
        """
        Constructs a new decision tree.

        Args:
            training_set: model.DataSet
                The training data to use when building the decision tree.
        """
        self.training_set = training_set
        self._tree = id3.build_tree(training_set)
        self._plotter = MatplotlibAnnotationTreePlotter(self._tree)

    def _classify(self, sample):
        """
        Predicts a sample's classification based on the decision tree
        that was built from the training data.

        Args:
            sample: The sample or observation to be classified.

        Returns:
            The sample's classification.
        """
        cursor = self._tree.get_root_node()
        while not cursor.is_leaf():
            split_feature = cursor.get_value()
            # Outside the try on purpose: a sample lacking the feature
            # itself should raise KeyError to the caller.
            branch = sample[split_feature]
            try:
                cursor = cursor.get_child(branch)
            except KeyError:
                # The branch value was never observed in training.
                return self._handle_value_not_trained_for()
        return cursor.get_value()

    def _handle_value_not_trained_for(self):
        """
        Handles the case where a sample has a value for a feature which
        was not seen in the training set and therefore is not accounted
        for in the tree.

        Current strategy is to just return the most common label in the
        training data set. It might be better to narrow this down to the
        most common among samples that would reach the node at which the
        unrecognized value was found.

        Returns:
            label: The best guess at the label.
        """
        return collection_utils.get_most_common(
            self.training_set.get_labels())

    def plot(self):
        """
        Generates a plot of the decision tree to visualize its structure.

        Returns:
            void
        """
        self._plotter.plot()