def testTree():
    """Smoke-run the split finders on a small fixed DataFrame.

    Prints the result of ``tree.find_best_label_new`` called with columns
    'A' and 'B', then the result of
    ``tree.find_best_feature_and_label_for_split`` called with 'B' and
    ``regression=True``. Nothing is asserted or returned.
    """
    # NOTE(review): never read below; presumably the expected best
    # (feature, label) pair for this fixture -- confirm.
    best = ('A', 5)

    columns = {
        'A': [1,2,6,7,8,9,3,4,5],
        'C': [1,0,1,0,1,0,1,0,1],
        'B': [1,1,0,0,0,0,1,1,1],
    }
    frame = pd.DataFrame(columns)

    # Single-argument parenthesised print behaves the same as the bare
    # print statement and also parses under Python 3.
    print(tree.find_best_label_new(frame, 'A', 'B'))
    print('best feature and label')
    print(tree.find_best_feature_and_label_for_split(frame, 'B', regression=True))
def branch_node(node, df, threshold, Y, regression=False):
    """
    Recursively grow the tree below ``node``, or turn ``node`` into a leaf.

    The best (feature, label) split for the node's data is looked up with
    ``mytree.find_best_feature_and_label_for_split``. If a feature is found
    and ``node.level`` is below ``threshold``, the node is split and both
    children are branched recursively. Otherwise the node becomes a leaf
    via ``node.leaf(predict, error)``.

    :param node: Node object defined in Stats
    :param df: The dataframe being used by the tree
    :param threshold: max branching depth
    :param Y: Feature to predict
    :param regression: when true, a leaf predicts the mean of ``Y`` over the
        node's data and its error comes from ``mystats.compute_MSE``; when
        false, a leaf predicts 1 if ``mystats.binary_probability`` is at
        least .5 (else 0) and its error comes from ``mystats.binary_error``
    :return: void
    """
    print('Branching Level : ' + str(node.level))
    data = node.get_node_data(df)
    print('Length of data ' + str(len(data)) + ' len df: ' + str(len(df)))

    feature, label = mytree.find_best_feature_and_label_for_split(data, Y, regression)
    print('feature: {} label: {}'.format(feature, label))

    # Short-circuit keeps node.level untouched when no feature was found.
    should_split = feature is not None and node.level < threshold
    if should_split:
        # The split is computed against the full frame's column, not the
        # node-local data.
        left_part, right_part = node.split(feature, df[feature], label)
        print(' A : {} B: {}'.format(sum(left_part), sum(right_part)))
        node.add_left(left_part)
        node.add_right(right_part)
        branch_node(node.left, df, threshold, Y, regression)
        branch_node(node.right, df, threshold, Y, regression)
        return

    # No usable split or depth limit reached: finalise this node as a leaf.
    if regression:
        print(str(feature) + 'is fueaturea ' + str(label) + str(node.presence))
        predict = float(sum(data[Y])) / len(data[Y])
        error = mystats.compute_MSE(predict, list(data[Y]))
    else:
        prob = mystats.binary_probability(data, Y)
        print('PROBABILITY ' + str(prob))
        predict = 1 if prob >= .5 else 0
        error = mystats.binary_error(data, Y, predict)

    node.leaf(predict, error)