def main():
    """Build an ID3 tree from ``example_training.csv`` and update it twice.

    The first CSV row supplies the attribute names and every following row is
    one training example.  After the initial tree is printed, two extra
    examples are fed through ``DecisionTree.updateTree`` and the tree is
    printed again after each update.

    Raises:
        IndexError: if the training file is empty (no header row).
    """
    target = "action"

    # Read all rows up front; the context manager closes the handle, which
    # the previous version leaked.
    with open('example_training.csv') as training_file:
        rows = [line.strip("\r\n").split(',') for line in training_file]

    attributes = rows[0]
    data = rows[1:]
    db = createDatabase(attributes, data)

    # Run ID3
    # Note: we can remove the target attribute from the attributes list
    tree = DecisionTree.makeTree(db, attributes, target, 0)
    print("generated tree")
    tree.accept(DecisionNode.PrintTreeVisitor())

    print('adding new examples')
    new_examples = (
        ['hurt', 'exposed', 'with_ammo', 'defend'],
        ['healthy', 'exposed', 'with_ammo', 'run'],
    )
    for example in new_examples:
        print(example)
        tree = DecisionTree.updateTree(tree, db, example, attributes, target)
        tree.accept(DecisionNode.PrintTreeVisitor())
def AdaBoost(data, labels, test_data, test_labels, values, T, printError=False):
    """Train ``T`` decision stumps with AdaBoost.

    Args:
        data: training examples, passed through to ``DecisionTree``.
        labels: training labels; converted with ``getIntLabels`` for the
            weighted-error computation.
        test_data: held-out examples, only used when ``printError`` is true.
        test_labels: labels for ``test_data``.
        values: forwarded unchanged to ``DecisionTree.getDecisionStump``.
        T: number of boosting rounds.
        printError: when true, print ``(iteration, train_error, test_error)``
            each round.

    Returns:
        ``(h_ts, votes)``: the list of stumps and the list of their votes,
        in training order.

    Raises:
        ZeroDivisionError: if ``data`` is empty.
    """
    n_samples = len(data)
    # Step 1: uniform initial weights.  ``1.0`` forces true division so the
    # weights are not all zero under Python 2 integer division.
    weights = np.array([1.0 / n_samples] * n_samples)
    int_labels = getIntLabels(labels)
    int_test_labels = getIntLabels(test_labels)

    # A stump with error exactly 0 or 1 would give an infinite vote and turn
    # the re-normalised weights into NaN; keep the error strictly inside (0, 1).
    tiny = np.finfo(float).eps

    # Step 2: one stump per boosting round.
    h_ts = []
    votes = []
    for iteration in range(T):
        h_t = DecisionTree.getDecisionStump(data, labels, values, weights)
        h_ts.append(h_t)

        _, predictions = DecisionTree.getErrorAndPredictions(h_t, data, labels)
        summa = 0
        for index in range(len(predictions)):
            summa += weights[index] * predictions[index] * int_labels[index]
        e_t = .5 - (.5 * summa)

        if printError:
            _, test_predictions = DecisionTree.getErrorAndPredictions(
                h_t, test_data, test_labels)
            test_weight = 1.0 / len(test_labels)
            summa = 0
            for index in range(len(test_predictions)):
                summa += (test_weight * test_predictions[index]
                          * int_test_labels[index])
            test_error = .5 - (.5 * summa)
            print(iteration, e_t, test_error)

        # Compute the stump's vote from the clamped error.
        safe_error = min(max(e_t, tiny), 1 - tiny)
        vote_t = 0.5 * np.log((1 - safe_error) / safe_error)
        votes.append(vote_t)

        # Re-weight the examples and normalise so the weights sum to one.
        unnormalised = weights * np.exp(-vote_t * (int_labels * predictions))
        weights = unnormalised / np.sum(unnormalised)

    # Step 3: the final hypothesis is the set of stumps plus their votes.
    return h_ts, votes
def trainNN(db, params):
    """Train a 1-, 2- or 3-hidden-layer network and optionally extract rules.

    Args:
        db: list of examples; ``db[0]`` must support ``len`` and ``.keys()``.
            The list is shuffled in place before training.
        params: configuration mapping.  Keys read here: ``num_hidden_layers``,
            ``num_hidden_nodes``, ``learning_rate``, ``output_name``,
            ``batchSize``, ``act_type``, ``rounds``, ``saveModels``,
            ``deepRED`` and ``experiment_name`` (only when saving or
            extracting rules).

    Returns:
        ``(model, rules)`` where ``rules`` is ``None`` unless
        ``params['deepRED'] == True``.

    Raises:
        ValueError: if ``params['num_hidden_layers']`` is not 1, 2 or 3.
            Previously this surfaced later as an unbound ``model`` variable.
    """
    layers = params['num_hidden_layers']
    if layers not in (1, 2, 3):
        raise ValueError(
            "num_hidden_layers must be 1, 2 or 3, got %r" % (layers,))

    plot = False
    random.shuffle(db)
    train_set = db
    num_input_nodes = len(db[0]) - 1
    num_output_nodes = 1

    # Pick the implementation matching the requested depth; the three modules
    # are called with identical arguments.
    if layers == 1:
        nn = NN1
    elif layers == 2:
        nn = NN2
    else:
        nn = NN3

    model = nn.createModel(num_input_nodes, layers,
                           params['num_hidden_nodes'], num_output_nodes,
                           params['learning_rate'], list(db[0].keys()),
                           params['output_name'])
    model = nn.trainModelMiniBatch(model, train_set, params['batchSize'],
                                   params['output_name'], params['act_type'],
                                   params['rounds'], plot=plot)

    if params['saveModels']:
        # NOTE(review): the export always goes through NN3 regardless of the
        # network depth; kept as-is -- confirm NN3.exportModel handles
        # NN1/NN2 models.
        NN3.exportModel(model, "../models/" + params['experiment_name'] + '/NN/')

    rules = None
    if params['deepRED'] == True:  # noqa: E712 -- keep the exact comparison
        rules = RED.getRules(model, db, params['output_name'], simplify=True)
        # ``with`` guarantees both files are closed even if a writer raises.
        with open('../models/' + params['experiment_name'] + '/NN/NN_tree.txt',
                  "w") as tree_file:
            DT.getTXTReprentation(rules, tree_file)
        with open('../models/' + params['experiment_name'] + '/NN/NN_rules.txt',
                  "w") as rules_file:
            DT.getRuleReprentation(rules, rules_file)

    return model, rules
def classify_dataset_test():
    """Check that a tree built from the iris data labels three known samples."""
    # Build the tree from the dataset file.
    filename = "Dataset/iris.data"
    dataset = DT.Dataset(filename, _delimiter=',')
    Tree = DT.DecisionTree(dataset)

    # One measurement vector per expected species, in matching order.
    measurements = (
        [5.4, 3.9, 1.3, 0.4],
        [6.3, 2.5, 4.9, 1.5],
        [6.5, 3.0, 5.5, 1.8],
    )
    expected = (b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica')

    # Examples are byte strings of width 15, as in the original test.
    samples = [np.array(values).astype('S15') for values in measurements]

    # Classify everything first, then verify.
    predicted = [Tree.classify(sample) for sample in samples]
    for got, want in zip(predicted, expected):
        eq_(got, want)
def setUp(self):
    """Fit two RSS regression trees (leaf sizes 1 and 2) on shared toy data.

    Stores the leaf sizes, the input data and both resulting tree objects on
    the test case for use by the individual tests.
    """
    # Simple input data shared by both trees.
    self.x_data_1 = np.array([[1, 4], [6, 7], [1, 4], [2, 3], [4, 5],
                              [1, 5], [3, 6], [1, 4], [3, 1], [8, 9]])
    self.y_data_1 = np.array([5, 6, 5, 1, 6, 7, 8, 6, 4, 0])

    # Leaf-size limits under test.
    self.leaf_terminate_1 = 1
    self.leaf_terminate_2 = 2

    tree_leaf_1 = DT.RegressionDecisionTree(
        split_type='rss', leaf_terminate=self.leaf_terminate_1)
    tree_leaf_2 = DT.RegressionDecisionTree(
        split_type='rss', leaf_terminate=self.leaf_terminate_2)

    # Train both trees on the same data.
    tree_leaf_1.fit(self.x_data_1, self.y_data_1)
    tree_leaf_2.fit(self.x_data_1, self.y_data_1)

    # Keep only the result objects.
    self.result_tree_1 = tree_leaf_1.get_tree()
    self.result_tree_2 = tree_leaf_2.get_tree()
def setUp(self):
    """Fit a gain-ratio tree and a gini tree, both with pure-leaf termination.

    Stores the toy input data and both resulting tree objects on the test
    case for use by the individual tests.
    """
    # Simple input data: the second feature takes only two values.
    self.x_data_1 = np.array([[1, 1], [2, 1], [3, 1], [4, 1], [5, 1],
                              [6, 2], [7, 2], [8, 2], [9, 2], [10, 2]])
    self.y_data_1 = np.array([0, 1, 1, 0, 1, 0, 1, 1, 0, 1])

    # Same termination criterion, different split criteria.
    gain_ratio_tree = DT.ClassificationDecisionTree(
        split_type='gain_ratio',
        terminate='pure',
    )
    gini_tree = DT.ClassificationDecisionTree(
        split_type='gini',
        terminate='pure',
    )

    # Train both trees on the same data.
    gain_ratio_tree.fit(self.x_data_1, self.y_data_1)
    gini_tree.fit(self.x_data_1, self.y_data_1)

    # Keep only the result objects.
    self.result_tree_1 = gain_ratio_tree.get_tree()
    self.result_tree_2 = gini_tree.get_tree()
def setUp(self):
    """Fit three gini classification trees on shared toy data.

    ``dt_1`` and ``dt_2`` stop at leaf sizes 1 and 2; ``dt_3_pure`` stops
    when a leaf is pure.  The fitted trees, the leaf sizes and the data are
    stored on the test case.
    """
    # Simple input data shared by all three trees.
    self.x_data_1 = np.array([[1, 4], [6, 7], [1, 4], [2, 3], [4, 5],
                              [1, 5], [3, 6], [1, 4], [3, 1], [8, 9]])
    self.y_data_1 = np.array([0, 1, 1, 0, 1, 0, 1, 1, 0, 1])

    # Leaf-size limits under test.
    self.leaf_terminate_1 = 1
    self.leaf_terminate_2 = 2

    self.dt_1 = DT.ClassificationDecisionTree(
        split_type='gini', terminate='leaf',
        leaf_terminate=self.leaf_terminate_1)
    self.dt_2 = DT.ClassificationDecisionTree(
        split_type='gini', terminate='leaf',
        leaf_terminate=self.leaf_terminate_2)
    # Pure-leaf termination needs no leaf size.
    self.dt_3_pure = DT.ClassificationDecisionTree(
        split_type='gini',
        terminate='pure',
    )

    # Train in the same order as the trees were created.
    for tree in (self.dt_1, self.dt_2, self.dt_3_pure):
        tree.fit(self.x_data_1, self.y_data_1)
def decisionTreeLearning(examples, attributes, parents_examples=()):
    """Recursively build a threshold-split decision tree.

    Args:
        examples: examples reaching this node.
        attributes: attributes still available for splitting.
        parents_examples: examples of the parent node, used for the majority
            vote when ``examples`` is empty.

    Returns:
        Whatever ``pluralityValue`` returns, a ``DecisionTree.Leaf``, or a
        ``DecisionTree.DecisionTree`` node with both branches attached.
    """
    # No examples left: fall back to the parent's most frequent class.
    if len(examples) == 0:
        return pluralityValue(parents_examples)

    # Every example agrees: the class of the first one is the answer.
    if allSameClass(examples):
        return DecisionTree.Leaf(examples[0][dataset.target])

    # Nothing left to split on: most frequent class among the examples.
    if len(attributes) == 0:
        return pluralityValue(examples)

    best_attr, threshold = chooseAttribute(attributes, examples)
    node = DecisionTree.DecisionTree(best_attr, threshold,
                                     dataset.attrnames[best_attr])

    # Partition the examples around the threshold.
    below, above = splittingOnThreshold(best_attr, threshold, examples)

    # Recurse into each side with the chosen attribute removed.
    left_subtree = decisionTreeLearning(
        below, removeAttr(best_attr, attributes), examples)
    right_subtree = decisionTreeLearning(
        above, removeAttr(best_attr, attributes), examples)

    node.addLeft(threshold, left_subtree)
    node.addRight(threshold, right_subtree)
    return node
def decisionTreeLearning(examples, attributes, parents_examples=()):
    """Recursively build a threshold-split decision tree.

    The module-level flag ``ce`` selects the attribute chooser:
    ``chooseAttribute`` when it equals 0, ``chooseAttribute2`` otherwise.

    Args:
        examples: examples reaching this node.
        attributes: attributes still available for splitting.
        parents_examples: examples of the parent node, used for the majority
            vote when ``examples`` is empty.

    Returns:
        Whatever ``pluralityValue`` returns, a ``DecisionTree.Leaf``, or a
        ``DecisionTree.DecisionTree`` node with both branches attached.
    """
    # No examples left: return the parent's most frequent classification.
    if len(examples) == 0:
        return pluralityValue(parents_examples)

    # All examples share one class: return the class of the first example.
    if allSameClass(examples):
        return DecisionTree.Leaf(examples[0][dataset.target])

    # No attributes left: return the most frequent classification.
    if len(attributes) == 0:
        return pluralityValue(examples)

    chooser = chooseAttribute if ce == 0 else chooseAttribute2
    best_attr, threshold = chooser(attributes, examples)
    node = DecisionTree.DecisionTree(best_attr, threshold,
                                     dataset.attrnames[best_attr])

    # Separate the examples based on the threshold.
    below, above = splittingOnThreshold(best_attr, threshold, examples)

    # Recurse on each side and attach the results to the tree.
    left_subtree = decisionTreeLearning(
        below, removeAttr(best_attr, attributes), examples)
    right_subtree = decisionTreeLearning(
        above, removeAttr(best_attr, attributes), examples)

    node.addLeft(threshold, left_subtree)
    node.addRight(threshold, right_subtree)
    return node
def predict(filename):
    """Invoke the decision tree.

    Builds a tree from ``filename`` and then loops forever: each item taken
    from the module-level ``domains`` queue (30 s timeout per ``get``) is
    classified and ``[domain, prediction]`` is put on ``predicts``.
    """
    built = DecisionTree.build_decision_tree(filename)
    tree, _score = built  # the score is not used here

    while True:
        domain = domains.get(timeout=30)
        label = DecisionTree.predict(domain, tree)
        predicts.put([domain, label])
def main(argv):
    """Train a random forest on ``argv[1]`` and test it on ``argv[2]``.

    Prints a message and returns without doing anything unless ``argv`` has
    exactly three entries (program name plus two file paths).
    """
    if len(argv) != 3:
        print("incorrect input format")
        return

    train_file, test_file = argv[1], argv[2]

    training_rows = DT.load_data(train_file)
    forest = train_random_forest(training_rows)

    testing_rows = DT.load_data(test_file)
    test_random_forest(testing_rows, forest)
def binUsers(syn_pop_file):
    """Classify ``syn_pop_file`` and load the binned result as integer rows.

    ``DecisionTree.classify`` is called with ``syn_pop_file`` and the
    module-level path ``bnd_syn_pop``; that path is then read back as CSV.

    Returns:
        A list with one dict per CSV row, every value converted with ``int``.
    """
    bnd_pop_file = bnd_syn_pop
    DecisionTree.classify(syn_pop_file, bnd_pop_file)

    records = []
    with open(bnd_pop_file, 'r') as f:
        reader = csv.DictReader(f, skipinitialspace=True)
        for row in reader:
            records.append({k: int(v) for k, v in row.items()})
    return records
def fit(self, X, Y):
    """Fit one ``DecisionTree`` per bootstrap resample of ``(X, Y)``.

    Args:
        X: samples; must support ``len``, ``X[0]`` and index-array selection.
        Y: targets, indexable with the same index array as ``X``.

    Each fitted tree is appended to ``self.models``.
    """
    N = len(X)
    # ``np.int`` was removed from NumPy; the builtin ``int`` truncates the
    # same way.  ``d`` is half the feature count, passed to every tree.
    d = int(len(X[0]) * 0.5)

    # NOTE(review): the number of trees equals the number of samples --
    # confirm this is intended rather than a separate ensemble size.
    for i in range(N):
        print("Progress:", i, "of ", N)
        # Bootstrap: draw N row indices with replacement.
        sel = np.random.choice(N, size=N, replace=True)
        Xb, Yb = X[sel], Y[sel]

        model = DecisionTree()
        model.fit(Xb, Yb, d)
        self.models.append(model)
def _rec_build_random_tree(training_data_cut, rec_count):
    """Recursively build one random tree from a cut of the training data.

    Args:
        training_data_cut: rows reaching this node.  Feature columns start at
            offset 2 (the first two columns are RECORD and CLASS).
        rec_count: recursion depth of the caller; incremented here and
            compared against ``_rec_limit``.

    Returns:
        A ``DecisionTree.Tree`` whose branches are ``DecisionTree.Leaf``
        objects or further trees.
    """
    depth = rec_count + 1

    # ((feature_name, feature_index),
    #  (fc_has_vote, sc_has_vote),
    #  (fc_has_not_vote, sc_has_not_vote))
    feature_and_votes = _find_best_sampled_feature(training_data_cut)
    feature_name_index = feature_and_votes[0]
    has_votes = (feature_and_votes[1][0], feature_and_votes[1][1])
    has_not_votes = (feature_and_votes[2][0], feature_and_votes[2][1])

    cut_length = len(training_data_cut)

    def grow_branch(votes, keep_rows_with_feature):
        """Return a Leaf for ``votes`` or recurse into the matching rows."""
        first_class_votes, second_class_votes = votes
        # Leaf when the cut is small, the vote is unanimous, or the
        # recursion limit has been passed.
        if (cut_length < _leaf_threshold
                or first_class_votes == 0
                or second_class_votes == 0
                or depth > _rec_limit):
            return DecisionTree.Leaf((first_class_votes, second_class_votes))

        # add 2 to feature index to skip RECORD and CLASS columns
        column = feature_name_index[1] + 2
        subset = [row for row in training_data_cut
                  if bool(row[column]) == keep_rows_with_feature]
        return _rec_build_random_tree(subset, depth)

    # Left branch holds rows that have the feature, right branch the rest.
    left_branch = grow_branch(has_votes, True)
    right_branch = grow_branch(has_not_votes, False)

    # Combine the splitting feature (name, index) with both branches.
    return DecisionTree.Tree(feature_name_index, left_branch, right_branch)
def buildtree(x, y, samples, min_node=1, result_cur=None):
    """Recursively grow a decision tree over the rows listed in ``samples``.

    Args:
        x: feature matrix (converted to an ndarray; ``x.shape[1]`` is the
            number of features).
        y: targets.  A 2-D ``y`` is treated as the old rank form and each row
            is converted with ``rankO2New``.
        samples: indices of the rows belonging to this node.
        min_node: nodes with at most this many samples become leaves.
        result_cur: precomputed ``MM(y[samples])`` for this node, or ``None``
            to compute it here.  Index 0 is used as the node's variance-like
            score and index 1 as the node's result.

    Returns:
        A ``DTme.decisionnode``: empty when ``samples`` is empty, a leaf when
        no split improves the score, otherwise an internal node.
    """
    # ``asarray`` leaves ndarrays untouched and converts everything else.
    x = np.asarray(x)
    y = np.asarray(y)
    samples = np.asarray(samples)

    if len(samples) == 0:
        return DTme.decisionnode()

    # Transform old rank form to the new rank form.
    if y.ndim == 2:
        # A list (not a lazy ``map`` object) so np.array builds a real array
        # on Python 3 as well as Python 2.
        y = np.array([rankO2New(row) for row in y])

    if result_cur is None:
        result_cur = MM(y[samples])

    if len(samples) <= min_node:
        return DTme.decisionnode(result=result_cur[1])

    # Find the best split over all features.
    best_gain = 0.0
    best_split = []
    best_sets = []
    best_sets_result = []
    N_feature = x.shape[1]

    start = datetime.now()  # timing diagnostics
    for feature in range(N_feature):
        # n log n selection
        min_var, split, sets, sets_result = bestSplit(x, y, samples, feature)
        if min_var is None:
            continue
        gain = result_cur[0] - min_var
        # Only accept splits that leave both sides non-empty.
        if gain > best_gain and len(sets[0]) * len(sets[1]) > 0:
            best_gain = gain
            best_split = split
            best_sets = sets
            best_sets_result = sets_result
    duration = datetime.now() - start

    # Formatted so the output is the same text on Python 2 and Python 3.
    print("Nsamps:  %s" % len(samples))
    print("duration:  %s" % duration.total_seconds())

    if best_gain > 0:
        tb = buildtree(x, y, best_sets[0], min_node=min_node,
                       result_cur=best_sets_result[0])
        fb = buildtree(x, y, best_sets[1], min_node=min_node,
                       result_cur=best_sets_result[1])
        return DTme.decisionnode(feature=best_split[0], value=best_split[1],
                                 result=result_cur[1], tb=tb, fb=fb,
                                 gain=(tb.gain + fb.gain + best_gain),
                                 size_subtree=(tb.size + fb.size))
    return DTme.decisionnode(result=result_cur[1])
def predictInstance(input):
    """Return the majority label predicted for one instance by many trees.

    Args:
        input: pair-like object; ``input[0]`` is the instance and
            ``input[1]`` the iterable of trees.

    Returns:
        The key with the highest count in
        ``DecisionTree.labelCounts(predictions)``.
    """
    instance, trees = input[0], input[1]

    def vote(tree):
        return DecisionTree.predict(instance, tree)

    ballots = map(vote, trees)
    tally = DecisionTree.labelCounts(ballots)
    return max(tally, key=tally.get)
def createPlot(inTree):
    """Draw ``inTree`` in matplotlib figure 1 and show it.

    Layout state is stored as attributes on the ``plotTree`` function
    (``totalW``, ``totalD``, ``xOff``, ``yOff``) and the axes on
    ``createPlot.ax1``.
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()

    # Frameless axes without tick marks.
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])

    leaf_count = float(DecisionTree.getNumLeafs(inTree))
    tree_depth = float(DecisionTree.getTreeDepth(inTree))
    plotTree.totalW = leaf_count
    plotTree.totalD = tree_depth

    # Initial drawing offsets read by plotTree.
    plotTree.xOff = -0.5 / leaf_count
    plotTree.yOff = -1.0

    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
def main():
    """Build a decision tree from the training ARFF file and print it."""
    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename, True)
    # The test partition is read but not used further in this function.
    _test_partition = util.read_arff(opts.test_filename, False)

    # ``depth`` is optional on the parsed options; ``None`` when absent.
    depth = vars(opts).get("depth")

    # Create the tree object, then construct the tree from the root.
    tree = DecisionTree(train_partition, depth)
    rootnode = tree.constructsubtree(train_partition, depth, 0)

    # Print the text representation of the tree.
    tree.printtree(rootnode)
def fit(self, data, label):
    """Fit ``self.num_trees`` trees on bootstrap rows and random feature subsets.

    Args:
        data: 2-D array; ``data.shape`` gives (samples, features).
        label: targets, indexable with an array of row indices.

    Each ``(feature_indices, fitted_tree)`` pair is added to ``self.trees``.
    """
    n_rows, n_columns = data.shape

    for _ in range(self.num_trees):
        # Rows are drawn with replacement, features without.
        row_idx = np.random.randint(0, n_rows, n_rows)
        col_idx = np.random.choice(n_columns, self.num_features,
                                   replace=False)

        bootstrap = data[row_idx, :]
        subset = bootstrap[:, col_idx]
        subset_labels = label[row_idx]

        learner = DecisionTree(self.max_depth, self.min_obs)
        learner.fit(subset, subset_labels)

        # Remember which columns the tree saw, next to the tree itself.
        self.trees += [(col_idx, learner)]
def train(self, records, attributes):
    """Create a bootstrap subsample per tree and train one tree on each.

    Args:
        records: training records handed to ``self.bootstrap``.
        attributes: attribute list; a selection made by
            ``self.shuffle_attributes`` is used for each tree.

    ``self.tree_num`` is coerced to ``int``.  Subsamples are appended to
    ``self.forest`` and the trained trees to ``self.get_tree``.
    """
    self.tree_num = int(self.tree_num)
    for _ in range(self.tree_num):
        self.forest.append(self.bootstrap(records))

    # Floor division keeps ``half`` an integer on Python 3 as well; plain
    # ``/`` would hand a float to shuffle_attributes there.
    half = len(attributes) // 2

    for index, sample in enumerate(self.forest, 1):
        dt = DecisionTree()
        print("Create TREE %d\n" % (index))
        # NOTE(review): ``attributes`` is rebound every iteration, so each
        # tree selects from the previous tree's selection -- confirm whether
        # the full attribute list was meant to be reused instead.
        attributes = self.shuffle_attributes(attributes, half)
        dt.train_random_forest(sample, attributes)
        self.get_tree.append(dt)
def main():
    """Train an ID3 tree on ``sys.argv[1]`` and report accuracy on ``sys.argv[2]``.

    The first test instance is printed before and after ``bin_quartile`` is
    applied; the actual and predicted class are printed for every instance,
    followed by the overall accuracy.
    """
    training_data, features, classes = ingest_training_data(sys.argv[1])
    dt = DecisionTree(training_data, features, classes)
    dt.build_tree_id3()

    testing_data, testing_features, testing_classes = ingest_training_data(
        sys.argv[2])
    print(testing_data[0])
    bin_quartile(testing_data)
    print(testing_data[0])

    total = len(testing_data)
    hits = 0
    for instance in testing_data:
        actual = instance['class']
        print("Actual class: " + str(actual))
        predicted = dt.classify(instance)
        print("Predicted class: " + str(predicted))
        if predicted == instance['class']:
            hits += 1

    accuracy = float(hits) / total
    print("Accuracy = " + str(accuracy))
def decisionTreeLearning_test():
    """Build a decision tree from the restaurant dataset and return it.

    The returned tree is meant to be fed to the classify test that runs next.
    """
    # Tab-separated dataset at a fixed absolute path.
    filename = "/home/jorge/Documents/2-Programming/AI/DecisionTree/Dataset/restaurant.txt"
    restaurant_data = DT.Dataset(filename, _delimiter='\t')

    # The first branch of this tree is the one shown on page 702.
    return DT.DecisionTree(restaurant_data)
def lensesTest2():
    """Build a tree from ``lenses.txt`` and classify one hand-written sample.

    Prints the Shannon entropy of the data set, the index of the best
    feature to split on, the tree itself and the predicted class of
    ``testVec``.
    """
    # The context manager closes the file; the previous version leaked it.
    with open('lenses.txt', 'r') as lenses_file:
        dataSet = [inst.strip().split('\t') for inst in lenses_file]

    labels = ['age', 'prescript', 'astigmatic', 'tearRate']  # feature labels
    print('香农熵为:', DecisionTree.calcShannonEnt(dataSet))
    print("最优特征索引值:" + str(DecisionTree.chooseBestFeatureToSplit(dataSet)))

    myTree = DecisionTree.createTree(dataSet, labels)
    print(myTree)

    # A separate label list is passed to classify, in the order that matches
    # ``testVec``.
    testVec = ['normal', 'no', 'presbyopic', 'myope']
    finalFeatLabels = ['tearRate', 'astigmatic', 'age', 'prescript']
    result = DecisionTree.classify(myTree, finalFeatLabels, testVec)
    print("预测结果为" + result)
def dt_learn(dataset, attrs, parent_dist=None):
    """Recursively learn a decision tree over discrete attributes.

    Args:
        dataset: examples at this node; each exposes ``.x`` indexable by
            ``attr.index``.
        attrs: attributes still available for splitting.
        parent_dist: distribution of the parent node, used for the majority
            vote when ``dataset`` is empty.  ``None`` only at the root.

    Returns:
        A ``Dt.Leaf`` or a ``Dt.Node`` with one child per value in the chosen
        attribute's ``domain``.

    Raises:
        ValueError: if ``dataset`` is empty and there is no ``parent_dist``
            (an empty training set at the root).  This used to fail with an
            AttributeError on ``None``.
    """
    if not dataset:
        if parent_dist is None:
            raise ValueError(
                "cannot learn a decision tree from an empty dataset")
        return Dt.Leaf(parent_dist.get_most_common())

    dist = Distribution(dataset)
    if dist.is_leaf() or not attrs:
        return Dt.Leaf(dist.get_most_common())

    attr = max_gain(dataset, dist, attrs)
    tree = Dt.Node(attr)
    for v in attr.domain:
        # Examples taking value ``v`` on the chosen attribute.
        dv = [d for d in dataset if d.x[attr.index] == v]
        # Each child gets its own attribute list without the one just used.
        child_attrs = [a for a in attrs if a != attr]
        subtree = dt_learn(dv, child_attrs, dist)
        tree.add_child(subtree, v)
    return tree
def fit(self, X_train, y_train, is_continuous):
    """Fit the ensemble: one decision tree per bootstrap resample.

    Args:
        X_train: training features; must support ``.shape`` and ``.iloc``.
        y_train: training targets; must support ``.iloc``.
        is_continuous: forwarded unchanged to every tree's ``fit``.

    ``self.trees`` is reset and then filled with ``self.n_trees`` trees.
    """
    self.trees = []
    n_rows = X_train.shape[0]

    for _ in range(self.n_trees):
        # Sample row positions with replacement.
        picked = np.random.choice(n_rows, n_rows, replace=True)
        X_boot = X_train.iloc[picked, :]
        y_boot = y_train.iloc[picked]

        learner = DecisionTree(self.criterion, self.max_high)
        learner.fit(X_boot, y_boot, is_continuous)
        self.trees.append(learner)
def metrices(train_data, test_data, classes):
    """Print accuracy and confusion-matrix metrics for both classifiers.

    A decision tree and then a random forest are trained on ``train_data``
    and evaluated on the training set and on the testing set.  The last
    column of every row is taken as the actual label.  Hyper-parameters come
    from the module-level ``train_file``.
    """
    # (heading printed, rows evaluated) -- training set first.
    partitions = (('training set', train_data), ('testing set', test_data))

    print('Decision Tree:')
    max_depth = DecisionTree.parameters(train_file)
    for heading, rows in partitions:
        print(heading)
        actual_labels = [row[-1] for row in rows]
        predict_labels = DecisionTree.decision_tree(
            train_data, rows, max_depth, 1)
        total, accuracy, conf_matrix = DecisionTree.conf_matrix(
            actual_labels, predict_labels, classes)
        print('overall accuracy: {}'.format(accuracy))
        print_metrices(conf_matrix, total)

    print('Random Forest:')
    # Depth limit is the row length minus the label column.
    max_depth = len(train_data[0]) - 1
    F_features, tree_number = RandomForest.parameters(train_file)
    for heading, rows in partitions:
        print(heading)
        actual_labels = [row[-1] for row in rows]
        predict_labels = RandomForest.random_forest(
            train_data, rows, max_depth, 1, 1, tree_number, F_features)
        total, accuracy, conf_matrix = RandomForest.conf_matrix(
            actual_labels, predict_labels, classes)
        print('overall accuracy: {}'.format(accuracy))
        print_metrices(conf_matrix, total)

    return
def generateRules(self, randomCharacterBeingSentSomehow):
    """Build a decision tree from the recorded state-action pairs; return rules.

    The latest MDP group is first collapsed with ``combineIdenticalMDPs``
    and written out with ``writeToList``.  The argument is not used.

    Returns:
        The rules from ``self.decision_tree.getRules()`` after filtering
        through ``self.selectRules``.
    """
    print("generateRules")

    self.mdp_list[-1] = [self.combineIdenticalMDPs(self.mdp_list[-1])]
    self.writeToList(self.mdp_list[-1][-1])
    training_set = self.StateActionPairs

    shapes = ("cube", "prism", "cuboid")
    colours = ("red", "blue", "green")
    sizes = ("small", "medium", "large")

    # Predicate prefix -> allowed values, in attribute-index order.
    predicate_domains = [
        ("has_shape(A,", shapes),
        ("has_colour(A, ", colours),
        ("has_size(A, ", sizes),
        ("has_shape(D, ", shapes),
        ("has_colour(D, ", colours),
        ("has_size(D, ", sizes),
    ]
    attributes = [Attribute(name, position, vals)
                  for position, (name, vals) in enumerate(predicate_domains)]

    self.decision_tree = DecisionTree(attributes, training_set)
    rules = self.selectRules(self.decision_tree.getRules())

    print("")
    print(rules)
    return rules
def main():
    """Train an ID3 tree on a 70/30 split of one CSV file and test it.

    Prints the split sizes, the number of judging attributes in the tree and
    the total / missed / wrong counts on the test split, and draws the tree.
    """
    # Single data file.  To use separate train/test files instead, read them
    # with read_csv(train_file, parse_attributes=True) and read_csv(test_file):
    #   'datasets/VoteTraining.csv',    'datasets/Vote.csv',    'class'
    #   'datasets/WeatherTraining.csv', 'datasets/Weather.csv', 'play'
    #   'datasets/SoybeanTraining.csv', 'datasets/Soybean.csv', 'class'
    file_name = 'datasets/bool_or.csv'
    target_attribute = 'output'

    data, attributes = read_csv(file_name, parse_attributes=True)
    # Split into train and test data.
    train_data, test_data = train_test_split(data, test_size=0.3, shuffle=True)
    print("%d training sample, %d testing samples"
          % (len(train_data), len(test_data)))

    # Drop stray [''] rows (their origin is unknown), in place.
    for partition in (train_data, test_data):
        while [''] in partition:
            partition.remove([''])

    # Run ID3 to generate a tree.
    m_tree = DecisionTree.make_tree(train_data, attributes, target_attribute, 0)
    judging_count, _ = info(m_tree)
    print("Decision Tree contains %d judging attributes" % judging_count)
    DrawTree.DrawTree(m_tree)

    count, miss, err = test(test_data=test_data, tree=m_tree,
                            attributes=attributes,
                            target_attribute=target_attribute)
    print("%d total test cases, %d missed, %d wrong" % (count, miss, err))
def main():
    """Plot the impurity functions of a decision tree and save the figure.

    Draws the node error rate, the cross-entropy function and the Gini index
    on one set of axes, writes the figure to
    ``./DecisionTree_scikit-learn_1.png`` and shows it.
    """
    print("Enter main()")

    tree = DecisionTree.DecisionTree()

    # One figure with a single gridded subplot for all three curves.
    figure = plt.figure()
    axis = plt.subplot(1, 1, 1)
    plt.grid(linestyle='-')

    # Error rate, cross-entropy, Gini index -- in that order.
    tree.plotNodeErrorFunction(figure, axis)
    tree.plotCrossEntropyFunction(figure, axis)
    tree.plotGiniIndexFunction(figure, axis)

    plt.title("purity functions (i=1)")
    plt.legend(loc="upper left")
    # Shrink the plot just enough that labels do not overlap.
    plt.tight_layout()

    # Save, then display.
    plt.savefig("./DecisionTree_scikit-learn_1.png", dpi=300)
    plt.show()

    print("Finish main()")
    return
def test_data_intake_regression(self):
    """Regression tree accepts DataFrame/Series and matrix inputs alike.

    The same targets are fitted once as a pandas Series (with DataFrame
    features) and once as a NumPy matrix; both predictions must match the
    expected values to six decimals.
    """
    # Targets in the supported container types.
    y_data_train = np.array([5, 6, 5, 1, 6, 7, 8, 6, 4, 0])
    y_data_true = np.array([5.3333333, 4, 0])
    y_series_train = pd.Series(y_data_train)
    y_matrix_train = np.asmatrix(y_data_train)

    regress_tree = DT.RegressionDecisionTree(split_type='rss',
                                             leaf_terminate=1)
    expected = list(np.round(y_data_true, 6))

    # (training features, training targets, features to predict on)
    cases = (
        (self.x_df_train, y_series_train, self.x_df_test),
        (self.x_matrix_train, y_matrix_train, self.x_matrix_test),
    )
    for x_train, y_train, x_test in cases:
        regress_tree.fit(x_train, y_train)
        predicted = list(np.round(regress_tree.predict(x_test), 6))
        self.assertEqual(predicted, expected)
def __init__(self, train, n_trees, sample_leaf_limits, sample_ratio, chara_ratio):
    """Initialise the random forest and train it on ``train``.

    Note: the steps can be followed from the procedure in Li Hang's
    "Statistical Learning Methods".

    :param train: training set whose first column is the class label
    :type train: pd.DataFrame
    :param n_trees: number of decision trees in the forest
    :type n_trees: int
    :param sample_leaf_limits: passed unchanged as the second argument of
        every ``DT.DecisionTree``
    :param sample_ratio: lower bound of the fraction of rows drawn for each
        tree; the fraction is uniform in [sample_ratio, 1)
    :type sample_ratio: float
    :param chara_ratio: fraction of the feature columns given to each tree
    :type chara_ratio: float
    """
    self.forest = []
    n_columns = train.shape[1]
    # Number of feature columns (label column excluded) per tree.
    features_per_tree = int(chara_ratio * (n_columns - 1))

    for n in range(n_trees):
        started = time.time()

        # Pick feature columns without replacement; column 0 (the label)
        # is always kept in front.
        chosen = np.random.choice(np.arange(1, n_columns),
                                  features_per_tree, replace=False)
        columns = np.append(0, chosen)
        subset = train.iloc[:, columns]

        # Draw a row fraction in [sample_ratio, 1) and sample that many rows.
        fraction = np.random.random_sample() * (1 - sample_ratio) + sample_ratio
        row_labels = np.random.choice(subset.index,
                                      int(fraction * subset.index.size),
                                      replace=False)
        subset = subset.loc[row_labels]

        self.forest.append(DT.DecisionTree(subset, sample_leaf_limits))

        finished = time.time()
        print('随机森林中的第%d棵树构造成功,耗时%f' % (n, finished - started))
def __init__(self, T=10, M=30, bagging=False):
    """Create a forest of ``T`` untrained decision trees.

    Args:
        T: number of trees; stored as ``self.t``.
        M: stored as ``self.m``.
        bagging: stored as ``self.bagging``.
    """
    self.t = T
    self.m = M
    self.bagging = bagging
    # A real list: on Python 3 ``map`` returns a one-shot iterator, which
    # could be traversed only once and has no len() or indexing.
    self.forest = [DecisionTree() for _ in range(T)]
    self.shape = None
    self.selected_attributes = list()
def main():
    """Build an ID3 tree from ``dt-data.txt`` and print it with its rules.

    The first line of the file holds the attribute names wrapped in
    parentheses; that line and the one after it are dropped before training.

    Raises:
        IndexError: if the file has fewer than two lines.
    """
    # class attribute
    target = "Enjoy"

    # Clean every line (leading/trailing digits, punctuation, all spaces)
    # and split it into fields.  The context manager closes the file, which
    # the previous version leaked.
    with open('dt-data.txt') as data_file:
        data = [
            line.strip("\r\n;:0123456789\t").replace(" ", "").split(',')
            for line in data_file
        ]

    attributes = [x.strip("()") for x in data[0]]

    # Pass only the examples: drop the attribute header and the line
    # that follows it.
    del data[0]
    del data[0]

    # Run ID3
    print("Generated decision tree")
    tree = DecisionTree.makeTree(data, attributes, target, 0)
    pp = pprint.PrettyPrinter(indent=4, depth=14)
    pp.pprint(tree)

    # Generate IF-THEN rules
    print("if - then rules")
    pre = ""
    rulegen(tree, pre)
def evaluate_tree(decisionTree, test_set, all_classes):
    """Classify every test example with one tree and print the performance.

    Examples the tree cannot evaluate (``dt.evaluate_data`` returns ``None``)
    are reported and left out of the metrics.  Binary problems print
    precision, recall and F1; otherwise the multiclass performance object is
    printed.
    """
    print("Evaluating Tree")

    # (true class, predicted class) for every example that was classified.
    outcomes = []
    for t in range(len(test_set)):
        verdict = dt.evaluate_data(test_set[t], decisionTree)
        if string_missing(verdict) if False else verdict is None:
            print(
                "test #" + str(t) +
                " result: Unable to evaluate data, too much repetition on training set"
            )
            continue

        print("test #" + str(t) + " result: " +
              "(Verdadeiro / Classificado) (" + test_set[t]["Class"] + " / " +
              verdict + ")")
        outcomes.append((test_set[t]["Class"], verdict))

    if len(all_classes) != 2:
        perf = performance_multiclass(outcomes, all_classes)
        print("performance_multiclass:")
        print(perf)
        return

    precision, recall, f1 = performance_binary(outcomes, all_classes)
    print("performance_binary:")
    print("precision:", str(precision))
    print("recall:", str(recall))
    print("f1:", str(f1))
def evaluate_forest(forest, test_set, all_classes):
    """Classify every test example with the forest and return its performance.

    Args:
        forest: the trained forest, forwarded to ``dt.evaluate_forest``.
        test_set: indexable collection of examples with a ``"Class"`` entry.
        all_classes: every class label of the problem.

    Returns:
        For two classes, a dict with ``precision``, ``recall`` and
        ``f1Score``; otherwise whatever ``performance_multiclass`` returns.
        Examples for which the forest returns ``None`` are left out.
    """
    # (true class, predicted class) for every example that was classified.
    list_tuples = []
    for t in range(len(test_set)):
        predicted = dt.evaluate_forest(test_set[t], forest, all_classes)
        if predicted is not None:
            list_tuples.append((test_set[t]["Class"], predicted))

    # The per-class vote tally and its summary string that used to be built
    # here were never read or returned, so they have been removed.

    if len(all_classes) == 2:
        precision, recall, f1 = performance_binary(list_tuples, all_classes)
        return {
            "precision": precision,
            "recall": recall,
            "f1Score": f1,
        }
    return performance_multiclass(list_tuples, all_classes)
def main():
    """Learn an ID3 tree from ``OutsideTraining.csv`` and emit ``program.py``.

    The first CSV row supplies the attribute names.  The generated program
    embeds the learned tree (a nested dict) and the attribute list, reads
    ``Outside.csv`` and prints one classification per entry.

    Raises:
        IndexError: if the training file is empty.
    """
    target = "outside"

    # Read the training rows; ``with`` closes the handle (previously leaked).
    with open('OutsideTraining.csv') as training_file:
        rows = [line.strip("\r\n").split(',') for line in training_file]
    attributes = rows[0]
    data = rows[1:]

    # Run ID3
    tree = DecisionTree.makeTree(data, attributes, target, 0)
    print(tree)
    print("generated decision tree")

    # Generate the program.  ``with`` guarantees the output is flushed and
    # closed; the previous version never closed it.
    with open('program.py', 'w') as out:
        out.write("import Node\n\n")
        # open input file
        out.write("data = [[]]\n")
        # IMPORTANT: change this file path to change the testing data
        out.write("f = open('Outside.csv')\n")
        # gather data
        out.write("for line in f:\n\tline = line.strip(\"\\r\\n\")\n\tdata.append(line.split(','))\n")
        out.write("data.remove([])\n")
        # input dictionary tree
        out.write("tree = %s\n" % str(tree))
        out.write("attributes = %s\n" % str(attributes))
        out.write("count = 0\n")
        out.write("for entry in data:\n")
        out.write("\tcount += 1\n")
        # copy dictionary
        out.write("\ttempDict = tree.copy()\n")
        out.write("\tresult = \"\"\n")
        # generate actual tree
        out.write("\twhile(isinstance(tempDict, dict)):\n")
        out.write("\t\troot = Node.Node(tempDict.keys()[0], tempDict[tempDict.keys()[0]])\n")
        out.write("\t\ttempDict = tempDict[tempDict.keys()[0]]\n")
        # this must be attribute
        out.write("\t\tindex = attributes.index(root.value)\n")
        out.write("\t\tvalue = entry[index]\n")
        # ensure that key exists
        out.write("\t\tif(value in tempDict.keys()):\n")
        out.write("\t\t\tchild = Node.Node(value, tempDict[value])\n")
        out.write("\t\t\tresult = tempDict[value]\n")
        out.write("\t\t\ttempDict = tempDict[value]\n")
        # otherwise, break
        out.write("\t\telse:\n")
        out.write("\t\t\tprint \"can't process input %s\" % count\n")
        out.write("\t\t\tresult = \"?\"\n")
        out.write("\t\t\tbreak\n")
        # print solutions
        out.write("\tprint (\"entry%s = %s\" % (count, result))\n")

    print("written program")
def run(searchForOptimal, basepath, filepath):
    """Train and evaluate a random forest, then a decision tree.

    Args:
        searchForOptimal: when true, each model comes from
            ``trainOptimalModel(trainingData, testData)``; otherwise from
            ``trainModel(trainingData)``.
        basepath: forwarded to ``loadData``.
        filepath: forwarded to ``loadData``.
    """
    sc = buildContext()
    trainingData, testData = loadData(sc, basepath, filepath)

    if searchForOptimal:
        def train(trainer):
            return trainer.trainOptimalModel(trainingData, testData)
    else:
        def train(trainer):
            return trainer.trainModel(trainingData)

    # Random forest first, then the single decision tree.
    for trainer in (RandomForest, DecisionTree):
        model = train(trainer)
        Evaluation.evaluate(model, testData, logMessage=True)
def __init__(self, file):
    """Build ``self.tree`` from a comma-separated training-set file.

    Args:
        file: path of the training set.  The first line lists the attribute
            names; the last attribute is the target.

    Raises:
        IndexError: if the file is empty.
    """
    # The context manager closes the file; the previous version leaked it.
    with open(file, "r") as fin:
        lines = [line.strip() for line in fin]

    header, rows = lines[0], lines[1:]
    attributes = [attrib.strip() for attrib in header.split(",")]
    targetAttrib = attributes[-1]

    # One dict per example, keyed by attribute name.
    data = [
        dict(zip(attributes, [datum.strip() for datum in row.split(",")]))
        for row in rows
    ]

    # Create the decision tree from the training set.
    self.tree = DecisionTree.createDecisionTree(data, attributes,
                                                targetAttrib, ID3.gain)
def randomForest(data, attributes, attributesType, target, depth, recursion, treeNum, epsilon = 0):
    """Grow ``treeNum`` trees, each on generated data and attributes.

    Args:
        data: full training data, resampled by ``genDataset`` for each tree.
        attributes: all attribute names.
        attributesType: attribute types, parallel to ``attributes``.
        target: name of the target attribute.
        depth: forwarded to ``DecisionTree.makeTree``.
        recursion: forwarded to ``DecisionTree.makeTree``.
        treeNum: number of trees to grow.
        epsilon: total budget, divided equally between the trees.

    Returns:
        The list of trees in the order they were grown.

    Raises:
        ZeroDivisionError: if ``treeNum`` is 0.
    """
    # Attribute count passed to genAttrbutes for each tree.
    attrNum = math.sqrt(len(attributes))
    epsilonPerTree = float(epsilon) / treeNum

    def grow_one():
        sampled_rows = genDataset(data)
        picked = genAttrbutes(attributes, attributesType, attrNum, target)
        tree_attrs = picked['attr']
        tree_types = picked['type']
        projected = genNewDataset(sampled_rows, tree_attrs, attributes)
        return DecisionTree.makeTree(projected, tree_attrs, tree_types,
                                     target, depth, recursion, epsilonPerTree)

    return [grow_one() for _ in range(0, treeNum)]
def classify(self, customers):
    """Fuzzify each customer's three readings and classify them with the tree.

    Args:
        customers: indexable collection; for each entry, positions 0, 1 and 2
            are passed to ``fuzzifyWaiting``, ``fuzzifyMeal`` and
            ``fuzzifyDistance``.

    Returns:
        A list with the items yielded by ``DecisionTree.classify``.
    """
    # Build the test table, one dict per customer.
    table = [
        {
            'Waiting': fuzzifyWaiting(customers[i][0]),
            'Meal': fuzzifyMeal(customers[i][1]),
            'Distance': fuzzifyDistance(customers[i][2]),
        }
        for i in range(len(customers))
    ]

    # Classify the table and copy the result into a fresh list.
    return list(DecisionTree.classify(self.tree, table))
def main():
    """Train a tree on 'pokemonTraining.csv' and write a classifier script.

    The first CSV row is taken as the attribute names and the rest as
    examples.  The tree returned by ``DecisionTree.makeTree`` is embedded
    (via ``str``) into a generated Python 2 script, 'program.py', which reads
    'pokemon.csv', walks the tree for every row and writes the outcome to a
    file named 'result'.
    """
    target = "class"

    # `with` closes the training file; it used to be left open.
    with open('pokemonTraining.csv') as training_file:
        data = [line.strip("\r\n").split(',') for line in training_file]
    attributes = data.pop(0)

    tree = DecisionTree.makeTree(data, attributes, target, 0)
    print("generated decision tree")

    program_lines = [
        "import Node\n\n",
        "data = [[]]\n",
        "f = open('pokemon.csv')\n",
        # The next three statements were written without a trailing newline,
        # which glued them (and the following `for`) onto a single line and
        # made the generated script a syntax error.
        "output = open('result', 'w')\n",
        "first_line = f.readline()\n",
        "second_line = f.readline()\n",
        "for line in f:\n\tline = line.strip(\"\\r\\n\")\n\tdata.append(line.split(','))\n",
        "data.remove([])\n",
        "tree = %s\n" % str(tree),
        "attributes = %s\n" % str(attributes),
        "count = 0\n",
        "for entry in data:\n",
        "\tcount += 1\n",
        "\ttempDict = tree.copy()\n",
        "\tresult = \"\"\n",
        "\twhile(isinstance(tempDict, dict)):\n",
        "\t\troot = Node.Node(tempDict.keys()[0], tempDict[tempDict.keys()[0]])\n",
        "\t\ttempDict = tempDict[tempDict.keys()[0]]\n",
        "\t\tindex = attributes.index(root.value)\n",
        "\t\tvalue = entry[index]\n",
        "\t\tif(value in tempDict.keys()):\n",
        "\t\t\tchild = Node.Node(value, tempDict[value])\n",
        "\t\t\tresult = tempDict[value]\n",
        "\t\t\ttempDict = tempDict[value]\n",
        "\t\telse:\n",
        "\t\t\tprint \"can't process input %s\" % count\n",
        "\t\t\tresult = \"?\"\n",
        "\t\t\tbreak\n",
        "\toutput.write(\"%s\" % (result))\n",
    ]

    # Closing the script explicitly guarantees it is flushed to disk before
    # "written program" is reported.
    with open('program.py', 'w') as program:
        program.writelines(program_lines)
    print("written program")
class LearningModule:
    """Keeps layers of MDPs and derives rules from their Q-values.

    ``mdp_list`` is a list of layers, each layer a list of MDPs.
    ``success_config`` holds, per initialised MDP, the states recorded so
    far.  ``StateActionPairs`` accumulates the training examples that
    ``generateRules`` feeds to a ``DecisionTree``.
    """

    def __init__(self):
        """Create empty state, register the ROS services and open the first layer."""
        self.mdp_list = []
        self.success_config = []
        self.decision_tree = None
        self.StateActionPairs= []
        srv_error = rospy.Service('LMErrorHasOccured',LMErrorHasOccured, self.errorHandle)
        srv_rules = rospy.Service('LMGenerateRules', LMGenerateRules, self.generateRules)
        srv_init = rospy.Service('LMInitialise', LMInitialise, self.initialise_mdp)
        srv_blocks = rospy.Service('LMNewBlocks', LMNewBlocks, self.newBlocks)
        srv_action = rospy.Service('LMStateActionTaken', LMStateActionTaken, self.onPolicyLearning)
        # initialise
        self.mdp_list.append([])

    def initialise_lists(self):
        """Start a new, empty list of success states."""
        self.success_config.append([])

    def initialise_mdp(self, state):
        """Service handler: create an MDP for the request's initial state.

        Returns True on success and False when anything raised; the failure
        is now logged instead of being dropped silently.
        """
        try:
            blocks = [Block(prop.label, prop.shape, prop.colour, prop.size)
                      for prop in state.initial_state.block_properties]
            start_config = state.initial_state.configuration.config
            startingState = State(0, start_config)
            self.initialise_lists()
            self.success_config[-1].append(startingState)
            label = len(self.mdp_list[-1])
            print("")
            print(label)
            print("")
            mdp = MDP(label, blocks)
            mdp.statelist.append(startingState)
            mdp.initMDP(startingState)
            self.mdp_list[-1].append(mdp)
            print("MDP initialised")
            return True
        except Exception as err:
            rospy.logerr("initialise_mdp failed: %s", err)
            return False

    def newBlocks(self, blockSet):
        """Service handler: merge the current layer, record it, open a new layer.

        Returns True on success, False (after logging) on any error.
        """
        try:
            # combine MDPS.  The merged MDP is wrapped in a list, exactly as
            # generateRules does, so that the [-1] lookup below addresses the
            # MDP itself; previously the bare MDP was stored and then indexed.
            self.mdp_list[-1] = [self.combineIdenticalMDPs(self.mdp_list[-1])]
            # add combined MDPs state action pairs to the list!
            self.writeToList(self.mdp_list[-1][-1])
            # start new layer
            self.new_layer()
            return True
        except Exception as err:
            rospy.logerr("newBlocks failed: %s", err)
            return False

    def new_layer(self):
        """Append an empty layer to ``mdp_list``; always returns True."""
        self.mdp_list.append([])
        return True

    def combineIdenticalMDPs(self, mdp_list):
        """Return a deep copy of the first MDP whose Q-matrix is a weighted average.

        For every cell (i, j) each MDP with a positive distance contributes
        its Q-value with weight (1/distance) / sum over MDPs of (1/distance).
        Cells with no positive distance stay 0.0.  The matrix is sized from
        the first MDP's state list.
        """
        print("combining")
        size = len(mdp_list[0].getStateList())
        sum_distance = [[0.0 for i in range(0, size)] for j in range(0, size)]
        weighted_average = [[0.0 for i in range(0, size)] for j in range(0, size)]
        for mdp in mdp_list:
            for i, row in enumerate(mdp.getDistanceMatrix()):
                for j, distance in enumerate(row):
                    if distance > 0:
                        # 1.0 forces true division: under Python 2 an integer
                        # distance above 1 would otherwise contribute 0.
                        sum_distance[i][j] += 1.0/distance
        for mdp in mdp_list:
            for i, row in enumerate(mdp.getDistanceMatrix()):
                for j, distance in enumerate(row):
                    if distance > 0.0 and sum_distance[i][j] > 0.0:
                        weight = (1.0/distance)/(sum_distance[i][j])
                        weighted_average[i][j] += weight*mdp.getQMatrix()[i][j]
        newMDP = deepcopy(mdp_list[0])
        newMDP.setQMatrix(weighted_average)
        return newMDP

    def findState(self, config, mdp):
        """Return the first state of *mdp* whose configuration equals *config*, else None."""
        for state in mdp.getStateList():
            if state.getConfiguration() == config:
                return state

    def errorHandle(self, action_chosen):
        """Service handler: learn from the reported action, then run a simulation.

        The block numbers are the trailing digits of the request's
        ``actionableBlock`` / ``destinationBlock`` strings.  The matching
        action of the latest MDP's error state (or None when nothing matches)
        is passed to the MDP's ``onPolicyLearning``; ``simulation`` is then
        run from the resulting error state.  Always returns True.
        """
        print("errr")
        action_chosen = action_chosen.action_chosen
        action_block = int(re.findall(r'\d+$', action_chosen.actionableBlock)[0])
        dest_block = int(re.findall(r'\d+$', action_chosen.destinationBlock)[0])
        action_chosen = None
        for action in self.mdp_list[-1][-1].getErrorState().getActions():
            if action.getActionableBlock() == action_block:
                if action.getDestinationBlock() == dest_block:
                    action_chosen = action
        self.mdp_list[-1][-1].onPolicyLearning(action_chosen)
        error_config = self.mdp_list[-1][-1].getErrorState()
        print(self.success_config[-1])
        self.mdp_list[-1][-1].simulation(error_config, self.success_config[-1])
        return True

    def onPolicyLearning(self, action):
        """ This will be the callback function"""
        actionableBlock = int(re.findall(r'\d+$',action.action_chosen.actionableBlock)[0])
        # A destination containing 'tab' is treated as the table and is
        # represented by None.
        if(re.findall('tab',action.action_chosen.destinationBlock)):
            print("###############TABLE################")
            destinationBlock = None
        else:
            destinationBlock = int(re.findall(r'\d+$',action.action_chosen.destinationBlock)[0])
        action_chosen = None
        for candidate in self.mdp_list[-1][-1].errorstate.actions:
            print(candidate.actionableBlock)
            print(candidate.destinationBlock)
            if(actionableBlock == candidate.actionableBlock) and (destinationBlock == candidate.destinationBlock):
                action_chosen = candidate
        self.mdp_list[-1][-1].onPolicyLearning(action_chosen)
        config = self.mdp_list[-1][-1].getErrorState()
        self.success_config[-1].append(config)
        return True

    def writeToList(self, mdp):
        """Append one training example per (state, action) of *mdp*.

        An example is the acting block's shape, colour and size, then (only
        when the action has a destination block) the destination's shape,
        colour and size, and finally the Q-value at
        [state label][action's next-state address].
        """
        blocks = mdp.getBlocks()
        for state in mdp.getStateList():
            for action in state.getActions():
                action_block = action.getActionableBlock()
                dest_block = action.getDestinationBlock()
                q_value = mdp.getQMatrix()[state.getLabel()][action.getNextStateAddr()]
                if dest_block is None:
                    example = (blocks[action_block].getShape(), blocks[action_block].getColour(),
                               blocks[action_block].getSize(), q_value)
                else:
                    example = (blocks[action_block].getShape(), blocks[action_block].getColour(),
                               blocks[action_block].getSize(),
                               blocks[dest_block].getShape(), blocks[dest_block].getColour(),
                               blocks[dest_block].getSize(), q_value)
                self.StateActionPairs.append(example)

    def generateRules(self, randomCharacterBeingSentSomehow):
        """Service handler: merge the current layer, train a tree, return its selected rules.

        The request argument is not used.
        """
        print("generateRules")
        self.mdp_list[-1] = [self.combineIdenticalMDPs(self.mdp_list[-1])]
        self.writeToList(self.mdp_list[-1][-1])
        training_set = self.StateActionPairs
        attr_shape = ("cube", "prism", "cuboid")
        attr_colour = ("red", "blue", "green")
        attr_size = ("small","medium","large")
        attribute_dict = OrderedDict([("has_shape(A,", attr_shape), ("has_colour(A, ", attr_colour),
                                      ("has_size(A, ", attr_size), ("has_shape(D, ", attr_shape),
                                      ("has_colour(D, ",attr_colour), ("has_size(D, ", attr_size)])
        # Attribute index follows the insertion order of the names above.
        attributes = [Attribute(name, index, vals)
                      for index, (name, vals) in enumerate(attribute_dict.items())]
        self.decision_tree = DecisionTree(attributes, training_set)
        rules = self.decision_tree.getRules()
        rules = self.selectRules(rules)
        print("")
        print(rules)
        return rules

    def selectRules(self, rules):
        """ Select the best rules """
        # Think about doing it using SVM
        # Rules are sorted by their last element, clustered into 3 groups on
        # (position, last element), and only the rules sharing the cluster of
        # the final sorted rule are kept.
        rules = sorted(rules, key=operator.itemgetter(-1))
        q_val = [[index, rule[-1]] for index, rule in enumerate(rules)]
        whitened = whiten(q_val)
        centroids,_ = kmeans(whitened, 3, thresh = 1,iter = 100)
        ids,_= vq(whitened, centroids)
        key = ids[-1]
        valid_rules = [rules[index][0] for index, cluster in enumerate(ids) if cluster == key]
        return self.parseRules(valid_rules)

    def parseRules(self, rules):
        """Join each rule's segments with ', ' and wrap the sentences in ``Rules``."""
        valid_rules = [", ".join(rule) for rule in rules]
        return Rules(rule = valid_rules)

    def reduceMDP(self,errorconfig, stack_config, start_config, blocks):
        """Build and simulate one MDP per error configuration, then merge them.

        Each MDP starts from *start_config*, is simulated from the state
        matching its error configuration towards the states matching
        *stack_config*, and has its distance matrix updated.  Returns the
        result of ``combineIdenticalMDPs`` over all of them.
        """
        mdp_list = []
        for i in range(0, len(errorconfig)):
            mdp_list.append(MDP(i, blocks))
            startingState = State(0, start_config)
            mdp_list[i].statelist.append(startingState)
            mdp_list[i].initMDP(startingState)
            errorstate = self.findState(errorconfig[i], mdp_list[i])
            stackstate = [self.findState(config, mdp_list[i]) for config in stack_config]
            mdp_list[i].simulation(errorstate, stackstate)
            mdp_list[i].updateDistanceMatrix(errorstate)
        reduced_mdp = self.combineIdenticalMDPs(mdp_list)
        return reduced_mdp
"-v", "--validate", type="string", dest="validate", default="bcan.validate", help="Validation Data File" ) parser.add_option("-s", "--stopping_parameter", type="int", dest="stop", default=1, help="Stopping Parameter") (options, args) = parser.parse_args() print "Loading\n...'%s' as training set,\n...'%s' as test set,\n...'%s' as validation set,\n...stopping parameter %d..." % ( options.train, options.test, options.validate, options.stop, ) training_set = load_data(options.train) test_set = load_data(options.test) validation_set = load_data(options.validate) s_para = options.stop # dt = DecisionTree(training_set, validation_set, stopping_parameter=s_para) dt = DecisionTree(training_set, stopping_parameter=1) print "test set: {} training_error : {} validation_error: {}".format( dt.prediction_error(test_set), dt.training_error(), dt.prediction_error(validation_set) ) print dt.count_nodes() dt = dt.post_prune(validation_set) print "test set: {} training_error : {} validation_error: {}".format( dt.prediction_error(test_set), dt.training_error(), dt.prediction_error(validation_set) ) print dt.count_nodes() print dt.print_tree()
def train_r(records, attributes, sqm, depth):
    """Recursively train a tree of at most *depth* levels on *records*.

    At each node ``sqm`` distinct attributes are drawn at random from
    *attributes* and the split with the highest gain among them is used.
    A ``Decision`` leaf holding ``records.mode`` is returned when the records
    are monotone, share one label, the depth budget is spent, or the chosen
    split leaves one side empty.
    """
    if records.label_monotone or records.monotone or depth == 0:
        return Decision(records.mode)

    def draw():
        # sqm independent picks; duplicates are possible and rejected below.
        return [attributes[randint(0, len(attributes)-1)] for i in xrange(sqm)]

    # Keep drawing until at least one drawn attribute offers a split.
    while True:
        chosen_attributes = draw()
        # repeat selection as long as at least one feature appears twice
        while len(list(set(chosen_attributes))) != len(chosen_attributes):
            chosen_attributes = draw()

        best_gain = -1
        best_split = None
        attributes_with_no_split = 0
        for criteria in chosen_attributes:
            splits = generate_splits( records, criteria )
            if len(splits) == 0:
                # no splits: every value of this feature is the same
                attributes_with_no_split += 1
            for candidate in splits:
                gain = candidate.gain
                if best_gain < gain:
                    best_split = candidate
                    best_gain = gain
            # drop the candidate list before moving on, as the original did
            del splits

        if len(chosen_attributes) != attributes_with_no_split:
            break

    chosen = best_split
    # The node is constructed before the empty-side check, as before.
    decision_tree = DecisionTree( chosen.feature_index, chosen.feature_range, chosen.is_numerical )
    if chosen.left.size == 0 or chosen.right.size == 0:
        return Decision( records.mode )

    remaining = depth - 1
    # Right subtree is trained before the left one.
    decision_tree.right = train_r( chosen.right, attributes, sqm, remaining )
    decision_tree.left = train_r( chosen.left, attributes, sqm, remaining )
    return decision_tree
def learn(dataset, pruneFlag, maxDepth):
    """Return the tree built by ``DecisionTree.makeTree`` for *dataset*.

    The call passes 9, the indices 0..8, -1 and *maxDepth* after the dataset.
    *pruneFlag* is accepted but not used.
    """
    indices = list(range(9))
    return DecisionTree.makeTree(dataset, 9, indices, -1, maxDepth)
from optparse import OptionParser
from Loaders import *
from DecisionTree import *

# Command-line driver: load the training, test and validation sets named on
# the command line, build a decision tree from the training set and print its
# test error, training error and node count.

# Options
# Each option has a default, so the script also runs without arguments.
parser = OptionParser()
parser.add_option("-r", "--train", type="string", dest="train", default="bcan.train", help="Training Data File")
parser.add_option("-t", "--test", type="string", dest="test", default="bcan.test", help="Test Data File")
parser.add_option("-v", "--validate", type="string", dest="validate", default="bcan.validate", help="Validation Data File")
parser.add_option("-s", "--stopping_parameter", type="int", dest="stop", default=1, help="Stopping Parameter")
(options, args) = parser.parse_args()

# Echo the effective configuration before loading anything.
print "Loading\n...'%s' as training set,\n...'%s' as test set,\n...'%s' as validation set,\n...stopping parameter %d..." % (options.train, options.test, options.validate, options.stop)
# load_data is not defined here; presumably it comes from the star import of
# Loaders - confirm.
training_set = load_data(options.train)
test_set = load_data(options.test)
# The validation set is loaded but not used further in this script.
validation_set = load_data(options.validate)
s_para = options.stop

# Train on the training set only, using the stopping parameter from the
# command line.
dt = DecisionTree(training_set, stopping_parameter=s_para)
print dt.prediction_error(test_set)
print dt.training_error()
print dt.count_nodes()
def id3(trainfilename,testfilename,originalValuefilename):
    """Train an ID3 tree, predict every test row, plot and score the result.

    trainfilename -- CSV whose first row holds the attribute names.
    testfilename -- CSV of rows to predict; columns are read in the order
                    'Open', 'High', 'Low', 'Close'.
    originalValuefilename -- passed to ``gettingOriginalOpenValues`` and
                    ``gettingOriginalCloseValues`` for the reference series.

    The plot is saved as 'test_result_id3.jpg' and shown in a Tk label.
    Returns whatever ``accuracy_calculation`` returns for the reference
    close values and the predictions.
    """
    # IMPORTANT: change this variable to change the target attribute.
    target_attribute = "Close"

    # `with` closes the handles; both files used to be left open.
    with open(trainfilename) as trainingFile:
        data = [line.strip("\r\n").split(',') for line in trainingFile]
    attributes = data.pop(0)

    # Run ID3
    tree = DecisionTree.makeTree(data, attributes, target_attribute, 0)

    with open(testfilename) as testFile:
        data = [line.strip("\r\n").split(',') for line in testFile]

    attributes = ['Open', 'High', 'Low', 'Close']
    prediction = []
    for entry in data:
        tempDict = tree.copy()
        result = ""
        # Each dict level has the attribute name as its first key and the
        # value -> subtree mapping beneath it; walk until a leaf is reached.
        while isinstance(tempDict, dict):
            key = next(iter(tempDict))
            root = Node.Node(key, tempDict[key])
            tempDict = tempDict[key]
            value = entry[attributes.index(root.value)]
            if value in tempDict:
                result = tempDict[value]
                tempDict = result
            else:
                # Value not present in the tree: defer to the fallback.
                result = recheck.some_func(value,trainfilename,testfilename)
                break
        prediction.append(result)

    predicted_2 = [float(item) for item in prediction]

    open_values = gettingOriginalOpenValues(originalValuefilename)
    original_close_values = gettingOriginalCloseValues(originalValuefilename)

    # plotting
    plt.title("Results for given dataset using ID3 Algorithm")
    plt.plot(open_values,predicted_2,'g.',markersize=np.sqrt(150.),label ='ID3 Prediction')
    plt.plot(open_values,original_close_values,'b.',markersize=np.sqrt(100.),label = 'Orignial Values')
    plt.legend(loc='upper left')
    plt.xlabel("Open Values")
    plt.ylabel("Close Values")
    plt.grid()
    fig = plt.gcf()
    fig.set_size_inches(8, 4)
    ax=plt.subplot(111)
    # Shrink current axis's height by 10% on the bottom
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.1,box.width, box.height * 0.9])
    # Put a legend below current axis
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.09),fancybox=True, shadow=True, ncol=5,fontsize="10")
    fig.savefig('test_result_id3.jpg', dpi=100)

    # NOTE(review): the image is saved relative to the working directory but
    # re-opened from this absolute path; they only coincide when the script
    # runs from that directory - confirm.  Every backslash is now escaped;
    # the resulting path is unchanged.
    x = Image.open("E:\\4.2\\Final Year Project\\Code\\Complete Project\\test_result_id3.jpg")
    y = ImageTk.PhotoImage(x)
    label6 = Label(image=y)
    # Presumably kept on the widget so the image object stays referenced.
    label6.image = y
    label6.place(x=50, y=290)

    result=accuracy_calculation(original_close_values,predicted_2)
    return result
def completeExexution(trainfilename,testfilename,originalValuefilename):
    """Run ID3 and Naive Bayes on the same data, plot both and score them.

    trainfilename -- CSV whose first row holds the attribute names.
    testfilename -- CSV of rows to predict; columns are read in the order
                    'Open', 'High', 'Low', 'Close'.
    originalValuefilename -- passed to ``gettingOriginalOpenValues`` and
                    ``gettingOriginalCloseValues`` for the reference series.

    The plot is saved as 'test_result_id3_naivebayes.jpg' and shown in a Tk
    label.  Returns whatever ``accuracy_calculation`` returns for the
    reference close values, the Naive Bayes predictions and the ID3
    predictions.
    """
    # id3 algorithm
    target_attribute = "Close"

    # `with` closes the handles; both files used to be left open.
    with open(trainfilename) as trainingFile:
        data = [line.strip("\r\n").split(',') for line in trainingFile]
    attributes = data.pop(0)

    # Run ID3
    tree = DecisionTree.makeTree(data, attributes, target_attribute, 0)

    with open(testfilename) as testFile:
        data = [line.strip("\r\n").split(',') for line in testFile]

    attributes = ['Open', 'High', 'Low', 'Close']
    prediction = []
    for entry in data:
        tempDict = tree.copy()
        result = ""
        # Each dict level has the attribute name as its first key and the
        # value -> subtree mapping beneath it; walk until a leaf is reached.
        while isinstance(tempDict, dict):
            key = next(iter(tempDict))
            root = Node.Node(key, tempDict[key])
            tempDict = tempDict[key]
            value = entry[attributes.index(root.value)]
            if value in tempDict:
                result = tempDict[value]
                tempDict = result
            else:
                # Value not present in the tree: defer to the fallback.
                result = recheck.some_func(value,trainfilename,testfilename)
                break
        prediction.append(result)

    predicted_2 = [float(item) for item in prediction]

    # naive bayes algorithm
    trainingdataset = loadTrainCsv(trainfilename)
    testdataset = loadTestCsv(testfilename)
    summaries = summarizeByClass(trainingdataset)
    predicted_1 = getPredictions(summaries, testdataset)

    open_values = gettingOriginalOpenValues(originalValuefilename)
    original_close_values = gettingOriginalCloseValues(originalValuefilename)

    plt.title("Results for given dataset using ID3 & Naive Bayes Algorithm")
    plt.plot(open_values,predicted_1,'r.',markersize=np.sqrt(150.),label ='Naive Bayes Prediction')
    plt.plot(open_values,predicted_2,'g.',markersize=np.sqrt(150.),label ='ID3 Prediction')
    plt.plot(open_values,original_close_values,'b.',markersize=np.sqrt(100.),label = 'Orignial Values')
    plt.legend(loc='upper left')
    plt.xlabel("Open Values")
    plt.ylabel("Close Values")
    plt.grid()
    fig = plt.gcf()
    fig.set_size_inches(8, 4)
    ax=plt.subplot(111)
    # Shrink current axis's height by 10% on the bottom
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.1,box.width, box.height * 0.9])
    # Put a legend below current axis
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.09),fancybox=True, shadow=True, ncol=5,fontsize="10")
    fig.savefig('test_result_id3_naivebayes.jpg', dpi=100)

    # NOTE(review): the image is saved relative to the working directory but
    # re-opened from this absolute path; they only coincide when the script
    # runs from that directory - confirm.  Every backslash is now escaped;
    # the resulting path is unchanged.
    x = Image.open("E:\\4.2\\Final Year Project\\Code\\Complete Project\\test_result_id3_naivebayes.jpg")
    y = ImageTk.PhotoImage(x)
    label6 = Label(image=y)
    # Presumably kept on the widget so the image object stays referenced.
    label6.image = y
    label6.place(x=50, y=290)

    result=accuracy_calculation(original_close_values,predicted_1,predicted_2)
    return result
__author__ = 'Aman'

import DataPrepare
import DataSpecific
import DecisionTree

# tree is the root of DecisionTree obtained by training the data in data file
tree = DecisionTree.trainData()

# Updating the attribute list with values obtained from converting
# continuous variables.
DataSpecific.attribute_list.update(tree.dictIntervalContVar)


def cleanRow(row):
    """Clean *row* and replace continuous values by their interval index.

    The row is first passed through ``DataPrepare.cleanRow``.  Then, for each
    entry of ``tree.dictIntervalContVar``, the value is replaced by the index
    ``i`` of the first interval with
    ``listIntervals[i] < value <= listIntervals[i+1]``.  A value of '?' or
    one that falls in no interval is left untouched.

    Returns the converted row.  Earlier the function ended without a return
    statement, so the row produced by ``DataPrepare.cleanRow`` never reached
    the caller.
    """
    row = DataPrepare.cleanRow(row)
    dictContVar = tree.dictIntervalContVar
    for entry in dictContVar.keys():
        listIntervals = dictContVar[entry]
        for i in range(0, len(listIntervals)-1):
            if row[entry] == '?':
                # missing value: nothing to discretise for this entry
                break
            if listIntervals[i] < row[entry] <= listIntervals[i+1]:
                row[entry] = i
                break
    return row
#!/bin/env python # -*- coding: utf-8 -*- import ThinFatData import DecisionTree data = ThinFatData.createDataSet(num=100000,version=2) DecisionTree.testFromDF(data)
class LearningModule:
    """Keeps layers of MDPs and derives rules from their Q-values.

    ``mdp_list`` is a list of layers, each layer a list of MDPs.
    ``success_config`` holds, per initialised MDP, the states recorded so
    far.  ``StateActionPairs`` accumulates the training examples that
    ``generateRules`` feeds to a ``DecisionTree``.
    """

    def __init__(self):
        """Create empty state and open the first MDP layer."""
        self.mdp_list = []
        self.success_config = []
        self.decision_tree = None
        self.StateActionPairs= []
        # initialise
        self.mdp_list.append([])

    def initialiseAttributes(self):
        """Return one binary ("true"/"false") ``Attribute`` per predicate, indexed in list order."""
        binary_values = ("true", "false")
        attributes = ["on(b0, table)", "on(b0, b1)", "on(b0,b2)", "on(b1, table)", "on(b1, b0)", "on(b1, b2)", "on(b2, table)", "on(b2, b0)", "on(b2,b1)",
                      "has_shape(b0, prism)", "has_shape(b1, prism)", "has_shape(b2, prism)",
                      "has_shape(b0, cube)", "has_shape(b1, cube)", "has_shape(b2, cube)",
                      "has_shape(b0, cuboid)", "has_shape(b1, cuboid)", "has_shape(b2, cuboid)",
                      "has_colour(b0, red)", "has_colour(b1, red)", "has_colour(b2, red)",
                      "has_colour(b0, blue)", "has_colour(b1, blue)", "has_colour(b2, blue)",
                      "has_colour(b0, green)", "has_colour(b1, green)", "has_colour(b2, green)",
                      "has_size(b0, small)", "has_size(b1, small)", "has_size(b2, small)",
                      "has_size(b0, medium)", "has_size(b1, medium)", "has_size(b2, medium)",
                      "has_size(b0, large)", "has_size(b1, large)", "has_size(b2, large)",
                      "move(b0, table)", "move(b0, b1)", "move(b0, b2)", "move(b1, table)", "move(b1, b0)", "move(b1, b2)", "move(b2, table)", "move(b2, b0)", "move(b2, b1)"]
        attribute_dict = OrderedDict((attribute, binary_values) for attribute in attributes)
        return [Attribute(name, index, vals)
                for index, (name, vals) in enumerate(attribute_dict.items())]

    def initialise_mdp(self, blocks):
        """Create an MDP for *blocks* starting from the configuration [-1, -1, -1]."""
        start_config = [-1,-1,-1]
        startingState = State(0, blocks, start_config)
        self.initialise_lists()
        self.success_config[-1].append(startingState)
        label = len(self.mdp_list[-1])
        mdp = MDP(label, blocks)
        mdp.statelist.append(startingState)
        mdp.initMDP(startingState)
        self.mdp_list[-1].append(mdp)

    def new_layer(self):
        """Append an empty layer to ``mdp_list``; always returns True."""
        self.mdp_list.append([])
        return True

    def initialise_lists(self):
        """Start a new, empty list of success states."""
        self.success_config.append([])

    def combineIdenticalMDPs(self, mdp_list):
        """Return a deep copy of the first MDP whose Q-matrix is a weighted average.

        For every cell (i, j) each MDP with a positive distance contributes
        its Q-value with weight (1/distance) / sum over MDPs of (1/distance).
        Cells with no positive distance stay 0.0.  The matrix is sized from
        the first MDP's state list.
        """
        print("combining")
        size = len(mdp_list[0].getStateList())
        sum_distance = [[0.0 for i in range(0, size)] for j in range(0, size)]
        weighted_average = [[0.0 for i in range(0, size)] for j in range(0, size)]
        for mdp in mdp_list:
            for i, row in enumerate(mdp.getDistanceMatrix()):
                for j, distance in enumerate(row):
                    if distance > 0:
                        # 1.0 forces true division: under Python 2 an integer
                        # distance above 1 would otherwise contribute 0.
                        sum_distance[i][j] += 1.0/distance
        for mdp in mdp_list:
            for i, row in enumerate(mdp.getDistanceMatrix()):
                for j, distance in enumerate(row):
                    if distance > 0.0 and sum_distance[i][j] > 0.0:
                        weight = (1.0/distance)/(sum_distance[i][j])
                        weighted_average[i][j] += weight*mdp.getQMatrix()[i][j]
        newMDP = deepcopy(mdp_list[0])
        newMDP.setQMatrix(weighted_average)
        return newMDP

    def findState(self, config, mdp):
        """Return the first state of *mdp* whose configuration equals *config*, else None."""
        for state in mdp.getStateList():
            if state.getConfiguration() == config:
                return state

    def errorHandle(self, error_config, success_config, attributes):
        """Run the latest MDP's simulation from the state matching *error_config*."""
        # NOTE(review): the states looked up from the success_config argument
        # are not passed on; simulation receives self.success_config[-1]
        # instead - confirm which one is intended.
        success_states = [self.findState(config, self.mdp_list[-1][-1])
                          for config in success_config]
        error_state = self.findState(error_config, self.mdp_list[-1][-1])
        self.mdp_list[-1][-1].simulation(error_state, self.success_config[-1], attributes)

    def writeToList(self, mdp):
        """Append one training example per (state, action) of *mdp*.

        An example is the acting block's shape, colour and size, then (only
        when the action has a destination block) the destination's shape,
        colour and size, and finally the Q-value at
        [state label][action's next-state address].
        """
        blocks = mdp.getBlocks()
        for state in mdp.getStateList():
            for action in state.getActions():
                action_block = action.getActionableBlock()
                dest_block = action.getDestinationBlock()
                q_value = mdp.getQMatrix()[state.getLabel()][action.getNextStateAddr()]
                if dest_block is None:
                    example = (blocks[action_block].getShape(), blocks[action_block].getColour(),
                               blocks[action_block].getSize(), q_value)
                else:
                    example = (blocks[action_block].getShape(), blocks[action_block].getColour(),
                               blocks[action_block].getSize(),
                               blocks[dest_block].getShape(), blocks[dest_block].getColour(),
                               blocks[dest_block].getSize(), q_value)
                self.StateActionPairs.append(example)

    def generateRules(self):
        """Merge the current layer, train a decision tree on all examples and print its rules."""
        self.mdp_list[-1] = [self.combineIdenticalMDPs(self.mdp_list[-1])]
        self.writeToList(self.mdp_list[-1][-1])
        training_set = self.StateActionPairs
        attr_shape = ("cube", "prism", "cuboid")
        attr_colour = ("red", "blue", "green")
        attr_size = ("small","medium","large")
        attribute_dict = OrderedDict([("has_shape(A,", attr_shape), ("has_colour(A,", attr_colour),
                                      ("has_size(A,", attr_size), ("has_shape(D,", attr_shape),
                                      ("has_colour(D,",attr_colour), ("has_size(D,", attr_size)])
        # Attribute index follows the insertion order of the names above.
        attributes = [Attribute(name, index, vals)
                      for index, (name, vals) in enumerate(attribute_dict.items())]
        self.decision_tree = DecisionTree(attributes, training_set)
        rules = self.decision_tree.getRules()
        for rule in rules:
            print(rule)
        # rules = self.selectRules(rules)

    def selectRules(self, rules):
        """ Select the best rules """
        # Think about doing it using SVM
        print(rules)
        for rule in rules:
            print(rule)
        print("\n")
        # Rules are sorted by their last element, clustered into 3 groups on
        # (position, last element), and only the rules sharing the cluster of
        # the final sorted rule are kept.
        rules = sorted(rules, key=operator.itemgetter(-1))
        q_val = [[index, rule[-1]] for index, rule in enumerate(rules)]
        whitened = whiten(q_val)
        centroids,_ = kmeans(whitened, 3, thresh = 1,iter = 100)
        ids,_= vq(whitened, centroids)
        key = ids[-1]
        valid_rules = [rules[index][0] for index, cluster in enumerate(ids) if cluster == key]
        return self.parseRules(valid_rules)

    def parseRules(self, rules):
        """Return one string per rule: its segments joined with ', '."""
        return [", ".join(rule) for rule in rules]

    def reduceMDP(self,errorconfig, stack_config, start_config, blocks):
        """Build and simulate one MDP per error configuration, then merge them.

        Each MDP starts from *start_config*, is simulated from the state
        matching its error configuration towards the states matching
        *stack_config*, and has its distance matrix updated.  Returns the
        result of ``combineIdenticalMDPs`` over all of them.
        """
        mdp_list = []
        attributes = self.initialiseAttributes()
        for i in range(0, len(errorconfig)):
            mdp_list.append(MDP(i, blocks))
            # NOTE(review): initialise_mdp builds State(0, blocks, config)
            # while this call omits blocks - confirm the State signature.
            startingState = State(0, start_config)
            mdp_list[i].statelist.append(startingState)
            mdp_list[i].initMDP(startingState)
            errorstate = self.findState(errorconfig[i], mdp_list[i])
            stackstate = [self.findState(config, mdp_list[i]) for config in stack_config]
            mdp_list[i].simulation(errorstate, stackstate, attributes)
            mdp_list[i].updateDistanceMatrix(errorstate)
        reduced_mdp = self.combineIdenticalMDPs(mdp_list)
        return reduced_mdp
return parts avg_error = [] tsets = parts(training_set, 10) pg, prog = ProgressBar(), 1.0 for s_para in s_paras: op = prog/s_len ip = 1.0 test_error = [] for p in tsets: train = [] for q in tsets: if p != q: train += q test = p dt = DecisionTree(train, stopping_parameter=s_para) test_error.append(dt.prediction_error(test)[1]) pg.update(-(1-ip/10)/s_len+op, "Stopping Para %d" % s_para) ip += 1 #print test_error avg_error.append((s_para, float(sum(test_error))/len(test_error))) prog += 1 top = sorted(avg_error, key = lambda (_,e) : e)[0] print avg_error print "Stopping Parameter and Error: %d, %f" % (top[0], top[1]) # Build decision tree with "best" parameter trained on original print "Using these values to get training and test error" dt = DecisionTree(training_set, stopping_parameter=top[0]) print dt.training_error()
def main():
    """Train a tree on 'CreditTraining.csv', classify 'Credit.csv' and count hits.

    The first row of the training CSV names the parameters; the target is
    "class".  Every row of 'Credit.csv' is walked through the tree returned
    by ``DecisionTree.ID``; when a value is not present in the tree the
    result of ``DecisionTree.freq_check`` is used instead.  The predictions
    are compared line by line with 'classcredit.csv' and the number of
    matches is printed.

    Raises IndexError when 'classcredit.csv' has more lines than
    'Credit.csv' has rows.
    """
    print("Training Naive-Bayes ...")
    finattr = "class"

    # `with` closes the handles; all three files used to be left open.
    with open('CreditTraining.csv') as training_file:
        basedata = [line.strip("\r\n").split(',') for line in training_file]
    parameters = basedata.pop(0)

    # Run ID3
    tree = DecisionTree.ID(basedata, parameters, finattr)

    with open('Credit.csv') as test_file:
        basedata1 = [line.strip("\r\n").split(',') for line in test_file]

    orig_op = []
    for entry in basedata1:
        train_data = tree.copy()
        output = ""
        # Each dict level has the parameter name as its first key and the
        # value -> subtree mapping beneath it; walk until a leaf is reached.
        while isinstance(train_data, dict):
            key = next(iter(train_data))
            root = Node.Node(key, train_data[key])
            train_data = train_data[key]
            value = entry[parameters.index(root.X)]
            if value in train_data:
                output = train_data[value]
                train_data = output
            else:
                # Value not present in the tree: use the fallback.
                output = DecisionTree.freq_check(parameters, basedata, parameters[15])
                break
        orig_op.append(output)

    with open('classcredit.csv') as label_file:
        basedata2 = [line.strip("\r\n") for line in label_file]

    count = 0
    for i, ent in enumerate(basedata2):
        if ent == orig_op[i]:
            count += 1

    # NOTE(review): despite the wording, the printed figure is the raw number
    # of matching rows; the former `accuracy` value was computed but never
    # used - confirm which one should be reported.
    print("Calculated accuracy for the testing data = "+ str(count))