def test():
    attributes = ['Author', 'Name', 'Genre', 'Year', 'Topics']
    infolists = [
        ['James Joyce', 'Ulyssess', 'Novel', 1922, 'Modernist Novel'],
        ['James Joyce', 'AfterTheRace', 'Short Story', 1914, 'Realist Fiction'],
        ['James Joyce', 'Araby', 'Short Story', 1914, 'Realist Fiction'],
        ['James Joyce', 'Encounter', 'Short Story', 1914, 'Realist Fiction'],
        ['James Joyce', 'Eveline', 'Short Story', 1904, 'Realist Fiction'],
        ['James Joyce', 'TheBoardingHouse', 'Short Story', 1914, 'Realist Fiction'],
        ['Mark Twain', 'ConnecticutYankee', 'Novel', 1889, 'Science Fiction'],
        ['Poe', 'CaskofAmontillado', 'Short Story', 1846, 'Horror'],
        ['Poe', 'FallHouseOfUsher', 'Short Story', 1839, 'Horror'],
        ['Poe', 'MasqueofTheRedDeath', 'Short Story', 1842, 'Horror'],
        ['Poe', 'Raven', 'Short Story', 1845, 'Horror'],
    ]
    testlists = [
        [None, 'TwoGallants', 'Short Story', 1914, 'Realist Fiction'],
        [None, 'Sisters', 'Short Story', 1914, 'Realist Fiction'],
        [None, 'AnnalbelLee', 'Short Story', 1849, 'Horror'],
        [None, 'ConnecticutYankee', 'Novel', 1889, 'Science Fiction'],
    ]
    infoTree = DecisionTree()
    aftereval = None
    while True:
        try:
            infoTree = DecisionTree()
            infoTree.train(infoTree.root, infolists, attributes)
            aftereval = infoTree.eval(testlists)
            break
        except Exception as e:
            e = str(e)
            for i in testlists:
                ind = 0
                for j in i:
                    j = str(j)
                    if j == e:
                        attributes[ind] = None
                        break
                    elif ind == len(i) - 1:
                        break
                    ind += 1
                else:
                    break
    print(attributes)
    text = 'Prefix Walk \n'
    text += prefixWalk(infoTree.root)
    text += '\nPostfix Walk \n'
    text += postfixWalk(infoTree.root)
    writeFile(text, 'author')
    print(aftereval)
def create_tree(self):
    targetIndex = self.dataset.columns.get_loc(self.target)
    samples_Idxs = np.random.permutation(self.dataset.shape[0])[:self.n_sample]
    features_idxs = list(
        set(np.append(np.random.permutation(self.dataset.shape[1])[:self.n_features], targetIndex)))
    train = self.dataset.iloc[samples_Idxs][self.dataset.columns[features_idxs]]
    return dt.DecisionTree(train.values, train.columns.get_loc(self.target),
                           train.columns.values, features_idxs)
def create(jsonFilePath, dataset):
    try:
        with open(jsonFilePath) as json_file:
            try:
                jsonData = json.load(json_file)
                validate(instance=jsonData, schema=estimatorSchema)
            except jsonschema.exceptions.ValidationError as err:
                print(err)
                raise ValueError(error.errors['estimator_config'])
            except ValueError as err:
                print(err)
                raise ValueError(error.errors['estimator_config'])

        if jsonData['estimator'].startswith('KNeighbors'):
            import Knn  # as Knn
            esti = Knn.Knn(jsonData)
        elif jsonData['estimator'].startswith('DecisionTree'):
            import DecisionTree
            esti = DecisionTree.DecisionTree(jsonData)
        else:
            est_str = jsonData['estimator']
            print(f'Invalid value for estimator name: {est_str}')
            raise ValueError(error.errors['estimator_config'])

        esti.parse(jsonData)
        esti.assign_dataset(dataset)
        return esti
    except FileNotFoundError as err:
        print(err)
        raise ValueError(error.errors['estimator_config'])
def __init__(self, train, n_trees, sample_leaf_limits, sample_ratio, chara_ratio):
    '''
    : __init__: initialize the random forest from the parameters and train it on the training set
    : note: the implementation follows, step by step, the procedure in Li Hang's "Statistical Learning Methods"
    : param train: training set whose first column holds the sample class labels
    : type train: pd.DataFrame
    : param n_trees: number of decision trees in the random forest
    : type n_trees: int
    : param sample_leaf_limits: leaf-sample limit, passed through unchanged to each DT.DecisionTree
    : param sample_ratio: fraction of samples drawn at random for each tree, in [0, 1]
    : type sample_ratio: float
    : param chara_ratio: fraction of features drawn at random for each tree, in [0, 1]
    : type chara_ratio: float
    '''
    self.forest = []
    fn = int(chara_ratio * (train.shape[1] - 1))
    for n in range(n_trees):
        temp1 = time.time()
        # sample feature columns at random; column 0 holds the labels and is always kept
        sf = np.random.choice(np.arange(1, train.shape[1]), fn, replace=False)
        sf = np.append(0, sf)
        train_n = train.iloc[:, sf]
        # sample a random fraction of rows, never less than sample_ratio of them
        p = np.random.random_sample() * (1 - sample_ratio) + sample_ratio
        train_n = train_n.loc[np.random.choice(train_n.index, int(p * train_n.index.size), replace=False)]
        tree = DT.DecisionTree(train_n, sample_leaf_limits)
        self.forest.append(tree)
        temp2 = time.time()
        print('Tree %d of the random forest built successfully, time cost %f' % (n, temp2 - temp1))
def train(self, data, labels, bootstrapping=True):
    # for i, data in enumerate(data):
    for i in data.iterrows():
        index, rowdata = i
        assigned_tree = math.floor(random.random() * self.count)
        # adds key value pair to data, labels
        # self.data[assigned_tree].append((index, rowdata))
        self.data[assigned_tree] = self.data[assigned_tree].append(rowdata)
        self.labels[assigned_tree].append((index, labels[index]))

    if bootstrapping:
        treesPerForest = int(len(data) / 3)
        for i in range(0, self.count):
            data = data.sample(frac=1)
            self.data[i] = self.data[i].append(data.iloc[1:treesPerForest, :])
            index = data.index.values.astype(int)[1:treesPerForest]
            for r in index:
                self.labels[i].append((r, labels[r]))

    for i, tree in enumerate(self.forest):
        x = pd.DataFrame(self.labels[i]).drop(0, axis=1)
        self.forest[i] = DecisionTree.DecisionTree(
            self.data[i].reset_index(drop=True), x.squeeze())
        self.forest[i].build_tree()
def decisionTreeLearning(examples, attributes, parents_examples=()):
    if len(examples) == 0:
        # return the most frequent classification among the parent examples
        return pluralityValue(parents_examples)
    elif allSameClass(examples):
        # if all examples share the same class, return the class of the first example
        return DecisionTree.Leaf(examples[0][dataset.target])
    elif len(attributes) == 0:
        # return the most frequent classification among the examples
        return pluralityValue(examples)
    else:
        if ce == 0:
            mostImpAtt, threshold = chooseAttribute(attributes, examples)
        else:
            mostImpAtt, threshold = chooseAttribute2(attributes, examples)
        tree = DecisionTree.DecisionTree(mostImpAtt, threshold, dataset.attrnames[mostImpAtt])
        # split the examples on the chosen threshold
        ExampleMinor, ExampleMajor = splittingOnThreshold(mostImpAtt, threshold, examples)
        # recurse and attach the resulting branches to the tree
        branchesLeft = decisionTreeLearning(ExampleMinor, removeAttr(mostImpAtt, attributes), examples)
        branchesRight = decisionTreeLearning(ExampleMajor, removeAttr(mostImpAtt, attributes), examples)
        tree.addLeft(threshold, branchesLeft)
        tree.addRight(threshold, branchesRight)
        return tree
def decisionTreeLearning(examples, attributes, parents_examples=()):
    if len(examples) == 0:
        # returns the most frequent classification among the examples
        return pluralityValue(parents_examples)
    elif allSameClass(examples):
        # if they all have the same class, I return the class of the first example
        return DecisionTree.Leaf(examples[0][dataset.target])
    elif len(attributes) == 0:
        # returns the most frequent classification among the examples
        return pluralityValue(examples)
    else:
        mostImpAtt, threshold = chooseAttribute(attributes, examples)
        tree = DecisionTree.DecisionTree(mostImpAtt, threshold, dataset.attrnames[mostImpAtt])
        # separate based on threshold
        ExampleMinor, ExampleMajor = splittingOnThreshold(mostImpAtt, threshold, examples)
        # do recursion and add to the tree
        branchesLeft = decisionTreeLearning(ExampleMinor, removeAttr(mostImpAtt, attributes), examples)
        branchesRight = decisionTreeLearning(ExampleMajor, removeAttr(mostImpAtt, attributes), examples)
        tree.addLeft(threshold, branchesLeft)
        tree.addRight(threshold, branchesRight)
        return tree
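# The two decisionTreeLearning variants above rely on a pluralityValue helper whose body is
# not shown. The function below is only an illustrative sketch of what such a helper could
# look like (its name, return type, and use of the module-level `dataset` and
# `DecisionTree.Leaf` are assumptions, not taken from the original code): it wraps the most
# frequent target value of the given examples in a leaf node.
from collections import Counter

def pluralityValue_sketch(examples):
    counts = Counter(example[dataset.target] for example in examples)
    most_common_class, _ = counts.most_common(1)[0]
    return DecisionTree.Leaf(most_common_class)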
def main():
    print("Enter main()")
    #==========================================================================================
    # Plot the functions that express the impurity of a decision tree node:
    # node error rate, cross-entropy, and Gini index
    #==========================================================================================
    tree = DecisionTree.DecisionTree()

    #-------------------------------
    # plot the impurity functions
    #-------------------------------
    figure = plt.figure()
    axis = plt.subplot(1, 1, 1)
    plt.grid(linestyle='-')

    tree.plotNodeErrorFunction(figure, axis)
    tree.plotCrossEntropyFunction(figure, axis)
    tree.plotGiniIndexFunction(figure, axis)

    plt.title("purity functions (i=1)")  # title
    plt.legend(loc="upper left")         # legend
    plt.tight_layout()                   # shrink the subplots just enough that their labels do not overlap

    # save & show the figure
    plt.savefig("./DecisionTree_scikit-learn_1.png", dpi=300)
    plt.show()

    print("Finish main()")
    return
def __init__(self, T=10, M=30, bagging=False):
    self.t = T
    self.m = M
    self.bagging = bagging
    # one decision tree per ensemble member (a list, so it can be indexed and reused)
    self.forest = [DecisionTree() for _ in range(T)]
    self.shape = None
    self.selected_attributes = list()
def classify_dataset_test():
    # create dataset
    filename = "Dataset/iris.data"
    dataset = DT.Dataset(filename, _delimiter=',')
    Tree = DT.DecisionTree(dataset)

    # load examples
    exemple1 = np.array([5.4, 3.9, 1.3, 0.4]).astype('S15')
    exemple2 = np.array([6.3, 2.5, 4.9, 1.5]).astype('S15')
    exemple3 = np.array([6.5, 3.0, 5.5, 1.8]).astype('S15')

    # classify examples
    class1 = Tree.classify(exemple1)
    class2 = Tree.classify(exemple2)
    class3 = Tree.classify(exemple3)

    # verify classification
    eq_(class1, b'Iris-setosa')
    eq_(class2, b'Iris-versicolor')
    eq_(class3, b'Iris-virginica')
def test_DT2():
    X, y = loadDataSet("HCTrain.csv")
    X_test, y_test = loadDataSet("HCTest.csv")
    tree = dt.DecisionTree(gt_privacy_p=float(1.0 / 100))
    tree.fit(X, y)
    pred1 = tree.predict(X_test)
    print("AUC value", roc_auc_score(y_test, pred1))
def runTree(X_train, y_train, X_test, y_test, d):
    '''initialize Decision Tree'''
    # now for decision tree
    tree = dt.DecisionTree()
    # calling fitTree without maxDepth argument sets max depth to 999
    tree.fitTree(X_train, y_train, max_depth=d)

    # test on training set
    pred_tree_train = tree.predict(X_train)
    error_rate_tree_train = (sum([
        0 if pred == true else 1
        for (pred, true) in zip(y_train, pred_tree_train)
    ]) / float(len(y_train)))

    # test on test set
    pred_tree_test = tree.predict(X_test)
    error_rate_tree_test = (sum([
        0 if pred == true else 1
        for (pred, true) in zip(y_test, pred_tree_test)
    ]) / float(len(y_test)))

    '''print('')
    print('***** RESULTS DECISION TREE *****')
    print('Depth: ', tree.depth)
    print('')
    print('Training Error: ', error_rate_tree_train)
    print('Test Error    : ', error_rate_tree_test)'''

    return error_rate_tree_train, error_rate_tree_test, tree.depth
def fit(self, train_data):
    self.train_data = train_data
    self.label_col = train_data.columns[0]
    labels = list(set(train_data[self.label_col].tolist()))
    for n in range(0, self.n_trees):
        print('%d tree begin fit' % (n + 1))
        tt1 = time.time()
        train_labels = []
        for label in labels:
            train_label = train_data[train_data[self.label_col] == label]
            train_n_label = train_label.sample(frac=1, replace=True, random_state=20)
            train_labels.append(train_n_label)
        train_n = pd.concat(train_labels)
        train_n.reset_index(drop=True, inplace=True)
        tree = DecisionTree.DecisionTree(
            min_sample_split=self.min_sample_split,
            n_features=self.n_features,
            criterion=self.criterion)
        tree.fit(train_n)
        weight = tree.score(train_data)
        self.forest.append(tree)
        self.weights.append(weight)
        tt2 = time.time()
        print('%d tree time cost: %f' % (n + 1, tt2 - tt1))
def bootstrap(trees, depth, train, test, display=False):
    """Performs bootstrap aggregation with a decision tree learner for k-class classification"""
    # Build an array for indices/predictions for output
    indices = np.zeros((train.length, trees), dtype=int)
    prediction_labels = np.zeros((test.length, trees), dtype=str)
    prediction_probs = np.zeros((test.length, test.label_length), dtype=float)

    for i in range(0, trees):
        # Randomly sample data from train to use
        bs_sample = np.random.choice(range(0, train.length), size=train.length)
        indices.T[i] = bs_sample
        bs_features = train.features[bs_sample]
        bs_labels = train.labels[bs_sample]

        # Create and train bootstrap decision tree
        tree = dt.DecisionTree()
        tree.fit(bs_features, bs_labels, train.metadata, max_depth=depth)

        # Using this tree, do prediction on test
        prediction_probs += tree.predict(test.features, prob=True)
        prediction_labels.T[i] = tree.predict(test.features, prob=False)

    # Now, vote for predicted class using prediction_probs matrix
    predictions = []
    truth = []
    correct = 0
    for i in range(0, test.length):
        # Finds the class that received the most probability
        prediction_index = np.argmax(prediction_probs[i])
        yhat = test.metadata[-1][1][prediction_index]
        y = test.labels[i]
        predictions.append(yhat)
        truth.append(y)
        # Increment number of correct predictions
        if yhat == y:
            correct += 1

    # calculate accuracy
    accuracy = correct / test.length

    if display:
        # Print the tree training indices
        for i in range(0, train.length):
            print(','.join(map(str, indices[i])))
        # Print the predictions
        print()
        for i in range(0, test.length):
            print(','.join(prediction_labels[i]), predictions[i], truth[i], sep=',')
        # Print accuracy
        print()
        print(accuracy)

    # Return the overall predictions
    return predictions
def dtree(self, event):
    self.GetParent().setStatus("Generating tree...", 1)
    self.pbutton.actions = [self.mlC.GetValue(), self.mmC.GetValue()]
    dtree = DecisionTree(self.db, self.target, self.labels,
                         self.mlC.GetValue(), self.mmC.GetValue())
    tv = treeView(self, dtree)
    self.GetParent().setStatus("", 0)
    tv.Show()
def setUp(self):
    print("Testing probability calculation on sample training file")
    self.dt = DecisionTree.DecisionTree(training_datafile=training_datafile,
                                        csv_class_column_index=1,
                                        csv_columns_for_features=[2, 3, 4, 5])
    self.dt.get_training_data()
    self.dt.calculate_first_order_probabilities()
    self.dt.calculate_class_priors()
def create(jsonFilePath, dataset):
    try:
        with open('schemas/estSchema.json') as schema_file:
            estimatorSchema = json.load(schema_file)
    except FileNotFoundError as err:
        template = "An exception of type {0} occurred. Arguments: {1!r}"
        message = template.format(type(err).__name__, err.args)
        print(message)
        raise ValueError(error.errors['estimator_config'])

    try:
        with open(jsonFilePath) as json_file:
            try:
                jsonData = json.load(json_file)
                validate(instance=jsonData, schema=estimatorSchema)
            except jsonschema.exceptions.ValidationError as err:
                template = "An exception of type {0} occurred. Arguments: {1!r}"
                message = template.format(type(err).__name__, err.args)
                print(message)
                raise ValueError(error.errors['estimator_config'])
            except ValueError as err:
                template = "An exception of type {0} occurred. Arguments: {1!r}"
                message = template.format(type(err).__name__, err.args)
                print(message)
                raise ValueError(error.errors['estimator_config'])

        if jsonData['estimator'].startswith('KNeighbors'):
            import Knn  # as Knn
            esti = Knn.Knn(jsonData)
        elif jsonData['estimator'].startswith('DecisionTree'):
            import DecisionTree
            esti = DecisionTree.DecisionTree(jsonData)
        elif jsonData['estimator'].startswith('RandomForest'):
            import RandomForest
            esti = RandomForest.RandomForest(jsonData)
        elif jsonData['estimator'] == 'LinearSVC' or jsonData['estimator'] == 'LinearSVR':
            import SVM
            esti = SVM.SVM(jsonData)
        elif jsonData['estimator'].startswith('ANN'):
            import ANN
            esti = ANN.ANN(jsonData)
        elif jsonData['estimator'] == 'TripleES':
            import TripleES
            esti = TripleES.TripleES(jsonData)
        else:
            est_str = jsonData['estimator']
            print(f'Invalid value for estimator name: {est_str}')
            raise ValueError(error.errors['estimator_config'])

        # esti.parse(jsonData)  # right???
        esti.assign_dataset(dataset)
        return esti
    except FileNotFoundError as err:
        template = "An exception of type {0} occurred. Arguments: {1!r}"
        message = template.format(type(err).__name__, err.args)
        print(message)
        raise ValueError(error.errors['estimator_config'])
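# Hypothetical illustration for the create() factory above: the dispatch only inspects the
# "estimator" field of the loaded JSON, so a minimal config could look like the dict below.
# Any further keys are governed by schemas/estSchema.json, which is not shown here, and the
# concrete estimator name is an assumed example, not taken from the original project.
example_estimator_config = {"estimator": "DecisionTreeClassifier"}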
def setUp(self):
    print("Testing decision-tree induction on sample training file")
    self.dt = DecisionTree.DecisionTree(training_datafile=training_datafile,
                                        csv_class_column_index=1,
                                        csv_columns_for_features=[2, 3, 4, 5])
    self.dt.get_training_data()
    self.dt.calculate_first_order_probabilities()
    self.dt.calculate_class_priors()
    self.root_node = self.dt.construct_decision_tree_classifier()
def test_DT1():
    X = np.array([[0, 0, 0], [0.1, 0.1, 0.1], [1.0, 1.0, 1.0], [.99, .99, .99]])
    y = np.array([0, 0, 1, 1])
    tree = dt.DecisionTree(gt_privacy_p=float(1.0 / 100))
    tree.fit(X, y)
    pred1 = tree.predict(np.array([0.05, 0.05, 0.05]))
    print("pred1 value-0:", pred1)
    pred2 = tree.predict(np.array([0.995, 0.995, 0.995]))
    print("pred2 value-1:", pred2)
def fit(self, X, Y):
    N = len(X)
    d = int(len(X[0]) * 0.5)
    for i in range(N):
        print("Progress:", i, "of ", N)
        # draw a bootstrap sample (with replacement) and fit one tree on it
        sel = np.random.choice(len(X), size=len(X), replace=True)
        Xb, Yb = X[sel], Y[sel]
        model = DecisionTree()
        model.fit(Xb, Yb, d)
        self.models.append(model)
def main(argv):
    # read parameters: number of trees and max depth of the decision tree
    n_trees = int(argv[1])
    max_d = int(argv[2])

    # get training data
    train = load(open(argv[3], 'r'))
    meta = train['metadata']['features']
    train_data = np.array(train['data'])
    n_train = train_data.shape[0]
    K = len(meta[-1][1])

    # get test data
    test = load(open(argv[4], 'r'))
    test_data = np.array(test['data'])
    n_test = test_data.shape[0]

    # initial weights
    w = np.full((n_train), 1.0 / n_train)
    predictions = []
    weights = []
    alphas = []

    # train decision trees with AdaBoost
    epsilon = 0
    for i in range(n_trees):
        tree = dt.DecisionTree()
        tree.fit(train_data[:, :-1], train_data[:, -1], meta, max_d, instance_weights=w)
        train_result = tree.predict(train_data[:, :-1], prob=False)
        test_result = tree.predict(test_data[:, :-1], prob=False)
        match = (train_result == train_data[:, -1]).astype(int)
        err = np.sum(w * (1 - match)) / np.sum(w)
        if (err >= 1 - 1.0 / K):
            break
        weights.append(w)
        predictions.append(test_result)
        alpha = np.log((1 - err) / err) + np.log(K - 1)
        alphas.append(alpha)
        w = w * np.exp(alpha * (1 - match))
        w = w / np.sum(w)

    predictions = np.asarray(predictions).T
    alphas = np.asarray(alphas)
    weights = np.asarray(weights).T

    # calculate ensemble prediction and accuracy
    ens_prediction = np.apply_along_axis(combine_predict, 1, predictions, alphas)
    test_Y = test_data[:, -1]
    return (meta[-1][1], ens_prediction, test_Y)
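# combine_predict is called above but not defined here. The sketch below is only one
# plausible reading of what it might do (an assumption, not the original implementation):
# a SAMME-style weighted vote over one row of per-tree predictions, where each tree's vote
# counts with its alpha weight and the label with the largest total wins.
def combine_predict_sketch(row_predictions, alphas):
    totals = {}
    for label, alpha in zip(row_predictions, alphas):
        totals[label] = totals.get(label, 0.0) + alpha
    return max(totals, key=totals.get)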
def Process_air_quality():
    X, y = LoadData.load_ozone_data()
    Experiments.Models_Comparison(X, y, "Air Pollution")
    op = True
    DT.DecisionTree(X, y, title="Air Pollution Decision Tree", optimize=op)
    AB.AdaBoost(X, y, title="Air Pollution AdaBoost", optimize=op)
    KNN.KNN(X, y, title="Air Pollution KNN", optimize=op)
    NN.NeuralNetwork(X, y, title="Air Pollution Neural Network", optimize=op)
    SVM.SVM(X, y, title="Air Pollution SVM", optimize=op)
def create_a_node(self, node_type):
    # type list:
    #
    # sensor_listener
    # serial_transmit
    # wheel_node
    # arm_node
    # voice_node
    # dt_node
    # UI_node
    # app_train_face
    # vision_node
    a_object = None
    if node_type == 'sensor_listener':
        a_object = SensorListener.SensorListener(self)
    elif node_type == 'serial_transmit':
        a_object = SerialTransmit.SerialTransmit(self)
    elif node_type == 'wheel_node':
        a_object = WheelNode.WheelNode(self)
    elif node_type == 'arm_node':
        a_object = ArmNode.ArmNode(self)
    elif node_type == 'voice_sys':
        a_object = voice.Voice(self)
    elif node_type == 'dt_sys':
        a_object = DT.DecisionTree(self)
        self.dt_node = a_object
    elif node_type == 'UI_sys':
        a_object = UI.RobotControl(self)
    elif node_type == 'app_train_face':
        self.cap = open_camera(self.cap)
        a_object = tfr.TrainFaceRecognition(self, self.cap)
    elif node_type == 'vision_sys':
        self.cap = open_camera(self.cap)
        a_object = vision.Vision(self, self.cap)
    elif node_type == 'default_control':
        a_object = defaultcontrol.DefaultControl(self)
    elif node_type == 'marker_sys':
        self.cap = open_camera(self.cap, [1280, 720])
        a_object = MarkerAPI.Marker(self, self.cap)
    elif node_type == 'face_track':
        a_object = PersonTrack.PersonTrack(self)
    elif node_type == 'roam':
        a_object = Roam.Roam(self)
    elif node_type == 'cloud':
        a_object = Cloud.PersonTrackToCloud(self)
    elif node_type == 'object_detect':
        self.cap = open_camera(self.cap)
        a_object = obj_detect.ObjectRecognition(self, self.cap)
    elif node_type == 'emotion_detect':
        self.cap = open_camera(self.cap)
        a_object = emotion_detection.EmotionDetection(self, self.cap)
    return a_object
def fit(self, x, y):
    data = np.hstack((x, y))
    for i in range(self.max_tree):
        ranData = self.randomSample(data)
        x2 = ranData[:, :-1]
        y2 = ranData[:, -1]
        model = de.DecisionTree(criterion=self.criterion, max_depth=self.max_depth)
        model.fit(x2, y2.reshape(len(y2), 1))
        self.forest.append(model)
    return self
def decision_tree_classification(X, y, test_dat):
    classifier = dt.DecisionTree(45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)
    f = open("spam_predictions_decision_tree.csv", 'w')
    f.write("Id,Category\n")
    for i in range(np.size(test_dat, 0)):
        f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    f.close()
    print("DONE")
def __init__(self):
    self.player_id = None
    self.our_hand = []
    self.our_rule_expression = None
    self.decision_tree = dt.DecisionTree()
    self.god_instance = God.God.get_instance()
    self.num_correct = 0
    self.num_incorrect = 0
    self.card_played = None
    self.num_consecutive_correct = 0
    self.confidence_value = 50
def fit(self, data, data_label):
    self.data = data
    self.data_label = data_label
    self.trees = []
    for _ in range(0, self.n_trees):
        train_index = self.sample()
        train = [data[j] for j in train_index]
        train_label = [data_label[j] for j in train_index]
        dt = DecisionTree.DecisionTree(n_attribute=self.n_attribute, discretize=self.discretize)
        dt.fit(train, train_label)
        self.trees.append(dt)
def fit(self, data, label):
    num_samples, total_features = data.shape
    for tree_num in range(self.num_trees):
        # print("TREE:", tree_num)
        random_rows = np.random.randint(0, num_samples, num_samples)
        random_features = np.random.choice(total_features, self.num_features, replace=False)
        random_data = data[random_rows, :][:, random_features]
        random_labels = label[random_rows]
        dt = DecisionTree(self.max_depth, self.min_obs)
        dt.fit(random_data, random_labels)
        self.trees += [(random_features, dt)]
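# Companion sketch for the fit() above (not part of the original code): one way the stored
# (random_features, dt) pairs could be used at prediction time. It assumes each DecisionTree
# exposes a predict(data) method returning one label per row (method name assumed) and takes
# a plain majority vote across trees.
import numpy as np
from collections import Counter

def predict_forest_sketch(trees, data):
    # each tree only sees the feature columns it was trained on
    per_tree = np.array([tree.predict(data[:, features]) for features, tree in trees])
    # majority vote over trees, one column per sample
    return np.array([Counter(column).most_common(1)[0][0] for column in per_tree.T])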
def bagging_learning(sample_indices, meta, train_data, test_data, max_d):
    resamples = train_data[sample_indices]
    tree = dt.DecisionTree()
    tree.fit(resamples[:, :-1], resamples[:, -1], meta, max_d)
    print(tree.predict(test_data[:, :-1], prob=True))
    # return np.concatenate((tree.predict(test_data[:,:-1],prob=False).reshape((-1,1)),\
    #                        tree.predict(test_data[:,:-1],prob=True)), axis=1)
    return pd.DataFrame({
        "prediction": tree.predict(test_data[:, :-1], prob=False),
        "probs": tree.predict(test_data[:, :-1], prob=True)
    })
def main():
    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename, True)
    test_partition = util.read_arff(opts.test_filename, False)

    # create an instance of the DecisionTree class from the train_partition
    tree = DecisionTree(train_partition, (vars(opts)).get("depth"))
    rootnode = tree.constructsubtree(train_partition, (vars(opts)).get("depth"), 0)

    # print text representation of the DecisionTree
    tree.printtree(rootnode)