def decision_tree(train_features, train_labels, test_features, test_labels, feature_names):
    """Fit a decision-tree regressor, print its errors and export it to ``tree.pdf``.

    Predictions for both splits are passed through ``cap_results`` before the
    mean squared error and R^2 of each split are printed.

    Args:
        train_features: Feature matrix used for fitting.
        train_labels: Regression targets for ``train_features``.
        test_features: Feature matrix of the held-out split.
        test_labels: Regression targets for ``test_features``.
        feature_names: Column names shown in the exported graph.

    Returns:
        Tuple ``(test_results, train_results)`` with the capped predictions.
    """
    regressor = tree.DecisionTreeRegressor()
    regressor.fit(train_features, train_labels)

    test_results = cap_results(regressor.predict(test_features))
    train_results = cap_results(regressor.predict(train_features))

    # Single-argument print calls behave identically on Python 2 and 3.
    print("test result %s" % metrics.mean_squared_error(test_labels, test_results))
    print("test r2 %s" % metrics.r2_score(test_labels, test_results))
    print("train result %s" % metrics.mean_squared_error(train_labels, train_results))
    print("train r2 %s" % metrics.r2_score(train_labels, train_results))

    # Graph export. A regressor has no ``classes_`` attribute, so no class
    # names are passed (the previous ``regressor.classes_`` raised
    # AttributeError).
    dot_data = StringIO()
    tree.export_graphviz(regressor, out_file=dot_data, special_characters=True,
                         impurity=False, feature_names=feature_names)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf("tree.pdf")
    return (test_results, train_results)
def dt_graph(treeest, cv, scores, features, labels, featnames, outfile):
    """Refit the tree on the best-scoring cross-validation fold and save it as a PDF.

    Args:
        treeest: Tree estimator; it is refitted in place.
        cv: Iterable yielding ``(train_indices, test_indices)`` pairs.
        scores: One score per fold; the fold at ``argmax(scores)`` is used.
        features: Feature container, indexed with the training indices.
        labels: Label container, indexed with the training indices.
        featnames: Feature names shown in the exported graph.
        outfile: Path of the PDF file to write.
    """
    bestfold = np.argmax(scores)
    # Walk the folds only to recover the training indices of the best one.
    for fold, (train, _) in enumerate(cv):
        if fold != bestfold:
            continue
        treeest.fit(features[train], labels[train])
        dot_data = StringIO()
        tree.export_graphviz(treeest, out_file=dot_data, feature_names=featnames)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
        if isinstance(graph, (list, tuple)):
            graph = graph[0]
        graph.write_pdf(outfile)
        return
    # Only reached when ``cv`` yields no fold at position ``argmax(scores)``.
    print("You should never see this text from dt_graph!")
    return
def visualize_tree(clf, outname, headers):
    """Export a fitted tree to a PDF file.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.
        outname: Path of the PDF file to write.
        headers: Iterable of feature names (converted to a list).
    """
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # ``sklearn.externals.six`` is gone from newer scikit-learn releases.
        from io import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, feature_names=list(headers))
    dot_source = dot_data.getvalue()
    if isinstance(dot_source, bytes):
        # Python 2 byte string: re-encode latin-1 as UTF-8, as before.
        # On Python 3 the value is already text and has no ``decode``.
        dot_source = dot_source.decode('latin1').encode('utf8')
    graph = pydot.graph_from_dot_data(dot_source)
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(outname)
def main():
    """Train a depth-limited decision tree on a training set and render it.

    The training-set path is read from ``sys.argv[1]``. A
    ``DecisionTreeClassifier(max_depth=6)`` is fitted on the parsed data and
    the resulting tree is written to ``partition.png``. Without an argument a
    usage message is printed and nothing else happens.
    """
    if len(sys.argv) < 2:
        print("One Argument Required; Training Set")
        return

    X_train, Y_train = ParseTraining(sys.argv[1])

    clf = tree.DecisionTreeClassifier(max_depth=6)
    clf = clf.fit(X_train, Y_train)

    feature_names = ["partConf", "recAvg", "latency", "ReadRate", "homeconf"]
    class_names = ["Partition", "No Partition"]

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=class_names,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_png("partition.png")
def tree3():
    """Build a decision tree from the posted form and return the result page.

    On a POST request the form field ``DV`` names the target column of the
    module-level ``df``; every other posted value except ``"Build Tree"`` is
    used as a feature column. The page is assembled from ``template.s1``, the
    raw graphviz text of the fitted tree and the first 15 rows of ``df``, and
    is also stored in the module-level ``final_html``.

    Returns:
        The generated HTML for POST requests, otherwise ``'helloo'``.
    """
    global final_html
    init_style_string = template.style_string
    if request.method != 'POST':
        return 'helloo'

    form = MultiDict(request.form)
    DV_tree = form.get('DV')
    # ``!=`` replaces the Python-2-only ``<>`` operator.
    chi_key = [value for value in form.values()
               if value != "Build Tree" and value != DV_tree]

    features = df.loc[:, chi_key].values
    Y = df[DV_tree]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(features, Y.values)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()

    left_px = 600
    width_px = 150
    top_px = 50
    height_px = 309
    # NOTE(review): the result of build_tree_html is not used in the page
    # below (the raw dot text ``k`` is embedded instead) -- confirm intent.
    s = build_tree_html(k, init_style_string, left_px, width_px, top_px, height_px)

    temp_df = df[0:15]
    t = """</div><div style="width:600px; height:700px; position: absolute; top: 20px; left:500px;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "<br><br></div>" + temp_df.to_html()
    return final_html
def mainTree():
    """Train and evaluate the marriage decision model and export its tree as a PDF.

    Loads the male and female data files, attaches the matched female class
    to the male records, scales the data, prepends 17 PCA components, splits
    it 90/10, trains the decision model, writes the tree to ``marriage.pdf``
    and prints the rate returned by ``test`` for the held-out part.
    """
    header = ('id|gender|age|height|edu|salary|nation|car|house|body|face|hair|'
              'smoke|drink|child|parent|bmi|where0|where1|'
              'marriage0|marriage1|look0|look1|where2').split('|')

    MaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/transed_M.txt',
                           names=header, sep='|')
    FemaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/cluster_female.txt',
                             names=header + ['class'], sep='|')
    matches = matchDict('/home/idanan/jiayuan/code/resources/lovers_ids.txt')

    # Replace each female id via ``match`` and the loaded ``matches``.
    FemaleData['id'] = FemaleData['id'].map(partial(match, matches=matches))
    FemaleClass = FemaleData[['id', 'class']]
    newMaleData = concatData(MaleData, FemaleClass)
    MaleArrays = scaleData(newMaleData, ['id', 'gender'])

    # The last column is excluded from the PCA input.
    pca = factors(MaleArrays[:, :-1], 17)
    print('PCA explained variance: %s' % sum(pca.explained_variance_ratio_))
    pcaMaleArray = pca.transform(MaleArrays[:, :-1])
    MaleArrays = np.c_[pcaMaleArray, MaleArrays]

    trainData, testData = departData(MaleArrays, 0.9)
    trainModel = decisionModel(trainData)

    dot_data = StringIO()
    tree.export_graphviz(trainModel, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf("/home/idanan/jiayuan/code/resources/marriage.pdf")

    rate = test(trainModel, testData)
    print('Decision Model true rate %s' % rate)
def run_DT_model_2(df, criteria_col):
    """Fit a decision tree predicting a 0/1 label column (e.g. high value or not).

    Rows with missing values in the used columns are dropped, the data is
    split 80/20 (stratified on ``criteria_col``), and the test accuracy and
    confusion matrix are printed.

    Args:
        df: DataFrame containing ``criteria_col`` and the feature columns
            listed in ``tree_col`` below.
        criteria_col: Name of the label column.

    Returns:
        Tuple ``(image, tree_train, tree_test)``: the rendered tree as an
        ``IPython.display.Image`` plus the train and test DataFrames.
    """
    from sklearn.metrics import confusion_matrix
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Older scikit-learn releases only ship ``sklearn.cross_validation``.
        from sklearn.cross_validation import train_test_split
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # ``sklearn.externals.six`` is gone from newer scikit-learn releases.
        from io import StringIO
    from IPython.display import Image
    import pydotplus

    print ('criteria_col = ', criteria_col)

    # The label comes first so that ``iloc[:, 1:]`` selects the features.
    tree_col = [criteria_col, 'Frequency', 'LTV', 'period_no_use', 'AverageTimeToOrder',
                'late_by_collection', 'late_by_delivery', 'tickets', 'recleaned_orders',
                'cancalled_orders', 'voucher_used']
    tree_data = df[tree_col].dropna()
    tree_train, tree_test = train_test_split(tree_data, test_size=0.2, random_state=200,
                                             stratify=tree_data[criteria_col])

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(tree_train.iloc[:, 1:], tree_train[criteria_col])
    print (clf.score(tree_test.iloc[:, 1:], tree_test[criteria_col]))

    # Confusion matrix on the held-out part.
    print (confusion_matrix(tree_test[criteria_col], clf.predict(tree_test.iloc[:, 1:])))

    # Render the tree.
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=tree_col[1:],
                         filled=True, rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png()), tree_train, tree_test
def decisionTree():
    """Fit a decision tree on the iris data set and write it to ``iris.pdf``."""
    iris = load_iris()
    # Every hyper-parameter is spelled out explicitly.
    clf = tree.DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        class_weight=None
    )
    clf = clf.fit(iris.data, iris.target)

    dot_data = StringIO()
    tree.export_graphviz(
        clf,
        out_file=dot_data,
        feature_names=iris.feature_names,
        class_names=iris.target_names,
        filled=False,
        rounded=True,
        special_characters=True
    )
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf("iris.pdf")
def classifyTree(Xtr, ytr, Xte, yte, splitCriterion="gini", maxDepth=0, visualizeTree=False):
    """Classify data using a CART decision tree.

    Args:
        Xtr: Training feature vectors.
        ytr: Training labels.
        Xte: Test feature vectors.
        yte: Ground-truth test labels.
        splitCriterion: Split criterion passed to the classifier.
        maxDepth: Maximum tree depth; ``0`` (the default), a negative value
            or ``None`` means unlimited depth.
        visualizeTree: When true, the learned tree is also saved as a PDF.

    Returns:
        Tuple ``(accuracyRate, timing, probabilities, predicted)``. If an
        error occurs it is logged through ``prettyPrint`` and the values
        computed so far are returned (initially ``0.0, 0.0, [], []``).
    """
    # ``predicted`` is initialised as well so that the return statement cannot
    # fail with an unbound name when an exception occurs before prediction.
    accuracyRate, probabilities, timing, predicted = 0.0, [], 0.0, []
    try:
        # scikit-learn rejects max_depth <= 0, which made the default value
        # fail every time; a non-positive depth is mapped to "no limit".
        depthLimit = maxDepth if maxDepth and maxDepth > 0 else None
        cartClassifier = tree.DecisionTreeClassifier(criterion=splitCriterion, max_depth=depthLimit)
        startTime = time.time()
        prettyPrint("Training a CART tree for classification using \"%s\" and maximum depth of %s" % (splitCriterion, maxDepth), "debug")
        cartClassifier.fit(numpy.array(Xtr), numpy.array(ytr))
        prettyPrint("Submitting the test samples", "debug")
        predicted = cartClassifier.predict(Xte)
        endTime = time.time()
        # Compare the predictions with the ground truth.
        accuracyRate = round(metrics.accuracy_score(yte, predicted), 2)
        # Also keep the probability estimates.
        probabilities.append(cartClassifier.predict_proba(Xte))
        timing = endTime - startTime
        if visualizeTree:
            dot_data = StringIO()
            tree.export_graphviz(cartClassifier, out_file=dot_data)
            graph = pydot.graph_from_dot_data(dot_data.getvalue())
            # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
            if isinstance(graph, (list, tuple)):
                graph = graph[0]
            # Build the name once so the log shows the file actually written.
            pdfName = "tree_%s.pdf" % getTimestamp()
            prettyPrint("Saving learned CART to \"%s\"" % pdfName, "debug")
            graph.write_pdf(pdfName)
    except Exception as e:
        prettyPrint("Error encountered in \"classifyTree\": %s" % e, "error")

    return accuracyRate, timing, probabilities, predicted
def tree2():
    """Build a decision tree from the posted form and return the result page.

    On a POST request the form field ``DV`` names the target column of the
    module-level ``df``; every other posted value except ``"Build Tree"`` is
    used as a feature column. The page is assembled from ``template.s1``, the
    raw graphviz text of the fitted tree and the first 15 rows of ``df``, and
    is also stored in the module-level ``final_html``.

    Returns:
        The generated HTML for POST requests, otherwise ``'helloo'``.
    """
    global final_html
    init_style_string = """<p style="position: absolute; font-size: 12px; top: <top>px; width: <width>px; height: <height>px; left:<left>px; text-align: center;">tree_text_here</p>"""
    if request.method != 'POST':
        return 'helloo'

    form = MultiDict(request.form)
    DV_tree = form.get('DV')
    # ``!=`` replaces the Python-2-only ``<>`` operator.
    chi_key = [value for value in form.values()
               if value != "Build Tree" and value != DV_tree]

    features = df.loc[:, chi_key].values
    Y = df[DV_tree]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(features, Y.values)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()

    left_px = 600
    width_px = 150
    top_px = 50
    height_px = 309
    # NOTE(review): the result of build_tree_html is not used in the page
    # below (the raw dot text ``k`` is embedded instead) -- confirm intent.
    s = build_tree_html(k, init_style_string, left_px, width_px, top_px, height_px)

    temp_df = df[0:15]
    t = """</div><div style="float:right;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "</div><br><br><br>" + temp_df.to_html()
    return final_html
def generate_plot(clf):
    """Export the fitted tree ``clf`` to ``weather_forecast.pdf``.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.
    """
    # Parenthesised single-argument prints work on Python 2 and 3 alike.
    print("\nGenerating plot...")
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf("weather_forecast.pdf")
    print("Plot generated!")
def visualize_tree(dtree):
    """Render the fitted tree ``dtree`` as a PNG and show it via ``display``.

    Args:
        dtree: Fitted tree estimator passed to ``tree.export_graphviz``.
    """
    dot_data = StringIO()
    tree.export_graphviz(dtree, out_file=dot_data,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    display(Image(graph.create_png()))
def train_network(self):
    """Fit the decision tree on a training batch and export it to ``hitter_tree.pdf``.

    The pregame hitter entries are split 80/20 into training and evaluation
    data, the tree is fitted on one stochastic batch of the training data,
    and for every evaluation entry the matching postgame entry is looked up.
    Entries without postgame stats are reported and skipped. The database
    session is closed when the method finishes, whether or not it succeeded.
    """
    db_query = self._database_session.query(PregameHitterGameEntry)
    try:
        mlb_training_data, mlb_evaluation_data = self.get_train_eval_data(db_query, 0.8)
        X_train, Y_train = self.get_stochastic_batch(mlb_training_data, self.SIZE_TRAINING_BATCH)
        self._decision_tree.fit(X_train, Y_train)

        dot_data = StringIO()
        tree.export_graphviz(self._decision_tree, out_file=dot_data,
                             feature_names=PregameHitterGameEntry.get_input_vector_labels())
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf("hitter_tree.pdf")

        # NOTE(review): these two lists are filled but never read in this
        # method -- confirm whether an evaluation step is missing.
        x_test_actual = list()
        y_test_actual = list()
        for data in mlb_evaluation_data:
            try:
                postgame_entry = self._database_session.query(PostgameHitterGameEntry).filter(
                    PostgameHitterGameEntry.rotowire_id == data.rotowire_id,
                    PostgameHitterGameEntry.game_date == data.game_date).one()
            except NoResultFound:
                print("Ignoring hitter %s since his postgame stats were not found." % data.rotowire_id)
                continue
            y_test_actual.append([postgame_entry.actual_draftkings_points])
            x_test_actual.append(data.to_input_vector())
    finally:
        # Release the session even when fitting, exporting or a query fails.
        self._database_session.close()
def create_tree(X, Y):
    """Fit an entropy-based decision tree on ``X``/``Y`` and save it as ``Tree.pdf``.

    Args:
        X: Training feature matrix.
        Y: Training labels.

    Returns:
        The fitted classifier.
    """
    clf = tree.DecisionTreeClassifier(criterion="entropy").fit(X, Y)

    from IPython.display import Image
    import pydotplus

    dot_data = StringIO()
    feature_names = ["Gender", "0-5", "6-12", "13-19", "20-27", "28-35", "36-50", "55+"]
    # One "Ad #k" label per training sample, numbered from 1.
    target_names = ["Ad #" + str(number) for number in range(1, len(Y) + 1)]

    export_options = dict(
        out_file=dot_data,
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    tree.export_graphviz(clf, **export_options)
    pydotplus.graph_from_dot_data(dot_data.getvalue()).write_pdf("Tree.pdf")
    return clf
def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train a decision tree, export it to ``DecisionTree.dot`` and report test performance.

    Args:
        trainData: Training feature vectors.
        trainTargets: Training labels.
        testData: Test feature vectors.
        testTargets: Ground-truth test labels.
        featureNames: Feature names used in the exported graph.
    """
    decisionTree = DecisionTreeClassifier()
    model = decisionTree.fit(trainData, trainTargets)

    # Write the graph description (limited to depth 5) to a dot file. The
    # earlier in-memory export was parsed into a graph that was never used.
    print("Feature names:", featureNames)
    export_graphviz(model, out_file="DecisionTree.dot", feature_names=featureNames, max_depth=5)

    # Predict the whole test set in one call: scikit-learn expects 2-D input,
    # so predicting one 1-D sample at a time fails on current releases.
    classification = list(model.predict(testData)) if len(testData) else []

    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
def drawDecisionTree(dt, filename, featureNames, classNames):
    """Render the fitted tree ``dt`` to a PNG file.

    Args:
        dt: Fitted tree estimator passed to ``tree.export_graphviz``.
        filename: Path of the PNG file to write.
        featureNames: Feature names shown in the graph (also printed).
        classNames: Class names shown in the graph (also printed).
    """
    dot_data = StringIO()
    # Parenthesised single-argument prints work on Python 2 and 3 alike.
    print(featureNames)
    print(classNames)
    tree.export_graphviz(dt, out_file=dot_data,
                         feature_names=featureNames,
                         class_names=classNames,
                         rounded=True,
                         special_characters=True,
                         filled=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_png(filename)
def printTreePDF(self, path = './tree.pdf'):
    """Write the fitted tree ``self.clf`` to a PDF file.

    Args:
        path: Destination of the PDF file.

    Raises:
        NameError: If no tree has been created yet (``self.clf`` is None).
    """
    # Identity test: ``== None`` may invoke a custom ``__eq__``.
    if self.clf is None:
        raise NameError('Tree was not created!')
    dot_data = StringIO()
    tree.export_graphviz(self.clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(path)
def createGraph(clf):
    """Export the fitted tree ``clf`` to ``portScan.dot`` and ``portScan.pdf``.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.
    """
    # The file object is no longer rebound to export_graphviz's return value
    # while the ``with`` block still owns it.
    with open("portScan.dot", 'w') as dot_file:
        tree.export_graphviz(clf, out_file=dot_file)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf("portScan.pdf")
def printPdf(clf, dataTrain):
    """Export the fitted tree to ``sentiment.pdf`` and print the first training sample.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.
        dataTrain: Object whose ``data[0]`` is printed afterwards.
    """
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # ``sklearn.externals.six`` is gone from newer scikit-learn releases.
        from io import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf('sentiment.pdf')
    print(dataTrain.data[0])
def export_tree(clf, filename, feature_names=None, max_depth=None):
    """Export a fitted tree to a PDF file.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.
        filename: Path of the PDF file to write.
        feature_names: Optional feature names shown in the graph.
        max_depth: Optional maximum depth of the exported representation.
    """
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # ``sklearn.externals.six`` is gone from newer scikit-learn releases.
        from io import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names, max_depth=max_depth)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(filename)
def drawDecisionTree(classIndex):
    """Fit a tree for one target and save it as ``decisionTree_<classIndex>.pdf``.

    The tree is trained on the module-level ``preference`` features against
    ``y[classIndex]`` and labelled with the module-level ``fname`` names.

    Args:
        classIndex: Index into ``y`` selecting the target to learn.
    """
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(preference, y[classIndex])

    dot_data = StringIO()
    # TODO: change it: class_names = cnames[classIndex]
    tree.export_graphviz(clf, out_file=dot_data, feature_names=fname,
                         filled=True, rounded=True, special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    filename = "decisionTree_" + str(classIndex) + ".pdf"
    graph.write_pdf(filename)
def tree_vis(clf):
    """Render the fitted tree to ``data/trees/tree.png`` and return it as an image.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.

    Returns:
        ``Image`` object created from the written PNG file.
    """
    fn = 'data/trees/{0}.png'.format('tree')
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_png(fn)
    return Image(filename=fn)
def view(classifier):
    """Render a graph representation of ``classifier`` and save it as ``MyTree.pdf``.

    The relative file name is resolved against the current working directory.

    Args:
        classifier: Fitted tree estimator passed to ``tree.export_graphviz``.
    """
    tree_dot = StringIO()
    tree.export_graphviz(classifier, out_file=tree_dot)
    graph = pydot.graph_from_dot_data(tree_dot.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf("MyTree.pdf")
def __plotTree(clf, name):
    """Write the fitted tree as ``<outputdir><name>.pdf``.

    A graphviz file named ``<outputdir><name>`` is written first and removed
    again once the PDF step has finished.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.
        name: Base name of the output, appended to the module-level ``outputdir``.
    """
    dot_path = outputdir + name
    tree.export_graphviz(clf, out_file=dot_path)
    try:
        dot_data = StringIO()
        tree.export_graphviz(clf, out_file=dot_data)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
        if isinstance(graph, (list, tuple)):
            graph = graph[0]
        graph.write_pdf(dot_path + '.pdf')
    finally:
        # Remove the intermediate file even when the PDF step fails.
        os.remove(dot_path)
#plot utilities
def train_decision_tree_elite_status_classifier():
    """Train and validate a decision tree model for predicting users' Elite status.

    The fitted model's decision rules are written to
    ``analysis/analysis_results/decision_tree.pdf``.
    """
    model = train_and_validate_elite_status_classifier(DecisionTreeClassifier, DECISION_TREE_USER_ATTRIBUTES)

    # Output a tree representation showing the decision rules.
    dot_data = StringIO()
    tree.export_graphviz(model, out_file=dot_data, class_names=True, filled=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf('analysis/analysis_results/decision_tree.pdf')
def save_tree_png(self, store):
    """Render ``self.clf`` as ``tree.png`` inside ``store.dataset_path``.

    Args:
        store: Object whose ``dataset_path`` names the target directory.
    """
    import pydot
    from sklearn.externals.six import StringIO

    buffer = StringIO()
    tree.export_graphviz(self.clf, out_file=buffer, feature_names=self.feature_names)
    # The first graph of the parsed result is the tree.
    parsed = pydot.graph_from_dot_data(buffer.getvalue())
    tree_graph = parsed[0]
    target = store.dataset_path + '/tree.png'
    with open(target, 'wb') as png_file:
        png_file.write(tree_graph.create_png())
def export(self, fpath):
    """Export the decision tree ``self.model`` as a PDF file.

    Args:
        fpath: Path of the PDF file to write.

    Returns:
        None
    """
    dot_data = StringIO()
    tree.export_graphviz(self.model, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(fpath)
def drawTree(X, y, names, depth, outFile, writePickle, pickleFile):
    """Fit a depth-limited decision tree, save it as a PDF and optionally pickle it.

    Args:
        X: Training feature matrix.
        y: Training labels.
        names: Feature names shown in the graph.
        depth: Maximum depth of the tree.
        outFile: Path of the PDF file to write.
        writePickle: When true, the fitted classifier is pickled as well.
        pickleFile: Path of the pickle file (only used with ``writePickle``).

    Returns:
        The fitted classifier.
    """
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(X, y)

    dot_data = StringIO()
    tree.export_graphviz(clf, feature_names=names, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf(outFile)

    if writePickle:
        # The context manager closes the file handle, which was leaked before.
        with open(pickleFile, "wb") as pickle_handle:
            pickle.dump(clf, pickle_handle)
    return clf
def createTreePdf(self):
    """Write the classifier's tree to ``DT<class names joined by '-'>.pdf``.

    Does nothing when ``pydot`` is not installed.
    """
    try:
        import pydot
    except ImportError:
        # pydot is optional; without it the PDF is simply not produced.
        return

    dot_data = StringIO()
    tree.export_graphviz(self.getClf(), out_file=dot_data,
                         feature_names=self.featureNames)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs; older releases return one Dot.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_pdf("DT" + "-".join(self.classNames) + ".pdf")
def printTree(self, export_pdf=True, pdf_name="Decision_Tree.pdf"):
    """Build the graphviz graph of ``self.alg`` and optionally write it as a PDF.

    Args:
        export_pdf: When true, the graph is written to ``pdf_name``.
        pdf_name: Path of the PDF file to write.

    Returns:
        Whatever ``pydot.graph_from_dot_data`` returned (a single graph with
        old pydot releases, a list of graphs with pydot >= 1.2).
    """
    dot_data = StringIO()
    export_graphviz(self.alg, out_file=dot_data, feature_names=self.predictors,
                    filled=True, rounded=True, special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    if export_pdf:
        # Unwrap the list only for writing, so the value handed back to
        # callers keeps the shape it had before.
        writable = graph[0] if isinstance(graph, (list, tuple)) else graph
        writable.write_pdf(pdf_name)
    return graph
# Select the held-out test samples from the dataset.
test_target = iris.target[test_id_test]
test_data = iris.data[test_id_test]

# Fit the classifier on the training data.
classifier = tree.DecisionTreeClassifier()
classifier.fit(train_data, train_target)

# Show the true labels next to the predicted ones.
print(f'Test target: {test_target}')
print(f'Predict target: {classifier.predict(test_data)}')

# Decision-tree visualization.
# ``sklearn.externals.six`` is gone from newer scikit-learn releases; this
# script already requires Python 3 (f-strings), where io.StringIO is the
# equivalent class.
from io import StringIO
import pydot
dot_data = StringIO()
tree.export_graphviz(classifier, out_file=dot_data, feature_names=iris.feature_names,
                     class_names=iris.target_names, filled=True, rounded=True, impurity=False)

# The graphviz module renders the graph to "iris" and opens a viewer.
import graphviz as gp
graph = gp.Source(dot_data.getvalue())
graph.render("iris", view=True)
def train_holdout(base_dir, classifier_name, classifier):
    """Train ``classifier`` on a hold-out split of the data in ``base_dir`` and report results.

    ``<base_dir>/features.csv`` (first column: image folder, remaining
    columns: features) and ``<base_dir>/Y.csv`` (labels) are read, split
    70/30 into train and test, and the classifier is trained -- incrementally
    with early stopping when it offers ``partial_fit``, otherwise with a
    plain ``fit``. Misclassified test rows, the test accuracy and a confusion
    matrix heat map are reported; a ``DecisionTreeClassifier`` is also
    exported to ``<classifier_name>.pdf``.

    Args:
        base_dir: Directory holding ``features.csv`` and ``Y.csv``.
        classifier_name: Label used in messages, plot titles and file names.
        classifier: Estimator to train (modified in place).

    Returns:
        Tuple ``(classifier, X_test, y_test)``.
    """
    base = pd.read_csv(f'{base_dir}/features.csv', sep=';', header=None)
    # The first column holds the image folder of each sample.
    images_dirs = base.iloc[:, 0]
    # Every remaining column is a feature.
    X = base.iloc[:, 1:]
    # Class label of each sample.
    y = pd.read_csv(f'{base_dir}/Y.csv', sep=';', header=None)

    # Hold-out: 70% of the base for training, 30% for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42, stratify=y)
    all_classes = np.unique(y.to_numpy())

    def overfitting_prevent_train(X_train, y_train):
        """Train with ``partial_fit`` chunk by chunk, stopping when validation error stalls."""
        # Keep 20% of the training split aside for validation.
        X_train, X_validation, y_train, y_validation = train_test_split(
            X_train, y_train, test_size=.2, random_state=42, stratify=y_train)
        chunk_size = 10
        graphic_data = []
        current_epoch = 0
        alpha = 0.8
        smooth_error_training = []
        smooth_error_validation = []
        overfitting_count = 0
        for chunk in chunk_generator(pd.concat([X_train, y_train], axis=1), chunk_size=chunk_size):
            classifier.partial_fit(chunk[0], np.ravel(chunk[1]), classes=all_classes)
            current_error_on_training = 1 - classifier.score(X_train, np.ravel(y_train))
            current_error_on_validation = 1 - classifier.score(X_validation, np.ravel(y_validation))
            graphic_data.append(
                GraphicData(x=current_epoch, error_on_val=current_error_on_validation,
                            error_on_train=current_error_on_training)
            )
            if current_epoch == 0:
                smooth_error_training.append(current_error_on_training)
                smooth_error_validation.append(current_error_on_validation)
            else:
                # Exponential smoothing over the previous epoch's raw error.
                smooth_error_training.append(
                    alpha * smooth_error_training[current_epoch - 1]
                    + (1 - alpha) * graphic_data[current_epoch - 1].error_on_train)
                smooth_error_validation.append(
                    alpha * smooth_error_validation[current_epoch - 1]
                    + (1 - alpha) * graphic_data[current_epoch - 1].error_on_val)
            if current_epoch >= 1:
                # Count consecutive epochs whose smoothed validation error
                # improved by less than 1e-3; five in a row stop the training.
                # NOTE(review): the reset branch is assumed to belong to this
                # comparison -- confirm against the original layout.
                if smooth_error_validation[-2] - smooth_error_validation[-1] < 1e-03:
                    overfitting_count += 1
                    if overfitting_count >= 5:
                        print(f'Overfitting Detected on {classifier_name}, epoch: {current_epoch}')
                        break
                else:
                    overfitting_count = 0
            current_epoch += 1

        print(f'Acurácia {classifier_name} Validação: {classifier.score(X_validation, np.ravel(y_validation))}')
        plt.title(f'Error Compare --> {classifier_name} --> {base_dir}')
        epochs = [data.x for data in graphic_data]
        plt.plot(epochs, smooth_error_training, label='Error on Train Smooth')
        plt.plot(epochs, smooth_error_validation, label='Error on Validation Smooth')
        plt.ylabel('Error')
        plt.xlabel('Epoch')
        plt.legend()
        plt.show()

    def normal_fit():
        """Fit the classifier on the whole training split in one call."""
        classifier.fit(X_train, y_train)

    def visualizate_data():
        """Scatter-plot a 2-D t-SNE embedding of all samples, coloured by class."""
        x_tsne = TSNE(n_components=2).fit_transform(X)
        y_aux = pd.DataFrame()
        y_aux['classes'] = y[0]
        tsne_df = pd.DataFrame()
        tsne_df['tsne-x'] = x_tsne[:, 0]
        tsne_df['tsne-y'] = x_tsne[:, 1]
        tsne_df = pd.concat([tsne_df, y_aux], axis=1)
        plt.figure(figsize=(16, 10))
        # Keyword arguments: newer seaborn releases reject positional x/y.
        sns.scatterplot(x='tsne-x', y='tsne-y', hue="classes", legend='full',
                        palette=sns.color_palette("hls", 10), alpha=0.3, data=tsne_df)
        plt.show()

    if hasattr(classifier, 'partial_fit'):
        overfitting_prevent_train(X_train, y_train)
    else:
        normal_fit()

    predicated_rows = classifier.predict(X_test)
    predicated_proba = classifier.predict_proba(X_test)

    # ``row`` indexes the probability matrix; ``expected[0]`` is the original
    # row label, which selects the image folder.
    for row, (predicated, expected) in enumerate(zip(predicated_rows, y_test.iterrows())):
        if expected[1][0] != predicated:
            img_dir = images_dirs[expected[0]]
            percent = predicated_proba[row][np.where(all_classes == predicated)]
            print(f'Confundiu {img_dir} com {predicated}, proba: {percent}')

    print(f'Acurácia {classifier_name} Teste : {classifier.score(X_test, y_test)}')
    cm = confusion_matrix(y_test, predicated_rows)
    df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)
    sns.heatmap(df_cm, annot=True)
    plt.title(f'Confusion Matrix --> {classifier_name} --> {base_dir}')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    # visualizate_data()

    if isinstance(classifier, DecisionTreeClassifier):
        # ``sklearn.externals.six`` is gone from newer scikit-learn releases;
        # on Python 3 io.StringIO is the equivalent class.
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydot
        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data, rounded=True, filled=True)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
        graph.write_pdf(f'{classifier_name}.pdf')

    return classifier, X_test, y_test
# NOTE: in this script ``tree`` is the estimator instance, not sklearn.tree.
tree.fit(x_train, y_train)
print(tree.score(x_train, y_train))
print(tree.score(x_test, y_test))

# Export the fitted tree to a dot file on disk.
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file='cancertree.dot', class_names=['m','b'],
                feature_names=cancer.feature_names, filled=True)

import pydot
import graphviz
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # ``sklearn.externals.six`` is gone from newer scikit-learn releases.
    from io import StringIO

# Render the same tree to a PNG through pydot.
dotfile = StringIO()
export_graphviz(tree, out_file=dotfile, class_names=['m','b'],
                feature_names=cancer.feature_names, filled=True)
png_graph = pydot.graph_from_dot_data(dotfile.getvalue())
# pydot >= 1.2 returns a list of graphs; older releases return one Dot.
if isinstance(png_graph, (list, tuple)):
    png_graph = png_graph[0]
png_graph.write_png("dtree2.png")

import matplotlib.image as mpimg
# Read back the file written above (it was looked up as 'dtree2.png.png').
img = mpimg.imread('dtree2.png')
plt.imshow(img)
plt.show()

from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
def plot(seed):
    """Cross-validate a decision tree on ``cryo.csv`` and plot its ROC curve.

    The tab-separated file is shuffled with the given seed; the first six
    columns are the features and the last column is the label. A
    ``DecisionTreeClassifier`` configured by ``createHyperParameters(seed)``
    is evaluated with 5-fold cross-validation. The classifier left over from
    the last fold is exported as a coloured graph and scored on the whole
    data set.

    Args:
        seed: Seed for ``random``, ``numpy.random`` and the hyper-parameters.

    Returns:
        Tuple ``(ACCVALMIN, ACCVALMAX, ACCVALMEAN, ROCAREA, params, graph)``:
        minimum, maximum and mean fold accuracy, the area under the ROC
        curve on the whole set, the hyper-parameters used and the pydotplus
        graph of the tree.
    """
    filename = 'cryo.csv'
    random.seed(seed)
    np.random.seed(seed)

    data = pd.read_csv(filename, sep='\t')
    data = data.sample(frac=1).reset_index(drop=True)
    X = data.values[:, 0:6]
    y = data.values[:, -1]
    names = list(data)[0:6]

    params = createHyperParameters(seed)
    # random_state has no effect without shuffling, and newer scikit-learn
    # releases raise ValueError for that combination, so it is not passed.
    kf = KFold(n_splits=5, shuffle=False)
    acc_history = []
    reg = DecisionTreeClassifier()
    reg.set_params(**params)

    for (train_indices, val_indices) in kf.split(X, y):
        xtrain, xval = X[train_indices], X[val_indices]
        ytrain, yval = y[train_indices], y[val_indices]
        reg.fit(xtrain, ytrain)
        ypred = reg.predict(xval)
        acc_history.append(accuracy_score(yval, ypred))

    ACCVALMIN = np.min(acc_history)
    ACCVALMAX = np.max(acc_history)
    ACCVALMEAN = np.mean(acc_history)

    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # ``sklearn.externals.six`` is gone from newer scikit-learn releases.
        from io import StringIO
    from sklearn.tree import export_graphviz
    import pydotplus
    import collections

    dot_data = StringIO()
    export_graphviz(reg, out_file=dot_data, feature_names=names,
                    filled=True, rounded=True, special_characters=True)

    colors = ('turquoise', 'orange')
    edges = collections.defaultdict(list)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

    # Collect the children of every node, then colour the two children of
    # each parent, in sorted order, with the two colours above.
    for edge in graph.get_edge_list():
        edges[edge.get_source()].append(int(edge.get_destination()))
    for edge in edges:
        edges[edge].sort()
        for i in range(2):
            dest = graph.get_node(str(edges[edge][i]))[0]
            dest.set_fillcolor(colors[i])

    predictions = reg.predict_proba(X)
    ROCAREA = roc_auc_score(y, predictions[:, 1])
    ypr = reg.predict(X)
    print("Global Set Accuracy Score : %.4f" % (accuracy_score(y, ypr)))
    print("Area under ROC Curve : %.4f" % (ROCAREA))

    fpr, tpr, _ = roc_curve(y, predictions[:, 1])
    plt.clf()
    plt.plot(fpr, tpr)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curve')
    plt.show()
    return (ACCVALMIN, ACCVALMAX, ACCVALMEAN, ROCAREA, params, graph)
ac1
# Accuracy (in percent) of the decision tree.
ac2 = 100*accuracy_score(y_test, pred2)
ac2
# Accuracy (in percent) of Naive Bayes.
ac3 = 100*accuracy_score(y_test, pred3)
ac3

try:
    from sklearn.externals.six import StringIO
except ImportError:
    # ``sklearn.externals.six`` is gone from newer scikit-learn releases.
    from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
import pydot

# Render the decision tree (clf2) as a PNG image.
dotfile = StringIO()
export_graphviz(clf2, out_file=dotfile, filled=True, rounded=True, special_characters=True)
(graph,) = pydot.graph_from_dot_data(dotfile.getvalue())
Image(graph.create_png())

# Bar chart comparing the three classifiers; the labels and the values are
# listed in the same order.
objects = ('NaiveBayes', 'Decision Tree', 'RandomForest',)
y_pos = np.arange(len(objects))
performance = [ac3, ac2, ac1]
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
# NOTE(review): the axis shows accuracy values but is labelled 'Usage'.
plt.ylabel('Usage')
plt.title('Classifier Accuracy')
plt.show()
# customer_segment(customerslist, productname)
#
# Builds one feature row per customer from the results of several
# ``session.run`` queries, fits a ``tree.DecisionTreeClassifier`` that
# separates customers who bought ``productname`` (target 1) from those who did
# not (target 0), and writes the tree to ``search/static/stuff/segment.png``
# under ``base_dir``.
#
# Steps visible in the code:
#   1. Per product category, collect the distinct prices and derive a start,
#      an end and a bucket size ((end - start) / 10).
#   2. Per customer, query the 0/1 purchase flag for ``productname`` and the
#      customer's country.
#   3. Per customer, store three vectors in ``globals()`` under the keys
#      ``<id> + ' vector'``, ``<id> + ' category-wise purchase vector'`` and
#      ``<id> + ' bucket_vector'``; the first row of the spend query (ordered
#      by total spent, descending) supplies ``totalspent`` and ``category``.
#   4. Derive a price-sensitivity label through ``drop_in_bucket`` and call
#      ``ageassign(pricesensi)``.
#   5. Assemble a DataFrame, one-hot encode ``p_s`` and ``category``, drop
#      ``country``, fit the tree and export it with class names 'No'/'Yes'.
#
# NOTE(review): the statements of this function are collapsed onto a few
#   physical lines, so the original nesting cannot be read from the text.
# NOTE(review): the lists appended to here (actualcategoryspresent, starts,
#   ends, bucketsizes, cat, target, countryofcustomer, totalspent, category,
#   most_buyed_item_cost, pricesensi) and ``ages`` are not defined in this
#   function -- presumably module-level state that keeps growing across
#   calls; verify.
# NOTE(review): customer ids and ``productname`` are concatenated directly
#   into the query text; confirm these values are trusted, or switch to query
#   parameters.
# NOTE(review): the first query joins '...toFloat(r.Actual_Price)' and
#   'return ...' without a separating space -- confirm the query still parses.
def customer_segment(customerslist, productname): g = globals() xx='match(c:customerid)-[r:Bought_this]->(s:stockcode) ' \ 'set r.Actual_Price=toFloat(r.Actual_Price)' \ 'return s.Category,collect(DISTINCT r.Actual_Price) as pricelist ' \ 'order by s.Category, pricelist' pricebucket = session.run(xx) for i in pricebucket: # print(i) actualcategoryspresent.append(i['s.Category']) if len(i['pricelist']) == 1: start = (i['pricelist'][0]) - 1 end = (i['pricelist'][0]) + 1 else: start = min(i['pricelist']) end = max(i['pricelist']) bucketsize = (end - start) / 10 starts.append(start) ends.append(end) bucketsizes.append(bucketsize) cat.append(i['s.Category']) # print('start :',start) # print('end :',end) # print('bucket_size:',bucketsize) for i in customerslist: x = 'optional MATCH(c:customerid{CustomerID: "' + i + '"})-[r:Bought_this]->(s:stockcode{Description: "' + productname + '"}) ' \ 'return distinct ' \ 'case ' \ 'when r.Quantity IS NULL THEN 0 ' \ 'when r.Quantity IS NOT NULL THEN 1 ' \ 'else r.Quantity END AS Quantity ' for ii in session.run(x): target.append(ii[0]) for i in customerslist: x = 'match(c1:customerid{CustomerID: "' + i + '"})-[r1:Bought_this]->(s1:stockcode) return c1.Country limit 1' result = session.run(x) for ii in result: countryofcustomer.append(ii[0]) for i in customerslist: g[i + ' vector'] = [0] * len(cat) g[i + ' category-wise purchase vector'] = [0] * len(cat) g[i + ' bucket_vector'] = [0] * 4 x='match(c1:customerid{CustomerID: "'+i+'"})-[r1:Bought_this]->(s1:stockcode) ' \ 'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price)' \ 'return c1.CustomerID as CustomerID, s1.Category as Category,reduce(sum=0, i in collect(r1.Quantity * r1.Price) | sum + i) as totalspent , reduce(sum=0, i in collect(r1.Quantity) | sum + i) as Quantity ' \ 'order by totalspent desc ' \ result = session.run(x) count = 0 for i in result: count = count + 1 if count == 1: totalspent.append(i['totalspent']) category.append(i['Category']) 
#print(i['totalspent']) if i['Category'] in cat: vv = cat.index(i['Category']) g[i['CustomerID'] + ' category-wise purchase vector'][vv] = i['Quantity'] else: if i['Category'] in cat: vv = cat.index(i['Category']) g[i['CustomerID'] + ' category-wise purchase vector'][vv] = i['Quantity'] for i in range(0, len(customerslist)): x='match(c1:customerid{CustomerID:"'+customerslist[i]+'"})-[r1:Bought_this]->(s1:stockcode{Category:"'+category[i]+'"}) ' \ 'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) ' \ 'return c1.CustomerID as CustomerID,collect(r1.Actual_Price) as prices_in_category' result = session.run(x) for yy in result: # print(yy[1]) most_buyed_item_cost.append(float(Most_Common(yy[1]))) for i in customerslist: dd = 'match(c1:customerid{CustomerID: "' + i + '"})-[r1:Bought_this]->(s1:stockcode) set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) return c1.CustomerID as CustomerID, collect( distinct s1.Category) as Categoryss' result = session.run(dd) for uu in result: for hh in uu['Categoryss']: #print(hh) if hh in cat: vv = cat.index(hh) g[i + ' vector'][vv] = 1 for t in range(0, len(customerslist)): if category[t] in cat: vv = cat.index(category[t]) l = drop_in_bucket(starts[vv], ends[vv], bucketsizes[vv], most_buyed_item_cost[t]) pricesensi.append(l) ageassign(pricesensi) df = pd.DataFrame(columns=[ 'age', 'p_s', 'category', 'totalspent', 'total_cat_bought', 'country' ]) for t in range(0, len(customerslist)): #print(t) if category[t] in cat: vv = cat.index(category[t]) # print('------------------------------------------------------------------------------------------------------------------') # print('customer :',customerslist[t]) # print('customer age :',ages[t]) # print('country :',countryofcustomer[t]) # print('total spent :',totalspent[t]) # print('category spent :',category[t]) # print('customer category vector :',g[customerslist[t]+' vector']) if pricesensi[t] == 'High': g[customerslist[t] + ' bucket_vector'][0] = 1 if 
pricesensi[t] == 'Medium High': g[customerslist[t] + ' bucket_vector'][1] = 1 if pricesensi[t] == 'Medium Low': g[customerslist[t] + ' bucket_vector'][2] = 1 if pricesensi[t] == 'Low': g[customerslist[t] + ' bucket_vector'][3] = 1 # print('customer category-wise purchase vector :', g[customerslist[t] + ' category-wise purchase vector']) # print('customer bucket vector :',g[customerslist[t] +' bucket_vector']) # print('starting price of that category :',starts[vv]) # print('ending price of that category :',ends[vv]) # print('bucket size of category :',bucketsizes[vv]) # print('most buyed item cost :',most_buyed_item_cost[t]) # print('price sensitivity :',pricesensi[t]) # print('total catogories bought :',sum(g[customerslist[t]+' vector'])) # print('Yes/No :',target[t]) df = df.append( { 'age': ages[t], 'p_s': pricesensi[t], 'category': category[t], 'totalspent': totalspent[t], 'total_cat_bought': int( sum(g[customerslist[t] + ' vector'])), 'country': countryofcustomer[t] }, ignore_index=True) # print(df) df = df.drop('country', axis=1) hot_ps = pd.get_dummies(df.p_s) df = df.join(hot_ps) df = df.drop('p_s', axis=1) hot_category = pd.get_dummies(df.category) df = df.join(hot_category) df = df.drop('category', axis=1) # hot_country = pd.get_dummies(df.country) # df=df.join(hot_country) # df=df.drop('country',axis=1) data = df.values train_target = target train_data = data # print(train_target) x = tree.DecisionTreeClassifier() print("##################____________________productname", productname) print(train_data) print(train_target) x.fit(train_data, train_target) dot_data = StringIO() tree.export_graphviz(x, out_file=dot_data, feature_names=df.columns.tolist(), class_names=['No', 'Yes'], filled=True, rounded=True, impurity=False) # tree.export_graphviz(x,out_file='tree.dot') for i in dot_data: print(i) graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) # print(graph) #os.remove("D:\E-commerce_data_visualization\search\static\stuff/segment.png") # 
os.remove(file) for file in os.listdir('path/to/directory') if file.endswith('.png') # os.system(rm ,"D:\E-commerce_data_visualization\search\static\stuff/segment.png") graph.write_png( os.path.join(base_dir, 'search/static/stuff/segment.png')) print(productname, "productname727")
def buildTree(self): k = 0 #Until now I considered as threshold for test reward: # 1- Without considering FER: TEST_REWARD = 76.0 # 2 - Without considering SER: TEST_REWARD = 76.0 # 4 - Without considering ES: TEST_REWARD = 76.0 TEST_REWARD = 76.0 # I want to know the average reward of all features (in this case three features) after 25 run avgRewAllFeatures = [] # I want to know the average error of all features (in this case three features) after 25 run total_error = np.array([]) # confidenceIntervalFeatures is a dictionary containing the Monte Carlo error considered for the self.num_trees_optimal policy trees for each run of the algorithm without considering the i-th feature (in this case the i-th key) confidenceIntervalFeatures = dict() for elem in self.features: confidenceIntervalFeatures[elem] = np.array([]) # Average reward after self.n_runs for trees without a feature averageRewardFeatures = dict() for elem in self.features: averageRewardFeatures[elem] = np.array([]) total_importance = [ 0.0, 0.0, 0.0 ] # it considers the importance of all runs of the algorithm while (k < self.num_run): print("RUN " + str(k)) # first tree based on the randomly generated buffer self.initializeBuffer() # Fit regression model self.current_tree.fit(self.X, self.y) # i is used to count the number of updates a tree has done, j is used to update the value of epsilon # lastRun is used to count how many trees you want to build considering the optimal policy (epsilon = 0) after reaching the stopping criteria for training i = j = flag = lastRun = 0 scores = [] slidingWindowReward = [] while (lastRun < self.num_trees_optimal_policy): print("Tree number: " + str(j)) total_reward = 0 epsilon = self.get_epsilon(j) current_state = self.env.reset() if (len(slidingWindowReward) == self.slidingWindow): if (sum(slidingWindowReward) / len(slidingWindowReward) > TEST_REWARD): total_importance += self.current_tree.feature_importances_ print("Total importance: " + str(total_importance)) lastRun += 
1 epsilon = 0 # in order to consider the optimal policy we set epsilon = 0 else: slidingWindowReward = [ ] # we clear the slidingWindow if the samples considered don't match our threshold (TEST_REWARD) while (i < self.update): if ("Speech Emotion Recognition" not in self.features): listx = list(current_state) listx.remove(current_state[1]) tuplex = tuple(listx) current_state = tuplex if ("Object State" not in self.features): listx = list(current_state) listx.remove(current_state[2]) tuplex = tuple(listx) current_state = tuplex if (("Speech Emotion Recognition" in self.features) and ("Object State" in self.features)): current_state = current_state[self.initialIndex:self. finalIndex] action = self.choose_action(current_state, epsilon) obs, reward, done, _ = self.env.step(action) temp = obs if ("Speech Emotion Recognition" not in self.features): listx = list(obs) listx.remove(obs[1]) tuplex = tuple(listx) obs = tuplex if ("Object State" not in self.features): listx = list(obs) listx.remove(obs[2]) tuplex = tuple(listx) obs = tuplex if (("Speech Emotion Recognition" in self.features) and ("Object State" in self.features)): obs = obs[self.initialIndex:self.finalIndex] total_reward += reward q_current = self.current_tree.predict([current_state]) q_new = self.current_tree.predict([obs]) q_current[0][action] = reward + self.gamma * np.max( q_new[0]) self.buffer.append( [current_state, action, obs, q_current[0]]) self.X.append(current_state) self.y.append(q_current[0]) if ( not lastRun == 0 ): # it means until we did not build self.num_trees_optimal_policy self.X_final.append(current_state) self.y_final.append(q_current[0]) current_state = temp i += 1 if done: current_state = self.env.reset() if (not flag): scores.append(total_reward) flag = 1 if (lastRun == 0): slidingWindowReward.append(self.test_reward()) i = 0 if (not flag): scores.append(total_reward) if (lastRun == 0): slidingWindowReward.append(total_reward) self.current_tree = DecisionTreeRegressor() 
self.current_tree.fit(self.X, self.y) j += 1 flag = 0 # Testing considering all the variables but training the model on samples generated by last self.num_trees_optimal_policy with epsilon = 0 print("Testing the tree considering all variables") self.current_tree = DecisionTreeRegressor() self.current_tree.fit(self.X_final, self.y_final) # I want to know the average reward after 25 run considering all variables avgRewAllFeatures.append(self.test_reward()) # Visualize data if (self.MAX_LEN > 2 and self.update > 2): dot_data = StringIO() export_graphviz(self.current_tree, out_file=dot_data, filled=True, rounded=True, special_characters=True, feature_names=self.features) graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) Image(graph.create_png()) graph.write_png("final_tree.png") # Now we build four trees (each one without considering one of the four features), we generate a test-set for each tree (so we generate some episodes) # From this episodes we compute the Monte-Carlo error that is (r + gamma * G_{t+1} - q^predicted(x,a))^{2} e we sum all these differences to compute the total error # Higher is this error, more important is the removed feature (so the feature we didn't consider) """for i in range(0, len(self.features)): print("Testing the final tree without considering " + str(self.features[i])) self.current_tree = DecisionTreeRegressor() X_feature = self.dataFilter(i) self.current_tree.fit(X_feature, self.y_final) r, e = self.getMonteCarloError(i) averageRewardFeatures[self.features[i]] = np.append(averageRewardFeatures[self.features[i]], r) confidenceIntervalFeatures[self.features[i]] = np.append(confidenceIntervalFeatures[self.features[i]], e)""" self.current_tree = DecisionTreeRegressor() self.current_tree.fit(self.X_final, self.y_final) r, e = self.getMonteCarloError(len(self.features)) total_error = np.append(total_error, e) k += 1 """"# This is the final average reward after 25 run considering all features
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.3)

# Depth-limited decision tree (max_depth acts as a pruning control).
clf = DecisionTreeClassifier(max_depth=5)
clf = clf.fit(X_train, y_train)

# Score the fitted tree on the held-out samples.
y_pred = clf.predict(X_test)
print("\nAccuracy:", metrics.accuracy_score(y_test, y_pred))
print()

# Export the fitted tree as DOT text into an in-memory buffer ...
print("Beginning Visualization...")
dotData = StringIO()
print("\nExporting Graphviz...")
export_graphviz(
    clf,
    out_file=dotData,
    feature_names=featureCols,
    class_names=['0', '1'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# ... then parse the DOT text and write it out as a PNG file.
print("\nGraphing using pydotplus library...")
graph = pydotplus.graph_from_dot_data(dotData.getvalue())
graph.write_png('BigTheta.png')

# Left as a bare expression: it only has a visible effect when this is the
# last expression of an interactive (notebook) cell.
print("Printing now...")
Image(graph.create_png())
import os

import pydot
from IPython.display import Image
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.externals.six import StringIO


def _single_graph(parsed):
    """Return one graph object from ``pydot.graph_from_dot_data`` output.

    Newer pydot releases return a list of graphs, older ones return the graph
    object itself; accepting both keeps the script working on either.
    """
    if isinstance(parsed, (list, tuple)):
        return parsed[0]
    return parsed


# Fit an unconstrained decision tree on the iris data set.
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

# Write the tree as a DOT file, then remove the file again.
# export_graphviz's return value is deliberately ignored: the original code
# assigned it back to ``f``, clobbering the open file handle's name.
with open("iris.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)
os.unlink('iris.dot')

# Render the plain tree to a PDF via an in-memory DOT buffer.
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = _single_graph(pydot.graph_from_dot_data(dot_data.getvalue()))
graph.write_pdf("iris.pdf")

# Render a decorated version (named features/classes, filled rounded nodes).
dot_data = StringIO()
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True,
                     rounded=True,
                     special_characters=True)
graph = _single_graph(pydot.graph_from_dot_data(dot_data.getvalue()))

# Bare expression: only displays when it is the last expression of an
# interactive (notebook) cell.
Image(graph.create_png())
from sklearn.datasets import load_iris  # was missing: load_iris() is called below
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from matplotlib import pyplot as plt
from pydot import graph_from_dot_data
import pandas as pd
import numpy as np

# Load iris into a DataFrame of features and a categorical target.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Categorical.from_codes(iris.target, iris.target_names)
# Bare expression: only shows output in an interactive session.
X.head()

# One-hot encode the target, so the tree is fitted on one indicator column
# per species.
y = pd.get_dummies(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Export the tree to DOT, render it to a temporary PNG and show it.
dot_data = StringIO()
export_graphviz(dt, out_file=dot_data, feature_names=iris.feature_names)
# graph_from_dot_data is unpacked as a one-element sequence here.
(graph, ) = graph_from_dot_data(dot_data.getvalue())
graph.write_png("tmp.png")
plt.imshow(plt.imread("tmp.png"))
plt.show()

# Collapse the one-hot rows back to class indices before comparing.
y_pred = dt.predict(X_test)
species = np.array(y_test).argmax(axis=1)
predictions = np.array(y_pred).argmax(axis=1)
# Bare expression: the matrix is only displayed in an interactive session.
confusion_matrix(species, predictions)
def plotTree(treeName, tree, featureNames):
    """Render a fitted decision tree to ``<treeName>.png`` and return it.

    Args:
        treeName: Output path without the ``.png`` extension.
        tree: Fitted estimator handed to ``export_graphviz``.
        featureNames: Feature labels used for the split nodes.

    Returns:
        The ``Image`` built from the rendered PNG bytes, so a notebook cell
        ending in ``plotTree(...)`` displays the tree. Earlier versions built
        this object and threw it away; callers that ignore the return value
        are unaffected.
    """
    treePic_dot = StringIO()
    export_graphviz(tree,
                    out_file=treePic_dot,
                    feature_names=featureNames,
                    filled=True,
                    rounded=True)
    graph = pydotplus.graph_from_dot_data(treePic_dot.getvalue())

    # Render once and reuse the bytes for both the file and the Image; the
    # previous code invoked Graphviz twice (create_png + write_png).
    png_bytes = graph.create_png()
    with open(treeName + '.png', 'wb') as png_file:
        png_file.write(png_bytes)
    return Image(png_bytes)
def draw_tree(model, name):
    """Export ``model`` to DOT text and write it out as ``<name>.pdf``."""
    pdf_path = name + ".pdf"
    dot_buffer = StringIO()
    _tree.export_graphviz(model, out_file=dot_buffer)
    dot_source = dot_buffer.getvalue()
    pydotplus.graph_from_dot_data(dot_source).write_pdf(pdf_path)
kyphosis_train = kyphosis[kyphosis.is_train] kyphosis_test = kyphosis[kyphosis.is_train == False] # Train model kyphosis_features = kyphosis.columns[1:] kyphosis_dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1) kyphosis_dt_clf = kyphosis_dt_clf.fit(kyphosis_train[kyphosis_features], kyphosis_train['Kyphosis']) # Print a string representation of the tree. # If you have graphviz (www.graphviz.org) installed, you can write a pdf # visualization using graph.write_pdf(filename) kyphosis_dt_data = StringIO() tree.export_graphviz(kyphosis_dt_clf, out_file=kyphosis_dt_data) kyphosis_dt_graph = pydotplus.parser.parse_dot_data( kyphosis_dt_data.getvalue()) print(kyphosis_dt_graph.to_string()) # Predict classes of test set and evaluate kyphosis_dt_pred = kyphosis_dt_clf.predict(kyphosis_test[kyphosis_features]) kyphosis_dt_cm = metrics.confusion_matrix(kyphosis_test['Kyphosis'], kyphosis_dt_pred, labels=['absent', 'present']) print(kyphosis_dt_cm) kyphosis_dt_acc = metrics.accuracy_score(kyphosis_test['Kyphosis'], kyphosis_dt_pred) kyphosis_dt_prec = metrics.precision_score(kyphosis_test['Kyphosis'],
#!/usr/bin/env python
'''Read and write a string as a file-like object.'''
from sklearn.externals.six import StringIO

# Build an in-memory text buffer and put one line into it.
mysample = StringIO()
mysample.write('My first testing line.')

# Printing the buffer object shows its repr (an object address),
# not the text that was written to it.
print(mysample)

# getvalue() hands back everything written so far as a single string.
content = mysample.getvalue()
print(content)

# Release the buffer once we are done with it.
mysample.close()
from IPython.display import Image import os import sys def conda_fix(graph): path = os.path.join(sys.base_exec_prefix, "Library", "bin", "graphviz") paths = ("dot", "twopi", "neato", "circo", "fdp") paths = {p: os.path.join(path, "{}.exe".format(p)) for p in paths} graph.set_graphviz_executables(paths) from sklearn import tree buffer = StringIO() tree.export_graphviz(dt, out_file=buffer, feature_names=X.columns, class_names=X.columns, filled=True, rounded=True, special_characters=True) graph = pydotplus.graph_from_dot_data(buffer.getvalue()) conda_fix(graph) graph.write_pdf("loan_tree.pdf") Image(graph.create_png()) #ada-booster #boosting #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html from sklearn.ensemble import AdaBoostClassifier ada = AdaBoostClassifier(base_estimator=dt,
def test_graphviz_toy():
    """Check the exact DOT text produced by ``export_graphviz``.

    Fits small trees on the module-level toy data (``X``, ``y``, ``y2``, ``w``)
    and compares the exported DOT source against hand-written expectations for
    the default export and for each group of formatting options.
    """
    # Check correctness of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    out = StringIO()
    export_graphviz(clf, out_file=out)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test with feature_names
    out = StringIO()
    export_graphviz(clf, out_file=out, feature_names=["feature0", "feature1"])
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test with class_names
    out = StringIO()
    export_graphviz(clf, out_file=out, class_names=["yes", "no"])
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = yes"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n' \
                'class = yes"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n' \
                'class = no"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test plot_options
    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, impurity=False,
                    proportion=True, special_characters=True, rounded=True)
    contents1 = out.getvalue()
    # With special_characters=True the exporter writes the HTML entity
    # "&le;" for the comparison sign, so the expectation must spell the
    # entity out (a literal "≤" character never matches, and is also
    # non-ASCII source text).
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                '0 [label=<X<SUB>0</SUB> &le; 0.0<br/>samples = 100.0%<br/>' \
                'value = [0.5, 0.5]>, fillcolor="#e5813900"] ;\n' \
                '1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, ' \
                'fillcolor="#e58139ff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, ' \
                'fillcolor="#399de5ff"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test max_depth
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0, class_names=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = y[0]"] ;\n' \
                '1 [label="(...)"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)"] ;\n' \
                '0 -> 2 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test max_depth with plot_options
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0, filled=True,
                    node_ids=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="node #0\\nX[0] <= 0.0\\ngini = 0.5\\n' \
                'samples = 6\\nvalue = [3, 3]", fillcolor="#e5813900"] ;\n' \
                '1 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 2 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test multi-output with weighted samples
    clf = DecisionTreeClassifier(max_depth=2,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf = clf.fit(X, y2, sample_weight=w)

    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, impurity=False)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="X[0] <= 0.0\\nsamples = 6\\n' \
                'value = [[3.0, 1.5, 0.0]\\n' \
                '[3.0, 1.0, 0.5]]", fillcolor="#e5813900"] ;\n' \
                '1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n' \
                '[3, 0, 0]]", fillcolor="#e58139ff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="X[0] <= 1.5\\nsamples = 3\\n' \
                'value = [[0.0, 1.5, 0.0]\\n' \
                '[0.0, 1.0, 0.5]]", fillcolor="#e5813986"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n' \
                '[0, 1, 0]]", fillcolor="#e58139ff"] ;\n' \
                '2 -> 3 ;\n' \
                '4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n' \
                '[0.0, 0.0, 0.5]]", fillcolor="#e58139ff"] ;\n' \
                '2 -> 4 ;\n' \
                '}'

    assert_equal(contents1, contents2)

    # Test regression output with plot_options
    clf = DecisionTreeRegressor(max_depth=3,
                                min_samples_split=2,
                                criterion="mse",
                                random_state=2)
    clf.fit(X, y)

    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, leaves_parallel=True,
                    rotate=True, rounded=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'graph [ranksep=equally, splines=polyline] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                'rankdir=LR ;\n' \
                '0 [label="X[0] <= 0.0\\nmse = 1.0\\nsamples = 6\\n' \
                'value = 0.0", fillcolor="#e5813980"] ;\n' \
                '1 [label="mse = 0.0\\nsamples = 3\\nvalue = -1.0", ' \
                'fillcolor="#e5813900"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="mse = 0.0\\nsamples = 3\\nvalue = 1.0", ' \
                'fillcolor="#e58139ff"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="False"] ;\n' \
                '{rank=same ; 0} ;\n' \
                '{rank=same ; 1; 2} ;\n' \
                '}'

    assert_equal(contents1, contents2)
classifier = DecisionTreeClassifier(random_state=0) params = {"max_depth": range(1, 11)} scoring_fnc = make_scorer(performance_metric) grid = GridSearchCV(classifier, param_grid=params, scoring=scoring_fnc, cv=cv_sets) grid = grid.fit(X, y) print(pd.DataFrame(grid.cv_results_)) return grid.best_estimator_ reg = fit_model(X_train, y_train) reg.fit(X_train, y_train) Z = reg.predict(X_test) s = pickle.dumps(reg) print(metrics.confusion_matrix(y_test, Z)) print(metrics.classification_report(y_test, Z)) dot_data = StringIO() export_graphviz(reg, out_file="dot.dot", feature_names=list(data)[1:], class_names=["edible", "poisonous"]) #export_graphviz(reg, out_file=dot_data, feature_names=list(data)[1:]) # graph_ = pydot.graph_from_dot_data(dot_data.getvalue()) # graph_.write_pdf("tree.pdf") feature_importances = reg.feature_importances_ fi = dict(zip(feature_importances, list(data)[1:])) fi_S = sorted(fi.items(), key=operator.itemgetter(0), reverse=True) print(fi_S)
def _decision_tree_regression_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='mse',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Fit a DecisionTreeRegressor on ``table`` and build its model/report dict.

    Args:
        table: Data indexed as ``table[feature_cols]`` / ``table[label_col]``.
        feature_cols: Names of the feature columns.
        label_col: Name of the target column.
        criterion .. presort: Forwarded to the ``DecisionTreeRegressor``
            constructor under the same names.
        sample_weight, check_input, X_idx_sorted: Forwarded to ``fit``.

    Returns:
        ``{'model': model}`` where ``model`` carries the fitted regressor, its
        parameters and feature importances, and a markdown report under
        ``'_repr_brtc_'``.
    """
    # Keyword arguments bind each setting by name instead of relying on the
    # positional order of the estimator's constructor / fit signature.
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols],
                  table[label_col],
                  sample_weight=sample_weight,
                  check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Drawing the tree is best-effort: any failure (missing package, missing
    # Graphviz binary, rendering error) falls back to an explanatory message.
    # ``except Exception`` keeps that fallback while letting
    # KeyboardInterrupt / SystemExit propagate, which a bare ``except`` hid.
    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report: horizontal bar chart of the importances in ascending order,
    # with each bar annotated by its value.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    # Clear the pyplot figure after it has been converted to markdown.
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
# Show the first five predictions followed by the first five true labels.
print(predTree[:5])
print(y_testset[:5])

# Evaluation
from sklearn import metrics
import matplotlib.pyplot as plt

print("Decision Tree Accuracy: ",
      metrics.accuracy_score(y_testset, predTree))

# Tree visualization
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree

dot_data = StringIO()
filename = "DrugTree.png"
feature_names = my_data.columns[:5]
targetNames = my_data["Drug"].unique().tolist()

# The exporter writes the DOT text into dot_data; its return value is still
# bound to `out` as before.
out = tree.export_graphviz(
    drugTree,
    out_file=dot_data,
    feature_names=feature_names,
    class_names=np.unique(y_trainset),
    filled=True,
    special_characters=True,
    rotate=False,
)

# Turn the DOT text (the string form of the tree) into a graph object,
# write it to disk as a PNG and read that file back as an image array.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
img = mpimg.imread(filename)
# Column 4 of the raw table holds the class labels.
labelSet = data.iloc[:, 4]
dataConvertList = {}
labelConvertList = {}

# Convert string attribute values to integers.
# The loop variables are named `value` / `label` rather than `data`: the
# earlier spelling rebound the module-level `data` table to a single cell
# value, leaving `data` unusable after this section.
for feat in np.array(dataSet).transpose():
    # Codes restart at 1 for every feature column; a value already seen in an
    # earlier column keeps the code it was given there.
    i = 1
    for value in feat:
        if value not in dataConvertList:
            dataConvertList[value] = i
            i = i + 1

i = 1
for label in labelSet:
    if label not in labelConvertList:
        labelConvertList[label] = i
        i = i + 1

# Apply the codes one original value at a time.
for key in dataConvertList:
    dataSet = dataSet.replace(key, dataConvertList[key])
for key in labelConvertList:
    labelSet = labelSet.replace(key, labelConvertList[key])

# Train a model.
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
s = model.fit(dataSet, labelSet)

# Plot the decision tree with pydotplus.
tree_file = StringIO()
tree.export_graphviz(model, out_file=tree_file)
graph = pydotplus.graph_from_dot_data(tree_file.getvalue())
graph.write_pdf("tree.pdf")
def product_segment(productslist, customerid):
    """Fit a tree on which of the given products a customer bought and plot it.

    For every stock code in ``productslist`` the graph database is queried for
    the product's description, category and price, and for whether
    ``customerid`` bought it. Category and price bucket are one-hot encoded,
    a ``DecisionTreeClassifier`` is fitted on the bought / not-bought flags,
    and the tree is written to ``segment.png``.

    Args:
        productslist: Stock codes to include, one row per code.
        customerid: Customer identifier used in the purchase lookup.

    Relies on module-level state that is not defined in this function:
    ``session`` plus the lists ``actualcategoryspresent``, ``starts``,
    ``ends`` and ``bucketsizes``, which are appended to on every call.
    NOTE(review): those lists are never cleared here, so they grow across
    calls; confirm that is intended.
    """
    descriptions = []
    mrp = []
    categoryofproduct = []
    yes_no = []
    cat = []

    # Price range per category; the range is split into 10 buckets.
    xx='match(c:customerid)-[r:Bought_this]->(s:stockcode) ' \
       'set r.Actual_Price=toFloat(r.Actual_Price)' \
       'return s.Category,collect(DISTINCT r.Actual_Price) as pricelist ' \
       'order by s.Category, pricelist'
    for record in session.run(xx):
        prices = record['pricelist']
        actualcategoryspresent.append(record['s.Category'])
        if len(prices) == 1:
            # A single price would give a zero-width range; pad it by one on
            # each side.
            start = prices[0] - 1
            end = prices[0] + 1
        else:
            start = min(prices)
            end = max(prices)
        bucketsize = (end - start) / 10
        starts.append(start)
        ends.append(end)
        bucketsizes.append(bucketsize)
        cat.append(record['s.Category'])

    # Caller-supplied values are sent as query parameters instead of being
    # spliced into the Cypher text, so quotes in a stock code or customer id
    # cannot alter the query.
    product_query = (
        'match() - [r:Bought_this]->(s:stockcode{StockCode:$stockcode}) '
        'return distinct s.Description as description, s.Category as category, r.Actual_Price as MRP '
        'limit 1')
    for stockcode in productslist:
        for record in session.run(product_query, stockcode=stockcode):
            descriptions.append(record['description'])
            mrp.append(record['MRP'])
            categoryofproduct.append(record['category'])

    # 1 when the customer bought the product, 0 otherwise.
    bought_query = (
        'optional MATCH(c:customerid{CustomerID: $customerid})-[r:Bought_this]->(s:stockcode{StockCode: $stockcode}) '
        'return distinct '
        'case '
        'when r.Quantity IS NULL THEN 0 '
        'when r.Quantity IS NOT NULL THEN 1 '
        'else r.Quantity END AS Quantity ')
    for stockcode in productslist:
        for record in session.run(bought_query,
                                  customerid=customerid,
                                  stockcode=stockcode):
            yes_no.append(record[0])

    # Collect the rows first and build the frame once; appending to a
    # DataFrame inside the loop copies the whole frame on every iteration.
    # NOTE(review): a stock code with no match in the product query leaves the
    # per-product lists shorter than productslist, and the index below then
    # raises IndexError (unchanged behavior).
    rows = []
    for idx in range(len(productslist)):
        vv = cat.index(categoryofproduct[idx])
        z = price_bucket_of_product(starts[vv], ends[vv], bucketsizes[vv],
                                    mrp[idx])
        rows.append({
            'Description': descriptions[idx],
            'MRP': mrp[idx],
            'category': categoryofproduct[idx],
            'PriceBucket': z
        })
    df1 = pd.DataFrame(
        rows, columns=['Description', 'MRP', 'category', 'PriceBucket'])

    # The description is not used as a feature; category and price bucket
    # become one-hot columns.
    df1 = df1.drop('Description', axis=1)
    hot_category = pd.get_dummies(df1.category)
    df1 = df1.join(hot_category)
    df1 = df1.drop('category', axis=1)
    hot_PriceBucket = pd.get_dummies(df1.PriceBucket)
    df1 = df1.join(hot_PriceBucket)
    df1 = df1.drop('PriceBucket', axis=1)

    train_data = df1.values
    train_target = yes_no
    x = tree.DecisionTreeClassifier()
    x.fit(train_data, train_target)

    dot_data = StringIO()
    tree.export_graphviz(x,
                         out_file=dot_data,
                         feature_names=df1.columns.tolist(),
                         class_names=['No', 'Yes'],
                         filled=True,
                         rounded=True,
                         impurity=False)
    # (A former debug loop iterated dot_data here; the buffer is positioned at
    # its end after the export, so that loop never printed anything.)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    # Raw string: same path as before, without relying on "\p" and "\s" being
    # left alone as unrecognised escape sequences.
    graph.write_png(r'D:\project\search\static\stuff/segment.png')
# Report the remaining Gini p values, each followed by a blank line.
print("")
for caption, p_value in (
        ("Data 1's Upper p value with Gini -> ", gini_upper_p_data1),
        ("Data 2's Lower p value with Gini -> ", gini_lower_p_data2),
        ("Data 2's Upper p value with Gini -> ", gini_upper_p_data2),
):
    print(caption, p_value)
    print("")

#Part i
data_1_col_names = [
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "duration",
    "poutcome",
]
data_2_col_names = ["job", "marital", "education", "housing"]

# One in-memory DOT buffer per (data set, criterion) combination.
dot_data1_entropy = StringIO()
dot_data2_entropy = StringIO()
dot_data1_gini = StringIO()
dot_data2_gini = StringIO()

# Export the entropy tree for data set 1 and write it as a PNG.
export_graphviz(
    entropy_data_1,
    out_file=dot_data1_entropy,
    feature_names=data_1_col_names,
    class_names=["0", "1"],
    filled=True,
    rounded=True,
    special_characters=True,
)
data_1_entropy_dot = dot_data1_entropy.getvalue()
data_1_entropy_graph = pydotplus.graph_from_dot_data(data_1_entropy_dot)
data_1_entropy_graph.write_png('data_1_entropy.png')
# Bare expression: only displays in an interactive (notebook) session.
Image(data_1_entropy_graph.create_png())
def wlasne_drzewo():
    """Export the module-level ``clf`` tree and save it as ``my-tree.pdf``."""
    buffer = StringIO()
    tree.export_graphviz(clf, out_file=buffer)
    dot_source = buffer.getvalue()
    pydotplus.graph_from_dot_data(dot_source).write_pdf("my-tree.pdf")
RMSECvAll.append(math.sqrt( sum( (Originaly-PredictedYcv)**2 )/ OriginalX.shape[0])) plt.figure() plt.plot(CandidatesOfDTDepth, RMSECvAll, 'k', linewidth=2) plt.xlabel("Depth of tree for DT") plt.ylabel("RMSE in CV for DT") plt.show() OptimalMaxDepthDT = CandidatesOfDTDepth[np.where( RMSECvAll == np.min(RMSECvAll) )[0][0] ] DTResult = tree.DecisionTreeRegressor(max_depth=OptimalMaxDepthDT, min_samples_leaf=MinSamplesLeafDT) DTResult.fit( OriginalX, Originaly ) CalculatedYAll[:,7] = DTResult.predict(OriginalX) np.random.seed(10000) PredictedYcvAll[:,7] = model_selection.cross_val_predict(DTResult, OriginalX, Originaly, cv=FoldNumber) np.random.seed() # Check rules of DT datapdDT = pd.read_csv("data.csv", encoding='SHIFT-JIS', index_col=0) with contextlib.closing(StringIO()) as DTfile: tree.export_graphviz(DTResult, out_file=DTfile, feature_names=datapdDT.columns[1:], class_names=datapdDT.columns[0]) output = DTfile.getvalue().splitlines() output.insert(1, 'node[fontname="meiryo"];') with open('DTResult.dot', 'w') as f: f.write('\n'.join(output)) # Estimate Y for new samples based on DT in 1. and 2. PredictedY1All[:,7] = DTResult.predict(OriginalX_prediction1) PredictedY2All[:,7] = DTResult.predict(OriginalX_prediction2) # 9. Random Forest (RF) NumberOfTreesRF = 500 # 1. Number of decision trees CandidatesOfXvariablesRateRF = np.arange( 1, 10, dtype=float)/10 #candidates of the ratio of the number of explanatory variables (X) for decision trees # Run RFR for every candidate of X-ratio and estimate values of objective variable (Y) for Out Of Bag (OOB) samples
def plot_tree(clf, file_name, **kwargs):
    """Render a fitted tree estimator to a PDF file.

    Args:
        clf: Fitted estimator handed to ``tree.export_graphviz``.
        file_name: Path of the PDF to write.
        **kwargs: Extra options forwarded unchanged to
            ``tree.export_graphviz`` (for example ``feature_names``).
    """
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, **kwargs)
    parsed = pydot.graph_from_dot_data(dot_data.getvalue())
    # Newer pydot releases return a list of graphs while older ones return
    # the graph itself; calling write_pdf on the list raises AttributeError.
    graph = parsed[0] if isinstance(parsed, (list, tuple)) else parsed
    graph.write_pdf(file_name)
def _said_yes(answer):
    """Return True when *answer* contains the letter 'y' in any case."""
    return 'y' in answer.lower()


def main(data):
    """Run the interactive Iris decision-tree walk-through on the console.

    The function optionally prints an overview of *data*, label-encodes the
    ``Species`` column, splits the samples, trains a depth-3 decision tree
    with the criterion chosen by the user, reports accuracy and a confusion
    matrix, writes the tree to ``dtree.png`` and finally lets the user
    classify hand-entered measurements.

    Args:
        data: pandas DataFrame whose first four columns are the features and
            which has a ``Species`` column. The ``Species`` column is
            replaced in place by its integer encoding.

    Returns:
        ``()`` when an unknown training criterion is entered, otherwise
        ``None``.
    """
    # io.StringIO replaces sklearn.externals.six.StringIO, which is no longer
    # shipped with scikit-learn.
    from io import StringIO

    import pydotplus
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import export_graphviz

    feature_positions = [0, 1, 2, 3]

    print(" ")
    print("Before going into our task do you want to check out the data ? ")
    print(" ")
    print("y- Yes")
    print("n-No")
    if _said_yes(input("Select your choice : ")):
        print(" ")
        print(" ")
        print("Ok ! Let's explore the data")
        print('\nOur dataset looks like : \n', data.head())
        print(" ")
        print('\nThe shape of the data is: ', data.shape)
        print(" ")
        print("\nWhat about the datatypes: \n", data.dtypes)
        print(" ")
        print("\nThe whole data can be described as : \n", data.describe())
        print(" ")
        print(" ")

    print(" ")
    print(" ")
    print("Ok, now let's train the model and make predictions using it")
    print(" ")
    print(" ")
    print("\nDivide the data into attributes(inputs) and labels(outputs)")
    x = data.iloc[:, feature_positions].values
    le = LabelEncoder()
    data['Species'] = le.fit_transform(data['Species'])
    y = data['Species'].values
    print("\nAttributes:\n", x)
    print("\nLables :", y)
    print("Next step is to split this data into training and test sets.")
    print(" ")
    print(" ")
    print("\nTrain-Test-Split : ")
    print("The test size in default is 20. Would you like to change it ?")
    print("y- Yes")
    print("n- No")
    # This answer is now matched case-insensitively like the other prompts.
    if _said_yes(input("\nYour choice : ")):
        # The user enters a percentage; train_test_split wants a fraction.
        tsize = int(input("Specify the test size you want : "))
        tsize /= 100
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=tsize, random_state=0)
        print("Splitted with test size ", tsize)
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=0)
        print("Splitted with default test size")
    print(" ")
    print(" ")
    print("\nLets explore the splitted data : ")
    print("X_Train data : ", x_train.shape)
    print("X_Test data : ", x_test.shape)
    print("Y_Train data : ", y_train.shape)
    print("Y_Test data : ", y_test.shape)
    print(y_test)
    print(" ")
    print(" ")
    print("Training the Algorithm")
    print("We are going to train the model")
    print("Which training method do you want ?")
    print("g - Gini")
    print("e - Entropy")
    method = input("\nYour choice : ").lower()
    # 'g' is tested first, exactly as before, so "gini" never falls through
    # to the entropy branch.
    if 'g' in method:
        criterion = "gini"
    elif 'e' in method:
        criterion = 'entropy'
    else:
        print("Wrong Choice")
        return ()
    dtmodel = DecisionTreeClassifier(criterion=criterion,
                                     random_state=0,
                                     max_depth=3,
                                     min_samples_leaf=5)
    dtmodel.fit(x_train, y_train)

    print(" ")
    print(" ")
    print("Testing the Algorithm")
    y_pred = dtmodel.predict(x_test)
    print("Predicted values:")
    print(y_pred)
    print("Completed")
    print("Accuracy:", accuracy_score(y_test, y_pred) * 100)
    print("Report:", classification_report(y_test, y_pred))
    print("Confusion Matrix : ")
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(9, 9))
    sns.heatmap(cm,
                annot=True,
                fmt=".3f",
                linewidths=.5,
                square=True,
                cmap='Blues')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = 'Accuracy Score: {0}'.format(
        dtmodel.score(x_test, y_test))
    plt.title(all_sample_title, size=15)
    plt.show()

    print(" ")
    print(" ")
    print("Now, let's visualize the Decision Tree to understand it better.")
    # Label the nodes with exactly the columns the model was fitted on, so
    # the names always match the number of features even when the frame
    # carries extra columns.
    feature_names = list(data.columns[feature_positions])
    dot_data = StringIO()
    export_graphviz(dtmodel,
                    out_file=dot_data,
                    feature_names=feature_names,
                    filled=True,
                    rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png("dtree.png")
    im = Image.open(r"dtree.png")
    im.show()
    print(" ")
    print(" ")

    measurement_prompts = (
        "Give Sepal Length in cm : ",
        "Give Sepal width in cm : ",
        "Give Petal Length in cm : ",
        "Give Petal width in cm : ",
    )
    while True:
        print("\nWould you like to give try another input ?")
        print("y - Yes")
        print("n - No")
        if not _said_yes(input("Your answer : ")):
            break
        spe = [float(input(prompt)) for prompt in measurement_prompts]
        y_pred = dtmodel.predict([spe])
        print(" ")
        print(" ")
        print("Species according to encoding : ")
        print("0 - Iris-setosa")
        print("1 - Iris-versicolor")
        print("2 - Iris-virginica")
        print(" ")
        print(" ")
        print("The predicted species is ", y_pred)
        print("Were you expecting the same ?")
        print("OK")
        print("Do you want to try again ?")
        print("y - Yes")
        print("n - No")
        if not _said_yes(input("Your choice : ")):
            break

    print(" ")
    print(" ")
    print("Yippee... We learned to use Decision Tree")
    print("I really had fun")
    print("Hope you enjoyed it too")
    print("Bye")
# Load the Iris data set and combine features and target in one frame.
iris = load_iris()
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                  columns=iris['feature_names'] + ['target'])
print(df.head())  # first rows of the combined frame
print(iris.feature_names)
print(iris.target_names)
print(df.describe())  # compare the minimum and maximum of each column

x = iris['data']
y = iris['target']
iris_df = pd.DataFrame(x, columns=iris['feature_names'])
# head must be called; printing the bare attribute shows the bound-method
# repr instead of the first rows.
print(iris_df.head())

# Shuffle the samples, then hold out a third of them for testing.
x, y = shuffle(x, y, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

# Fit a depth-limited entropy tree and report its accuracy on the test set.
classifier = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = classifier.fit(x_train, y_train)  # fit returns the estimator itself
y_pred = classifier.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Export the tree as DOT text and render the first parsed graph to PDF.
dot_data = StringIO()
tree.export_graphviz(classifier,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True,
                     rounded=True,
                     impurity=False,
                     proportion=True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("iris3.pdf")
def train():
    """Train the questionnaire decision tree and publish its decision paths.

    Loads the questionnaire data through ``DecisionTreeQuestionnaire``,
    one-hot encodes the features, fits an entropy-based decision tree,
    prints its accuracy on a 10 % hold-out, writes the tree to
    ``decisiontree.png`` and fills the class-level ``feature_names``,
    ``paths`` and ``isModelTrained`` attributes.

    Returns:
        None.
    """
    balance_data_excel = DecisionTreeQuestionnaire.get_csv_file_data()
    # Clean the data: empty or whitespace-only cells become the string 'nan',
    # then every cell is converted to str.
    balance_data_excel = balance_data_excel.replace(
        r'^\s*$', str(np.nan), regex=True).replace('', str(np.nan))
    balance_data_excel = balance_data_excel.applymap(str)
    print("Dataset Length:: ", len(balance_data_excel))
    print("Dataset Shape:: ", balance_data_excel.shape)

    X = balance_data_excel.iloc[:, :-1]
    # NOTE(review): the target is read from fixed position 22 while X takes
    # every column but the last - confirm the sheet has exactly 23 columns.
    y = balance_data_excel.iloc[:, 22]
    # Index.get_values() was removed in pandas 1.0; tolist() yields the same
    # list of column labels.
    X = DecisionTreeQuestionnaire.encode_onehot(X, X.columns.tolist())
    le_y = LabelEncoder()
    y = le_y.fit_transform(y)

    # Drop the indicator columns whose name has 'nan' right after the first
    # '='. Slicing [1:2] keeps a name without '=' from raising IndexError.
    nan_columns = [c for c in X.columns if c.split('=')[1:2] == ['nan']]
    X.drop(nan_columns, axis=1, inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=100)
    clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                         random_state=100,
                                         max_depth=100,
                                         min_samples_leaf=5,
                                         min_samples_split=8)
    clf_entropy.fit(X_train, y_train)
    y_pred = clf_entropy.predict(X_test)
    print("Accuracy is ", accuracy_score(y_test, y_pred) * 100)

    # Convert the target classes to {encoded index: original label} pairs.
    print("le2")
    class_names = dict(enumerate(le_y.classes_))
    print(class_names)

    # NOTE(review): only the first 56 column names are published on the
    # class, as before - confirm whether the cap is intentional.
    DecisionTreeQuestionnaire.feature_names.extend(X.columns[:56])

    # export_graphviz expects list-like names, one per feature / class, so
    # pass lists rather than index-keyed dicts.
    dot_data = StringIO()
    export_graphviz(clf_entropy,
                    out_file=dot_data,
                    filled=True,
                    rounded=True,
                    special_characters=True,
                    feature_names=list(X.columns),
                    class_names=list(le_y.classes_))
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # pydot >= 1.2 returns a list of graphs, older releases a single graph.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    graph.write_png('decisiontree.png')

    DecisionTreeQuestionnaire.tree_to_code(clf_entropy)
    DecisionTreeQuestionnaire.tree_to_code2(clf_entropy, class_names)
    DecisionTreeQuestionnaire.isModelTrained = True
    for path in DecisionTreeQuestionnaire.paths:
        for k, v in path.items():
            print(k + ' : ' + v)
        print("-----------------------------------")
    print('Your inputs are not defined')
def printTree(clf):
    """Write the fitted tree ``clf`` to ``tree.png`` via Graphviz.

    The tree is exported as DOT text with filled, rounded nodes and special
    characters enabled, parsed with pydotplus and saved as a PNG.

    Returns:
        Whatever ``write_png`` of the pydotplus graph returns.
    """
    export_options = dict(filled=True, rounded=True, special_characters=True)
    buffer = StringIO()
    tree.export_graphviz(clf, out_file=buffer, **export_options)
    rendered = pydotplus.graph_from_dot_data(buffer.getvalue())
    outcome = rendered.write_png('tree.png')
    return outcome
def decision_tree_training(self):
    """Train, visualise, evaluate and persist a posture decision tree.

    Reads ``self.feature`` and ``self.label``, sets ``self.target_names``
    and ``self.feature_names``, and holds out 25 % of the samples for
    evaluation. The split arrays, the true and predicted test labels and
    the fitted model are saved under the fixed
    ``/home/hts/posture_classification_based_pose/decision_tree/``
    directory; the rendered tree goes to ``decision_tree.pdf``.

    Returns:
        None.
    """
    self.target_names = ['lying', 'lie on the side', 'sitting', 'standing']
    # Feature labels are simply the numbers '1' .. '33'.
    self.feature_names = [str(number) for number in range(1, 34)]

    print('---start training decision tree---')
    # Hold out a quarter of the samples for evaluation.
    X_train, X_test, Y_train, Y_test = train_test_split(self.feature,
                                                        self.label,
                                                        test_size=0.25,
                                                        random_state=0)
    split_dumps = (
        ('/home/hts/posture_classification_based_pose/decision_tree/X_train.txt',
         X_train, '%f'),
        ('/home/hts/posture_classification_based_pose/decision_tree/X_test.txt',
         X_test, '%f'),
        ('/home/hts/posture_classification_based_pose/decision_tree/Y_train.txt',
         Y_train, '%d'),
        ('/home/hts/posture_classification_based_pose/decision_tree/Y_test.txt',
         Y_test, '%d'),
    )
    for target_path, values, number_format in split_dumps:
        np.savetxt(target_path, values, fmt=number_format)
    print('---split data done!---')
    print()

    # Uses the CART algorithm by default.
    clf = DecisionTreeClassifier(criterion='gini', random_state=0)
    flat_train_labels = Y_train.ravel()
    print(np.shape(X_train), np.shape(flat_train_labels))
    clf.fit(X_train, flat_train_labels)

    # Visualization: DOT text -> pydotplus graph -> PDF.
    dot_buffer = StringIO()
    tree.export_graphviz(clf,
                         out_file=dot_buffer,
                         feature_names=self.feature_names,
                         class_names=self.target_names,
                         filled=True,
                         rounded=True,
                         impurity=False)
    pydotplus.graph_from_dot_data(
        dot_buffer.getvalue()).write_pdf("decision_tree.pdf")

    # Validate on the test set.
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    Y_true = Y_test.ravel()
    Y_pred = clf.predict(X_test)
    np.savetxt(
        '/home/hts/posture_classification_based_pose/decision_tree/Y_true.txt',
        Y_true,
        fmt='%d')
    np.savetxt(
        '/home/hts/posture_classification_based_pose/decision_tree/Y_pred.txt',
        Y_pred,
        fmt='%d')
    report = classification_report(Y_true,
                                   Y_pred,
                                   target_names=self.target_names)
    print(report)
    print()

    print('Decision Tree model saving ......')
    model_save_path = '/home/hts/posture_classification_based_pose/decision_tree/train_decision_tree_model.m'
    joblib.dump(clf, model_save_path)