def test_fit():
    bunch = load_breast_cancer()

    id3Estimator = Id3Estimator()
    id3Estimator.fit(bunch.data, bunch.target)
    assert_equal(id3Estimator.tree_.root.value, 22)

    id3Estimator = Id3Estimator(max_depth=2)
    id3Estimator.fit(bunch.data, bunch.target)
    assert_equal(id3Estimator.tree_.root.value, 22)

    id3Estimator = Id3Estimator(min_samples_split=20)
    id3Estimator.fit(bunch.data, bunch.target)
    assert_equal(id3Estimator.tree_.root.value, 22)
def draw_graph(self, x, y):
    # Decision tree graph
    clf = Id3Estimator()
    clf.fit(x, y, check_input=True)
    # clf.predict_proba(x)
    print(export_text(clf.tree_, self.feature_names))

    # Export tree.dot and render the decision tree as a PDF graph
    dot_data = StringIO()
    # tree.export_graphviz(clf, out_file=dot_data)
    export_graphviz(clf.tree_, 'SVC_Tree.dot', self.feature_names)
    graph = pydot.graph_from_dot_file('SVC_Tree.dot')
    graph[0].write_pdf("SVC_Tree.pdf")

    clf = DecisionTreeClassifier()
    clf = clf.fit(x, y)
    clf.predict(x, check_input=True)
    clf.predict_proba(x)

    # Version v1 PDF output
    dot_data = tree.export_graphviz(clf, out_file='SVC_Tree_v1.dot')
    graph = pydot.graph_from_dot_file('SVC_Tree_v1.dot')
    graph[0].write_pdf("SVC_Tree_v1.pdf")

    # Version v2 PDF output
    dot_data = tree.export_graphviz(clf, out_file="SVC_Tree_v2",
                                    feature_names=self.feature_names,
                                    class_names=self.target,
                                    filled=True, rounded=True,
                                    special_characters=True)
    # dot_data = tree.export_graphviz(clf, out_file="Decision-Tree-Regression-v2", feature_names=feature_names, class_names=target.name, filled=True, rounded=True, special_characters=True)
    graph = graphviz.Source(dot_data)
    # Rendering the graph object directly only works as long as out_file=None
    graph

    # Save graph version 2 as a PDF file
    graph = pydot.graph_from_dot_file('SVC_Tree_v2')
    graph[0].write_pdf("SVC_Tree_v2.pdf")
    return True
def Tree():
    names = ["tarcza", "czy lata", "wiek", "zbroja", "hp", "level", "potwor"]
    count = len(open('przypadki.txt', 'rU').readlines())
    x = []
    for i in range(1, count):
        line = linecache.getline('przypadki.txt', i).split(" ")
        line[6] = str(line[6][0])
        x.append(line)
    X = np.asarray(x)
    print(X)
    y = np.array([int(i) for i in linecache.getline('wyniki.txt', 1)[:-2]])
    yd = [int(i) for i in linecache.getline('wyniki.txt', 1)[:-2]]

    d = []
    d.append(names)
    d[0].append("wynik")
    for i in range(0, len(yd)):
        d.append(x[i] + [yd[i]])
    print(d)

    clf = Id3Estimator()
    clf.fit(X, y, check_input=True)
    # d = np.array([['0', '0', '39', '1', '9', '0', '1', 't']])
    # print(d)
    # c = clf.predict(d)
    # print(c)
    export_graphviz(clf.tree_, "out.dot", names)
    print(export_text(clf.tree_, names))
    return clf
def test_nominal():
    id3Estimator = Id3Estimator()
    id3Estimator.fit(X, y_nom)
    assert_equal(id3Estimator.tree_.root.value, 3)
    predict = id3Estimator.predict(X_nom_test)
    assert_equal(predict, y_nom_test)
def Task2():
    dia, dia_meta = parser("diabetes.arff")
    stratifiedCrossValidation2(dia, dia_meta, 10)
    means = []
    stds = []
    accs = []
    means2 = []
    stds2 = []
    test_paths = []   # prepare test paths
    train_paths = []  # prepare train paths
    for i in range(10):
        test_paths.append("test" + str(i) + ".arff")
        train_paths.append("train" + str(i) + ".arff")

    id3 = Id3Estimator()
    print("ID3: ")
    mean_1, std_1 = CV_learn(id3, train_paths, test_paths, 10, dia_meta)

    print("\nRFC: ")
    rfc = RandomForestClassifier(max_depth=50, random_state=0)
    mean_2, std_2 = CV_learn(rfc, train_paths, test_paths, 10, dia_meta)
def CreaMatrice():
    # Ask the user for a text file and read it
    filename = askopenfilename(title="Ouvrir votre document",
                               filetypes=[('txt files', '.txt'), ('all files', '.*')])
    fichier = open(filename, "r")
    content = fichier.read()

    # Re-open the file and read the header line containing the attribute names
    fichier = open(filename, "r")
    first_ligne = fichier.readline()
    L = first_ligne.split()
    nbAttributs = len(L)

    # Read the remaining lines into a matrix
    fichierX = []
    ligne = fichier.readline()
    compte = 0
    while ligne:
        fichierX.append(ligne.split())
        compte = compte + 1
        ligne = fichier.readline()

    # The last column is the target attribute
    attributCible = []
    for i in range(len(fichierX)):
        attributCible.append(fichierX[i][-1])
        fichierX[i].pop()

    feature_names = L
    X = np.array(fichierX)
    y = np.array(attributCible)

    clf = Id3Estimator()
    clf.fit(X, y, check_input=False)
    print(export_text(clf.tree_, feature_names))

    save = open("matrice.txt", "w")
    save.write(export_text(clf.tree_, feature_names))
    save.close()
    fichier.close()
def generate_tree(self, max_depth):
    self.__print(
        "\n-------------------------------------- Modelling ------------------------------------------\n"
    )
    self.__create_output_dir()
    self.__print("\n-----DECISION TREE GENERATION-----\n")
    self.__print("Output file names: ./output/" + self.run_id + "/tree.dot ./output/" +
                  self.run_id + "/tree.png")

    # the estimator
    self.estimator = Id3Estimator(max_depth)

    # survived
    x = self.dataframe.iloc[:, 0]
    # all attributes except survived
    y = self.dataframe.iloc[:, 1:]
    # all variable names except survived
    feature_names = list(y.columns.values)

    # build the tree
    self.estimator = self.estimator.fit(y, x)

    # export as .dot
    dot_data = export_graphviz(self.estimator.tree_,
                               './output/' + self.run_id + '/tree.dot',
                               feature_names)

    # create the png file
    # command = ["dot", "-Tpng", './output/' + self.run_id + '/tree.dot', "-o", "./output/" + self.run_id + "/tree.png"]
    # subprocess.check_call(command, shell=True)
    command = ("dot -Tpng " + './output/' + self.run_id + '/tree.dot' +
               " -o " + "./output/" + self.run_id + "/tree.png")
    # -Tpng can be changed to -Tsvg, -Tjpg, -Tgif, etc. (see the dot man pages)
    os.system(command)
def cpuUsageDecisionTree(self):
    (X, Y) = self.get_data_from_csv()
    feature_names = [
        "vm_id_map", "timestamp_new", "cpu_usage_percent",
        "admin_historic_decision_cpu"
    ]
    clf = Id3Estimator()
    clf.fit(X, Y, check_input=True)
    export_graphviz(clf.tree_, "out.dot", feature_names)
def id3():
    headers = pd.read_csv('Task4_Data.csv', nrows=1).columns.values
    headers = headers[3:6]
    X = pd.read_csv('Task4_Data.csv').values
    y = X[:, 6]
    X = X[:, 3:6]
    clf = Id3Estimator()
    clf.fit(X, y, check_input=True)
    export_graphviz(clf.tree_, 'tree.dot', headers)
def main(): feature_names = ["Opponent", "Home/Away", "AP Top 25", "Media"] X = np.array([['Texas', 'Home', 'Out', '1-NBC'], ['Virginia', 'Away', 'Out', '4-ABC'], ['GeorgiaTech', 'Home', 'In', '1-NBC'], ['UMass', 'Home', 'Out', '1-NBC'], ['Clemson', 'Away', 'In', '4-ABC'], ['Navy', 'Home', 'Out', '1-NBC'], ['USC', 'Home', 'In', '1-NBC'], ['Temple', 'Away', 'Out', '4-ABC'], ['PITT', 'Away', 'Out', '4-ABC'], ['WakeForest', 'Home', 'Out', '1-NBC'], ['BostonCollege', 'Away', 'Out', '1-NBC'], ['Stanford', 'Away', 'In', '3-FOX'], ['Texas', 'Away', 'Out', '4-ABC'], ['Nevada', 'Home', 'Out', '1-NBC'], ['MichiganState', 'Home', 'Out', '1-NBC'], ['Duke', 'Home', 'Out', '1-NBC'], ['Syracuse', 'Home', 'Out', '2-ESPN'], ['NorthCarolinaState', 'Away', 'Out', '4-ABC'], ['Stanford', 'Home', 'In', '1-NBC'], ['MiamiFlorida', 'Home', 'Out', '1-NBC'], ['Navy', 'Home', 'Out', '5-CBS'], ['Army', 'Home', 'Out', '1-NBC'], ['VirginiaTech', 'Home', 'In', '1-NBC'], ['USC', 'Away', 'In', '4-ABC']]) y = np.array([ "Win", "Win", "Win", "Win", "Lose", "Win", "Win", "Win", "Win", "Win", "Win", "Lose", "Lose", "Win", "Lose", "Lose", "Win", "Lose", "Lose", "Win", "Lose", "Win", "Lose", "Lose" ]) clf = Id3Estimator() clf.fit(X, y, check_input=True) print("Training:") print(export_text(clf.tree_, feature_names)) testing = [ ["Temple", "Home", "Out", "1-NBC"], # ["Georgia", "Home", "In", "1-NBC"], ["BostonCollege", "Away", "Out", "2-ESPN"], ["MichiganState", "Away", "Out", "3-FOX"], # ["MiamiOhio", "Home", "Out", "1-NBC"], # ["NorthCarolina", "Away", "Out", "4-ABC"], ["USC", "Home", "In", "1-NBC"], ["NorthCarolinaState", "Home", "Out", "1-NBC"], ["WakeForest", "Home", "Out", "1-NBC"], ["MiamiFlorida", "Away", "In", "4-ABC"], ["Navy", "Home", "Out", "1-NBC"], ["Stanford", "Away", "In", "4-ABC"] ] print("\n\nTesting:") print(clf.predict(testing))
def test_predict():
    estimator = Id3Estimator()
    bunch = load_breast_cancer()
    estimator.fit(bunch.data, bunch.target)
    sample = np.array([
        20.57, 17.77, 132.9, 1326, 0.08474, 0.07864, 0.0869, 0.07017, 0.1812,
        0.05667, 0.5435, 0.7339, 3.398, 74.08, 0.005225, 0.01308, 0.0186,
        0.0134, 0.01389, 0.003532, 24.99, 23.41, 158.8, 1956, 0.1238, 0.1866,
        0.2416, 0.186, 0.275, 0.08902
    ]).reshape(1, -1)
    assert_almost_equal(estimator.predict(bunch.data), bunch.target)
    assert_almost_equal(estimator.predict(sample), 0)
def test_predict_proba():
    estimator = Id3Estimator()
    bunch = load_breast_cancer()
    estimator.fit(bunch.data, bunch.target)

    # Test the shape of the probability data structure using the breast cancer data
    probs = estimator.predict_proba(bunch.data)
    assert_equal(probs.shape[0], bunch.data.shape[0])
    assert_equal(probs.shape[1], estimator.tree_.y_encoder.classes_.shape[0])

    # Test probability values using sample data (as in the test_predict method)
    probs = estimator.predict_proba(bc_sample)
    assert probs[0, 0] >= 0.5
    assert probs[0, 1] < 0.5
    assert_equal(np.sum(probs[0]), 1.0)
def test_numerical_split():
    bunch = load_breast_cancer()
    id3Estimator = Id3Estimator()
    id3Estimator.fit(bunch.data, bunch.target)
    splitter = id3Estimator.builder_.splitter
    record = splitter.calc(np.array(list(range(bunch.target.shape[0]))),
                           np.array(list(range(bunch.data.shape[1]))))
    less = np.sum(bunch.data[:, record.feature_idx] <= record.pivot)
    more = bunch.data[:, record.feature_idx].shape[0] - less
    split = splitter.split(np.array(list(range(bunch.target.shape[0]))), record)
    assert_almost_equal(len(split[0].bag), less)
    assert_almost_equal(len(split[1].bag), more)
def BuildTree(): feature_names = ["danie", "na ciepło", "z mięsem", "na słodko", "kwaśne", "alkoholowe", "czekoladowe", "wybor"] dataset = ps.read_csv("recommend.csv", header=None, names=feature_names, sep=";") X = dataset.drop('wybor', axis=1) Y = dataset['wybor'] clf = Id3Estimator() X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20) clf.fit(X_train, Y_train) export_graphviz(clf.tree_, "lol.dot", feature_names) return clf
def BuildTree():
    # feature names
    feature_names = [
        "pair", "empty_plate", "talking", "mood", "asked", "hurry", "bill"
    ]
    Yfeature_names = [
        "pair", "empty_plate", "talking", "mood", "asked", "hurry"
    ]
    # load the dataset from bill.csv
    dataset = ps.read_csv("bill.csv", header=None, names=feature_names, sep=";")
    X = dataset.drop('bill', axis=1)
    Y = dataset['bill']
    # build the decision tree
    clf = Id3Estimator()
    # split into training and test data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)
    # fit - a synonym for "find patterns in the data"
    clf.fit(X_train, Y_train)
    export_graphviz(clf.tree_, "test.dot", feature_names)

    model = load_model('third_try.h5')
    while True:
        path = random.choice(
            os.listdir("C://Users/Kinia/Desktop/sztuczna2/SI-master/test"))
        print(path)
        img_pred = image.load_img("test/" + path, target_size=(100, 100))
        img_pred = image.img_to_array(img_pred)
        img_pred = np.expand_dims(img_pred, axis=0)
        rslt = model.predict(img_pred)
        print(rslt)
        if rslt[0][0] == 1:
            prediction = 1
            break
        else:
            prediction = 0
    print(prediction)
    return [prediction, clf]
def measures_of_id3(subsets):
    clf = Id3Estimator()

    start_time = perf_counter()
    clf.fit(subsets[SplitPartNames['X_train']], subsets[SplitPartNames['y_train']])
    end_time = perf_counter()
    learning_time = end_time - start_time

    start_time = perf_counter()
    prediction = clf.predict(subsets[SplitPartNames['X_test']])
    end_time = perf_counter()
    prediction_time = end_time - start_time

    accuracy = metrics.accuracy_score(subsets[SplitPartNames['y_test']], prediction)
    return round(learning_time, 4), round(prediction_time, 4), round(accuracy, 2)
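# Hedged usage sketch for measures_of_id3 above. The function expects a dict of
# data splits indexed through the external SplitPartNames mapping (defined
# elsewhere in the project); the dataset and split below are illustrative only.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

bunch = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(bunch.data, bunch.target, test_size=0.25)
subsets = {
    SplitPartNames['X_train']: X_train,
    SplitPartNames['X_test']: X_test,
    SplitPartNames['y_train']: y_train,
    SplitPartNames['y_test']: y_test,
}
learning_time, prediction_time, accuracy = measures_of_id3(subsets)
print(learning_time, prediction_time, accuracy)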
def main(): feature_names = ["home/away", "top25", "media"] X = np.array([['home', 'out', '1-nbc'], ['home', 'in', '1-nbc'], ['away', 'out', '2-espn'], ['away', 'out', '3-fox'], ['home', 'out', '1-nbc'], ['away', 'out', '4-abc']]) y = np.array(["win", "lose", "win", "win", "win", "win"]) clf = Id3Estimator() clf.fit(X, y, check_input=True) print(export_text(clf.tree_, feature_names)) testing = [["home", "in", "1-nbc"], ["home", "out", "1-nbc"], ["home", "out", "1-nbc"], ["home", "in", "4-abc"], ["home", "out", "1-nbc"], ["home", "in", "4-abc"]] print("\n\nTesting:") print(clf.predict(testing))
def main(): feature_names = ["outlook", "temperature", "humidity", "windy"] X = np.array([['sunny', 'hot', 'high', 'false'], ['sunny', 'hot', 'high', 'true'], ['overcast', 'hot', 'high', 'false'], ['rainy', 'mild', 'high', 'false'], ['rainy', 'cool', 'normal', 'false'], ['rainy', 'cool', 'normal', 'true'], ['overcast', 'cool', 'normal', 'true'], ['sunny', 'mild', 'high', 'false'], ['sunny', 'cool', 'normal', 'false'], ['rainy', 'mild', 'normal', 'false'], ['sunny', 'mild', 'normal', 'true'], ['overcast', 'mild', 'high', 'true'], ['overcast', 'hot', 'normal', 'false'], ['rainy', 'mild', 'high', 'true']]) y = np.array(["No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"]) clf = Id3Estimator() clf.fit(X, y, check_input=True) print("Training:") print(export_text(clf.tree_, feature_names)) print("Testing: rainy, hot, high, false") print(clf.predict([["rainy", "hot", "high", "false"]])) #Throws DeprecationWarning, ignore it
def id3(size):
    x_train, x_test, y_train, y_test = setData("Pokemon.csv", 718, size)
    est = Id3Estimator(gain_ratio=True)
    est.fit(x_train, y_train)
    y_test = y_test.to_numpy()
    y_train = y_train.to_numpy()
    y_predict = est.predict(x_test)
    y_predict2 = est.predict(x_train)

    error_1 = 0
    error_2 = 0
    for i in range(len(y_test)):
        if y_predict[i] != y_test[i]:
            error_1 = error_1 + 1
    for i in range(len(y_train)):
        if y_predict2[i] != y_train[i]:
            error_2 = error_2 + 1
    # dot = export_graphviz(est.tree_, 'tree.dot', bunch.feature_names)
    return error_1 / len(y_test) * 100, error_2 / len(y_train) * 100
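# Hedged usage sketch for id3(size) above: the function returns the test and
# training error rates in percent. The value passed for size is illustrative;
# its exact meaning depends on setData, which is defined elsewhere in the project.
test_error, train_error = id3(0.3)
print("test error: %.2f%%, train error: %.2f%%" % (test_error, train_error))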
dados = pd.read_csv('amostras.csv', sep=',', encoding='utf8')
dadosX = dados[[
    'pontuacao_final', 'sobrevivencia', 'bonus_ultima_sobrevivencia',
    'dano_disparo', 'bonus_disparo_morte', 'colisao_dano',
    'bonus_colisao_morte', '1lugar', '2lugar', '3lugar'
]].values
dadosY = dados['classificacao']

treinoX, testeX, treinoY, testeY = train_test_split(dadosX, dadosY,
                                                    test_size=0.3,
                                                    shuffle=False)

modeloArvodeID3 = Id3Estimator(max_depth=3)
modeloArvodeID3.fit(treinoX, treinoY)
export_graphviz(modeloArvodeID3.tree_, 'arvoreExecutada.dot', [
    'pontuacao_final', 'sobrevivencia', 'bonus_ultima_sobrevivencia',
    'dano_disparo', 'bonus_disparo_morte', 'colisao_dano',
    'bonus_colisao_morte', '1lugar', '2lugar', '3lugar'
])

classificacoes = modeloArvodeID3.predict(testeX)
print('Resultados Árvore de Decisão ID3 (Iterative Dichotomiser 3):')
print('Acurácia: %.4f' % accuracy_score(classificacoes, testeY))
print('Precisão: %.4f' % precision_score(classificacoes, testeY, average='macro'))
datatype_file = graph_dir + "-datatype"
x_file = graph_dir + "-x.out"
y_file = graph_dir + "-y.out"
dot_file = graph_dir + ".dot"

with open(feature_file) as f:
    feature_names = f.readlines()
feature_names = [x.strip() for x in feature_names]

X = np.array(genfromtxt(x_file, dtype=None, delimiter="~").tolist())
y = genfromtxt(y_file, dtype='i4')
if len(feature_names) == 1:
    X = X.reshape(-1, 1)

clf = Id3Estimator()
clf.fit(X, y, check_input=True)
end = datetime.now()
delta = end - start

try:
    export_graphviz(clf.tree_, dot_file, feature_names)
except:
    print("Unexpected error:", sys.exc_info()[0])

result = convert_dot_to_predicate(dot_file, graph_dir)
path, filename = os.path.split(graph_dir)
result.insert(0, filename)
result.append(delta.seconds)
pprint(result)
https://pypi.python.org/pypi/decision-tree-id3/0.1.2
"""
# from sklearn import tree
from id3 import Id3Estimator
from id3 import export_graphviz
import numpy as np
import graphviz

#             | 0     | 1        | 2
# Outlook     | Sunny | Overcast | Rain
# Temperature | Hot   | Mild     | Cool
# Humidity    | High  | Normal   | -
# Wind        | Weak  | Strong   | -
x_labels = ["Outlook", "Temperature", "Humidity", "Wind"]
X = np.array([[0, 0, 0, 0],
              [0, 0, 0, 1],
              [1, 0, 0, 0],
              [2, 1, 0, 0],
              [2, 2, 1, 0],
              [2, 2, 1, 1],
              [1, 2, 1, 1],
              [0, 1, 0, 0],
              [0, 2, 1, 0],
              [2, 1, 1, 0],
              [0, 1, 1, 1],
              [1, 1, 0, 1],
              [1, 0, 1, 0],
              [2, 1, 0, 1]])
Y = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0])

# clf = tree.DecisionTreeClassifier()
clf = Id3Estimator(min_samples_split=3)
clf.fit(X, Y)
dot_data = export_graphviz(clf.tree_, "decisiontree.dot", x_labels)

# predictions = clf.predict(X)
# for i in range(len(X)):
#     print X[i], Y[i], "->", predictions[i]
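# Hedged usage sketch: querying the fitted tree for a new observation encoded
# with the attribute table above (Rain, Cool, High humidity, Strong wind).
# The chosen sample is illustrative only.
sample = np.array([[2, 2, 0, 1]])
print(clf.predict(sample))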
    'research background', 'final year projct type', 'enthusiasm',
    'teamwork ability', 'communication & network skill', 'cgpa'
]

# selecting the training data
x = data[fcols]
y = data['current job field']

# initializing the models
lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=5)
gnb = GaussianNB()
mn = MultinomialNB()
ber = BernoulliNB()
tree = DecisionTreeClassifier()
id3 = Id3Estimator()
rnd = RandomForestClassifier(n_estimators=300)
svc = SVC()
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 75), random_state=1)

"""a = tree.fit(x,y)
print(a.feature_importances_)"""

val = list()
val.append(k_fold(lr, x, y))
val.append(k_fold(knn, x, y))
val.append(k_fold(gnb, x, y))
val.append(k_fold(mn, x, y))
val.append(k_fold(ber, x, y))
val.append(k_fold(tree, x, y))
from scalingdata import *
import time

A = np.genfromtxt('data.txt', delimiter=',')

# split the data
Xtrain = A[:60, 0:5]
Ytrain = A[:60, 5]
Xtest = A[60:80, 0:5]
Ytest = A[60:80, 5]

print("---------result id3 build decision tree non scaling data--------")
t = time.time()
clf = Id3Estimator()
clf.fit(Xtrain, Ytrain)
h = clf.predict(Xtest)
ouput(h, Ytest)
ed = time.time()
print("Time execution non scaling data", ed - t)

print("---------result scaling data with standardization------------")
t = time.time()
Xmean = np.mean(Xtrain, axis=0)
st = np.std(Xtrain, axis=0)
X_test_stadar = standardization(Xtest, Xmean, st)
X_train_stadar = standardization(Xtrain, Xmean, st)
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split
from id3 import Id3Estimator
from id3 import export_graphviz
import numpy as np

bunch = fetch_kddcup99(subset="SA")
data = bunch.data
data = np.delete(data, np.s_[1:4], axis=1)
target = bunch.target

X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                    test_size=.2,
                                                    random_state=17)

estimator = Id3Estimator()
print("->Fitting ID3 classifier")
estimator.fit(X_train, y_train)

print("->Writing dot file")
export_graphviz(estimator.tree_, 'tree.dot')

print("->Calculating predictions")
pred = estimator.predict(X_test)

well_detected = 0
for index, val in enumerate(pred):
    if val == y_test[index]:
        well_detected += 1
percentage = well_detected / len(pred) * 100
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from id3 import Id3Estimator
from extractData import X, y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=6)

# Without pruning
no_prone_accuracy = cross_val_score(Id3Estimator(), X_train, y_train,
                                    cv=4, scoring='accuracy').mean()
print("Accuracy without pruning on cross validation is:", no_prone_accuracy)

# With pruning
estimator = Id3Estimator(min_samples_split=20)
prone_accuracy = cross_val_score(estimator, X_train, y_train,
                                 cv=4, scoring='accuracy').mean()
print("Accuracy with pruning on cross validation is:", prone_accuracy)

# estimator.fit(X_train, y_train)
# print("Accuracy with pruning on test is:", accuracy_score(y_test, estimator.predict(X_test)))
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
import numpy as np
from sklearn import svm
from sklearn.datasets import load_svmlight_file as load_svm
from sklearn.model_selection import KFold
from drop_highlycorelated import clf, xtrain, ytrain, xtest, ytest, X_important_train, X_important_test
from sklearn.metrics import accuracy_score
from id3 import Id3Estimator

clf_imp = Id3Estimator()
clf_imp.fit(X_important_train, ytrain)

# k-fold cross validation
from sklearn.model_selection import cross_val_score, KFold

n_folds = []
n_folds.append(('K2', 2))
n_folds.append(('K4', 4))
n_folds.append(('K5', 5))
n_folds.append(('K10', 10))

seed = 7
for name, n_split in n_folds:
    results = []
    names = []
    print(name)
    kfold = KFold(n_splits=n_split, random_state=seed)
    cv_results = cross_val_score(clf_imp,
    X, Y, test_size=test_proportion)
X_trainf5, X_test, Y_trainf5, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
# X_trainf6, X_test, Y_trainf6, Y_test = train_test_split(X, Y, test_size=test_proportion)
X_trainf7, X_test, Y_trainf7, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
X_trainf8, X_test, Y_trainf8, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
X_trainf9, X_test, Y_trainf9, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
X_trainf10, X_test, Y_trainf10, Y_test = train_test_split(
    X, Y, test_size=test_proportion)

estimator1 = Id3Estimator()
estimator2 = Id3Estimator()
estimator3 = Id3Estimator()
estimator4 = Id3Estimator()
estimator5 = Id3Estimator()
# estimator6 = Id3Estimator()
estimator7 = Id3Estimator()
estimator8 = Id3Estimator()
estimator9 = Id3Estimator()
estimator10 = Id3Estimator()

estimator1.fit(X_trainf1, Y_trainf1)
estimator2.fit(X_trainf2, Y_trainf2)
estimator3.fit(X_trainf3, Y_trainf3)
estimator4.fit(X_trainf4, Y_trainf4)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from id3 import Id3Estimator

min_samples = 20

# load the data
data = np.genfromtxt('flare.csv', delimiter=",", dtype=None, names=True)
data_classification = data['classification']
data_features = []
for row in data:
    data_features.append([col for col in row][:-1])

# split the data into 75% train and 25% test
features_train, features_test, classification_train, classification_test = \
    train_test_split(data_features, data_classification, test_size=0.25, random_state=4)

# train without pruning
estimator = Id3Estimator()
estimator.fit(features_train, classification_train)
classification_predict = estimator.predict(features_test)
print(accuracy_score(classification_test, classification_predict))

# train with pruning (minimum samples required to split a node)
estimator_cut = Id3Estimator(min_samples_split=min_samples)
estimator_cut.fit(features_train, classification_train)
classification_predict = estimator_cut.predict(features_test)
print(accuracy_score(classification_test, classification_predict))
"Tree_with": [], "Tree_without": []} for _ in range(100): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) # Question 7 # Without choosing parameters es = KNeighborsClassifier() es.fit(X_train, y_train) results["KNN_without"].append(accuracy_score(y_test, es.predict(X_test))) # With choosing parameters number_of_indexes = sfs.sfs(X_train, y_train, 8, KNeighborsClassifier(), scoreSFS) es.fit(sfs.subset_of_x(X_train, number_of_indexes), y_train) results["KNN_with"].append(accuracy_score(y_test, es.predict(sfs.subset_of_x(X_test, number_of_indexes)))) # Question 8 # Without pruning es = Id3Estimator() es.fit(X_train, y_train) results["Tree_without"].append(accuracy_score(y_test, es.predict(X_test))) # With pruning es = Id3Estimator(min_samples_split=20) es.fit(X_train, y_train) results["Tree_with"].append(accuracy_score(y_test, es.predict(X_test))) print(mean(results["KNN_without"])) print(mean(results["KNN_with"])) print(mean(results["Tree_without"])) print(mean(results["Tree_with"]))