def _fit(x, labels, y, default_val=False, prune=False):
    """Recursively build a decision (sub)tree, picking each split by gain ratio.

    Args:
        x: 2-D array of attribute values, one row per example and one column
            per attribute.
        labels: list of attribute names, one per column of ``x``.
        y: array of target values aligned with the rows of ``x``.
        default_val: label used for the leaf created when no attribute
            columns remain.  ``False`` (the default) means "use ``mode(y)``".
        prune: when true, the finished node is replaced by the value of
            ``node.to_rule_list()`` before it is returned.

    Returns:
        The ``Node`` built for this subset of the data, or the result of
        ``node.to_rule_list()`` when ``prune`` is set.
    """
    x = C45.normalize_missing_attribute(x, y)
    process_numeric(x, y)

    # Identity check on purpose: ``0 == False`` is true in Python, so an
    # equality test would silently discard a legitimate default label of 0.
    if default_val is False:
        default_val = mode(y)

    # All targets share one value: emit a leaf for it.
    if np.all(y == y[0]):
        return Node(str(y[0]), [], True)

    # No attribute left to split on: fall back to the default label.
    if x.shape[1] == 0:
        return Node(str(default_val), [], True)

    # Gain ratio of every attribute column.
    # NOTE(review): if C45.splitinfo can return 0 (e.g. for a single-valued
    # column) this division misbehaves -- confirm against its implementation.
    entropy = ID3.count_entropy(y)
    gain = [ID3.gain(entropy, attr, y) / C45.splitinfo(attr) for attr in x.T]

    # The attribute with the highest ratio becomes this node.
    idx_max = np.argmax(gain)
    attr_values = np.unique(x.T[idx_max])
    node = Node(labels[idx_max], attr_values, False)

    # Children no longer see the attribute that was just consumed.
    next_labels = labels.copy()
    next_labels.pop(idx_max)

    # One child per distinct value; a boolean mask selects the matching rows
    # in a single pass instead of growing arrays row by row.
    for value in attr_values:
        mask = x[:, idx_max] == value
        child_x = np.delete(x[mask], idx_max, axis=1)
        child_y = y[mask]
        # NOTE(review): recursion goes through ID3._fit, so only this level
        # uses the gain-ratio criterion -- confirm that this is intended.
        node.set_child(value, ID3._fit(child_x, next_labels, child_y))

    if prune:
        # The split result is not used by the visible code; the call is kept
        # in case it has side effects (e.g. consuming random state).
        C45.train_test_split(x, y)
        node = node.to_rule_list()
    return node
def performTest(dataService, k):
    """Evaluate ID3 and C45 over ``k`` train/test splits and report the means.

    For every split index an ID3 tree is generated and evaluated, the same
    tree is handed to ``C45.adjustWithC45`` and evaluated again, and both
    values are written through ``Test.save_to_file``.  After the loop the
    mean values (scaled to percent, two decimals) and the size of the last
    training split are saved, and the means are printed.

    Args:
        dataService: object providing ``fetchData()``, ``attrValues`` and
            ``classes``.
        k: number of iterations; also passed to ``Validation``.
    """
    dataService.fetchData()
    validation = Validation(dataService, k)
    id3_errors, c45_errors = [], []
    # One timestamp shared by every record written during this run.
    started_at = datetime.datetime.now().time()

    for fold in range(k):
        print(f'Iteration: {fold}, error rate:')
        train, test = validation.split_to_train_test(fold)

        id3_model = ID3(train, dataService.attrValues, dataService.classes)
        tree = id3_model.generateTree()
        id3_errors.append(id3_model.evaluate(test))

        c45_model = C45(train, dataService.attrValues, dataService.classes)
        c45_model.adjustWithC45(tree)
        c45_errors.append(c45_model.evaluateC45Tree(test))

        Test.save_to_file(k, started_at, id3_errors[-1], c45_errors[-1])

    id3_mean = round(100 * sum(id3_errors) / k, 2)
    c45_mean = round(100 * sum(c45_errors) / k, 2)
    Test.save_to_file(k, started_at, id3_mean, c45_mean)

    # Size of the last training split, absolute and as a share of all data.
    train_size = len(train)
    Test.save_to_file(k, started_at, train_size, (train_size / len(validation.data)) * 100)

    print(f'ID3 mean error: {id3_mean}%')
    print(f'C45 mean error: {c45_mean}%')
def __init__(self, filename=None):
    """Wrap a new ``ID3`` instance and expose its load/save/delete methods.

    Args:
        filename: optional; when not ``None`` it is passed to ``load``
            right after the wrapper is set up.
    """
    backend = ID3()
    self.__id3 = backend
    # Re-export the backend's bound methods directly on this object.
    for method_name in ("load", "save", "delete"):
        setattr(self, method_name, getattr(backend, method_name))
    if filename is None:
        return
    self.load(filename)
def task5(self, printTree=True, printPrecision=True, accuracy_factor=0.7):
    """Perform task 5: build the tree from 'train_continuous.csv' and score it.

    The resulting tree is stored in ``self.arbre_advance``; accuracy is
    measured with ``self.precision`` on 'test_public_continuous.csv'.

    Args:
        printTree: when true, print the tree representation.
        printPrecision: when true, print the measured accuracy.
        accuracy_factor: third argument forwarded to ``ID3.construit_arbre``
            (named after the axis label used by the sweep code that used to
            live here).  Defaults to the previously hard-coded 0.7.  To
            reproduce the old accuracy sweep, call this method once per
            factor, e.g. for each value of ``np.linspace(0.4, 4, 60)``.
    """
    print('Building the tree (Task 5)...')
    donnees = self.importData('train_continuous.csv')
    id3 = ID3()
    # NOTE(review): the literal True presumably switches on continuous-data
    # handling -- confirm against construit_arbre / precision.
    self.arbre_advance = id3.construit_arbre(donnees, True, accuracy_factor)[0]
    if printTree:
        print('Decision tree :')
        print(self.arbre_advance.__repr__(notEg = True))
        print()
    precision = self.precision(self.importData("test_public_continuous.csv"), True)
    if printPrecision:
        print('Testing the tree...')
        print('Accuracy = ' + "{:5.2f}".format(precision) + '%')
        print()
def crossValidation(data, rules):
    """Run a leave-one-out pass of ID3 over ``data``.

    Each case in turn is removed from the front of ``data``, a fresh ``ID3``
    is trained on the remaining cases with a copy of ``rules``, the removed
    case is classified, and it is then appended back, so ``data`` ends in
    its original order.

    Args:
        data: list of cases; each case is indexed with the key ``'Type'``.
        rules: mapping that is shallow-copied for every training run.

    NOTE(review): the hit counter and the confusion table are built but not
    returned by the code visible here.
    """
    ac = 0
    confusion = {}
    for _ in range(len(data)):
        # Fresh shallow copy so training cannot alter the caller's mapping.
        rules_copy = {key: rules[key] for key in rules.keys()}
        held_out = data.pop(0)

        classifier = ID3()
        classifier.train(data, rules_copy)
        predicted = classifier.classify(held_out)
        actual = held_out['Type']

        if predicted == actual:
            ac += 1

        # confusion[actual][predicted] counts each (truth, prediction) pair.
        per_actual = confusion.setdefault(actual, {})
        per_actual[predicted] = per_actual.get(predicted, 0) + 1

        data.append(held_out)
def __init__(self):
    """Run tasks 1, 3 and 5 at construction time and keep their results.

    Sets ``train_discrete``, ``arbre``, ``faits_initiaux``, ``regles`` and
    ``arbre_advance`` on the instance.
    """
    # Load the two *_bin CSV files.
    self.train_discrete = csv_to_array('train_bin.csv')
    discrete_test = csv_to_array('test_public_bin.csv')

    # Task 1: build the tree and report its precision.
    self.arbre = ID3().construit_arbre(self.train_discrete)
    self.print_precision(self.arbre, discrete_test)

    # Task 3: derive rules from the tree, then explain/cure the test facts.
    self.faits_initiaux = discrete_test
    seed_rules = [reglesansvariables.RegleSansVariables("", set())]
    self.regles = rules_generator(self.arbre, seed_rules)
    tk3.explain_and_cure(self.faits_initiaux, self.arbre, self.healthy_rules())

    # Task 5: same pipeline on the *_continuous CSV files with ID3_cont.
    continuous_train = csv_to_array('train_continuous.csv')
    continuous_test = csv_to_array('test_public_continuous.csv')
    self.arbre_advance = ID3_cont().construit_arbre(continuous_train)
    self.print_precision(self.arbre_advance, continuous_test)
def test_id3():
    """Train ID3 on a 13-row 'play' table and check one prediction."""
    target = 'play'
    # Attribute name -> list of values it may take.
    universes = {
        'wind': ['strong', 'weak'],
        'wheather': ['sunny', 'cloudy', 'rainny'],
        'temperature': ['cold', 'norm', 'hot'],
        'humidity': ['norm', 'high'],
    }
    table = pd.DataFrame({
        target: [0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1],
        'wind': ['weak', 'strong', 'weak', 'weak', 'weak', 'strong',
                 'strong', 'weak', 'weak', 'weak', 'strong', 'strong',
                 'weak'],
        'wheather': ['sunny', 'sunny', 'cloudy', 'rainny', 'rainny',
                     'rainny', 'cloudy', 'sunny', 'sunny', 'rainny',
                     'sunny', 'cloudy', 'cloudy'],
        'temperature': ['hot', 'hot', 'hot', 'norm', 'cold', 'cold', 'cold',
                        'norm', 'cold', 'norm', 'norm', 'norm', 'hot'],
        'humidity': ['high', 'high', 'high', 'high', 'norm', 'norm', 'norm',
                     'high', 'norm', 'norm', 'norm', 'high', 'norm'],
    })

    model = ID3(shanon_gain).train(table, {target: [0, 1]}, universes)

    query = {
        'wind': 'strong',
        'wheather': 'rainny',
        'temperature': 'norm',
        'humidity': 'high',
    }
    assert model.predict(query) == 0
def task1(self, printTree=True):
    """Perform task 1: build the tree from 'train_bin.csv'.

    Stores element 0 of the ``construit_arbre`` result in ``self.arbre`` and
    element 1 in ``self.attributs``.

    Args:
        printTree: when true, print the tree followed by the three values
            returned by ``self.arbre.getDepth()``.
    """
    print('Building the tree (Task 1)...')
    training_rows = self.importData('train_bin.csv')
    built = ID3().construit_arbre(training_rows)
    self.attributs = built[1]
    self.arbre = built[0]

    if not printTree:
        return

    print('Decision tree :')
    print(self.arbre)
    depth_stats = self.arbre.getDepth()
    print(f'Average Depth : {depth_stats[0]:5.2f}')
    print(f'Maximum Depth : {depth_stats[1]:5.2f}')
    print(f'Maximum Number of Children : {depth_stats[2]:5.2f}')
    print()
def train(self, n_tree, n_data, n_attr, dataset, goal_attr, attrs):
    '''
    Build ``n_tree`` ID3 trees and store them in ``self.forest``.

    Every tree is trained on its own batch drawn by ``self._train_split``
    and on its own random subset of the attributes.

    params:
        - n_tree: number of trees to build.
        - n_data: percentage of the data handed to each tree.
        - n_attr: number of attributes each individual tree considers [1, n].
        - dataset: dataframe holding all the data.
        - goal_attr: dict with the name (key) and universe (value) of the
          output.
        - attrs: dict with the name (key) and universe (value) of every
          attribute expected in the dataset.
    '''
    self.forest = []
    for _ in range(n_tree):
        # Rows for this tree.
        batch = self._train_split(dataset, n_data)
        learner = ID3(self.gain)
        # Random subset of n_attr (name, universe) pairs for this tree.
        chosen_attrs = dict(sample(list(attrs.items()), k=n_attr))
        tree = learner.train(batch, goal_attr, chosen_attrs)
        self.forest.append(tree)
# -*- coding: utf-8 -*-
import pandas as pd
from id3 import ID3
import numpy as np

# Fixed seed so the random test split below is reproducible.
np.random.seed(1993)

# Read the complete data set.
all_data = pd.read_csv('./nursery_data/all.csv')

# Randomly pick 1000 rows as the test set via a permutation; every remaining
# row becomes training data.
permutation = np.random.permutation(len(all_data))[:1000]
held_out = all_data.iloc[permutation]
result = held_out['classes'].values
test_data = held_out.drop(columns='classes')
train_data = all_data.drop(permutation)

# Fit the tree, render it, then score the predictions on the held-out rows.
id3_solver = ID3(train_data, target='classes')
id3_solver.run()
id3_solver.render_decision_tree('./nursery_data/dtree')
predict = id3_solver.predict(test_data, force=True)
accuracy = id3_solver.score(predict, result)
print(f'The accuracy of the prediction of test data is {accuracy}')
from c45_numeric_handler import process_numeric
from Rule import Rule

if __name__ == "__main__":
    # Demo: fit ID3 and C45 on the iris CSV, then print one C45 prediction.
    data = read_csv('Bagian B/datasets/iris.csv')

    # Row 0 holds the column names; the last column is the target.
    label = data[0, :-1].tolist()
    x = data[1:, :-1]
    target = data[1:, -1:].flatten()

    print("=====ID 3=====")
    id3 = ID3()
    id3.label = label
    id3.fit(x, target)

    print("=====C45=====")
    c45 = C45()
    c45.label = label
    c45.fit(x, target)
    # Predict the first example only (slice keeps the array 2-D).
    print(c45.predict(x[:1, :]))
from id3 import ID3

# The four paths are passed to ID3 in this exact order.
# NOTE(review): the role of each path is defined by ID3's constructor, which
# is not visible here.
DATA_PATHS = (
    "../data/car.data",
    "../data/car.names",
    "../data/test.data",
    "../data/test2.data",
)

c1 = ID3(*DATA_PATHS)
c1.fetchData()
c1.generateTree()
c1.printTree()
from data_load import clear_data, load, binarize_data, train_test_split
from id3 import ID3
# NOTE(review): the stdlib `stat` module has no tree_prune_stat, so this
# presumably resolves to a project-local stat.py shadowing it -- confirm.
from stat import tree_prune_stat

m_data = clear_data(load("./mushroom.txt"))

# Swap the first and last field of every record in place.
for record in m_data:
    record[0], record[-1] = record[-1], record[0]

m_binary = binarize_data(m_data)
m_train, m_test = train_test_split(m_binary, 0.8)

m_tree = ID3(m_train)
tree_prune_stat(m_tree, m_train, m_test)
import sys from id3 import ID3 from data import Dataframe import copy model = ID3() datafile = sys.argv[1] dataset = Dataframe("") dataset.read_data(dataset, datafile) dataset_copy = Dataframe("") dataset_copy.read_data(dataset_copy, datafile) if len(sys.argv) == 3: root = model.fit(dataset, dataset_copy, dataset.attributes, dataset.target_attribute) print("[BRANCHES]:") else: root = model.fit2(dataset, dataset_copy, dataset.attributes, dataset.target_attribute, sys.argv[3], 0) print("[BRANCHES]:") model.printAllRootToLeafPaths(root) datafile_test = sys.argv[2] dataset_test = Dataframe("") dataset_test.read_data(dataset_test, datafile_test) predictions = [] for row in dataset_test.rows:
def __init__(self):
    """Run tasks 1 to 5 at construction time, printing each result.

    Sets ``faits_initiaux``, ``arbre``, ``regles`` and ``arbre_advance`` on
    the instance.  All data is read from fixed paths under ``../Data``.
    """
    id3 = ID3()

    # Import data
    donnee_train = traitement_donnees.import_donnee(self,"../Data/train_bin.csv")
    donnee_test = traitement_donnees.import_donnee_test(self,"../Data/test_public_bin.csv")
    self.faits_initiaux = donnee_train

    # Task 1 : Build tree
    self.arbre = id3.construit_arbre(donnee_train)
    print(self.arbre)

    # Task 2 : Precision of the tree
    # n counts the test examples, p those whose last element of the
    # classification result equals the example's 'target'.
    n = 0
    p = 0
    for donnee in donnee_test :
        model_result = self.classifie(donnee, self.arbre)
        if model_result[-1] == donnee['target']:
            p = p+1
        n = n+1
    # NOTE(review): raises ZeroDivisionError if the test set is empty.
    print("Precision : " + str (p/n))

    # Task 3 : generate rules
    self.regles = self.generation_regle(self.arbre)

    # Print rules, numbered from 1
    r = 0
    for regle in self.regles:
        r += 1
        print(str(r) + ') ' + self.ecrit_regle(regle))

    # Justification of an example using the rules
    conflict = []
    print(self.justifie_exemple(donnee_test[1], self.regles, self.arbre, conflict))
    # any patient can be used as an example, we just chose to only print one

    # Rules precision (should be the same as the precision of the tree they come from)
    # NOTE(review): the same `conflict` list was already passed to the call
    # above; if justifie_exemple appends to it, example 1 may be counted
    # twice here -- confirm.
    n_ex = 0
    for ex in donnee_test:
        justification = self.arbre.justifie_exemple(ex, self.regles, conflict)
        # justification can be printed in case someone wants to see the
        # justification for each patient of the test data
        n_ex += 1
    print('Taux de succes des justifications : ' + str(1 - len(conflict)/n_ex))

    # Task 4 : try to help the patients classified as sick by the tree
    # d is handed to diagnostic() for every patient; its final length is
    # reported as the number of patients helped.
    d=[]
    for patient in donnee_test:
        self.arbre.diagnostic(self.regles,patient, d)
    print ('On a pu aider ' + str(len(d)) + ' patients en changeant 2 parametres au maximum.')

    # Task 5
    id3_pt5= ID3_PT5()

    # Import continuous data
    donnee_train_continue = traitement_donnees.import_donnee(self,"../Data/train_continuous.csv")
    donnee_test_continue = traitement_donnees.import_donnee_test(self,"../Data/test_public_continuous.csv")

    # Build continuous tree
    self.arbre_advance = id3_pt5.construit_arbre(donnee_train_continue)
    print(self.arbre_advance)

    # Accuracy of the continuous tree (same counting scheme as task 2, but
    # classification is a method of the tree itself here)
    n = 0
    p = 0
    for donnee in donnee_test_continue :
        model_result = self.arbre_advance.classifie(donnee)
        if model_result[-1] == donnee['target']:
            p = p+1
        n = n+1
    print("Precision : " + str(p/n))
from data_load import load
from id3 import ID3
# NOTE(review): the stdlib `stat` module has no tree_prune_stat, so this
# presumably resolves to a project-local stat.py shadowing it -- confirm.
from stat import tree_prune_stat

TRAIN_PATH = "./data1.txt"
TEST_PATH = "./test1.txt"

# Load both files with cast_to_int enabled.
dane = load(TRAIN_PATH, cast_to_int=True)
test = load(TEST_PATH, cast_to_int=True)

# Build the tree on the training data, then hand it to tree_prune_stat.
tree = ID3(dane)
tree_prune_stat(tree, dane, test)
import numpy as np

# NOTE(review): `pd` and `ID3` are used below but their imports are not in
# the visible part of this script -- presumably imported just above.
all_data = pd.read_csv('./dna_data/all.csv')
all_data = all_data.drop('name', axis=1)
all_data['dna'] = all_data['dna'].apply(lambda x: x.strip())
all_data['dna_len'] = all_data['dna'].apply(len)

# One column for the class plus one per sequence position (d0..d59).
columns = ['system'] + ['d{}'.format(i) for i in range(60)]

# Expand every sequence into one character per column.  The frame is built
# in one step: the former per-row `.iloc[index] = ...` assignment used index
# labels as positions and was slow.  A sequence whose length is not 60 still
# raises ValueError.
expanded_rows = [
    [system] + list(dna)
    for system, dna in zip(all_data['system'], all_data['dna'])
]
modified_data = pd.DataFrame(
    expanded_rows, columns=columns, index=all_data.index, dtype=object)

# Randomly hold out 100 rows for testing; the remainder is training data.
permutation = np.random.permutation(len(modified_data))[:100]
test_data = modified_data.iloc[permutation]
result = test_data['system'].values
test_data = test_data.drop('system', axis=1)
train_data = modified_data.drop(permutation)

id3_solver = ID3(train_data, target='system')
id3_solver.run()
id3_solver.render_decision_tree('./dna_data/dtree')
predict = id3_solver.predict(test_data, force=True)
accuracy = id3_solver.score(predict, result)
print('The accuracy of the prediction of test data is {}'.format(accuracy))
from id3 import ID3
from anytree import RenderTree

# Attribute names handed to ID3; 'Sport' is the extra key of every example.
A = ['Outlook', 'Temperature', 'Humidity', 'Wind']

# The 14 examples, one tuple per row in the order A + ['Sport'].
_ROWS = (
    ("Sunny", "Hot", "High", "Weak", "No"),
    ("Sunny", "Hot", "High", "Strong", "No"),
    ("Overcast", "Hot", "High", "Weak", "Yes"),
    ("Rain", "Mild", "High", "Weak", "Yes"),
    ("Rain", "Cool", "Normal", "Weak", "Yes"),
    ("Rain", "Cool", "Normal", "Strong", "No"),
    ("Overcast", "Cool", "Normal", "Strong", "Yes"),
    ("Sunny", "Mild", "High", "Weak", "No"),
    ("Sunny", "Cool", "Normal", "Weak", "Yes"),
    ("Rain", "Mild", "Normal", "Weak", "Yes"),
    ("Sunny", "Mild", "Normal", "Strong", "Yes"),
    ("Overcast", "Mild", "High", "Strong", "Yes"),
    ("Overcast", "Hot", "Normal", "Weak", "Yes"),
    ("Rain", "Mild", "High", "Strong", "No"),
)

# List of dicts, one per example, keyed by column name.
S = [dict(zip(A + ['Sport'], row)) for row in _ROWS]

decision_tree = ID3(S, A)

# Show the tree produced by ID3
for prefix, _fill, node in RenderTree(decision_tree.T):
    print(f"{prefix}{node.name}")
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from id3 import ID3

# 14-row PlayTennis table.  The frame is built straight from nested lists:
# the previous version wrapped them in np.array, but numpy is not imported
# by this script.
data = pd.DataFrame(
    [["Sunny", "Hot", "High", "Weak", "No"],
     ["Sunny", "Hot", "High", "Strong", "No"],
     ["Overcast", "Hot", "High", "Weak", "Yes"],
     ["Rain", "Mild", "High", "Weak", "Yes"],
     ["Rain", "Cool", "Normal", "Weak", "Yes"],
     ["Rain", "Cool", "Normal", "Strong", "No"],
     ["Overcast", "Cool", "Normal", "Strong", "Yes"],
     ["Sunny", "Mild", "High", "Weak", "No"],
     ["Sunny", "Cool", "Normal", "Weak", "Yes"],
     ["Rain", "Mild", "Normal", "Weak", "Yes"],
     ["Sunny", "Mild", "Normal", "Strong", "Yes"],
     ["Overcast", "Mild", "High", "Strong", "Yes"],
     ["Overcast", "Hot", "Normal", "Weak", "Yes"],
     ["Rain", "Mild", "High", "Strong", "No"]],
    columns=['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'])

# Only these three columns are offered to the learner as split candidates.
attributes = ['Humidity', 'Wind', 'Outlook']
target_attribute = "PlayTennis"

id3_instance = ID3()
id3_instance.fit(data, target_attribute, attributes)
print("xxxxxxxxxxxxxxxxxxxxxxxxx")
id3_instance.traverse("")
# general settings attrs = {'Age': age_labels, 'Pclass': [1, 2, 3], 'Sex': ['male', 'female']} goal_attr = {'Survived': [0, 1]} train_data, test_data = train_test_split(data, args['test']) # now we build each method if args['predictor'] == 'random_forest': predictior = RandomForest(shanon_gain) predictior.train(args['num_tree'], args['n_data'], args['n_attr'], train_data, goal_attr, attrs) else: gain_func = shanon_gain if args['gain'] == 'shanon' else gini_gain id3 = ID3(gain_func) predictior = id3.train(train_data, goal_attr, attrs) # now we test predictions = [] # make predictions for index, case in test_data.iterrows(): case = case.to_dict() predictions.append(predictior.predict(case)) # build confusion matrix labels = [1, 0] conf_matrix = confusion_matrix(test_data.Survived.to_list(), predictions, labels=labels) df_cm = pd.DataFrame(conf_matrix, index=['survived', 'not survived'],