def main(path, predict_column):
    df = pd.read_csv(path)
    columns = df.columns.values
    X = [
        df[column].values for column in columns
        if column != predict_column and column != 'Name'
    ]
    y = df[predict_column].values

    swtch = Switcher()
    sk_X = []
    for column in X:
        converted = np.array(swtch.convert_to(column))
        sk_X.append(converted)
    sk_X = np.array(sk_X).T
    sk_y = np.array(swtch.convert_to(y)).T

    decisions = ID3()
    entropy = decisions.fit(X, y)
    print(f'Entropies:\t\t\t{entropy}')

    sk_decisions = DecisionTreeClassifier(random_state=1370)
    sk_decisions.fit(sk_X, sk_y)
    print(f'SkLearn Tree Decision accuracy:\t{sk_decisions.score(sk_X, sk_y)}')
    # print(sk_decisions.predict(['pycharm', 'Java', 'tea']))
    return 0
def prune(tree, validation_group, evaluate_func):
    if tree.children is None:  # it is a leaf
        return tree
    # features are continuous; answers are binary
    sub_validation_0 = validation_group[
        validation_group[tree.feature[0]] < tree.feature[1]]
    tree.children[0] = (0, CostSensitiveID3.prune(tree.children[0][1],
                                                  sub_validation_0,
                                                  evaluate_func))
    sub_validation_1 = validation_group[
        validation_group[tree.feature[0]] >= tree.feature[1]]
    tree.children[1] = (1, CostSensitiveID3.prune(tree.children[1][1],
                                                  sub_validation_1,
                                                  evaluate_func))
    err_prune = 0
    err_no_prune = 0
    for index, row in validation_group.iterrows():
        real_classification = row['diagnosis']
        err_prune += evaluate_func(real_classification, tree.class_of_node)
        err_no_prune += evaluate_func(real_classification,
                                      ID3class.classify(row, tree))
    if err_prune < err_no_prune:  # it is better to prune
        tree.feature = None  # was `tree.f`, an attribute that is never defined
        tree.children = None
    return tree
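# `prune` above expects `evaluate_func(real, predicted)` to return the cost of a
# prediction. A minimal sketch of such a function, assuming the usual
# cost-sensitive setup where a false negative ('M' read as 'B') is penalized
# more heavily than a false positive; the 10:1 ratio is an illustrative
# assumption, not taken from the source.
def example_evaluate(real, predicted, fn_cost=10, fp_cost=1):
    """Illustrative cost function for prune(); the cost ratio is an assumption."""
    if real == predicted:
        return 0
    # a malignant sample classified as benign is assumed to be the costly error
    return fn_cost if real == 'M' else fp_cost

# usage sketch: pruned = CostSensitiveID3.prune(tree, validation_df, example_evaluate)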
def gerar_objetos():
    texto = open("data.txt").read()
    linhas = texto.replace(' ', '').splitlines()
    objetos = ID3()
    for linha in linhas:
        objetos.dias.append(Dia(linha.split(',')))
    return objetos
def classify_by_k(self, sample, k_param=6):
    # we need only the features, not the label
    sample_to_centroid = self.improved_forest.minmax(sample.drop(['diagnosis']))
    self.improved_forest.K = k_param
    distance_np = np.array(
        [self.improved_forest.calc_distance(
            self.improved_forest.centroid_lst[it].to_numpy(),
            sample_to_centroid.to_numpy())
         for it in range(self.improved_forest.N)])

    def get_indices_of_k_smallest(arr, k):
        # finds the indices of the k smallest values (distances)
        if k == arr.size:  # all the trees can vote
            return range(k)
        # idx = np.argpartition(arr, k)
        idx = sorted(range(len(arr)), key=lambda sub: arr[sub])  # was sliced with an undefined `K`
        # return np.array(np.unravel_index(idx, arr.shape))[:, range(min(k, 0), max(k, 0))].tolist()
        return idx[:k]

    conference_indices_lst = get_indices_of_k_smallest(distance_np,
                                                       self.improved_forest.K)
    classify_func = lambda i: ID3class.classify(
        sample, self.improved_forest.id3_lst[conference_indices_lst[i]].tree)
    dec_by = int(self.improved_forest.K / 3)  # computed but never used
    # closer trees get heavier votes: weights 3, 3, 2, 2, 1, 1 by rank (assumes k <= 6)
    votes_lst = [[classify_func(i)] * (3 - int(i / 2))
                 for i in range(self.improved_forest.K)]
    votes_lst_flatt = [item for sublist in votes_lst for item in sublist]
    # compare against the number of weighted votes, not the number of trees
    if votes_lst_flatt.count('B') > len(votes_lst_flatt) / 2:  # B won the vote
        return 'B'
    return 'M'
def findBestFeaturesByIG(param, E_train, F_initial):
    # greedily select the `param` features with the highest information gain
    F_best_list = []
    for it in range(param):
        best_f = ID3class.MaxIG(F_initial, E_train)
        F_best_list.append(best_f[0])
        F_initial.remove(best_f[0])
    return F_best_list
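# MaxIG itself is not shown in these snippets. As a self-contained reference for
# what it presumably ranks features by, here is a minimal sketch of entropy-based
# information gain; the function names and the discrete-feature assumption are
# mine, not from the source.
import math
from collections import Counter

def entropy(labels):
    """Shannon entropy of a sequence of class labels."""
    total = len(labels)
    return -sum((n / total) * math.log2(n / total)
                for n in Counter(labels).values())

def information_gain(examples, labels, feature_index):
    """Gain of splitting `examples` on the discrete feature at `feature_index`."""
    by_value = {}
    for example, label in zip(examples, labels):
        by_value.setdefault(example[feature_index], []).append(label)
    remainder = sum(len(subset) / len(labels) * entropy(subset)
                    for subset in by_value.values())
    return entropy(labels) - remainder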
def fit(self, arr_instans, arr_target):
    # convert array to dataframe
    df = pandas.DataFrame.from_records(arr_instans)
    arr_target = pandas.DataFrame(arr_target)
    arr_instans = df.assign(target=arr_target.values)
    # pre-process continuous-valued attributes
    ## arr_instans = self.continuous_value(arr_instans)
    # pre-process attributes with missing values
    arr_instans = self.missing_value_handler(arr_instans)
    # pre-process attributes with many values
    # convert dataframe back to list of lists
    arr_target = [target[0] for target in arr_target.values]
    arr_instans = arr_instans.drop(columns="target", axis=1)
    arr_instans = arr_instans.values.tolist()
    # ic(arr_instans)
    # ic(arr_target)
    # create the ID3 object
    id3_ = ID3(arr_instans, arr_target)
    # fit the ID3 tree
    id3_.fit(arr_instans, arr_target)
    # adopt the fitted root (original note: this->root = ID3.root)
    self.root = id3_.root
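# missing_value_handler is referenced above but not defined in this excerpt.
# A minimal pandas sketch under the assumption that missing entries are marked
# with "?" and should be imputed with the column mode; both the marker and the
# strategy are assumptions.
import pandas

def missing_value_handler_sketch(df, missing_marker="?"):
    """Illustrative mode imputation for every non-target column."""
    df = df.replace(missing_marker, pandas.NA)
    for column in df.columns:
        if column == "target":
            continue
        df[column] = df[column].fillna(df[column].mode().iloc[0])
    return df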
def build_decision_tree(training_data_acids, training_data_labels):
    """
    Based on the training data and information gain, build a decision tree
    that can accurately predict whether an amino acid will be exposed (e)
    or buried (-).
    :param training_data_acids: The amino acids in the training data set
    :param training_data_labels: The labels in the training data set
    :return: A binary tree that represents a decision tree based on the given features
    """
    print("Building tree ...")
    # Flatten the data for training
    flat_training_data_acids = []
    flat_training_data_labels = []
    for x, y in zip(training_data_acids, training_data_labels):
        flat_training_data_acids.extend(x)
        flat_training_data_labels.extend(y)
    # Build decision tree with training data
    attributes = [i for i in range(len(training_data_acids[0][0]))]
    decision_tree = ID3(attributes, flat_training_data_acids,
                        flat_training_data_labels)
    return decision_tree
def run(): print("Executing ID3 algorithm") examples = ID3Reader.Read() DTree = ID3(examples,"ghost","ghost",0) j = 0 accuracy = 0 print("#######################################") newExamples = open("./ID3/readyData.txt",'r') for line in newExamples: j+=1 ex = line.split(',') for i in range(len(ex)): ex[i] = ex[i].rstrip() tempEx = [] for i in range(0,len(ex) -1): tempEx.append(ex[i]) answer = DTree.answer(tempEx) if answer == ex[4]: accuracy += 1 print("Test subject "+str(j)) print("Algorithm category of choice is : "+answer) print("Actual category is : "+ex[-1] ) print("#######################################") return str("ID3 algorithm accuracy with "+ str(len(examples))+" examples of Iris flowers is: "+str(accuracy)+"/30")
def gerar_objetos():
    global cabecalho
    linhas = open("data.txt").read().replace(' ', '').splitlines()
    objetos = ID3()
    cabecalho = linhas.pop(0).split(',')  # first line is the header row
    for linha in linhas:
        objetos.dias.append(Dia(linha.split(',')))
    return objetos
def train(self, data, p_param):
    # kf = KFold(n_splits=3, shuffle=True, random_state=318981586)
    decisions_trees = []
    for i in range(self.n_param):
        size = p_param * self.n_param
        random_examples = sample(data, k=int(size))
        classifier = ID3(random_examples)
        classifier.train()
        centroid = calc_centroid(random_examples)
        decisions_trees.append((centroid, classifier))
    self.decision_trees = decisions_trees
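# calc_centroid is used by the train() methods here but never defined in these
# excerpts. A minimal sketch, assuming each example is a sequence whose first
# entry is the class label and the rest are numeric features (the layout is an
# assumption).
import numpy as np

def calc_centroid_sketch(examples):
    """Illustrative centroid: the mean feature vector of a group of examples."""
    features = np.array([example[1:] for example in examples], dtype=float)
    return features.mean(axis=0)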
def main(path, predict_column):
    df = pd.read_csv(path)
    columns = df.columns.values
    X = [
        df[column].values for column in columns
        if column != predict_column and column != 'Name'
    ]
    y = df[predict_column].values

    decisions = ID3()
    fitted = decisions.fit(X, y)
    print(f'Entropy of {predict_column}:\t\t\t{fitted[1]}\n')
    print('Information Gains of Xs\n')
    headers = ' '.join(
        map(str,
            list(filter(lambda x: x != predict_column and x != "Name",
                        df.columns.values)))).replace(' ', '\t | ')
    print(f'{headers}')
    values = ' '.join(map(lambda x: "{0:.8f}".format(x),
                          fitted[0])).replace(' ', '\t | ')
    print(values)
    print('\n\n')

    swtch = Switcher()
    sk_X = []
    for column in X:
        converted = np.array(swtch.convert_to(column))
        sk_X.append(converted)
    sk_X = np.array(sk_X).T
    sk_y = np.array(swtch.convert_to(y)).T
    x_train, x_test, y_train, y_test = train_test_split(sk_X, sk_y,
                                                        test_size=0.3)
    sk_decisions = DecisionTreeClassifier(random_state=1370)
    sk_decisions.fit(x_train, y_train)
    print(f'SkLearn Tree Decision accuracy:\t{sk_decisions.score(x_train, y_train)}')
    print(f'SkLearn Tree Decision prediction: {sk_decisions.predict(x_test)}')
    return 0
def classify_by_k(self, sample, k_param=1):
    # we need only the features, not the label
    sample_to_centroid = self.minmax(sample.drop(['diagnosis']))
    self.K = k_param
    distance_np = np.array([self.calc_distance(self.centroid_lst[it].to_numpy(),
                                               sample_to_centroid.to_numpy())
                            for it in range(self.N)])

    def get_indices_of_k_smallest(arr, k):
        # finds the indices of the k smallest values (distances)
        if k == arr.size:  # all the trees can vote
            return range(k)
        idx = np.argpartition(arr, k)
        # return np.array(np.unravel_index(idx, arr.shape))[:, range(min(k, 0), max(k, 0))].tolist()
        return idx[:k]

    conference_indices_lst = get_indices_of_k_smallest(distance_np, self.K)
    classify_func = lambda i: ID3class.classify(
        sample, self.id3_lst[conference_indices_lst[i]].tree)
    votes_lst = [classify_func(i) for i in range(self.K)]
    if votes_lst.count('B') > len(votes_lst) / 2:  # B won the vote
        return 'B'
    return 'M'
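# minmax and calc_distance are methods of the forest object and are not shown.
# Free-function sketches of what they plausibly compute (min-max scaling to
# [0, 1] and Euclidean distance); both interpretations are assumptions.
import numpy as np

def minmax_sketch(features, feature_min, feature_max):
    """Illustrative min-max normalization; the per-feature minima and maxima
    are assumed to have been recorded on the training data."""
    return (features - feature_min) / (feature_max - feature_min)

def calc_distance_sketch(a, b):
    """Illustrative calc_distance: Euclidean distance between two vectors."""
    return np.linalg.norm(a - b)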
def train(self, data, p_param):
    # kf = KFold(n_splits=3, shuffle=True, random_state=318981586)
    decisions_trees = []
    for i in range(self.n_param):
        size = p_param * self.n_param
        random_examples = choices(data, k=int(size))
        test_group = [ex for ex in data if ex not in random_examples]
        classifier = ID3(random_examples, self.m_param, information_gain,
                         majority_class_for_knn)
        classifier.train()
        score = classifier.test(test_group, False)
        relevant = classifier.root.find_features(classifier.num_of_features)
        centroid = calc_centroid(random_examples)
        height = classifier.root.calc_height()
        decisions_trees.append((centroid, height, classifier, score, relevant))
    self.decision_trees = decisions_trees
def simpleKnn(relPath, columns, resultColumn):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, 5, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)
    for i in range(len(trainingSets)):
        tset = []
        aset = []
        for index, row in dataSet.iterrows():
            tupla = (dataSet.iloc[index][resultColumn], index)
            if tupla in trainingSets[i]:
                tset.append(row.tolist())
            if tupla in avaliationSets[i]:
                aset.append(row.tolist())
        # don't reuse the loop variable `i` for the tree object
        id3_tree = ID3(tset, resultColumn)
        id3_tree.printTree()
def train(self, data, p_param):
    decisions_trees = []
    for i in range(self.n_param):
        size = p_param * self.n_param
        random_examples = sample(data, k=int(size))
        minmax_vector = create_minmax_vector(random_examples)
        normalized_data = normalized_set(random_examples, minmax_vector)
        # choose the best m_param
        classifier = ID3(normalized_data, 10)
        classifier.train()
        test_group = [ex for ex in data if ex not in random_examples]
        # normalize all the data
        normalized_test = normalized_set(test_group, minmax_vector)
        # keep track of which features are relevant
        relevant = classifier.root.find_features(classifier.num_of_features)
        score = classifier.test(normalized_test, False)
        centroid = calc_centroid_for_impro(random_examples, relevant)
        decisions_trees.append(
            (1 - score, centroid, classifier, minmax_vector, relevant))
    # prefer more accurate trees
    decisions_trees.sort(key=lambda x: x[0])
    self.decision_trees = decisions_trees
from ID3 import ID3
from data import Data
import numpy as np

DATA_DIR = 'data_new/'

if __name__ == "__main__":
    print("Training on train.csv and testing on test.csv with no depth restriction...")
    ignore = list()
    data = np.loadtxt(DATA_DIR + 'train.csv', delimiter=',', dtype=str)
    data_obj = Data(data=data)
    ID3_obj = ID3(data_obj, data_obj.attributes, data_obj.get_column('label'))
    tree = ID3_obj.build_tree()
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore)
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore, 7, data_obj, 0)
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore, 5, data_obj, 0)
    # labels = ID3.predict(tree, data_obj)
    data = np.loadtxt(DATA_DIR + 'test.csv', delimiter=',', dtype=str)
    data_obj = Data(data=data)
    labels = ID3_obj.predict(data_obj)
    labels_true = data_obj.get_column('label')
    print("Depth =", ID3_obj.max_depth(tree))
    error = 0
    total = 0
    for l1, l2 in zip(labels, labels_true):
        total += 1
        if l1 != l2:
            error += 1
    error = float(error) / float(total) * 100
    print("Error =", error, "%")
numPlaylists, numTracks = 0, 0
startTime = time()
for artist in listdir(mp3):
    thisArtist = "%s/%s" % (mp3, artist)
    if not S_ISDIR(stat(thisArtist)[ST_MODE]):
        continue
    for album in listdir(thisArtist):
        numPlaylists += 1  # was `numTracks += 1`, which double-counted tracks
        thisAlbum = "%s/%s" % (thisArtist, album)
        if not S_ISDIR(stat(thisAlbum)[ST_MODE]):
            continue
        thisPlaylist = "%s/%s-%s.m3u" % (mp3, artist, album)
        playlist = {}
        for track in listdir(thisAlbum):
            numTracks += 1
            thisTrack = "%s/%s" % (thisAlbum, track)
            id3 = ID3(thisTrack)
            playlist[id3.track] = thisTrack
        try:
            del id3
        except NameError:
            pass
        m3u = open(thisPlaylist, "w")
        for i in range(1, len(playlist) + 1):
            try:
                m3u.write("%s\n" % playlist[i])
            except KeyError:
                pass
        m3u.close()
endTime = time()
runTime = endTime - startTime
and why does this make sense?

In about two sentences, how does the advantage of pruning change as the
data set size increases? Does this make sense, and why or why not?
"""
x = []
trend_basic = []
trend_prune = []
for size in range(10, 300, 10):
    x.append(size)
    accuracy_b = []
    accuracy_p = []
    for i in range(100):
        random.shuffle(data)
        train_set = data[:size]
        test_set = data[size:]
        tree = ID3(train_set)
        accuracy_b.append(test(tree, test_set))
        tree_p = prune(tree, test_set)
        accuracy_p.append(test(tree_p, test_set))
    trend_basic.append(np.mean(accuracy_b))
    trend_prune.append(np.mean(accuracy_p))
fig, ax = plt.subplots()
ax.plot(x, trend_basic, color='blue', label='$Default$')
ax.plot(x, trend_prune, color='orange', label='$Pruned$')
ax.legend(loc='lower right')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Test Set Accuracy')
ax.set_title('Pruned vs Standard Tree Accuracy')
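# test() is referenced by the experiment above but not defined in this excerpt.
# A minimal accuracy helper, assuming each example carries its label under a
# 'Class' key and that some classify(tree, example) helper exists; both names
# are assumptions about the assignment's API.
def test_sketch(tree, examples, classify, label_key='Class'):
    """Illustrative accuracy of `tree` on `examples`."""
    if not examples:
        return 0.0
    correct = sum(1 for example in examples
                  if classify(tree, example) == example[label_key])
    return correct / len(examples)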
from ID3 import ID3

if __name__ == '__main__':
    x, y = [], []
    label = ["age", "prescript", "astigmatic", "tear_rate"]
    with open("lenses.txt", "r") as FILE:
        for line in FILE.readlines():
            x.append(line.strip().split("\t"))
            y.append(x[-1].pop())  # last column is the class label
    id3 = ID3(x, y, label)
    id3.generate_tree()
    print(id3)
    # print(id3.inference())
    if verbose:
        pprint(X)
        print('loss if prune:', pruned_loss)
        print('current loss', cur_loss)
    if pruned_loss < cur_loss:
        root.children.clear()
        return pruned_loss
    # if not pruned, the loss of node root is the sum of the losses of all of its children
    return cur_loss


if __name__ == "__main__":
    console = Console(markup=False)
    # -------------------------- Example 1 (Small Normalization Param) ------------
    print("Example 1:")
    id3 = ID3(verbose=False)
    # feature values are Chinese: 青年 = young, 老年 = senior; 是 = yes, 否 = no;
    # 一般 = fair, 好 = good, 非常好 = excellent
    X = [
        ['青年', '否', '否', '一般'],
        ['青年', '否', '否', '好'],
        ['青年', '是', '否', '好'],
        ['青年', '是', '是', '一般'],
        ['青年', '否', '否', '一般'],
        ['老年', '否', '否', '一般'],
        ['老年', '否', '否', '好'],
        ['老年', '是', '是', '好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '好'],
        ['老年', '是', '否', '好'],
        ['老年', '是', '否', '非常好'],
import re
import sys
from channels import *
from config import *
from compat2and3 import urlquote

# ID3 libraries
try:
    from mutagen import File as get_meta
except ImportError:
    try:
        from ID3 import ID3
        log.INFO("Just basic ID3 support")
        get_meta = lambda fn: dict([(k.lower(), v)
                                    for k, v in ID3(fn).items()])
    except ImportError:
        log.INIT("You are out of luck in regards to mp3 browsing. "
                 "No ID3 support.")
        get_meta = lambda *x: {}


# Convert seconds to a time string "[[[DD:]HH:]MM:]SS".
def ddhhmmss(seconds):
    dhms = ''
    for scale in 86400, 3600, 60:
        result, seconds = divmod(seconds, scale)
        if dhms != '' or result > 0:
            dhms += '{0:02d}:'.format(result)
    dhms += '{0:02d}'.format(seconds)
    if len(dhms) == 2:
# code to get the results in my report
import numpy as np
import pandas as pd
from ID3 import ID3

# Import & some CONST
DATA_DIR = "C:/Users/Yu Zhu/OneDrive/Academy/the U/Assignment/AssignmentSln/ML-01-DT/experiment-data_new/data_new/"

##### Experiments

# 1.(b)
id3 = ID3()
id3.train_id3(fpath=DATA_DIR + 'train.csv')
id3.test_id3(fpath=DATA_DIR + 'train.csv')
# the answer is 1

# 1.(c)
id3.test_id3(fpath=DATA_DIR + 'test.csv')
# the answer is 1

# 1.(d)
rules = id3.rules
max([len(r['attr']) for r in rules])
# the answer is 6

#### CROSS-VALIDATION

# 2.(a)
# import validation set
cv1 = pd.read_csv(DATA_DIR + 'CVfolds_new/fold1.csv')
cv2 = pd.read_csv(DATA_DIR + 'CVfolds_new/fold2.csv')
                             tennis_wind[i]])

# splitting tennis datasets
tennis_train_instances, tennis_test_instances, tennis_train_targets, tennis_test_targets = \
    train_test_split(tennis_instances, tennis_targets, test_size=0.2, random_state=42)

# pre-process iris datasets
iris_instances = iris_datasets.data
iris_targets = iris_datasets.target
iris_train_instances, iris_test_instances, iris_train_targets, iris_test_targets = \
    train_test_split(iris_instances, iris_targets, test_size=0.9, random_state=42)

# test code
test_instances = [[1, 1], [0, 0]]
test_targets = []
id3 = ID3(tennis_train_instances, tennis_train_targets)  # was `id`, shadowing the built-in
id3.fit(id3.instances, id3.targets)
print("Predicted : ", id3.predict(tennis_train_instances))
print("Actual : ", tennis_train_targets)

# Node testing
"""
instances = [[0, 1, 2], [2, 1, 0]]
targets = [1, 0]
rules = ["== 0"]
root = Node(0, instances, targets)
nodes = [Node(1, [instances[0]], [targets[0]])]
root.set_rule_children(rules, nodes)
child = root.next_node(instances[0])
child.set_rule_children(["== 1"], [Node([instances[1]], [targets[1]])])
    return dataSet


def transformMushroomTargetAttribute(dataSet, targetAttribute):
    uniqueValues = dataSet[targetAttribute].unique()
    dataSet[targetAttribute].replace(uniqueValues,
                                     range(len(uniqueValues)),
                                     inplace=True)
    return dataSet


if __name__ == '__main__':
    print("Iris Dataset")
    irisFileLocation = 'iris.csv'
    irisDataSet = importData(irisFileLocation)
    irisID3 = ID3(irisDataSet, 10, [0.05, 0.10, 0.15, 0.20], True)
    irisID3.validate()

    print("Spambase Dataset")
    spambaseFileLocation = 'spambase.csv'
    spambaseDataSet = importData(spambaseFileLocation)
    spambaseID3 = ID3(spambaseDataSet, 10, [0.05, 0.10, 0.15, 0.20, 0.25], True)
    spambaseID3.validate()

    print("Mushroom Dataset - Multiway Split")
    mushroomFileLocation = 'mushroom.csv'
    mushroomDataSet = importData(mushroomFileLocation)
    columnsLength = len(mushroomDataSet.columns)
    mushroomDataSet = transformMushroomTargetAttribute(mushroomDataSet,
                                                       columnsLength - 1)
    mushroomMultiwayID3 = ID3(mushroomDataSet, 10, [0.05, 0.10, 0.15], False)
    mushroomMultiwayID3.validate()
def main(): print 'PARTS A & B' orig_spam = parse_c45('spam') orig_volcanoes = parse_c45('volcanoes') orig_voting = parse_c45('voting') print 'making numpy' spam_examples = numpy.array(orig_spam.to_float()) volcanoes_examples = numpy.array(orig_volcanoes.to_float()) voting_examples = numpy.array(orig_voting.to_float()) spam_attributes = dtree.getAttributes(orig_spam) volcanoes_attributes = dtree.getAttributes(orig_volcanoes) voting_attributes = dtree.getAttributes(orig_voting) spam_partitions = dtree.partitionExamples(spam_examples) volcanoes_partitions = dtree.partitionExamples(volcanoes_examples) voting_partitions = dtree.partitionExamples(voting_examples) part_a_builder = ID3(1, 0) print 'Spam CV Accuracies for Depth = 1' spam_trees = dtree.runOnFolds(part_a_builder, spam_partitions, spam_attributes) print '' print 'Volcanoes CV Accuracies for Depth = 1' volcanoes_trees = dtree.runOnFolds(part_a_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Voting CV Accuracies for Depth = 1' voting_trees = dtree.runOnFolds(part_a_builder, voting_partitions, voting_attributes) print '' print 'PART C' depth_1_builder = ID3(1, 0) depth_3_builder = ID3(3, 0) depth_5_builder = ID3(5, 0) depth_7_builder = ID3(7, 0) depth_9_builder = ID3(9, 0) print 'Spam Depth 1:' dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes) print 'Spam Depth 3:' dtree.runOnFolds(depth_3_builder, spam_partitions, spam_attributes) print 'Spam Depth 5:' dtree.runOnFolds(depth_5_builder, spam_partitions, spam_attributes) print 'Spam Depth 7:' dtree.runOnFolds(depth_7_builder, spam_partitions, spam_attributes) print 'Spam Depth 9:' dtree.runOnFolds(depth_9_builder, spam_partitions, spam_attributes) print 'Volcanoes Depth 1:' dtree.runOnFolds(depth_1_builder, volcanoes_partitions, volcanoes_attributes) print 'Volcanoes Depth 3:' dtree.runOnFolds(depth_3_builder, volcanoes_partitions, volcanoes_attributes) print 'Volcanoes Depth 5:' dtree.runOnFolds(depth_5_builder, volcanoes_partitions, volcanoes_attributes) print 'Volcanoes Depth 7:' dtree.runOnFolds(depth_7_builder, volcanoes_partitions, volcanoes_attributes) print 'Volcanoes Depth 9:' dtree.runOnFolds(depth_9_builder, volcanoes_partitions, volcanoes_attributes) print 'Part D' depth_1_GR_builder = ID3(1, 1) depth_3_GR_builder = ID3(3, 1) depth_5_GR_builder = ID3(5, 1) print 'Spam Depth 1 IG' dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes) print '' print 'Spam Depth 3 IG' dtree.runOnFolds(depth_3_builder, spam_partitions, spam_attributes) print '' print 'Spam Depth 5 IG' dtree.runOnFolds(depth_5_builder, spam_partitions, spam_attributes) print 'Spam Depth 1 GR' dtree.runOnFolds(depth_1_GR_builder, spam_partitions, spam_attributes) print '' print 'Spam Depth 3 GR' dtree.runOnFolds(depth_3_GR_builder, spam_partitions, spam_attributes) print '' print 'Spam Depth 5 GR' dtree.runOnFolds(depth_5_GR_builder, spam_partitions, spam_attributes) print '' print 'Voting Depth 1 IG' dtree.runOnFolds(depth_1_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 3 IG' dtree.runOnFolds(depth_3_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 5 IG' dtree.runOnFolds(depth_5_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 1 GR' dtree.runOnFolds(depth_1_GR_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 3 GR' dtree.runOnFolds(depth_3_GR_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 5 GR' 
dtree.runOnFolds(depth_5_GR_builder, voting_partitions, voting_attributes) print '' print 'Volcanoes Depth 1 IG' dtree.runOnFolds(depth_1_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 3 IG' dtree.runOnFolds(depth_3_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 5 IG' dtree.runOnFolds(depth_5_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 1 GR' dtree.runOnFolds(depth_1_GR_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 3 GR' dtree.runOnFolds(depth_3_GR_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 5 GR' dtree.runOnFolds(depth_5_GR_builder, volcanoes_partitions, volcanoes_attributes) print 'PART E' depth_2_builder = ID3(2, 0) print 'Spam CV Depth 1' dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes) print '' print 'Spam CV Depth 2' dtree.runOnFolds(depth_2_builder, spam_partitions, spam_attributes) print '' print 'Spam Full Depth 1' single_tree = depth_1_builder.buildTree(spam_examples, spam_attributes, {}, 1) print dtree.process_tree(single_tree, spam_examples) print '' print 'Spam Full Depth 2' single_tree = depth_2_builder.buildTree(spam_examples, spam_attributes, {}, 1) print dtree.process_tree(single_tree, spam_examples) print '' print 'Voting CV Depth 1' dtree.runOnFolds(depth_1_builder, voting_partitions, voting_attributes) print '' print 'Voting CV Depth 2' dtree.runOnFolds(depth_2_builder, voting_partitions, voting_attributes) print '' print 'Voting Full Depth 1' single_tree = depth_1_builder.buildTree(voting_examples, voting_attributes, {}, 1) print dtree.process_tree(single_tree, voting_examples) print '' print 'Voting Full Depth 2' single_tree = depth_2_builder.buildTree(voting_examples, voting_attributes, {}, 1) print dtree.process_tree(single_tree, voting_examples) print '' print 'Volcanoes CV Depth 1' dtree.runOnFolds(depth_1_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes CV Depth 2' dtree.runOnFolds(depth_2_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Full Depth 1' single_tree = depth_1_builder.buildTree(volcanoes_examples, volcanoes_attributes, {}, 1) print dtree.process_tree(single_tree, volcanoes_examples) print '' print 'Volcanoes Full Depth 2' single_tree = depth_2_builder.buildTree(volcanoes_examples, volcanoes_attributes, {}, 1) print dtree.process_tree(single_tree, volcanoes_examples)
def main():
    while True:
        data_path = input("Please enter path to the data: ")
        try:
            orig_array = parse_c45(data_path)
            break
        except Exception as error:
            print('Not a valid path!')
    print('Converting Example array to numpy array...')
    examples = numpy.array(orig_array.to_float())
    print('Conversion done. \n Extracting attributes...')
    attributes = getAttributes(orig_array)
    print('Attributes successfully parsed')
    while True:
        # numeric menu choices; int(input()) replaces the old Python 2 input()
        data_option = int(input(
            "Enter '0' for cross validation, or '1' to run on full sample:"))
        if data_option == 0:
            cross_validate = True
            break
        elif data_option == 1:
            cross_validate = False
            break
        else:
            print('Not a valid input!')
    while True:
        depth_option = int(input(
            "Please enter a nonnegative integer to set the maximum depth of "
            "the tree, or enter 0 to grow the full tree:"))
        if depth_option < 0:
            print('Not a valid input!')
        elif depth_option == 0:
            depth = float("inf")
            break
        else:
            depth = depth_option
            break
    while True:
        split_option = int(input(
            "Enter '0' for information gain, or '1' for gain ratio:"))
        if split_option != 0 and split_option != 1:
            print('Not a valid input!')
        else:
            break
    tree_builder = ID3(depth, split_option)
    if cross_validate:
        partitions = partitionExamples(examples)
        trees = runOnFolds(tree_builder, partitions, attributes)
    else:
        # build tree on the entire data set
        single_tree = tree_builder.buildTree(examples, attributes, {}, 1)
        accuracy = process_tree(single_tree, examples)
        size = single_tree.subtree_size  # was `tree`, which is undefined here
        max_depth = tree_builder.max_depth
        first_feature = single_tree.attribute.name  # was `tree`
        print('Accuracy: ' + str(accuracy))
        print('Sizes: ' + str(size))
        print('Maximum Depth: ' + str(max_depth))
        print('First Feature: ' + str(first_feature))
def collect_metadata(abspathitem, db, recentartists, recentalbums,
                     recentgenres, queues, condition):
    """ id3 tags retriever """
    id3item = None
    id3v1item = {}
    id3v1 = False
    id3v2 = False
    for decoder in DECODERS:
        try:
            id3item = decoder(abspathitem)
            break
        except Exception as e:
            logging.error(e)
    try:
        id3v1item = defaultdict(lambda: 'unknown',
                                ID3(abspathitem).as_dict())
    except InvalidTagError as e:
        logging.error(e)
    except Exception as e:
        logging.error(e)

    title = id3v1item['TITLE'].strip().lower()
    titleclean = re.sub(r"[^\w]*", "", title)
    artist = id3v1item['ARTIST'].strip().lower()
    album = id3v1item['ALBUM'].strip().lower()
    albumclean = re.sub(r"[^\w]*", "", album)
    genre = id3v1item['GENRE'].strip().lower()
    genreclean = re.sub(r"[^\w]+", "", genre).strip().lower()

    if not id3item:
        logging.warning("No ID3 information found")
        return

    length = 0.0
    try:
        title = " ".join(id3item['TIT2'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        titleclean = re.sub(r"[^\w]*", "", title)
    except Exception as e:
        logging.error(e)
    try:
        artist = " ".join(id3item['TPE1'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        album = " ".join(id3item['TALB'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        albumclean = re.sub(r"[^\w]*", "", album)
    except Exception as e:
        logging.error(e)
    try:
        genre = " ".join(id3item['TCON'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        genreclean = re.sub(r"[^\w]+", "", genre).strip().lower()
    except Exception as e:
        logging.error(e)
    try:
        length = float(id3item['TLEN'])
    except Exception as e:
        logging.error(e)

    if not id3v2:
        logging.warning("No ID3v2 information found")
        return

    with condition:
        try:
            ar = artist
            if artist not in recentartists.keys():
                if not db.execute("select id from artist where name = ?",
                                  (ar,)).fetchone():
                    db.execute("insert into artist(name) values(?)", (ar,))
                    db.commit()
                recentartists[artist] = db.execute(
                    "select id from artist where name = ?",
                    (ar,)).fetchone()[0]
        except Exception as e:
            logging.error(e)
    with condition:
        try:
            al = albumclean
            if album not in recentalbums.keys():
                if not db.execute(
                        "select id from album where titleclean = ?",
                        (al,)).fetchone():
                    db.execute(
                        "insert into album(title, titleclean) values(?, ?)",
                        (album, albumclean))
                    db.commit()
                recentalbums[album] = db.execute(
                    "select id from album where titleclean = ?",
                    (al,)).fetchone()[0]
        except Exception as e:
            logging.error(e)
    with condition:
        try:
            ge = genre
            if genre not in recentgenres.keys():
                if not db.execute("select id from genre where desc = ?",
                                  (ge,)).fetchone():
                    db.execute(
                        "insert or ignore into genre(desc, descclean) "
                        "values(?, ?)", (genre, genreclean))
                    db.commit()
                recentgenres[genre] = db.execute(
                    "select id from genre where desc = ?",
                    (ge,)).fetchone()[0]
        except Exception as e:
            logging.error(e)
    with condition:
        try:
            db.execute(
                "insert or replace into song("
                "title, titleclean, artist_id, "
                "genre_id, album_id, path, length) "
                "values (?,?,?,?,?,?,?)",
                (title, titleclean, recentartists[artist],
                 recentgenres[genre], recentalbums[album],
                 abspathitem.decode(FS_ENCODING), length))
            logging.debug("collect_metadata putting new artist on queue")
            for q in queues:
                if not q.full():
                    q.put_nowait((abspathitem, title, artist, album))
                else:
                    q.put((abspathitem, title, artist, album), block=True)
            db.commit()
        except Exception as e:
            logging.error(e)
def __init__(self, data_arr, m_param, information_gain_func,
             majority_class_for_cost, epsilon, delta):
    ID3.__init__(self, data_arr, m_param, information_gain_func,
                 majority_class_for_cost, epsilon, delta)
    self.classifiers = None
    self.m_param = m_param
    self.epsilon = epsilon