Example #1
def main(path, predict_column):
    df = pd.read_csv(path)
    columns = df.columns.values
    X = [
        df[column].values for column in columns
        if column != predict_column and column != 'Name'
    ]
    y = df[predict_column].values

    swtch = Switcher()

    sk_X = []
    for column in X:
        converted = np.array(swtch.convert_to(column))
        sk_X.append(converted)
    sk_X = np.array(sk_X).T

    sk_y = np.array(swtch.convert_to(y)).T

    decisions = ID3()
    entropy = decisions.fit(X, y)
    print(f'Entropies:\t\t\t{entropy}')

    sk_decisions = DecisionTreeClassifier(random_state=1370)
    sk_decisions.fit(sk_X, sk_y)
    print(f'SkLearn Tree Decision accuracy:\t{sk_decisions.score(sk_X, sk_y)}')
    #print(sk_decisions.predict(['pycharm', 'Java', 'tea']))
    return 0
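
The custom ID3 here reports entropy values; as a reference point, the Shannon entropy of a label column is H(Y) = -sum_v p(v) * log2 p(v). A minimal standalone sketch of that formula (assuming only numpy, not this project's API):

import numpy as np

def shannon_entropy(labels):
    # H(Y) = -sum over distinct values v of p(v) * log2(p(v))
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())

# a perfectly balanced binary column has entropy 1.0:
# shannon_entropy(np.array(['yes', 'no', 'yes', 'no']))  # -> 1.0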
Example #2
    def prune(tree, validation_group, evaluate_func):
        if tree.children is None:  # it is a leaf
            return tree
        # continuous features, binary answers

        sub_validation_0 = validation_group[
            validation_group[tree.feature[0]] < tree.feature[1]]
        tree.children[0] = (0,
                            CostSensitiveID3.prune(tree.children[0][1],
                                                   sub_validation_0,
                                                   evaluate_func))

        sub_validation_1 = validation_group[
            validation_group[tree.feature[0]] >= tree.feature[1]]
        tree.children[1] = (1,
                            CostSensitiveID3.prune(tree.children[1][1],
                                                   sub_validation_1,
                                                   evaluate_func))

        err_prune = 0
        err_no_prune = 0

        for index, row in validation_group.iterrows():
            real_classification = row['diagnosis']
            err_prune += evaluate_func(real_classification, tree.class_of_node)
            err_no_prune += evaluate_func(real_classification,
                                          ID3class.classify(row, tree))

        if err_prune < err_no_prune:  # it is better to prune
            tree.feature = None  # clear the split so this node becomes a leaf
            tree.children = None
        return tree
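
prune receives evaluate_func(real, predicted) and sums its output as an error, so any asymmetric cost function fits here. A hypothetical sketch of a cost-sensitive loss (the weights are assumptions; the 'M'/'B' labels match the diagnosis column used above):

def cost_sensitive_loss(real, predicted, fn_cost=8.0, fp_cost=1.0):
    # hypothetical asymmetric costs: missing a real 'M' is assumed far worse
    if real == predicted:
        return 0.0
    return fn_cost if real == 'M' else fp_cost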
Example #3
def gerar_objetos():
    texto = open("data.txt").read()
    linhas = texto.replace(' ', '').splitlines()
    objetos = ID3()
    for linha in linhas:
        objetos.dias.append(Dia(linha.split(',')))
    return objetos
Example #4
    def classify_by_k(self, sample, k_param=6):
        sample_to_centroid = self.improved_forest.minmax(sample.drop(['diagnosis']))  # we need only the features
        self.improved_forest.K = k_param
        distance_np = np.array(
            [self.improved_forest.calc_distance(self.improved_forest.centroid_lst[it].to_numpy(),
                                                sample_to_centroid.to_numpy())
             for it in range(self.improved_forest.N)])

        def get_indices_of_k_smallest(arr, k):  # finds the k indices of k smallest values (distance)
            if k == arr.size:  # all the trees can vote
                return range(k)
            #idx = np.argpartition(arr, k)
            idx = sorted(range(len(arr)), key=lambda sub: arr[sub])[:k]  # indices sorted by distance
            # return np.array(np.unravel_index(idx, arr.shape))[:, range(min(k, 0), max(k, 0))].tolist()
            return idx[:k]

        conference_indices_lst = get_indices_of_k_smallest(distance_np, self.improved_forest.K)
        classify_func = lambda i: ID3class.classify(sample,
                                                    self.improved_forest.id3_lst[conference_indices_lst[i]].tree)

        dec_by = int(self.improved_forest.K / 3)
        votes_lst = [[classify_func(i)] * (3 - int(i /2)) for i in
                     range(self.improved_forest.K)]
        votes_lst_flatt = [item for sublist in votes_lst for item in sublist]

        if votes_lst_flatt.count('B') > len(votes_lst_flatt) / 2:  # B won the weighted vote
            return 'B'
        return 'M'
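
The commented-out np.argpartition call is the usual O(n) way to take the k smallest distances (Example #12 below uses it directly); a quick standalone check of its behavior:

import numpy as np

distances = np.array([0.9, 0.1, 0.5, 0.3])
idx = np.argpartition(distances, 2)[:2]        # indices of the 2 smallest, in no particular order
print(sorted(distances[idx].tolist()))         # prints [0.1, 0.3]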
Example #5
def findBestFeaturesByIG(param, E_train, F_intial):
    # greedily take the `param` features with the highest information gain;
    # note that F_intial is mutated: each chosen feature is removed from it
    F_best_list = []
    for it in range(param):
        best_f = ID3class.MaxIG(F_intial, E_train)
        F_best_list.append(best_f[0])
        F_intial.remove(best_f[0])
    return F_best_list
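
MaxIG is the project's own helper; the quantity it maximizes, information gain, is the entropy drop a feature produces: IG(Y; F) = H(Y) - sum_v p(F=v) * H(Y | F=v). A minimal sketch using the shannon_entropy helper from the note under Example #1 (feature_values and labels as numpy arrays):

def information_gain(feature_values, labels):
    # label entropy minus the size-weighted entropy of each feature-value slice
    gain = shannon_entropy(labels)
    n = len(labels)
    for v in np.unique(feature_values):
        mask = feature_values == v
        gain -= (mask.sum() / n) * shannon_entropy(labels[mask])
    return gain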
Example #6
    def fit(self, arr_instans, arr_target):
        #convert array to dataframe
        df = pandas.DataFrame.from_records(arr_instans)
        arr_target = pandas.DataFrame(arr_target)
        arr_instans = df.assign(target=arr_target.values)

        #pre-process continuous valued attribute
        ##arr_instans = self.continuous_value(arr_instans)

        #pre-process missing value attribute
        arr_instans = self.missing_value_handler(arr_instans)

        #pre-process attribute with many values

        #convert dataframe to array of array
        arr_target = [target[0] for target in arr_target.values]
        arr_instans = arr_instans.drop(columns="target", axis=1)
        arr_instans = arr_instans.values.tolist()

        #ic(arr_instans)
        #ic(arr_target)

        # build the ID3 object
        id3_ = ID3(arr_instans, arr_target)

        # fit the ID3 tree
        id3_.fit(arr_instans, arr_target)

        # expose the fitted tree's root on this wrapper
        self.root = id3_.root
Example #7
def build_decision_tree(training_data_acids, training_data_labels):
    """
    Based on the training data and information gain,
    build a decision tree that can accurately predict if an amino acid
    will be exposed(e) or buried(-)
    :param training_data_acids: The amino acids in the training data set
    :param training_data_labels: The labels in the training data set
    :return: A binary tree that represents a decision tree based on the given features
    """

    print "Building tree ..."

    # Flatten the data for training
    flat_training_data_acids = []
    flat_training_data_labels = []
    for x, y in zip(training_data_acids, training_data_labels):
        flat_training_data_acids.extend(x)
        flat_training_data_labels.extend(y)

    # Build decision tree with training data
    attributes = [i for i in range(len(training_data_acids[0][0]))]
    decision_tree = ID3(attributes, flat_training_data_acids,
                        flat_training_data_labels)

    return decision_tree
Example #8
def run():
	print("Executing ID3 algorithm")
	examples = ID3Reader.Read()


	DTree = ID3(examples,"ghost","ghost",0)
	j = 0


	accuracy = 0

	print("#######################################")
	newExamples = open("./ID3/readyData.txt",'r')
	for line in newExamples:
		j+=1
		ex = line.split(',')
		for i in range(len(ex)):
			ex[i] = ex[i].rstrip()
		tempEx = []
		for i in range(0,len(ex) -1):
			tempEx.append(ex[i])
		answer = DTree.answer(tempEx)
		if answer == ex[-1]:
			accuracy += 1
		print("Test subject "+str(j))
		print("Algorithm category of choice is : "+answer)
		print("Actual category is : "+ex[-1] )
		print("#######################################")

	return "ID3 algorithm accuracy with " + str(len(examples)) + " examples of Iris flowers is: " + str(accuracy) + "/" + str(j)
Example #9
def gerar_objetos():
    global cabecalho

    linhas = open("data.txt").read().replace(' ', '').splitlines()
    objetos = ID3()

    cabecalho = linhas.pop(0).split(',')
    for linha in linhas:
        objetos.dias.append(Dia(linha.split(',')))

    return objetos
Example #10
    def train(self, data, p_param):
        # kf = KFold(n_splits=3, shuffle=True, random_state=318981586)
        decisions_trees = []

        for i in range(self.n_param):
            size = p_param * self.n_param
            random_examples = sample(data, k=int(size))
            classifier = ID3(random_examples)
            classifier.train()
            centroid = calc_centroid(random_examples)
            decisions_trees.append((centroid, classifier))
        self.decision_trees = decisions_trees
Example #11
def main(path, predict_column):
    df = pd.read_csv(path)
    columns = df.columns.values
    X = [
        df[column].values for column in columns
        if column != predict_column and column != 'Name'
    ]
    y = df[predict_column].values

    decisions = ID3()
    fitted = decisions.fit(X, y)
    print(f'Entropy of {predict_column}:\t\t\t{fitted[1]}\n')
    print(f'Information Gains of Xs\n')
    headers = ' '.join(
        map(
            str,
            list(
                filter(lambda x: x != predict_column and x != "Name",
                       df.columns.values)))).replace(' ', '\t | ')
    print(f'{headers}')
    values = ' '.join(map(lambda x: "{0:.8f}".format(x),
                          fitted[0])).replace(' ', '\t | ')
    print(values)
    print('\n\n')

    swtch = Switcher()

    sk_X = []
    for column in X:
        converted = np.array(swtch.convert_to(column))
        sk_X.append(converted)
    sk_X = np.array(sk_X).T

    sk_y = np.array(swtch.convert_to(y)).T

    x_train, x_test, y_train, y_test = train_test_split(sk_X,
                                                        sk_y,
                                                        test_size=0.3)

    sk_decisions = DecisionTreeClassifier(random_state=1370)
    sk_decisions.fit(x_train, y_train)
    print(
        f'SkLearn Tree Decision accuracy:\t{sk_decisions.score(x_train, y_train)}'
    )
    print(f'SkLearn Tree Decision prediction: {sk_decisions.predict(x_test)}')
    return 0
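
One caveat: the accuracy printed above is measured on x_train, so it is training accuracy (usually close to 1.0 for an unconstrained tree). The held-out split already exists, so the more informative number is one line away:

    print(f'SkLearn held-out accuracy:\t{sk_decisions.score(x_test, y_test)}')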
Example #12
    def classify_by_k(self, sample, k_param=1):
        sample_to_centroid = self.minmax(sample.drop(['diagnosis']))  # we need only the features
        self.K = k_param
        distance_np = np.array([self.calc_distance(self.centroid_lst[it].to_numpy(), sample_to_centroid.to_numpy())
                                for it in range(self.N)])

        def get_indices_of_k_smallest(arr, k):  # finds the k indices of k smallest values (distance)
            if k == arr.size:  # all the trees can vote
                return range(k)
            idx = np.argpartition(arr, k)
            # return np.array(np.unravel_index(idx, arr.shape))[:, range(min(k, 0), max(k, 0))].tolist()
            return idx[:k]

        conference_indices_lst = get_indices_of_k_smallest(distance_np, self.K)
        classify_func = lambda i: ID3class.classify(sample, self.id3_lst[conference_indices_lst[i]].tree)
        votes_lst = [classify_func(i) for i in range(self.K)]
        if votes_lst.count('B') > len(votes_lst) / 2:  # B won the vote
            return 'B'
        return 'M'
Example #13
    def train(self, data, p_param):
        # kf = KFold(n_splits=3, shuffle=True, random_state=318981586)
        decisions_trees = []

        for i in range(self.n_param):
            size = p_param * self.n_param
            random_examples = choices(data, k=int(size))
            test_group = [ex for ex in data if ex not in random_examples]
            classifier = ID3(random_examples, self.m_param, information_gain,
                             majority_class_for_knn)
            classifier.train()
            score = classifier.test(test_group, False)
            relevant = classifier.root.find_features(
                classifier.num_of_features)
            centroid = calc_centroid(random_examples)
            height = classifier.root.calc_height()
            decisions_trees.append(
                (centroid, height, classifier, score, relevant))
        self.decision_trees = decisions_trees
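
Note the difference from Example #10: sample draws without replacement, while choices used here draws with replacement, i.e. a bootstrap sample. A quick illustration of the two stdlib functions:

from random import sample, choices

data = list(range(5))
print(sample(data, k=3))   # 3 distinct elements, e.g. [4, 0, 2]
print(choices(data, k=3))  # may repeat elements, e.g. [1, 1, 3]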
Example #14
def simpleKnn(relPath, columns, resultColumn):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, 5, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)

    for i in range(len(trainingSets)):
        tset = []
        aset = []
        for index, row in dataSet.iterrows():
            tupla = (dataSet.iloc[index][resultColumn], index)
            if tupla in trainingSets[i]:
                tset.append(row.tolist())
            if tupla in avaliationSets[i]:
                aset.append(row.tolist())
        tree = ID3(tset, resultColumn)  # dedicated name; don't shadow the fold index `i`
        tree.printTree()
Example #15
    def train(self, data, p_param):
        decisions_trees = []

        for i in range(self.n_param):
            size = p_param * self.n_param
            random_examples = sample(data, k=int(size))
            minmax_vector = create_minmax_vector(random_examples)
            normalized_data = normalized_set(random_examples, minmax_vector)
            # choose the best m_param
            classifier = ID3(normalized_data, 10)
            classifier.train()
            test_group = [ex for ex in data if ex not in random_examples]
            # normalize all the data:
            normalized_test = normalized_set(test_group, minmax_vector)
            # keep track of which features are relevant:
            relevant = classifier.root.find_features(
                classifier.num_of_features)
            score = classifier.test(normalized_test, False)
            centroid = calc_centroid_for_impro(random_examples, relevant)
            decisions_trees.append(
                (1 - score, centroid, classifier, minmax_vector, relevant))
        # prefer more accurate trees (lowest error first):
        decisions_trees.sort(key=lambda x: x[0])
        self.decision_trees = decisions_trees
Example #16
from ID3 import ID3
from data import Data
import numpy as np
DATA_DIR = 'data_new/'

if __name__ == "__main__":
    print("Training on train.csv and testing on test.csv with no depth restriction...")
    ignore = list()
    data = np.loadtxt(DATA_DIR+'train.csv', delimiter=',', dtype=str)
    data_obj = Data(data=data)
    ID3_obj = ID3(data_obj, data_obj.attributes, data_obj.get_column('label'))
    tree = ID3_obj.build_tree()
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore)
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore, 7, data_obj, 0)
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore, 5, data_obj, 0)
    # labels = ID3.predict(tree, data_obj)
    data = np.loadtxt(DATA_DIR+'test.csv', delimiter=',', dtype=str)
    data_obj = Data(data=data)
    labels = ID3_obj.predict(data_obj)
    labels_true = data_obj.get_column('label')
    print("Depth =", ID3_obj.max_depth(tree))
    error = 0
    total = 0
    for l1, l2 in zip(labels, labels_true):
        total += 1
        if l1 != l2:
            error += 1
    error = float(error)/float(total) * 100
    print("Error =", error, "%")
Example #17
# fragment: `mp3` (the music root directory) and the ID3 class are assumed
# to be defined earlier in the script
from os import listdir, stat
from stat import S_ISDIR, ST_MODE
from time import time

numPlaylists, numTracks = 0, 0
startTime = time()

for artist in listdir(mp3):
    thisArtist = "%s/%s" % (mp3, artist)
    if not S_ISDIR(stat(thisArtist)[ST_MODE]): continue
    for album in listdir(thisArtist):
        numPlaylists += 1  # count the playlist for this album directory
        thisAlbum = "%s/%s" % (thisArtist, album)
        if not S_ISDIR(stat(thisAlbum)[ST_MODE]): continue
        thisPlaylist = "%s/%s-%s.m3u" % (mp3, artist, album)
        playlist = {}
        for track in listdir(thisAlbum):
            numTracks += 1
            thisTrack = "%s/%s" % (thisAlbum, track)
            id3 = ID3(thisTrack)
            playlist[id3.track] = thisTrack
        try:
            del id3
        except NameError:
            pass
        m3u = open(thisPlaylist, "w")
        for i in range(1, len(playlist) + 1):
            try:
                m3u.write("%s\n" % playlist[i])
            except KeyError:
                pass
        m3u.close()

endTime = time()
runTime = endTime - startTime
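
runTime is computed but never reported; a minimal closing line (assuming the counters above are what the script means to summarize):

print("%d playlists, %d tracks scanned in %.1f seconds" % (numPlaylists, numTracks, runTime))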
Example #18
and why does this make sense?
In about two sentences, how does the advantage of pruning change as the data set size increases? 
Does this make sense, and why or why not?
"""

x = []
trend_basic = []
trend_prune = []
for size in range(10, 300, 10):
    x.append(size)
    accuracy_b = []
    accuracy_p = []
    for i in range(100):
        random.shuffle(data)
        train_set = data[:size]
        test_set = data[size:]

        tree = ID3(train_set)
        accuracy_b.append(test(tree, test_set))
        tree_p = prune(tree, test_set)
        accuracy_p.append(test(tree_p, test_set))
    trend_basic.append(np.mean(accuracy_b))
    trend_prune.append(np.mean(accuracy_p))

fig, ax = plt.subplots()
ax.plot(x, trend_basic, color='blue', label='$Default$')
ax.plot(x, trend_prune, color='orange', label='$Pruned$')
ax.legend(loc='lower right')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Test Set Accuracy')
ax.set_title('Pruned vs Standard Tree Accuracy')
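
The figure is built but never rendered or saved; depending on the matplotlib backend, a final line is needed:

plt.show()  # or fig.savefig('prune_comparison.png')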
Example #19
from ID3 import ID3

if __name__ == '__main__':
    x, y = [], []
    label = ["age", "prescript", "astigmatic", "tear_rate"]
    with open("lenses.txt", "r") as FILE:
        for line in FILE.readlines():
            x.append(line.strip().split("\t"))
            y.append(x[-1].pop())
    id3 = ID3(x, y, label)
    id3.generate_tree()
    print(id3)
    # print(id3.inference())
Example #20
    if verbose:
        pprint(X)
        print('loss if prune:', pruned_loss)
        print('current loss', cur_loss)
    if pruned_loss < cur_loss:
        root.children.clear()
        return pruned_loss
    # if not pruned, the loss of node root is the sum loss of all of its children
    return cur_loss


if __name__ == "__main__":
    console = Console(markup=False)
    # -------------------------- Example 1 (Small Normalization Param) ------------
    print("Example 1:")
    id3 = ID3(verbose=False)
    X = [
        ['青年', '否', '否', '一般'],
        ['青年', '否', '否', '好'],
        ['青年', '是', '否', '好'],
        ['青年', '是', '是', '一般'],
        ['青年', '否', '否', '一般'],
        ['老年', '否', '否', '一般'],
        ['老年', '否', '否', '好'],
        ['老年', '是', '是', '好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '好'],
        ['老年', '是', '否', '好'],
        ['老年', '是', '否', '非常好'],
Example #21
import re
import sys

from channels import *
from config import *
from compat2and3 import urlquote

# ID3 libraries
try:
    from mutagen import File as get_meta
except:
    try:
        from ID3 import ID3
        log.INFO("Just basic ID3 support")
        get_meta = lambda fn: dict([(k.lower(), v)
                                    for k, v in ID3(fn).iteritems()])
    except:
        log.INIT(
            "You are out of luck in regards to mp3 browsing. No ID3 support.")
        get_meta = lambda *x: {}


#Convert seconds to a time string "[[[DD:]HH:]MM:]SS".
def ddhhmmss(seconds):
    dhms = ''
    for scale in 86400, 3600, 60:
        result, seconds = divmod(seconds, scale)
        if dhms != '' or result > 0:
            dhms += '{0:02d}:'.format(result)
    dhms += '{0:02d}'.format(seconds)
    if len(dhms) == 2:
Example #22
# codes to get the results in my report

import numpy as np
import pandas as pd
from ID3 import ID3

# Import & some CONST
DATA_DIR = "C:/Users/Yu Zhu/OneDrive/Academy/the U/Assignment/AssignmentSln/ML-01-DT/experiment-data_new/data_new/"

##### Experiments
# 1.(b)
id3 = ID3()
id3.train_id3(fpath = DATA_DIR + 'train.csv')
id3.test_id3(fpath = DATA_DIR + 'train.csv') # the answer is 1

# 1.(c)
id3.test_id3(fpath = DATA_DIR + 'test.csv') # the answer is 1

# 1.(d)
rules = id3.rules
max([len(r['attr']) for r in rules]) # the answer is 6



#### CROSS-VALIDATION

# 2.(a)

# import validation set
cv1 = pd.read_csv(DATA_DIR + 'CVfolds_new/fold1.csv')
cv2 = pd.read_csv(DATA_DIR + 'CVfolds_new/fold2.csv')
Example #23
                            tennis_wind[i]])

#splitting tennis_datasets
tennis_train_instances, tennis_test_instances, tennis_train_targets, tennis_test_targets = train_test_split(tennis_instances, tennis_targets, test_size=0.2, random_state=42)

#pre-process iris datasets
iris_instances = iris_datasets.data
iris_targets = iris_datasets.target

iris_train_instances, iris_test_instances, iris_train_targets, iris_test_targets = train_test_split(iris_instances, iris_targets, test_size=0.9, random_state=42)

# testing code
test_instances = [[1, 1], [0, 0]]
test_targets = []

id3_clf = ID3(tennis_train_instances, tennis_train_targets)  # avoid shadowing the built-in `id`
id3_clf.fit(id3_clf.instances, id3_clf.targets)
print("Predicted : ", id3_clf.predict(tennis_train_instances))
print("Actual : ", tennis_train_targets)

# Node testing
""" instances = [[0, 1, 2], [2, 1, 0]]
targets = [1, 0]
rules = ["== 0"] 
root = Node(0, instances, targets)

nodes = [Node(1, [instances[0]], [targets[0]])]
root.set_rule_children(rules, nodes)
child = root.next_node(instances[0])
child.set_rule_children(["== 1"], [Node([instances[1]], [targets[1]])])
Example #24
    return dataSet


def transformMushroomTargetAttribute(dataSet, targetAttribute):
    uniqueValues = dataSet[targetAttribute].unique()
    dataSet[targetAttribute].replace(uniqueValues,
                                     range(len(uniqueValues)),
                                     inplace=True)
    return dataSet


if __name__ == '__main__':
    print("Iris Dataset")
    irisFileLocation = 'iris.csv'
    irisDataSet = importData(irisFileLocation)
    irisID3 = ID3(irisDataSet, 10, [0.05, 0.10, 0.15, 0.20], True)
    irisID3.validate()
    print("Spambase Dataset")
    spambaseFileLocation = 'spambase.csv'
    spambaseDataSet = importData(spambaseFileLocation)
    spambaseID3 = ID3(spambaseDataSet, 10, [0.05, 0.10, 0.15, 0.20, 0.25],
                      True)
    spambaseID3.validate()
    print("Mushroom Dataset - Multiway Split")
    mushroomFileLocation = 'mushroom.csv'
    mushroomDataSet = importData(mushroomFileLocation)
    columnsLength = len(mushroomDataSet.columns)
    mushroomDataSet = transformMushroomTargetAttribute(mushroomDataSet,
                                                       columnsLength - 1)
    mushroomMultiwayID3 = ID3(mushroomDataSet, 10, [0.05, 0.10, 0.15], False)
    mushroomMultiwayID3.validate()
Example #25
def main():
    print 'PARTS A & B'

    orig_spam = parse_c45('spam')
    orig_volcanoes = parse_c45('volcanoes')
    orig_voting = parse_c45('voting')

    print 'making numpy'
    spam_examples = numpy.array(orig_spam.to_float())
    volcanoes_examples = numpy.array(orig_volcanoes.to_float())
    voting_examples = numpy.array(orig_voting.to_float())

    spam_attributes = dtree.getAttributes(orig_spam)
    volcanoes_attributes = dtree.getAttributes(orig_volcanoes)
    voting_attributes = dtree.getAttributes(orig_voting)

    spam_partitions = dtree.partitionExamples(spam_examples)
    volcanoes_partitions = dtree.partitionExamples(volcanoes_examples)
    voting_partitions = dtree.partitionExamples(voting_examples)

    part_a_builder = ID3(1, 0)

    print 'Spam CV Accuracies for Depth = 1'
    spam_trees = dtree.runOnFolds(part_a_builder, spam_partitions,
                                  spam_attributes)
    print ''

    print 'Volcanoes CV Accuracies for Depth = 1'
    volcanoes_trees = dtree.runOnFolds(part_a_builder, volcanoes_partitions,
                                       volcanoes_attributes)
    print ''

    print 'Voting CV Accuracies for Depth = 1'
    voting_trees = dtree.runOnFolds(part_a_builder, voting_partitions,
                                    voting_attributes)
    print ''

    print 'PART C'
    depth_1_builder = ID3(1, 0)
    depth_3_builder = ID3(3, 0)
    depth_5_builder = ID3(5, 0)
    depth_7_builder = ID3(7, 0)
    depth_9_builder = ID3(9, 0)

    print 'Spam Depth 1:'
    dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes)

    print 'Spam Depth 3:'
    dtree.runOnFolds(depth_3_builder, spam_partitions, spam_attributes)

    print 'Spam Depth 5:'
    dtree.runOnFolds(depth_5_builder, spam_partitions, spam_attributes)

    print 'Spam Depth 7:'
    dtree.runOnFolds(depth_7_builder, spam_partitions, spam_attributes)

    print 'Spam Depth 9:'
    dtree.runOnFolds(depth_9_builder, spam_partitions, spam_attributes)

    print 'Volcanoes Depth 1:'
    dtree.runOnFolds(depth_1_builder, volcanoes_partitions,
                     volcanoes_attributes)

    print 'Volcanoes Depth 3:'
    dtree.runOnFolds(depth_3_builder, volcanoes_partitions,
                     volcanoes_attributes)

    print 'Volcanoes Depth 5:'
    dtree.runOnFolds(depth_5_builder, volcanoes_partitions,
                     volcanoes_attributes)

    print 'Volcanoes Depth 7:'
    dtree.runOnFolds(depth_7_builder, volcanoes_partitions,
                     volcanoes_attributes)

    print 'Volcanoes Depth 9:'
    dtree.runOnFolds(depth_9_builder, volcanoes_partitions,
                     volcanoes_attributes)

    print 'Part D'
    depth_1_GR_builder = ID3(1, 1)
    depth_3_GR_builder = ID3(3, 1)
    depth_5_GR_builder = ID3(5, 1)

    print 'Spam Depth 1 IG'
    dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes)
    print ''
    print 'Spam Depth 3 IG'
    dtree.runOnFolds(depth_3_builder, spam_partitions, spam_attributes)
    print ''
    print 'Spam Depth 5 IG'
    dtree.runOnFolds(depth_5_builder, spam_partitions, spam_attributes)

    print 'Spam Depth 1 GR'
    dtree.runOnFolds(depth_1_GR_builder, spam_partitions, spam_attributes)
    print ''
    print 'Spam Depth 3 GR'
    dtree.runOnFolds(depth_3_GR_builder, spam_partitions, spam_attributes)
    print ''
    print 'Spam Depth 5 GR'
    dtree.runOnFolds(depth_5_GR_builder, spam_partitions, spam_attributes)
    print ''

    print 'Voting Depth 1 IG'
    dtree.runOnFolds(depth_1_builder, voting_partitions, voting_attributes)
    print ''
    print 'Voting Depth 3 IG'
    dtree.runOnFolds(depth_3_builder, voting_partitions, voting_attributes)
    print ''
    print 'Voting Depth 5 IG'
    dtree.runOnFolds(depth_5_builder, voting_partitions, voting_attributes)
    print ''

    print 'Voting Depth 1 GR'
    dtree.runOnFolds(depth_1_GR_builder, voting_partitions, voting_attributes)
    print ''
    print 'Voting Depth 3 GR'
    dtree.runOnFolds(depth_3_GR_builder, voting_partitions, voting_attributes)
    print ''
    print 'Voting Depth 5 GR'
    dtree.runOnFolds(depth_5_GR_builder, voting_partitions, voting_attributes)
    print ''

    print 'Volcanoes Depth 1 IG'
    dtree.runOnFolds(depth_1_builder, volcanoes_partitions,
                     volcanoes_attributes)
    print ''
    print 'Volcanoes Depth 3 IG'
    dtree.runOnFolds(depth_3_builder, volcanoes_partitions,
                     volcanoes_attributes)
    print ''
    print 'Volcanoes Depth 5 IG'
    dtree.runOnFolds(depth_5_builder, volcanoes_partitions,
                     volcanoes_attributes)
    print ''

    print 'Volcanoes Depth 1 GR'
    dtree.runOnFolds(depth_1_GR_builder, volcanoes_partitions,
                     volcanoes_attributes)
    print ''
    print 'Volcanoes Depth 3 GR'
    dtree.runOnFolds(depth_3_GR_builder, volcanoes_partitions,
                     volcanoes_attributes)
    print ''
    print 'Volcanoes Depth 5 GR'
    dtree.runOnFolds(depth_5_GR_builder, volcanoes_partitions,
                     volcanoes_attributes)

    print 'PART E'
    depth_2_builder = ID3(2, 0)
    print 'Spam CV Depth 1'
    dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes)
    print ''

    print 'Spam CV Depth 2'
    dtree.runOnFolds(depth_2_builder, spam_partitions, spam_attributes)
    print ''

    print 'Spam Full Depth 1'
    single_tree = depth_1_builder.buildTree(spam_examples, spam_attributes, {},
                                            1)
    print dtree.process_tree(single_tree, spam_examples)
    print ''

    print 'Spam Full Depth 2'
    single_tree = depth_2_builder.buildTree(spam_examples, spam_attributes, {},
                                            1)
    print dtree.process_tree(single_tree, spam_examples)
    print ''

    print 'Voting CV Depth 1'
    dtree.runOnFolds(depth_1_builder, voting_partitions, voting_attributes)
    print ''

    print 'Voting CV Depth 2'
    dtree.runOnFolds(depth_2_builder, voting_partitions, voting_attributes)
    print ''

    print 'Voting Full Depth 1'
    single_tree = depth_1_builder.buildTree(voting_examples, voting_attributes,
                                            {}, 1)
    print dtree.process_tree(single_tree, voting_examples)
    print ''
    print 'Voting Full Depth 2'
    single_tree = depth_2_builder.buildTree(voting_examples, voting_attributes,
                                            {}, 1)
    print dtree.process_tree(single_tree, voting_examples)
    print ''

    print 'Volcanoes CV Depth 1'
    dtree.runOnFolds(depth_1_builder, volcanoes_partitions,
                     volcanoes_attributes)
    print ''

    print 'Volcanoes CV Depth 2'
    dtree.runOnFolds(depth_2_builder, volcanoes_partitions,
                     volcanoes_attributes)
    print ''

    print 'Volcanoes Full Depth 1'
    single_tree = depth_1_builder.buildTree(volcanoes_examples,
                                            volcanoes_attributes, {}, 1)
    print dtree.process_tree(single_tree, volcanoes_examples)
    print ''
    print 'Volcanoes Full Depth 2'
    single_tree = depth_2_builder.buildTree(volcanoes_examples,
                                            volcanoes_attributes, {}, 1)
    print dtree.process_tree(single_tree, volcanoes_examples)
Example #26
def main():
    while True:
        data_path = raw_input("Please enter path to the data: ")
        try:
            orig_array = parse_c45(data_path)
            break
        except Exception as error:
            print 'Not a valid path!'

    print 'Converting Example array to numpy array...'
    examples = numpy.array(orig_array.to_float())

    print 'Conversion done. \n Extracting attributes...'
    attributes = getAttributes(orig_array)
    print 'Attributes successfully parsed'

    while True:
        data_option = input(
            "Enter '0' for cross validation, or '1' to run on full sample:")
        if data_option == 0:
            cross_validate = True
            break
        elif data_option == 1:
            cross_validate = False
            break
        else:
            print 'Not a valid input!'

    while True:
        depth_option = input(
            "Please enter a nonnegative integer to set the maximum depth of the tree, or enter 0 to grow the full tree:"
        )
        if depth_option < 0:
            print 'Not a valid input!'
        elif depth_option == 0:
            depth = float("inf")
            break
        else:
            depth = depth_option
            break

    while True:
        split_option = input(
            "Enter '0' for information gain, or '1' for gain ratio:")
        if split_option != 0 and split_option != 1:
            print 'Not a valid input!'
        else:
            break

    tree_builder = ID3(depth, split_option)

    if cross_validate:
        partitions = partitionExamples(examples)
        trees = runOnFolds(tree_builder, partitions, attributes)

    else:
        # build tree on the entire data set
        single_tree = tree_builder.buildTree(examples, attributes, {}, 1)
        accuracy = process_tree(single_tree, examples)
        size = single_tree.subtree_size
        max_depth = tree_builder.max_depth
        first_feature = single_tree.attribute.name
        print('Accuracy: ' + str(accuracy))
        print('Sizes: ' + str(size))
        print('Maximum Depth: ' + str(max_depth))
        print('First Feature: ' + str(first_feature))
Example #27
def collect_metadata(
        abspathitem,
        db,
        recentartists,
        recentalbums,
        recentgenres,
        queues,
        condition):
    """ id3 tags retriever """

    id3item = None
    id3v1item = {}
    id3v1 = False
    id3v2 = False
    for decoder in DECODERS:
        try:
            id3item = decoder(abspathitem)
            break
        except Exception as e:
            logging.error(e)
    try:
        id3v1item = defaultdict(lambda: 'unknown', ID3(abspathitem).as_dict())
    except InvalidTagError as e:
        logging.error(e)
    except Exception as e:
        logging.error(e)

    title = id3v1item['TITLE'].strip().lower()
    titleclean = re.sub("[^\w]*", "", title)
    artist = id3v1item['ARTIST'].strip().lower()
    album = id3v1item['ALBUM'].strip().lower()
    albumclean = re.sub("[^\w]*", "", album)
    genre = id3v1item['GENRE'].strip().lower()
    genreclean = re.sub("[^\w]+", "", genre).strip().lower()
    if not id3item:
        logging.warning("No ID3 informations found")
        return
    length = 0.0

    try:
        title = " ".join(id3item['TIT2'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        titleclean = re.sub("[^\w]*", "", title)
    except Exception as e:
        logging.error(e)
    try:
        artist = " ".join(id3item['TPE1'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        album = " ".join(id3item['TALB'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        albumclean = re.sub("[^\w]*", "", album)
    except Exception as e:
        logging.error(e)
    try:
        genre = " ".join(id3item['TCON'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        genreclean = re.sub("[^\w]+", "", genre).strip().lower()
    except Exception as e:
        logging.error(e)
    try:
        length = float(id3item['TLEN'])
    except Exception as e:
        logging.error(e)
    if not id3v2:
        logging.warning("No ID3v2 informations found")
        return

    with condition:
        try:
            ar = artist
            if not artist in recentartists.keys():
                if not db.execute(
                    "select id from artist where name = ?",
                        (ar,)).fetchone():
                    db.execute(
                        "insert into artist(name) values(?)", (ar,))
                    db.commit()
                recentartists[artist] = db.execute(
                    "select id from artist where name = ?",
                    (ar,)).fetchone()[0]
        except Exception as e:
            logging.error(e)

    with condition:
        try:
            al = albumclean
            if not album in recentalbums.keys():
                if not db.execute(
                    "select id from album where titleclean = ?",
                        (al,)).fetchone():
                    db.execute(
                        "insert into album(title, titleclean) "
                        "values(?, ?)", (album, albumclean))
                    db.commit()
                recentalbums[album] = db.execute(
                    "select id from album where titleclean = ?",
                    (al,)).fetchone()[0]
        except Exception as e:
            logging.error(e)

    with condition:
        try:
            ge = genre
            if not genre in recentgenres.keys():
                if not db.execute(
                    "select id from genre where desc = ?",
                        (ge,)).fetchone():
                    db.execute(
                        "insert or ignore into genre(desc, descclean) "
                        "values(?, ?)", (genre, genreclean))
                    db.commit()
                recentgenres[genre] = db.execute(
                    "select id from genre where desc = ?",
                    (ge,)).fetchone()[0]
        except Exception as e:
            logging.error(e)

    with condition:
        try:
            db.execute(
                "insert or replace into song("
                "title, titleclean, artist_id, "
                "genre_id, album_id, path, length) "
                "values (?,?,?,?,?,?,?)",
                (
                    title,
                    titleclean,
                    recentartists[artist],
                    recentgenres[genre],
                    recentalbums[album],
                    abspathitem.decode(FS_ENCODING), length))

            logging.debug("collect_metadata putting new artist on queue")
            for q in queues:
                if not q.full():
                    q.put_nowait((abspathitem, title, artist, album))
                else:
                    q.put((abspathitem, title, artist, album), block=True)
            db.commit()
        except Exception as e:
            logging.error(e)
Example #28
    def __init__(self, data_arr, m_param, information_gain_func, majority_class_for_cost, epsilon, delta):
        ID3.__init__(self, data_arr, m_param, information_gain_func, majority_class_for_cost, epsilon, delta)
        self.classifiers = None
        self.m_param = m_param
        self.epsilon = epsilon