def main(path, predict_column):
    df = pd.read_csv(path)
    columns = df.columns.values
    X = [
        df[column].values for column in columns
        if column != predict_column and column != 'Name'
    ]
    y = df[predict_column].values

    swtch = Switcher()
    sk_X = []
    for column in X:
        converted = np.array(swtch.convert_to(column))
        sk_X.append(converted)
    sk_X = np.array(sk_X).T
    sk_y = np.array(swtch.convert_to(y)).T

    decisions = ID3()
    entropy = decisions.fit(X, y)
    print(f'Entropies:\t\t\t{entropy}')

    sk_decisions = DecisionTreeClassifier(random_state=1370)
    sk_decisions.fit(sk_X, sk_y)
    print(f'SkLearn Tree Decision accuracy:\t{sk_decisions.score(sk_X, sk_y)}')
    # print(sk_decisions.predict(['pycharm', 'Java', 'tea']))
    return 0
def prune(tree, validation_group, evaluate_func):
    if tree.children is None:  # it is a leaf
        return tree
    # features are continuous; answers are binary
    sub_validation_0 = validation_group[
        validation_group[tree.feature[0]] < tree.feature[1]]
    tree.children[0] = (0, CostSensitiveID3.prune(tree.children[0][1],
                                                  sub_validation_0,
                                                  evaluate_func))
    sub_validation_1 = validation_group[
        validation_group[tree.feature[0]] >= tree.feature[1]]
    tree.children[1] = (1, CostSensitiveID3.prune(tree.children[1][1],
                                                  sub_validation_1,
                                                  evaluate_func))
    err_prune = 0
    err_no_prune = 0
    for index, row in validation_group.iterrows():
        real_classification = row['diagnosis']
        err_prune += evaluate_func(real_classification, tree.class_of_node)
        err_no_prune += evaluate_func(real_classification,
                                      ID3class.classify(row, tree))
    if err_prune < err_no_prune:  # it is better to prune
        tree.feature = None  # was `tree.f`, an attribute that is never defined
        tree.children = None
    return tree
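# `prune` above expects `evaluate_func(real, predicted)` to return the cost of a
# prediction. A minimal sketch of such a function, assuming the usual
# cost-sensitive setup where a false negative ('M' read as 'B') is penalized
# more heavily than a false positive; the 10:1 ratio is an illustrative
# assumption, not taken from the source.
def example_evaluate(real, predicted, fn_cost=10, fp_cost=1):
    """Illustrative cost function for prune(); the cost ratio is an assumption."""
    if real == predicted:
        return 0
    # a malignant sample classified as benign is assumed to be the costly error
    return fn_cost if real == 'M' else fp_cost

# usage sketch: pruned = CostSensitiveID3.prune(tree, validation_df, example_evaluate)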
def gerar_objetos():
    texto = open("data.txt").read()
    linhas = texto.replace(' ', '').splitlines()
    objetos = ID3()
    for linha in linhas:
        objetos.dias.append(Dia(linha.split(',')))
    return objetos
def classify_by_k(self, sample, k_param=6):
    # we need only the features, not the label
    sample_to_centroid = self.improved_forest.minmax(sample.drop(['diagnosis']))
    self.improved_forest.K = k_param
    distance_np = np.array(
        [self.improved_forest.calc_distance(
            self.improved_forest.centroid_lst[it].to_numpy(),
            sample_to_centroid.to_numpy())
         for it in range(self.improved_forest.N)])

    def get_indices_of_k_smallest(arr, k):
        # finds the indices of the k smallest values (distances)
        if k == arr.size:  # all the trees can vote
            return range(k)
        # idx = np.argpartition(arr, k)
        idx = sorted(range(len(arr)), key=lambda sub: arr[sub])  # was sliced with an undefined `K`
        # return np.array(np.unravel_index(idx, arr.shape))[:, range(min(k, 0), max(k, 0))].tolist()
        return idx[:k]

    conference_indices_lst = get_indices_of_k_smallest(distance_np,
                                                       self.improved_forest.K)
    classify_func = lambda i: ID3class.classify(
        sample, self.improved_forest.id3_lst[conference_indices_lst[i]].tree)
    dec_by = int(self.improved_forest.K / 3)  # computed but never used
    # closer trees get heavier votes: weights 3, 3, 2, 2, 1, 1 by rank (assumes k <= 6)
    votes_lst = [[classify_func(i)] * (3 - int(i / 2))
                 for i in range(self.improved_forest.K)]
    votes_lst_flatt = [item for sublist in votes_lst for item in sublist]
    # compare against the number of weighted votes, not the number of trees
    if votes_lst_flatt.count('B') > len(votes_lst_flatt) / 2:  # B won the vote
        return 'B'
    return 'M'
def findBestFeaturesByIG(param, E_train, F_initial):
    # greedily select the `param` features with the highest information gain
    F_best_list = []
    for it in range(param):
        best_f = ID3class.MaxIG(F_initial, E_train)
        F_best_list.append(best_f[0])
        F_initial.remove(best_f[0])
    return F_best_list
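# MaxIG itself is not shown in these snippets. As a self-contained reference for
# what it presumably ranks features by, here is a minimal sketch of entropy-based
# information gain; the function names and the discrete-feature assumption are
# mine, not from the source.
import math
from collections import Counter

def entropy(labels):
    """Shannon entropy of a sequence of class labels."""
    total = len(labels)
    return -sum((n / total) * math.log2(n / total)
                for n in Counter(labels).values())

def information_gain(examples, labels, feature_index):
    """Gain of splitting `examples` on the discrete feature at `feature_index`."""
    by_value = {}
    for example, label in zip(examples, labels):
        by_value.setdefault(example[feature_index], []).append(label)
    remainder = sum(len(subset) / len(labels) * entropy(subset)
                    for subset in by_value.values())
    return entropy(labels) - remainder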
def fit(self, arr_instans, arr_target):
    # convert array to dataframe
    df = pandas.DataFrame.from_records(arr_instans)
    arr_target = pandas.DataFrame(arr_target)
    arr_instans = df.assign(target=arr_target.values)
    # pre-process continuous-valued attributes
    ## arr_instans = self.continuous_value(arr_instans)
    # pre-process attributes with missing values
    arr_instans = self.missing_value_handler(arr_instans)
    # pre-process attributes with many values
    # convert dataframe back to list of lists
    arr_target = [target[0] for target in arr_target.values]
    arr_instans = arr_instans.drop(columns="target", axis=1)
    arr_instans = arr_instans.values.tolist()
    # ic(arr_instans)
    # ic(arr_target)
    # create the ID3 object
    id3_ = ID3(arr_instans, arr_target)
    # fit the ID3 tree
    id3_.fit(arr_instans, arr_target)
    # adopt the fitted root (original note: this->root = ID3.root)
    self.root = id3_.root
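# missing_value_handler is referenced above but not defined in this excerpt.
# A minimal pandas sketch under the assumption that missing entries are marked
# with "?" and should be imputed with the column mode; both the marker and the
# strategy are assumptions.
import pandas

def missing_value_handler_sketch(df, missing_marker="?"):
    """Illustrative mode imputation for every non-target column."""
    df = df.replace(missing_marker, pandas.NA)
    for column in df.columns:
        if column == "target":
            continue
        df[column] = df[column].fillna(df[column].mode().iloc[0])
    return df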
def build_decision_tree(training_data_acids, training_data_labels):
    """
    Based on the training data and information gain, build a decision tree
    that can accurately predict whether an amino acid will be exposed (e)
    or buried (-).
    :param training_data_acids: The amino acids in the training data set
    :param training_data_labels: The labels in the training data set
    :return: A binary tree that represents a decision tree based on the given features
    """
    print("Building tree ...")
    # Flatten the data for training
    flat_training_data_acids = []
    flat_training_data_labels = []
    for x, y in zip(training_data_acids, training_data_labels):
        flat_training_data_acids.extend(x)
        flat_training_data_labels.extend(y)
    # Build decision tree with training data
    attributes = [i for i in range(len(training_data_acids[0][0]))]
    decision_tree = ID3(attributes, flat_training_data_acids,
                        flat_training_data_labels)
    return decision_tree
def run(): print("Executing ID3 algorithm") examples = ID3Reader.Read() DTree = ID3(examples,"ghost","ghost",0) j = 0 accuracy = 0 print("#######################################") newExamples = open("./ID3/readyData.txt",'r') for line in newExamples: j+=1 ex = line.split(',') for i in range(len(ex)): ex[i] = ex[i].rstrip() tempEx = [] for i in range(0,len(ex) -1): tempEx.append(ex[i]) answer = DTree.answer(tempEx) if answer == ex[4]: accuracy += 1 print("Test subject "+str(j)) print("Algorithm category of choice is : "+answer) print("Actual category is : "+ex[-1] ) print("#######################################") return str("ID3 algorithm accuracy with "+ str(len(examples))+" examples of Iris flowers is: "+str(accuracy)+"/30")
def gerar_objetos():
    global cabecalho
    linhas = open("data.txt").read().replace(' ', '').splitlines()
    objetos = ID3()
    cabecalho = linhas.pop(0).split(',')  # first line is the header row
    for linha in linhas:
        objetos.dias.append(Dia(linha.split(',')))
    return objetos
def train(self, data, p_param):
    # kf = KFold(n_splits=3, shuffle=True, random_state=318981586)
    decisions_trees = []
    for i in range(self.n_param):
        size = p_param * self.n_param
        random_examples = sample(data, k=int(size))
        classifier = ID3(random_examples)
        classifier.train()
        centroid = calc_centroid(random_examples)
        decisions_trees.append((centroid, classifier))
    self.decision_trees = decisions_trees
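# calc_centroid is used by the train() methods here but never defined in these
# excerpts. A minimal sketch, assuming each example is a sequence whose first
# entry is the class label and the rest are numeric features (the layout is an
# assumption).
import numpy as np

def calc_centroid_sketch(examples):
    """Illustrative centroid: the mean feature vector of a group of examples."""
    features = np.array([example[1:] for example in examples], dtype=float)
    return features.mean(axis=0)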
def main(path, predict_column):
    df = pd.read_csv(path)
    columns = df.columns.values
    X = [
        df[column].values for column in columns
        if column != predict_column and column != 'Name'
    ]
    y = df[predict_column].values

    decisions = ID3()
    fitted = decisions.fit(X, y)
    print(f'Entropy of {predict_column}:\t\t\t{fitted[1]}\n')
    print('Information Gains of Xs\n')
    headers = ' '.join(
        map(str,
            list(filter(lambda x: x != predict_column and x != "Name",
                        df.columns.values)))).replace(' ', '\t | ')
    print(f'{headers}')
    values = ' '.join(map(lambda x: "{0:.8f}".format(x),
                          fitted[0])).replace(' ', '\t | ')
    print(values)
    print('\n\n')

    swtch = Switcher()
    sk_X = []
    for column in X:
        converted = np.array(swtch.convert_to(column))
        sk_X.append(converted)
    sk_X = np.array(sk_X).T
    sk_y = np.array(swtch.convert_to(y)).T
    x_train, x_test, y_train, y_test = train_test_split(sk_X, sk_y,
                                                        test_size=0.3)
    sk_decisions = DecisionTreeClassifier(random_state=1370)
    sk_decisions.fit(x_train, y_train)
    print(f'SkLearn Tree Decision accuracy:\t{sk_decisions.score(x_train, y_train)}')
    print(f'SkLearn Tree Decision prediction: {sk_decisions.predict(x_test)}')
    return 0
def classify_by_k(self, sample, k_param=1):
    # we need only the features, not the label
    sample_to_centroid = self.minmax(sample.drop(['diagnosis']))
    self.K = k_param
    distance_np = np.array([self.calc_distance(self.centroid_lst[it].to_numpy(),
                                               sample_to_centroid.to_numpy())
                            for it in range(self.N)])

    def get_indices_of_k_smallest(arr, k):
        # finds the indices of the k smallest values (distances)
        if k == arr.size:  # all the trees can vote
            return range(k)
        idx = np.argpartition(arr, k)
        # return np.array(np.unravel_index(idx, arr.shape))[:, range(min(k, 0), max(k, 0))].tolist()
        return idx[:k]

    conference_indices_lst = get_indices_of_k_smallest(distance_np, self.K)
    classify_func = lambda i: ID3class.classify(
        sample, self.id3_lst[conference_indices_lst[i]].tree)
    votes_lst = [classify_func(i) for i in range(self.K)]
    if votes_lst.count('B') > len(votes_lst) / 2:  # B won the vote
        return 'B'
    return 'M'
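# minmax and calc_distance are methods of the forest object and are not shown.
# Free-function sketches of what they plausibly compute (min-max scaling to
# [0, 1] and Euclidean distance); both interpretations are assumptions.
import numpy as np

def minmax_sketch(features, feature_min, feature_max):
    """Illustrative min-max normalization; the per-feature minima and maxima
    are assumed to have been recorded on the training data."""
    return (features - feature_min) / (feature_max - feature_min)

def calc_distance_sketch(a, b):
    """Illustrative calc_distance: Euclidean distance between two vectors."""
    return np.linalg.norm(a - b)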
def train(self, data, p_param):
    # kf = KFold(n_splits=3, shuffle=True, random_state=318981586)
    decisions_trees = []
    for i in range(self.n_param):
        size = p_param * self.n_param
        random_examples = choices(data, k=int(size))
        test_group = [ex for ex in data if ex not in random_examples]
        classifier = ID3(random_examples, self.m_param, information_gain,
                         majority_class_for_knn)
        classifier.train()
        score = classifier.test(test_group, False)
        relevant = classifier.root.find_features(classifier.num_of_features)
        centroid = calc_centroid(random_examples)
        height = classifier.root.calc_height()
        decisions_trees.append((centroid, height, classifier, score, relevant))
    self.decision_trees = decisions_trees
def simpleKnn(relPath, columns, resultColumn):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, 5, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)
    for i in range(len(trainingSets)):
        tset = []
        aset = []
        for index, row in dataSet.iterrows():
            tupla = (dataSet.iloc[index][resultColumn], index)
            if tupla in trainingSets[i]:
                tset.append(row.tolist())
            if tupla in avaliationSets[i]:
                aset.append(row.tolist())
        # don't reuse the loop variable `i` for the tree object
        id3_tree = ID3(tset, resultColumn)
        id3_tree.printTree()
def train(self, data, p_param):
    decisions_trees = []
    for i in range(self.n_param):
        size = p_param * self.n_param
        random_examples = sample(data, k=int(size))
        minmax_vector = create_minmax_vector(random_examples)
        normalized_data = normalized_set(random_examples, minmax_vector)
        # choose the best m_param
        classifier = ID3(normalized_data, 10)
        classifier.train()
        test_group = [ex for ex in data if ex not in random_examples]
        # normalize all the data
        normalized_test = normalized_set(test_group, minmax_vector)
        # keep track of which features are relevant
        relevant = classifier.root.find_features(classifier.num_of_features)
        score = classifier.test(normalized_test, False)
        centroid = calc_centroid_for_impro(random_examples, relevant)
        decisions_trees.append(
            (1 - score, centroid, classifier, minmax_vector, relevant))
    # prefer more accurate trees
    decisions_trees.sort(key=lambda x: x[0])
    self.decision_trees = decisions_trees
from ID3 import ID3
from data import Data
import numpy as np

DATA_DIR = 'data_new/'

if __name__ == "__main__":
    print("Training on train.csv and testing on test.csv with no depth restriction...")
    ignore = list()
    data = np.loadtxt(DATA_DIR + 'train.csv', delimiter=',', dtype=str)
    data_obj = Data(data=data)
    ID3_obj = ID3(data_obj, data_obj.attributes, data_obj.get_column('label'))
    tree = ID3_obj.build_tree()
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore)
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore, 7, data_obj, 0)
    # tree = ID3.build_tree(data_obj, data_obj.attributes, data_obj.get_column('label'), ignore, 5, data_obj, 0)
    # labels = ID3.predict(tree, data_obj)
    data = np.loadtxt(DATA_DIR + 'test.csv', delimiter=',', dtype=str)
    data_obj = Data(data=data)
    labels = ID3_obj.predict(data_obj)
    labels_true = data_obj.get_column('label')
    print("Depth =", ID3_obj.max_depth(tree))
    error = 0
    total = 0
    for l1, l2 in zip(labels, labels_true):
        total += 1
        if l1 != l2:
            error += 1
    error = float(error) / float(total) * 100
    print("Error =", error, "%")
numPlaylists, numTracks = 0, 0
startTime = time()
for artist in listdir(mp3):
    thisArtist = "%s/%s" % (mp3, artist)
    if not S_ISDIR(stat(thisArtist)[ST_MODE]):
        continue
    for album in listdir(thisArtist):
        numPlaylists += 1  # was `numTracks += 1`, which double-counted tracks
        thisAlbum = "%s/%s" % (thisArtist, album)
        if not S_ISDIR(stat(thisAlbum)[ST_MODE]):
            continue
        thisPlaylist = "%s/%s-%s.m3u" % (mp3, artist, album)
        playlist = {}
        for track in listdir(thisAlbum):
            numTracks += 1
            thisTrack = "%s/%s" % (thisAlbum, track)
            id3 = ID3(thisTrack)
            playlist[id3.track] = thisTrack
        try:
            del id3
        except NameError:
            pass
        m3u = open(thisPlaylist, "w")
        for i in range(1, len(playlist) + 1):
            try:
                m3u.write("%s\n" % playlist[i])
            except KeyError:
                pass
        m3u.close()
endTime = time()
runTime = endTime - startTime
and why does this make sense?

In about two sentences, how does the advantage of pruning change as the
data set size increases? Does this make sense, and why or why not?
"""
x = []
trend_basic = []
trend_prune = []
for size in range(10, 300, 10):
    x.append(size)
    accuracy_b = []
    accuracy_p = []
    for i in range(100):
        random.shuffle(data)
        train_set = data[:size]
        test_set = data[size:]
        tree = ID3(train_set)
        accuracy_b.append(test(tree, test_set))
        tree_p = prune(tree, test_set)
        accuracy_p.append(test(tree_p, test_set))
    trend_basic.append(np.mean(accuracy_b))
    trend_prune.append(np.mean(accuracy_p))
fig, ax = plt.subplots()
ax.plot(x, trend_basic, color='blue', label='$Default$')
ax.plot(x, trend_prune, color='orange', label='$Pruned$')
ax.legend(loc='lower right')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Test Set Accuracy')
ax.set_title('Pruned vs Standard Tree Accuracy')
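# test() is referenced by the experiment above but not defined in this excerpt.
# A minimal accuracy helper, assuming each example carries its label under a
# 'Class' key and that some classify(tree, example) helper exists; both names
# are assumptions about the assignment's API.
def test_sketch(tree, examples, classify, label_key='Class'):
    """Illustrative accuracy of `tree` on `examples`."""
    if not examples:
        return 0.0
    correct = sum(1 for example in examples
                  if classify(tree, example) == example[label_key])
    return correct / len(examples)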
from ID3 import ID3

if __name__ == '__main__':
    x, y = [], []
    label = ["age", "prescript", "astigmatic", "tear_rate"]
    with open("lenses.txt", "r") as FILE:
        for line in FILE.readlines():
            x.append(line.strip().split("\t"))
            y.append(x[-1].pop())  # last column is the class label
    id3 = ID3(x, y, label)
    id3.generate_tree()
    print(id3)
    # print(id3.inference())
    if verbose:
        pprint(X)
        print('loss if prune:', pruned_loss)
        print('current loss', cur_loss)
    if pruned_loss < cur_loss:
        root.children.clear()
        return pruned_loss
    # if not pruned, the loss of node root is the sum of the losses of all of its children
    return cur_loss


if __name__ == "__main__":
    console = Console(markup=False)
    # -------------------------- Example 1 (Small Normalization Param) ------------
    print("Example 1:")
    id3 = ID3(verbose=False)
    # feature values are Chinese: 青年 = young, 老年 = senior; 是 = yes, 否 = no;
    # 一般 = fair, 好 = good, 非常好 = excellent
    X = [
        ['青年', '否', '否', '一般'],
        ['青年', '否', '否', '好'],
        ['青年', '是', '否', '好'],
        ['青年', '是', '是', '一般'],
        ['青年', '否', '否', '一般'],
        ['老年', '否', '否', '一般'],
        ['老年', '否', '否', '好'],
        ['老年', '是', '是', '好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '好'],
        ['老年', '是', '否', '好'],
        ['老年', '是', '否', '非常好'],
import re
import sys
from channels import *
from config import *
from compat2and3 import urlquote

# ID3 libraries
try:
    from mutagen import File as get_meta
except ImportError:
    try:
        from ID3 import ID3
        log.INFO("Just basic ID3 support")
        get_meta = lambda fn: dict([(k.lower(), v)
                                    for k, v in ID3(fn).items()])
    except ImportError:
        log.INIT("You are out of luck in regards to mp3 browsing. "
                 "No ID3 support.")
        get_meta = lambda *x: {}


# Convert seconds to a time string "[[[DD:]HH:]MM:]SS".
def ddhhmmss(seconds):
    dhms = ''
    for scale in 86400, 3600, 60:
        result, seconds = divmod(seconds, scale)
        if dhms != '' or result > 0:
            dhms += '{0:02d}:'.format(result)
    dhms += '{0:02d}'.format(seconds)
    if len(dhms) == 2:
# code to get the results in my report
import numpy as np
import pandas as pd
from ID3 import ID3

# Import & some CONST
DATA_DIR = "C:/Users/Yu Zhu/OneDrive/Academy/the U/Assignment/AssignmentSln/ML-01-DT/experiment-data_new/data_new/"

##### Experiments

# 1.(b)
id3 = ID3()
id3.train_id3(fpath=DATA_DIR + 'train.csv')
id3.test_id3(fpath=DATA_DIR + 'train.csv')
# the answer is 1

# 1.(c)
id3.test_id3(fpath=DATA_DIR + 'test.csv')
# the answer is 1

# 1.(d)
rules = id3.rules
max([len(r['attr']) for r in rules])
# the answer is 6

#### CROSS-VALIDATION

# 2.(a)
# import validation set
cv1 = pd.read_csv(DATA_DIR + 'CVfolds_new/fold1.csv')
cv2 = pd.read_csv(DATA_DIR + 'CVfolds_new/fold2.csv')
                             tennis_wind[i]])

# splitting tennis datasets
tennis_train_instances, tennis_test_instances, tennis_train_targets, tennis_test_targets = \
    train_test_split(tennis_instances, tennis_targets, test_size=0.2, random_state=42)

# pre-process iris datasets
iris_instances = iris_datasets.data
iris_targets = iris_datasets.target
iris_train_instances, iris_test_instances, iris_train_targets, iris_test_targets = \
    train_test_split(iris_instances, iris_targets, test_size=0.9, random_state=42)

# test code
test_instances = [[1, 1], [0, 0]]
test_targets = []
id3 = ID3(tennis_train_instances, tennis_train_targets)  # was `id`, shadowing the built-in
id3.fit(id3.instances, id3.targets)
print("Predicted : ", id3.predict(tennis_train_instances))
print("Actual : ", tennis_train_targets)

# Node testing
"""
instances = [[0, 1, 2], [2, 1, 0]]
targets = [1, 0]
rules = ["== 0"]
root = Node(0, instances, targets)
nodes = [Node(1, [instances[0]], [targets[0]])]
root.set_rule_children(rules, nodes)
child = root.next_node(instances[0])
child.set_rule_children(["== 1"], [Node([instances[1]], [targets[1]])])
    return dataSet


def transformMushroomTargetAttribute(dataSet, targetAttribute):
    uniqueValues = dataSet[targetAttribute].unique()
    dataSet[targetAttribute].replace(uniqueValues,
                                     range(len(uniqueValues)),
                                     inplace=True)
    return dataSet


if __name__ == '__main__':
    print("Iris Dataset")
    irisFileLocation = 'iris.csv'
    irisDataSet = importData(irisFileLocation)
    irisID3 = ID3(irisDataSet, 10, [0.05, 0.10, 0.15, 0.20], True)
    irisID3.validate()

    print("Spambase Dataset")
    spambaseFileLocation = 'spambase.csv'
    spambaseDataSet = importData(spambaseFileLocation)
    spambaseID3 = ID3(spambaseDataSet, 10, [0.05, 0.10, 0.15, 0.20, 0.25], True)
    spambaseID3.validate()

    print("Mushroom Dataset - Multiway Split")
    mushroomFileLocation = 'mushroom.csv'
    mushroomDataSet = importData(mushroomFileLocation)
    columnsLength = len(mushroomDataSet.columns)
    mushroomDataSet = transformMushroomTargetAttribute(mushroomDataSet,
                                                       columnsLength - 1)
    mushroomMultiwayID3 = ID3(mushroomDataSet, 10, [0.05, 0.10, 0.15], False)
    mushroomMultiwayID3.validate()
def main(): print 'PARTS A & B' orig_spam = parse_c45('spam') orig_volcanoes = parse_c45('volcanoes') orig_voting = parse_c45('voting') print 'making numpy' spam_examples = numpy.array(orig_spam.to_float()) volcanoes_examples = numpy.array(orig_volcanoes.to_float()) voting_examples = numpy.array(orig_voting.to_float()) spam_attributes = dtree.getAttributes(orig_spam) volcanoes_attributes = dtree.getAttributes(orig_volcanoes) voting_attributes = dtree.getAttributes(orig_voting) spam_partitions = dtree.partitionExamples(spam_examples) volcanoes_partitions = dtree.partitionExamples(volcanoes_examples) voting_partitions = dtree.partitionExamples(voting_examples) part_a_builder = ID3(1, 0) print 'Spam CV Accuracies for Depth = 1' spam_trees = dtree.runOnFolds(part_a_builder, spam_partitions, spam_attributes) print '' print 'Volcanoes CV Accuracies for Depth = 1' volcanoes_trees = dtree.runOnFolds(part_a_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Voting CV Accuracies for Depth = 1' voting_trees = dtree.runOnFolds(part_a_builder, voting_partitions, voting_attributes) print '' print 'PART C' depth_1_builder = ID3(1, 0) depth_3_builder = ID3(3, 0) depth_5_builder = ID3(5, 0) depth_7_builder = ID3(7, 0) depth_9_builder = ID3(9, 0) print 'Spam Depth 1:' dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes) print 'Spam Depth 3:' dtree.runOnFolds(depth_3_builder, spam_partitions, spam_attributes) print 'Spam Depth 5:' dtree.runOnFolds(depth_5_builder, spam_partitions, spam_attributes) print 'Spam Depth 7:' dtree.runOnFolds(depth_7_builder, spam_partitions, spam_attributes) print 'Spam Depth 9:' dtree.runOnFolds(depth_9_builder, spam_partitions, spam_attributes) print 'Volcanoes Depth 1:' dtree.runOnFolds(depth_1_builder, volcanoes_partitions, volcanoes_attributes) print 'Volcanoes Depth 3:' dtree.runOnFolds(depth_3_builder, volcanoes_partitions, volcanoes_attributes) print 'Volcanoes Depth 5:' dtree.runOnFolds(depth_5_builder, volcanoes_partitions, volcanoes_attributes) print 'Volcanoes Depth 7:' dtree.runOnFolds(depth_7_builder, volcanoes_partitions, volcanoes_attributes) print 'Volcanoes Depth 9:' dtree.runOnFolds(depth_9_builder, volcanoes_partitions, volcanoes_attributes) print 'Part D' depth_1_GR_builder = ID3(1, 1) depth_3_GR_builder = ID3(3, 1) depth_5_GR_builder = ID3(5, 1) print 'Spam Depth 1 IG' dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes) print '' print 'Spam Depth 3 IG' dtree.runOnFolds(depth_3_builder, spam_partitions, spam_attributes) print '' print 'Spam Depth 5 IG' dtree.runOnFolds(depth_5_builder, spam_partitions, spam_attributes) print 'Spam Depth 1 GR' dtree.runOnFolds(depth_1_GR_builder, spam_partitions, spam_attributes) print '' print 'Spam Depth 3 GR' dtree.runOnFolds(depth_3_GR_builder, spam_partitions, spam_attributes) print '' print 'Spam Depth 5 GR' dtree.runOnFolds(depth_5_GR_builder, spam_partitions, spam_attributes) print '' print 'Voting Depth 1 IG' dtree.runOnFolds(depth_1_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 3 IG' dtree.runOnFolds(depth_3_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 5 IG' dtree.runOnFolds(depth_5_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 1 GR' dtree.runOnFolds(depth_1_GR_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 3 GR' dtree.runOnFolds(depth_3_GR_builder, voting_partitions, voting_attributes) print '' print 'Voting Depth 5 GR' 
dtree.runOnFolds(depth_5_GR_builder, voting_partitions, voting_attributes) print '' print 'Volcanoes Depth 1 IG' dtree.runOnFolds(depth_1_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 3 IG' dtree.runOnFolds(depth_3_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 5 IG' dtree.runOnFolds(depth_5_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 1 GR' dtree.runOnFolds(depth_1_GR_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 3 GR' dtree.runOnFolds(depth_3_GR_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Depth 5 GR' dtree.runOnFolds(depth_5_GR_builder, volcanoes_partitions, volcanoes_attributes) print 'PART E' depth_2_builder = ID3(2, 0) print 'Spam CV Depth 1' dtree.runOnFolds(depth_1_builder, spam_partitions, spam_attributes) print '' print 'Spam CV Depth 2' dtree.runOnFolds(depth_2_builder, spam_partitions, spam_attributes) print '' print 'Spam Full Depth 1' single_tree = depth_1_builder.buildTree(spam_examples, spam_attributes, {}, 1) print dtree.process_tree(single_tree, spam_examples) print '' print 'Spam Full Depth 2' single_tree = depth_2_builder.buildTree(spam_examples, spam_attributes, {}, 1) print dtree.process_tree(single_tree, spam_examples) print '' print 'Voting CV Depth 1' dtree.runOnFolds(depth_1_builder, voting_partitions, voting_attributes) print '' print 'Voting CV Depth 2' dtree.runOnFolds(depth_2_builder, voting_partitions, voting_attributes) print '' print 'Voting Full Depth 1' single_tree = depth_1_builder.buildTree(voting_examples, voting_attributes, {}, 1) print dtree.process_tree(single_tree, voting_examples) print '' print 'Voting Full Depth 2' single_tree = depth_2_builder.buildTree(voting_examples, voting_attributes, {}, 1) print dtree.process_tree(single_tree, voting_examples) print '' print 'Volcanoes CV Depth 1' dtree.runOnFolds(depth_1_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes CV Depth 2' dtree.runOnFolds(depth_2_builder, volcanoes_partitions, volcanoes_attributes) print '' print 'Volcanoes Full Depth 1' single_tree = depth_1_builder.buildTree(volcanoes_examples, volcanoes_attributes, {}, 1) print dtree.process_tree(single_tree, volcanoes_examples) print '' print 'Volcanoes Full Depth 2' single_tree = depth_2_builder.buildTree(volcanoes_examples, volcanoes_attributes, {}, 1) print dtree.process_tree(single_tree, volcanoes_examples)
def main():
    while True:
        data_path = input("Please enter path to the data: ")
        try:
            orig_array = parse_c45(data_path)
            break
        except Exception as error:
            print('Not a valid path!')
    print('Converting Example array to numpy array...')
    examples = numpy.array(orig_array.to_float())
    print('Conversion done. \n Extracting attributes...')
    attributes = getAttributes(orig_array)
    print('Attributes successfully parsed')
    while True:
        # numeric menu choices; int(input()) replaces the old Python 2 input()
        data_option = int(input(
            "Enter '0' for cross validation, or '1' to run on full sample:"))
        if data_option == 0:
            cross_validate = True
            break
        elif data_option == 1:
            cross_validate = False
            break
        else:
            print('Not a valid input!')
    while True:
        depth_option = int(input(
            "Please enter a nonnegative integer to set the maximum depth of "
            "the tree, or enter 0 to grow the full tree:"))
        if depth_option < 0:
            print('Not a valid input!')
        elif depth_option == 0:
            depth = float("inf")
            break
        else:
            depth = depth_option
            break
    while True:
        split_option = int(input(
            "Enter '0' for information gain, or '1' for gain ratio:"))
        if split_option != 0 and split_option != 1:
            print('Not a valid input!')
        else:
            break
    tree_builder = ID3(depth, split_option)
    if cross_validate:
        partitions = partitionExamples(examples)
        trees = runOnFolds(tree_builder, partitions, attributes)
    else:
        # build tree on the entire data set
        single_tree = tree_builder.buildTree(examples, attributes, {}, 1)
        accuracy = process_tree(single_tree, examples)
        size = single_tree.subtree_size  # was `tree`, which is undefined here
        max_depth = tree_builder.max_depth
        first_feature = single_tree.attribute.name  # was `tree`
        print('Accuracy: ' + str(accuracy))
        print('Sizes: ' + str(size))
        print('Maximum Depth: ' + str(max_depth))
        print('First Feature: ' + str(first_feature))
def collect_metadata(abspathitem, db, recentartists, recentalbums,
                     recentgenres, queues, condition):
    """ id3 tags retriever """
    id3item = None
    id3v1item = {}
    id3v1 = False
    id3v2 = False
    for decoder in DECODERS:
        try:
            id3item = decoder(abspathitem)
            break
        except Exception as e:
            logging.error(e)
    try:
        id3v1item = defaultdict(lambda: 'unknown',
                                ID3(abspathitem).as_dict())
    except InvalidTagError as e:
        logging.error(e)
    except Exception as e:
        logging.error(e)

    title = id3v1item['TITLE'].strip().lower()
    titleclean = re.sub(r"[^\w]*", "", title)
    artist = id3v1item['ARTIST'].strip().lower()
    album = id3v1item['ALBUM'].strip().lower()
    albumclean = re.sub(r"[^\w]*", "", album)
    genre = id3v1item['GENRE'].strip().lower()
    genreclean = re.sub(r"[^\w]+", "", genre).strip().lower()

    if not id3item:
        logging.warning("No ID3 information found")
        return

    length = 0.0
    try:
        title = " ".join(id3item['TIT2'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        titleclean = re.sub(r"[^\w]*", "", title)
    except Exception as e:
        logging.error(e)
    try:
        artist = " ".join(id3item['TPE1'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        album = " ".join(id3item['TALB'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        albumclean = re.sub(r"[^\w]*", "", album)
    except Exception as e:
        logging.error(e)
    try:
        genre = " ".join(id3item['TCON'].text).strip().lower()
        id3v2 = True
    except Exception as e:
        logging.error(e)
    try:
        genreclean = re.sub(r"[^\w]+", "", genre).strip().lower()
    except Exception as e:
        logging.error(e)
    try:
        length = float(id3item['TLEN'])
    except Exception as e:
        logging.error(e)

    if not id3v2:
        logging.warning("No ID3v2 information found")
        return

    with condition:
        try:
            ar = artist
            if artist not in recentartists.keys():
                if not db.execute("select id from artist where name = ?",
                                  (ar,)).fetchone():
                    db.execute("insert into artist(name) values(?)", (ar,))
                    db.commit()
                recentartists[artist] = db.execute(
                    "select id from artist where name = ?",
                    (ar,)).fetchone()[0]
        except Exception as e:
            logging.error(e)
    with condition:
        try:
            al = albumclean
            if album not in recentalbums.keys():
                if not db.execute(
                        "select id from album where titleclean = ?",
                        (al,)).fetchone():
                    db.execute(
                        "insert into album(title, titleclean) values(?, ?)",
                        (album, albumclean))
                    db.commit()
                recentalbums[album] = db.execute(
                    "select id from album where titleclean = ?",
                    (al,)).fetchone()[0]
        except Exception as e:
            logging.error(e)
    with condition:
        try:
            ge = genre
            if genre not in recentgenres.keys():
                if not db.execute("select id from genre where desc = ?",
                                  (ge,)).fetchone():
                    db.execute(
                        "insert or ignore into genre(desc, descclean) "
                        "values(?, ?)", (genre, genreclean))
                    db.commit()
                recentgenres[genre] = db.execute(
                    "select id from genre where desc = ?",
                    (ge,)).fetchone()[0]
        except Exception as e:
            logging.error(e)
    with condition:
        try:
            db.execute(
                "insert or replace into song("
                "title, titleclean, artist_id, "
                "genre_id, album_id, path, length) "
                "values (?,?,?,?,?,?,?)",
                (title, titleclean, recentartists[artist],
                 recentgenres[genre], recentalbums[album],
                 abspathitem.decode(FS_ENCODING), length))
            logging.debug("collect_metadata putting new artist on queue")
            for q in queues:
                if not q.full():
                    q.put_nowait((abspathitem, title, artist, album))
                else:
                    q.put((abspathitem, title, artist, album), block=True)
            db.commit()
        except Exception as e:
            logging.error(e)
def __init__(self, data_arr, m_param, information_gain_func,
             majority_class_for_cost, epsilon, delta):
    ID3.__init__(self, data_arr, m_param, information_gain_func,
                 majority_class_for_cost, epsilon, delta)
    self.classifiers = None
    self.m_param = m_param
    self.epsilon = epsilon