Example #1
def Supervised(filename, old_files=None, stop='', stopat=1, error='none',
               interval=100000, starting=1, seed=0, step=10,
               learner='svm_linear', boost=None):
    old_files = old_files or []  # avoid sharing a mutable default argument
    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step

    read.interval = interval
    read.seed = seed

    if boost:
        # set up the boosted ensemble (util.vote comes from the host project)
        util.vote(read, clf_name=boost, seed=seed, all=False,
                  temp=str(seed) + filename)
    num2 = read.get_allpos()
    target = int(num2 * stopat)
    if stop == 'est':
        read.enable_est = True
    else:
        read.enable_est = False

    if boost is None:
        read.train_supervised(learner, seed)
    pos, neg, total = read.get_numbers()

    if boost:
        read.query_boost()
    else:
        read.query_supervised()

    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()

        if pos + neg >= total:
            break

        # optional est-based stop (disabled in this variant): halt once `target`
        # positives are found and the estimated recall passes the threshold
        # if pos >= target and (pos + neg) >= total * .22 and read.enable_est and read.est_num * stopat <= pos:
        #     break
        if boost:
            ids = read.query_boost()[:read.step]
        else:
            ids = read.query_supervised()[:read.step]
        read.code_batch(ids)
    return read
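A hypothetical invocation of the entry point above; the dataset name is invented, and the MAR and util modules must be importable from the surrounding project:

read = Supervised('reviews.csv', stop='est', stopat=0.95, seed=1,
                  step=10, learner='svm_linear')  # 'reviews.csv' is made up
pos, neg, total = read.get_numbers()
print("labeled positives: %d, labeled: %d, pool: %d" % (pos, pos + neg, total))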
Example #2
def Boosting(filename, old_files=None, stop='', stopat=1, error='none',
             interval=100000, starting=1, seed=0, step=10):
    old_files = old_files or []  # avoid sharing a mutable default argument
    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename,old_files)
    read.step = step

    read.interval = interval

    util.vote(read)

    num2 = read.get_allpos()
    target = int(num2 * stopat)
    if stop == 'est':
        read.enable_est = True
    else:
        read.enable_est = False

    pos, neg, total = read.get_numbers()

    read.query_boost()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        est = getattr(read, 'est_num', None)
        if est is not None:
            print("%d, %d, %d" % (pos, pos + neg, est))
        else:
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break

        if read.enable_est and read.est_num * stopat <= pos:
            break
        for doc_id in read.query_boost()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
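The est-based exit above encodes a recall target: labeling stops once the positives found reach the requested fraction of the estimated total positives. A standalone restatement of that rule, with illustrative names:

def should_stop(pos, est_num, stopat):
    # pos / est_num is the estimated recall so far; stop once it reaches
    # the target fraction (e.g. stopat = 0.95 for 95% estimated recall)
    return est_num * stopat <= pos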
Example #3
def create_decision_tree(data, attributes, target_attr, params=None):
    data = data[:]
    target_vals = [record[target_attr] for record in data]
    default = vote(target_vals)  # majority label; the fallback prediction
    if len(data) <= 20 or len(attributes) <= 1:
        # stop early to prevent overfitting
        return default
    elif target_vals.count(target_vals[0]) == len(target_vals):
        # pure node: every record carries the same label
        return target_vals[0]
    else:
        best = choose_attribute(data, attributes, target_attr)
        tree = {best: {}}
        for val in get_values(data, best):
            # recurse on the records matching this value, with `best` removed;
            # the parent's majority label is passed along as `params`
            subtree = create_decision_tree(
                get_sub_dataset(data, best, val),
                [attr for attr in attributes if attr != best],
                target_attr, default)
            tree[best][val] = subtree
    return tree
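create_decision_tree leans on a few helpers that are not shown. Minimal sketches consistent with how they are called here; the project's real versions may differ, and the entropy-based attribute chooser in particular is a guess:

import math
from collections import Counter

def vote(labels):
    # majority label; Counter.most_common breaks ties by insertion order
    return Counter(labels).most_common(1)[0][0]

def get_values(data, attr):
    # distinct values the attribute takes in the dataset
    return set(record[attr] for record in data)

def get_sub_dataset(data, attr, val):
    # records whose attr equals val
    return [record for record in data if record[attr] == val]

def entropy(data, target_attr):
    counts = Counter(record[target_attr] for record in data)
    total = float(len(data))
    return -sum(c / total * math.log(c / total, 2) for c in counts.values())

def choose_attribute(data, attributes, target_attr):
    # pick the attribute whose split minimizes the weighted child entropy,
    # i.e. maximizes information gain
    def remainder(attr):
        return sum(len(sub) / float(len(data)) * entropy(sub, target_attr)
                   for val in get_values(data, attr)
                   for sub in [get_sub_dataset(data, attr, val)])
    return min((a for a in attributes if a != target_attr), key=remainder)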
Example #4
            list19_19 = np.argsort(prediction19_19)[0][::-1][:5]

            if testing_target in hashmap:
                related += 1
                target = hashmap[testing_target]
                winner, top_5 = util.vote(list16, list19_10, list19_19)
                if target == winner:
                    vote += 1
                if target in top_5:
                    top5 += 1

print("=================")
print('total is {0}, related is {1}'.format(count, related))
print('accuracy is ', vote * 1.0 / related)
print('top5 accuracy is ', top5 * 1.0 / related)
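The util.vote used here evidently returns a pair: the single winning label and the five highest-voted labels. A hypothetical implementation matching that call shape; the rank-bonus tie-breaker is invented, not necessarily the project's rule:

from collections import Counter

def vote(*ranked_lists):
    # each argument is a ranked list of candidate labels; returns (winner, top_5)
    counts = Counter()
    for preds in ranked_lists:
        for rank, label in enumerate(preds):
            # one point per appearance, plus a small bonus for earlier ranks
            counts[label] += 1 + (len(preds) - rank) * 1e-3
    ordered = [label for label, _ in counts.most_common()]
    return ordered[0], ordered[:5]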
Example #5
if __name__ == "__main__":
    filename = sys.argv[1]
    num_trees = int(sys.argv[2])

    data, attributes, target_attr = get_data(filename)
    n = len(data)

    accs = []
    for i in range(5):
        lo, hi = int(n / 5 * i), int(n / 5 * (i + 1))
        valid_data = data[lo:hi]                               # validation fold
        train_data = [d for d in data if d not in valid_data]  # training fold
        labels = [d[target_attr] for d in valid_data]
        # train on train_data only; passing the full dataset would leak the
        # validation fold into the forest
        trees = create_forest(train_data, attributes, target_attr, num_trees)
        #classify
        classes = []
        for tree in trees:
            classification = classify_decision_tree(tree, valid_data,
                                                    vote(labels))
            classes.append(classification)
        classification = [vote(c) for c in zip(*classes)]
        count = 0
        for x, y in zip(classification, labels):
            if x == y:
                count += 1
        acc = float(count) / len(classification)
        accs.append(acc)
        print("accuracy: " + str(100 * acc) + "%")
    print("standard deviation: " + str(std_dev(accs)))
Example #6
File: naiveBayes.py Project: albert-001/CS
                classification.append(c)
                break
    return classification


if __name__ == "__main__":
    filename = sys.argv[1]

    data, attributes, target_attr = get_data(filename)
    n = len(data)

    accs = []
    for i in range(5):
        lo, hi = int(n / 5 * i), int(n / 5 * (i + 1))
        valid_data = data[lo:hi]                               # validation fold
        train_data = [d for d in data if d not in valid_data]  # training fold
        default = vote([d[target_attr] for d in train_data])
        attrs = [a for a in attributes if not a == target_attr]
        classifier = create_naive_bayes(train_data, attrs, target_attr,
                                        classes[filename])
        classification = classify_naive_bayes(classifier, valid_data,
                                              classes[filename], attrs)
        labels = [d[target_attr] for d in valid_data]
        count = 0
        for x, y in zip(classification, labels):
            if x == y:
                count += 1
        acc = float(count) / len(classification)
        accs.append(acc)
        print("accuracy: " + str(100 * acc) + "%")
    print("standard deviation: " + str(std_dev(accs)))
Example #7
File: adaBoost.py Project: albert-001/CS
if __name__ == "__main__":
    filename = sys.argv[1]
    num_trees = int(sys.argv[2])
    classifier = int(sys.argv[3])

    data, attributes, target_attr = get_data(filename)
    n = len(data)

    accs = []
    for i in range(5):
        lo, hi = int(n / 5 * i), int(n / 5 * (i + 1))
        valid_data = data[lo:hi]                               # validation fold
        train_data = [d for d in data if d not in valid_data]  # training fold
        labels = [d[target_attr] for d in train_data]
        default = vote(labels)
        if classifier == 1:  #decision tree
            classifier_generator = create_decision_tree
            classifier_function = classify_decision_tree
            params = default
        else:  #naive bayes
            classifier_generator = create_naive_bayes
            classifier_function = classify_naive_bayes
            params = classes[filename]
        #train
        classifiers, weights = ada_boost_train(train_data, attributes,
                                               target_attr,
                                               classifier_generator, num_trees,
                                               classifier_function, params)
        #classify
        classification = ada_boost_classify(
            data, classifiers, weights, classifier_function, params,
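ada_boost_train and ada_boost_classify themselves are defined elsewhere. The heart of any AdaBoost trainer is the per-round example reweighting; a generic, self-contained sketch of that step, with illustrative names and the classic binary-style update:

import math

def boost_round_update(weights, correct, epsilon=1e-10):
    # AdaBoost update: compute the round's weighted error and learner weight
    # (alpha), then boost the weights of misclassified examples
    err = sum(w for w, ok in zip(weights, correct) if not ok)
    err = min(max(err, epsilon), 1 - epsilon)  # keep the log well-defined
    alpha = 0.5 * math.log((1 - err) / err)
    new = [w * math.exp(-alpha if ok else alpha)
           for w, ok in zip(weights, correct)]
    total = sum(new)
    return [w / total for w in new], alpha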
Example #8
File: kNN.py Project: albert-001/CS
def kNN_classifier(neighbors, target_attr):
    # majority vote over the neighbors' target labels
    return vote([n[target_attr] for n in neighbors])
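A hypothetical call, assuming neighbors are dicts and a vote() helper that returns the majority label as in the other examples; 'label' is an illustrative attribute name:

neighbors = [{"label": "spam"}, {"label": "ham"}, {"label": "spam"}]
print(kNN_classifier(neighbors, "label"))  # -> spam (2 votes to 1)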
Example #9
import sys
from dtree import create_decision_tree, classify_decision_tree
from util import print_tree, vote, get_data, std_dev

if __name__ == "__main__":
    filename = sys.argv[1]

    data, attributes, target_attr = get_data(filename)
    n = len(data)

    accs = []
    for i in range(5):
        lo, hi = int(n / 5 * i), int(n / 5 * (i + 1))
        valid_data = data[lo:hi]                               # validation fold
        train_data = [d for d in data if d not in valid_data]  # training fold
        tree = create_decision_tree(train_data, attributes, target_attr)
        labels = [d[target_attr] for d in valid_data]
        classification = classify_decision_tree(tree, valid_data, vote(labels))
        count = 0
        for x, y in zip(classification, labels):
            if x == y:
                count += 1
        acc = float(count) / len(classification)
        accs.append(acc)
        print("accuracy: " + str(100 * acc) + "%")
    print("standard deviation: " + str(std_dev(accs)))