def Supervised(filename, old_files=None, stop='', stopat=1, error='none',
               interval=100000, starting=1, seed=0, step=10,
               learner='svm_linear', boost=None):
    """Run a supervised active-learning review loop over *filename*.

    Parameters
    ----------
    filename : str
        Dataset to review.
    old_files : list, optional
        Previously reviewed datasets forwarded to ``MAR.create``.
        Defaults to an empty list (``None`` sentinel avoids the shared
        mutable-default pitfall of the original ``old_files=[]``).
    stop : str
        ``'est'`` enables the estimator-based stopping bookkeeping.
    stopat : float or str
        Stop fraction; coerced with ``float()``.
    error : str
        Unused here; kept for interface parity with ``Boosting``.
    interval, starting, seed, step : int
        Reader/bookkeeping parameters; ``starting`` is currently unused.
    learner : str
        Classifier name passed to ``MAR.train_supervised``.
    boost : str or None
        When truthy, delegate to ``util.vote`` and return early.

    Returns
    -------
    MAR or None
        The reader holding the review state; ``None`` on the early
        ``boost`` path, which only precomputes committee votes.
    """
    if old_files is None:
        old_files = []
    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)
    read = MAR().create(filename, old_files)
    read.step = step
    read.interval = interval
    read.seed = seed
    if boost:
        # Precompute the boosted committee's votes and stop; the cache
        # file name is keyed on seed + filename.
        util.vote(read, clf_name=boost, seed=seed, all=False,
                  temp=str(seed) + filename)
        return
    num2 = read.get_allpos()
    # NOTE(review): `target` is currently unused — the target-based
    # stopping rule was disabled (it lived in now-removed dead code).
    target = int(num2 * stopat)
    read.enable_est = (stop == 'est')
    # From here on `boost` is always falsy (truthy case returned above);
    # the branches are kept to preserve the original control flow.
    if boost is None:
        read.train_supervised(learner, seed)
    pos, neg, total = read.get_numbers()
    if boost:
        read.query_boost()
    else:
        read.query_supervised()
    read.record['est'][0] = read.est_num
    while True:
        pos, neg, total = read.get_numbers()
        if pos + neg >= total:  # every candidate has been labelled
            break
        if boost:
            ids = read.query_boost()[:read.step]
        else:
            ids = read.query_supervised()[:read.step]
        read.code_batch(ids)
    return read
def Boosting(filename, old_files=None, stop='', stopat=1, error='none',
             interval=100000, starting=1, seed=0, step=10):
    """Active-learning review loop driven by a boosted voting committee.

    Parameters
    ----------
    filename : str
        Dataset to review.
    old_files : list, optional
        Previously reviewed datasets forwarded to ``MAR.create``.
        Defaults to an empty list (``None`` sentinel replaces the
        original mutable default ``old_files=[]``).
    stop : str
        ``'est'`` enables the estimator-based early-stop rule.
    stopat : float or str
        Stop once ``est_num * stopat`` positives are found; coerced
        with ``float()``.
    error : str
        Human-error mode forwarded to ``MAR.code_error``.
    interval, starting, seed, step : int
        Reader/bookkeeping parameters; ``starting`` is currently unused.

    Returns
    -------
    MAR
        The reader holding the final review state.
    """
    if old_files is None:
        old_files = []
    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)
    read = MAR().create(filename, old_files)
    read.step = step
    read.interval = interval
    util.vote(read)
    num2 = read.get_allpos()
    # NOTE(review): `target` is unused — the stop rule below compares
    # against read.est_num * stopat instead.
    target = int(num2 * stopat)
    read.enable_est = (stop == 'est')
    pos, neg, total = read.get_numbers()
    read.query_boost()
    read.record['est'][0] = read.est_num
    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # est_num may be missing/non-numeric before estimation kicks
            # in; fall back to the two-column progress line. (Narrowed
            # from the original bare `except:`.)
            print("%d, %d" % (pos, pos + neg))
        if pos + neg >= total:
            break
        if read.enable_est and read.est_num * stopat <= pos:
            break
        # `id_` avoids shadowing the builtin `id`.
        for id_ in read.query_boost()[:read.step]:
            read.code_error(id_, error=error)
    return read
def create_decision_tree(data, attributes, target_attr, params=None):
    """Recursively build a decision tree over *data*.

    Returns either a class label (a leaf) or a nested dict of the form
    ``{attribute: {value: subtree}}``.  *params* is accepted for
    interface compatibility but is not consulted here; the recursion
    forwards the node's majority label in that slot.
    """
    records = data[:]  # work on a copy so the caller's list is untouched
    labels = [row[target_attr] for row in records]
    majority = vote(labels)
    # Stop early on small nodes (or exhausted attributes) to limit overfitting.
    if len(records) <= 20 or len(attributes) <= 1:
        return majority
    # Pure node: every record carries the same label.
    if labels.count(labels[0]) == len(labels):
        return labels[0]
    split_attr = choose_attribute(records, attributes, target_attr)
    remaining = [a for a in attributes if a != split_attr]
    branches = {}
    for value in get_values(records, split_attr):
        branches[value] = create_decision_tree(
            get_sub_dataset(records, split_attr, value),
            remaining, target_attr, majority)
    return {split_attr: branches}
list19_19 = np.argsort(prediction19_19)[0][::-1][:5] ''' print('target is {0}'.format(hashmap[testing_target])) print('prediction16 is ', list16) print('prediction19_10 is ', list19_10) print('prediction19_19 is ', list19_19) print('vote is ', util.vote(list16, list19_10, list19_19)[0]) print('vote top_5 are ', util.vote(list16, list19_10, list19_19)[1]) print('===========================') ''' if testing_target in hashmap: related += 1 target = hashmap[testing_target] if target == util.vote(list16, list19_10, list19_19)[0]: vote += 1 if target in util.vote(list16, list19_10, list19_19)[1]: top5 += 1 print("=================") print('total is {0}, related is {1}'.format(count, related)) print('accuracy is ', vote*1.0/related) print('top5 accuracy is ', top5*1.0/related)
if __name__ == "__main__":
    # Usage: python <script> <data_file> <num_trees>
    # 5-fold cross-validation of a random forest.
    filename = sys.argv[1]
    num_trees = int(sys.argv[2])
    data, attributes, target_attr = get_data(filename)
    n = len(data)
    accs = []
    for i in range(5):
        lo = int(float(n) / 5 * i)
        hi = int(float(n) / 5 * (i + 1))
        valid_data = data[lo:hi]  # held-out validation fold
        train_data = [d for d in data if d not in valid_data]  # training folds
        labels = [d[target_attr] for d in valid_data]
        # BUG FIX: train on train_data only. The original passed the full
        # `data`, leaking the validation fold into training (train_data
        # was computed but never used).
        trees = create_forest(train_data, attributes, target_attr, num_trees)
        # Classify the fold with each tree, then majority-vote per record.
        per_tree = []
        for tree in trees:
            per_tree.append(
                classify_decision_tree(tree, valid_data, vote(labels)))
        classification = [vote(c) for c in zip(*per_tree)]
        count = 0
        for x, y in zip(classification, labels):
            if x == y:
                count += 1
        acc = float(count) / len(classification)
        accs.append(acc)
        print("accuracy: " + str(100 * acc) + "%")
    print("standard deviation: " + str(std_dev(accs)))
classification.append(c) break return classification if __name__ == "__main__": filename = sys.argv[1] data, attributes, target_attr = get_data(filename) n = len(data) accs = [] for i in range(5): valid_data = data[int(float(n) / 5 * i):int(float(n) / 5 * (i + 1))] #validation data train_data = [d for d in data if not d in valid_data] #training data default = vote([d[target_attr] for d in train_data]) attrs = [a for a in attributes if not a == target_attr] classifier = create_naive_bayes(train_data, attrs, target_attr, classes[filename]) classification = classify_naive_bayes(classifier, valid_data, classes[filename], attrs) labels = [d[target_attr] for d in valid_data] count = 0 for x, y in zip(classification, labels): if x == y: count += 1 acc = float(count) / len(classification) accs.append(acc) print("accuracy: " + str(100 * acc) + "%") print("standard deviation: " + str(std_dev(accs)))
if __name__ == "__main__": filename = sys.argv[1] num_trees = int(sys.argv[2]) classifier = int(sys.argv[3]) data, attributes, target_attr = get_data(filename) n = len(data) accs = [] for i in range(5): valid_data = data[int(float(n) / 5 * i):int(float(n) / 5 * (i + 1))] #validation data train_data = [d for d in data if not d in valid_data] #training data labels = [d[target_attr] for d in data] default = vote(labels) if classifier == 1: #decision tree classifier_generator = create_decision_tree classifier_function = classify_decision_tree params = default else: #naive bayes classifier_generator = create_naive_bayes classifier_function = classify_naive_bayes params = classes[filename] #train classifiers, weights = ada_boost_train(data, attributes, target_attr, classifier_generator, num_trees, classifier_function, params) #classify classification = ada_boost_classify( data, classifiers, weights, classifier_function, params,
def kNN_classifier(neighbors, target_attr):
    """Return the majority label among *neighbors* under *target_attr*."""
    neighbor_labels = [neighbor[target_attr] for neighbor in neighbors]
    return vote(neighbor_labels)
import sys

from dtree import create_decision_tree, classify_decision_tree
from util import print_tree, vote, get_data, std_dev

if __name__ == "__main__":
    # Usage: python <script> <data_file>
    # 5-fold cross-validation of a single decision tree.
    filename = sys.argv[1]
    data, attributes, target_attr = get_data(filename)
    n = len(data)
    fold_accuracies = []
    for fold in range(5):
        start = int(float(n) / 5 * fold)
        end = int(float(n) / 5 * (fold + 1))
        valid_data = data[start:end]  # held-out validation fold
        train_data = [rec for rec in data if rec not in valid_data]  # training folds
        tree = create_decision_tree(train_data, attributes, target_attr)
        labels = [rec[target_attr] for rec in valid_data]
        classification = classify_decision_tree(tree, valid_data, vote(labels))
        hits = sum(1 for got, want in zip(classification, labels) if got == want)
        acc = float(hits) / len(classification)
        fold_accuracies.append(acc)
        print("accuracy: " + str(100 * acc) + "%")
    print("standard deviation: " + str(std_dev(fold_accuracies)))