Пример #1
0
def main():
    """Interactively collect visitor details and classify them with the tree.

    Prompts for referrer, FAQ usage, pages visited and location, then
    passes the observation to ``treepredict.classify``.
    """
    print("Give the details of user:")
    # NOTE(review): the original source line was garbled ("******" fused the
    # print with an input call); restored as the referrer prompt whose result
    # the observation list below expects in `ref`.
    ref = input("Who was your referrer(Slashdot,Google or someone else:)")
    faq = input("Did you read faqs?:")
    pages = int(input("How many pages did you visit:"))
    location = input("Enter your location:")
    # Renamed from `list` to avoid shadowing the builtin.
    observation = [ref, location, faq, pages]
    treepredict.classify(observation)
Пример #2
0
def classify(example, forest):
    """Classify `example` with every tree in `forest` by majority vote.

    Args:
        example: the feature vector to classify.
        forest: iterable of trees accepted by ``treepredict.classify``.

    Returns:
        The result returned most often by the individual trees; exact ties
        are broken uniformly at random. Returns 0 for an empty forest
        (preserving the original fallback value).
    """
    counts = {}

    # Tally one vote per tree.  The original did ``counts[r] = 1`` followed
    # by an unconditional ``counts[r] += 1``, so every result's first vote
    # was counted twice; counts.get fixes the tally.
    for tree in forest:
        r = treepredict.classify(example, tree)
        counts[r] = counts.get(r, 0) + 1

    if not counts:
        # Original behavior: winner_key stayed 0 when there were no votes.
        return 0

    # Decide the winner once, after all votes are in.  The original ran the
    # tie-break inside the loop, never reset the tie flag, and fed a dict
    # view to random.choice (a TypeError on Python 3).
    best = max(counts.values())
    tied = [result for result, votes in counts.items() if votes == best]
    return random.choice(tied)
Пример #3
0
def classify(example, forest):
  """Majority-vote classification of `example` across all trees in `forest`.

  Args:
      example: feature vector to classify.
      forest: iterable of trees usable by ``treepredict.classify``.

  Returns:
      The most frequent per-tree result; ties broken at random.  Returns 0
      for an empty forest (the original fallback).
  """
  counts = {}

  # One vote per tree.  Fixes the original double-count of each result's
  # first occurrence (``counts[r]=1`` then an unconditional ``+=1``).
  for tree in forest:
    r = treepredict.classify(example, tree)
    counts[r] = counts.get(r, 0) + 1

  if not counts:
    # No votes: keep the original's default winner of 0.
    return 0

  # Winner selection moved out of the loop.  The original also called
  # random.choice on a dict view, which fails on Python 3, and chose among
  # ALL keys (not just the tied ones) once any tie had ever been seen.
  best = max(counts.values())
  tied = [result for result, votes in counts.items() if votes == best]
  return random.choice(tied)
Пример #4
0
Файл: lc.py Проект: mattc58/lc
    def run_tree(self, tree):
        '''
        Run the testing data against a previously-built tree.

        Every row of self.testing_data is normalized, then all fields
        except the last are classified with treepredict.classify; the
        last field is treated as the loan id and printed with the guess.
        (Python 2 code: print statements.)
        '''
        # Normalize the raw testing rows before classification.
        # NOTE(review): the original comment here ("first get a sample to
        # use for training...") was copy-pasted from test_tree and wrong.
        print "transforming test data"
        test_data = []
        for item in self.testing_data:
            test_data.append(self.normalize_data(item))

        print "running..."
        for item in test_data:
            # item[0:-1] = features, item[-1] = loan id.
            guess = treepredict.classify(item[0:-1], tree)
            print "loan id=%s, results=%s" % (item[-1], guess)
Пример #5
0
Файл: lc.py Проект: mattc58/lc
    def test_tree(self, k=.2):
        '''
        Conduct a test of decision trees.

        A random fraction ``k`` of self.training_data is used to train a
        tree; the remaining rows are classified against it and the correct
        / false-positive / false-negative percentages are printed.
        (Python 2 code: print statements.)
        '''
        # first get a sample to use for training and make a tree from it
        print "making sample and training tree..."
        ids = random.sample(range(len(self.training_data)), int(k * len(self.training_data)))
        sample = []
        transform_all = []
        for i, item in enumerate(self.training_data):
            row = self.normalize_data(item)
            if i in ids:
                sample.append(row)
            else:
                transform_all.append(row)

        tree = self.make_tree(sample)

        # now go through the rest of all, seeing how it does
        num_false_positive = num_false_negative = num_right = 0

        print "testing..."
        for item in transform_all:
            # Last field is the known status.
            # NOTE(review): run_tree slices item[0:-1] but this slices
            # item[1:-1] — confirm which column layout is intended.
            status = item[-1]
            guess = treepredict.classify(item[1:-1], tree)

            # if we're right, record. if not, determine if false negative (ok) or false positive (bad)
            # NOTE(review): ``status in guess`` implies guess is a container
            # (likely a dict of label -> count), not a single label — verify.
            if status in guess:
                num_right += 1
            else:
                if status == 'GOOD':
                    num_false_negative += 1
                else:
                    num_false_positive += 1

        # display results
        num_processed = len(transform_all)
        print "sample size=%d, testing size=%d" % (len(sample), num_processed)
        print "%.2f correct" % ((float(num_right) / float(num_processed)) * 100.0)
        print "%.2f false negatives (kinda ok)" % ((float(num_false_negative) / float(num_processed)) * 100.0)
        print "%.2f false positives (bad)" % ((float(num_false_positive) / float(num_processed)) * 100.0)
Пример #6
0
def classify_with_several_trees(data, trees, original_attribute_list):
    """Vote `data` through every tree in `trees` and merge the predictions.

    Each entry of `trees` is a dict holding the decision tree itself
    ('tree') and the indices of the attributes it was trained on
    ('attributes_index').  Per-tree prediction dicts are summed and then
    reduced to a single label via treepredict.post_classify.
    """
    combined_votes = {}
    for entry in trees:
        subtree = entry['tree']
        used_indices = entry['attributes_index']
        # Rebuild the row with only the attributes this tree was built on...
        subset = [data[idx] for idx in used_indices]
        subset_attributes = [original_attribute_list[idx] for idx in used_indices]
        # ...and re-attach the label kept in the row's last element.
        subset.append(data[len(data) - 1])
        votes = treepredict.classify(subset, subtree, subset_attributes)
        # Fold this tree's prediction counts into the running total.
        for label in votes.keys():
            combined_votes[label] = combined_votes.get(label, 0) + votes[label]

    print('predicted_results_all:', combined_votes)
    return treepredict.post_classify(combined_votes)
Пример #7
0
	cur_date = orig_date + timedelta
	str_date = cur_date.strftime('%y/%m/%d')
	str_date = "20"+str_date
	new_flights = fl_lines.get_flights_by_dpdate(str_date)
	test_flights.update(new_flights)

# Evaluate the flight decision tree: count correct predictions and the two
# error kinds (predicted 'wait' when the right call was to buy, and vice
# versa).  Python 2 script; the Chinese print strings are runtime output
# ("test data loaded / elapsed time", "correct rate", "error cases").
test_data = get_data(test_flights)
#writeintxt('test_data.txt', test_data)
print '测试数据读入完成,用时', time.clock()

corr = 0.0
err_shouldwait = 0.0
err_shouldbuy = 0.0
# NOTE(review): treepredict.classify apparently returns a dict; taking
# ``pred.keys()[0]`` picks an arbitrary key in Python 2 when more than one
# label is present — confirm single-label results are guaranteed.
for row in test_data:
	obs = row[:-1]
	pred = treepredict.classify(obs, flighttree)
	pred = pred.keys()[0]
	#print pred
	if pred == row[-1]:
		corr = corr + 1
	elif pred == 'wait':
		#print '实际为,价格%s,%s' % (row[-2], row[-1])
		err_shouldbuy = err_shouldbuy + 1
	else:
		#print '实际为,价格%s,%s' % (row[-2], row[-1])
		err_shouldwait = err_shouldwait + 1

# Report the accuracy and both error rates as fractions of the test size.
size = float(len(test_data))
print "正确率:", float(corr)/size
print '错误情况:应当买却没买%s,应当等却买了%s。' % (err_shouldbuy/size, err_shouldwait/size)
print '预测%s个数据用时%s' % (len(test_data), time.clock())
Пример #8
0
# # Test divideSet
# print treepredict.divideSet(data.my_data, 2, "yes")

# # Test how Gini impurity and entropy change after a split
# print treepredict.giniImpurity(data.my_data)
# print treepredict.entropy(data.my_data)
# set1,set2=treepredict.divideSet(data.my_data, 2, "yes")
# print treepredict.giniImpurity(set1)
# print treepredict.entropy(set1)

# Test buildTree: build the tree and render it to a JPEG.
tree=treepredict.buildTree(data.my_data)
draw=DrawTree.DrawTree(tree,'treeview.jpg')
draw.drawTree()

# # Predict with the classify function
# tree=treepredict.buildTree(data.my_data)
# print treepredict.classify(['(direct)','USA','yes',5], tree)

# Try the pruning function (mingain=1.0) and draw the pruned tree.
tree=treepredict.buildTree(data.my_data)
treepredict.prune(tree, 1.0)
draw=DrawTree.DrawTree(tree,'treeview2.jpg')
draw.drawTree()

# # Predict with mdclassify (handles missing fields).  Python 2 prints.
tree=treepredict.buildTree(data.my_data)
print treepredict.classify(['(direct)','USA','yes',5], tree)
print treepredict.mdclassify(['google',None,'yes',None], tree)
print treepredict.mdclassify(['google','France',None,None], tree)
Пример #9
0
agesonly = ad.loadmatch('agesonly.csv', allnum=True)
matchmaker = ad.loadmatch('matchmaker.csv')

# ad.plotagematches(agesonly)

# Parse agesonly.csv into rows of ints (Python 2: `file` builtin).
age = []
for line in file('agesonly.csv'):
    l = []
    for w in line.split(','):
        l.append(int(w))
    age.append(l)
tree = tr.buildtree(age)
tr.printtree(tree)
tr.drawtree(tree)

# Classify a single age pair with the decision tree.
print tr.classify(tree, [65, 63])

# Linear classifier on the same data, for comparison with the tree.
avgs = ad.lineartrain(agesonly)
print avgs

print ad.dpclassify([30, 25], avgs.values())
print ad.dpclassify([25, 40], avgs.values())
print ad.dpclassify([48, 20], avgs.values())

# Run the same pairs through the tree classifier.
print tr.classify(tree, [30, 25])
print tr.classify(tree, [25, 40])
print tr.classify(tree, [48, 20])

numericalset = ad.loadnumerical()
# NOTE(review): this bare attribute access has no effect at module level —
# probably a leftover from interactive use.
numericalset[0].data
 def testBasics(self):
     """Smoke test: the canonical sample row classifies as {'Basic': 4}."""
     t = treepredict.buildtree(treepredict.testdata())
     # assertEquals is a deprecated alias of assertEqual (removed in
     # Python 3.12); use the canonical name.
     self.assertEqual(
         treepredict.classify(['(direct)', 'USA', 'yes', 5], t),
         {'Basic': 4})
Пример #11
0
import treepredict
# fruits with their colors and size: each row is [size, color, label].
fruits = [[4, 'red', 'apple'], [4, 'green', 'apple'], [1, 'red', 'cherry'],
          [1, 'green', 'grape'], [5, 'red', 'apple']]
tree = treepredict.buildtree(fruits)
# NOTE(review): these three classify results are discarded — presumably a
# leftover from interactive use; print or capture them to see predictions.
treepredict.classify([2, 'red'], tree)
treepredict.classify([5, 'red'], tree)
treepredict.classify([1, 'green'], tree)
treepredict.printtree(tree)
#treepredict.drawtree(tree, jpeg='treeview.jpg')
Пример #12
0
def Classifying_New_Observations():
  """Rebuild the sample tree and print the prediction for one new record.

  Python 2 code (print statements, builtin reload).
  """
  reload(treepredict)
  tree=treepredict.buildtree(treepredict.my_data)
  print '>>Classifying New Observations'
  print treepredict.classify(['(direct)','USA','yes',5],tree)
 def testBasics(self):
   """Smoke test: the canonical sample row classifies as {'Basic': 4}."""
   t = treepredict.buildtree(treepredict.testdata())
   # assertEquals is a deprecated alias (removed in Python 3.12).
   self.assertEqual(treepredict.classify(['(direct)', 'USA', 'yes', 5], t),
       {'Basic': 4})
Пример #14
0
    cur_date = orig_date + timedelta
    str_date = cur_date.strftime('%y/%m/%d')
    str_date = "20" + str_date
    new_flights = fl_lines.get_flights_by_dpdate(str_date)
    test_flights.update(new_flights)

# Evaluate the flight decision tree (space-indented variant of the script
# above in the scrape).  Python 2; Chinese print strings are runtime output.
test_data = get_data(test_flights)
#writeintxt('test_data.txt', test_data)
print '测试数据读入完成,用时', time.clock()

# Tally correct predictions and the two mistake kinds.
corr = 0.0
err_shouldwait = 0.0
err_shouldbuy = 0.0
for row in test_data:
    obs = row[:-1]
    pred = treepredict.classify(obs, flighttree)
    # classify returns a dict; grab an arbitrary first key (Python 2).
    # NOTE(review): nondeterministic if more than one label is present.
    pred = pred.keys()[0]
    #print pred
    if pred == row[-1]:
        corr = corr + 1
    elif pred == 'wait':
        # Predicted 'wait' when the right call was to buy.
        #print '实际为,价格%s,%s' % (row[-2], row[-1])
        err_shouldbuy = err_shouldbuy + 1
    else:
        #print '实际为,价格%s,%s' % (row[-2], row[-1])
        err_shouldwait = err_shouldwait + 1

# Report accuracy and both error rates as fractions of the test size.
size = float(len(test_data))
print "正确率:", float(corr) / size
print '错误情况:应当买却没买%s,应当等却买了%s。' % (err_shouldbuy / size, err_shouldwait / size)
print '预测%s个数据用时%s' % (len(test_data), time.clock())
Пример #15
0
import treepredict
# fruits with their colors and size: each row is [size, color, label].
fruits = [
[4, 'red', 'apple'],
[4, 'green', 'apple'],
[1, 'red', 'cherry'],
[1, 'green', 'grape'],
[5, 'red', 'apple']
]
tree = treepredict.buildtree(fruits)
# NOTE(review): the classify return values below are discarded — likely a
# leftover from interactive use.
treepredict.classify([2, 'red'], tree)
treepredict.classify([5, 'red'], tree)
treepredict.classify([1, 'green'], tree)
treepredict.printtree(tree)
#treepredict.drawtree(tree, jpeg='treeview.jpg')
Пример #16
0
def do_simpletree_kcross_validation(fin,finy,kfolds):
    """Run k-fold cross-validation of a single decision tree.

    Args:
        fin: source of the feature lines (space-separated floats).
        finy: source of the integer labels, one per line.
        kfolds: number of folds.

    Trains on all-but-one chunk, validates on the held-out chunk, and
    prints per-fold plus average accuracy.  (Python 2 code.)
    """
    print "Starting k=" + str(kfolds)+" validation for Simple tree"
    #there is 2500 tracks
    labels = dt.get_lines(finy,int)
    pb = ProgBar()
    lines = dt.get_lines(fin,float," ", callback = pb.callback)
    del pb
    #normalize features

    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)


    # Python 2 integer division: fold size is floored.
    block_size = len(lines)/kfolds
    print "chunk size = " + str(block_size)
    example_chunks = list(dt.chunks(data, block_size))
    #labels_chunks = list(dt.chunks(labels, block_size))


    print "number of chunks = " +str(len(example_chunks))

    #holds avg accuracy for one forest
    accuracy_results = []

    for i in range(0,len(example_chunks)):

        #we leave set in index i out of train
        print "prepare validation set"
        validationdata = example_chunks[i]

        #extract validation chunk
        print "leaving out block " + str(i) + " for validation"
        leaveout = i
        # NOTE(review): this Python 2 list comprehension reuses (and leaks)
        # the outer loop variable `i`; benign only because `leaveout` was
        # captured first and `i` is reassigned at the top of the loop.
        validationdata = [ exampleentry(validationdata[i][0:len(validationdata[i])-1],validationdata[i][-1]) for i in range(0,len(validationdata)) ]

        trainingdata = []

        # Merge every chunk except the held-out one into the training set.
        print("merging blocks "),
        for j in range(0,len(example_chunks)):
            if(j != leaveout):
                #print "j="+str(j) + " i="+ str(leaveout)
                print(str(j) + ","),
                trainingdata = trainingdata + example_chunks[j]

        print "\nprepare training set"

        print "training on " + str(len(trainingdata))
        print "each track has " + str(len(trainingdata[0])) + " features"

        tree = treepredict.buildtree(trainingdata)

        print "testing on " + str(len(validationdata))
        corrects = 0
        #classify a set of entries
        for example in validationdata:
            #print example.features
            result = treepredict.classify(example.features,tree)
            #print 'expected : ' + str(example.label) + ' result : '+ str(result)
            if(result == example.label):
                corrects = corrects + 1
        #calculate the % of accuracy
        # Python 2 integer division: whole-percent accuracy.
        accuracy_percentage = (corrects*100)/len(validationdata)
        print "accuracy = " + str(accuracy_percentage) + "%"
        accuracy_results.append(accuracy_percentage)
    avgcc = dt.average(accuracy_results)
    print "average accuracy ="+  str(avgcc) + "%"
def do_simpletree_kcross_validation(fin, finy, kfolds):
    """Run k-fold cross-validation of a single decision tree.

    Args:
        fin: source of the feature lines (space-separated floats).
        finy: source of the integer labels, one per line.
        kfolds: number of folds.

    Trains on all-but-one chunk, validates on the held-out chunk, and
    prints per-fold plus average accuracy.  (Python 2 code.)
    """
    print "Starting k=" + str(kfolds) + " validation for Simple tree"
    #there is 2500 tracks
    labels = dt.get_lines(finy, int)
    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    del pb
    #normalize features

    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)

    # Python 2 integer division: fold size is floored.
    block_size = len(lines) / kfolds
    print "chunk size = " + str(block_size)
    example_chunks = list(dt.chunks(data, block_size))
    #labels_chunks = list(dt.chunks(labels, block_size))

    print "number of chunks = " + str(len(example_chunks))

    #holds avg accuracy for one forest
    accuracy_results = []

    for i in range(0, len(example_chunks)):

        #we leave set in index i out of train
        print "prepare validation set"
        validationdata = example_chunks[i]

        #extract validation chunk
        print "leaving out block " + str(i) + " for validation"
        leaveout = i
        # NOTE(review): the comprehension variable `i` leaks over the outer
        # loop variable in Python 2; benign only because `leaveout` was
        # captured first and `i` is reassigned at the top of the loop.
        validationdata = [
            exampleentry(validationdata[i][0:len(validationdata[i]) - 1],
                         validationdata[i][-1])
            for i in range(0, len(validationdata))
        ]

        trainingdata = []

        # Merge every chunk except the held-out one into the training set.
        print("merging blocks "),
        for j in range(0, len(example_chunks)):
            if (j != leaveout):
                #print "j="+str(j) + " i="+ str(leaveout)
                print(str(j) + ","),
                trainingdata = trainingdata + example_chunks[j]

        print "\nprepare training set"

        print "training on " + str(len(trainingdata))
        print "each track has " + str(len(trainingdata[0])) + " features"

        tree = treepredict.buildtree(trainingdata)

        print "testing on " + str(len(validationdata))
        corrects = 0
        #classify a set of entries
        for example in validationdata:
            #print example.features
            result = treepredict.classify(example.features, tree)
            #print 'expected : ' + str(example.label) + ' result : '+ str(result)
            if (result == example.label):
                corrects = corrects + 1
        #calculate the % of accuracy
        # Python 2 integer division: whole-percent accuracy.
        accuracy_percentage = (corrects * 100) / len(validationdata)
        print "accuracy = " + str(accuracy_percentage) + "%"
        accuracy_results.append(accuracy_percentage)
    avgcc = dt.average(accuracy_results)
    print "average accuracy =" + str(avgcc) + "%"
Пример #18
0
    # NOTE(review): this span is the interior of a function whose definition
    # is outside the visible chunk; `b` (the tree), `testSet` and `final_acc`
    # are presumably defined earlier in that function — confirm upstream.
    dt.drawtree(b, jpeg='treeview.jpg')

    #print("original_testset=",testSet)
    ############# Preparing Testing DataSet ##############
    # Strip the trailing label off every test row and remember it.
    testlabels = []
    for i in range(len(testSet)):
        label = testSet[i].pop(-1)
        testlabels.append(label)

    #print("testSet=",testSet)
    #print("testlabels=",testlabels)
    ############# Classification of Test Records ##############
    # Count rows whose highest-scoring predicted label matches the truth.
    number = 0
    for i in range(len(testSet)):
        #print("\ntest_data",testSet[i])
        a = dt.classify(testSet[i], b)
        #print("a=",a)
        # Pick the label with the largest score (shadows builtin `max`).
        max = 0
        best = ""
        for key in a.keys():
            if a[key] > max:
                max = a[key]
                best = key
        #print("best=",best)
        #print("label=",testlabels[i])
        if (best == testlabels[i]):
            number = number + 1

    ############# Accuracy Calculations ##############
    # NOTE(review): true division assumed (Python 3 syntax elsewhere in this
    # snippet); on Python 2 this would floor to 0 or 100.
    accuracy = (number / len(testSet)) * 100
    final_acc += accuracy
# Build the DT recursively using the buildtree function; assumes
# last column/field is the classification attribute.
# (Python 2 script: print statements and the builtin reload.)

tree = treepredict.buildtree(treepredict.my_data)

# Let's see what it looks like...
print "\nFinal tree...\n"
treepredict.printtree(tree)

# Produce an image of the tree.
# NOTE(review): despite the messages saying "png", the file written is
# "sample_tree.jpg".
treepredict.drawtree(tree, jpeg="sample_tree.jpg")
print "\npng of tree generated using PIL (Python Imaging Library) modules.\n"

# Let's classify an incoming record of '(direct), USA, yes, 5' ...
incoming = ["(direct)", "USA", "yes", 5]
print "Prediction of new record: ", treepredict.classify(incoming, tree)

# Let's see how the missing data classification via
# the "mdclassify" function performs on our sample data.

# Suppose the country and pages fields are missing...
# NOTE(review): the first mdclassify call's result is discarded; only the
# second (identical) call is printed.
reload(treepredict)
missing1 = ["google", "France", None, None]
treepredict.mdclassify(missing1, tree)
print "Prediction when missing pages: ", treepredict.mdclassify(missing1, tree)

# Finally, what does pruning do with say a mingain = 0.9 ?
print "\nPruned tree...\n"
treepredict.prune(tree, 0.9)
treepredict.printtree(tree)
Пример #20
0
train_data_file = '.\\scene\\scene-train-tiny.arff'
test_data_file = '.\\scene\\scene-test-tiny.arff'
# Prompt text (Chinese): "1 single-label; 2 multiple binary classifiers".
method = input('1 单标签;2 多个二类分类')
if method == '1':
    # Read the training set and build one tree (multi-label data is
    # collapsed into a single combined label first).
    (attributes_list, label_value_list,train_data) = preprocessor.read_data(train_data_file, label_count, arff.DENSE)
    train_data = preprocessor.translate_label_multiclass(train_data, label_count)
    tree = treepredict.buildtree(train_data, attributes_list, label_value_list)
    treepredict.printtree(tree)

    # Read the test set and evaluate the tree.
    (test_attributes_list, test_label_value_list, test_data) = preprocessor.read_data(test_data_file, label_count, arff.DENSE)
    test_data_copy = copy.deepcopy(test_data)
    predicted_labels_list = []
    for row in test_data:
        result = treepredict.classify(row, tree, test_attributes_list)
        post_result = treepredict.post_classify(result)
        decoded_result = preprocessor.label_decoding(post_result)
        predicted_labels_list.append(decoded_result)

    hamming_loss = postprocessor.hamming_loss(test_data_copy, predicted_labels_list)
    print('hamming loss of merging labels:', hamming_loss)
else :
    # Treat the task as label_count independent binary classification
    # problems instead.
    (attributes_list, label_value_list, train_data) = preprocessor.read_data(train_data_file, label_count, arff.DENSE)
    trees = []
    for label_index in range(0, label_count):   # one tree per label: label_count binary classification problems
        binary_data = preprocessor.translate_label_binary(train_data, label_count, label_index)
        print('numbers of attributes of binary_data', len(binary_data[1]))
        trees.append(treepredict.buildtree(binary_data, attributes_list, label_value_list))
        print('label index:', label_index)
Пример #21
0
# Read the training set and build the tree (multi-label collapsed into a
# single label) — left commented out; this run loads a stored tree instead.
label_count = 6
# (attributes_list, label_list,train_data) = preprocessor.read_data('.\\scene\\scene-train-tiny.arff',
#                                                                   label_count, arff.DENSE)
# train_data = preprocessor.translate_label_multiclass(train_data, label_count)
# tree = treepredict.buildtree(train_data, attributes_list, label_list)
# treepredict.printtree(tree)
#
# # Round-trip test of the tree serialisation helpers
# tree_list = preprocessor.tree2array(tree)
# preprocessor.store_tree('.\\my_tree', tree_list)

# Load a previously stored decision tree from disk.
loaded_tree_list = preprocessor.load_tree('.\\my_tree')
loaded_tree = preprocessor.list2tree(loaded_tree_list)

# Read the test set and evaluate the loaded tree.
(test_attributes_list, test_label_value_list,
 test_data) = preprocessor.read_data('.\\scene\\scene-test-tiny.arff',
                                     label_count, arff.DENSE)

results = []
for row in test_data:
    # NOTE(review): classify receives test_label_value_list here, but the
    # analogous call elsewhere passes the attribute list — confirm which
    # the third parameter actually expects.
    result = treepredict.classify(row, loaded_tree, test_label_value_list)
    print('predict result:', result, 'test case', row)
    post_result = treepredict.post_classify(result)
    results.append(preprocessor.label_decoding(post_result))
hammingloss = postprocessor.hamming_loss(test_data, results)
print('hamming loss:', hammingloss)