def main(args):
    """Train a naive Bayes classifier on one file and print its loss on another.

    Args:
        args: Sequence indexed like ``sys.argv``; positions 1-4 are read:
            ``args[1]`` is the training file, ``args[2]`` the test file,
            ``args[3]`` the class label selector and ``args[4]`` the
            "print words" flag. The last two are converted with ``int``.

    Raises:
        IndexError: If a list-like ``args`` has fewer than five entries.
        ValueError: If ``args[3]`` or ``args[4]`` is not accepted by ``int``.
    """
    # Read the command-line style arguments.
    train_path = args[1]
    test_path = args[2]
    class_label = int(args[3])
    print_words = int(args[4])

    # Load the training and test data.
    train = a.read_data(train_path)
    test = a.read_data(test_path)

    # Word frequencies from the training data only. The original comment says
    # this is the top 2000 words -- not verifiable from this block.
    fre = a.frequency(train)

    # A flag value of exactly 1 asks for the frequent words to be printed.
    if print_words == 1:
        a.printTopwords(fre)

    # Build binary features for both data sets from the same frequency table.
    train = a.create_binary_feature(train, fre, class_label)
    test = a.create_binary_feature(test, fre, class_label)

    # Fit on the training rows, then predict the test rows.
    prob_table, p_yes, p_no = a.train_nbc(train)
    result = a.test_nbc(prob_table, test, p_yes, p_no)

    # The last element of every test row is used as its true label. Kept in
    # its own name instead of rebinding the integer class label selector.
    expected = [row[-1] for row in test]

    # NOTE(review): "zero_onr_loss" looks like a typo for "zero_one_loss";
    # left unchanged because module ``a`` is not visible here -- confirm.
    diff = a.zero_onr_loss(result, expected)

    # Single-argument print call: same output on Python 2 and Python 3.
    print("ZERO-ONE-LOSS {0}".format(diff))
def _report(label, value):
    """Print ``label`` and ``str(value)`` separated by a single space.

    Produces the same line as the Python 2 statement ``print label, value``
    while remaining valid syntax on Python 3.
    """
    print("{0} {1!s}".format(label, value))


def crossValidation(data):
    """Run cross-validation over several training-set sizes and print losses.

    ``data`` is split with ``kfold(data, 10)``. For each training-set size,
    each of the 10 partitions is used once as the test set, while a random
    sample of that size is drawn from the other nine to train on. For every
    size the per-fold losses, their mean and their standard error are printed.

    Args:
        data: The data set handed to ``kfold``. Each row's last element is
            read as its class label.

    Raises:
        ValueError: From ``random.sample`` when a training-set size is larger
            than the number of rows available outside the test fold.
    """
    n_folds = 10
    # Assumes kfold returns n_folds indexable partitions -- TODO confirm.
    folds = kfold(data, n_folds)

    for tss in [100, 250, 500, 1000, 2000]:
        _report("tss = ", tss)
        loss = []

        for i in range(n_folds):
            test_set = folds[i]

            # Pool every fold except the held-out one.
            train_data = []
            for j in range(n_folds):
                if j != i:
                    train_data.extend(folds[j])

            # Train on a random subset of the requested size.
            train_set = random.sample(train_data, tss)
            prob_table, p_yes, p_no = nbc.train_nbc(train_set)
            result = nbc.test_nbc(prob_table, test_set, p_yes, p_no)

            # Score the predictions against the true labels.
            label = [row[-1] for row in test_set]
            loss.append(nbc.zero_one_loss(result, label))

        # Summary for this training-set size.
        print(loss)
        _report("mean: ", numpy.mean(loss))
        # The second argument was a literal 10 in the original; presumably the
        # number of losses (one per fold) -- confirm against standard_error.
        _report("std error: ", standard_error(loss, n_folds))