if sys.argv[1] == "-lda" or sys.argv[1] == "-bayes": if len(sys.argv) != 4: print "Usage: control.py (-lda|-bayes) trainingfile testfile" sys.exit(0) (x,y)= csv_parser.parse_data(sys.argv[2]) (testX, testY) = csv_parser.parse_data(sys.argv[3]) if sys.argv[1] == "-lda": runLDA(x,y, testX, testY, sys.argv[3]) else: runBayes(x,y, testX, testY, sys.argv[3]) sys.exit(0) #default is logistic regression (controls, training, test) = csv_parser.parse_control(sys.argv[1]) (x,y)= csv_parser.parse_data(training) (testX, testY) = csv_parser.parse_data(test) for i in xrange(len(controls)): params = controls[i] resultfile = open(test + "_result" + str(i), 'w') resultfile.write('"WeightVector","ConfusionMatrix"\n') for j in xrange(int(math.floor(params[3]))): result= logisticregression.trainLogisticReg(params[0],params[1], int(params[2]), x, y) confusion = logisticregression.getConfusionMatrix(result, testX,testY) resultfile.write(str(result) + "," + str(confusion)+"\n") resultfile.close()
def kfolds_all_algos(k, x, y, train_subjects, isotest_x, isotest_y, isotest_words):
    def word_count(examples, keep):
        # Count how often each subject appears across all examples.
        words = {}
        for ex in examples:
            for subject in ex:
                if subject in words:
                    words[subject] += 1
                else:
                    words[subject] = 1
        sorted_x = sorted(words.iteritems(), key=operator.itemgetter(1), reverse=True)
        if keep >= len(sorted_x):
            return words.keys()
        # Keep every subject at least as frequent as the (keep+1)-th one,
        # so ties at the cutoff are all retained.
        limit = sorted_x[keep][1]
        print "subject frequency cutoff: " + str(limit)
        ret_words = []
        for key in words:
            if words[key] >= limit:
                ret_words.append(key)
        return ret_words

    k_groups = splitdata(k, x, y, train_subjects)
    # Now we have the k groups; assign each one as test once and run tests.
    print "groups split"
    lda_train_results = []
    lda_test_results = []
    #lda_iso_results = []
    nb_train_results = []
    nb_test_results = []
    #nb_iso_results = []
    lr_train_results = []
    lr_test_results = []
    #lr_iso_results = []
    for fold in xrange(k):
        print "K Fold number " + str(fold)
        test = k_groups[fold]
        train = []
        train.append([])  # x
        train.append([])  # y
        train.append([])  # words
        for j in xrange(k):
            if j != fold:
                train[0].extend(k_groups[j][0])
                train[1].extend(k_groups[j][1])
                train[2].extend(k_groups[j][2])
        # Perform a word count of the training data and keep the top 50 subjects.
        top_subjects = word_count(train[2], 50)
        # Extend train[0] and test[0] with indicator features for the top
        # subjects. Rows may be numpy arrays, which have no extend(), so
        # convert them to lists first (test rows need this too).
        for i in xrange(0, len(train[0])):
            if type(train[0][i]) != list:
                train[0][i] = train[0][i].tolist()
        for i in xrange(0, len(test[0])):
            if type(test[0][i]) != list:
                test[0][i] = test[0][i].tolist()
        for i in xrange(0, len(train[0])):
            subjects = train[2][i]
            bits = [0] * len(top_subjects)
            for s in subjects:
                if s in top_subjects:
                    bits[top_subjects.index(s)] = 1
            train[0][i].extend(bits)
        for i in xrange(0, len(test[0])):
            subjects = test[2][i]
            bits = [0] * len(top_subjects)
            for s in subjects:
                if s in top_subjects:
                    bits[top_subjects.index(s)] = 1
            test[0][i].extend(bits)
        # Now we have test and training data; train each model.
        #train on LDA
        #print "Training LDA..."
        #(prob, mean, cov) = lda.trainLDA(train[0], train[1])
        #print str(prob) + "\t" + str(mean) + "\t" + str(cov)
        #print "DONE training LDA."
        print "Training NB..."
        (py, theta) = naivebayes.trainNaiveBayesMN(train[0], train[1])
        #print str(py) + "\t" + str(theta)
        print "DONE training NB"
        print "Training Logistic Regression..."
        t_x = copy.deepcopy(train[0])
        for i in xrange(len(t_x)):
            # Prepend the bias term.
            temp_row = [1]
            temp_row.extend(t_x[i])
            t_x[i] = temp_row
        (wvector, scales) = logisticregression.trainLogisticReg(0.01, 0.00001, 100, copy.deepcopy(t_x), copy.deepcopy(train[1]))
        #print str(wvector)
        print "DONE training Logistic Regression.\n"

        # Sanity check against scikit-learn's logistic regression.
        lr_model = linmod.LogisticRegression()
        lr_model.fit(t_x, train[1])
        for model, name in ((lr_model, "LR"),):
            tp, tn, fp, fn = 0, 0, 0, 0
            for i in xrange(0, len(t_x)):
                # predict expects a 2-D input; wrap the single row.
                val = model.predict([t_x[i]])[0]
                if (val == 1 and train[1][i] == 1):
                    tp += 1
                elif (val == 1 and train[1][i] == 0):
                    fp += 1
                elif (val == 0 and train[1][i] == 0):
                    tn += 1
                elif (val == 0 and train[1][i] == 1):
                    fn += 1
            print "%s - TP: %d, FP: %d, TN: %d, FN: %d" % (name, tp, fp, tn, fn)

        # Get prediction errors on the held-out fold.
        lr_test_error = logisticregression.getConfusionMatrix(wvector, scales, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
        lr_train_error = logisticregression.getConfusionMatrix(wvector, scales, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
        #lr_iso_error = logisticregression.getConfusionMatrix(wvector, scales, isotest_x, isotest_y)
        #lda_test_error = lda.getConfusionMatrix(prob, mean, cov, test[0], test[1])
        #lda_train_error = lda.getConfusionMatrix(prob, mean, cov, train[0], train[1])
        #lda_iso_error = lda.getConfusionMatrix(prob, mean, cov, isotest_x, isotest_y)
        nb_test_error = naivebayes.getConfusionMatrixMN(py, theta, test[0], test[1])
        nb_train_error = naivebayes.getConfusionMatrixMN(py, theta, train[0], train[1])
        #nb_iso_error = naivebayes.getConfusionMatrixMN(py, theta, isotest_x, isotest_y)

        # Accumulate per-fold confusion matrices.
        lr_train_results.append(lr_train_error)
        lr_test_results.append(lr_test_error)
        #lr_iso_results.append(lr_iso_error)
        #lda_train_results.append(lda_train_error)
        #lda_test_results.append(lda_test_error)
        #lda_iso_results.append(lda_iso_error)
        nb_train_results.append(nb_train_error)
        nb_test_results.append(nb_test_error)
        #nb_iso_results.append(nb_iso_error)

    # Average training and test error for each algorithm.
    #avr_lda_train = averageconfusionmatrix(lda_train_results)
    #avr_lda_test = averageconfusionmatrix(lda_test_results)
    #avr_lda_iso = averageconfusionmatrix(lda_iso_results)
    avr_lr_train = averageconfusionmatrix(lr_train_results)
    avr_lr_test = averageconfusionmatrix(lr_test_results)
    #avr_lr_iso = averageconfusionmatrix(lr_iso_results)
    avr_nb_train = averageconfusionmatrix(nb_train_results)
    avr_nb_test = averageconfusionmatrix(nb_test_results)
    #avr_nb_iso = averageconfusionmatrix(nb_iso_results)
    #return [avr_lr_train, avr_lr_test, avr_lr_iso, avr_lda_train, avr_lda_test, avr_lda_iso, avr_nb_train, avr_nb_test, avr_nb_iso]
    #return [avr_lr_train, avr_lr_test, avr_lda_train, avr_lda_test, avr_nb_train, avr_nb_test]
    return [avr_lr_train, avr_lr_test, avr_nb_train, avr_nb_test]
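# A minimal sketch of the top-subject selection above using
# collections.Counter; "keep" and the tie-inclusive cutoff mirror
# word_count(). This is an illustration under those assumptions, not a
# tested drop-in replacement.
from collections import Counter

def top_subjects_sketch(examples, keep):
    counts = Counter()
    for ex in examples:
        counts.update(ex)  # each subject counts once per occurrence
    ranked = counts.most_common()  # (subject, count), descending
    if keep >= len(ranked):
        return counts.keys()
    limit = ranked[keep][1]
    # Like word_count(), every subject tied with the cutoff frequency is
    # kept, so the result may contain more than "keep" subjects.
    return [w for (w, c) in ranked if c >= limit]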
# Variant of the k-fold driver without subject features; it also
# evaluates every model on an isolated test set (isotest_x, isotest_y).
# Named distinctly so it does not shadow the subject-feature
# kfolds_all_algos above.
def kfolds_all_algos_iso(k, x, y, isotest_x, isotest_y):
    k_groups = splitdata(k, x, y)
    # Now we have the k groups; assign each one as test once and run tests.
    print "groups split"
    lda_train_results = []
    lda_test_results = []
    lda_iso_results = []
    nb_train_results = []
    nb_test_results = []
    nb_iso_results = []
    lr_train_results = []
    lr_test_results = []
    lr_iso_results = []
    for fold in xrange(k):
        print "K Fold number " + str(fold)
        test = k_groups[fold]
        train = []
        train.append([])  # x
        train.append([])  # y
        for j in xrange(k):
            if j != fold:
                train[0].extend(k_groups[j][0])
                train[1].extend(k_groups[j][1])
        # Now we have test and training data; train each model.
        print "Training LDA..."
        (prob, mean, cov) = lda.trainLDA(copy.deepcopy(train[0]), copy.deepcopy(train[1]))
        #print str(prob) + "\t" + str(mean) + "\t" + str(cov)
        print "DONE training LDA."
        print "Training NB..."
        (py, theta) = naivebayes.trainNaiveBayesMN(copy.deepcopy(train[0]), copy.deepcopy(train[1]))
        #print str(py) + "\t" + str(theta)
        print "DONE training NB"
        print "Training Logistic Regression..."
        t_x = copy.deepcopy(train[0])
        for i in xrange(len(t_x)):
            # Prepend the bias term.
            temp_row = [1]
            temp_row.extend(t_x[i])
            t_x[i] = temp_row
        (wvector, scales) = logisticregression.trainLogisticReg(0.01, 0.00001, 100, t_x, train[1])
        #print str(wvector)
        print "DONE training Logistic Regression.\n"

        # Get prediction errors on the held-out fold and the isolated set.
        lr_test_error = logisticregression.getConfusionMatrix(wvector, scales, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
        lr_train_error = logisticregression.getConfusionMatrix(wvector, scales, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
        lr_iso_error = logisticregression.getConfusionMatrix(wvector, scales, copy.deepcopy(isotest_x), copy.deepcopy(isotest_y))
        lda_test_error = lda.getConfusionMatrix(prob, mean, cov, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
        lda_train_error = lda.getConfusionMatrix(prob, mean, cov, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
        lda_iso_error = lda.getConfusionMatrix(prob, mean, cov, copy.deepcopy(isotest_x), copy.deepcopy(isotest_y))
        nb_test_error = naivebayes.getConfusionMatrixMN(py, theta, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
        nb_train_error = naivebayes.getConfusionMatrixMN(py, theta, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
        nb_iso_error = naivebayes.getConfusionMatrixMN(py, theta, copy.deepcopy(isotest_x), copy.deepcopy(isotest_y))

        # Accumulate per-fold confusion matrices.
        lr_train_results.append(lr_train_error)
        lr_test_results.append(lr_test_error)
        lr_iso_results.append(lr_iso_error)
        lda_train_results.append(lda_train_error)
        lda_test_results.append(lda_test_error)
        lda_iso_results.append(lda_iso_error)
        nb_train_results.append(nb_train_error)
        nb_test_results.append(nb_test_error)
        nb_iso_results.append(nb_iso_error)

    # Average training, test, and isolated-set error for each algorithm.
    avr_lda_train = averageconfusionmatrix(lda_train_results)
    avr_lda_test = averageconfusionmatrix(lda_test_results)
    avr_lda_iso = averageconfusionmatrix(lda_iso_results)
    avr_lr_train = averageconfusionmatrix(lr_train_results)
    avr_lr_test = averageconfusionmatrix(lr_test_results)
    avr_lr_iso = averageconfusionmatrix(lr_iso_results)
    avr_nb_train = averageconfusionmatrix(nb_train_results)
    avr_nb_test = averageconfusionmatrix(nb_test_results)
    avr_nb_iso = averageconfusionmatrix(nb_iso_results)
    return [avr_lr_train, avr_lr_test, avr_lr_iso,
            avr_lda_train, avr_lda_test, avr_lda_iso,
            avr_nb_train, avr_nb_test, avr_nb_iso]
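# Hedged sketches of two helpers this file calls but does not define
# (splitdata and averageconfusionmatrix). These are minimal assumed
# implementations consistent with the call sites above: splitdata returns
# k groups of parallel [x, y] lists (the subject-aware variant would carry
# a third parallel list of subjects), and averageconfusionmatrix averages
# a list of 2x2 confusion matrices element-wise.

def splitdata_sketch(k, x, y):
    # Deal examples round-robin into k folds of parallel (x, y) lists.
    groups = [[[], []] for _ in xrange(k)]
    for i in xrange(len(x)):
        groups[i % k][0].append(x[i])
        groups[i % k][1].append(y[i])
    return groups

def averageconfusionmatrix_sketch(matrices):
    # Element-wise mean of a list of 2x2 confusion matrices.
    n = float(len(matrices))
    avg = [[0.0, 0.0], [0.0, 0.0]]
    for m in matrices:
        for r in xrange(2):
            for c in xrange(2):
                avg[r][c] += m[r][c] / n
    return avg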