def main(argv=sys.argv):
    """Configure the application from a config file and analyze the CSV data.

    Sets up logging and loads the application settings from the given
    configuration URI, binds ``DBSession`` to an engine built from the
    ``sqlalchemy.``-prefixed settings, parses the ``csv`` directory that
    sits next to this script and hands the result to ``analyze``.

    Args:
        argv: Command-line argument list. ``argv[1]`` is the configuration
            URI. The optional ``argv[2]`` is a comma-separated list of
            option names; every listed name is mapped to ``True`` in the
            dict passed to ``analyze``.
    """
    if len(argv) not in (2, 3):
        usage(argv)

    config_uri = argv[1]

    # The length check above accepts a call without the option list, so
    # argv[2] must only be read when it is actually present; otherwise no
    # options are enabled.
    if len(argv) == 3:
        conf_parser = dict.fromkeys(argv[2].split(','), True)
    else:
        conf_parser = {}

    setup_logging(config_uri)
    settings = get_appsettings(config_uri)
    engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)

    # The source CSV files live in a "csv" directory next to this script.
    script_dir = os.path.dirname(__file__)
    rel_path_source_data = "./csv/"
    source_data_dir = os.path.join(script_dir, rel_path_source_data)

    csv_data = parse_data(source_data_dir)
    analyze(csv_data, conf_parser)
# Command-line driver for control.py.
#   control.py controlfile                          -> control-file driven runs
#   control.py (-lda|-bayes) trainingfile testfile  -> runLDA / runBayes
if len(sys.argv) < 2 or sys.argv[1] == "--help":
    for usage_line in (
        "Usage: control.py controlfile",
        "Usage: control.py -lda trainingfile testfile",
        " Control files look like: ",
        " First line: trainingfilename, testfilename",
        " Any number of subsequent lines: epsilon,stepsize,iteration limit, restarts",
        " Training and test files are CSV's with the output variable last",
        " Results are stored in testfilename_resultX where X is the line number from the control CSV",
    ):
        print(usage_line)
    sys.exit(0)

first_arg = sys.argv[1]

# lda / bayes control here: both modes take exactly a training file and a
# test file, run once, and terminate the script.
if first_arg in ("-lda", "-bayes"):
    if len(sys.argv) != 4:
        print("Usage: control.py (-lda|-bayes) trainingfile testfile")
        sys.exit(0)
    training_path, test_path = sys.argv[2], sys.argv[3]
    x, y = csv_parser.parse_data(training_path)
    testX, testY = csv_parser.parse_data(test_path)
    run_model = runLDA if first_arg == "-lda" else runBayes
    run_model(x, y, testX, testY, test_path)
    sys.exit(0)

# default is logistic regression: the first argument names a control file
# that supplies the training/test file names and one parameter set per run.
controls, training, test = csv_parser.parse_control(first_arg)
x, y = csv_parser.parse_data(training)
testX, testY = csv_parser.parse_data(test)
for i in xrange(len(controls)):
    params = controls[i]
# Excerpt from the interior of a routine whose header and enclosing loop are
# not visible here (presumably kfolds_control, which is called at the end --
# TODO confirm).
#
# What the visible code does:
#   * When y[i] == 0, sample i is appended to train_x / train_y, removed from
#     x / y, and num_positives is decremented.
#     NOTE(review): the counter named num_positives is decremented for a
#     label-0 sample -- confirm the name matches the intent.
#   * Prints the sizes of train_x, train_y and testset[0].
#     NOTE(review): the "size of testset: " literal has no leading space, so
#     it runs straight on from the preceding number.
#   * Calls testfeaturesubsets(k, train_x, train_y, testset[0], testset[1])
#     and iterates the returned mapping; for each set_name it prints the
#     entries algo_data[0..8] labelled LR / LDA / NB x train / valid /
#     isolated, each followed by accuracycalc() and fcalc() of that entry.
#     NOTE(review): on the "LR train" row the fcalc() value is labelled
#     "accuracy" (the other rows use "F"), and the "LR isolated" row spells
#     the label "Acuracy".
#   * Finally parses "final_data.csv" with csv_parser.parse_data and calls
#     kfolds_control(4, x, y).
if(y[i] == 0): train_x.append(x[i]) train_y.append(y[i]) del x[i] del y[i] num_positives = num_positives - 1 print "size of x: " + str(len(train_x)) + " size of y: " + str(len(train_y)) + "size of testset: " + str(len(testset[0])) #start off by doing feature selection #(results, temp_x_onlyspon, temp_x_congress, temp_x_nopers) = testfeaturesubsets(k, train_x, train_y, testset[0], testset[1]) (results) = testfeaturesubsets(k, train_x, train_y, testset[0], testset[1]) for set_name, algo_data in results.iteritems(): print (set_name + "\n LR train" + "\t" + str(algo_data[0]) + "\t accuracy: " + str(accuracycalc(algo_data[0])) + "\t accuracy: " + str(fcalc(algo_data[0])) + "\n LR valid" + "\t" + str(algo_data[1]) + "\t A: " + str(accuracycalc(algo_data[1])) + "\t F: " + str(fcalc(algo_data[1])) + "\n LR isolated" + "\t" + str(algo_data[2]) + "\t Acuracy: " + str(accuracycalc(algo_data[2])) + "\t F: " + str(fcalc(algo_data[2])) + "\n LDA train" + "\t" + str(algo_data[3]) + "\t A: " + str(accuracycalc(algo_data[3])) + "\t F: " + str(fcalc(algo_data[3])) + "\n LDA valid" + "\t" + str(algo_data[4]) + "\t A: " + str(accuracycalc(algo_data[4])) + "\t F: " + str(fcalc(algo_data[4])) + "\n LDA isolated" + "\t" + str(algo_data[5]) + "\t A: " + str(accuracycalc(algo_data[5])) + "\t F: " + str(fcalc(algo_data[5])) + "\n NB train" + "\t" + str(algo_data[6]) + "\t A: " + str(accuracycalc(algo_data[6])) + "\t F: " + str(fcalc(algo_data[6])) + "\n NB valid" + "\t" + str(algo_data[7]) + "\t A: " + str(accuracycalc(algo_data[7])) + "\t F: " + str(fcalc(algo_data[7])) + "\n NB isolated" + "\t" + str(algo_data[8]) + "\t A: " + str(accuracycalc(algo_data[8])) + "\t F: " + str(fcalc(algo_data[8])) + "\n") print "parsing..." (x,y) = csv_parser.parse_data("final_data.csv") print "parsed" kfolds_control(4, x, y)
# Excerpt that begins inside a routine whose header is not visible here; the
# names k, train_x, train_y and testset are defined outside this excerpt.
#
# What the visible code does:
#   * Calls testfeaturesubsets(k, train_x, train_y, testset[0], testset[1])
#     (an older four-value form of the call is left commented out) and
#     iterates the returned mapping with iteritems().
#   * For each set_name, prints the entries algo_data[0..8] labelled
#     LR / LDA / NB x train / valid / isolated, each followed by
#     accuracycalc() and fcalc() of that entry.
#     NOTE(review): on the "LR train" row the fcalc() value is labelled
#     "accuracy" (the other rows use "F"), and the "LR isolated" row spells
#     the label "Acuracy".
#   * Finally parses "final_data.csv" with csv_parser.parse_data and calls
#     kfolds_control(4, x, y).
#(results, temp_x_onlyspon, temp_x_congress, temp_x_nopers) = testfeaturesubsets(k, train_x, train_y, testset[0], testset[1]) (results) = testfeaturesubsets(k, train_x, train_y, testset[0], testset[1]) for set_name, algo_data in results.iteritems(): print(set_name + "\n LR train" + "\t" + str(algo_data[0]) + "\t accuracy: " + str(accuracycalc(algo_data[0])) + "\t accuracy: " + str(fcalc(algo_data[0])) + "\n LR valid" + "\t" + str(algo_data[1]) + "\t A: " + str(accuracycalc(algo_data[1])) + "\t F: " + str(fcalc(algo_data[1])) + "\n LR isolated" + "\t" + str(algo_data[2]) + "\t Acuracy: " + str(accuracycalc(algo_data[2])) + "\t F: " + str(fcalc(algo_data[2])) + "\n LDA train" + "\t" + str(algo_data[3]) + "\t A: " + str(accuracycalc(algo_data[3])) + "\t F: " + str(fcalc(algo_data[3])) + "\n LDA valid" + "\t" + str(algo_data[4]) + "\t A: " + str(accuracycalc(algo_data[4])) + "\t F: " + str(fcalc(algo_data[4])) + "\n LDA isolated" + "\t" + str(algo_data[5]) + "\t A: " + str(accuracycalc(algo_data[5])) + "\t F: " + str(fcalc(algo_data[5])) + "\n NB train" + "\t" + str(algo_data[6]) + "\t A: " + str(accuracycalc(algo_data[6])) + "\t F: " + str(fcalc(algo_data[6])) + "\n NB valid" + "\t" + str(algo_data[7]) + "\t A: " + str(accuracycalc(algo_data[7])) + "\t F: " + str(fcalc(algo_data[7])) + "\n NB isolated" + "\t" + str(algo_data[8]) + "\t A: " + str(accuracycalc(algo_data[8])) + "\t F: " + str(fcalc(algo_data[8])) + "\n") print "parsing..." (x, y) = csv_parser.parse_data("final_data.csv") print "parsed" kfolds_control(4, x, y)