import timeit
from random import shuffle

import numpy as np

# get_all_emails, preprocess, split_data, extract_training_vectors, train,
# init, csv_predict, build_extraction_dataset, predict, statistics, the
# extraction module and the EXTRACTOR* constants are defined elsewhere in
# this code base.


def run_test_v2(base_dir):
    global EXTRACTOR
    emails_folder = base_dir + '/emails/MTreviewd_Emails/'
    process_folder = base_dir + '/emails/MTreviewd_Emails_Processed/'
    csv_file = process_folder + 'results_summary.csv'
    csv_output = process_folder + 'predict_output'
    feature_extraction = process_folder + 'feature_extraction'
    classifier_name = process_folder + 'classifier'
    iteration = 10
    repetition = 5
    emails = get_all_emails(emails_folder)
    preprocess(emails, emails_folder, csv_file)
    for i in range(iteration):
        # Reshuffle the corpus for every cross-validation iteration.
        shuffle(emails)
        for r in range(repetition):
            # Hold out fold r for testing, train on the remaining folds.
            training, testing = split_data(emails, repetition, r)
            output_extraction = feature_extraction + str(i) + '_' + str(r)
            extract_training_vectors(training, csv_file, output_extraction)
            classifier_r = classifier_name + str(i) + '_' + str(r)
            extraction.EXTRACTOR = train(init(), output_extraction,
                                         classifier_r)
            output_file = csv_output + str(i) + '_' + str(r) + '.csv'
            csv_predict(testing, csv_file, output_file)
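# Both test drivers lean on split_data(emails, repetition, r), defined
# elsewhere in this code base. As a rough, hypothetical sketch (not the
# original implementation), a contiguous k-fold split that holds out
# fold r for testing could look like this:
def _split_data_sketch(items, folds, r):
    """Illustrative only: hold out contiguous bin r of `folds` bins."""
    fold_size = len(items) // folds
    lo = r * fold_size
    # The last fold absorbs the remainder so no item is dropped.
    hi = len(items) if r == folds - 1 else lo + fold_size
    return items[:lo] + items[hi:], items[lo:hi]

# Example: _split_data_sketch(range(10), 5, 1) yields
# ([0, 1, 4, 5, 6, 7, 8, 9], [2, 3]).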
def train_model():
    """Retrain the extraction model and persist it to disk."""
    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
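# A usage sketch for retraining from outside this module. Assuming this
# file belongs to talon's signature pipeline, the public entry points would
# be imported roughly as below; the import paths are an assumption based on
# the train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) call, not taken
# from this file:
#
#     from talon.signature import EXTRACTOR_DATA, EXTRACTOR_FILENAME
#     from talon.signature.learning.classifier import train, init
#
#     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)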
def run_test(base_dir):
    start = timeit.default_timer()
    emails_folder = base_dir + '/emails/body'
    source_folder = base_dir + '/emails/total/'
    stat_folder = base_dir + '/tmp/statistics/'
    dataset_filename = base_dir + '/tmp/trained_model/extraction_'
    performance_filename = base_dir + '/tmp/predictions/performance_'
    classifier_dir = base_dir + '/tmp/classifiers/'
    global EXTRACTOR
    iteration = 10
    repetition = 5
    emails = get_all_emails(emails_folder)
    # Per-run precision/recall/F-score for both extraction strategies.
    brute_p, brute_r, brute_f = [], [], []
    ml_p, ml_r, ml_f = [], [], []
    for i in range(iteration):
        shuffle(emails)
        tmp_dataset_filename = dataset_filename + str(i) + '_'
        tmp_performance_filename = performance_filename + str(i) + '_'
        for r in range(repetition):
            # Hold out fold r of the shuffled corpus for testing.
            training, testing = split_data(emails, repetition, r)
            extraction_filename = build_extraction_dataset(
                str(r), source_folder, training, tmp_dataset_filename)
            classifier_name = classifier_dir + str(i) + '_' + str(r)
            extraction.EXTRACTOR = train(init(), extraction_filename,
                                         classifier_name)
            brute_result, ml_result = predict(str(r), base_dir, testing,
                                              tmp_performance_filename)
            brute_precision, brute_recall, brute_f_score = statistics(
                str(r), brute_result, source_folder, stat_folder, 'brute')
            ml_precision, ml_recall, ml_f_score = statistics(
                str(r), ml_result, source_folder, stat_folder, 'ml')
            print 'i:\t p:\t r:\t f:\t'
            # Sequential run index across all iterations and repetitions.
            print str(i * repetition + r) + '\t' + str(round(
                ml_precision, 4)) + '\t' + str(round(
                    ml_recall, 4)) + '\t' + str(round(ml_f_score, 4))
            brute_p.append(brute_precision)
            brute_r.append(brute_recall)
            brute_f.append(brute_f_score)
            ml_p.append(ml_precision)
            ml_r.append(ml_recall)
            ml_f.append(ml_f_score)
    # Interquartile range (75th minus 25th percentile) of each metric.
    brute_p_iqr = np.subtract(*np.percentile(brute_p, [75, 25]))
    brute_r_iqr = np.subtract(*np.percentile(brute_r, [75, 25]))
    brute_f_iqr = np.subtract(*np.percentile(brute_f, [75, 25]))
    ml_p_iqr = np.subtract(*np.percentile(ml_p, [75, 25]))
    ml_r_iqr = np.subtract(*np.percentile(ml_r, [75, 25]))
    ml_f_iqr = np.subtract(*np.percentile(ml_f, [75, 25]))
    # Round the raw per-run scores for reporting.
    brute_p = [round(x, 3) for x in brute_p]
    brute_r = [round(x, 3) for x in brute_r]
    brute_f = [round(x, 3) for x in brute_f]
    ml_p = [round(x, 3) for x in ml_p]
    ml_r = [round(x, 3) for x in ml_r]
    ml_f = [round(x, 3) for x in ml_f]
    print brute_f
    print brute_f_iqr
    print ml_f
    print ml_f_iqr
    with open(base_dir + '/tmp/FINAL_RESULTS', 'w') as fr:
        fr.write('Brute-Force:\n')
        fr.write('p:\n' + str(brute_p) + '\n')
        fr.write('r:\n' + str(brute_r) + '\n')
        fr.write('f:\n' + str(brute_f) + '\n')
        fr.write('p_median = ' + str(round(np.median(brute_p), 4)) + '\n')
        fr.write('r_median = ' + str(round(np.median(brute_r), 4)) + '\n')
        fr.write('f_median = ' + str(round(np.median(brute_f), 4)) + '\n')
        fr.write('p_iqr = ' + str(round(brute_p_iqr, 4)) + '\n')
        fr.write('r_iqr = ' + str(round(brute_r_iqr, 4)) + '\n')
        fr.write('f_iqr = ' + str(round(brute_f_iqr, 4)) + '\n')
        fr.write('\n******************\n')
        fr.write('Machine Learning:\n')
        fr.write('p:\n' + str(ml_p) + '\n')
        fr.write('r:\n' + str(ml_r) + '\n')
        fr.write('f:\n' + str(ml_f) + '\n')
        fr.write('p_median = ' + str(round(np.median(ml_p), 4)) + '\n')
        fr.write('r_median = ' + str(round(np.median(ml_r), 4)) + '\n')
        fr.write('f_median = ' + str(round(np.median(ml_f), 4)) + '\n')
        fr.write('p_iqr = ' + str(round(ml_p_iqr, 4)) + '\n')
        fr.write('r_iqr = ' + str(round(ml_r_iqr, 4)) + '\n')
        fr.write('f_iqr = ' + str(round(ml_f_iqr, 4)) + '\n')
        fr.write('\n******************\n')
        runtime = timeit.default_timer() - start
        fr.write('Iteration: ' + str(iteration) + '\nBin Repetition: ' +
                 str(repetition) + '\nTotal Runtime: ' + str(runtime))
    print runtime
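# run_test condenses each metric list to its median and interquartile
# range, where np.subtract(*np.percentile(x, [75, 25])) is just the 75th
# minus the 25th percentile. A minimal sketch of the same arithmetic, using
# the numpy import above (the scores are illustrative values, not real
# results):
def _summarize_sketch(scores):
    """Illustrative only: (median, IQR) of a list of per-run scores."""
    q75, q25 = np.percentile(scores, [75, 25])
    return round(np.median(scores), 4), round(q75 - q25, 4)

# Example: _summarize_sketch([0.79, 0.81, 0.84, 0.86, 0.90]) returns
# (0.84, 0.05), i.e. the middle half of the runs spans 0.05 in F-score.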