Example #1
def run_test_v2(base_dir):
    """Run the MT-reviewed e-mail experiment: shuffle the corpus `iteration`
    times and, for each shuffle, train and predict on `repetition` folds.
    Assumes random.shuffle and the project helpers (get_all_emails,
    preprocess, split_data, extract_training_vectors, train, init,
    csv_predict, extraction) are available at module level."""
    global EXTRACTOR
    emails_folder = base_dir + '/emails/MTreviewd_Emails/'
    process_folder = base_dir + '/emails/MTreviewd_Emails_Processed/'
    csv_file = process_folder + 'results_summary.csv'
    csv_output = process_folder + 'predict_output'
    feature_extraction = process_folder + 'feature_extraction'
    classifier_name = process_folder + 'classifier'
    iteration = 10
    repetition = 5
    emails = get_all_emails(emails_folder)
    preprocess(emails, emails_folder, csv_file)
    for i in range(iteration):
        shuffle(emails)
        for r in range(repetition):
            training, testing = split_data(emails, repetition, r)
            output_extraction = feature_extraction + str(i) + '_' + str(r)
            extract_training_vectors(training, csv_file, output_extraction)
            classifier_r = classifier_name + str(i) + '_' + str(r)
            # train() persists the classifier and returns it; keep it on the
            # extraction module so prediction uses the freshly trained model
            extraction.EXTRACTOR = train(init(), output_extraction,
                                         classifier_r)
            output_file = csv_output + str(i) + '_' + str(r) + '.csv'
            csv_predict(testing, csv_file, output_file)
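
Both run_test_v2() above and run_test() in Example #3 rely on a split_data(emails, repetition, r) helper whose implementation is not shown here. A minimal sketch of what such a fold split might look like, assuming the r-th contiguous slice of the shuffled list is held out as the test set (the project's actual implementation may differ):

def split_data(emails, folds, fold_index):
    # hypothetical k-fold split: hold out the fold_index-th slice as the
    # test set and train on the rest; the last fold absorbs any remainder
    fold_size = len(emails) // folds
    start = fold_index * fold_size
    end = start + fold_size if fold_index < folds - 1 else len(emails)
    testing = emails[start:end]
    training = emails[:start] + emails[end:]
    return training, testing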
Example #2
def train_model():
    """Retrain the extraction model from EXTRACTOR_DATA and persist the
    resulting classifier to EXTRACTOR_FILENAME."""
    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
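
Since Examples #1 and #3 assign the return value of train() to extraction.EXTRACTOR, train() evidently returns the trained classifier in addition to persisting it. A sketch of a variant that also refreshes the in-process model follows; the name train_model_and_reload is hypothetical and not part of the original code:

def train_model_and_reload():
    # retrain and persist as above, but also refresh the module-level
    # EXTRACTOR so the current process immediately uses the new model
    global EXTRACTOR
    EXTRACTOR = train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)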
Example #3
# standard-library / third-party imports used by run_test(); the project
# helpers (get_all_emails, split_data, build_extraction_dataset, train, init,
# predict, statistics, extraction, EXTRACTOR) are assumed to be defined or
# imported at module level
import timeit
from random import shuffle

import numpy as np


def run_test(base_dir):
    """Full cross-validation experiment: `iteration` shuffles of `repetition`
    folds each, comparing the brute-force and machine-learning extractors and
    writing a summary to tmp/FINAL_RESULTS."""
    start = timeit.default_timer()
    emails_folder = base_dir + '/emails/body'
    source_folder = base_dir + '/emails/total/'
    stat_folder = base_dir + '/tmp/statistics/'
    dataset_filename = base_dir + '/tmp/trained_model/extraction_'
    performance_filename = base_dir + '/tmp/predictions/performance_'
    classifier_dir = base_dir + '/tmp/classifiers/'
    global EXTRACTOR

    iteration = 10
    repetition = 5
    emails = get_all_emails(emails_folder)

    # per-fold precision/recall/F-score for both approaches
    brute_p, brute_r, brute_f = [], [], []
    ml_p, ml_r, ml_f = [], [], []

    for i in range(iteration):
        shuffle(emails)
        tmp_dataset_filename = dataset_filename + str(i) + '_'
        tmp_performance_filename = performance_filename + str(i) + '_'
        for r in range(repetition):
            training, testing = split_data(emails, repetition, r)
            extraction_filename = build_extraction_dataset(
                str(r), source_folder, training, tmp_dataset_filename)

            classifier_name = classifier_dir + str(i) + '_' + str(r)
            # train() persists the classifier and returns it; keep it on the
            # extraction module so predict() uses the freshly trained model
            extraction.EXTRACTOR = train(init(), extraction_filename,
                                         classifier_name)

            brute_result, ml_result = predict(str(r), base_dir, testing,
                                              tmp_performance_filename)
            brute_precision, brute_recall, brute_f_score = statistics(
                str(r), brute_result, source_folder, stat_folder, 'brute')
            ml_precision, ml_recall, ml_f_score = statistics(
                str(r), ml_result, source_folder, stat_folder, 'ml')
            print('i:\t p:\t r:\t f:\t')
            # running fold index (i * repetition + r) and the ML scores
            print(str(i * repetition + r) + '\t' + str(round(
                ml_precision, 4)) + '\t' + str(round(
                    ml_recall, 4)) + '\t' + str(round(ml_f_score, 4)))
            brute_p.append(brute_precision)
            brute_r.append(brute_recall)
            brute_f.append(brute_f_score)
            ml_p.append(ml_precision)
            ml_r.append(ml_recall)
            ml_f.append(ml_f_score)

    # interquartile ranges from the raw scores, before rounding the lists
    brute_p_iqr = np.subtract(*np.percentile(brute_p, [75, 25]))
    brute_r_iqr = np.subtract(*np.percentile(brute_r, [75, 25]))
    brute_f_iqr = np.subtract(*np.percentile(brute_f, [75, 25]))
    ml_p_iqr = np.subtract(*np.percentile(ml_p, [75, 25]))
    ml_r_iqr = np.subtract(*np.percentile(ml_r, [75, 25]))
    ml_f_iqr = np.subtract(*np.percentile(ml_f, [75, 25]))
    brute_p = [round(x, 3) for x in brute_p]
    brute_r = [round(x, 3) for x in brute_r]
    brute_f = [round(x, 3) for x in brute_f]
    ml_p = [round(x, 3) for x in ml_p]
    ml_r = [round(x, 3) for x in ml_r]
    ml_f = [round(x, 3) for x in ml_f]

    print(brute_f)
    print(brute_f_iqr)
    print(ml_f)
    print(ml_f_iqr)
    with open(base_dir + '/tmp/FINAL_RESULTS', 'w') as fr:
        fr.write('Brute-Force:\n')
        fr.write('p:\n' + str(brute_p) + '\n')
        fr.write('r:\n' + str(brute_r) + '\n')
        fr.write('f:\n' + str(brute_f) + '\n')
        fr.write('p_median = ' + str(round(np.median(brute_p), 4)) + '\n')
        fr.write('r_median = ' + str(round(np.median(brute_r), 4)) + '\n')
        fr.write('f_median = ' + str(round(np.median(brute_f), 4)) + '\n')
        fr.write('p_iqr = ' + str(round(brute_p_iqr, 4)) + '\n')
        fr.write('r_iqr = ' + str(round(brute_r_iqr, 4)) + '\n')
        fr.write('f_iqr = ' + str(round(brute_f_iqr, 4)) + '\n')
        fr.write('\n******************\n')
        fr.write('Machine Learning:\n')
        fr.write('p:\n' + str(ml_p) + '\n')
        fr.write('r:\n' + str(ml_r) + '\n')
        fr.write('f:\n' + str(ml_f) + '\n')
        fr.write('p_median = ' + str(round(np.median(ml_p), 4)) + '\n')
        fr.write('r_median = ' + str(round(np.median(ml_r), 4)) + '\n')
        fr.write('f_median = ' + str(round(np.median(ml_f), 4)) + '\n')
        fr.write('p_iqr = ' + str(round(ml_p_iqr, 4)) + '\n')
        fr.write('r_iqr = ' + str(round(ml_r_iqr, 4)) + '\n')
        fr.write('f_iqr = ' + str(round(ml_f_iqr, 4)) + '\n')
        fr.write('\n******************\n')
        runtime = timeit.default_timer() - start
        fr.write('Iteration: ' + str(iteration) + ' Bin Repetition: ' +
                 str(repetition) + ' Total Runtime: ' + str(runtime))
    print(runtime)
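
run_test() repeats the same percentile arithmetic for each of the six score lists. A small helper along these lines could consolidate the median/IQR summary; this is a sketch, not part of the original script:

import numpy as np

def summarize(values):
    # median and interquartile range of a score list, rounded as in run_test()
    q75, q25 = np.percentile(values, [75, 25])
    return round(float(np.median(values)), 4), round(float(q75 - q25), 4)

# usage: p_median, p_iqr = summarize(ml_p)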