Example #1
def create_zero_trip_files(folder):
    """
    Iterate over the numbered .tsv files in a folder and write only the
    estimates whose triple was unobserved in the sample (s123 == 0)
    to a separate file.
    """
    import os
    from parsers import CVOutputParser
    iteration = 0
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        
        if not os.path.exists(max_ent_file):
            break

        # Cleaned file name
        max_ent_zero_trips_file = folder + str(iteration) + '_data_zero_trips.tsv'
        fd = open(max_ent_zero_trips_file, 'w')

        # write header
        fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

        # Keep only rows where the triple was unobserved in the sample
        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file):

            s1, s2, s3, s12, s13, s23, s123 = triangle

            if s123 != 0:
                continue

            row = (est, obs, n1, n2, n3, ratio, s1, s2, s3, s12, s13, s23, s123)
            fd.write('\t'.join(map(str, row)) + '\n')

        fd.close()

        iteration += 1
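
# These examples all depend on CVOutputParser.read_est_obs_file_disc_version_2,
# which is not shown here. Judging from the header written above and how the
# yielded values are unpacked, a minimal sketch could look like the following
# (hypothetical -- the real parser may differ in types and column layout):
def read_est_obs_file_disc_version_2_sketch(path):
    fd = open(path)
    fd.readline()  # skip the header line
    for line in fd:
        cols = line.rstrip('\n').split('\t')
        # Column layout assumed from the header:
        # est obs n1 n2 n3 pair_trip_ratio s1 s2 s3 s12 s13 s23 s123
        est, obs = float(cols[0]), float(cols[1])
        n1, n2, n3 = int(cols[2]), int(cols[3]), int(cols[4])
        ratio = float(cols[5])
        triangle = tuple(float(c) for c in cols[6:13])
        yield (n1, n2, n3), (est, obs, ratio, triangle)
    fd.close()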
Example #2
def calc_avg_errors(output_folder):
    """
    Average error calculation on CV output.
    """
    from parsers import CVOutputParser
    from utils import avg
    import math
    import os
    if not output_folder.endswith('/'):
        output_folder += '/'
    
    # better_than_baseline_file = open('better_than_base_line.tsv', 'w')
    # better_than_baseline_file.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

    # small_error_file = open('small_error.tsv', 'w')
    # small_error_file.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')    
    baseline = 88.5
    iteration = 0
    points_evaluated = 0
    over_estimates = 0
    all_sample_errors = []
    while True:
        tsv_file = output_folder + str(iteration) + '_data_zero_trips.tsv'

        if not os.path.exists(tsv_file):
            break

        sample_errors = []
        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(tsv_file):

            s1, s2, s3, s12, s13, s23, s123 = triangle

            # if int(obs) < 200 or s123 == 0:
            #     continue

            # Heuristic for extrapolation, 200,000 in sample
            # est = min(s12, s13, s23) / 200000. * (21006480-200000)

            points_evaluated += 1
            if est > obs:
                over_estimates += 1

            # if obs > baseline:
            #     if abs(est-obs) < abs(est-baseline):
            #         better_than_baseline_file.write(str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n')

            # Error scaled by sqrt(obs): roughly the error measured in units
            # of a Poisson standard deviation.
            error = abs(est-obs) / math.sqrt(obs)
            # if error < 3:
            #     small_error_file.write(str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n')
            sample_errors.append(error)
        all_sample_errors.append(avg(sample_errors))
        iteration += 1

    # better_than_baseline_file.close()
    # small_error_file.close()

    avg_error = avg(all_sample_errors)
    print 'avg error: ', avg_error
    print 'points evaluated: ', points_evaluated
    print 'over estimates: ', over_estimates
    return avg_error, all_sample_errors
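
# The avg helper is imported from a utils module that is not included; the
# obvious sketch consistent with its use here is an arithmetic mean
# (assumption: it is only called on non-empty lists).
def avg_sketch(values):
    return sum(values) / float(len(values))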
Example #3
def merge_sample(folder):
    """
    Create a single .tsv file with both the maxent and the
    extrapolation results.
    """
    import os
    from parsers import CVOutputParser

    iteration = 0
    maxent_estimates = []
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        
        if not os.path.exists(max_ent_file):
            break

        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file):
            s1, s2, s3, s12, s13, s23, s123 = triangle
            maxent_estimates.append(est)
        iteration += 1
        print 'iteration ', iteration


    # merged file name
    merged_file = folder + 'merged_estimates.tsv'
    fd = open(merged_file, 'w')
    # write header
    fd.write('est\text\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

    iteration = 0
    # Rows are matched by position, so the extrapolation files must list
    # triples in the same order as the maxent files.
    estimate_number = 0
    while True:
        ext_file = folder + str(iteration) + '_data_extrapolation.tsv'
        
        if not os.path.exists(ext_file):
            break

        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_file):
            s1, s2, s3, s12, s13, s23, s123 = triangle
            row = (maxent_estimates[estimate_number], est, obs, n1, n2, n3,
                   ratio, s1, s2, s3, s12, s13, s23, s123)
            fd.write('\t'.join(map(str, row)) + '\n')
            estimate_number += 1

        iteration += 1
        print 'iteration ', iteration
    fd.close()
    print 'merging files done'
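
# Because the merge above pairs rows purely by position (estimate_number),
# a sanity check that the two file sets list the same triples in the same
# order can catch silent misalignment. A hedged sketch:
def check_alignment_sketch(folder):
    import os
    from parsers import CVOutputParser
    iteration = 0
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        ext_file = folder + str(iteration) + '_data_extrapolation.tsv'
        if not os.path.exists(max_ent_file):
            break
        maxent_triples = [t for t, _ in
                          CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file)]
        ext_triples = [t for t, _ in
                       CVOutputParser.read_est_obs_file_disc_version_2(ext_file)]
        assert maxent_triples == ext_triples, 'row order differs in sample %d' % iteration
        iteration += 1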


# def confidence_interval():
    # In some sample, we calculate the % errors.
    # This gives some distribution.
    # Calculate errors as a percentage; currently absolute errors
    # have the issue that big estimates weigh too much.
    # Cross validate the maxent estimate on some subset of triples:
    # how good/bad is this estimate? Confidence interval,
    # i.e. with 95% chance our error mean is only x std. from the true
    # error mean.

    # Can this confidence tell us anything about how well a sample will work,
    # i.e. how good estimates are?

    # Could we calculate this for both max_ent and extrapolation and
    # find some threshold there?

    # Variance in the data. The distribution of the sample has some variance,
    # i.e. frequency of item counts; can this be related to the error?
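
# The confidence_interval helper used in Example #4 below also comes from the
# unshown utils module. A minimal sketch under a normal approximation
# (mean +/- 1.96 standard errors, matching the 95% figure in the notes above):
def confidence_interval_sketch(values):
    import math
    n = len(values)
    mean = sum(values) / float(n)
    # Unbiased sample variance; assumes n >= 2.
    var = sum((x - mean) ** 2 for x in values) / float(n - 1)
    half_width = 1.96 * math.sqrt(var / n)
    return (mean - half_width, mean + half_width)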



# def test_triple_sort():
#     res = (1, 2, 3)
#     assert triple_sort((1, 2, 3)) == res
#     assert triple_sort((1, 3, 2)) == res
#     assert triple_sort((2, 1, 3)) == res
#     assert triple_sort((2, 3, 1)) == res, triple_sort((2, 3, 1))
#     assert triple_sort((3, 2, 1)) == res
#     assert triple_sort((3, 1, 2)) == res

#     res = (1,1,3)
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((3, 1, 1)) == res
#     assert triple_sort((3, 1, 1)) == res

# def test_chunks():
#     l = [1,2,3,4,5,6]
#     i=0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) == 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

#     l = [1,2,3,4,5]
#     i = 0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) <= 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

#     l = [1,2,3,4,5]
#     i = 0
#     for chunk, index, rest in chunks(l, 3):
#         assert len(chunk) <= 3, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

# test_chunks()
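
# The commented tests above pin down the expected behavior of triple_sort and
# chunks; minimal sketches consistent with them (hypothetical, since the real
# implementations are not shown):
def triple_sort_sketch(triple):
    # Canonical ordering so all permutations of a triple compare equal.
    a, b, c = sorted(triple)
    return (a, b, c)

def chunks_sketch(l, n):
    # Yield (chunk, index, rest), where rest is the list with the chunk removed.
    for index, start in enumerate(range(0, len(l), n)):
        chunk = l[start:start + n]
        rest = l[:start] + l[start + n:]
        yield chunk, index, rest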
Example #4
def triple_errors(output_folder, triple):
    """
    Collect maxent and extrapolation errors for a single triple across
    all CV sample outputs, print summary statistics, and plot a
    histogram of the rounded errors.
    """
    from parsers import CVOutputParser
    from utils import avg, confidence_interval
    import os
    # xlabel, ylabel and hist are used below; presumably they come from
    # pylab (matplotlib), so import them explicitly here.
    from pylab import xlabel, ylabel, hist
    if not output_folder.endswith("/"):
        output_folder += "/"

    iteration = -1
    max_ent_errors = []
    ext_errors = []
    max_ent_abs_errors = []
    ext_abs_errors = []
    samples_ignored = 0
    while True:
        iteration += 1
        max_ent_est_file = output_folder + str(iteration) + "_data.tsv"
        ext_est_file = output_folder + str(iteration) + "_data_extrapolation.tsv"
        # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
        # read baseline also?
        # Read until we do not find an output file
        if not os.path.exists(max_ent_est_file):
            break

        # Read the maxent estimate
        found = False
        for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(
            max_ent_est_file
        ):
            (s1, s2, s3, s12, s13, s23, s123) = triangle

            if sample_triple == triple:
                # if s123 == 0:
                #     break
                found = True
                max_ent_errors.append(est - obs)
                max_ent_abs_errors.append(abs(obs - est))
                break

        if not found:
            samples_ignored += 1
            continue

        for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_est_file):
            (s1, s2, s3, s12, s13, s23, s123) = triangle

            if sample_triple == triple:
                ext_errors.append(est - obs)
                ext_abs_errors.append(abs(obs - est))
                break

    # maxent confidence interval
    maxent_ci = confidence_interval(max_ent_errors)
    # extrapolation confidence interval
    ext_ci = confidence_interval(ext_errors)

    print "samples ignored: ", samples_ignored
    print "maxent avg error: ", round(avg(max_ent_errors), 1)
    print "maxent 95% confidence interval: ", (round(maxent_ci[0], 1), round(maxent_ci[1], 2))
    print "extrapolation avg error: ", round(avg(ext_errors), 1)
    print "extrapolation 95% confidence interval: ", (round(ext_ci[0], 1), round(ext_ci[1], 2))

    # round
    max_ent_errors_rounded = [round(x, 1) for x in max_ent_errors]
    ext_errors_rounded = [round(x, 1) for x in ext_errors]

    # plot
    xlabel("Estimate error")
    ylabel("Bucket size")
    # text(0.1, 0.8, 'Maxent')
    # text(0.1, 0.7, 'avg. error: ' + str(avg(max_ent_errors)))
    # text(0.1, 0.6, '95% conf. interval: ' + str(maxent_ci))

    # text(0.5, 0.8, 'Extrapolation')
    # text(0.5, 0.7, 'avg. error: ' + str(avg(ext_errors)))
    # text(0.5, 0.6, '95% conf. interval: ' + str(ext_ci))

    hist([max_ent_errors_rounded, ext_errors_rounded], color=("b", "r"))

    return max_ent_errors, max_ent_abs_errors, ext_errors, ext_abs_errors
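
# Hedged usage sketch tying the four examples together, assuming all four
# functions live in one module. The folder name and the triple are
# hypothetical; show() from pylab displays the histogram drawn by
# triple_errors.
def run_all_examples_sketch(folder):
    from pylab import show
    create_zero_trip_files(folder)                   # Example #1
    avg_error, per_sample = calc_avg_errors(folder)  # Example #2
    merge_sample(folder)                             # Example #3
    triple_errors(folder, (17, 42, 99))              # Example #4, hypothetical triple
    show()  # display the histogram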