Example #1
def create_zero_trip_files(folder):
    """
    Iterate over the numbered .tsv files in a folder and write only the
    estimates whose triple was unobserved in the sample (s123 == 0)
    to a separate file.
    """
    import os
    from parsers import CVOutputParser
    iteration = 0
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        
        if not os.path.exists(max_ent_file):
            break

        # Cleaned file name
        max_ent_zero_trips_file = folder + str(iteration) + '_data_zero_trips.tsv'
        fd = open(max_ent_zero_trips_file, 'w')

        # write header
        fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

        # Keep only rows where the triple was unobserved in the sample
        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file):

            s1, s2, s3, s12, s13, s23, s123 = triangle

            if s123 != 0:
                continue

            row = (est, obs, n1, n2, n3, ratio, s1, s2, s3, s12, s13, s23, s123)
            fd.write('\t'.join(map(str, row)) + '\n')

        fd.close()

        iteration += 1
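
# These examples all depend on CVOutputParser.read_est_obs_file_disc_version_2,
# which is not shown here. Judging from the header written above and how the
# yielded values are unpacked, a minimal sketch could look like the following
# (hypothetical -- the real parser may differ in types and column layout):
def read_est_obs_file_disc_version_2_sketch(path):
    fd = open(path)
    fd.readline()  # skip the header line
    for line in fd:
        cols = line.rstrip('\n').split('\t')
        # Column layout assumed from the header:
        # est obs n1 n2 n3 pair_trip_ratio s1 s2 s3 s12 s13 s23 s123
        est, obs = float(cols[0]), float(cols[1])
        n1, n2, n3 = int(cols[2]), int(cols[3]), int(cols[4])
        ratio = float(cols[5])
        triangle = tuple(float(c) for c in cols[6:13])
        yield (n1, n2, n3), (est, obs, ratio, triangle)
    fd.close()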
Example #2
def calc_avg_errors(output_folder):
    """
    Average error calculation on CV output.
    """
    from parsers import CVOutputParser
    from utils import avg
    import math
    import os
    if not output_folder.endswith('/'):
        output_folder += '/'
    
    # better_than_baseline_file = open('better_than_base_line.tsv', 'w')
    # better_than_baseline_file.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

    # small_error_file = open('small_error.tsv', 'w')
    # small_error_file.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')    
    baseline = 88.5
    iteration = 0
    points_evaluated = 0
    over_estimates = 0
    all_sample_errors = []
    while True:
        tsv_file = output_folder + str(iteration) + '_data_zero_trips.tsv'

        if not os.path.exists(tsv_file):
            break

        sample_errors = []
        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(tsv_file):

            s1, s2, s3, s12, s13, s23, s123 = triangle

            # if int(obs) < 200 or s123 == 0:
            #     continue

            # Heuristic for extrapolation, 200,000 in sample
            # est = min(s12, s13, s23) / 200000. * (21006480-200000)

            points_evaluated += 1
            if est > obs:
                over_estimates += 1

            # if obs > baseline:
            #     if abs(est-obs) < abs(est-baseline):
            #         better_than_baseline_file.write(str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n')

            # Error scaled by sqrt(obs): roughly the error measured in units
            # of a Poisson standard deviation.
            error = abs(est-obs) / math.sqrt(obs)
            # if error < 3:
            #     small_error_file.write(str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n')
            sample_errors.append(error)
        all_sample_errors.append(avg(sample_errors))
        iteration += 1

    # better_than_baseline_file.close()
    # small_error_file.close()

    avg_error = avg(all_sample_errors)
    print 'avg error: ', avg_error
    print 'points evaluated: ', points_evaluated
    print 'over estimates: ', over_estimates
    return avg_error, all_sample_errors
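
# The avg helper is imported from a utils module that is not included; the
# obvious sketch consistent with its use here is an arithmetic mean
# (assumption: it is only called on non-empty lists).
def avg_sketch(values):
    return sum(values) / float(len(values))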
Example #3
def merge_sample(folder):
    """
    Create a single .tsv file with both the maxent and the
    extrapolation results.
    """
    import os
    from parsers import CVOutputParser

    iteration = 0
    maxent_estimates = []
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        
        if not os.path.exists(max_ent_file):
            break

        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file):
            s1, s2, s3, s12, s13, s23, s123 = triangle
            maxent_estimates.append(est)
        iteration += 1
        print 'iteration ', iteration


    # merged file name
    merged_file = folder + 'merged_estimates.tsv'
    fd = open(merged_file, 'w')
    # write header
    fd.write('est\text\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

    iteration = 0
    # Rows are matched by position, so the extrapolation files must list
    # triples in the same order as the maxent files.
    estimate_number = 0
    while True:
        ext_file = folder + str(iteration) + '_data_extrapolation.tsv'
        
        if not os.path.exists(ext_file):
            break

        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_file):
            s1, s2, s3, s12, s13, s23, s123 = triangle
            row = (maxent_estimates[estimate_number], est, obs, n1, n2, n3,
                   ratio, s1, s2, s3, s12, s13, s23, s123)
            fd.write('\t'.join(map(str, row)) + '\n')
            estimate_number += 1

        iteration += 1
        print 'iteration ', iteration
    fd.close()
    print 'merging files done'
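
# Because the merge above pairs rows purely by position (estimate_number),
# a sanity check that the two file sets list the same triples in the same
# order can catch silent misalignment. A hedged sketch:
def check_alignment_sketch(folder):
    import os
    from parsers import CVOutputParser
    iteration = 0
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        ext_file = folder + str(iteration) + '_data_extrapolation.tsv'
        if not os.path.exists(max_ent_file):
            break
        maxent_triples = [t for t, _ in
                          CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file)]
        ext_triples = [t for t, _ in
                       CVOutputParser.read_est_obs_file_disc_version_2(ext_file)]
        assert maxent_triples == ext_triples, 'row order differs in sample %d' % iteration
        iteration += 1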


# def confidence_interval():
    # In some sample, we calculate the % errors.
    # This gives some distribution.
    # Calculate errors as a percentage; currently absolute errors
    # have the issue that big estimates weigh too much.
    # Cross validate the maxent estimate on some subset of triples:
    # how good/bad is this estimate? Confidence interval,
    # i.e. with 95% chance our error mean is only x std. from the true
    # error mean.

    # Can this confidence tell us anything about how well a sample will work,
    # i.e. how good estimates are?

    # Could we calculate this for both max_ent and extrapolation and
    # find some threshold there?

    # Variance in the data. The distribution of the sample has some variance,
    # i.e. frequency of item counts; can this be related to the error?
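
# The confidence_interval helper used in Example #4 below also comes from the
# unshown utils module. A minimal sketch under a normal approximation
# (mean +/- 1.96 standard errors, matching the 95% figure in the notes above):
def confidence_interval_sketch(values):
    import math
    n = len(values)
    mean = sum(values) / float(n)
    # Unbiased sample variance; assumes n >= 2.
    var = sum((x - mean) ** 2 for x in values) / float(n - 1)
    half_width = 1.96 * math.sqrt(var / n)
    return (mean - half_width, mean + half_width)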



# def test_triple_sort():
#     res = (1, 2, 3)
#     assert triple_sort((1, 2, 3)) == res
#     assert triple_sort((1, 3, 2)) == res
#     assert triple_sort((2, 1, 3)) == res
#     assert triple_sort((2, 3, 1)) == res, triple_sort((2, 3, 1))
#     assert triple_sort((3, 2, 1)) == res
#     assert triple_sort((3, 1, 2)) == res

#     res = (1,1,3)
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((3, 1, 1)) == res
#     assert triple_sort((3, 1, 1)) == res

# def test_chunks():
#     l = [1,2,3,4,5,6]
#     i=0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) == 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

#     l = [1,2,3,4,5]
#     i = 0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) <= 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

#     l = [1,2,3,4,5]
#     i = 0
#     for chunk, index, rest in chunks(l, 3):
#         assert len(chunk) <= 3, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

# test_chunks()
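
# The commented tests above pin down the expected behavior of triple_sort and
# chunks; minimal sketches consistent with them (hypothetical, since the real
# implementations are not shown):
def triple_sort_sketch(triple):
    # Canonical ordering so all permutations of a triple compare equal.
    a, b, c = sorted(triple)
    return (a, b, c)

def chunks_sketch(l, n):
    # Yield (chunk, index, rest), where rest is the list with the chunk removed.
    for index, start in enumerate(range(0, len(l), n)):
        chunk = l[start:start + n]
        rest = l[:start] + l[start + n:]
        yield chunk, index, rest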
Example #4
def triple_errors(output_folder, triple):
    """
    Collect maxent and extrapolation errors for a single triple across
    all CV sample outputs, print summary statistics, and plot a
    histogram of the rounded errors.
    """
    from parsers import CVOutputParser
    from utils import avg, confidence_interval
    import os
    # xlabel, ylabel and hist are used below; presumably they come from
    # pylab (matplotlib), so import them explicitly here.
    from pylab import xlabel, ylabel, hist
    if not output_folder.endswith("/"):
        output_folder += "/"

    iteration = -1
    max_ent_errors = []
    ext_errors = []
    max_ent_abs_errors = []
    ext_abs_errors = []
    samples_ignored = 0
    while True:
        iteration += 1
        max_ent_est_file = output_folder + str(iteration) + "_data.tsv"
        ext_est_file = output_folder + str(iteration) + "_data_extrapolation.tsv"
        # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
        # read baseline also?
        # Read until we do not find an output file
        if not os.path.exists(max_ent_est_file):
            break

        # Read the maxent estimate
        found = False
        for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(
            max_ent_est_file
        ):
            (s1, s2, s3, s12, s13, s23, s123) = triangle

            if sample_triple == triple:
                # if s123 == 0:
                #     break
                found = True
                max_ent_errors.append(est - obs)
                max_ent_abs_errors.append(abs(obs - est))
                break

        if not found:
            samples_ignored += 1
            continue

        for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_est_file):
            (s1, s2, s3, s12, s13, s23, s123) = triangle

            if sample_triple == triple:
                ext_errors.append(est - obs)
                ext_abs_errors.append(abs(obs - est))
                break

    # maxent confidence interval
    maxent_ci = confidence_interval(max_ent_errors)
    # extrapolation confidence interval
    ext_ci = confidence_interval(ext_errors)

    print "samples ignored: ", samples_ignored
    print "maxent avg error: ", round(avg(max_ent_errors), 1)
    print "maxent 95% confidence interval: ", (round(maxent_ci[0], 1), round(maxent_ci[1], 2))
    print "extrapolation avg error: ", round(avg(ext_errors), 1)
    print "extrapolation 95% confidence interval: ", (round(ext_ci[0], 1), round(ext_ci[1], 2))

    # round
    max_ent_errors_rounded = [round(x, 1) for x in max_ent_errors]
    ext_errors_rounded = [round(x, 1) for x in ext_errors]

    # plot
    xlabel("Estimate error")
    ylabel("Bucket size")
    # text(0.1, 0.8, 'Maxent')
    # text(0.1, 0.7, 'avg. error: ' + str(avg(max_ent_errors)))
    # text(0.1, 0.6, '95% conf. interval: ' + str(maxent_ci))

    # text(0.5, 0.8, 'Extrapolation')
    # text(0.5, 0.7, 'avg. error: ' + str(avg(ext_errors)))
    # text(0.5, 0.6, '95% conf. interval: ' + str(ext_ci))

    hist([max_ent_errors_rounded, ext_errors_rounded], color=("b", "r"))

    return max_ent_errors, max_ent_abs_errors, ext_errors, ext_abs_errors
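
# Hedged usage sketch tying the four examples together, assuming all four
# functions live in one module. The folder name and the triple are
# hypothetical; show() from pylab displays the histogram drawn by
# triple_errors.
def run_all_examples_sketch(folder):
    from pylab import show
    create_zero_trip_files(folder)                   # Example #1
    avg_error, per_sample = calc_avg_errors(folder)  # Example #2
    merge_sample(folder)                             # Example #3
    triple_errors(folder, (17, 42, 99))              # Example #4, hypothetical triple
    show()  # display the histogram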