Exemplo n.º 1
0
def _estimate_error_ratio(est, ext, obs):
    """
    Ratio of the absolute error of `est` to the absolute error of `ext`,
    both measured against `obs`.

    Returns:
        1.0 when both estimates hit `obs` exactly (a tie),
        float(abs(est - obs)) when abs(ext - obs) is below 1 (the
        denominator is treated as 1 to avoid dividing by a tiny float),
        abs(est - obs) / abs(ext - obs) otherwise.
    """
    est_error = abs(est - obs)
    ext_error = abs(ext - obs)
    # The three cases must be mutually exclusive: a tie also satisfies
    # ext_error < 1 and would otherwise be overwritten with 0.0.
    if est_error == 0 and ext_error == 0:
        return 1.
    if ext_error < 1:
        return float(est_error)
    return est_error / float(ext_error)


def error_ratios_cross_val(output_folder):
    """
    Cross validation on the error ratios to find optimal
    triangle values.

    Reads `merged_estimates.tsv` from `output_folder` through
    CVOutputParser.read_merged_file_disc_version and, for every record,
    adds its error ratio (see _estimate_error_ratio) to every cell of a
    singleton x pair x triple threshold grid that the record satisfies:

        min(s1, s2, s3)    >  singleton threshold
        min(s12, s13, s23) >  pair threshold
        s123               <  triple threshold

    Args:
        output_folder: directory holding `merged_estimates.tsv`; a
            trailing '/' is optional.

    Returns:
        A nested list indexed as [singleton_index][pair_index][triple_index].
        Each cell is a tuple
        (avg_error, count, maxent_best, ext_best, (s, p, t)) where
        `maxent_best` counts records with error ratio < 1, `ext_best`
        counts records with error ratio > 1 and (s, p, t) are the
        thresholds of the cell. Cells that no record matched keep their
        initial value (0, 0, 0, 0, (0, 0, 0)).
    """

    import os

    from parsers import CVOutputParser

    singleton_thresholds = [0, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000]
    pair_thresholds = [0, 1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
    triple_thresholds = [0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250]

    # One cell per threshold combination:
    # (acc_error, count, maxent_best, ext_best, (singleton, pair, triple))
    c = [[[(0, 0, 0, 0, (0, 0, 0)) for _ in triple_thresholds]
          for _ in pair_thresholds]
         for _ in singleton_thresholds]

    # os.path.join copes with both 'folder' and 'folder/' and does not
    # fail on an empty string.
    merged_file = os.path.join(output_folder, 'merged_estimates.tsv')

    records = CVOutputParser.read_merged_file_disc_version(merged_file)
    for iteration, (_key, (est, ext, obs, _ratio, triangle)) in enumerate(records):

        s1, s2, s3, s12, s13, s23, s123 = triangle

        error = _estimate_error_ratio(est, ext, obs)

        # Invariant for the whole threshold sweep of this record.
        min_singleton = min(s1, s2, s3)
        min_pair = min(s12, s13, s23)

        # The singleton and pair thresholds are ascending, so the first
        # threshold the record fails ends that sweep.
        for singleton_index, singleton_threshold in enumerate(singleton_thresholds):
            if not min_singleton > singleton_threshold:
                break
            for pair_index, pair_threshold in enumerate(pair_thresholds):
                if not min_pair > pair_threshold:
                    break
                row = c[singleton_index][pair_index]
                for triple_index, triple_threshold in enumerate(triple_thresholds):
                    # Triple thresholds are upper bounds: skip the ones
                    # that are still too small for this record.
                    if not s123 < triple_threshold:
                        continue
                    acc_error, count, maxent_best, ext_best, _ = row[triple_index]
                    if error < 1:
                        maxent_best += 1
                    elif error > 1:
                        ext_best += 1
                    row[triple_index] = (acc_error + error, count + 1, maxent_best, ext_best,
                                         (singleton_threshold, pair_threshold, triple_threshold))

        if iteration % 1000000 == 0:
            # Single parenthesised expression: same output as the former
            # Python 2 print statement (two spaces included) and also
            # valid Python 3 syntax.
            print('iteration:  %s' % (iteration,))

    # Turn the accumulated errors into averages; untouched cells stay as is.
    for plane in c:
        for row in plane:
            for triple_index, cell in enumerate(row):
                acc_error, count, maxent_best, ext_best, thresholds = cell
                if count > 0:
                    row[triple_index] = (acc_error / float(count), count, maxent_best, ext_best, thresholds)

    return c
Exemplo n.º 2
0
def error_ratios(output_folder, s_min=None, p_min=None, t_max=None, obs_min=None):
    """
    Error ratio against triple count in sample on a CV result.
    Needs the merged_estimates.tsv file that can be created
    with the relevant script in utils.py
    """

    from parsers import CVOutputParser
    from utils import interpolate, avg
    import math
    from collections import Counter
    import os

    if not output_folder[-1] == '/':
        output_folder += '/'

    max_singleton_occurrence = -1
    max_pair_occurrence = -1
    max_triple_occurrence = -1
    #max ent
    occurrence_ratio_errors = [0 for x in range(100000)]
    ratio_errors = []
    occurrences = [0 for x in range(100000)]
    merged_file = output_folder + 'merged_estimates.tsv'
    maxent_errors = []
    ext_errors = []
    iteration = 0
    maxent_was_best_estimates = []
    ext_was_best = []
    for (n1, n2, n3), (est, ext, obs, ratio, triangle) in CVOutputParser.read_merged_file_disc_version(merged_file):

        s1, s2, s3, s12, s13, s23, s123 = triangle

        iteration += 1
        if iteration % 1000000 == 0:
            print 'iteration: ', iteration

        if not s_min is None:
            if not min(s1,s2,s3) > s_min:
                continue
        if not p_min is None:
            if not min(s12,s23,s13) > p_min:
                continue
        if not t_max is None:
            if not s123 < t_max:
                continue

        if not obs_min is None:
            if not obs > obs_min:
                continue



        if max(s1,s2,s3) > max_singleton_occurrence:
            max_singleton_occurrence = max(s1,s2,s3)
        if max(s12,s13,s23) > max_pair_occurrence:
            max_pair_occurrence = max(s12,s13,s23)
        if s123 > max_triple_occurrence:
            max_triple_occurrence = s123

        # get the absolute errors, 
        # if this is below one we 
        # set it to one to avoid problems
        # with dividing with numbers < 1
        abs_ext_obs = abs(ext-obs)
        if abs_ext_obs < 1:
            abs_ext_obs = 1
        abs_est_obs = abs(est-obs)
        if abs_est_obs < 1:
            abs_est_obs = 1

        error = math.log(abs_est_obs / abs_ext_obs)
        ratio_errors.append(error)

        # low max ent estimate, magic numer is the value for the estiamtes
        # when a pair value was 1 for maxent, or 1 for ext
        # if est <= 104.0324:
        if error < 0:
            maxent_was_best_estimates.append(((n1, n2, n3), (est, ext, obs, ratio, triangle)))
        elif error > 0:
            ext_was_best.append(((n1, n2, n3), (est, ext, obs, ratio, triangle)))


        maxent_errors.append(est / float(obs))
        ext_errors.append(ext / float(obs))
        try:
            occurrences[int(obs)] += 1
            occurrence_ratio_errors[int(obs)] += error
        except IndexError, e:
            pass