def error_ratios_cross_val(output_folder):
    """
    Cross validation on the error ratios to find optimal triangle values.

    Reads ``merged_estimates.tsv`` from ``output_folder`` through
    ``CVOutputParser.read_merged_file_disc_version`` and, for every
    combination of singleton / pair / triple threshold, accumulates the
    error ratio ``|est - obs| / |ext - obs|`` over the records that
    satisfy the combination.

    A record with triangle counts ``(s1, s2, s3, s12, s13, s23, s123)``
    satisfies the combination ``(s, p, t)`` when::

        min(s1, s2, s3) > s  and  min(s12, s13, s23) > p  and  s123 < t

    The error ratio of a record is:

    * ``1.0`` when both estimates match the observation exactly (a tie),
    * ``|est - obs|`` when ``|ext - obs| < 1`` (the denominator is treated
      as 1 to avoid dividing by a tiny number),
    * ``|est - obs| / |ext - obs|`` otherwise.

    A ratio below 1 means ``est`` was closer to the observation (counted
    in ``maxent_best``); above 1 means ``ext`` was closer (counted in
    ``ext_best``).

    Args:
        output_folder: directory holding ``merged_estimates.tsv``; a
            trailing ``/`` is optional.

    Returns:
        A nested list ``c[singleton_index][pair_index][triple_index]`` of
        tuples ``(avg_error, count, maxent_best, ext_best, (s, p, t))``.
        Cells that no record satisfied keep the initial value
        ``(0, 0, 0, 0, (0, 0, 0))``.
    """
    import os

    from parsers import CVOutputParser

    singleton_thresholds = [0, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500,
                            600, 700, 800, 900, 1000, 2000, 3000, 4000]
    pair_thresholds = [0, 1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50, 60,
                       70, 80, 90, 100]
    triple_thresholds = [0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80,
                         90, 100, 150, 200, 250]

    # One cell per threshold combination:
    # (acc_error, count, maxent_best, ext_best, (s, p, t)).
    # Tuples are immutable, so sharing the initial value is safe.
    empty_cell = (0, 0, 0, 0, (0, 0, 0))
    c = [[[empty_cell for _ in triple_thresholds]
          for _ in pair_thresholds]
         for _ in singleton_thresholds]

    # os.path.join copes with a missing trailing slash and with an empty
    # folder name (which previously raised IndexError on output_folder[-1]).
    merged_file = os.path.join(output_folder, 'merged_estimates.tsv')
    records = CVOutputParser.read_merged_file_disc_version(merged_file)

    for iteration, (_ids, (est, ext, obs, _ratio, triangle)) in enumerate(records):
        s1, s2, s3, s12, s13, s23, s123 = triangle

        est_error = abs(est - obs)
        ext_error = abs(ext - obs)
        # The three cases must be mutually exclusive: with independent
        # `if` statements the tie value was immediately overwritten by the
        # "small denominator" branch and ties were scored as 0.0.
        if est_error == 0 and ext_error == 0:
            error = 1.0
        elif ext_error < 1:
            error = float(est_error)
        else:
            error = est_error / float(ext_error)

        # Loop-invariant for the threshold scan below.
        min_singleton = min(s1, s2, s3)
        min_pair = min(s12, s13, s23)

        for singleton_index, singleton_threshold in enumerate(singleton_thresholds):
            # Thresholds are ascending: once one fails, all later ones do.
            if min_singleton <= singleton_threshold:
                break
            for pair_index, pair_threshold in enumerate(pair_thresholds):
                if min_pair <= pair_threshold:
                    break
                row = c[singleton_index][pair_index]
                for triple_index, triple_threshold in enumerate(triple_thresholds):
                    # The triple threshold is an upper bound, so small
                    # thresholds are skipped rather than ending the scan.
                    if s123 >= triple_threshold:
                        continue
                    acc_error, count, maxent_best, ext_best, _ = row[triple_index]
                    if error < 1:
                        maxent_best += 1
                    elif error > 1:
                        ext_best += 1
                    row[triple_index] = (
                        acc_error + error,
                        count + 1,
                        maxent_best,
                        ext_best,
                        (singleton_threshold, pair_threshold, triple_threshold),
                    )

        if iteration % 1000000 == 0:
            # Single-argument form: emits the same text as the Python 2
            # statement `print 'iteration: ', iteration` and is also valid
            # Python 3.
            print(' '.join(['iteration: ', str(iteration)]))

    # Turn the accumulated error of every populated cell into an average.
    for plane in c:
        for row in plane:
            for triple_index, cell in enumerate(row):
                acc_error, count, maxent_best, ext_best, thresholds = cell
                if count > 0:
                    row[triple_index] = (acc_error / float(count), count,
                                         maxent_best, ext_best, thresholds)

    return c
def error_ratios(output_folder, s_min=None, p_min=None, t_max=None, obs_min=None): """ Error ratio against triple count in sample on a CV result. Needs the merged_estimates.tsv file that can be created with the relevant script in utils.py """ from parsers import CVOutputParser from utils import interpolate, avg import math from collections import Counter import os if not output_folder[-1] == '/': output_folder += '/' max_singleton_occurrence = -1 max_pair_occurrence = -1 max_triple_occurrence = -1 #max ent occurrence_ratio_errors = [0 for x in range(100000)] ratio_errors = [] occurrences = [0 for x in range(100000)] merged_file = output_folder + 'merged_estimates.tsv' maxent_errors = [] ext_errors = [] iteration = 0 maxent_was_best_estimates = [] ext_was_best = [] for (n1, n2, n3), (est, ext, obs, ratio, triangle) in CVOutputParser.read_merged_file_disc_version(merged_file): s1, s2, s3, s12, s13, s23, s123 = triangle iteration += 1 if iteration % 1000000 == 0: print 'iteration: ', iteration if not s_min is None: if not min(s1,s2,s3) > s_min: continue if not p_min is None: if not min(s12,s23,s13) > p_min: continue if not t_max is None: if not s123 < t_max: continue if not obs_min is None: if not obs > obs_min: continue if max(s1,s2,s3) > max_singleton_occurrence: max_singleton_occurrence = max(s1,s2,s3) if max(s12,s13,s23) > max_pair_occurrence: max_pair_occurrence = max(s12,s13,s23) if s123 > max_triple_occurrence: max_triple_occurrence = s123 # get the absolute errors, # if this is below one we # set it to one to avoid problems # with dividing with numbers < 1 abs_ext_obs = abs(ext-obs) if abs_ext_obs < 1: abs_ext_obs = 1 abs_est_obs = abs(est-obs) if abs_est_obs < 1: abs_est_obs = 1 error = math.log(abs_est_obs / abs_ext_obs) ratio_errors.append(error) # low max ent estimate, magic numer is the value for the estiamtes # when a pair value was 1 for maxent, or 1 for ext # if est <= 104.0324: if error < 0: maxent_was_best_estimates.append(((n1, n2, n3), (est, ext, 
obs, ratio, triangle))) elif error > 0: ext_was_best.append(((n1, n2, n3), (est, ext, obs, ratio, triangle))) maxent_errors.append(est / float(obs)) ext_errors.append(ext / float(obs)) try: occurrences[int(obs)] += 1 occurrence_ratio_errors[int(obs)] += error except IndexError, e: pass