def create_zero_trip_files(folder):
    """
    Filter CV output files down to the rows whose triple was unseen.

    Iterates over each <i>_data.tsv file in *folder* and writes only the
    rows where the triple count in the sample (s123) was 0 to a sibling
    <i>_data_zero_trips.tsv file, preserving the original column layout.
    """
    import os
    from parsers import CVOutputParser

    iteration = 0
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        # Stop when the next numbered sample file does not exist.
        if not os.path.exists(max_ent_file):
            break
        # Cleaned file name
        max_ent_zero_trips_file = folder + str(iteration) + '_data_zero_trips.tsv'
        # 'with' guarantees the file is closed even if parsing raises
        # (the original left the handle open on error).
        with open(max_ent_zero_trips_file, 'w') as fd:
            # write header
            fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
            for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file):
                s1, s2, s3, s12, s13, s23, s123 = triangle
                # Keep only rows where the triple did not occur in the sample.
                if s123 != 0:
                    continue
                row = (est, obs, n1, n2, n3, ratio, s1, s2, s3, s12, s13, s23, s123)
                fd.write('\t'.join(str(v) for v in row) + '\n')
        iteration += 1
def calc_avg_errors(output_folder): from parsers import CVOutputParser from utils import interpolate, avg import math from collections import Counter import os """ Average error calculation on CV output. """ if not output_folder[-1] == '/': output_folder += '/' # better_than_baseline_file = open('better_than_base_line.tsv', 'w') # better_than_baseline_file.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n') # small_error_file = open('small_error.tsv', 'w') # small_error_file.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n') baseline = 88.5 iteration = 0 points_evaluated = 0 over_estimates = 0 all_sample_errors = [] while True: tsv_file = output_folder + str(iteration) + '_data_zero_trips.tsv' if not os.path.exists(tsv_file): break sample_erros = [] for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(tsv_file): s1, s2, s3, s12, s13, s23, s123 = triangle # if int(obs) < 200 or s123 == 0: # continue # Heurestiv for extrapolation, 200000 in sample # est = min(s12, s13, s23) / 200000. 
* (21006480-200000) points_evaluated += 1 if est > obs: over_estimates += 1 # if obs > baseline: # if abs(est-obs) < abs(est-baseline): # better_than_baseline_file.write(str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n') error = abs(est-obs) / math.sqrt(obs) # if error < 3: # small_error_file.write(str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n') sample_erros.append(error) all_sample_errors.append(avg(sample_erros)) iteration += 1 # better_than_baseline_file.close() # small_error_file.close() avg_error = avg(all_sample_errors) print 'avg_error ', avg_error print 'points evaluated', points_evaluated print 'over estimates: ', over_estimates return avg_error, all_sample_errors
def merge_sample(folder): import os from parsers import CVOutputParser """ Creates a single a single .tsv file with maxent and extrapolation results. """ iteration = 0 maxent_estimates = [] while True: max_ent_file = folder + str(iteration) + '_data.tsv' if not os.path.exists(max_ent_file): break for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file): s1, s2, s3, s12, s13, s23, s123 = triangle maxent_estimates.append(est) iteration += 1 print 'iteration ', iteration # merged file name merged_file = folder + 'merged_estimates.tsv' fd = open(merged_file, 'wr') # write header fd.write('est\text\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n') iteration = 0 estimate_number = 0 while True: ext_file = folder + str(iteration) + '_data_extrapolation.tsv' if not os.path.exists(ext_file): break for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_file): s1, s2, s3, s12, s13, s23, s123 = triangle fd.write(str(maxent_estimates[estimate_number]) + '\t' + str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n') estimate_number += 1 iteration += 1 print 'iteration ', iteration fd.close() print 'merging files done' # def confidence_interval(): # in some sample, we calculate the % errors. # This gives some distribution # calculate errors as a percentage, currently absolute errors # have the issue of big estimates weigh too much # cross validate max ent estimate on som subset of triplets # how good/bad is this estiamte? Confidence interval, # ie with 95% percent change our error mean is only x std. from true # error mean. # Can this confidence tell us anything about how well a sample will work # ie how good estiamtes are? 
# Could we calculate this for both max_ent and extrapolation and
# find some threshold there?
# Variance in the data. The distribution of the sample has some variance,
# i.e. frequency of item counts; can this be related to the error?

# def test_triple_sort():
#     res = (1, 2, 3)
#     assert triple_sort((1, 2, 3)) == res
#     assert triple_sort((1, 3, 2)) == res
#     assert triple_sort((2, 1, 3)) == res
#     assert triple_sort((2, 3, 1)) == res, triple_sort((2, 3, 1))
#     assert triple_sort((3, 2, 1)) == res
#     assert triple_sort((3, 1, 2)) == res
#     res = (1, 1, 3)
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((3, 1, 1)) == res
#     assert triple_sort((3, 1, 1)) == res

# def test_chunks():
#     l = [1, 2, 3, 4, 5, 6]
#     i = 0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) == 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1
#     l = [1, 2, 3, 4, 5]
#     i = 0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) <= 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1
#     l = [1, 2, 3, 4, 5]
#     i = 0
#     for chunk, index, rest in chunks(l, 3):
#         assert len(chunk) <= 3, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1
# test_chunks()
def triple_errors(output_folder, triple): from parsers import CVOutputParser from utils import interpolate, avg, confidence_interval import math from collections import Counter import os """ Plot accumulated errors for estimators against pair triple ratios. Ratios are binned in the range 0.0 to 1.0. """ if not output_folder[-1] == "/": output_folder += "/" iteration = -1 max_ent_errors = [] ext_errors = [] max_ent_abs_errors = [] ext_abs_errors = [] samples_ignored = 0 while True: iteration += 1 max_ent_est_file = output_folder + str(iteration) + "_data.tsv" ext_est_file = output_folder + str(iteration) + "_data_extrapolation.tsv" # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv' # read baseline also? # Read until we do not find an output file if not os.path.exists(max_ent_est_file): break # Read the maxent estimate found = False for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2( max_ent_est_file ): (s1, s2, s3, s12, s13, s23, s123) = triangle if sample_triple == triple: # if s123 == 0: # break found = True max_ent_errors.append(est - obs) max_ent_abs_errors.append(abs(obs - est)) break if not found: samples_ignored += 1 continue for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_est_file): (s1, s2, s3, s12, s13, s23, s123) = triangle if sample_triple == triple: ext_errors.append(est - obs) ext_abs_errors.append(abs(obs - est)) break # maxent confidence interval maxent_ci = confidence_interval(max_ent_errors) # extrapolation confidence interval ext_ci = confidence_interval(ext_errors) print "samples ignored: ", samples_ignored print "maxent avg error: ", round(avg(max_ent_errors), 1) print "maxent 95% confidence interval: ", (round(maxent_ci[0], 1), round(maxent_ci[1], 2)) print "extrapolation avg error: ", round(avg(ext_errors), 1) print "extrapolation 95% confidence interval: ", (round(ext_ci[0], 1), round(ext_ci[1], 2)) # round 
max_ent_errors_rounded = [round(x, 1) for x in max_ent_errors] ext_errors_rounded = [round(x, 1) for x in ext_errors] # plot xlabel("Estimate error") ylabel("Bucket size") # text(0.1, 0.8, 'Maxent') # text(0.1, 0.7, 'avg. error: ' + str(avg(max_ent_errors))) # text(0.1, 0.6, '95% conf. interval: ' + str(maxent_ci)) # text(0.5, 0.8, 'Extrapolation') # text(0.5, 0.7, 'avg. error: ' + str(avg(ext_errors))) # text(0.5, 0.6, '95% conf. interval: ' + str(ext_ci)) hist([max_ent_errors_rounded, ext_errors_rounded], color=("b", "r")) return max_ent_errors, max_ent_abs_errors, ext_errors, ext_abs_errors