def create_zero_trip_files(folder):
    """
    Write, per CV sample file, the rows whose sample triple count is zero.

    Files named ``<folder><i>_data.tsv`` are processed for i = 0, 1, 2, ...
    until the first index for which no such file exists.  For each of them
    a sibling ``<folder><i>_data_zero_trips.tsv`` is (re)written with a
    header line followed by every row for which ``s123 == 0``.

    ``folder`` is used as a plain string prefix of the file names, so it
    has to end with a path separator already.
    """
    import os
    from parsers import CVOutputParser

    header = 'est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n'

    iteration = 0
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        if not os.path.exists(max_ent_file):
            break

        # Cleaned file name
        max_ent_zero_trips_file = folder + str(iteration) + '_data_zero_trips.tsv'

        # The context manager closes the output even if parsing fails
        # half way through the input file.
        with open(max_ent_zero_trips_file, 'w') as fd:
            fd.write(header)
            rows = CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file)
            for (n1, n2, n3), (est, obs, ratio, triangle) in rows:
                s1, s2, s3, s12, s13, s23, s123 = triangle
                # Keep only triples that never occurred in the sample.
                if s123 != 0:
                    continue
                fields = (est, obs, n1, n2, n3, ratio,
                          s1, s2, s3, s12, s13, s23, s123)
                fd.write('\t'.join(str(field) for field in fields) + '\n')

        iteration += 1
def error_averages_for_triple_counts(output_folder):
    """
    Accumulate scaled errors per observed triple count over a CV result.

    Reads ``<output_folder>/<i>_data.tsv`` for i = 0, 1, 2, ... until the
    first missing file.  Every row is bucketed by ``int(obs)``; the bucket
    receives the max-ent error ``|est - obs| / sqrt(obs)``, the error of a
    constant baseline estimate (88.5) scaled the same way, and its
    occurrence counter is incremented.  Progress is printed per file.

    NOTE(review): in this section the accumulators are only filled, never
    returned or plotted, and ``interpolate``, ``avg`` and ``Counter`` are
    imported without being used -- the tail of this function may be
    missing from this part of the file; confirm before relying on it.
    """
    from parsers import CVOutputParser
    from utils import interpolate, avg
    import math
    from collections import Counter
    import os

    if not output_folder[-1] == '/':
        output_folder += '/'

    baseline = 88.5
    bucket_count = 100000

    # max ent
    max_ent_acc_errors = [0] * bucket_count
    baseline_acc_errors = [0] * bucket_count
    occurrences = [0] * bucket_count

    iteration = 0
    while True:
        tsv_file = output_folder + str(iteration) + '_data.tsv'
        if not os.path.exists(tsv_file):
            break

        rows = CVOutputParser.read_est_obs_file_disc_version(tsv_file)
        for (est, obs, ratio, triangle) in rows:
            bucket = int(obs)
            try:
                occurrences[bucket] += 1
            except IndexError:
                # Observed counts beyond the bucket range are deliberately
                # left out of the statistics.
                continue
            # All three lists have the same length, so once the lookup
            # above succeeded these cannot raise IndexError.
            scale = math.sqrt(obs)
            max_ent_acc_errors[bucket] += abs(est - obs) / scale
            baseline_acc_errors[bucket] += abs(baseline - obs) / scale

        iteration += 1
        # Single-argument print: same output on Python 2 and Python 3.
        print('iteration:  {}'.format(iteration))
def calc_avg_errors(output_folder):
    """
    Average the scaled estimate error over the zero-triple CV files.

    Reads ``<output_folder>/<i>_data_zero_trips.tsv`` for i = 0, 1, 2, ...
    until the first missing file.  The error of a row is
    ``|est - obs| / sqrt(obs)``; every file contributes the average of its
    row errors.

    Prints the overall average, the number of rows evaluated and the
    number of rows where the estimate exceeded the observation.

    Returns a tuple ``(avg_error, all_sample_errors)``: the average of the
    per-file averages, and the list of per-file averages.
    """
    from parsers import CVOutputParser
    from utils import avg
    import math
    import os

    if not output_folder[-1] == '/':
        output_folder += '/'

    # Disabled experiments kept from the original for reference:
    #   - skip rows with int(obs) < 200 or s123 == 0
    #   - extrapolation heuristic for a sample of 200000:
    #       est = min(s12, s13, s23) / 200000. * (21006480-200000)
    #   - dumping rows that beat the constant baseline (88.5) to
    #     'better_than_base_line.tsv' and rows with error < 3 to
    #     'small_error.tsv'

    iteration = 0
    points_evaluated = 0
    over_estimates = 0
    all_sample_errors = []

    while True:
        tsv_file = output_folder + str(iteration) + '_data_zero_trips.tsv'
        if not os.path.exists(tsv_file):
            break

        sample_errors = []
        rows = CVOutputParser.read_est_obs_file_disc_version_2(tsv_file)
        for (n1, n2, n3), (est, obs, ratio, triangle) in rows:
            points_evaluated += 1
            if est > obs:
                over_estimates += 1
            sample_errors.append(abs(est - obs) / math.sqrt(obs))

        all_sample_errors.append(avg(sample_errors))
        iteration += 1

    avg_error = avg(all_sample_errors)
    # Single-argument prints: same output on Python 2 and Python 3.
    print('avg_error  {}'.format(avg_error))
    print('points evaluated {}'.format(points_evaluated))
    print('over estimates:  {}'.format(over_estimates))
    return avg_error, all_sample_errors
except IndexError, e: pass iteration += 1 print 'iteration: ', iteration # extrapolation ext_acc_errors = [0 for x in range(100000)] iteration = 0 while True: tsv_file = output_folder + str(iteration) + '_data_extrapolation.tsv' if not os.path.exists(tsv_file): break for (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version(tsv_file): s1, s2, s3, s12, s13, s23, s123 = triangle # if obs < 200: # continue try: ext_acc_errors[int(obs)] += abs(est-obs) / math.sqrt(obs) except IndexError, e: pass iteration += 1 print 'iteration: ', iteration for i, count in enumerate(occurrences):
def merge_sample(folder):
    """
    Create a single .tsv file with both max-ent and extrapolation results.

    First every ``<folder><i>_data.tsv`` (i = 0, 1, 2, ... until a file is
    missing) is read and its estimates are collected in order.  Then every
    ``<folder><i>_data_extrapolation.tsv`` is read the same way and each of
    its rows is written to ``<folder>merged_estimates.tsv`` together with
    the max-ent estimate at the same running position.  Rows are paired
    purely by position, so both file series must list the same triples in
    the same order.

    ``folder`` is used as a plain string prefix of the file names, so it
    has to end with a path separator already.
    """
    import os
    from parsers import CVOutputParser

    read_rows = CVOutputParser.read_est_obs_file_disc_version_2

    maxent_estimates = []
    iteration = 0
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        if not os.path.exists(max_ent_file):
            break
        for (n1, n2, n3), (est, obs, ratio, triangle) in read_rows(max_ent_file):
            maxent_estimates.append(est)
        iteration += 1
        # Single-argument print: same output on Python 2 and Python 3.
        print('iteration  {}'.format(iteration))

    # merged file name
    merged_file = folder + 'merged_estimates.tsv'

    # 'w' instead of the original 'wr': 'wr' is not a valid mode string and
    # is rejected outright by Python 3.  The context manager closes the
    # file even when a row cannot be paired.
    with open(merged_file, 'w') as fd:
        # write header
        fd.write('est\text\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

        iteration = 0
        estimate_number = 0
        while True:
            ext_file = folder + str(iteration) + '_data_extrapolation.tsv'
            if not os.path.exists(ext_file):
                break
            for (n1, n2, n3), (est, obs, ratio, triangle) in read_rows(ext_file):
                s1, s2, s3, s12, s13, s23, s123 = triangle
                fields = (maxent_estimates[estimate_number], est, obs,
                          n1, n2, n3, ratio,
                          s1, s2, s3, s12, s13, s23, s123)
                fd.write('\t'.join(str(field) for field in fields) + '\n')
                estimate_number += 1
            iteration += 1
            print('iteration  {}'.format(iteration))

    print('merging files done')


# def confidence_interval():
    # in some sample, we calculate the % errors.
    # This gives some distribution

# calculate errors as a percentage, currently absolute errors
# have the issue of big estimates weigh too much

# cross validate max ent estimate on some subset of triplets
# how good/bad is this estimate? Confidence interval,
# ie with 95% percent chance our error mean is only x std. from true
# error mean.
# Can this confidence tell us anything about how well a sample will work
# ie how good estimates are?
# Could we calculate this for both max_ent and extrapolation and
# find some threshold there?

# Variance in the data. The distribution of the sample has some variance
# ie frequency on item counts, can this be related to the error?


# def test_triple_sort():
#     res = (1, 2, 3)
#     assert triple_sort((1, 2, 3)) == res
#     assert triple_sort((1, 3, 2)) == res
#     assert triple_sort((2, 1, 3)) == res
#     assert triple_sort((2, 3, 1)) == res, triple_sort((2, 3, 1))
#     assert triple_sort((3, 2, 1)) == res
#     assert triple_sort((3, 1, 2)) == res
#     res = (1,1,3)
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((3, 1, 1)) == res
#     assert triple_sort((3, 1, 1)) == res

# def test_chunks():
#     l = [1,2,3,4,5,6]
#     i=0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) == 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

#     l = [1,2,3,4,5]
#     i = 0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) <= 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

#     l = [1,2,3,4,5]
#     i = 0
#     for chunk, index, rest in chunks(l, 3):
#         assert len(chunk) <= 3, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

# test_chunks()
def triple_errors(output_folder, triple):
    """
    Collect and histogram the estimate errors of one triple over CV samples.

    For i = 0, 1, 2, ... (until ``<output_folder>/<i>_data.tsv`` is missing)
    the first row for ``triple`` is looked up in the max-ent file.  If the
    sample does not contain the triple it is counted as ignored and the
    extrapolation file of that sample is not read.  Otherwise the signed
    and absolute max-ent errors are recorded and the same lookup is done in
    ``<i>_data_extrapolation.tsv``.

    Prints the number of ignored samples plus, for each estimator, the
    average signed error and the result of ``utils.confidence_interval``,
    then draws a histogram of the signed errors rounded to one decimal.
    ``xlabel``, ``ylabel`` and ``hist`` are taken from module scope.

    Returns ``(max_ent_errors, max_ent_abs_errors, ext_errors,
    ext_abs_errors)``; the signed errors are ``est - obs``.
    """
    from parsers import CVOutputParser
    from utils import avg, confidence_interval
    import os

    if not output_folder[-1] == "/":
        output_folder += "/"

    def _first_match(path):
        # (est, obs) of the first row for `triple` in `path`, else None.
        rows = CVOutputParser.read_est_obs_file_disc_version_2(path)
        for sample_triple, (est, obs, ratio, triangle) in rows:
            if sample_triple == triple:
                return est, obs
        return None

    max_ent_errors = []
    ext_errors = []
    max_ent_abs_errors = []
    ext_abs_errors = []
    samples_ignored = 0

    iteration = 0
    while True:
        max_ent_est_file = output_folder + str(iteration) + "_data.tsv"
        ext_est_file = output_folder + str(iteration) + "_data_extrapolation.tsv"
        iteration += 1

        # Read until we do not find an output file
        if not os.path.exists(max_ent_est_file):
            break

        # Read the maxent estimate
        match = _first_match(max_ent_est_file)
        if match is None:
            samples_ignored += 1
            continue
        est, obs = match
        max_ent_errors.append(est - obs)
        max_ent_abs_errors.append(abs(obs - est))

        # Read the extrapolation estimate of the same sample
        match = _first_match(ext_est_file)
        if match is not None:
            est, obs = match
            ext_errors.append(est - obs)
            ext_abs_errors.append(abs(obs - est))

    # maxent confidence interval
    maxent_ci = confidence_interval(max_ent_errors)
    # extrapolation confidence interval
    ext_ci = confidence_interval(ext_errors)

    # Single-argument prints: same output on Python 2 and Python 3.
    print("samples ignored:  {}".format(samples_ignored))
    print("maxent avg error:  {}".format(round(avg(max_ent_errors), 1)))
    print("maxent 95% confidence interval:  {}".format(
        (round(maxent_ci[0], 1), round(maxent_ci[1], 2))))
    print("extrapolation avg error:  {}".format(round(avg(ext_errors), 1)))
    print("extrapolation 95% confidence interval:  {}".format(
        (round(ext_ci[0], 1), round(ext_ci[1], 2))))

    # round
    max_ent_errors_rounded = [round(x, 1) for x in max_ent_errors]
    ext_errors_rounded = [round(x, 1) for x in ext_errors]

    # plot
    xlabel("Estimate error")
    ylabel("Bucket size")
    hist([max_ent_errors_rounded, ext_errors_rounded], color=("b", "r"))

    return max_ent_errors, max_ent_abs_errors, ext_errors, ext_abs_errors
def plot_intervals(output_folder):
    """
    Plot the average estimate error per triple interval of a CV output.

    The triple intervals come from ``Preprocessor.triple_intervals`` on
    ``<output_folder>/observed_frequent_items.out`` (30 intervals).  For
    every interval, every sample ``<i>_data.tsv`` /
    ``<i>_data_extrapolation.tsv`` (i = 0, 1, 2, ... until the max-ent file
    is missing) is parsed; for each triple of the interval that has a
    max-ent estimate the absolute error ``|obs - est|`` of both estimators
    is recorded.  The per-sample averages are averaged per interval and the
    two resulting series are plotted (max-ent blue, extrapolation red) with
    the module-level ``plot``.

    NOTE(review): the original had the error recording commented out, so
    every plotted value was an average over an empty list.  It is restored
    here in the absolute (non-percentage) form the disabled code last used;
    confirm this is the intended metric.

    NOTE(review): the per-ratio accumulators are filled but not consumed by
    this function.
    """
    from parsers import CVOutputParser
    from preprocessing import Preprocessor
    from utils import avg
    import os

    if not output_folder[-1] == '/':
        output_folder += '/'

    intervals = 30
    triple_intervals = Preprocessor.triple_intervals(
        output_folder + 'observed_frequent_items.out', intervals=intervals)

    avg_max_ent_errors = []
    avg_ext_errors = []

    # binned ratios [0.0 to 1.0]
    pair_triple_ratios = [i/10. for i in range(11)]
    max_ent_ratio_error = [0 for i in range(11)]
    ext_ratio_error = [0 for i in range(11)]

    for index, triple_interval in enumerate(triple_intervals):
        print('Triple interval {} of {}'.format(index, intervals))
        iteration = 0
        MAPE_avg_errors = []
        MAPE_avg_errors_ext = []
        while True:
            max_ent_est_file = output_folder + str(iteration) + '_data.tsv'
            ext_est_file = output_folder + str(iteration) + '_data_extrapolation.tsv'

            # Read until we do not find an output file
            if not os.path.exists(max_ent_est_file):
                break

            max_ent_est = CVOutputParser.read_est_obs_file(max_ent_est_file)
            ext_est = CVOutputParser.read_est_obs_file(ext_est_file)

            MAPE_errors = []
            MAPE_errors_ext = []

            for triple in triple_interval:
                # Check that the triple has been estimated
                if triple not in max_ent_est:
                    continue

                # Index 0 is the estimate, index 1 the observed value and
                # index 2 the pair triple ratio; the latter two are read
                # from the max-ent record only.
                max_ent_record = max_ent_est[triple]
                est = max_ent_record[0]
                obs = max_ent_record[1]
                ratio = max_ent_record[2]
                # extrapolation estimate
                est2 = ext_est[triple][0]

                # Bin the ratio to one decimal and add the relative errors
                # to that bin.  A ratio that rounds outside 0.0..1.0 has no
                # bin and is left out of the ratio statistics, as in
                # plot_ratios.
                try:
                    bin_index = pair_triple_ratios.index(round(ratio, 1))
                except ValueError:
                    bin_index = None
                if bin_index is not None:
                    max_ent_ratio_error[bin_index] += abs(est-obs) / float(obs)
                    ext_ratio_error[bin_index] += abs(est2-obs) / float(obs)

                # absolute error per estimator
                MAPE_errors.append(abs(obs-est))
                MAPE_errors_ext.append(abs(obs-est2))

            # A sample without any estimated triple of this interval has
            # nothing to contribute to the interval average.
            if MAPE_errors:
                MAPE_avg_errors.append(avg(MAPE_errors))
                MAPE_avg_errors_ext.append(avg(MAPE_errors_ext))
            iteration += 1

        avg_max_ent_errors.append(avg(MAPE_avg_errors))
        avg_ext_errors.append(avg(MAPE_avg_errors_ext))

    plot(range(len(avg_max_ent_errors)), avg_max_ent_errors, color='blue')
    plot(range(len(avg_ext_errors)), avg_ext_errors, color='red')
def plot_ratios(output_folder):
    """
    Accumulate estimator errors against binned pair triple ratios.

    Reads ``<output_folder>/<i>_data.tsv`` and
    ``<i>_data_extrapolation.tsv`` for i = 0, 1, 2, ... until the max-ent
    file is missing.  For every triple of the max-ent file the sample pair
    and triple counts are tallied; triples with ``obs < 200`` are then
    skipped.  The remaining ones are binned by their pair triple ratio
    rounded to one decimal (bins 0.0 to 1.0), and per bin the scaled errors
    ``|est - obs| / sqrt(obs)`` of both estimators are summed and the
    number of times each estimator was strictly better is counted.  Ratios
    that round outside the bins are left out.

    NOTE(review): in this section nothing is plotted or returned and the
    original loop never advanced ``iteration`` (it would re-read sample 0
    forever); the increment is added here, but the tail of the function
    may be missing from this part of the file -- confirm against the full
    source.
    """
    from parsers import CVOutputParser
    from utils import interpolate
    import math
    from collections import Counter
    import os

    if not output_folder[-1] == '/':
        output_folder += '/'

    pair_triple_ratios = [i/10. for i in range(11)]
    max_ent_ratio_error = [0 for i in range(11)]
    ext_ratio_error = [0 for i in range(11)]
    maxent_better_ratio = [0 for i in range(11)]
    ext_better_ratio = [0 for i in range(11)]

    values_binned = 0
    values_ignored = 0
    pair_counts = Counter()
    trip_counts = Counter()

    iteration = 0
    while True:
        max_ent_est_file = output_folder + str(iteration) + '_data.tsv'
        ext_est_file = output_folder + str(iteration) + '_data_extrapolation.tsv'

        # Read until we do not find an output file
        if not os.path.exists(max_ent_est_file):
            break

        max_ent_est = CVOutputParser.read_est_obs_file(max_ent_est_file)
        ext_est = CVOutputParser.read_est_obs_file(ext_est_file)

        for triple in max_ent_est.keys():
            max_ent_record = max_ent_est[triple]

            (s1, s2, s3, s12, s13, s23, s123) = max_ent_record[3]
            pair_counts[s12] += 1
            pair_counts[s13] += 1
            pair_counts[s23] += 1
            trip_counts[s123] += 1

            # Index 1 holds the observed value; it is the same for every
            # estimator, so it is read from the max-ent record only.
            obs = max_ent_record[1]
            if obs < 200:
                continue

            # maxent estimate
            est = max_ent_record[0]
            # extrapolation estimate
            est2 = ext_est[triple][0]

            # Index 2 holds the pair triple ratio; bin it to one decimal.
            ratio_binned = round(max_ent_record[2], 1)

            maxent_error = abs(est-obs)/math.sqrt(obs)
            ext_error = abs(est2-obs)/math.sqrt(obs)

            try:
                bin_index = pair_triple_ratios.index(ratio_binned)
            except ValueError:
                # The ratio rounds to a value outside 0.0..1.0: there is
                # no bin for it, so it is not counted as binned either.
                continue

            # Record the ratio bin for whichever estimator was better
            if maxent_error < ext_error:
                maxent_better_ratio[bin_index] += 1
            elif maxent_error > ext_error:
                ext_better_ratio[bin_index] += 1

            # add errors to the ratio bin
            values_binned += 1
            max_ent_ratio_error[bin_index] += maxent_error
            ext_ratio_error[bin_index] += ext_error

        # Move on to the next sample file.
        iteration += 1
def error_ratios_cross_val(output_folder):
    """
    Cross validation on the error ratios to find optimal triangle values.

    Reads ``<output_folder>/merged_estimates.tsv`` and, for every row,
    computes the ratio between the max-ent and the extrapolation error:

    * both estimators exact         -> 1.0
    * extrapolation error below 1   -> the max-ent error itself (the
      denominator is treated as 1 to avoid dividing by a tiny float)
    * otherwise                     -> ``|est - obs| / |ext - obs|``

    The ratio is added to every cell of a singleton x pair x triple
    threshold grid whose thresholds the row satisfies: smallest singleton
    count above the singleton threshold, smallest pair count above the
    pair threshold and triple count below the triple threshold.

    Returns the grid as nested lists indexed
    ``[singleton_index][pair_index][triple_index]``.  Each cell is a tuple
    ``(error, count, maxent_best, ext_best, (s, p, t))`` where ``error`` is
    the average ratio of the rows in the cell, ``maxent_best`` /
    ``ext_best`` count the rows with ratio below / above 1 and
    ``(s, p, t)`` are the cell's thresholds.  Cells no row fell into stay
    ``(0, 0, 0, 0, (0, 0, 0))``.
    """
    from parsers import CVOutputParser

    if not output_folder[-1] == '/':
        output_folder += '/'

    singleton_thresholds = [0, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000]
    pair_thresholds = [0, 1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
    triple_thresholds = [0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250]

    # acc_error, count, maxent_best, ext_best, thresholds
    c = [[[(0, 0, 0, 0, (0, 0, 0)) for z in range(len(triple_thresholds))]
          for y in range(len(pair_thresholds))]
         for x in range(len(singleton_thresholds))]

    merged_file = output_folder + 'merged_estimates.tsv'
    rows = CVOutputParser.read_merged_file_disc_version(merged_file)

    iteration = 0
    for (n1, n2, n3), (est, ext, obs, ratio, triangle) in rows:
        s1, s2, s3, s12, s13, s23, s123 = triangle

        maxent_abs_error = abs(est-obs)
        ext_abs_error = abs(ext-obs)

        # This has to be one if/elif chain: with two independent `if`s the
        # "both spot on" result was overwritten by the next branch and such
        # rows were wrongly counted as wins for max-ent.
        if ext_abs_error == 0 and maxent_abs_error == 0:
            error = 1.
        elif ext_abs_error < 1:
            error = float(maxent_abs_error)
        else:
            error = maxent_abs_error / float(ext_abs_error)

        min_singleton = min(s1, s2, s3)
        min_pair = min(s12, s13, s23)

        # The threshold lists are ascending, so the first singleton / pair
        # threshold that is not exceeded ends the respective loop.
        for singleton_index, singleton_threshold in enumerate(singleton_thresholds):
            if not min_singleton > singleton_threshold:
                break
            for pair_index, pair_threshold in enumerate(pair_thresholds):
                if not min_pair > pair_threshold:
                    break
                cells = c[singleton_index][pair_index]
                for triple_index, triple_threshold in enumerate(triple_thresholds):
                    if not s123 < triple_threshold:
                        continue
                    acc_error, count, maxent_best, ext_best, _ = cells[triple_index]
                    if error < 1:
                        maxent_best += 1
                    elif error > 1:
                        ext_best += 1
                    cells[triple_index] = (
                        acc_error + error, count + 1, maxent_best, ext_best,
                        (singleton_threshold, pair_threshold, triple_threshold))

        if iteration % 1000000 == 0:
            # Single-argument print: same output on Python 2 and Python 3.
            print('iteration:  {}'.format(iteration))
        iteration += 1

    # Turn the accumulated errors into averages
    for plane in c:
        for cells in plane:
            for triple_index, cell in enumerate(cells):
                acc_error, count, maxent_best, ext_best, thresholds = cell
                if count > 0:
                    cells[triple_index] = (acc_error / float(count), count,
                                           maxent_best, ext_best, thresholds)

    return c
def error_ratios(output_folder, s_min=None, p_min=None, t_max=None, obs_min=None): """ Error ratio against triple count in sample on a CV result. Needs the merged_estimates.tsv file that can be created with the relevant script in utils.py """ from parsers import CVOutputParser from utils import interpolate, avg import math from collections import Counter import os if not output_folder[-1] == '/': output_folder += '/' max_singleton_occurrence = -1 max_pair_occurrence = -1 max_triple_occurrence = -1 #max ent occurrence_ratio_errors = [0 for x in range(100000)] ratio_errors = [] occurrences = [0 for x in range(100000)] merged_file = output_folder + 'merged_estimates.tsv' maxent_errors = [] ext_errors = [] iteration = 0 maxent_was_best_estimates = [] ext_was_best = [] for (n1, n2, n3), (est, ext, obs, ratio, triangle) in CVOutputParser.read_merged_file_disc_version(merged_file): s1, s2, s3, s12, s13, s23, s123 = triangle iteration += 1 if iteration % 1000000 == 0: print 'iteration: ', iteration if not s_min is None: if not min(s1,s2,s3) > s_min: continue if not p_min is None: if not min(s12,s23,s13) > p_min: continue if not t_max is None: if not s123 < t_max: continue if not obs_min is None: if not obs > obs_min: continue if max(s1,s2,s3) > max_singleton_occurrence: max_singleton_occurrence = max(s1,s2,s3) if max(s12,s13,s23) > max_pair_occurrence: max_pair_occurrence = max(s12,s13,s23) if s123 > max_triple_occurrence: max_triple_occurrence = s123 # get the absolute errors, # if this is below one we # set it to one to avoid problems # with dividing with numbers < 1 abs_ext_obs = abs(ext-obs) if abs_ext_obs < 1: abs_ext_obs = 1 abs_est_obs = abs(est-obs) if abs_est_obs < 1: abs_est_obs = 1 error = math.log(abs_est_obs / abs_ext_obs) ratio_errors.append(error) # low max ent estimate, magic numer is the value for the estiamtes # when a pair value was 1 for maxent, or 1 for ext # if est <= 104.0324: if error < 0: maxent_was_best_estimates.append(((n1, n2, n3), (est, ext, 
obs, ratio, triangle))) elif error > 0: ext_was_best.append(((n1, n2, n3), (est, ext, obs, ratio, triangle))) maxent_errors.append(est / float(obs)) ext_errors.append(ext / float(obs)) try: occurrences[int(obs)] += 1 occurrence_ratio_errors[int(obs)] += error except IndexError, e: pass