def plot_intervals(output_folder): from parsers import CVOutputParser from preprocessing import Preprocessor from utils import avg import os import math """ Given a cross validation ouput. Certain triple intervals can be plottet to compare the error for extrapolation, max ent and the heurestic. The algorithm runs through each triple interval, and then for each sampled estiamte output the triples in the interval are looked up in each sample and the MAPE error is recorded and the average errors are added. And the average of these averages are then plottet for each interval. """ if not output_folder[-1] == '/': output_folder += '/' intervals = 30 triple_intervals = Preprocessor.triple_intervals(output_folder + 'observed_frequent_items.out', intervals=intervals) avg_max_ent_errors = [] avg_ext_errors = [] avg_heu_errors = [] pair_triple_ratios = [i/10. for i in range(11)] # binned ratios [0.0 to 1.0] max_ent_ratio_error = [0 for i in range(11)] ext_ratio_error = [0 for i in range(11)] for index, triple_interval in enumerate(triple_intervals): print 'Triple interval {} of {}'.format(index, intervals) iteration = 0 MAPE_avg_errors = [] MAPE_avg_errors_ext = [] # MAPE_avg_errors_heu = [] while True: max_ent_est_file = output_folder + str(iteration) + '_data.tsv' ext_est_file = output_folder + str(iteration) + '_data_extrapolation.tsv' # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv' # read baseline also? # Read until we do not find an output file if not os.path.exists(max_ent_est_file): break max_ent_est = CVOutputParser.read_est_obs_file(max_ent_est_file) ext_est = CVOutputParser.read_est_obs_file(ext_est_file) # heu_est = CVOutputParser.read_est_obs_file(heu_est_file) MAPE_errors = [] MAPE_errors_ext = [] # MAPE_errors_heu = [] for triple in triple_interval: # Check that the triple has been estimated if triple in max_ent_est: # Index 1 should hold the observed value parsed from the file # is the same mapped to every estimate, so hust read it once. obs = max_ent_est[triple][1] # maxent estimate est = max_ent_est[triple][0] # extrapolation estimate est2 = ext_est[triple][0] # # independence estimat? # heurestic, use max_ent for 0 triple in sample # est4 = heu_est[triple][0] # Index 2 should hold the pair triple ratio. # is the sam for every estimat ratio = max_ent_est[triple][2] # bin the ratio to one decimal ratio_binned = round(ratio, 1) # add errors to the ratio max_ent_ratio_error[pair_triple_ratios.index(ratio_binned)] += abs(est-obs) / float(obs) ext_ratio_error[pair_triple_ratios.index(ratio_binned)] += abs(est2-obs) / float(obs) # MAPE error max ent # error = abs(obs-est) #/ float(obs) * 100 # MAPE_errors.append(error) # # MAPE error extrapolation # error2 = abs(obs-est2) #/ float(obs) * 100 # MAPE_errors_ext.append(error2) # MAPE error independence? # MAPE error heurestic # error4 = abs(obs-est4) #/ float(obs) * 100 # MAPE_errors_heu.append(error4) # MAPE baseline error? MAPE_avg_errors.append(avg(MAPE_errors)) MAPE_avg_errors_ext.append(avg(MAPE_errors_ext)) # MAPE_avg_errors_heu.append(avg(MAPE_errors_heu)) iteration += 1 avg_max_ent_errors.append(avg(MAPE_avg_errors)) avg_ext_errors.append(avg(MAPE_avg_errors_ext)) # avg_heu_errors.append(avg(MAPE_avg_errors_heu)) plot(range(len(avg_max_ent_errors)), avg_max_ent_errors, color='blue') plot(range(len(avg_ext_errors)), avg_ext_errors, color='red')
def plot_ratios(output_folder): from parsers import CVOutputParser from utils import interpolate import math from collections import Counter import os """ Plot accumulated errors for estimators agains pari triple ratios. Ratios are binned in the range 0.0 to 1.0. """ if not output_folder[-1] == '/': output_folder += '/' pair_triple_ratios = [i/10. for i in range(11)] max_ent_ratio_error = [0 for i in range(11)] ext_ratio_error = [0 for i in range(11)] maxent_better_ratio = [0 for i in range(11)] ext_better_ratio = [0 for i in range(11)] values_binned = 0 values_ignored = 0 iteration = 0 pair_counts = Counter() trip_counts = Counter() while True: max_ent_est_file = output_folder + str(iteration) + '_data.tsv' ext_est_file = output_folder + str(iteration) + '_data_extrapolation.tsv' # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv' # read baseline also? # Read until we do not find an output file if not os.path.exists(max_ent_est_file): break max_ent_est = CVOutputParser.read_est_obs_file(max_ent_est_file) ext_est = CVOutputParser.read_est_obs_file(ext_est_file) # heu_est = CVOutputParser.read_est_obs_file(heu_est_file) for triple in max_ent_est.keys(): (s1, s2, s3, s12, s13, s23, s123) = max_ent_est[triple][3] pair_counts[s12] += 1 pair_counts[s13] += 1 pair_counts[s23] += 1 trip_counts[s123] += 1 # if not s123 < max_trips or not min(s12, s13, s23) > min_pairs: # values_ignored += 1 # continue # Index 1 should hold the observed value parsed from the file # is the same mapped to every estimate, so just read it once. obs = max_ent_est[triple][1] # if obs < 200: # values_ignored += 1 # continue if obs < 200: continue # maxent estimate est = max_ent_est[triple][0] # extrapolation estimate est2 = ext_est[triple][0] # # independence estimat? # heurestic, use max_ent for 0 triple in sample # est4 = heu_est[triple][0] # Index 2 should hold the pair triple ratio. # is the sam for every estimat ratio = max_ent_est[triple][2] # bin the ratio to one decimal ratio_binned = round(ratio, 1) # Record the ratio if maxent was better maxent_error = abs(est-obs)/math.sqrt(obs) ext_error = abs(est2-obs)/math.sqrt(obs) try: if maxent_error < ext_error: maxent_better_ratio[pair_triple_ratios.index(ratio_binned)] +=1 elif maxent_error > ext_error: ext_better_ratio[pair_triple_ratios.index(ratio_binned)] +=1 except ValueError, ve: pass # add errors to the ratio bin try: values_binned += 1 max_ent_ratio_error[pair_triple_ratios.index(ratio_binned)] += maxent_error ext_ratio_error[pair_triple_ratios.index(ratio_binned)] += ext_error except ValueError, ve: pass