def run(self):
    D = training_instances.get_generation_instances(filenames=self.filenames)
    # Hold-out splits if a test size is given, else standard k-fold CV:
    if self.test_size:
        splits = ShuffleSplit(n=len(D), n_iter=self.cv, test_size=self.test_size)
    else:
        splits = KFold(n=len(D), n_folds=self.cv, shuffle=True)
    cross_val_results = defaultdict(list)
    for fold_index, (train_indices, test_indices) in enumerate(splits):
        train = [D[i] for i in train_indices]
        test = [D[i] for i in test_indices]
        params = self.set_hyperparameters(train)
        run_results = self.crossval_run(test, params, fold_index=fold_index)
        print "======================================================================"
        print params
        print run_results
        for key, val in run_results.items():
            cross_val_results[key].append(val)
    for key, vals in sorted(cross_val_results.items()):
        modelname, metricname = key
        lower, upper = confidence_interval(vals)
        print "%s mean %s: %0.03f (%0.03f-%0.03f)" % (
            modelname, metricname, np.mean(vals), lower, upper)
    # Pickle protocol 2 is binary, so the log file must be opened in binary mode:
    pickle.dump(self.log, file(self.logfile, 'wb'), 2)
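# Note: run() and the experiment functions below unpack confidence_interval(vals)
# as a (lower, upper) pair. The helper itself is not shown in this section; the
# sketch below is one plausible implementation under that assumption (a 95%
# normal-approximation interval around the mean), not necessarily the project's
# actual code:

import numpy as np

def confidence_interval(vals):
    # 95% CI for the mean under a normal approximation: mean +/- 1.96 * SEM,
    # using the sample standard deviation (ddof=1).
    vals = np.asarray(vals, dtype=float)
    sem = vals.std(ddof=1) / np.sqrt(len(vals))
    return vals.mean() - 1.96 * sem, vals.mean() + 1.96 * sem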
def pooled_experiment(agentname='literal'):
    # Collapse across folds:
    results = defaultdict(lambda: defaultdict(list))
    for dirname in ('furniture', 'people'):
        # The logs are binary pickles, so open in binary mode:
        log = pickle.load(file("logs/log_%s.pickle" % dirname, 'rb'))
        for d in log:
            fold = d['fold_index']
            acc = d[agentname]['evaluations']['instance_accuracy']
            dice = d[agentname]['evaluations']['multiset_dice']
            results[fold]['instance_accuracy'].append(acc)
            results[fold]['multiset_dice'].append(dice)
    # Means for the folds:
    pooled = defaultdict(dict)
    for fold, metric_vals in results.items():
        for metric, vals in metric_vals.items():
            pooled[metric][fold] = np.mean(vals)
    # Stats across the folds:
    runs = {}
    for metric, fold_dict in pooled.items():
        fold_vals = np.array(fold_dict.values())
        mu = np.mean(fold_vals)
        # confidence_interval returns (lower, upper); unpack in that order:
        lower, upper = confidence_interval(fold_vals)
        print '%s mean %s: %0.03f (ci %0.03f, %0.03f)' % (
            agentname, metric, mu, lower, upper)
        runs[metric] = fold_vals
    return runs
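# Hypothetical usage (assuming the per-domain logs above exist on disk):
#
#     literal_runs = pooled_experiment(agentname='literal')
#     pragmatic_runs = pooled_experiment(agentname='pragmatic')
#
# The returned per-metric fold vectors can then be compared across agents.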
def predicted_vs_actual_length(log, agentname='pragmatic'):
    deltas = []
    for d in log:
        results = d[agentname]
        delta = len(results['prediction']) - len(results['actual'])
        deltas.append(delta)
    # confidence_interval returns (lower, upper); unpack in that order:
    lower, upper = confidence_interval(deltas)
    print '%s mean difference: %0.02f (%0.02f, %0.02f 95%% ci)' % (
        agentname, np.mean(deltas), lower, upper)
def evaluation_report(self, all_results, verbose=0, split_info=None):
    errors = np.array([d['error'] for d in all_results])
    iterations = np.array([d['iterations'] for d in all_results])
    print "======================================================================"
    print "Type: %s" % self.typ
    print "Domain: %s" % self.dirname
    print "Features: %s" % self.phi.__name__
    print split_info
    print "Learning rate: %s" % self.eta
    print "L2 coefs:", [r['l2_coeff'] for r in all_results]
    print "Mean iterations to convergence: %0.3f (+/- %0.3f)" % (
        iterations.mean(), iterations.std() * 2)
    for metric in self.metrics:
        vals = np.array(
            [d['evaluations'][metric.__name__] for d in all_results])
        ci = confidence_interval(vals)
        print "Mean %s: %0.3f (%.3f--%.3f)" % (
            metric.__name__, vals.mean(), ci[0], ci[1])
def crossvalidate(self):
    kf = KFold(n=len(self.filenames), n_folds=self.cv, shuffle=True)
    summaries = []
    temps = []
    for train_indices, test_indices in kf:
        train = [self.filenames[i] for i in train_indices]
        temp, nullcost = self.set_hyperparameters(train)
        test = [self.filenames[i] for i in test_indices]
        all_reports = self.run(test, temperature=temp, nullcost=nullcost)
        summary = self.summarize(all_reports)
        summaries.append(summary)
        temps.append(temp)
        print 'Temp: %s; nullcost: %s; %s' % (temp, nullcost, str(summary))
    for name in ('Literal', 'Pragmatic', 'Speaker'):
        vals = np.array([s[name] for s in summaries])
        ci = confidence_interval(vals)
        print "%s mean accuracy: %0.2f (%0.2f-%0.2f)" % (
            name, vals.mean(), ci[0], ci[1])
def triple_errors(output_folder, triple):
    """
    Plot accumulated errors for estimators against pair triple ratios.
    Ratios are binned in the range 0.0 to 1.0.
    """
    import os
    from parsers import CVOutputParser
    from utils import avg, confidence_interval
    from pylab import hist, xlabel, ylabel

    if not output_folder.endswith("/"):
        output_folder += "/"
    iteration = -1
    max_ent_errors = []
    ext_errors = []
    max_ent_abs_errors = []
    ext_abs_errors = []
    samples_ignored = 0
    while True:
        iteration += 1
        max_ent_est_file = output_folder + str(iteration) + "_data.tsv"
        ext_est_file = output_folder + str(iteration) + "_data_extrapolation.tsv"
        # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
        # read baseline also?
        # Read until we do not find an output file:
        if not os.path.exists(max_ent_est_file):
            break
        # Read the maxent estimate:
        found = False
        for sample_triple, (est, obs, ratio, triangle) in \
                CVOutputParser.read_est_obs_file_disc_version_2(max_ent_est_file):
            (s1, s2, s3, s12, s13, s23, s123) = triangle
            if sample_triple == triple:
                found = True
                max_ent_errors.append(est - obs)
                max_ent_abs_errors.append(abs(obs - est))
                break
        if not found:
            samples_ignored += 1
            continue
        # Read the extrapolation estimate:
        for sample_triple, (est, obs, ratio, triangle) in \
                CVOutputParser.read_est_obs_file_disc_version_2(ext_est_file):
            (s1, s2, s3, s12, s13, s23, s123) = triangle
            if sample_triple == triple:
                ext_errors.append(est - obs)
                ext_abs_errors.append(abs(obs - est))
                break
    # Maxent confidence interval:
    maxent_ci = confidence_interval(max_ent_errors)
    # Extrapolation confidence interval:
    ext_ci = confidence_interval(ext_errors)
    print "samples ignored: ", samples_ignored
    print "maxent avg error: ", round(avg(max_ent_errors), 1)
    print "maxent 95% confidence interval: ", (round(maxent_ci[0], 1), round(maxent_ci[1], 1))
    print "extrapolation avg error: ", round(avg(ext_errors), 1)
    print "extrapolation 95% confidence interval: ", (round(ext_ci[0], 1), round(ext_ci[1], 1))
    # Round the errors before binning them in the histogram:
    max_ent_errors_rounded = [round(x, 1) for x in max_ent_errors]
    ext_errors_rounded = [round(x, 1) for x in ext_errors]
    # Plot:
    xlabel("Estimate error")
    ylabel("Bucket size")
    hist([max_ent_errors_rounded, ext_errors_rounded], color=("b", "r"))
    return max_ent_errors, max_ent_abs_errors, ext_errors, ext_abs_errors
def test_confidence_interval(self):
    # A constant list has zero spread, so the interval half-width is 0:
    assert utils.confidence_interval([2, 2, 2, 2]) == 0
    nose.tools.assert_almost_equal(
        utils.confidence_interval([1, 2, 3, 4]), 1.096, 3)
    # Use an approximate comparison for float results rather than ==:
    nose.tools.assert_almost_equal(
        utils.confidence_interval([2, 2, 4, 4]), 0.98)
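# The assertions above pin down the behavior of the utils.confidence_interval
# under test: it returns a scalar half-width of 1.96 * std / sqrt(n) with the
# population (ddof=0) standard deviation -- e.g. for [2, 2, 4, 4] that is
# 1.96 * 1.0 / 2 = 0.98. Note this differs from the pair-returning helper used
# by the experiment code earlier. A minimal sketch consistent with the test
# (assumed, not necessarily the actual utils implementation):

import numpy as np

def confidence_interval(vals):
    # Half-width of a 95% normal-approximation interval: 1.96 * sigma / sqrt(n),
    # where sigma is the population standard deviation (ddof=0).
    vals = np.asarray(vals, dtype=float)
    return 1.96 * vals.std() / np.sqrt(len(vals))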