def compute_risk(e, mid, sn_tcpra, trials=None):
    """
    Compute (estimate) the Bayesian risk (the chance that the reported
    outcome is wrong) for contest e.cid_m[mid].

    sn_tcpra is the sampled-number structure:
        stage_time -> cid -> pbcid -> rvote -> avote -> count

    We take sn_tcpra here as an argument rather than just using e.sn_tcpra
    so that this function can be called with modified sample counts.  (This
    option is not yet used, but might be later, when optimizing workload.)
    Here sn_tcpra is identical in structure to (and may in fact be identical
    to) e.sn_tcpra.

    Here trials is the number of trials to run to obtain the desired
    precision in the risk estimate.

    This method is the heart of the Bayesian post-election audit method.
    It could be replaced by a frequentist approach instead, at least for
    those outcome rules and mixes of collection types for which a
    frequentist method is known.

    The comparison and ballot-polling audits are blended here: the reported
    election data just records a ("-noCVR",) vote for the reported vote in
    a noCVR paper ballot collection.  This means that ballot-polling audits
    have a prior of pseudocount_base, while comparison audits have a prior
    of pseudocount_base for off-diagonal (non-equal reported and actual)
    vote pairs, but a prior of pseudocount_match for equal reported-vote
    and actual-vote pairs.
    """

    cid = e.cid_m[mid]
    wrong_outcome_count = 0
    if trials is None:
        trials = e.n_trials
    for trial in range(trials):
        test_tally = {vote: 0 for vote in e.votes_c[cid]}
        for pbcid in sorted(e.possible_pbcid_c[cid]):
            # Draw from the posterior for each paper ballot collection and
            # sum the results, stratifying by reported vote.
            for rv in sorted(sn_tcpra[e.stage_time][cid][pbcid]):
                tally = sn_tcpra[e.stage_time][cid][pbcid][rv].copy()
                for av in e.votes_c[cid]:
                    tally[av] = tally.get(av, 0)
                    tally[av] += (e.pseudocount_match if av == rv
                                  else e.pseudocount_base)
                dirichlet_dict = dirichlet(tally)
                stratum_size = e.rn_cpr[cid][pbcid][rv]
                # sample_size = sn_tcpr[e.stage_time][cid][pbcid][rv]
                sample_size = sum(sn_tcpra[e.stage_time][cid][pbcid][rv][av]
                                  for av in sn_tcpra[e.stage_time][cid][pbcid][rv])
                nonsample_size = stratum_size - sample_size
                for av in sorted(tally):
                    test_tally[av] += tally[av]
                    # Extrapolate the unsampled ballots in this stratum by
                    # their posterior proportion.
                    test_tally[av] += dirichlet_dict[av] * nonsample_size
        if e.ro_c[cid] != outcomes.compute_outcome(e, cid, test_tally):
            wrong_outcome_count += 1
    # Divide by the number of trials actually run (trials may differ from
    # e.n_trials when a caller overrides it).
    risk = wrong_outcome_count / trials
    e.risk_tm[e.stage_time][mid] = risk
    return risk
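
# Illustrative sketch (not called by the audit code above).  It shows, in a
# minimal self-contained form, the posterior-sampling idea used by
# compute_risk for a single ballot-polling collection: draw candidate
# proportions from a Dirichlet posterior, extrapolate the unsampled ballots,
# and count how often the reported winner loses.  The function name, the
# candidate labels, the plurality winner rule (standing in for
# outcomes.compute_outcome), and the pseudocount of 1 are choices made for
# this example only.

import random


def toy_polling_risk(sample_tally, total_ballots, reported_winner,
                     pseudocount=1.0, trials=10000):
    """Estimate the chance that reported_winner is not the true winner."""

    sample_size = sum(sample_tally.values())
    nonsample_size = total_ballots - sample_size
    wrong = 0
    for _ in range(trials):
        # Dirichlet draw over candidates: independent gamma draws, normalized.
        gammas = {av: random.gammavariate(n + pseudocount, 1.0)
                  for av, n in sample_tally.items()}
        total = sum(gammas.values())
        theta = {av: g / total for av, g in gammas.items()}
        # Extrapolate the unsampled ballots by their posterior proportion,
        # as compute_risk does with dirichlet_dict[av] * nonsample_size.
        test_tally = {av: sample_tally[av] + theta[av] * nonsample_size
                      for av in sample_tally}
        if max(test_tally, key=test_tally.get) != reported_winner:
            wrong += 1
    return wrong / trials


# Example: a 100-ballot sample from a 10,000-ballot collection.
# toy_polling_risk({"Alice": 60, "Bob": 40}, 10000, "Alice")
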
def get_noisy_guess(e, mid, pbcids, actual_votes, xs, nonsample_sizes,
                    num_trials=100):
    """
    Run num_trials Dirichlet-based simulations to estimate how often a
    winner other than the reported winner wins the overall contest.
    """

    winners = []
    cid = e.cid_m[mid]
    for _ in range(num_trials):
        current_sample = copy.deepcopy(actual_votes)
        # Extend each collection's sample by xs[pbcid] hypothetical ballots.
        for pbcid in actual_votes:
            for av in current_sample[pbcid]:
                if current_sample[pbcid][av] == 0:
                    current_sample[pbcid][av] += 50  # pseudocount
            dirichlet_dict = risk_bayes.dirichlet(current_sample[pbcid])
            extended_sample = risk_bayes.multinomial(xs[pbcid], dirichlet_dict)
            for av in current_sample[pbcid]:
                current_sample[pbcid][av] += extended_sample[av]
        # Then extend each collection by its remaining unsampled ballots.
        for pbcid in actual_votes:
            dirichlet_dict = risk_bayes.dirichlet(current_sample[pbcid])
            extended_sample = risk_bayes.multinomial(
                nonsample_sizes[pbcid] - xs[pbcid], dirichlet_dict)
            for av in current_sample[pbcid]:
                current_sample[pbcid][av] += extended_sample[av]
        # Merge the extended collections and check the simulated outcome.
        merged_sample = {}
        for pbcid in pbcids:
            for av in current_sample[pbcid]:
                if av not in merged_sample:
                    merged_sample[av] = 0
                merged_sample[av] += current_sample[pbcid][av]
        if outcomes.compute_outcome(e, cid, merged_sample) == e.ro_c[cid]:
            winners.append(1)
    # Distance of the wrong-to-correct outcome ratio from the 0.05 target
    # (assumes at least one trial reproduces the reported outcome).
    return abs(float(num_trials - len(winners)) / len(winners) - 0.05)
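
# Illustrative sketch (not called by the audit code above).  get_noisy_guess
# extends each collection's tally twice: first by xs[pbcid] hypothetical
# ballots, then by the remaining unsampled ballots, each time drawing a
# Dirichlet posterior and then a multinomial sample from it via
# risk_bayes.dirichlet and risk_bayes.multinomial.  The standalone helper
# below shows one such extension step using only the standard library; the
# name toy_extend_tally and its default pseudocount are choices made for
# this example, not part of risk_bayes.

import random


def toy_extend_tally(tally, n_extra, pseudocount=1.0):
    """Return a copy of tally with n_extra ballots drawn Dirichlet-multinomially."""

    # Dirichlet draw over the actual-vote categories.
    gammas = {av: random.gammavariate(n + pseudocount, 1.0)
              for av, n in tally.items()}
    total = sum(gammas.values())
    probs = {av: g / total for av, g in gammas.items()}
    # Multinomial draw: n_extra categorical draws, counted per category.
    extended = dict(tally)
    for av in random.choices(list(probs), weights=list(probs.values()), k=n_extra):
        extended[av] += 1
    return extended


# Two-stage use, as in get_noisy_guess: extend by x, then by the remainder.
# sample = toy_extend_tally({"Alice": 30, "Bob": 20}, n_extra=10)
# sample = toy_extend_tally(sample, n_extra=940)
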
def get_sample_size(e, pbcids_to_adjust, init_x=1, pick_pbcid_func=round_robin):
    """
    Get the next sample size for each paper ballot collection (county),
    given how many ballots have been sampled so far, how many remain to
    audit, and the required risk limit.
    """

    default_start_pbcid = 0
    start = None
    num_winners = e.num_winners
    max_num_it = e.max_num_it
    for mid in e.cid_m:
        cid = e.cid_m[mid]
        xs, actual_votes, nonsample_sizes = create_helper_dicts(
            e, mid, init_x, pbcids_to_adjust)
        # For max_num_it iterations: first choose a county, then extend that
        # county's sample by x.  Given this extended sample, extend the
        # entire contest to n votes and compute the winner of the extended
        # contest.  If all the winners match the reported outcome, update x
        # for the chosen pbcid, possibly decreasing it; otherwise, with some
        # probability, increase x for that county.
        for i in range(max_num_it):
            current_sample = copy.deepcopy(actual_votes)  # pbcid -> av -> count
            if pick_pbcid_func == random_min_var:
                pbcid = pick_pbcid_func(pbcids_to_adjust, actual_votes,
                                        xs, nonsample_sizes)
            elif pick_pbcid_func == round_robin:
                if start is None:
                    start = default_start_pbcid
                pbcid = pick_pbcid_func(pbcids_to_adjust, start)
                start = (start + 1) % len(pbcids_to_adjust)
            else:
                pbcid = pick_pbcid_func(pbcids_to_adjust)
            # Extend the chosen county's sample by x hypothetical ballots.
            for av in current_sample[pbcid]:
                if current_sample[pbcid][av] == 0:
                    current_sample[pbcid][av] += 50  # pseudocount
            dirichlet_dict = risk_bayes.dirichlet(current_sample[pbcid])
            extended_sample = risk_bayes.multinomial(xs[pbcid], dirichlet_dict)
            for av in current_sample[pbcid]:
                current_sample[pbcid][av] += extended_sample[av]
            # Extend every collection by its remaining unsampled ballots.
            # (Use a separate loop variable so the chosen pbcid is preserved
            # for the x update below.)
            for other_pbcid in pbcids_to_adjust:
                dirichlet_dict = risk_bayes.dirichlet(current_sample[other_pbcid])
                extended_sample = risk_bayes.multinomial(
                    nonsample_sizes[other_pbcid] - xs[other_pbcid], dirichlet_dict)
                for av in current_sample[other_pbcid]:
                    current_sample[other_pbcid][av] += extended_sample[av]
            # Merge the extended collections and recompute the outcome.
            merged_sample = {}
            for other_pbcid in pbcids_to_adjust:
                for av in current_sample[other_pbcid]:
                    if av not in merged_sample:
                        merged_sample[av] = 0
                    merged_sample[av] += current_sample[other_pbcid][av]
            winners = []
            for k in range(num_winners):
                winners.append(outcomes.compute_outcome(e, cid, merged_sample))
            if len(set(winners)) == 1 and winners[0] == e.ro_c[cid]:
                xs = update_correct(xs, [pbcid], nonsample_sizes,
                                    num_winners, e.risk_limit_m[mid])
            else:
                xs = update_incorrect(xs, [pbcid], nonsample_sizes,
                                      num_winners, e.risk_limit_m[mid])
    return xs
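
# Illustrative sketch (not called by the audit code above).  The loop in
# get_sample_size adjusts each county's increment x: when the extended
# contest reproduces the reported winners, update_correct may shrink x for
# the chosen county; otherwise update_incorrect may grow it.  The toy rules
# below show one plausible multiplicative-decrease / probabilistic-increase
# scheme consistent with that description, together with a picker matching
# how round_robin is called above.  The real round_robin, update_correct,
# and update_incorrect are defined elsewhere in this module and may differ.

import random


def toy_round_robin(pbcids, start):
    """Cycle through the collections in order; the caller advances start."""
    return pbcids[start % len(pbcids)]


def toy_update_correct(xs, pbcid, decrease_factor=0.9):
    """Shrink the chosen county's increment after a correct trial outcome."""
    xs[pbcid] = max(1, int(xs[pbcid] * decrease_factor))
    return xs


def toy_update_incorrect(xs, pbcid, risk_limit, increase_factor=1.5):
    """With probability 1 - risk_limit, grow the increment after a wrong outcome."""
    if random.random() > risk_limit:
        xs[pbcid] = int(xs[pbcid] * increase_factor) + 1
    return xs


# e.g. xs = {"county-1": 10, "county-2": 10}
# xs = toy_update_incorrect(xs, "county-1", risk_limit=0.05)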