Example #1
File: tally.py Project: nealmcb/arlo-e2e
def fast_tally_ballots(
    ballots: Sequence[CiphertextBallot],
    pool: Optional[Pool] = None,
) -> TALLY_TYPE:
    """
    This function does a tally of the given list of ballots, returning a dictionary that maps
    from selection object_ids to the ElGamalCiphertext that corresponds to the encrypted tally
    of that selection. An optional `Pool` may be passed in, and it will be used to evaluate
    the ElGamal accumulation in parallel. If it's absent, then the accumulation will happen
    sequentially. Progress bars are not currently supported.
    """

    iter_count = 1
    initial_tallies: Sequence[TALLY_INPUT_TYPE] = ballots

    while True:
        if pool is None or len(initial_tallies) <= BALLOTS_PER_SHARD:
            log_and_print(
                f"tally iteration {iter_count} (FINAL): {len(initial_tallies)} partial tallies"
            )
            return sequential_tally(initial_tallies)

        shards = shard_list_uniform(initial_tallies, BALLOTS_PER_SHARD)
        log_and_print(
            f"tally iteration {iter_count}: {len(initial_tallies)} partial tallies --> {len(shards)} shards"
        )
        partial_tallies: Sequence[TALLY_TYPE] = pool.map(func=sequential_tally,
                                                         iterable=shards)

        iter_count += 1
        initial_tallies = partial_tallies
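A minimal usage sketch (not from the project) showing how `fast_tally_ballots` might be called with and without a `multiprocessing` pool; `my_ciphertext_ballots` is a hypothetical, already-encrypted ballot list:

from multiprocessing import Pool

# Hypothetical input: a list of CiphertextBallot objects produced elsewhere.
my_ciphertext_ballots = load_ciphertext_ballots_somehow()  # placeholder helper, not part of arlo-e2e

# Sequential accumulation: no pool, everything runs in this process.
tally_seq = fast_tally_ballots(my_ciphertext_ballots)

# Parallel accumulation: shards of ballots are summed across worker processes.
with Pool(processes=4) as pool:
    tally_par = fast_tally_ballots(my_ciphertext_ballots, pool)

# Either way, the result maps selection object_ids to ElGamalCiphertext tally values.
assert tally_seq.keys() == tally_par.keys()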
Example #2
File: tally.py Project: nealmcb/arlo-e2e
def sequential_tally(
        ptallies: Sequence[Optional[TALLY_INPUT_TYPE]]) -> TALLY_TYPE:
    """
    Internal function: sequentially tallies all of the ciphertext ballots, or other partial tallies,
    and returns a partial tally. If any input tally happens to be `None` or an empty dict,
    the result is an empty dict.
    """
    # log_and_print(f"Sequential, local tally with {len(ptallies)} inputs")

    num_nones = sum([1 for p in ptallies if p is None or p == {}])
    if num_nones > 0:
        log_and_print(
            f"Found {num_nones} failed partial tallies, returning an empty tally"
        )
        return {}

    result: TALLY_TYPE = {}
    for ptally in ptallies:
        # we want to do our computation purely in terms of TALLY_TYPE, so we'll convert CiphertextBallots
        if isinstance(ptally, CiphertextBallot):
            ptally = ciphertext_ballot_to_dict(ptally)

        if ptally is None:
            # should never happen, but paranoia to keep the type system happy
            return {}

        for k in ptally.keys():
            if k not in result:
                result[k] = ptally[k]
            else:
                counter_sum = result[k]
                counter_partial = ptally[k]
                counter_sum = elgamal_add(counter_sum, counter_partial)
                result[k] = counter_sum
    return result
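The per-key loop above is just a dictionary merge in which colliding keys are combined with `elgamal_add`. A rough analogy using plain integers (purely illustrative, not project code):

def merge_counts(*partials: dict) -> dict:
    # Same accumulation shape as sequential_tally, with integer addition
    # standing in for elgamal_add on ciphertexts.
    result: dict = {}
    for partial in partials:
        for k, v in partial.items():
            result[k] = result[k] + v if k in result else v
    return result

print(merge_counts({"s1": 1, "s2": 0}, {"s1": 0, "s2": 1}, {"s1": 1}))
# {'s1': 2, 's2': 1}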
Example #3
def write_file_with_retries(
    full_file_name: Union[str, PurePath],
    contents: AnyStr,  # bytes or str
    num_retries: int = 1,
) -> None:
    """
    Helper function: given a fully resolved file path, or a path-like object describing
    a file location, writes the given contents to a file of that name. If the write
    fails, it is attempted again, up to `num_retries` total attempts. This works
    around occasional failures that happen, for no good reason, with s3fs-fuse in big
    clouds.
    """
    prev_exception = None
    write_mode = "w" if isinstance(contents, str) else "wb"

    for retry_number in range(0, num_retries):
        try:
            with open(full_file_name, write_mode) as f:
                f.write(contents)
            return
        except Exception as e:
            prev_exception = e
            log_and_print(
                f"failed to write {full_file_name} (attempt #{retry_number}): {str(e)}"
            )

    if num_retries > 1:
        log_and_print(
            f"giving up writing {full_file_name}: failed {num_retries} times")
    if prev_exception:
        raise prev_exception
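A quick usage sketch; the file paths and retry counts here are made up:

# String contents open the file in "w" mode; bytes would use "wb".
write_file_with_retries("results/summary.json", '{"status": "ok"}', num_retries=5)

# If every attempt fails, the last exception is re-raised, so callers can still react:
try:
    write_file_with_retries("s3-mount/bucket/blob.bin", b"\x00\x01", num_retries=3)
except Exception as e:
    print(f"giving up: {e}")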
Example #4
def mkdir_helper(p: Union[str, Path], num_retries: int = 1) -> None:
    """
    Wrapper around `Path.mkdir` that retries on failure and works correctly even if the directory already exists.
    """
    prev_exception = None
    if isinstance(p, str):
        path = Path(p)
    else:
        path = p

    for attempt in range(0, num_retries):
        try:
            path.mkdir(parents=True, exist_ok=True)
            return
        except Exception as e:
            prev_exception = e
            log_and_print(
                f"failed to make directory {p} (attempt {attempt}): {str(e)}")

    if num_retries > 1:
        log_and_print(
            f"failed to make directory {p} after {num_retries} attempts, failing"
        )

    if prev_exception:
        raise prev_exception
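Usage is idempotent: because `Path.mkdir` is called with `exist_ok=True`, repeated calls on the same (hypothetical) path are harmless:

mkdir_helper("tally_output/ballots", num_retries=3)
mkdir_helper("tally_output/ballots", num_retries=3)  # already exists, still succeeds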
Example #5
def r_partial_tally(
    progressbar_actor: Optional[ActorHandle],
    *ptallies: Optional[TALLY_TYPE],
) -> Optional[TALLY_TYPE]:  # pragma: no cover
    """
    This is a front-end for `partial_tally` that can be called remotely via Ray.
    """
    try:
        result = partial_tally(progressbar_actor, *ptallies)
        return result
    except Exception as e:
        log_and_print(f"Unexpected exception in r_partial_tally: {e}", True)
        return None
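Elsewhere in these examples, the `r_*` functions are invoked with `.remote(...)`, which implies they're registered as Ray remote tasks. A minimal sketch of that dispatch pattern, using a stand-in task rather than `r_partial_tally` itself:

import ray

@ray.remote
def r_square(x: int) -> int:
    # stand-in for a remote worker such as r_partial_tally
    return x * x

ray.init(ignore_reinit_error=True)
refs = [r_square.remote(i) for i in range(4)]  # returns ObjectRefs immediately
print(ray.get(refs))  # [0, 1, 4, 9]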
Example #6
def r_decrypt(cec: CiphertextElectionContext, keypair: ElGamalKeyPair,
              di: DecryptInput) -> Optional[DecryptOutput]:  # pragma: no cover
    """
    Remotely decrypts an ElGamalCiphertext (and its related data -- see DecryptInput)
    and returns the plaintext along with a Chaum-Pedersen proof (see DecryptOutput).
    """
    try:
        plaintext, proof = decrypt_ciphertext_with_proof(
            di.ciphertext, keypair, di.seed, cec.crypto_extended_base_hash)
        return DecryptOutput(di.object_id, plaintext, proof)
    except Exception as e:
        log_and_print(f"Unexpected exception in r_decrypt: {e}", True)
        return None
Example #7
def r_verify_ballot_proofs(
    manifest: Manifest,
    public_key: ElementModP,
    hash_header: ElementModQ,
    progressbar_actor: Optional[ActorHandle],
    *cballot_filenames: str,
) -> Optional[TALLY_TYPE]:  # pragma: no cover
    """
    Given a list of ballots, verify their Chaum-Pedersen proofs and redo the tally.
    Returns `None` if anything didn't verify correctly, otherwise a partial tally
    of the ballots (of type `TALLY_TYPE`).
    """

    # We're never moving ciphertext ballots through Ray's remote object system. Instead,
    # we've got filenames coming in. We load the ciphertext ballots, verify them, and
    # we're immediately done with them.  This puts a lot of pressure on the filesystem
    # but S3 buckets, Azure blob storage, etc. can handle it.

    try:
        valid_count = 0
        num_ballots = len(cballot_filenames)
        ptallies: List[TALLY_TYPE] = []

        for name in cballot_filenames:
            cballot = manifest.load_ciphertext_ballot(name)

            if cballot is None:
                return None

            is_valid = cballot.is_valid_encryption(cballot.description_hash,
                                                   public_key, hash_header)
            if is_valid:
                valid_count = valid_count + 1
            if progressbar_actor is not None:
                progressbar_actor.update_completed.remote("Ballots", 1)

            ptallies.append(ciphertext_ballot_to_dict(cballot))

        if valid_count < num_ballots:
            # log_and_print(f"Only {valid_count} of {num_ballots} ballots are valid.")
            return None

        ptally = sequential_tally(ptallies)
        if progressbar_actor is not None:
            progressbar_actor.update_completed.remote("Tallies", num_ballots)

        return ptally
    except Exception as e:
        log_and_print(f"Unexpected exception in r_verify_ballot_proofs: {e}",
                      True)
        return None
Example #8
    def equivalent(self, other: "Manifest") -> bool:
        """
        Not exactly checking equality, but does check that the manifests are "equivalent",
        which means we're ignoring the root directories, but checking the rest.
        """
        same_bytes = self.bytes_written == other.bytes_written
        same_hashes = self.hashes == other.hashes

        if not same_hashes:
            for k in self.hashes.keys():
                if k not in other.hashes or self.hashes[k] != other.hashes[k]:
                    log_and_print(f"different values for key {k}")

        return same_hashes and same_bytes
Example #9
def r_verify_tally_selection_proofs(
    public_key: ElementModP,
    hash_header: ElementModQ,
    *selections: SelectionInfo,
) -> bool:  # pragma: no cover
    """
    Given a list of tally selections, verifies that every one's internal proof is correct.
    """
    try:
        results = [
            s.is_valid_proof(public_key, hash_header) for s in selections
        ]
        return all(results)
    except Exception as e:
        log_and_print(
            f"Unexpected exception in r_verify_tally_selection_proofs: {e}",
            True)
        return False
Example #10
    def test_end_to_end_publications_ray(self, input: str, check_proofs: bool,
                                         keypair: ElGamalKeyPair) -> None:
        self.removeTree(
        )  # if there's anything leftover from a prior run, get rid of it

        cvrs = read_dominion_csv(StringIO(input))
        self.assertIsNotNone(cvrs)

        _, ballots, _ = cvrs.to_election_description()
        assert len(ballots) > 0, "can't have zero ballots!"

        results = ray_tally_everything(
            cvrs,
            secret_key=keypair.secret_key,
            verbose=True,
            root_dir=TALLY_TESTING_DIR,
        )

        self.assertTrue(results.all_proofs_valid())

        # dump files out to disk
        write_ray_tally(results, TALLY_TESTING_DIR)
        log_and_print(
            "tally_testing written, proceeding to read it back in again")

        # now, read it back again!
        results2 = load_ray_tally(
            TALLY_TESTING_DIR,
            check_proofs=check_proofs,
            verbose=True,
            recheck_ballots_and_tallies=True,
        )
        self.assertIsNotNone(results2)

        log_and_print("tally_testing got non-null result!")

        self.assertTrue(
            _list_eq(results.encrypted_ballots, results2.encrypted_ballots))
        self.assertTrue(results.equivalent(results2, keypair))
        self.removeTree()  # clean up our mess
Example #11
def ray_decrypt_tally(
    tally: TALLY_TYPE,
    cec: ObjectRef,  # ObjectRef[CiphertextElectionContext]
    keypair: ObjectRef,  # ObjectRef[ElGamalKeyPair]
    proof_seed: ElementModQ,
) -> DECRYPT_TALLY_OUTPUT_TYPE:
    """
    Given a tally, this decrypts the tally
    and returns a dict from selection object_ids to tuples containing the decrypted
    total as well as a Chaum-Pedersen proof that the total corresponds to the ciphertext.

    :param tally: an election tally
    :param cec: a Ray ObjectRef containing a `CiphertextElectionContext`
    :param keypair: a Ray ObjectRef containing an `ElGamalKeyPair`
    :param proof_seed: an ElementModQ
    """
    tkeys = tally.keys()
    proof_seeds: List[ElementModQ] = Nonces(proof_seed)[0:len(tkeys)]
    inputs: List[DecryptInput] = [
        DecryptInput(object_id, seed, tally[object_id])
        for seed, object_id in zip(proof_seeds, tkeys)
    ]

    # We can't be lazy here: we need to have all this data in hand so we can
    # rearrange it into a dictionary and return it.
    result: List[Optional[DecryptOutput]] = ray.get(
        [r_decrypt.remote(cec, keypair, x) for x in inputs])

    if None in result:
        log_and_print(
            f"Unexpected failure from in ray_decrypt_tally, returning an empty dict",
            True,
        )
        return {}

    # mypy can't figure out that None isn't here any more, so we need to check for None again
    return {
        r.object_id: (r.plaintext, r.decryption_proof)
        for r in result if r is not None
    }
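A hedged sketch of how a caller might invoke `ray_decrypt_tally`, wrapping the larger arguments in `ray.put` first (mirroring what `ray_tally_everything` does below); `tally`, `cec`, `keypair`, and `proof_seed` are placeholders for values computed earlier:

r_cec = ray.put(cec)          # CiphertextElectionContext, shared with all workers
r_keypair = ray.put(keypair)  # ElGamalKeyPair, shared with all workers

decrypted = ray_decrypt_tally(tally, r_cec, r_keypair, proof_seed)
for object_id, (plaintext, proof) in decrypted.items():
    print(object_id, plaintext)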
Example #12
    def all_proofs_valid(
        self,
        verbose: bool = False,
        recheck_ballots_and_tallies: bool = False,
        use_progressbar: bool = True,
    ) -> bool:
        """
        Checks all the proofs used in this tally, returns True if everything is good.
        Any errors found will be logged. Normally, this only checks the proofs associated
        with the totals. If you want to also recompute the tally (i.e., tabulate the
        encrypted ballots) and verify every individual ballot proof, then set
        `recheck_ballots_and_tallies` to True.
        """

        ray_wait_for_workers(min_workers=2)

        log_and_print("Verifying proofs.", verbose)

        r_public_key = ray.put(self.context.elgamal_public_key)
        r_hash_header = ray.put(self.context.crypto_extended_base_hash)

        start = timer()
        selections = self.tally.map.values()
        sharded_selections: Sequence[
            Sequence[SelectionInfo]] = shard_list_uniform(selections, 2)

        # parallelizing this is overkill, but why not?
        results: List[bool] = ray.get([
            r_verify_tally_selection_proofs.remote(r_public_key, r_hash_header,
                                                   *s)
            for s in sharded_selections
        ])
        end = timer()

        log_and_print(f"Verification time: {end - start: .3f} sec", verbose)
        log_and_print(
            f"Verification rate: {len(self.tally.map.keys()) / (end - start): .3f} selection/sec",
            verbose,
        )

        if False in results:
            return False

        if recheck_ballots_and_tallies:
            if self.manifest is None:
                log_and_print(
                    "cannot recheck ballots and tallies without a manifest")
                return False

            # next, check each individual ballot's proofs; in this case, we're going to always
            # show the progress bar, even if verbose is false
            num_ballots = self.num_ballots

            r_manifest = ray.put(self.manifest)

            progressbar = (ProgressBar({
                "Ballots": num_ballots,
                "Tallies": num_ballots,
                "Iterations": 0,
                "Batch": 0,
            }) if use_progressbar else None)
            progressbar_actor = progressbar.actor if progressbar is not None else None

            ballot_start = timer()

            batches: Sequence[Sequence[str]] = shard_list_uniform(
                self.cvr_metadata["BallotId"], BATCH_SIZE)

            # List[ObjectRef[Optional[TALLY_TYPE]]]
            recomputed_tallies: List[ObjectRef] = []

            for batch in batches:
                if progressbar_actor:
                    progressbar_actor.update_completed.remote("Batch", 1)

                cballot_manifest_name_shards: Sequence[
                    Sequence[str]] = shard_list_uniform(
                        batch, BALLOTS_PER_SHARD)

                # List[ObjectRef[Optional[TALLY_TYPE]]]
                ballot_results: List[ObjectRef] = [
                    r_verify_ballot_proofs.remote(
                        r_manifest,
                        r_public_key,
                        r_hash_header,
                        progressbar_actor,
                        *shard,
                    ) for shard in cballot_manifest_name_shards
                ]
                # ray.wait(
                #     ballot_results,
                #     num_returns=len(cballot_manifest_name_shards),
                #     timeout=None,
                # )
                # log_and_print("Recomputing tallies.", verbose)

                ptally = ray_tally_ballots(ballot_results,
                                           PARTIAL_TALLIES_PER_SHARD,
                                           progressbar)
                recomputed_tallies.append(ptally)

            if len(recomputed_tallies) > 1:
                recomputed_tally = ray.get(
                    ray_tally_ballots(recomputed_tallies,
                                      PARTIAL_TALLIES_PER_SHARD, progressbar))
            else:
                recomputed_tally = ray.get(recomputed_tallies[0])

            if progressbar:
                progressbar.close()

            if not recomputed_tally:
                return False

            ballot_end = timer()

            log_and_print(
                f"Ballot verification rate: {num_ballots / (ballot_end - ballot_start): .3f} ballot/sec",
                True,
            )

            tally_success = tallies_match(self.tally.to_tally_map(),
                                          recomputed_tally)

            if not tally_success:
                return False

        return True
Example #13
def ray_reduce_with_rounds(
    inputs: Iterable[ObjectRef],
    shard_size: int,
    reducer_first_arg: Any,
    reducer: Callable,  # Callable[[Any, VarArg(ObjectRef)], ObjectRef]
    progressbar: Optional[ProgressBar] = None,
    progressbar_key: Optional[str] = None,
    verbose: bool = False,
) -> ObjectRef:
    """
    Given a list of inputs and a Ray remote reducer, manages the Ray cluster, waiting for values
    as they become ready and calling the reducer until everything is reduced down to a single value.
    Unlike `ray_reduce_with_ray_wait`, this version builds a reduction tree. It requires the reducer
    to be associative, but not commutative.

    The `shard_size` parameter specifies how many inputs should be fed to each call to the reducer.
    Since the available data will vary, the actual number fed to the reducer will be at least two
    and at most `shard_size`.

    The `reducer` is a Ray remote method reference that takes a given first argument of whatever
    type and then a varargs sequence of objectrefs, and returns an objectref. So, if you had
    code that looked like:

    ```
    @ray.remote
    def my_reducer(config: Config, *inputs: MyDataType) -> MyDataType:
        ...
    ```

    And let's say you're mapping some remote function to generate those values and later want
    to reduce them. That code might look like this:
    ```
    @ray.remote
    def my_mapper(input: SomethingElse) -> MyDataType:
        ...

    def run_everything(config: Config, inputs: Iterable[SomethingElse]) -> MyDataType:
        map_refs = [my_mapper.remote(i) for i in inputs]
        return ray_reduce_with_rounds(map_refs, 10, config, my_reducer.remote)
    ```

    If your `reducer_first_arg` corresponds to some large object that you don't want to serialize
    over and over, you could of course call `ray.put` on it first and pass that along.

    Optional feature: integration with the progressbar in `ray_progress`. Just pass in the
    ProgressBar as well as the `key` string that you want to use. Whenever more work
    is being dispatched, the progressbar's total amount of work is updated by the dispatcher here.
    The work completion notification is *not* handled here. That needs to be done by the remote
    reducer. (Why? Because it might want to update the progressbar for each element in the shard
    while here we could only see when the whole shard is completed.)
    """

    # TODO: generalize this code so the `reducer_first_arg` is wrapped up in the reducer.
    #   This seems like a job for `kwargs`. Deal with that after everything else works.

    assert (progressbar_key and progressbar
            ) or not progressbar, "progress bar requires a key string"

    assert shard_size > 1, "shard_size must be greater than one"

    progressbar_actor = progressbar.actor if progressbar is not None else None
    iter_count = 1

    result: Optional[ObjectRef] = None

    inputs = list(inputs)

    while True:
        num_inputs = len(inputs)

        if progressbar_actor is not None:
            progressbar_actor.update_completed.remote("Iterations", 1)
            progressbar_actor.update_total.remote(progressbar_key, num_inputs)

        if num_inputs <= shard_size:
            log_and_print(f"Reduction (FINAL): {num_inputs} partial results",
                          verbose=verbose)
            result = reducer(reducer_first_arg, *inputs)
            break

        # Sequence[Sequence[ObjectRef[Optional[TALLY_TYPE]]]]
        shards: Sequence[Sequence[ObjectRef]] = shard_list_uniform(
            inputs, shard_size)

        log_and_print(
            f"Reduction {iter_count:2d}: {num_inputs:6d} partial results --> {len(shards)} shards (bps = {shard_size})",
            verbose=verbose,
        )

        # Sequence[ObjectRef[Optional[TALLY_TYPE]]]
        partial_results: List[ObjectRef] = [
            reducer(reducer_first_arg, *shard) for shard in shards
        ]

        # To avoid deeply nested tasks, we're going to wait for this to finish.
        # If you comment out the call to ray.wait(), everything still works, but
        # you can get warnings about too many tasks.
        # ray.wait(partial_results, num_returns=len(partial_results), timeout=None)

        iter_count += 1
        inputs = partial_results

    if progressbar:
        progressbar.print_until_done()
    assert result is not None, "while loop shouldn't have broken without setting result"
    return result
Example #14
File: tally.py Project: nealmcb/arlo-e2e
    def all_proofs_valid(
        self,
        pool: Optional[Pool] = None,
        verbose: bool = True,
        recheck_ballots_and_tallies: bool = False,
    ) -> bool:
        """
        Checks all the proofs used in this tally, returns True if everything is good.
        Any errors found will be logged. Normally, this only checks the proofs associated
        with the totals. If you want to also recompute the tally (i.e., tabulate the
        encrypted ballots) and verify every individual ballot proof, then set
        `recheck_ballots_and_tallies` to True.
        """

        wrapped_func = functools.partial(
            verify_tally_selection_proof,
            self.context.elgamal_public_key,
            self.context.crypto_extended_base_hash,
        )
        start = timer()

        inputs = self.tally.map.values()
        if verbose:  # pragma: no cover
            inputs = tqdm(list(inputs), "Tally proof")

        result: List[bool] = ([wrapped_func(x)
                               for x in inputs] if pool is None else pool.map(
                                   func=wrapped_func, iterable=inputs))
        end = timer()
        log_and_print(f"Verification time: {end - start: .3f} sec", verbose)
        log_and_print(
            f"Verification rate: {len(self.tally.map.keys()) / (end - start): .3f} selection/sec",
            verbose,
        )

        if False in result:
            return False

        if recheck_ballots_and_tallies:
            # first, try to load all the ballots and make sure there are no hash errors
            if not self.all_files_present():
                return False

            # next, check each individual ballot's proofs; in this case, we're going to always
            # show the progress bar, even if verbose is false
            ballot_iter = tqdm(self.encrypted_ballots, desc="Ballot proofs")
            ballot_func = functools.partial(verify_ballot_proof, self.context)

            ballot_start = timer()
            ballot_result: List[bool] = ([ballot_func(x)
                                          for x in ballot_iter] if pool is None
                                         else pool.map(func=ballot_func,
                                                       iterable=ballot_iter))

            ballot_end = timer()
            log_and_print(
                f"Ballot verification rate: {len(self.encrypted_ballots) / (ballot_end - ballot_start): .3f} ballot/sec",
                verbose,
            )

            if False in ballot_result:
                return False

            log_and_print("Recomputing tallies:", verbose)
            recomputed_tally = fast_tally_ballots(self.encrypted_ballots, pool)
            tally_success = tallies_match(self.tally.to_tally_map(),
                                          recomputed_tally)

            if not tally_success:
                return False

        return True
Example #15
def ray_tally_everything(
    cvrs: DominionCSV,
    verbose: bool = True,
    use_progressbar: bool = True,
    date: Optional[datetime] = None,
    seed_hash: Optional[ElementModQ] = None,
    master_nonce: Optional[ElementModQ] = None,
    secret_key: Optional[ElementModQ] = None,
    root_dir: Optional[str] = None,
) -> "RayTallyEverythingResults":
    """
    This top-level function takes a collection of Dominion CVRs and produces everything that
    we might want for arlo-e2e: a list of encrypted ballots, their encrypted and decrypted tally,
    and proofs of the correctness of the whole thing. The election `secret_key` is an optional
    parameter. If absent, a random keypair is generated and used. Similarly, if a `seed_hash` or
    `master_nonce` is not provided, random ones are generated and used.

    For parallelism, Ray is used. Make sure you've called `ray.init()` or `ray_localhost_init()`
    before calling this.

    If `root_dir` is specified, then the tally is written out to the specified directory, and
    the resulting `RayTallyEverythingResults` object will support the methods that allow those
    ballots to be read back in again. Conversely, if `root_dir` is `None`, then nothing is
    written to disk, and the result will not have access to individual ballots.
    """

    rows, cols = cvrs.data.shape

    ray_wait_for_workers(min_workers=2)

    if date is None:
        date = datetime.now()

    if root_dir is not None:
        mkdir_helper(root_dir, num_retries=NUM_WRITE_RETRIES)
        r_manifest_aggregator = ManifestAggregatorActor.remote(
            root_dir)  # type: ignore
    else:
        r_manifest_aggregator = None

    r_root_dir = ray.put(root_dir)

    start_time = timer()

    # Performance note: by using to_election_description_ray rather than to_election_description, we're
    # only getting back a list of dictionaries rather than a list of PlaintextBallots. We're pushing that
    # work out into the nodes, where it will run in parallel. The BallotPlaintextFactory wraps up all
    # the (immutable) state necessary to convert from these dicts to PlaintextBallots and is meant to
    # be sent to every node in the cluster.

    ed, bpf, ballot_dicts, id_map = cvrs.to_election_description_ray(date=date)
    setup_time = timer()
    num_ballots = len(ballot_dicts)
    assert num_ballots > 0, "can't have zero ballots!"
    log_and_print(
        f"ElectionGuard setup time: {setup_time - start_time: .3f} sec, {num_ballots / (setup_time - start_time):.3f} ballots/sec"
    )

    keypair = (elgamal_keypair_random() if secret_key is None else
               elgamal_keypair_from_secret(secret_key))
    assert keypair is not None, "unexpected failure with keypair computation"
    secret_key, public_key = keypair

    cec = make_ciphertext_election_context(
        number_of_guardians=1,
        quorum=1,
        elgamal_public_key=public_key,
        description_hash=ed.crypto_hash(),
    )
    r_cec = ray.put(cec)

    ied = InternalElectionDescription(ed)
    r_ied = ray.put(ied)

    if seed_hash is None:
        seed_hash = rand_q()
    r_seed_hash = ray.put(seed_hash)
    r_keypair = ray.put(keypair)

    r_ballot_plaintext_factory = ray.put(bpf)

    if master_nonce is None:
        master_nonce = rand_q()

    nonces = Nonces(master_nonce)
    r_nonces = ray.put(nonces)
    nonce_indices = range(num_ballots)

    inputs = list(zip(ballot_dicts, nonce_indices))

    batches = shard_list_uniform(inputs, BATCH_SIZE)
    num_batches = len(batches)
    log_and_print(
        f"Launching Ray.io remote encryption! (number of batches: {num_batches})"
    )

    start_time = timer()

    progressbar = (ProgressBar({
        "Ballots": num_ballots,
        "Tallies": num_ballots,
        "Iterations": 0,
        "Batch": 0,
    }) if use_progressbar else None)
    progressbar_actor = progressbar.actor if progressbar is not None else None

    batch_tallies: List[ObjectRef] = []
    for batch in batches:
        if progressbar_actor:
            progressbar_actor.update_completed.remote("Batch", 1)

        num_ballots_in_batch = len(batch)
        sharded_inputs = shard_list_uniform(batch, BALLOTS_PER_SHARD)
        num_shards = len(sharded_inputs)

        partial_tally_refs = [
            r_encrypt_and_write.remote(
                r_ied,
                r_cec,
                r_seed_hash,
                r_root_dir,
                r_manifest_aggregator,
                progressbar_actor,
                r_ballot_plaintext_factory,
                r_nonces,
                right_tuple_list(shard),
                *(left_tuple_list(shard)),
            ) for shard in sharded_inputs
        ]

        # log_and_print("Remote tallying.")
        btally = ray_tally_ballots(partial_tally_refs, BALLOTS_PER_SHARD,
                                   progressbar)
        batch_tallies.append(btally)

    # Each batch ultimately yields one partial tally; we add these up here at the
    # very end. If we have a million ballots and have batches of 10k ballots, this
    # would mean we'd have only 100 partial tallies. So, what's here works just fine.
    # If we wanted, we could certainly burn some scalar time and keep a running,
    # singular, partial tally. It's probably more important to push onward to the
    # next batch, so we can do as much work in parallel as possible.

    if len(batch_tallies) > 1:
        tally = ray.get(ray_tally_ballots(batch_tallies, 10, progressbar))
    else:
        tally = ray.get(batch_tallies[0])

    if progressbar:
        progressbar.close()

    assert tally is not None, "tally failed!"

    log_and_print("Tally decryption.")
    decrypted_tally: DECRYPT_TALLY_OUTPUT_TYPE = ray_decrypt_tally(
        tally, r_cec, r_keypair, seed_hash)

    log_and_print("Validating tally.")

    # Sanity-checking logic: make sure we don't have any unexpected keys, and that the decrypted totals
    # match up with the columns in the original plaintext data.
    tally_keys = set(decrypted_tally.keys())
    expected_keys = set(id_map.keys())

    assert tally_keys.issubset(
        expected_keys
    ), f"bad tally keys (actual keys: {sorted(tally_keys)}, expected keys: {sorted(expected_keys)})"

    for obj_id in decrypted_tally.keys():
        cvr_sum = int(cvrs.data[id_map[obj_id]].sum())
        decryption, proof = decrypted_tally[obj_id]
        assert cvr_sum == decryption, f"decryption failed for {obj_id}"

    final_manifest: Optional[Manifest] = None

    if root_dir is not None:
        final_manifest = ray.get(r_manifest_aggregator.result.remote())
        assert isinstance(
            final_manifest,
            Manifest), "type error: bad result from manifest aggregation"

    # Assemble the data structure that we're returning. Having nonces in the ciphertext makes these
    # structures sensitive for writing out to disk, but otherwise they're ready to go.
    log_and_print("Constructing results.")
    reported_tally: Dict[str, SelectionInfo] = {
        k: SelectionInfo(
            object_id=k,
            encrypted_tally=tally[k],
            # we need to forcibly convert mpz to int here to make serialization work properly
            decrypted_tally=int(decrypted_tally[k][0]),
            proof=decrypted_tally[k][1],
        )
        for k in tally.keys()
    }

    tabulate_time = timer()

    log_and_print(
        f"Encryption and tabulation: {rows} ballots, {rows / (tabulate_time - start_time): .3f} ballot/sec",
        verbose,
    )

    return RayTallyEverythingResults(
        metadata=cvrs.metadata,
        cvr_metadata=cvrs.dataframe_without_selections(),
        election_description=ed,
        num_ballots=rows,
        manifest=final_manifest,
        tally=SelectionTally(reported_tally),
        context=cec,
    )
Example #16
File: tally.py Project: nealmcb/arlo-e2e
def fast_tally_everything(
    cvrs: DominionCSV,
    pool: Optional[Pool] = None,
    verbose: bool = True,
    date: Optional[datetime] = None,
    seed_hash: Optional[ElementModQ] = None,
    master_nonce: Optional[ElementModQ] = None,
    secret_key: Optional[ElementModQ] = None,
    use_progressbar: bool = True,
) -> FastTallyEverythingResults:
    """
    This top-level function takes a collection of Dominion CVRs and produces everything that
    we might want for arlo-e2e: a list of encrypted ballots, their encrypted and decrypted tally,
    and proofs of the correctness of the whole thing. The election `secret_key` is an optional
    parameter. If absent, a random keypair is generated and used. Similarly, if a `seed_hash` or
    `master_nonce` is not provided, random ones are generated and used.

    For parallelism, a `multiprocessing.pool.Pool` may be provided, and should result in significant
    speedups on multicore computers. If absent, the computation will proceed sequentially.
    """
    rows, cols = cvrs.data.shape

    if date is None:
        date = datetime.now()

    parse_time = timer()
    log_and_print(f"Rows: {rows}, cols: {cols}", verbose)

    ed, ballots, id_map = cvrs.to_election_description(date=date)
    assert len(ballots) > 0, "can't have zero ballots!"

    keypair = (elgamal_keypair_random() if secret_key is None else
               elgamal_keypair_from_secret(secret_key))
    assert keypair is not None, "unexpected failure with keypair computation"
    secret_key, public_key = keypair

    # This computation exists only to cause side-effects in the DLog engine, so the lame nonce is not an issue.
    assert len(ballots) == get_optional(
        elgamal_encrypt(m=len(ballots),
                        nonce=int_to_q_unchecked(3),
                        public_key=public_key)).decrypt(
                            secret_key), "got wrong ElGamal decryption!"

    dlog_prime_time = timer()
    log_and_print(
        f"DLog prime time (n={len(ballots)}): {dlog_prime_time - parse_time: .3f} sec",
        verbose,
    )

    cec = make_ciphertext_election_context(
        number_of_guardians=1,
        quorum=1,
        elgamal_public_key=public_key,
        description_hash=ed.crypto_hash(),
    )

    ied = InternalElectionDescription(ed)

    # REVIEW THIS: is this cryptographically sound? Is the seed_hash properly a secret? Should
    # it go in the output? The nonces are clearly secret. If you know them, you can decrypt.
    if seed_hash is None:
        seed_hash = rand_q()
    if master_nonce is None:
        master_nonce = rand_q()
    nonces: List[ElementModQ] = Nonces(master_nonce)[0:len(ballots)]

    # even if verbose is false, we still want to see the progress bar for the encryption
    cballots = fast_encrypt_ballots(ballots,
                                    ied,
                                    cec,
                                    seed_hash,
                                    nonces,
                                    pool,
                                    use_progressbar=use_progressbar)
    eg_encrypt_time = timer()

    log_and_print(
        f"Encryption time: {eg_encrypt_time - dlog_prime_time: .3f} sec",
        verbose)
    log_and_print(
        f"Encryption rate: {rows / (eg_encrypt_time - dlog_prime_time): .3f} ballot/sec",
        verbose,
    )

    tally: TALLY_TYPE = fast_tally_ballots(cballots, pool)
    eg_tabulate_time = timer()

    log_and_print(
        f"Tabulation time: {eg_tabulate_time - eg_encrypt_time: .3f} sec",
        verbose)
    log_and_print(
        f"Tabulation rate: {rows / (eg_tabulate_time - eg_encrypt_time): .3f} ballot/sec",
        verbose,
    )
    log_and_print(
        f"Encryption and tabulation: {rows} ballots / {eg_tabulate_time - dlog_prime_time: .3f} sec = {rows / (eg_tabulate_time - dlog_prime_time): .3f} ballot/sec",
        verbose,
    )

    assert tally is not None, "tally failed!"

    if verbose:  # pragma: no cover
        print("Decryption & Proofs: ")
    decrypted_tally: DECRYPT_TALLY_OUTPUT_TYPE = fast_decrypt_tally(
        tally, cec, keypair, seed_hash, pool, verbose)
    eg_decryption_time = timer()
    log_and_print(
        f"Decryption time: {eg_decryption_time - eg_tabulate_time: .3f} sec",
        verbose)
    log_and_print(
        f"Decryption rate: {len(decrypted_tally.keys()) / (eg_decryption_time - eg_tabulate_time): .3f} selection/sec",
        verbose,
    )

    # Sanity-checking logic: make sure we don't have any unexpected keys, and that the decrypted totals
    # match up with the columns in the original plaintext data.
    for obj_id in decrypted_tally.keys():
        assert obj_id in id_map, "object_id in results that we don't know about!"
        cvr_sum = int(cvrs.data[id_map[obj_id]].sum())
        decryption, proof = decrypted_tally[obj_id]
        assert cvr_sum == decryption, f"decryption failed for {obj_id}"

    # Assemble the data structure that we're returning. Having nonces in the ciphertext makes these
    # structures sensitive for writing out to disk, but otherwise they're ready to go.
    reported_tally: Dict[str, SelectionInfo] = {
        k: SelectionInfo(
            object_id=k,
            encrypted_tally=tally[k],
            # we need to forcibly convert mpz to int here to make serialization work properly
            decrypted_tally=int(decrypted_tally[k][0]),
            proof=decrypted_tally[k][1],
        )
        for k in tally.keys()
    }

    # strips the ballots of their nonces, which is important because those could allow for decryption
    accepted_ballots = [ciphertext_ballot_to_accepted(x) for x in cballots]

    return FastTallyEverythingResults(
        metadata=cvrs.metadata,
        cvr_metadata=cvrs.dataframe_without_selections(),
        election_description=ed,
        encrypted_ballot_memos={
            ballot.object_id: make_memo_value(ballot)
            for ballot in accepted_ballots
        },
        tally=SelectionTally(reported_tally),
        context=cec,
    )
Example #17
def read_dominion_csv(file: Union[str, StringIO]) -> Optional[DominionCSV]:
    """
    Given a filename of a Dominion CSV (or a StringIO buffer with the same data), tries
    to read it. If successful, you get back a named-tuple which describes the election.

    The contest map is a dictionary. The keys are the titles of the contests, and the
    values are a second level of dictionary, mapping from the name of each choice to
    the ultimate string that's used as a column identifier in the Pandas dataframe.

    """
    try:
        df = pd.read_csv(
            file,
            header=[0, 1, 2, 3],
            quoting=csv.QUOTE_MINIMAL,
            sep=",",
            engine="python",
        )
    except FileNotFoundError:
        return None
    except pd.errors.ParserError:
        return None

    # TODO: At this point, we know the file is a valid CSV and we're *assuming* it's a valid Dominion file.
    #   We shouldn't make that assumption, but checking for it would be really tricky.

    filtered_columns = [[
        fix_strings(e) for e in c
        if (not e.startswith("Unnamed:") and not e == '""')
    ] for c in df.columns]
    election_name = filtered_columns[0][0]

    # The first two columns have the election name and a version number in them, so we have to treat those specially,
    # otherwise, we're looking for columns with only one thing in them, which says that they're not a contest (with
    # choices) but instead they're one of the metadata columns.
    ballot_metadata_fields: List[str] = (
        filtered_columns[0][1:] + filtered_columns[1][1:] +
        [x[0] for x in filtered_columns[2:] if len(x) == 1])

    df = df.applymap(fix_strings)
    column_names = [
        filtered_columns[0][1:],
        filtered_columns[1][1:],
    ] + filtered_columns[2:]

    # new_column_names, max_votes_for_map = _fixup_column_names(column_names)

    max_votes_for_map: Dict[str, int] = {}
    vote_for_n_pattern = re.compile(r"\s*\(Vote For=(\d+)\)$")
    new_column_names: List[str] = []
    contests = []
    selection_uid_iter = UidMaker("s")

    # We might have a case where two candidates in the same contest have identical
    # names. This particularly occurs with "pick k of n" contests, which might have
    # k write-in slots. Since we want candidate names within a contest to be unique
    # (we use them as dictionary keys elsewhere in our code), our solution is to
    # append "(2)", "(3)", etc.

    # Since write-ins or other repeats (e.g., "FOR" / "AGAINST" in referenda) might
    # happen in multiple races, we only need the names to be unique within a specific
    # contest. Thus, we've got the dictionary below to track everything.

    # Meanwhile, we're also learning the "k" in "k of n" by parsing it straight
    # out of the contest name.

    # Yes, this code is complex, but then so is the file format we're parsing.

    seen_candidate: Dict[Tuple[str, str], int] = {}

    for column in column_names:
        vote_for_n = 1  # until proven otherwise
        column = [str(x) for x in column]  # force everything to be a string
        title = column[0]
        vote_for_n_match: Optional[re.Match] = vote_for_n_pattern.search(title)

        if vote_for_n_match is not None:
            vote_for_n = int(vote_for_n_match.group(1))
            # chop off the "(Vote For=N)" part
            title = title[0:vote_for_n_match.span()[0]]

        max_votes_for_map[title] = vote_for_n
        new_column = [title] + column[1:]

        if len(column) > 1:
            candidate = column[1]

            if (title, candidate) in seen_candidate:
                seen_candidate[(title, candidate)] += 1
                new_column = (
                    [title] +
                    [f"{candidate} ({seen_candidate[(title, candidate)]})"] +
                    column[2:])
            else:
                seen_candidate[(title, candidate)] = 1

            contests.append(new_column)

        new_column_names.append(" | ".join(new_column))

    # Now we're going to extract a mapping from contest titles to all the choices.
    contest_keys = set()

    all_parties: Set[str] = set()
    contest_map_builder: Dict[str, List[SelectionMetadata]] = {}
    contest_key_to_title: Dict[str, str] = {}
    contest_titles: List[str] = []
    for contest in contests:
        title = contest[0]
        candidate = contest[1]
        party = fix_party_string(contest[2]) if len(contest) > 2 else ""

        if party not in all_parties and party != "":
            all_parties.add(party)

        if party != "":
            key = " | ".join([title, candidate, party])
        else:
            key = " | ".join([title, candidate])

        contest_keys.add(key)

        if title not in contest_map_builder:
            contest_map_builder[title] = []
            contest_titles.append(title)

        # goes from ["Representative - District 1"] to a list of SelectionMetadata objects
        uid_int, uid_str = selection_uid_iter.next_int()
        metadata = SelectionMetadata(
            object_id=uid_str,
            sequence_number=uid_int,
            contest_name=title,
            choice_name=candidate,
            party_name=party,
        )
        contest_map_builder[title].append(metadata)

        # goes from "Representative - District 1 | Alice | DEM" to "Representative - District 1"
        contest_key_to_title[metadata.to_string()] = title

    df.columns = new_column_names

    df["Guid"] = df.apply(
        lambda r: dominion_row_to_uid(r, election_name, ballot_metadata_fields
                                      ),
        axis=1,
    )

    # If there are any duplicated metadata rows, something is really wrong with the data.
    # The "Guid" concatenates all the metadata, so it's perfect for this sanity check.
    num_duplicates = df["Guid"].duplicated().sum()
    if num_duplicates > 0:
        log_and_print(
            f"Error: {num_duplicates} duplicated metadata rows found")
        return None

    # If the "BallotType" column is missing entirely, this isn't a CVR file we can use.
    if "BallotType" not in df:
        return None

    # If the election official put numbers in as their ballot types, that's going to cause type
    # errors, because we really want to deal with them as strings.
    df["BallotType"] = df["BallotType"].apply(lambda s: str(s))

    # there's probably an easier way to do this, but it does what we want
    ballot_uid_iter = UidMaker("b")
    df["BallotId"] = df.apply(
        lambda r: ballot_uid_iter.next(),
        axis=1,
    )

    ballotstyle_uids = UidMaker("ballotstyle")
    all_ballot_types = sorted(set(df["BallotType"]))
    ballot_type_to_bsid = {
        bt: ballotstyle_uids.next()
        for bt in all_ballot_types
    }

    contest_map: CONTEST_MAP = {
        k: set(contest_map_builder[k])
        for k in contest_map_builder.keys()
    }
    style_map: STYLE_MAP = {}

    # extract a list of dictionaries that have two keys: BallotType and BallotId
    ballot_id_and_types: List[Dict[str,
                                   str]] = df[["BallotType", "BallotId"
                                               ]].to_dict(orient="records")

    # boil this down to a dictionary from BallotId to BallotType
    ballot_id_to_ballot_type: Dict[str, str] = {
        elem["BallotId"]: elem["BallotType"]
        for elem in ballot_id_and_types
    }

    # We're computing a set-union of all the non-empty contest fields we find, in any ballot
    # sharing a given BallotType setting, i.e., we're inferring which contests are actually
    # a part of each BallotType.

    # Potential degenerate result: in a race with very few ballots cast, it's conceivable that
    # every single ballot will undervote an entire contest. In this specific circumstance,
    # the style map will be "wrong", which would mean that that entire contest would be
    # completely missing from subsequent e2e crypto results. Hopefully, actual Dominion CVRs
    # will have zeros rather than blank cells to represent these undervotes, and then this case
    # will never occur. Otherwise, it's unclear how we'd ever be able to distinguish between
    # a contest that's completely undervoted versus a contest that's not part of a ballot style.

    #  For each ballot style:
    #    - fetch all the rows of a given ballot type (e.g., b24 = df[df['BallotType'] == "Ballot 24 - Type 24"])
    #    - convert to true/false based on whether we have "not a number" and "add" (e.g., totals = b24.notna().sum())
    #    - add up the totals for each choice
    #    - if that contest_total is non-zero, then it's a contest that's included in the race, otherwise not

    # mapping from the name of a contest to a list of all the columns names that have selections for that contest
    contest_choices: Dict[str, List[str]] = {
        contest: [selection.to_string() for selection in contest_map[contest]]
        for contest in contest_map.keys()
    }

    for bt in all_ballot_types:
        # Expanded math, useful for debugging:

        # ballots_of_bt = df[df["BallotType"] == bt]
        # column_trues = ballots_of_bt.notna()
        # column_true_sums = column_trues.sum()

        # All in one line, might run faster with Modin's query optimizer?
        column_true_sums = df[df["BallotType"] == bt].notna().sum()

        sums_per_contest = {
            contest: column_true_sums[contest_choices[contest]].sum()
            for contest in contest_choices.keys()
        }
        non_zero_contests = {
            contest
            for contest in contest_choices.keys()
            if sums_per_contest[contest] > 0
        }

        style_map[bt] = non_zero_contests

    return DominionCSV(
        ElectionMetadata(
            fix_strings(election_name),
            ballot_type_to_bsid,
            ballot_id_to_ballot_type,
            all_parties,
            style_map,
            contest_map,
            max_votes_for_map,
            contest_titles,
        ),
        df,
        ballot_metadata_fields + ["Guid", "BallotId"],
    )
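A short usage sketch: `read_dominion_csv` accepts either a filename or a `StringIO` buffer and returns `None` on any parse failure, so callers should check the result before using it (the CSV filename below is hypothetical):

from io import StringIO

cvrs = read_dominion_csv("dominion_cvrs.csv")  # or read_dominion_csv(StringIO(csv_text))
if cvrs is None:
    raise RuntimeError("not a readable Dominion CSV")

rows, cols = cvrs.data.shape
print(f"parsed {rows} ballot rows with {cols} columns")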
Example #18
def r_encrypt_and_write(
    ied: InternalElectionDescription,
    cec: CiphertextElectionContext,
    seed_hash: ElementModQ,
    root_dir: Optional[str],
    manifest_aggregator: Optional[ActorHandle],
    progressbar_actor: Optional[ActorHandle],
    bpf: BallotPlaintextFactory,
    nonces: Nonces,
    nonce_indices: List[int],
    *plaintext_ballot_dicts: Dict[str, Any],
) -> Optional[TALLY_TYPE]:  # pragma: no cover
    """
    Remotely encrypts a list of ballots and their associated nonces. If a `root_dir`
    is specified, the encrypted ballots are written to disk; otherwise there's no disk
    activity. If the ballots were written, the `manifest_aggregator` actor will be
    notified. Returns a "partial tally" of the encrypted ballots, or `None` on failure.
    """

    try:
        manifest = make_fresh_manifest(
            root_dir) if root_dir is not None else None

        num_ballots = len(plaintext_ballot_dicts)
        assert (len(nonce_indices) == num_ballots
                ), "mismatching numbers of nonces and ballots!"
        assert num_ballots > 0, "need at least one ballot"

        ptally_final: Optional[TALLY_TYPE] = None
        for i in range(0, num_ballots):
            pballot = bpf.row_to_plaintext_ballot(plaintext_ballot_dicts[i])
            cballot = ciphertext_ballot_to_accepted(
                get_optional(
                    encrypt_ballot(
                        pballot,
                        ied,
                        cec,
                        seed_hash,
                        nonces[nonce_indices[i]],
                        should_verify_proofs=False,
                    )))
            if manifest is not None:
                manifest.write_ciphertext_ballot(cballot,
                                                 num_retries=NUM_WRITE_RETRIES)

            if progressbar_actor is not None:
                progressbar_actor.update_completed.remote("Ballots", 1)

            ptally = ciphertext_ballot_to_dict(cballot)
            ptally_final = (sequential_tally([ptally_final, ptally])
                            if ptally_final else ptally)

            if progressbar_actor is not None:
                progressbar_actor.update_completed.remote("Tallies", 1)

        if manifest is not None and manifest_aggregator is not None:
            manifest_aggregator.add.remote(manifest)

        return ptally_final
    except Exception as e:
        log_and_print(f"Unexpected exception in r_encrypt_and_write: {e}",
                      True)
        return None
Example #19
    def test_end_to_end_publications(self, input: str, check_proofs: bool,
                                     keypair: ElGamalKeyPair) -> None:
        coverage.process_startup(
        )  # necessary for coverage testing to work in parallel
        self.removeTree(
        )  # if there's anything leftover from a prior run, get rid of it

        cvrs = read_dominion_csv(StringIO(input))
        self.assertIsNotNone(cvrs)

        _, ballots, _ = cvrs.to_election_description()
        assert len(ballots) > 0, "can't have zero ballots!"

        results = fast_tally_everything(cvrs,
                                        self.pool,
                                        secret_key=keypair.secret_key,
                                        verbose=True)

        self.assertTrue(results.all_proofs_valid(self.pool))

        # dump files out to disk
        write_fast_tally(results, TALLY_TESTING_DIR)
        log_and_print(
            "tally_testing written, proceeding to read it back in again")

        # now, read it back again!
        results2 = load_fast_tally(
            TALLY_TESTING_DIR,
            check_proofs=check_proofs,
            pool=self.pool,
            verbose=True,
            recheck_ballots_and_tallies=True,
        )
        self.assertIsNotNone(results2)

        log_and_print("tally_testing got non-null result!")

        self.assertTrue(
            _list_eq(results.encrypted_ballots, results2.encrypted_ballots))
        self.assertTrue(results.equivalent(results2, keypair, self.pool))

        # Make sure there's an index.html file; throws an exception if it's missing
        self.assertIsNotNone(stat(path.join(TALLY_TESTING_DIR, "index.html")))

        # And lastly, while we're here, we'll use all this machinery to exercise the ballot decryption
        # read/write facilities.

        ied = InternalElectionDescription(results.election_description)

        log_and_print("decrypting one more time")
        pballots = decrypt_ballots(
            ied,
            results.context.crypto_extended_base_hash,
            keypair,
            self.pool,
            results.encrypted_ballots,
        )
        self.assertEqual(len(pballots), len(results.encrypted_ballots))
        self.assertNotIn(None, pballots)

        # for speed, we're only going to do this for the first ballot, not all of them
        pballot = pballots[0]
        eballot = results.encrypted_ballots[0]
        bid = pballot.ballot.object_id
        self.assertTrue(
            verify_proven_ballot_proofs(
                results.context.crypto_extended_base_hash,
                keypair.public_key,
                eballot,
                pballot,
            ))
        write_proven_ballot(pballot, DECRYPTED_DIR)
        self.assertTrue(exists_proven_ballot(bid, DECRYPTED_DIR))
        self.assertFalse(exists_proven_ballot(bid + "0", DECRYPTED_DIR))
        self.assertEqual(pballot, load_proven_ballot(bid, DECRYPTED_DIR))

        self.removeTree()  # clean up our mess