Example #1
    async def run_async(
        self,
        pc_instance: PrivateComputationInstance,
        server_ips: Optional[List[str]] = None,
    ) -> PrivateComputationInstance:
        """Runs the pcf2.0 based private aggregation stage

        Args:
            pc_instance: the private computation instance to run the aggregation stage with
            server_ips: used only by the partner role. These are the IP addresses of the publisher's containers.

        Returns:
            An updated version of pc_instance that stores an MPCInstance
        """

        # Prepare arguments for attribution game
        game_args = self._get_compute_metrics_game_args(pc_instance)

        # We do this check here because, depending on how game_args is generated, len(game_args) could differ,
        #   but we always expect len(server_ips) == len(game_args)
        if server_ips and len(server_ips) != len(game_args):
            raise ValueError(
                f"Unable to rerun MPC pcf2.0 based aggregation because there is a mismatch between the number of server ips given ({len(server_ips)}) and the number of containers ({len(game_args)}) to be spawned."
            )

        # Create and start MPC instance to run MPC compute
        logging.info(
            "Starting to run MPC instance for pcf2.0 based aggregation stage.")

        stage_data = PrivateComputationServiceData.PCF2_AGGREGATION_STAGE_DATA
        binary_name = OneDockerBinaryNames.PCF2_AGGREGATION.value
        game_name = checked_cast(str, stage_data.game_name)

        binary_config = self._onedocker_binary_config_map[binary_name]
        retry_counter_str = str(pc_instance.retry_counter)
        mpc_instance = await create_and_start_mpc_instance(
            mpc_svc=self._mpc_service,
            instance_id=pc_instance.instance_id + "_" +
            GameNames.PCF2_AGGREGATION.value + retry_counter_str,
            game_name=game_name,
            mpc_party=map_private_computation_role_to_mpc_party(
                pc_instance.role),
            num_containers=len(game_args),
            binary_version=binary_config.binary_version,
            server_ips=server_ips,
            game_args=game_args,
            container_timeout=self._container_timeout,
            repository_path=binary_config.repository_path,
        )

        logging.info(
            "MPC instance started running for pcf2.0 based aggregation stage.")

        # Push MPC instance to PrivateComputationInstance.instances and update PL Instance status
        pc_instance.instances.append(
            PCSMPCInstance.from_mpc_instance(mpc_instance))
        return pc_instance
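
A minimal invocation sketch for the stage above, assuming stage_service is an instance of the class defining run_async and pc_instance is an existing PrivateComputationInstance (both names are assumptions, not part of the example); the publisher passes no server_ips, while the partner would pass the publisher's container IP addresses.

import asyncio

# Hypothetical driver code: stage_service and pc_instance are assumed to exist already.
# Error handling and instance persistence are intentionally omitted.
updated_instance = asyncio.run(
    stage_service.run_async(pc_instance, server_ips=None)
)
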
Example #2
def get_updated_pc_status_mpc_game(
    private_computation_instance: PrivateComputationInstance,
    mpc_svc: MPCService,
) -> PrivateComputationInstanceStatus:
    """Updates the MPCInstances and gets latest PrivateComputationInstance status

    Arguments:
        private_computation_instance: The PC instance that is being updated
        mpc_svc: Used to update MPC instances stored on private_computation_instance

    Returns:
        The latest status for private_computation_instance
    """
    status = private_computation_instance.status
    if private_computation_instance.instances:
        # Only need to update the last stage/instance
        last_instance = private_computation_instance.instances[-1]
        if not isinstance(last_instance, MPCInstance):
            return status

        # The MPC service has to call update_instance to get the newest container
        # information in case the containers are still running
        private_computation_instance.instances[-1] = PCSMPCInstance.from_mpc_instance(
            mpc_svc.update_instance(last_instance.instance_id)
        )

        mpc_instance_status = private_computation_instance.instances[-1].status

        current_stage = private_computation_instance.current_stage
        if mpc_instance_status is MPCInstanceStatus.STARTED:
            status = current_stage.started_status
        elif mpc_instance_status is MPCInstanceStatus.COMPLETED:
            status = current_stage.completed_status
        elif mpc_instance_status in (
                MPCInstanceStatus.FAILED,
                MPCInstanceStatus.CANCELED,
        ):
            status = current_stage.failed_status

    return status
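
A minimal polling sketch built on top of get_updated_pc_status_mpc_game above; the names pc_instance, mpc_svc, and the 30-second poll interval are assumptions, and the terminal statuses come from the same current_stage attributes the function itself consults rather than hard-coded enum members.

import time

def wait_for_terminal_status(pc_instance, mpc_svc, poll_interval_s: float = 30.0):
    # Poll until the current stage reports either its completed or its failed status.
    terminal_statuses = {
        pc_instance.current_stage.completed_status,
        pc_instance.current_stage.failed_status,
    }
    status = get_updated_pc_status_mpc_game(pc_instance, mpc_svc)
    while status not in terminal_statuses:
        time.sleep(poll_interval_s)
        status = get_updated_pc_status_mpc_game(pc_instance, mpc_svc)
    return status
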
Example #3
    def update(self, instance: MPCInstance) -> None:
        self.repo.update(PCSMPCInstance.from_mpc_instance(instance))
Example #4
    async def run_async(
        self,
        pc_instance: PrivateComputationInstance,
        server_ips: Optional[List[str]] = None,
    ) -> PrivateComputationInstance:
        """Runs the private computation aggregate metrics stage

        Args:
            pc_instance: the private computation instance to run aggregate metrics with
            server_ips: used only by the partner role. These are the IP addresses of the publisher's containers.

        Returns:
            An updated version of pc_instance that stores an MPCInstance
        """

        num_shards = (pc_instance.num_mpc_containers *
                      pc_instance.num_files_per_mpc_container)

        # TODO T101225989: map aggregation_type from the compute stage to metrics_format_type
        metrics_format_type = (
            "lift" if pc_instance.game_type is PrivateComputationGameType.LIFT
            else "ad_object")

        binary_name = OneDockerBinaryNames.SHARD_AGGREGATOR.value
        binary_config = self._onedocker_binary_config_map[binary_name]

        # Get the output path of the previous stage depending on which stage flow we are using.
        # Using the string "PrivateComputationDecoupledStageFlow" instead of
        #   PrivateComputationDecoupledStageFlow.get_cls_name() to avoid a circular import error.
        if pc_instance.get_flow_cls_name in [
                "PrivateComputationDecoupledStageFlow",
                "PrivateComputationDecoupledLocalTestStageFlow",
        ]:
            input_stage_path = pc_instance.decoupled_aggregation_stage_output_base_path
        elif pc_instance.get_flow_cls_name in [
                "PrivateComputationPCF2StageFlow",
                "PrivateComputationPCF2LocalTestStageFlow",
        ]:
            input_stage_path = pc_instance.pcf2_aggregation_stage_output_base_path
        elif pc_instance.get_flow_cls_name == "PrivateComputationPCF2LiftStageFlow":
            input_stage_path = pc_instance.pcf2_lift_stage_output_base_path
        else:
            input_stage_path = pc_instance.compute_stage_output_base_path

        if self._log_cost_to_s3:
            run_name = pc_instance.instance_id

            if pc_instance.post_processing_data:
                pc_instance.post_processing_data.s3_cost_export_output_paths.add(
                    f"sa-logs/{run_name}_{pc_instance.role.value.title()}.json",
                )
        else:
            run_name = ""

        if self._is_validating:
            # num_containers_real_data is the number of containers processing real data.
            # Synthetic data is processed by a dedicated extra container, which is always
            # the last container, hence synthetic_data_shard_start_index = num_real_data_shards.
            # Each container, whether it processes real or synthetic data, handles the same
            # number of shards due to our resharding mechanism. num_shards is the total number
            # of shards, i.e. num_real_data_shards + num_synthetic_data_shards. Hence, given
            # num_containers_real_data and num_shards,
            # num_synthetic_data_shards = num_shards // (num_containers_real_data + 1).
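            # A worked example with purely illustrative numbers (an assumption, not taken
            # from any real run): with num_shards = 12 and num_containers_real_data = 3,
            #   num_synthetic_data_shards = 12 // (3 + 1) = 3
            #   num_real_data_shards = 12 - 3 = 9
            #   synthetic_data_shard_start_index = 9
            # i.e. the synthetic shards occupy the tail of the shard index range.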
            num_containers_real_data = pc_instance.num_pid_containers
            if num_containers_real_data is None:
                raise ValueError("num_containers_real_data is None")
            num_synthetic_data_shards = num_shards // (
                num_containers_real_data + 1)
            num_real_data_shards = num_shards - num_synthetic_data_shards
            synthetic_data_shard_start_index = num_real_data_shards

            # Create and start MPC instance for real data shards and synthetic data shards
            game_args = [
                {
                    "input_base_path": input_stage_path,
                    "num_shards": num_real_data_shards,
                    "metrics_format_type": metrics_format_type,
                    "output_path":
                    pc_instance.shard_aggregate_stage_output_path,
                    "first_shard_index": 0,
                    "threshold": pc_instance.k_anonymity_threshold,
                    "run_name": run_name,
                    "log_cost": self._log_cost_to_s3,
                },
                {
                    "input_base_path": input_stage_path,
                    "num_shards": num_synthetic_data_shards,
                    "metrics_format_type": metrics_format_type,
                    "output_path":
                    pc_instance.shard_aggregate_stage_output_path +
                    "_synthetic_data_shards",
                    "first_shard_index": synthetic_data_shard_start_index,
                    "threshold": pc_instance.k_anonymity_threshold,
                    "run_name": run_name,
                    "log_cost": self._log_cost_to_s3,
                },
            ]
            # We should only export visibility to scribe when it's set
            if pc_instance.result_visibility is not ResultVisibility.PUBLIC:
                result_visibility = int(pc_instance.result_visibility)
                for arg in game_args:
                    arg["visibility"] = result_visibility

            mpc_instance = await create_and_start_mpc_instance(
                mpc_svc=self._mpc_service,
                instance_id=pc_instance.instance_id + "_aggregate_shards" +
                str(pc_instance.retry_counter),
                game_name=GameNames.SHARD_AGGREGATOR.value,
                mpc_party=map_private_computation_role_to_mpc_party(
                    pc_instance.role),
                num_containers=2,
                binary_version=binary_config.binary_version,
                server_ips=server_ips,
                game_args=game_args,
                container_timeout=self._container_timeout,
            )
        else:
            # Create and start MPC instance
            game_args = [
                {
                    "input_base_path": input_stage_path,
                    "metrics_format_type": metrics_format_type,
                    "num_shards": num_shards,
                    "output_path":
                    pc_instance.shard_aggregate_stage_output_path,
                    "threshold": pc_instance.k_anonymity_threshold,
                    "run_name": run_name,
                    "log_cost": self._log_cost_to_s3,
                },
            ]
            # We should only export visibility to scribe when it's set
            if pc_instance.result_visibility is not ResultVisibility.PUBLIC:
                result_visibility = int(pc_instance.result_visibility)
                for arg in game_args:
                    arg["visibility"] = result_visibility

            mpc_instance = await create_and_start_mpc_instance(
                mpc_svc=self._mpc_service,
                instance_id=pc_instance.instance_id + "_aggregate_shards" +
                str(pc_instance.retry_counter),
                game_name=GameNames.SHARD_AGGREGATOR.value,
                mpc_party=map_private_computation_role_to_mpc_party(
                    pc_instance.role),
                num_containers=1,
                binary_version=binary_config.binary_version,
                server_ips=server_ips,
                game_args=game_args,
                container_timeout=self._container_timeout,
                repository_path=binary_config.repository_path,
            )
        # Push MPC instance to PrivateComputationInstance.instances and update PL Instance status
        pc_instance.instances.append(
            PCSMPCInstance.from_mpc_instance(mpc_instance))
        return pc_instance
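
A sketch of how Examples #2 and #4 could be chained, assuming stage_service exposes the run_async method above, mpc_svc is the corresponding MPCService, and server_ips is None for the publisher (all three names are assumptions): the aggregate metrics stage is started and the instance status is then refreshed with get_updated_pc_status_mpc_game from Example #2.

import asyncio

# Hypothetical orchestration code; retries and persistence are intentionally omitted.
pc_instance = asyncio.run(
    stage_service.run_async(pc_instance, server_ips=server_ips)
)
latest_status = get_updated_pc_status_mpc_game(pc_instance, mpc_svc)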