Пример #1
0
def validate_json_input(message_body, json_schema):
    """Check that a json string parses and conforms to a json schema.

    Args:
        message_body (str): the json string to check.
        json_schema (dict): the json schema the parsed message must satisfy.

    Raises:
        GenericNonFatalException: raised for a schema violation as well as for
                                  any other failure while parsing/validating,
                                  so the caller can move on to a new message
                                  without exiting the simapp.
    """

    def _system_error(template, cause):
        # Both failure modes are reported with the same 500 / system error pair;
        # only the message template differs.
        return GenericNonFatalException(error_msg=template.format(cause),
                                        error_code=SIMAPP_EVENT_ERROR_CODE_500,
                                        error_name=SIMAPP_EVENT_SYSTEM_ERROR)

    try:
        parsed_body = json.loads(message_body)
        validate(instance=parsed_body, schema=json_schema)
    except ValidationError as err:
        # Strictly speaking this is a client error (the caller sent a bad
        # message), but it is reported as a 500 because:
        # 1. the sender is the cloud platform team, and the cloud plus the
        #    simapp are treated as one internal system for deepracer.
        # 2. a contract with the cloud already exists and should be honored.
        raise _system_error("[json validation] Invalid json format: {}.", err)
    except Exception as err:
        raise _system_error(
            "[json validation] Something wrong when validating json format: {}.",
            err)
Пример #2
0
    def download_file(self, bucket, s3_key, local_path):
        """Download a file from s3 with retry logic.

        The download itself is delegated to ``self.exp_backoff`` using the
        client's ``download_file`` as the action. What happens on failure
        depends on ``self._log_and_cont``: when it is truthy every failure is
        converted into a GenericNonFatalException; otherwise a ClientError is
        re-raised to the caller and any other failure is handed to
        ``log_and_exit``.

        Args:
            bucket (str): s3 bucket
            s3_key (str): s3 key
            local_path (str): file local path

        Raises:
            GenericNonFatalException: on any download failure while
                                      ``self._log_and_cont`` is truthy.
            botocore.exceptions.ClientError: re-raised unchanged when
                                             ``self._log_and_cont`` is falsy.
        """

        try:
            self.exp_backoff(action_method=self.get_client().download_file,
                             Bucket=bucket,
                             Key=s3_key,
                             Filename=local_path)
        except botocore.exceptions.ClientError as ex:
            # It is possible that the file isn't there in which case we should
            # raise exception and let the client decide the next action
            if self._log_and_cont:
                # NOTE: the backslash continuation sits inside the string
                # literal, so the next line's leading spaces are part of the
                # emitted message.
                error_msg = "[s3] ClientError: Unable to download file from \
                            bucket {} with key {}. {}".format(
                    bucket, s3_key, ex)
                raise GenericNonFatalException(
                    error_msg=error_msg,
                    error_code=SIMAPP_EVENT_ERROR_CODE_400,
                    error_name=SIMAPP_EVENT_USER_ERROR)
            # bare raise keeps the original traceback intact
            raise
        except botocore.exceptions.ConnectTimeoutError as ex:
            if self._log_and_cont:
                error_msg = "[s3] ConnectTimeoutError: Unable to download file from \
                            bucket {} with key {}. {}".format(
                    bucket, s3_key, ex)
                raise GenericNonFatalException(
                    error_msg=error_msg,
                    error_code=SIMAPP_EVENT_ERROR_CODE_400,
                    error_name=SIMAPP_EVENT_USER_ERROR)
            log_and_exit(
                "Issue with your current VPC stack and IAM roles.\
                          You might need to reset your account resources: {}".
                format(ex), SIMAPP_S3_DATA_STORE_EXCEPTION,
                SIMAPP_EVENT_ERROR_CODE_400)
        except Exception as ex:
            # Anything else is reported as a system (500) error.
            if self._log_and_cont:
                error_msg = "[s3] SystemError: Unable to download file from \
                            bucket {} with key {}. {}".format(
                    bucket, s3_key, ex)
                raise GenericNonFatalException(
                    error_msg=error_msg,
                    error_code=SIMAPP_EVENT_ERROR_CODE_500,
                    error_name=SIMAPP_EVENT_SYSTEM_ERROR)
            log_and_exit(
                "Exception in downloading file (s3bucket: {} s3_key: {}): {}".
                format(bucket, s3_key, ex), SIMAPP_S3_DATA_STORE_EXCEPTION,
                SIMAPP_EVENT_ERROR_CODE_500)
Пример #3
0
    def wait_for_checkpoints(self, num_retry=10):
        """Block until there is a checkpoint in all of the checkpoint_dirs.

        Each attempt calls ``self.load_from_store()`` and then checks that the
        coach checkpoint state file of every entry in
        ``self.params.checkpoint_dict`` reads back as something other than
        None, sleeping SLEEP_SECONDS between attempts.

        Args:
            num_retry (int, optional): The number of retries to download the checkpoints.
                                       The total wait time is num_retry * SLEEP_SECONDS.
                                       Defaults to 10.

        Raises:
            GenericNonFatalException: if the checkpoints are still missing
                                      after all retries and
                                      ``self._log_and_cont`` is truthy.
        """

        def _all_checkpoints_present():
            # A list is built so read() is called for every checkpoint.
            # NOTE(review): a generator would short-circuit; confirm read()
            # is side-effect free before switching.
            return all([checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file.read() is not None
                        for _, checkpoint in self.params.checkpoint_dict.items()])

        for _ in range(num_retry):
            self.load_from_store()
            if _all_checkpoints_present():
                return
            time.sleep(SLEEP_SECONDS)

        # one last time
        if _all_checkpoints_present():
            return

        # Total time spent sleeping across all retries, as documented above.
        waited_seconds = num_retry * SLEEP_SECONDS
        if self._log_and_cont:
            error_msg = "[s3] Checkpoint never found, waited {} seconds.".format(
                waited_seconds)
            raise GenericNonFatalException(
                error_msg=error_msg,
                error_code=SIMAPP_EVENT_ERROR_CODE_500,
                error_name=SIMAPP_EVENT_SYSTEM_ERROR)
        log_and_exit(
            "Checkpoint never found, waited {} seconds.".format(waited_seconds),
            SIMAPP_SIMULATION_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500)
Пример #4
0
def process_car_control_msg(message):
    """Process the car control msg.

    The message is validated against CAR_CONTROL_INPUT_SCHEMA, parsed, and its
    control type is mapped to the matching car control topic.

    Args:
        message (str): The message to process for car control.

    Raises:
        GenericNonFatalException: If processing message has some problem
                                  (invalid json, unsupported control type,
                                  missing key, ...), it's not fatal and we
                                  ignore it and continue.

    Returns:
        topic (str): the control topic to publish to.
        payload (dict): the actual payload for the control topic.
    """
    try:
        validate_json_input(message, CAR_CONTROL_INPUT_SCHEMA)
        msg_json = json.loads(message)
        control_type = msg_json[WebRTCCarControl.TYPE.value]
        if control_type == WebRTCCarControl.STATUS.value:
            topic = CarControlTopic.STATUS_CTRL.value
        elif control_type == WebRTCCarControl.SPEED.value:
            topic = CarControlTopic.SPEED_CTRL.value
        else:
            # Fail with an explicit reason instead of an UnboundLocalError on
            # `topic`; the generic handler below wraps it as a non fatal error.
            raise ValueError(
                "Unsupported car control type: {}".format(control_type))
        # log latency
        log_latency(msg_json)
        return topic, msg_json[WebRTCCarControl.PAYLOAD.value]
    except GenericNonFatalException:
        # Already in the form the caller expects; propagate with its traceback.
        raise
    except Exception as ex:
        # NOTE: the backslash continuation sits inside the string literal, so
        # the next line's leading spaces are part of the emitted message.
        error_msg = "[webrtc msg process] Exception in processing \
                    webrtc message: {}, {}".format(message, ex)
        raise GenericNonFatalException(error_msg=error_msg,
                                       error_code=SIMAPP_EVENT_ERROR_CODE_500,
                                       error_name=SIMAPP_EVENT_SYSTEM_ERROR)
    def load_from_store(self, expected_checkpoint_number=-1):
        """Pull the tf model, rl coach .coach_checkpoint, .finished and .ready files from s3.

        For every entry of ``self.params.checkpoint_dict`` the tf model load is
        repeated until ``_load_tf_model_from_store`` reports success; only then
        are the .finished and .ready sync files loaded.

        Args:
            expected_checkpoint_number (int): checkpoint number forwarded to
            ``_load_tf_model_from_store``. For training, the rollout worker
            expects the latest file; for eval and validation it is always -1
            so the last/best tf model can be downloaded.

        Raises:
            GenericNonFatalException: on any failure while
                                      ``self._log_and_cont`` is truthy.
        """
        try:
            for _, agent_checkpoint in self.params.checkpoint_dict.items():
                # keep retrying the tf model / .coach_checkpoint load until it succeeds
                while not self._load_tf_model_from_store(
                    checkpoint=agent_checkpoint,
                    expected_checkpoint_number=expected_checkpoint_number,
                ):
                    pass
                # then fetch the .finished and .ready sync files, in that order
                self._load_syncfile_from_store(sync_file=agent_checkpoint.syncfile_finished)
                self._load_syncfile_from_store(sync_file=agent_checkpoint.syncfile_ready)
        except botocore.exceptions.ClientError as client_err:
            if not self._log_and_cont:
                log_and_exit(
                    "Unable to download checkpoint",
                    SIMAPP_S3_DATA_STORE_EXCEPTION,
                    SIMAPP_EVENT_ERROR_CODE_400,
                )
                return
            detail = "[s3] ClientError: Unable to download checkpoint. {}".format(client_err)
            raise GenericNonFatalException(
                error_msg=detail,
                error_code=SIMAPP_EVENT_ERROR_CODE_400,
                error_name=SIMAPP_EVENT_USER_ERROR,
            )
        except Exception as other_err:
            if not self._log_and_cont:
                log_and_exit(
                    "Exception in downloading checkpoint: {}".format(other_err),
                    SIMAPP_S3_DATA_STORE_EXCEPTION,
                    SIMAPP_EVENT_ERROR_CODE_500,
                )
                return
            detail = "[s3] SystemError: Unable to download checkpoint. {}".format(other_err)
            raise GenericNonFatalException(
                error_msg=detail,
                error_code=SIMAPP_EVENT_ERROR_CODE_500,
                error_name=SIMAPP_EVENT_SYSTEM_ERROR,
            )
    def _download_model_metadata(self):
        """Build the current racer's ModelMetadata from s3 and unpack its info.

        The s3 key is derived from the racer's input model key prefix and
        MODEL_METADATA_S3_POSTFIX; the sensor and version entries are then
        read from the metadata info.

        Raises:
            GenericNonFatalException: a non fatal exception (400 user error for
                                      a botocore ClientError, 500 system error
                                      for anything else) which the work loop
                                      is expected to catch before proceeding.

        Returns:
            sensors, version, model_metadata: The needed information from model metadata.
        """
        metadata_key = get_s3_key(
            self._current_racer.inputModel.s3KeyPrefix, MODEL_METADATA_S3_POSTFIX
        )
        try:
            model_metadata = ModelMetadata(
                bucket=self._current_racer.inputModel.s3BucketName,
                s3_key=metadata_key,
                region_name=self._region,
                local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(self._agent_name),
            )
            info = model_metadata.get_model_metadata_info()
            sensors, simapp_version = (
                info[ModelMetadataKeys.SENSOR.value],
                info[ModelMetadataKeys.VERSION.value],
            )
        except botocore.exceptions.ClientError as ex:
            # the backslash continuation below is part of the string literal
            failure = "[s3] Client Error: Failed to download model_metadata file: \
                        s3_bucket: {}, s3_key: {}, {}.".format(
                self._current_racer.inputModel.s3BucketName, metadata_key, ex
            )
            raise GenericNonFatalException(
                error_msg=failure,
                error_code=SIMAPP_EVENT_ERROR_CODE_400,
                error_name=SIMAPP_EVENT_USER_ERROR,
            )
        except Exception as ex:
            failure = "[s3] System Error: Failed to download model_metadata file: \
                        s3_bucket: {}, s3_key: {}, {}.".format(
                self._current_racer.inputModel.s3BucketName, metadata_key, ex
            )
            raise GenericNonFatalException(
                error_msg=failure,
                error_code=SIMAPP_EVENT_ERROR_CODE_500,
                error_name=SIMAPP_EVENT_SYSTEM_ERROR,
            )
        else:
            return sensors, simapp_version, model_metadata