Example #1
    def _renew_message_visibility(self, receipt_handle: str):
        interval = self.renewal_period
        new_timeout = self.visibility_timeout

        cur_time = time.time()
        while True:
            time.sleep((cur_time + interval) - time.time())
            cur_time += interval
            new_timeout += interval

            with self.receipt_handle_mutex:
                if receipt_handle in self.stop_renewal:
                    self.stop_renewal.remove(receipt_handle)
                    break

                try:
                    self.sqs_client.change_message_visibility(
                        QueueUrl=self.queue_url,
                        ReceiptHandle=receipt_handle,
                        VisibilityTimeout=new_timeout,
                    )
                except botocore.exceptions.ClientError as err:
                    error_code = err.response["Error"]["Code"]
                    if error_code == "InvalidParameterValue":
                        # unexpected; this error is thrown when attempting to renew a message that has been deleted
                        continue
                    elif error_code == "AWS.SimpleQueueService.NonExistentQueue":
                        # there may be a delay between the cron deleting the queue and this worker stopping
                        log.info("failed to renew message visibility because the queue was not found")
                    else:
                        self.stop_renewal.remove(receipt_handle)
                        raise err
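
The sleep call above, time.sleep((cur_time + interval) - time.time()), keeps renewals free of drift: each iteration sleeps until the next scheduled tick rather than for a fixed interval, so slow iterations don't shift the schedule. A standalone sketch of the same pattern (illustrative, not project code):

    import time

    def run_periodically(fn, interval: float):
        # sleep until the next tick so slow iterations don't accumulate drift
        next_tick = time.time()
        while True:
            next_tick += interval
            time.sleep(max(0.0, next_tick - time.time()))
            fn()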
Example #2
    def garbage_collect(
        self,
        exclude_disk_model_ids: Optional[List[str]] = None,
        dry_run: bool = False,
    ) -> Tuple[bool, List[str], List[str]]:
        """
        Removes stale in-memory and on-disk models based on the LRU policy.
        Also calls the "remove" callback before removing the models from this object. The callback must not raise any exceptions.

        Must be called with a write lock unless dry_run is set to true.

        Args:
            exclude_disk_model_ids: Model IDs to exclude from removal from disk. Necessary for locally-provided models.
            dry_run: Only test whether there are any models to remove. If set to true, this method can be called with a read lock instead.

        Returns:
            A 3-element tuple. The first element tells whether any models had to be collected. The 2nd and 3rd elements contain the model IDs that were removed from memory and disk respectively.
        """
        # avoid the mutable-default-argument pitfall
        if exclude_disk_model_ids is None:
            exclude_disk_model_ids = []

        collected = False
        if self._mem_cache_size <= 0 or self._disk_cache_size <= 0:
            return collected, [], []

        stale_mem_model_ids = self._lru_model_ids(
            self._mem_cache_size, filter_in_mem=True
        )
        stale_disk_model_ids = self._lru_model_ids(
            self._disk_cache_size - len(exclude_disk_model_ids), filter_in_mem=False
        )

        if self._remove_callback and not dry_run:
            self._remove_callback(stale_mem_model_ids)

        # don't delete excluded model IDs from disk
        stale_disk_model_ids = list(set(stale_disk_model_ids) - set(exclude_disk_model_ids))
        stale_disk_model_ids = stale_disk_model_ids[len(stale_disk_model_ids) - self._disk_cache_size:]

        if not dry_run:
            logger.info(
                f"unloading models {stale_mem_model_ids} from memory using the garbage collector"
            )
            logger.info(
                f"unloading models {stale_disk_model_ids} from disk using the garbage collector"
            )
            for model_id in stale_mem_model_ids:
                self.remove_model_by_id(model_id, mem=True, disk=False)
            for model_id in stale_disk_model_ids:
                self.remove_model_by_id(model_id, mem=False, disk=True)

        if len(stale_mem_model_ids) > 0 or len(stale_disk_model_ids) > 0:
            collected = True

        return collected, stale_mem_model_ids, stale_disk_model_ids
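
garbage_collect leans on _lru_model_ids to pick the least-recently-used model IDs that overflow a cache's capacity. That helper isn't shown here; a minimal sketch of the idea, assuming access timestamps are tracked per model ID (hypothetical; the real method also filters by in-memory/on-disk state):

    from typing import Dict, List

    def lru_overflow_ids(access_times: Dict[str, float], capacity: int) -> List[str]:
        # sort newest-first; everything past `capacity` is considered stale
        by_recency = sorted(access_times, key=access_times.get, reverse=True)
        return by_recency[capacity:]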
Example #3
    def start(
        self,
        message_fn: Callable[[Dict[str, Any]], None],
        message_failure_fn: Callable[[Dict[str, Any]], None],
        on_job_complete_fn: Optional[Callable[[Dict[str, Any]], None]] = None,
    ):
        no_messages_found_in_previous_iteration = False
        signal_handler = SignalHandler()

        while not signal_handler.received_signal():
            response = self.sqs_client.receive_message(
                QueueUrl=self.queue_url,
                MaxNumberOfMessages=1,
                WaitTimeSeconds=self.message_wait_time,
                VisibilityTimeout=self.visibility_timeout,
                MessageAttributeNames=["All"],
            )

            if len(response.get("Messages", [])) == 0:
                visible_messages, invisible_messages = self._get_total_messages_in_queue()
                if visible_messages + invisible_messages == 0:
                    if no_messages_found_in_previous_iteration and self.stop_if_no_messages:
                        log.info("no messages left in queue, exiting...")
                        return
                    no_messages_found_in_previous_iteration = True

                time.sleep(self.not_found_sleep_time)
                continue

            no_messages_found_in_previous_iteration = False
            message = response["Messages"][0]
            receipt_handle = message["ReceiptHandle"]

            renewer = threading.Thread(
                target=self._renew_message_visibility,
                args=(receipt_handle,),
                daemon=True,
            )
            renewer.start()

            if is_on_job_complete(message):
                self._handle_on_job_complete(message, on_job_complete_fn)
            else:
                self._handle_message(message, message_fn, message_failure_fn)
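
_get_total_messages_in_queue is referenced above but not shown. A plausible sketch using SQS queue attributes (the attribute names are real SQS API values; the method body itself is an assumption about the project's code):

    def _get_total_messages_in_queue(self):
        attrs = self.sqs_client.get_queue_attributes(
            QueueUrl=self.queue_url,
            AttributeNames=[
                "ApproximateNumberOfMessages",
                "ApproximateNumberOfMessagesNotVisible",
            ],
        )["Attributes"]
        visible = int(attrs["ApproximateNumberOfMessages"])
        invisible = int(attrs["ApproximateNumberOfMessagesNotVisible"])
        return visible, invisible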
Example #4
    def __init__(self, config):
        num_success = 0
        num_fail = 0
        for i in range(config["num_requests"]):
            if i > 0:
                time.sleep(config["sleep"])
            try:
                response = requests.post(config["endpoint"], json=config["data"])
            except Exception as e:
                num_fail += 1
                cortex_logger.error(
                    e,
                    extra={
                        "error": True,
                        "request_number": i,
                    },
                )
                continue
            if response.status_code == 200:
                num_success += 1
                cortex_logger.info(
                    "successful request",
                    extra={"request_success": True, "request_number": i},
                )
            else:
                num_fail += 1
                cortex_logger.error(
                    response.text,
                    extra={
                        "error": True,
                        "code": response.status_code,
                        "request_number": i,
                    },
                )

        cortex_logger.warning(
            "FINISHED",
            extra={
                "finished": True,
                "num_success": num_success,
                "num_fail": num_fail,
            },
        )
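
A config dict driving the loop above might look like this (values are illustrative only):

    config = {
        "endpoint": "http://localhost:8888",  # hypothetical API endpoint
        "data": {"msg": "hello"},
        "num_requests": 10,
        "sleep": 0.5,  # seconds between requests
    }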
Example #5
    def _remove_models(self, model_ids: List[str]) -> None:
        """
        Remove models from TFS.
        Must only be used when caching enabled.
        """
        logger.info(f"unloading models with model IDs {model_ids} from TFS")

        models = {}
        for model_id in model_ids:
            model_name, model_version = model_id.rsplit("-", maxsplit=1)
            models.setdefault(model_name, []).append(model_version)

        model_names = []
        model_versions = []
        for model_name, versions in models.items():
            model_names.append(model_name)
            model_versions.append(versions)

        self._client.remove_models(model_names, model_versions)
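
The grouping loop above turns flat model IDs into parallel name/version lists. A self-contained illustration of the same transformation:

    def group_model_ids(model_ids):
        models = {}
        for model_id in model_ids:
            name, version = model_id.rsplit("-", maxsplit=1)
            models.setdefault(name, []).append(version)
        return list(models.keys()), list(models.values())

    # group_model_ids(["iris-1", "iris-2", "resnet50-1"])
    # -> (["iris", "resnet50"], [["1", "2"], ["1"]])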
Example #6
    def _run_inference(self, model_input: Any, model_name: str,
                       model_version: str) -> dict:
        """
        When processes_per_replica = 1 and caching enabled, check/load model and make prediction.
        When processes_per_replica > 0 and caching disabled, attempt to make prediction regardless.

        Args:
            model_input: Input to the model.
            model_name: Name of the model, as it's specified in predictor:models:paths or in the other case as they are named on disk.
            model_version: Version of the model, as it's found on disk. Can also infer the version number from the "latest" version tag.

        Returns:
            The prediction.
        """

        tag = ""
        if model_version == "latest":
            tag = model_version

        if not self._caching_enabled:

            # determine model version
            if tag == "latest":
                versions = self._client.poll_available_model_versions(model_name)
                if len(versions) == 0:
                    raise UserException(
                        f"model '{model_name}' accessed with tag {tag} couldn't be found"
                    )
                model_version = str(max(map(int, versions)))

            return self._client.predict(model_input, model_name, model_version)

        if not self._multiple_processes and self._caching_enabled:

            # determine model version
            try:
                if tag == "latest":
                    model_version = self._get_latest_model_version_from_tree(
                        model_name, self._models_tree.model_info(model_name)
                    )
            except ValueError:
                # if model_name hasn't been found
                raise UserRuntimeException(
                    f"'{model_name}' model of tag {tag} wasn't found in the list of available models"
                )

            # grab shared access to model tree
            available_model = True
            logger.info(f"grabbing access to model {model_name} of version {model_version}")
            with LockedModelsTree(self._models_tree, "r", model_name, model_version):

                # check if the versioned model exists
                model_id = model_name + "-" + model_version
                if model_id not in self._models_tree:
                    available_model = False
                    logger.info(f"model {model_name} of version {model_version} is not available")
                    raise WithBreak

                # retrieve model tree's metadata
                upstream_model = self._models_tree[model_id]
                current_upstream_ts = int(upstream_model["timestamp"].timestamp())
                logger.info(f"model {model_name} of version {model_version} is available")

            if not available_model:
                if tag == "":
                    raise UserException(
                        f"model '{model_name}' of version '{model_version}' couldn't be found"
                    )
                raise UserException(
                    f"model '{model_name}' accessed with tag '{tag}' couldn't be found"
                )

            # grab shared access to models holder and retrieve model
            update_model = False
            prediction = None
            tfs_was_unresponsive = False
            with LockedModel(self._models, "r", model_name, model_version):
                logger.info(f"checking the {model_name} {model_version} status")
                status, local_ts = self._models.has_model(model_name, model_version)
                if status in ["not-available", "on-disk"] or (
                    status != "not-available" and local_ts != current_upstream_ts
                ):
                    logger.info(
                        f"model {model_name} of version {model_version} is not loaded (with status {status} or different timestamp)"
                    )
                    update_model = True
                    raise WithBreak

                # run prediction
                logger.info(f"run the prediction on model {model_name} of version {model_version}")
                self._models.get_model(model_name, model_version, tag)
                try:
                    prediction = self._client.predict(model_input, model_name, model_version)
                except grpc.RpcError:
                    # if other versions of the model are still being served, TFS is up and
                    # the error is unrelated to a restart, so re-raise it
                    if len(self._client.poll_available_model_versions(model_name)) > 0:
                        raise
                    tfs_was_unresponsive = True

            # remove model from disk and memory references if TFS gets unresponsive
            if tfs_was_unresponsive:
                with LockedModel(self._models, "w", model_name, model_version):
                    available_versions = self._client.poll_available_model_versions(model_name)
                    status, _ = self._models.has_model(model_name, model_version)
                    if not (status == "in-memory" and model_version not in available_versions):
                        raise WithBreak

                    logger.info(
                        f"removing model {model_name} of version {model_version} because TFS got unresponsive"
                    )
                    self._models.remove_model(model_name, model_version)

            # download, load into memory the model and retrieve it
            if update_model:
                # grab exclusive access to models holder
                with LockedModel(self._models, "w", model_name, model_version):

                    # check model status
                    status, local_ts = self._models.has_model(model_name, model_version)

                    # refresh disk model
                    if status == "not-available" or (
                        status in ["on-disk", "in-memory"] and local_ts != current_upstream_ts
                    ):
                        # unload model from TFS
                        if status == "in-memory":
                            try:
                                logger.info(
                                    f"unloading model {model_name} of version {model_version} from TFS"
                                )
                                self._models.unload_model(model_name, model_version)
                            except Exception:
                                logger.info(
                                    f"failed unloading model {model_name} of version {model_version} from TFS"
                                )
                                raise

                        # remove model from disk and references
                        if status in ["on-disk", "in-memory"]:
                            logger.info(
                                f"removing model references from memory and from disk for model {model_name} of version {model_version}"
                            )
                            self._models.remove_model(model_name, model_version)

                        # download model
                        if model_name not in self._spec_models.get_local_model_names():
                            logger.info(
                                f"downloading model {model_name} of version {model_version} from the {upstream_model['provider']} upstream"
                            )
                            date = self._models.download_model(
                                upstream_model["provider"],
                                upstream_model["bucket"],
                                model_name,
                                model_version,
                                upstream_model["path"],
                            )
                            if not date:
                                raise WithBreak
                            current_upstream_ts = int(date.timestamp())

                    # load model
                    try:
                        logger.info(
                            f"loading model {model_name} of version {model_version} into memory"
                        )
                        self._models.load_model(
                            model_name,
                            model_version,
                            current_upstream_ts,
                            [tag],
                            kwargs={
                                "model_name": model_name,
                                "model_version": model_version,
                                "signature_key": self._determine_model_signature_key(model_name),
                            },
                        )
                    except Exception as e:
                        raise UserRuntimeException(
                            f"failed (re-)loading model {model_name} of version {model_version} (thread {td.get_ident()})",
                            str(e),
                        )

                    # run prediction
                    self._models.get_model(model_name, model_version, tag)
                    prediction = self._client.predict(model_input, model_name, model_version)

            return prediction
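
raise WithBreak is used above to exit a with block early; a context manager can suppress that exception in __exit__. A minimal sketch of the pattern (this mirrors the idea, not necessarily the project's actual implementation):

    import threading

    class WithBreak(Exception):
        """Raised inside a `with` block to break out of it early."""

    class LockedResource:
        def __init__(self, lock: threading.Lock):
            self._lock = lock

        def __enter__(self):
            self._lock.acquire()
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self._lock.release()
            # returning True suppresses WithBreak, letting execution continue
            # after the `with` block; any other exception propagates
            return exc_type is not None and issubclass(exc_type, WithBreak)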
Example #7
    def _get_model(self, model_name: str, model_version: str) -> Any:
        """
        Checks if versioned model is on disk, then checks if model is in memory,
        and if not, it loads it into memory, and returns the model.

        Args:
            model_name: Name of the model, as it's specified in predictor:models:paths or in the other case as they are named on disk.
            model_version: Version of the model, as it's found on disk. Can also infer the version number from the "latest" tag.

        Exceptions:
            RuntimeError: if another thread tried to load the model at the very same time.

        Returns:
            The model as returned by self._load_model method.
            None if the model wasn't found or if it didn't pass the validation.
        """

        model = None
        tag = ""
        if model_version == "latest":
            tag = model_version

        if not self._caching_enabled:
            # determine model version
            if tag == "latest":
                model_version = self._get_latest_model_version_from_disk(model_name)
            model_id = model_name + "-" + model_version

            # grab shared access to versioned model
            resource = os.path.join(self._lock_dir, model_id + ".txt")
            with LockedFile(resource, "r", reader_lock=True) as f:

                # check model status
                file_status = f.read()
                if file_status == "" or file_status == "not-available":
                    raise WithBreak

                current_upstream_ts = int(file_status.split(" ")[1])
                update_model = False

                # grab shared access to models holder and retrieve model
                with LockedModel(self._models, "r", model_name, model_version):
                    status, local_ts = self._models.has_model(model_name, model_version)
                    if status == "not-available" or (
                        status == "in-memory" and local_ts != current_upstream_ts
                    ):
                        update_model = True
                        raise WithBreak
                    model, _ = self._models.get_model(model_name, model_version, tag)

                # load model into memory and retrieve it
                if update_model:
                    with LockedModel(self._models, "w", model_name, model_version):
                        # re-check the status now that the write lock is held
                        status, local_ts = self._models.has_model(model_name, model_version)
                        if status == "not-available" or (
                            status == "in-memory" and local_ts != current_upstream_ts
                        ):
                            if status == "not-available":
                                logger.info(
                                    f"loading model {model_name} of version {model_version} (thread {td.get_ident()})"
                                )
                            else:
                                logger.info(
                                    f"reloading model {model_name} of version {model_version} (thread {td.get_ident()})"
                                )
                            try:
                                self._models.load_model(
                                    model_name,
                                    model_version,
                                    current_upstream_ts,
                                    [tag],
                                )
                            except Exception as e:
                                raise UserRuntimeException(
                                    f"failed (re-)loading model {model_name} of version {model_version} (thread {td.get_ident()})",
                                    str(e),
                                )
                        model, _ = self._models.get_model(model_name, model_version, tag)

        if not self._multiple_processes and self._caching_enabled:
            # determine model version
            try:
                if tag == "latest":
                    model_version = self._get_latest_model_version_from_tree(
                        model_name, self._models_tree.model_info(model_name)
                    )
            except ValueError:
                # if model_name hasn't been found
                raise UserRuntimeException(
                    f"'{model_name}' model of tag latest wasn't found in the list of available models"
                )

            # grab shared access to model tree
            available_model = True
            with LockedModelsTree(self._models_tree, "r", model_name, model_version):

                # check if the versioned model exists
                model_id = model_name + "-" + model_version
                if model_id not in self._models_tree:
                    available_model = False
                    raise WithBreak

                # retrieve model tree's metadata
                upstream_model = self._models_tree[model_id]
                current_upstream_ts = int(upstream_model["timestamp"].timestamp())

            if not available_model:
                return None

            # grab shared access to models holder and retrieve model
            update_model = False
            with LockedModel(self._models, "r", model_name, model_version):
                status, local_ts = self._models.has_model(model_name, model_version)
                if status in ["not-available", "on-disk"] or (
                    status != "not-available" and local_ts != current_upstream_ts
                ):
                    update_model = True
                    raise WithBreak
                model, _ = self._models.get_model(model_name, model_version, tag)

            # download, load into memory the model and retrieve it
            if update_model:
                # grab exclusive access to models holder
                with LockedModel(self._models, "w", model_name, model_version):

                    # check model status
                    status, local_ts = self._models.has_model(model_name, model_version)

                    # refresh disk model
                    if status == "not-available" or (
                        status in ["on-disk", "in-memory"] and local_ts != current_upstream_ts
                    ):
                        if status == "not-available":
                            logger.info(
                                f"model {model_name} of version {model_version} not found locally; continuing with the download..."
                            )
                        elif status == "on-disk":
                            logger.info(
                                f"found newer model {model_name} of version {model_version} on the {upstream_model['provider']} upstream than the one on disk"
                            )
                        else:
                            logger.info(
                                f"found newer model {model_name} of version {model_version} on the {upstream_model['provider']} upstream than the one loaded into memory"
                            )

                        # remove model from disk and memory
                        if status == "on-disk":
                            logger.info(
                                f"removing model from disk for model {model_name} of version {model_version}"
                            )
                            self._models.remove_model(model_name, model_version)
                        if status == "in-memory":
                            logger.info(
                                f"removing model from disk and memory for model {model_name} of version {model_version}"
                            )
                            self._models.remove_model(model_name, model_version)

                        # download model
                        logger.info(
                            f"downloading model {model_name} of version {model_version} from the {upstream_model['provider']} upstream"
                        )
                        date = self._models.download_model(
                            upstream_model["provider"],
                            upstream_model["bucket"],
                            model_name,
                            model_version,
                            upstream_model["path"],
                        )
                        if not date:
                            raise WithBreak
                        current_upstream_ts = int(date.timestamp())

                    # load model
                    try:
                        logger.info(
                            f"loading model {model_name} of version {model_version} into memory"
                        )
                        self._models.load_model(
                            model_name,
                            model_version,
                            current_upstream_ts,
                            [tag],
                        )
                    except Exception as e:
                        raise UserRuntimeException(
                            f"failed (re-)loading model {model_name} of version {model_version} (thread {td.get_ident()})",
                            str(e),
                        )

                    # retrieve model
                    model, _ = self._models.get_model(model_name, model_version, tag)

        return model
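
The non-caching branch above reads a per-model lock file whose contents appear to be "<status> <timestamp>" (see the file_status.split(" ")[1] parse). An illustration of the assumed format (the "available" token is a guess; the code only checks for "" and "not-available"):

    # assumed contents of <lock_dir>/<model_id>.txt
    file_status = "available 1612312345"

    status = file_status.split(" ")[0]            # "available"
    upstream_ts = int(file_status.split(" ")[1])  # 1612312345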
Example #8
    def _signal_handler(self, sys_signal, _):
        log.info(f"handling signal {sys_signal}, exiting gracefully")
        self.__received_signal = True
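
Example #3 polls signal_handler.received_signal(); a minimal SignalHandler consistent with the fragment above might look like this (a sketch, not the project's actual class, assuming a module-level log as used above):

    import signal

    class SignalHandler:
        def __init__(self):
            self.__received_signal = False
            # register for the usual termination signals
            signal.signal(signal.SIGINT, self._signal_handler)
            signal.signal(signal.SIGTERM, self._signal_handler)

        def _signal_handler(self, sys_signal, _):
            log.info(f"handling signal {sys_signal}, exiting gracefully")
            self.__received_signal = True

        def received_signal(self) -> bool:
            return self.__received_signal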
Example #9
    def _extract_signatures(
        self, signature_def, signature_key, model_name: str, model_version: str
    ):
        logger.info(
            "signature defs found in model '{}' for version '{}': {}".format(
                model_name, model_version, signature_def
            )
        )

        available_keys = list(signature_def.keys())
        if len(available_keys) == 0:
            raise UserException(
                "unable to find signature defs in model '{}' of version '{}'".format(
                    model_name, model_version
                )
            )

        if signature_key is None:
            if len(available_keys) == 1:
                logger.info(
                    "signature_key was not configured by user, using signature key '{}' for model '{}' of version '{}' (found in the signature def map)".format(
                        available_keys[0],
                        model_name,
                        model_version,
                    )
                )
                signature_key = available_keys[0]
            elif "predict" in signature_def:
                logger.info(
                    "signature_key was not configured by user, using signature key 'predict' for model '{}' of version '{}' (found in the signature def map)".format(
                        model_name,
                        model_version,
                    )
                )
                signature_key = "predict"
            else:
                raise UserException(
                    "signature_key was not configured by user, please specify one of the following keys '{}' for model '{}' of version '{}' (found in the signature def map)".format(
                        ", ".join(available_keys), model_name, model_version
                    )
                )
        else:
            if signature_def.get(signature_key) is None:
                possibilities_str = "key: '{}'".format(available_keys[0])
                if len(available_keys) > 1:
                    possibilities_str = "keys: '{}'".format("', '".join(available_keys))

                raise UserException(
                    "signature_key '{}' was not found in signature def map for model '{}' of version '{}', but found the following {}".format(
                        signature_key, model_name, model_version, possibilities_str
                    )
                )

        signature_def_val = signature_def.get(signature_key)

        if signature_def_val.get("inputs") is None:
            raise UserException(
                "unable to find 'inputs' in signature def '{}' for model '{}'".format(
                    signature_key, model_name
                )
            )

        parsed_signatures = {}
        for input_name, input_metadata in signature_def_val["inputs"].items():
            if input_metadata["tensorShape"] == {}:
                # a scalar with rank 0 and empty shape
                shape = "scalar"
            elif input_metadata["tensorShape"].get("unknownRank", False):
                # unknown rank and shape
                #
                # unknownRank is set to True when the model input has no rank;
                # checking only for the key's presence could lead to undefined
                # behavior, so its value is tested as well
                shape = "unknown"
            elif input_metadata["tensorShape"].get("dim", None):
                # known rank and known/unknown shape
                shape = [int(dim["size"]) for dim in input_metadata["tensorShape"]["dim"]]
            else:
                raise UserException(
                    "invalid 'tensorShape' specification for input '{}' in signature key '{}' for model '{}'".format(
                        input_name, signature_key, model_name
                    )
                )

            parsed_signatures[input_name] = {
                "shape": shape if isinstance(shape, list) else [shape],
                "type": DTYPE_TO_TF_TYPE[input_metadata["dtype"]].name,
            }
        return signature_key, parsed_signatures
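
The tensorShape handling above follows the TFS model-metadata JSON layout. A worked example of an input and the parse it would produce (values are illustrative; the "float32" name assumes DTYPE_TO_TF_TYPE["DT_FLOAT"] maps to tf.float32):

    signature_def = {
        "predict": {
            "inputs": {
                "images": {
                    "dtype": "DT_FLOAT",
                    "tensorShape": {"dim": [{"size": "-1"}, {"size": "28"}, {"size": "28"}]},
                }
            }
        }
    }

    # _extract_signatures(signature_def, None, "mnist", "1") would default to
    # the single available key and return:
    # ("predict", {"images": {"shape": [-1, 28, 28], "type": "float32"}})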
Example #10
def model_downloader(
    predictor_type: PredictorType,
    bucket_provider: str,
    bucket_name: str,
    model_name: str,
    model_version: str,
    model_path: str,
    temp_dir: str,
    model_dir: str,
) -> Optional[datetime.datetime]:
    """
    Downloads model to disk. Validates the cloud model path and the downloaded model as well.

    Args:
        predictor_type: The predictor type as implemented by the API.
        bucket_provider: Provider for the bucket. Can be "s3" or "gs".
        bucket_name: Name of the bucket where the model is stored.
        model_name: Name of the model. Is part of the model's local path.
        model_version: Version of the model. Is part of the model's local path.
        model_path: Model prefix of the versioned model.
        temp_dir: Where to temporarily store the model for validation.
        model_dir: The top directory of where all models are stored locally.

    Returns:
        The model's timestamp. None if the model didn't pass validation, doesn't exist, or if permissions are insufficient.
    """

    logger.info(
        f"downloading from bucket {bucket_name}/{model_path}, model {model_name} of version {model_version}, temporarily to {temp_dir} and then finally to {model_dir}"
    )

    if bucket_provider == "s3":
        client = S3(bucket_name)
    elif bucket_provider == "gs":
        client = GCS(bucket_name)
    else:
        raise ValueError(f"unsupported bucket provider '{bucket_provider}'; expected 's3' or 'gs'")

    # validate upstream cloud model
    sub_paths, ts = client.search(model_path)
    try:
        validate_model_paths(sub_paths, predictor_type, model_path)
    except CortexException:
        logger.info(f"failed validating model {model_name} of version {model_version}")
        return None

    # download model to temp dir
    temp_dest = os.path.join(temp_dir, model_name, model_version)
    try:
        client.download_dir_contents(model_path, temp_dest)
    except CortexException:
        logger.info(
            f"failed downloading model {model_name} of version {model_version} to temp dir {temp_dest}"
        )
        shutil.rmtree(temp_dest)
        return None

    # validate model
    model_contents = glob.glob(os.path.join(temp_dest, "**"), recursive=True)
    model_contents = util.remove_non_empty_directory_paths(model_contents)
    try:
        validate_model_paths(model_contents, predictor_type, temp_dest)
    except CortexException:
        logger.info(
            f"failed validating model {model_name} of version {model_version} from temp dir"
        )
        shutil.rmtree(temp_dest)
        return None

    # move model to dest dir
    model_top_dir = os.path.join(model_dir, model_name)
    ondisk_model_version = os.path.join(model_top_dir, model_version)
    logger.info(
        f"moving model {model_name} of version {model_version} to final dir {ondisk_model_version}"
    )
    if os.path.isdir(ondisk_model_version):
        shutil.rmtree(ondisk_model_version)
    shutil.move(temp_dest, ondisk_model_version)

    return max(ts)
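
A hypothetical invocation (argument values, and the exact predictor_type value, are illustrative):

    ts = model_downloader(
        predictor_type=predictor_type,  # e.g. the API's TensorFlow predictor type
        bucket_provider="s3",
        bucket_name="my-models-bucket",
        model_name="iris",
        model_version="1",
        model_path="models/iris/1/",
        temp_dir="/tmp/cron",
        model_dir="/models",
    )
    if ts is None:
        # validation failed, the model doesn't exist, or permissions were insufficient
        ...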
Example #11
    def predict(self, payload):
        cortex_logger.info("received payload", extra={"payload": payload})
        return payload