示例#1
0
def save(model_in: MLModelIn) -> MLModel:
    """Register a model: store its weight through `_fs` and its metadata in the model collection.

    Args:
        model_in (MLModelIn): model object to be registered.

    Return:
        MLModel: Saved ML model object. Its `weight` is the ID returned when the weight was stored, and
            its `id` is the ID of the inserted document.

    Raises:
        ServiceException: If a model already exists with the same primary keys (architecture, framework,
            engine, version, task and dataset).
    """
    primary_keys = {'architecture', 'framework', 'engine', 'version', 'task', 'dataset'}

    # `limit=1` lets the count stop at the first match; a non-zero count means a duplicate exists.
    if _collection.count_documents(
            filter=model_in.dict(use_enum_values=True, include=primary_keys),
            limit=1
    ):
        raise ServiceException(
            f'Model business object with primary keys architecture={model_in.architecture}, '
            f'framework={model_in.framework}, engine={model_in.engine}, version={model_in.version},'
            f'task={model_in.task}, and dataset={model_in.dataset}  has exists.'
        )

    # TODO: update weight ID in the MLModelIn
    weight_id = _fs.put(bytes(model_in.weight), filename=model_in.weight.filename)
    try:
        model = MLModel(**model_in.dict(exclude={'weight'}), weight=weight_id)
        model.id = _collection.insert_one(
            model.dict(exclude_none=True, by_alias=True, use_enum_values=True)
        ).inserted_id
    except Exception:
        # The weight has already been stored; remove it so a failed registration does not leave an
        # orphaned file behind, then let the original error propagate.
        # NOTE(review): assumes `_fs` is a `gridfs.GridFS` instance exposing `delete(file_id)` -- confirm.
        _fs.delete(weight_id)
        raise
    return model
示例#2
0
def get_by_id(id: str) -> MLModel:
    """Fetch a single model document by its ID and parse it into an `MLModel`.

    Args:
        id (str): ID of the model; it is converted to an `ObjectId` for the `_id` lookup.

    Return:
        MLModel: The model parsed from the stored document.

    Raises:
        ServiceException: If no document with the given ID is found.
    """
    document = _collection.find_one(filter={'_id': ObjectId(id)})
    if document is None:
        raise ServiceException(f'Model with id={id} does not exist.')
    return MLModel.parse_obj(document)
示例#3
0
def save(training_job_in: TrainingJobIn) -> str:
    """Persist a training job after verifying that the model it refers to exists.

    Args:
        training_job_in (TrainingJobIn): Training job to be saved; its `model` field holds the ID of
            the referenced model.

    Return:
        The `inserted_id` reported by the insert operation.
        NOTE(review): the annotation says `str`, but the value is returned as-is from the insert
        result -- confirm the actual type callers receive.

    Raises:
        ServiceException: If the referenced model does not exist.
    """
    referenced_model = training_job_in.model
    model_found = ModelDAO.exists_by_id(ObjectId(referenced_model))
    if not model_found:
        raise ServiceException(f'Model with ID {referenced_model} not exist.')

    # Round-trip through `TrainingJob` before storing, dropping fields that are None both times.
    job_fields = training_job_in.dict(exclude_none=True)
    document = TrainingJob(**job_fields).dict(exclude_none=True)
    insert_result = _collection.insert_one(document)
    return insert_result.inserted_id
示例#4
0
    def diagnose(self,
                 server_name: str,
                 batch_size: int = None,
                 device='cuda',
                 timeout=30) -> DynamicProfileResult:
        """Start diagnosing and profiling model.

        Waits until the inspector reports the model as served (polling with a randomized binary
        exponential backoff), then runs the model through the inspector and packs the measurements
        into a `DynamicProfileResult`.

        Args:
            server_name (str): to assign a name for the container you are creating for model profile
            batch_size (int): Batch size. When None, the inspector's batch size is left unchanged.
            device (str): Device name.
            timeout (float): Waiting for docker container timeout in second. Default timeout period is 30s.

        Returns:
            DynamicProfileResult: Profiling result built from the dictionary returned by
                `self.inspector.run_model`.

        Raises:
            ServiceException: If the model status check does not succeed within `timeout` seconds.
        """
        # Check server status.
        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout
        retry_time = 0  # use binary exponential backoff algorithm
        # Always probe at least once, even when `timeout` is zero or negative.
        model_status = bool(self.inspector.check_model_status())
        while not model_status:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            retry_time += 1
            # get backoff time in s: a random number of milliseconds in [0, 2**retry_time - 1]
            backoff_time = random.randint(0, 2**retry_time - 1) * 1e-3
            # Never sleep past the deadline; the uncapped backoff grows exponentially and could
            # otherwise overshoot `timeout` by a large margin.
            time.sleep(min(backoff_time, remaining))
            model_status = bool(self.inspector.check_model_status())

        if not model_status:  # raise an error as model is not served.
            raise ServiceException('Model not served!')

        if batch_size is not None:
            self.inspector.set_batch_size(batch_size)

        result = self.inspector.run_model(server_name=server_name,
                                          device=device)

        dpr = DynamicProfileResult(
            device_id=result['device_id'],
            device_name=result['device_name'],
            batch=result['batch_size'],
            memory=ProfileMemory(
                total_memory=result['total_gpu_memory'],
                memory_usage=result['gpu_memory_used'],
                utilization=result['gpu_utilization'],
            ),
            latency=ProfileLatency(inference_latency=result['latency'], ),
            throughput=ProfileThroughput(
                inference_throughput=result['total_throughput']),
            ip=get_ip(),
            create_time=result['completed_time'],
        )
        return dpr
示例#5
0
def update(training_job: TrainingJobUpdate) -> int:
    """Apply the explicitly-set fields of `training_job` to the stored training job with the same ID.

    Args:
        training_job (TrainingJobUpdate): Update payload; `id` selects the stored job, and `model`,
            when truthy, must refer to an existing model.

    Return:
        int: The `modified_count` reported by the update operation.

    Raises:
        ValueError: If no training job with the given ID is stored.
        ServiceException: If `training_job.model` is set but the model does not exist.
    """
    job_filter = {'_id': ObjectId(training_job.id)}

    # exists by ID
    matched = _collection.count_documents(filter=job_filter, limit=1)
    if not matched:
        raise ValueError(f'id {training_job.id} not found.')

    # check model ID
    referenced_model = training_job.model
    if referenced_model and not ModelDAO.exists_by_id(ObjectId(referenced_model)):
        raise ServiceException(
            f'Model with ID {referenced_model} not exist.')

    # save update: only the fields that were explicitly set on the payload are written
    changes = training_job.dict(exclude_unset=True)
    outcome = _collection.update_one(filter=job_filter, update={'$set': changes})
    return outcome.modified_count