def save(model_in: MLModelIn) -> MLModel:
    """Register a model: store its weight file, then insert its metadata document.

    The model is rejected when a document with the same combination of
    `architecture`, `framework`, `engine`, `version`, `task` and `dataset`
    already exists in the collection. No check on `model_in.id` is performed
    here.

    Args:
        model_in (MLModelIn): model object to be registered. Its `weight`
            must be convertible with `bytes(...)` and expose a `filename`.

    Return:
        MLModel: Saved ML model object, with `weight` set to the ID of the stored
            weight file and `id` set to the inserted document ID.

    Raises:
        ServiceException: If a model already exists with the same primary keys
            (architecture, framework, engine, version, task and dataset).
    """
    duplicate_filter = model_in.dict(
        use_enum_values=True,
        include={'architecture', 'framework', 'engine', 'version', 'task', 'dataset'}
    )
    if _collection.count_documents(filter=duplicate_filter, limit=1):
        raise ServiceException(
            f'Model business object with primary keys architecture={model_in.architecture}, '
            f'framework={model_in.framework}, engine={model_in.engine}, version={model_in.version},'
            f'task={model_in.task}, and dataset={model_in.dataset} has exists.'
        )

    # TODO: update weight ID in the MLModelIn
    weight_id = _fs.put(bytes(model_in.weight), filename=model_in.weight.filename)
    try:
        model = MLModel(**model_in.dict(exclude={'weight'}), weight=weight_id)
        model.id = _collection.insert_one(
            model.dict(exclude_none=True, by_alias=True, use_enum_values=True)
        ).inserted_id
    except Exception:
        # The weight file is already stored at this point; remove it so a failed
        # registration does not leave an orphaned file that no document refers to.
        # NOTE(review): assumes `_fs` is a `gridfs.GridFS` instance (it is used
        # via `put(..., filename=...)` above) -- confirm.
        _fs.delete(weight_id)
        raise
    return model
def get_by_id(id: str) -> MLModel:
    """Look up a single model document by its ID and parse it into an MLModel.

    Args:
        id (str): ID of the model; it is converted with `ObjectId(id)` and
            matched against the `_id` field.

    Return:
        MLModel: The model parsed from the matching document.

    Raises:
        ServiceException: If no document with the given ID is found.
    """
    document = _collection.find_one(filter={'_id': ObjectId(id)})
    if document is None:
        raise ServiceException(f'Model with id={id} does not exist.')
    return MLModel.parse_obj(document)
def save(training_job_in: TrainingJobIn) -> str:
    """Insert a new training job after verifying that its model exists.

    Args:
        training_job_in (TrainingJobIn): training job to store; its `model`
            attribute is the ID of the model the job refers to.

    Return:
        The `inserted_id` reported by the insert operation.
        NOTE(review): the value is returned as-is, without conversion to `str`
        -- confirm that it matches the annotated return type.

    Raises:
        ServiceException: If `ModelDAO.exists_by_id` reports that the referenced
            model does not exist.
    """
    model_id = training_job_in.model
    model_found = ModelDAO.exists_by_id(ObjectId(model_id))
    if not model_found:
        raise ServiceException(f'Model with ID {model_id} not exist.')

    # Round-trip through TrainingJob, dropping None-valued fields both ways.
    job_fields = training_job_in.dict(exclude_none=True)
    training_job = TrainingJob(**job_fields)
    document = training_job.dict(exclude_none=True)

    insert_result = _collection.insert_one(document)
    return insert_result.inserted_id
def diagnose(self, server_name: str, batch_size: int = None, device='cuda', timeout=30) -> DynamicProfileResult:
    """Start diagnosing and profiling model.

    First polls `self.inspector.check_model_status()` with a randomized binary
    exponential backoff until it reports success or `timeout` seconds have
    elapsed, then runs the model through the inspector and wraps the raw
    result into a `DynamicProfileResult`.

    Args:
        server_name (str): to assign a name for the container you are creating for model profile
        batch_size (int): Batch size. When `None`, the inspector's batch size is left unchanged.
        device (str): Device name.
        timeout (float): Waiting for docker container timeout in second. Default timeout period is 30s.
            The status is always polled at least once, and once more when the
            deadline is reached.

    Returns:
        DynamicProfileResult: Profiling result built from the inspector's result
            (device, batch size, memory, latency and throughput fields), the
            value of `get_ip()` and the run's completed time.

    Raises:
        ServiceException: If the model status check did not succeed within `timeout`.
    """
    # Check server status.
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    model_status = False
    retry_time = 0
    # use binary exponential backoff algorithm
    while True:
        if self.inspector.check_model_status():
            model_status = True
            break

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        retry_time += 1
        # get backoff time in s
        backoff_time = random.randint(0, 2**retry_time - 1) * 1e-3
        # The backoff window doubles on every retry, so an unclamped sleep could
        # overshoot the deadline by far more than `timeout` itself.
        time.sleep(min(backoff_time, remaining))

    if not model_status:
        # raise an error as model is not served.
        raise ServiceException('Model not served!')

    if batch_size is not None:
        self.inspector.set_batch_size(batch_size)

    result = self.inspector.run_model(server_name=server_name, device=device)

    dpr = DynamicProfileResult(
        device_id=result['device_id'],
        device_name=result['device_name'],
        batch=result['batch_size'],
        memory=ProfileMemory(
            total_memory=result['total_gpu_memory'],
            memory_usage=result['gpu_memory_used'],
            utilization=result['gpu_utilization'],
        ),
        latency=ProfileLatency(inference_latency=result['latency'], ),
        throughput=ProfileThroughput(
            inference_throughput=result['total_throughput']),
        ip=get_ip(),
        create_time=result['completed_time'],
    )
    return dpr
def update(training_job: TrainingJobUpdate) -> int:
    """Apply the explicitly-set fields of a training job update to its stored document.

    Args:
        training_job (TrainingJobUpdate): update payload; `id` selects the
            document to update, and `model`, when truthy, must refer to an
            existing model.

    Return:
        int: The `modified_count` reported by the update operation.

    Raises:
        ValueError: If no document with the given training job ID exists.
        ServiceException: If `training_job.model` is set but
            `ModelDAO.exists_by_id` reports that the model does not exist.
    """
    job_filter = {'_id': ObjectId(training_job.id)}

    # The target document must already exist.
    if not _collection.count_documents(filter=job_filter, limit=1):
        raise ValueError(f'id {training_job.id} not found.')

    # A referenced model, if any, must exist as well.
    if training_job.model:
        if not ModelDAO.exists_by_id(ObjectId(training_job.model)):
            raise ServiceException(
                f'Model with ID {training_job.model} not exist.')

    # Only fields that were explicitly set on the payload are written.
    changes = training_job.dict(exclude_unset=True)
    outcome = _collection.update_one(filter=job_filter, update={'$set': changes})
    return outcome.modified_count