def test_is_dataset_validation_finished(self):
    """Terminal states count as finished; pre-upload states raise."""
    not_yet_uploaded = ["NO_DATA", "UPLOADING"]
    still_running = ["VALIDATING"]
    terminal = [
        "VALIDATION_FAILED",
        "SUCCEEDED",
        "INVALID_DATA",
        "PROGRAM_ERROR",
    ]

    # A running validation is not finished yet.
    for state in still_running:
        dataset = self._make_dataset_response(state)
        assert not DataManagerClient.is_dataset_validation_finished(dataset)

    # Any terminal state means the validation has finished.
    for state in terminal:
        dataset = self._make_dataset_response(state)
        assert DataManagerClient.is_dataset_validation_finished(dataset)

    # Asking before the upload completed is an error.
    for state in not_yet_uploaded:
        dataset = self._make_dataset_response(state)
        with pytest.raises(DatasetInvalidStateException) as exc_info:
            DataManagerClient.is_dataset_validation_finished(dataset)
        expected = ("Cannot wait for Dataset '{}' in status '{}'!"
                    " Upload must finish first.").format(
                        dataset["id"], state)
        assert expected == str(exc_info.value)
def test_dataset(self, data_manager_client: DataManagerClient):
    """Exercises Dataset create/read/delete without uploading any data."""
    schema_response = data_manager_client.create_dataset_schema(
        dataset_schema=self.new_schema)
    schema_id = schema_response["id"]

    count_before = data_manager_client.read_dataset_collection()["count"]

    created = data_manager_client.create_dataset(
        dataset_schema_id=schema_id, dataset_name="my-dataset")
    # A freshly created dataset has no data yet.
    assert created["name"] == "my-dataset"
    assert created["datasetSchemaId"] == schema_id
    assert created["status"] == "NO_DATA"
    dataset_id = created["id"]

    count_after_create = data_manager_client.read_dataset_collection()[
        "count"]
    assert count_after_create == count_before + 1

    # Clean up dataset and its schema; the collection count must drop again.
    data_manager_client.delete_dataset_by_id(dataset_id)
    data_manager_client.delete_dataset_schema_by_id(schema_id)

    count_after_delete = data_manager_client.read_dataset_collection()[
        "count"]
    assert count_after_delete == count_after_create - 1
def test_is_dataset_validation_failed(self):
    """Only the three failure states count as a failed validation."""
    ok_states = ["SUCCEEDED", "NO_DATA", "UPLOADING", "VALIDATING"]
    bad_states = ["VALIDATION_FAILED", "INVALID_DATA", "PROGRAM_ERROR"]

    for state in bad_states:
        response = self._make_dataset_response(state)
        assert DataManagerClient.is_dataset_validation_failed(response)

    for state in ok_states:
        response = self._make_dataset_response(state)
        assert not DataManagerClient.is_dataset_validation_failed(response)
def test_dataset_upload(self, data_manager_client: DataManagerClient):
    """Uploads a small multilingual UTF-8 CSV; validation must succeed."""
    csv = """
manufacturer,description,category,subcategory
me,"simple è test, records",A,AA
me,"übrigens ein Beispiel, records",A,AA
me,"un po' di testo",A,AA
me,"какой-то текст",A,AA
me,"du texte",A,AA
me,"一些文字",A,AA
me,"कुछ पाठ",A,AA
me,"κάποιο κείμενο",A,AA
me,"кейбір мәтін",A,AA
me,"iu teksto",A,AA
"""
    payload = BytesIO(csv.strip().encode("utf-8"))

    schema_id = data_manager_client.create_dataset_schema(
        dataset_schema=self.new_schema)["id"]
    dataset_id = data_manager_client.create_dataset(
        dataset_schema_id=schema_id, dataset_name="my-dataset")["id"]

    upload_response = data_manager_client.upload_data_and_validate(
        dataset_id, payload)
    assert upload_response["status"] == "SUCCEEDED"

    # Clean up both resources created by this test.
    data_manager_client.delete_dataset_by_id(dataset_id)
    data_manager_client.delete_dataset_schema_by_id(schema_id)
def test_read_dataset_schema_collection(self):
    """The client GETs the schema collection endpoint and returns its JSON."""
    client = DataManagerClient.construct_from_jwt(self.dar_url, "abcd")
    mock_session = create_autospec(DARSession, instance=True)
    client.session = mock_session

    schema = {
        "createdAt": "2020-02-18T22:18:08.263202+00:00",
        "features": [
            {"label": "manufacturer", "type": "CATEGORY"},
            {"label": "description", "type": "TEXT"},
        ],
        "id": "f6bad1ae-b217-40bb-b7f4-b32a221add9f",
        "labels": [
            {"label": "category", "type": "CATEGORY"},
            {"label": "subcategory", "type": "CATEGORY"},
        ],
        "name": "test",
    }
    collection = {
        "count": 1,
        "datasetSchemas": [schema],
    }

    # The session returns an HTTP 200 whose body is the collection.
    mock_response = Mock(spec_set=["json", "status_code"])
    mock_response.json.return_value = collection
    mock_response.status_code = 200
    mock_session.get_from_endpoint.return_value = mock_response

    result = client.read_dataset_schema_collection()

    # Exactly one GET against the schema collection endpoint.
    assert mock_session.get_from_endpoint.call_args_list == [
        call("/data-manager/api/v3/datasetSchemas")
    ]
    assert result == collection
def test_dataset_schema(self, data_manager_client: DataManagerClient):
    """Creates a DatasetSchema, reads it back, deletes it, checks counts."""
    created = data_manager_client.create_dataset_schema(
        dataset_schema=self.new_schema)
    schema_id = created["id"]

    # Reading the schema back yields the creation response.
    read_back = data_manager_client.read_dataset_schema_by_id(schema_id)
    assert created == read_back

    count_with_schema = data_manager_client.read_dataset_schema_collection(
    )["count"]
    assert count_with_schema > 0

    data_manager_client.delete_dataset_schema_by_id(schema_id)

    count_without_schema = data_manager_client.read_dataset_schema_collection(
    )["count"]
    assert count_with_schema - count_without_schema == 1
def test_create(
    self,
    model_creator: ModelCreator,
    model_manager_client: ModelManagerClient,
    data_manager_client: DataManagerClient,
    inference_client: InferenceClient,
):
    """
    End-to-end test for ModelCreator.create: trains a model from a CSV,
    deploys it, runs inference, undeploys it, and finally deletes the
    model plus every resource created internally (Job, Dataset,
    DatasetSchema).

    :param model_creator: provided by pytest fixture, see conftest.py
    :param model_manager_client: provided by pytest fixture, see conftest.py
    :param data_manager_client: provided by pytest fixture, see conftest.py
    """
    # When running under pytest, logging will not be emitted to stdout
    # by pytest by default.
    ModelCreator.setup_basic_logging(debug=False)
    # Small multilingual CSV used as training data (UTF-8 on upload).
    csv = """
manufacturer,description,category,subcategory
me,"simple è test, records",A,AA
me,"übrigens ein Beispiel, records",A,AA
me,"un po' di testo",A,AA
me,"какой-то текст",A,AA
me,"du texte",A,AA
me,"一些文字",A,AA
me,"कुछ पाठ",A,AA
me,"κάποιο κείμενο",A,AA
me,"кейбір мәтін",A,AA
me,"iu teksto",A,AA
"""
    data_stream = BytesIO(csv.strip().encode("utf-8"))
    # Schema matching the CSV columns: two features, two label columns.
    new_schema = {
        "features": [
            {
                "label": "manufacturer",
                "type": "CATEGORY"
            },
            {
                "label": "description",
                "type": "TEXT"
            },
        ],
        "labels": [
            {
                "label": "category",
                "type": "CATEGORY"
            },
            {
                "label": "subcategory",
                "type": "CATEGORY"
            },
        ],
        "name": "test",
    }
    # Random suffix so concurrent/repeated runs do not collide.
    model_name = "dar-client-test-" + str(uuid.uuid4())
    # Before we start: model is not there
    with pytest.raises(DARHTTPException) as exc_info:
        model_manager_client.read_model_by_name(model_name)
    assert exc_info.value.status_code == 404
    # Create model
    resp = model_creator.create(
        model_template_id="d7810207-ca31-4d4d-9b5a-841a644fd81f",
        dataset_schema=new_schema,
        model_name=model_name,
        data_stream=data_stream,
    )
    assert resp["name"] == model_name
    assert "validationResult" in resp
    # Check if model is indeed there
    self._assert_model_exists(model_manager_client, model_name)
    # Attempt to deploy model
    deployment_resource = model_manager_client.deploy_and_wait(model_name)
    deployment_id = deployment_resource["id"]
    logger.info("Deployed model '%s' with deployment ID '%s'", model_name,
                deployment_id)
    self._assert_deployment_exists(deployment_id,
                                   model_manager_client)
    # Test inference
    self._assert_inference_works(inference_client, model_name)
    # Now delete deployment by model name (i.e. undeploy model)
    model_manager_client.ensure_model_is_undeployed(model_name)
    # Deployment should be gone
    self._assert_deployment_does_not_exist(model_manager_client,
                                           deployment_id)
    # Deploy model again, to exercise ensure_deployment_exists
    deployment_resource = model_manager_client.ensure_deployment_exists(
        model_name)
    deployment_id = deployment_resource["id"]
    model_manager_client.wait_for_deployment(deployment_id)
    self._assert_deployment_exists(deployment_id, model_manager_client)
    # Now delete deployment (i.e. undeploy model)
    model_manager_client.delete_deployment_by_id(deployment_id)
    # Deployment should be gone
    self._assert_deployment_does_not_exist(model_manager_client,
                                           deployment_id)
    # Delete Model
    model_manager_client.delete_model_by_name(model_name)
    # Model should now be gone
    self._assert_model_does_not_exist(model_manager_client, model_name)
    # Now check resources created internally by ModelCreator.create
    # and clean up!
    # Job
    # The Model resource does not have a jobId property, so we
    # have to look up the job ID via the job collection
    job_collection = model_manager_client.read_job_collection()
    job_id = None
    for job in job_collection["jobs"]:
        if job["modelName"] == model_name:
            job_id = job["id"]
            break
    assert job_id is not None
    self._assert_job_exists(model_manager_client, job_id)
    # Get dataset ID used in this job
    job_resource = model_manager_client.read_job_by_id(job_id)
    dataset_id = job_resource["datasetId"]
    # Clean up job
    model_manager_client.delete_job_by_id(job_id)
    self._assert_job_does_not_exist(model_manager_client, job_id)
    # Dataset
    self._assert_dataset_exists(data_manager_client, dataset_id)
    # Get DatasetSchema used in this Dataset
    dataset_resource = data_manager_client.read_dataset_by_id(dataset_id)
    dataset_schema_id = dataset_resource["datasetSchemaId"]
    # Clean up Dataset
    data_manager_client.delete_dataset_by_id(dataset_id)
    self._assert_dataset_does_not_exist(data_manager_client, dataset_id)
    # DatasetSchema
    self._assert_dataset_schema_exists(data_manager_client,
                                       dataset_schema_id)
    # Clean up DatasetSchema
    data_manager_client.delete_dataset_schema_by_id(dataset_schema_id)
    self._assert_dataset_schema_does_not_exist(data_manager_client,
                                               dataset_schema_id)
def __init__(self, url: str, source: CredentialsSource):
    """Build the wrapped DataManager and ModelManager clients for *url*."""
    client_kwargs = dict(url=url, credentials_source=source)
    self.data_manager_client = DataManagerClient(**client_kwargs)
    self.model_manager_client = ModelManagerClient(**client_kwargs)
class ModelCreator(BaseClient):
    """
    This class provides a high-level means of training a model from a CSV
    file.

    To construct an instance of this class, see the various *construct_*
    methods such as
    :meth:`~sap.aibus.dar.client.base_client.BaseClient.construct_from_credentials`
    in
    :class:`~sap.aibus.dar.client.base_client.BaseClient`.

    Internally, the class wraps and orchestrates :class:`DataManagerClient`
    and :class:`ModelManagerClient`.
    """

    def __init__(self, url: str, source: CredentialsSource):
        # Both underlying clients share the same service URL and credentials.
        self.data_manager_client = DataManagerClient(
            url=url, credentials_source=source)
        self.model_manager_client = ModelManagerClient(
            url=url, credentials_source=source)

    def create(
        self,
        data_stream: typing.BinaryIO,
        model_template_id: str,
        dataset_schema: dict,
        model_name: str,
    ) -> dict:
        """
        Trains a model from a CSV file.

        Internally, this method creates the required DatasetSchema and
        Dataset entities, uploads the data and starts the training job.
        The method will block until the training job finishes.

        Once this method returns, the model `model_name` can be deployed
        and used for inference.

        This method will raise an Exception if an error occurs. **No**
        clean up is performed: if for example a *TrainingJobFailed* or
        *TrainingJobTimeOut* exception occurs, the previously created
        Dataset and DatasetSchema will remain within the service and must
        be cleaned up manually.

        :param data_stream: binary stream containing a CSV file in UTF-8
            encoding
        :param model_template_id: the model template ID
        :param dataset_schema: dataset schema as dict
        :param model_name: name of the model to be trained
        :raises TrainingJobFailed: When training job has status FAILED
        :raises TrainingJobTimeOut: When training job takes too long
        :raises: DatasetValidationTimeout: if validation takes too long
        :raises: DatasetValidationFailed: if validation does not finish in
            state *SUCCEEDED*
        :return: the final Model resource as returned by the service
        """
        self.log.info("Creating DatasetSchema.")
        response_dataset_schema = self.data_manager_client.create_dataset_schema(
            dataset_schema)
        dataset_schema_id = response_dataset_schema["id"]
        self.log.info("Created dataset schema with id '%s'",
                      dataset_schema_id)

        # Dataset name is derived from the model name plus a random suffix
        # so repeated trainings of the same model do not collide.
        dataset_name = self.format_dataset_name(model_name)
        self.log.info("Creating Dataset with name '%s'", dataset_name)
        response_dataset = self.data_manager_client.create_dataset(
            dataset_name=dataset_name, dataset_schema_id=dataset_schema_id)
        dataset_id = response_dataset["id"]
        self.log.info("Created Dataset with id '%s'", dataset_id)

        # Blocks until the service has validated the uploaded data.
        self.log.info("Uploading data to Dataset '%s'", dataset_id)
        self.data_manager_client.upload_data_and_validate(
            dataset_id=dataset_id, data_stream=data_stream)
        self.log.info(
            "Data uploaded and validated successfully for dataset '%s'",
            dataset_id)

        # Blocks until the training job reaches a terminal state.
        self.log.info("Starting training job.")
        response_job_creation = self.model_manager_client.create_job_and_wait(
            model_name=model_name,
            dataset_id=dataset_id,
            model_template_id=model_template_id,
        )
        self.log.info("Training finished successfully. Job ID: '%s'",
                      response_job_creation["id"])

        model = self.model_manager_client.read_model_by_name(
            model_name=model_name)
        self.log.debug("Final model resource: '%s'", model)
        return model

    @staticmethod
    def format_dataset_name(model_name: str) -> str:
        """
        Derives a Dataset name from a Model name.

        For the purpose of automation, we automatically create a Dataset
        name from a Model name by appending a random UUID suffix.

        Return value has no more than 255 characters.

        :param model_name: Model name
        :return: suitable Dataset name
        """
        random_string = "-" + str(uuid.uuid4())
        # Keep as much of the model name as fits next to the suffix.
        # BUGFIX: the previous budget also subtracted len(model_name)
        # itself, which truncated mid-length names far too early and
        # produced erratic lengths (via negative slice stops) for very
        # long names.
        prefix_budget = 255 - len(random_string)
        return model_name[0:prefix_budget] + random_string
def data_manager_client(dar_url, credentials_source):
    """Pytest fixture: a DataManagerClient for the configured service URL."""
    return DataManagerClient(dar_url, credentials_source)