def test_request_absolute_endpoint():
    """An absolute endpoint path must still be routed to the client's base host."""
    endpoint = "/api/service/health"
    mocked_url = f"http://localhost:9100{endpoint}"
    responses.add(responses.GET, mocked_url, json={})
    tamr = Client(UsernamePasswordAuth("username", "password"))
    # If client does not properly handle absolute paths, client.get() will
    # raise a ConnectionRefused exception.
    tamr.get(endpoint)
def test_dataset_profile(self):
    """Each Profile property should surface the matching field of the mocked JSON."""
    client = Client(UsernamePasswordAuth("username", "password"))
    dataset_id = "3"
    dataset_url = (
        f"{client.protocol}://{client.host}:{client.port}"
        f"/api/versioned/v1/datasets/{dataset_id}"
    )
    responses.add(responses.GET, dataset_url, json={})
    responses.add(responses.GET, f"{dataset_url}/profile", json=self.profile_stale)

    profile = client.datasets.by_resource_id(dataset_id).profile()

    expected = self.profile_stale
    self.assertEqual(expected["datasetName"], profile.dataset_name)
    self.assertEqual(expected["relativeDatasetId"], profile.relative_dataset_id)
    self.assertEqual(expected["isUpToDate"], profile.is_up_to_date)
    self.assertEqual(expected["profiledDataVersion"], profile.profiled_data_version)
    self.assertEqual(expected["profiledAt"], profile.profiled_at)
    self.assertEqual(expected["simpleMetrics"], profile.simple_metrics)
    self.assertEqual(expected["attributeProfiles"], profile.attribute_profiles)
def initiate_backup(
    client: Client,
    *,
    poll_interval_seconds: int = 30,
    polling_timeout_seconds: Optional[int] = None,
    connection_retry_timeout_seconds: int = 600,
) -> requests.Response:
    """Kick off a backup of the Tamr instance and block until it finishes.

    Args:
        client: A Tamr client object
        poll_interval_seconds: Amount of time in between polls of job state.
        polling_timeout_seconds: Amount of time before a timeout error is thrown.
        connection_retry_timeout_seconds: Amount of time before timeout error is
            thrown during connection retry.

    Returns:
        The response from the final poll of the backup endpoint.

    Raises:
        RuntimeError: If the POST request initiating the backup fails.
    """
    response = client.post("backups")
    if not response.ok:
        message = f"Received non-200 response code '{response.status_code}': {response.json()}"
        LOGGER.error(message)
        raise RuntimeError(message)

    backup_id = response.json()["relativeId"]
    return utils.client.poll_endpoint(
        client=client,
        api_endpoint=f"backups/{backup_id}",
        poll_interval_seconds=poll_interval_seconds,
        polling_timeout_seconds=polling_timeout_seconds,
        connection_retry_timeout_seconds=connection_retry_timeout_seconds,
    )
def test_binning_model_records():
    """Binning-model records stream back as newline-delimited JSON dicts."""
    expected_records = [
        {
            "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bb8"],
            "isActive": ["true"],
            "clauseId": ["2e6c5f1b-ed49-40ab-8cbb-350aded25070"],
            "similarityFunction": ["COSINE"],
            "tokenizer": ["DEFAULT"],
            "fieldName": ["surname"],
            "threshold": ["0.75"],
        }
    ]
    records_url = (
        "http://localhost:9100/api/versioned/v1/projects/1/binningModel/records"
    )
    responses.add(responses.GET, project_url, json=project_config)
    responses.add(
        responses.GET,
        records_url,
        body="\n".join(json.dumps(record) for record in expected_records),
    )
    tamr = Client(UsernamePasswordAuth("username", "password"))
    mastering = tamr.projects.by_resource_id("1").as_mastering()
    assert list(mastering.binning_model().records()) == expected_records
def get_backup_by_id(client: Client, backup_id: str) -> JsonDict:
    """Fetches the json object for a given backup ID.

    Args:
        client: A Tamr client object.
        backup_id: The relativeID corresponding to the desired backup.

    Returns:
        Json dict corresponding to the desired backup.

    Raises:
        ValueError: Raised if GET request to Tamr fails
    """
    response = client.get(f"backups/{backup_id}")
    if not response.ok:
        message = (
            f"Received non-200 response code '{response.status_code}' "
            f"with message '{response.json()['message']}': '{response.json()}'"
        )
        LOGGER.error(message)
        raise ValueError(message)
    return response.json()
def client():
    """Fixture: a Tamr client built from dummy username/password credentials."""
    from tamr_unify_client import Client
    from tamr_unify_client.auth import UsernamePasswordAuth

    return Client(UsernamePasswordAuth("username", "password"))
def test_continuous_mastering():
    """Run every stage of a mastering project end to end; each operation must succeed."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    project = tamr.projects.by_resource_id("1").as_mastering()

    assert project.unified_dataset().refresh(poll_interval_seconds=0).succeeded()
    assert project.pairs().refresh(poll_interval_seconds=0).succeeded()

    model = project.pair_matching_model()
    assert model.train(poll_interval_seconds=0).succeeded()
    assert model.predict(poll_interval_seconds=0).succeeded()

    assert project.record_clusters().refresh(poll_interval_seconds=0).succeeded()
    assert project.published_clusters().refresh(poll_interval_seconds=0).succeeded()
def health_check(client: Client) -> bool:
    """Query the health check API and check if each service is healthy (returns True)

    Args:
        client: the tamr client

    Returns:
        True if all services are healthy, False if unhealthy
    """
    try:
        response = client.get(endpoint="/api/service/health")
        # Every service entry must report healthy for the overall check to pass.
        healthy_status = all(value["healthy"] for value in response.json().values())
        if healthy_status:
            LOGGER.info(f"Client is healthy: {dumps(response.json(), indent=2)}")
        else:
            LOGGER.error(f"Client is unhealthy: {dumps(response.json(), indent=2)}")
        return healthy_status
    except requests.exceptions.ConnectionError as e:
        LOGGER.error(f"Could not connect to {client.host}. Error: {e}")
        return False
def get_with_connection_retry(
    client: Client,
    api_endpoint: str,
    *,
    timeout_seconds: Optional[int] = 600,
    sleep_seconds: int = 20,
) -> requests.Response:
    """Will handle exceptions when attempting to connect to the Tamr API.

    This is used to handle connection issues when Tamr restarts due to a restore.

    Args:
        client: A Tamr client object
        api_endpoint: Tamr API endpoint
        timeout_seconds: Amount of time before a timeout error is thrown.
            Default is 600 seconds; pass None to retry indefinitely.
        sleep_seconds: Amount of time in between attempts to connect to Tamr.

    Returns:
        A response object from API request.

    Raises:
        TimeoutError: If no connection is established within timeout_seconds.
    """
    started = now()
    # A None timeout means "retry forever"; the annotation reflects that the
    # body explicitly supports it.
    while timeout_seconds is None or now() - started < timeout_seconds:
        try:
            response = client.get(api_endpoint)
            return response
        except ConnectionError as e:
            # If we got for example a connection refused exception, try again later
            LOGGER.warning(f"Caught exception in connect {e}")
            sleep(sleep_seconds)
    raise TimeoutError(
        f"Took longer than {timeout_seconds} seconds to connect to tamr."
    )
def test_client_repr():
    """repr() of a Client shows connection details but never the password."""
    auth = UsernamePasswordAuth("username", "password")

    rendered = repr(Client(auth))
    assert rendered.startswith("tamr_unify_client.client.Client(")
    assert "http" in rendered
    assert rendered.endswith(")")
    assert "password" not in rendered

    rendered = repr(Client(auth, protocol="http", port=1234, base_path="foo/bar"))
    assert "'http'" in rendered
    assert "1234" in rendered
    assert "foo/bar" in rendered
def test_project_by_name(self):
    """by_name should return the project whose JSON matches the mocked listing."""
    responses.add(responses.GET, self.project_list_url, json=self.project_json)
    tamr = Client(UsernamePasswordAuth("username", "password"))
    found = tamr.projects.by_name(self.project_name)
    assert found._data == self.project_json[0]
def initiate_restore(
    client: Client,
    backup_id: str,
    *,
    polling_timeout_seconds: Optional[int] = None,
    poll_interval_seconds: int = 30,
    connection_retry_timeout_seconds: int = 600,
) -> requests.Response:
    """Restores the Tamr client to the state of the supplied backup.

    Args:
        client: A Tamr client object
        backup_id: BackupId of the desired backup.
        polling_timeout_seconds: Amount of time before a timeout error is thrown.
        poll_interval_seconds: Amount of time in between polls of job state.
        connection_retry_timeout_seconds: Amount of time before timeout error is
            thrown during connection retry.

    Returns:
        The response from the final poll of the restore endpoint.

    Raises:
        ValueError: Raised if the target backup contains errors
        RuntimeError: Raised if the restore fails to start
    """
    backup = get_backup_by_id(client=client, backup_id=backup_id)

    backup_state = backup["state"]
    if backup_state != "SUCCEEDED":
        value_error_message1 = (
            f"Backup file with ID {backup_id} did not succeed and has status {backup_state}"
        )
        # Log before raising, consistent with the other failure paths below.
        LOGGER.error(value_error_message1)
        raise ValueError(value_error_message1)

    error_message = backup["errorMessage"]
    if error_message != "":
        value_error_message2 = (
            f"Backup file with ID {backup_id} contains non-null error message {error_message}"
        )
        LOGGER.error(value_error_message2)
        raise ValueError(value_error_message2)

    response = client.post("instance/restore", data=backup_id)
    if not response.ok:
        runtime_error_message = (
            f"Received non-200 response code '{response.status_code}' : {response.json()}"
        )
        LOGGER.error(runtime_error_message)
        raise RuntimeError(runtime_error_message)

    return utils.client.poll_endpoint(
        client=client,
        api_endpoint="instance/restore",
        poll_interval_seconds=poll_interval_seconds,
        polling_timeout_seconds=polling_timeout_seconds,
        connection_retry_timeout_seconds=connection_retry_timeout_seconds,
    )
def test_dataset_collection_repr():
    """repr() of the dataset collection names its class and API path."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    expected_class = "tamr_unify_client.models.dataset.collection.DatasetCollection"
    rendered = repr(tamr.datasets)
    assert rendered.startswith(f"{expected_class}(")
    assert "api_path='datasets'" in rendered
    assert rendered.endswith(")")
def test_http_error():
    """Ensure that the client surfaces HTTP errors as exceptions."""
    endpoint = "http://localhost:9100/api/versioned/v1/projects/1"
    responses.add(responses.GET, endpoint, status=401)
    tamr = Client(UsernamePasswordAuth("nonexistent-username", "invalid-password"))
    with raises(HTTPError) as e:
        tamr.projects.by_resource_id("1")
    assert f"401 Client Error: Unauthorized for url: {endpoint}" in str(e)
def list_backups(client: Client) -> Generator[JsonDict, None, None]:
    """Lists all backups available to Tamr client. Will list both succeeded and
    failed backups.

    Args:
        client: A client object

    Returns:
        A generator of json dict objects for the backups available to client.
    """
    # Delegate iteration over the response payload directly to the caller.
    yield from client.get("backups").json()
def test_client_repr():
    """Client repr must expose class name and connection info, never credentials."""
    auth = UsernamePasswordAuth("username", "password")
    tamr = Client(auth)
    expected_class = "tamr_unify_client.client.Client"
    text = repr(tamr)
    assert text.startswith(f"{expected_class}(")
    assert "http" in text
    assert "password" not in text
    assert text.endswith(")")

    # further testing when Client has optional arguments
    tamr = Client(auth, protocol="http", port=1234, base_path="foo/bar")
    text = repr(tamr)
    assert "'http'" in text
    assert "1234" in text
    assert "foo/bar" in text
def test_dataset_status():
    """dataset.status() returns the status resource mocked at /status."""
    dataset_id = "1"
    dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/{dataset_id}"
    responses.add(responses.GET, dataset_url, json={})
    responses.add(responses.GET, f"{dataset_url}/status", json=status_json)
    tamr = Client(UsernamePasswordAuth("username", "password"))
    status = tamr.datasets.by_resource_id(dataset_id).status()
    assert status._data == status_json
def from_resource_id(tamr: Client, *, job_id: Union[int, str]) -> Operation:
    """Create an operation from a job id

    Args:
        tamr: A Tamr client
        job_id: A job ID

    Returns:
        A Tamr operation
    """
    response = tamr.get(f"/api/versioned/v1/operations/{job_id}")
    return Operation.from_response(tamr, response)
def test_request_session_cookie():
    """Cookies set on a user-supplied session must ride along on client requests."""
    mocked_url = "http://localhost:9100/api/versioned/v1/test"
    responses.add(responses.GET, mocked_url, json={})

    session = requests.Session()
    session.cookies.set_cookie(
        requests.cookies.create_cookie(name="test_cookie", value="the-cookie-works")
    )

    client = Client(UsernamePasswordAuth("username", "password"), session=session)
    assert client.session is session

    client.get("test")

    assert len(responses.calls) == 1
    request = responses.calls[0].request
    assert request.url.endswith("test")
    cookie_header = request.headers.get("Cookie")
    assert cookie_header is not None
    assert "test_cookie=" in cookie_header
def current(client: Client) -> str:
    """Gets the version of Tamr for provided client

    Args:
        client: Tamr client

    Returns:
        String representation of Tamr version
    """
    response = client.get("/api/versioned/service/version").successful()
    return json.loads(response.content)["version"]
def test_continuous_mastering():
    """Exercise the full mastering pipeline, then verify estimated pair counts."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    project = tamr.projects.by_resource_id("1").as_mastering()

    assert project.unified_dataset().refresh(poll_interval_seconds=0).succeeded()
    assert project.pairs().refresh(poll_interval_seconds=0).succeeded()

    model = project.pair_matching_model()
    assert model.train(poll_interval_seconds=0).succeeded()
    assert model.predict(poll_interval_seconds=0).succeeded()

    assert project.record_clusters().refresh(poll_interval_seconds=0).succeeded()
    assert project.published_clusters().refresh(poll_interval_seconds=0).succeeded()

    estimate_url = (
        "http://localhost:9100/api/versioned/v1/projects/1/estimatedPairCounts"
    )
    estimate_json = {
        "isUpToDate": "true",
        "totalEstimate": {"candidatePairCount": "200", "generatedPairCount": "100"},
        "clauseEstimates": {
            "clause1": {"candidatePairCount": "50", "generatedPairCount": "25"},
            "clause2": {"candidatePairCount": "50", "generatedPairCount": "25"},
            "clause3": {"candidatePairCount": "100", "generatedPairCount": "50"},
        },
    }
    responses.add(responses.GET, estimate_url, json=estimate_json)

    assert project.estimate_pairs().is_up_to_date
    assert project.estimate_pairs().total_estimate["candidatePairCount"] == "200"
    assert (
        project.estimate_pairs().clause_estimates["clause1"]["generatedPairCount"]
        == "25"
    )
def test_dataset_records():
    """records() streams newline-delimited JSON rows back as dicts."""
    dataset_id = "1"
    dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/{dataset_id}"
    responses.add(responses.GET, dataset_url, json={})
    responses.add(
        responses.GET,
        f"{dataset_url}/records",
        body='{"attribute1": 1}\n{"attribute1": 2}',
    )
    tamr = Client(UsernamePasswordAuth("username", "password"))
    dataset = tamr.datasets.by_resource_id(dataset_id)
    assert list(dataset.records()) == [{"attribute1": 1}, {"attribute1": 2}]
def test_dataset_profile():
    """dataset.profile() fetches and wraps the mocked profile resource."""
    dataset_id = "3"
    dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/{dataset_id}"
    responses.add(responses.GET, dataset_url, json={})
    responses.add(responses.GET, f"{dataset_url}/profile", json=profile_json1)
    responses.add(
        responses.POST, f"{dataset_url}/profile:refresh", json=[], status=204
    )
    tamr = Client(UsernamePasswordAuth("username", "password"))
    profile = tamr.datasets.by_resource_id(dataset_id).profile()
    assert profile._data == profile_json1
def test_project_add_source_dataset():
    """A newly added source dataset appears in the project's inputDatasets listing."""
    responses.add(responses.GET, datasets_url, json=dataset_json)
    responses.add(responses.GET, projects_url, json=project_json)
    responses.add(
        responses.POST, input_datasets_url, json=post_input_datasets_json, status=204
    )
    responses.add(responses.GET, input_datasets_url, json=get_input_datasets_json)

    tamr = Client(UsernamePasswordAuth("username", "password"))
    dataset = tamr.datasets.by_external_id(dataset_external_id)
    project = tamr.projects.by_external_id(project_external_id)
    project.add_source_dataset(dataset)

    listing_path = project.api_path + "/inputDatasets"
    assert project.client.get(listing_path).successful().json() == dataset_json
def test_record_clusters_with_data():
    """record_clusters_with_data() resolves the derived clusters-with-data dataset."""
    project_config = {
        "name": "Project 1",
        "description": "Mastering Project",
        "type": "DEDUP",
        "unifiedDatasetName": "Project 1 - Unified Dataset",
        "externalId": "Project1",
        "resourceId": "1",
    }
    unified_dataset_json = {
        "id": "unify://unified-data/v1/datasets/8",
        "name": "Project_1_unified_dataset",
        "version": "10",
        "relativeId": "datasets/8",
        "externalId": "Project_1_unified_dataset",
    }
    rcwd_json = {
        "externalId": "1",
        "id": "unify://unified-data/v1/datasets/36",
        "name": "Project_1_unified_dataset_dedup_clusters_with_data",
        "relativeId": "datasets/36",
        "version": "251",
    }

    project_id = "1"
    base = "http://localhost:9100/api/versioned/v1"
    responses.add(responses.GET, f"{base}/projects/{project_id}", json=project_config)
    responses.add(
        responses.GET,
        f"{base}/projects/{project_id}/unifiedDataset",
        json=unified_dataset_json,
    )
    responses.add(responses.GET, f"{base}/datasets", json=[rcwd_json])

    tamr = Client(UsernamePasswordAuth("username", "password"))
    project = tamr.projects.by_resource_id(project_id)
    rcwd = project.as_mastering().record_clusters_with_data()
    assert rcwd.name == rcwd_json["name"]
def get_all(tamr: Client) -> List[Operation]:
    """Get a list of all jobs or operations.

    Args:
        tamr: A Tamr client

    Returns:
        A list of Operation objects.
    """
    raw_operations = tamr.get(
        "/api/versioned/v1/operations",
        headers={"Accept": "application/json"},
        stream=True,
    ).json()
    return [Operation.from_json(tamr, raw_op) for raw_op in raw_operations]
def test_profile_refresh(self):
    """profile.refresh() POSTs to profile:refresh and reports a succeeded operation."""
    client = Client(UsernamePasswordAuth("username", "password"))
    dataset_id = "3"
    dataset_url = (
        f"{client.protocol}://{client.host}:{client.port}"
        f"/api/versioned/v1/datasets/{dataset_id}"
    )
    profile_url = f"{dataset_url}/profile"
    responses.add(responses.GET, dataset_url, json={})
    responses.add(responses.GET, profile_url, json=self.profile_stale)
    responses.add(
        responses.POST, f"{profile_url}:refresh", json=self.operation_succeeded
    )

    profile = client.datasets.by_resource_id(dataset_id).profile()
    self.assertTrue(profile.refresh().succeeded())
def test_continuous_categorization():
    """Run refresh/train/predict for a categorization project; each op must succeed."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    project = tamr.projects.by_resource_id("3").as_categorization()

    assert project.unified_dataset().refresh(poll_interval_seconds=0).succeeded()

    model = project.model()
    assert model.train(poll_interval_seconds=0).succeeded()
    assert model.predict(poll_interval_seconds=0).succeeded()
def test_dataset_status_repr():
    """DatasetStatus repr contains the class name and key status fields."""
    client = Client(UsernamePasswordAuth("username", "password"))
    status_json = {
        "relativeId": "path/to/thing/1",
        "datasetName": "testdsname",
        "relativeDatasetId": "path/to/data/1",
        "isStreamable": True,
    }
    status = DatasetStatus.from_json(client, status_json)

    expected_class = "tamr_unify_client.models.dataset_status.DatasetStatus"
    rendered = repr(status)
    assert rendered.startswith(f"{expected_class}(")
    assert "testdsname" in rendered
    assert "True" in rendered
    assert "path/to/thing" in rendered
    assert rendered.endswith(")")
def test_profile_create(self):
    """create_profile() must POST profile:refresh without GETting the profile."""
    client = Client(UsernamePasswordAuth("username", "password"))
    dataset_id = "3"
    dataset_url = (
        f"{client.protocol}://{client.host}:{client.port}"
        f"/api/versioned/v1/datasets/{dataset_id}"
    )
    profile_url = f"{dataset_url}/profile"
    responses.add(responses.GET, dataset_url, json={})
    # We need to ensure that, when creating the profile,
    # nothing ever tries to access the (non-existent) profile.
    responses.add(responses.GET, profile_url, status=404)
    responses.add(
        responses.POST, f"{profile_url}:refresh", json=self.operation_succeeded
    )

    dataset = client.datasets.by_resource_id(dataset_id)
    self.assertTrue(dataset.create_profile().succeeded())