def namespace(namespace_name):
    """Create a namespace via a Marquez client bound to localhost:8080.

    The namespace is registered with a fixed owner name and description.

    Args:
        namespace_name: Name of the namespace to create.

    Returns:
        The value returned by ``MarquezClient.create_namespace``.
    """
    client = MarquezClient(host="localhost", port=8080)
    return client.create_namespace(
        namespace_name,
        "some_owner",
        "this is a very nice namespace.",
    )
class DAG(airflow.models.DAG):
    """Airflow DAG subclass that reports its runs to Marquez.

    Creating a DAG run registers the job and a job run with Marquez, and the
    DAG callback later marks that job run as completed or failed. Every
    Marquez interaction is best-effort: failures are logged and never
    prevent the underlying Airflow operation from running.

    Marquez settings are read from ``default_args``:
        marquez_location: job location (defaults to ``'unknown'``).
        marquez_input_urns: input dataset URNs (defaults to ``[]``).
        marquez_output_urns: output dataset URNs (defaults to ``[]``).

    The namespace comes from the ``MARQUEZ_NAMESPACE`` environment variable,
    falling back to ``DEFAULT_NAMESPACE``.
    """

    DEFAULT_NAMESPACE = 'default'
    # Timestamp layout used for the nominal start/end times sent to Marquez.
    _NOMINAL_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
    _job_id_mapping = None
    _marquez_client = None

    def __init__(self, *args, **kwargs):
        """Initialise the Airflow DAG and read the Marquez settings.

        All arguments are forwarded unchanged to ``airflow.models.DAG``.
        """
        super().__init__(*args, **kwargs)
        self.marquez_namespace = os.environ.get('MARQUEZ_NAMESPACE') or \
            DAG.DEFAULT_NAMESPACE
        # default_args may be omitted or passed positionally; do not index
        # kwargs blindly. The attribute fallback assumes the parent class
        # stores the value as `self.default_args` - TODO confirm for the
        # Airflow version in use.
        default_args = (kwargs.get('default_args')
                        or getattr(self, 'default_args', None)
                        or {})
        self.marquez_location = default_args.get(
            'marquez_location', 'unknown')
        self.marquez_input_urns = default_args.get(
            'marquez_input_urns', [])
        self.marquez_output_urns = default_args.get(
            'marquez_output_urns', [])
        self._job_id_mapping = JobIdMapping()

    def create_dagrun(self, *args, **kwargs):
        """Create the Airflow DAG run and record a matching Marquez job run.

        The Marquez job run is reported first; if that succeeds, its id is
        stored in the job id mapping under the key built from the new run's
        ``dag_id`` and ``run_id``. Marquez errors are logged and swallowed.

        Returns:
            The DAG run returned by the parent ``create_dagrun``.
        """
        run_args = "{}"  # TODO extract the run Args from the tasks
        marquez_jobrun_id = None
        try:
            marquez_jobrun_id = self.report_jobrun(run_args,
                                                   kwargs['execution_date'])
            log.info('Successfully recorded job run.',
                     airflow_dag_id=self.dag_id,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace)
        except Exception as e:
            # Best effort: reporting must never block the Airflow run.
            log.error(f'Failed to record job run: {e}',
                      airflow_dag_id=self.dag_id,
                      marquez_namespace=self.marquez_namespace)

        run = super().create_dagrun(*args, **kwargs)

        if marquez_jobrun_id:
            try:
                self._job_id_mapping.set(
                    JobIdMapping.make_key(run.dag_id, run.run_id),
                    marquez_jobrun_id)
            except Exception as e:
                log.error(f'Failed job run lookup: {e}',
                          airflow_dag_id=self.dag_id,
                          airflow_run_id=run.run_id,
                          marquez_run_id=marquez_jobrun_id,
                          marquez_namespace=self.marquez_namespace)

        return run

    def handle_callback(self, *args, **kwargs):
        """Report the run's final state to Marquez, then run the callback.

        The first positional argument is treated as the DAG run. Reporting
        errors are logged and do not stop the parent ``handle_callback``.
        """
        try:
            self.report_jobrun_change(args[0], **kwargs)
        except Exception as e:
            log.error(f'Failed to record job run state change: {e}',
                      dag_id=self.dag_id)

        return super().handle_callback(*args, **kwargs)

    def report_jobrun(self, run_args, execution_date):
        """Register the job and a new job run with Marquez.

        Args:
            run_args: Run arguments passed through to ``create_job_run``.
            execution_date: Execution date of the run; used as the nominal
                start time and to compute the nominal end time.

        Returns:
            The Marquez run id, or ``None`` when the response has no
            ``runId``.
        """
        now_ms = self._now_ms()

        job_name = self.dag_id
        # strftime is used for both timestamps so they are rendered the same
        # way whether execution_date is a plain datetime or a Pendulum one.
        start_time = execution_date.strftime(self._NOMINAL_TIME_FORMAT)
        end_time = self.compute_endtime(execution_date)
        if end_time:
            end_time = end_time.strftime(self._NOMINAL_TIME_FORMAT)

        marquez_client = self.get_marquez_client()

        marquez_client.create_job(job_name, self.marquez_location,
                                  self.marquez_input_urns,
                                  self.marquez_output_urns,
                                  description=self.description)
        log.info(f'Successfully recorded job: {job_name}',
                 airflow_dag_id=self.dag_id,
                 marquez_namespace=self.marquez_namespace)

        marquez_jobrun = marquez_client.create_job_run(
            job_name, run_args=run_args,
            nominal_start_time=start_time,
            nominal_end_time=end_time)

        marquez_jobrun_id = marquez_jobrun.get('runId')
        if marquez_jobrun_id:
            marquez_client.mark_job_run_as_running(marquez_jobrun_id)
            log.info(f'Successfully recorded job run: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - now_ms))
        else:
            log.warn(f'Run id found not found: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - now_ms))

        return marquez_jobrun_id

    def compute_endtime(self, execution_date):
        """Return the schedule slot following ``execution_date``."""
        return self.following_schedule(execution_date)

    def report_jobrun_change(self, dagrun, **kwargs):
        """Mark the Marquez job run for ``dagrun`` as completed or failed.

        Args:
            dagrun: The DAG run whose state changed; its ``dag_id`` and
                ``run_id`` form the lookup key for the Marquez run id.
            **kwargs: ``session`` is passed to the id mapping lookup and a
                truthy ``success`` selects COMPLETED over FAILED.

        Nothing is reported when no Marquez run id is stored for the run.
        """
        session = kwargs.get('session')
        marquez_job_run_id = self._job_id_mapping.pop(
            JobIdMapping.make_key(dagrun.dag_id, dagrun.run_id), session)
        if not marquez_job_run_id:
            return

        log.info('Found job run.',
                 airflow_dag_id=dagrun.dag_id,
                 airflow_run_id=dagrun.run_id,
                 marquez_run_id=marquez_job_run_id,
                 marquez_namespace=self.marquez_namespace)

        succeeded = kwargs.get('success')
        if succeeded:
            self.get_marquez_client().mark_job_run_as_completed(
                marquez_job_run_id)
        else:
            self.get_marquez_client().mark_job_run_as_failed(
                marquez_job_run_id)

        state = 'COMPLETED' if succeeded else 'FAILED'
        log.info(f'Marked job run as {state}.',
                 airflow_dag_id=dagrun.dag_id,
                 airflow_run_id=dagrun.run_id,
                 marquez_run_id=marquez_job_run_id,
                 marquez_namespace=self.marquez_namespace)

    def get_marquez_client(self):
        """Return the cached Marquez client, creating it on first use.

        On first use the client is built for this DAG's namespace and the
        namespace is created with owner ``"default_owner"``.
        """
        if not self._marquez_client:
            client = MarquezClient(namespace_name=self.marquez_namespace)
            client.create_namespace(self.marquez_namespace, "default_owner")
            # Cache only after the namespace call succeeded, so a failure
            # here is retried on the next call instead of being skipped.
            self._marquez_client = client
        return self._marquez_client

    @staticmethod
    def _now_ms():
        """Return the current wall-clock time in whole milliseconds."""
        return int(round(time.time() * 1000))
class TestMarquezClient(unittest.TestCase):
    """Unit tests for MarquezClient's create_* and mark_job_run_as_* calls.

    Each test replaces the client's ``_put`` or ``_post`` method with a mock
    that returns a canned payload, then checks fields of the response.
    """

    def setUp(self):
        # A fresh client built with no arguments for every test.
        self.client = MarquezClient()

    # create_namespace: the response must echo name, owner and description.
    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_namespace(self, mock_put):
        owner_name = "me"
        description = "my namespace for testing."
        mock_put.return_value = {
            "name": _NAMESPACE,
            "ownerName": owner_name,
            "description": description
        }
        response = self.client.create_namespace(_NAMESPACE, owner_name,
                                                description)
        assert _NAMESPACE == str(response['name'])
        assert owner_name == str(response['ownerName'])
        assert description == str(response['description'])

    # create_dataset: only 'description' and 'name' of the response are
    # checked; the mocked field list differs from the `fields` sent.
    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_dataset(self, mock_put):
        dataset_name = "my-dataset"
        description = "My dataset for testing."
        fields = [{
            "name": "flight_id",
            "type": "INTEGER",
            "description": "flight id"
        }, {
            "name": "flight_name",
            "type": "VARCHAR",
            "description": "flight name"
        }, {
            "name": "flight_date",
            "type": "TIMESTAMP",
            "description": "flight date"
        }]
        mock_put.return_value = {
            'id': {
                'namespace': 'my-namespace',
                'name': 'my-dataset'
            },
            'type': 'DB_TABLE',
            'name': 'my-dataset',
            'physicalName': 'public.mytable',
            'createdAt': '2020-08-12T05:46:31.172877Z',
            'updatedAt': '2020-08-12T05:46:31.184934Z',
            'namespace': 'my-namespace',
            'sourceName': 'mydb',
            'fields': [{
                'name': 'my_date',
                'type': 'TIMESTAMP',
                'description': 'my date'
            }, {
                'name': 'my_id',
                'type': 'INTEGER',
                'description': 'my id'
            }, {
                'name': 'my_name',
                'type': 'VARCHAR',
                'description': 'my name'
            }],
            'tags': [],
            'lastModifiedAt': None,
            'description': 'My dataset for testing.'
        }
        response = self.client.create_dataset(
            namespace_name=_NAMESPACE,
            dataset_name=dataset_name,
            dataset_type=DatasetType.DB_TABLE,
            physical_name=dataset_name,
            source_name='my-source',
            description=description,
            run_id=None,
            schema_location=None,
            fields=fields,
            tags=None)
        assert str(response['description']) == description
        assert str(response['name']) == dataset_name

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_datasource(self, mock_put):
        source_name = "flight_schedules_db"
        source_type = SourceType.POSTGRESQL
        # NOTE(review): the line below is corrupted. The URL literal runs
        # straight into the decorator of the next test, so the remainder of
        # this test is missing and the text is not valid Python. It looks
        # like a credential-masking pass removed the content - restore this
        # test and the following decorator from version control.
        source_url = "jdbc:postgresql://*****:*****@mock.patch("marquez_client.MarquezClient._put")
    def test_create_job(self, mock_put):
        # create_job: checks the response's 'id' and 'location'.
        job_name = "my-job"
        input_dataset = [{
            "namespace": "my-namespace",
            "name": "public.mytable"
        }]
        output_dataset = {
            "namespace": "my-namespace",
            "name": "public.mytable"
        }
        location = "https://github.com/my-jobs/blob/" \
                   "07f3d2dfc8186cadae9146719e70294a4c7a8ee8"
        context = {"SQL": "SELECT * FROM public.mytable;"}
        mock_put.return_value = {
            "id": {
                "namespace": "my-namespace",
                "name": "my-job"
            },
            "type": "BATCH",
            "name": "my-job",
            "createdAt": "2020-08-12T07:30:55.321059Z",
            "updatedAt": "2020-08-12T07:30:55.333230Z",
            "namespace": "my-namespace",
            "inputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "outputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "location": "https://github.com/my-jobs/blob/"
                        "07f3d2dfc8186cadae9146719e70294a4c7a8ee8",
            "context": {
                "SQL": "SELECT * FROM public.mytable;"
            },
            "description": "My first job.",
            "latestRun": None
        }
        response = self.client.create_job(namespace_name=_NAMESPACE,
                                          job_name=job_name,
                                          job_type=JobType.BATCH,
                                          location=location,
                                          input_dataset=input_dataset,
                                          output_dataset=output_dataset,
                                          context=context)
        # NOTE(review): str(...) never returns None, so this assertion
        # cannot fail; it does not actually verify the id.
        assert str(response['id']) is not None
        assert str(response['location']) == location

    # create_job_run: the response must carry an id, the run args that were
    # sent, and the creation timestamp placed in the mocked payload.
    @mock.patch("marquez_client.MarquezClient._post")
    def test_create_job_run(self, mock_post):
        job_name = "my-job"
        run_args = {
            "email": "*****@*****.**",
            "emailOnFailure": "true",
            "emailOnRetry": "true",
            "retries": "1"
        }
        created_at = str(
            generate(datetime.datetime.utcnow().replace(tzinfo=pytz.utc)))
        mock_post.return_value = {
            'id': f'{uuid.uuid4()}',
            'createdAt': f'{created_at}',
            'updatedAt': '2020-08-12T22:33:02.787228Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'NEW',
            'startedAt': None,
            'endedAt': None,
            'durationMs': None,
            'run_args': {
                "email": "*****@*****.**",
                "emailOnFailure": "true",
                "emailOnRetry": "true",
                "retries": "1"
            }
        }
        response = self.client.create_job_run(namespace_name=_NAMESPACE,
                                              job_name=job_name,
                                              nominal_start_time=None,
                                              nominal_end_time=None,
                                              run_args=run_args,
                                              mark_as_running=False)
        assert response['id'] is not None
        # Compared via their string forms rather than as dicts.
        assert str(response['run_args']) == str(run_args)
        assert str(response['createdAt']) == created_at

    # mark_job_run_as_started: response must carry the run id and RUNNING.
    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_start(self, mock_post):
        run_id = str(uuid.uuid4())
        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'RUNNING',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }
        response = self.client.mark_job_run_as_started(run_id=run_id)
        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.RUNNING.value

    # mark_job_run_as_completed: response must carry the run id and
    # COMPLETED.
    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_completed(self, mock_post):
        run_id = str(uuid.uuid4())
        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'COMPLETED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }
        response = self.client.mark_job_run_as_completed(run_id=run_id)
        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.COMPLETED.value

    # mark_job_run_as_failed: response must carry the run id and FAILED.
    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_failed(self, mock_post):
        run_id = str(uuid.uuid4())
        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'FAILED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }
        response = self.client.mark_job_run_as_failed(run_id=run_id)
        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.FAILED.value

    # mark_job_run_as_aborted: response must carry the run id and ABORTED.
    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_aborted(self, mock_post):
        run_id = str(uuid.uuid4())
        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'ABORTED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }
        response = self.client.mark_job_run_as_aborted(run_id=run_id)
        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.ABORTED.value