Пример #1
0
def namespace(namespace_name):
    """Create ``namespace_name`` in Marquez and return the client's response.

    Talks to the Marquez instance at ``localhost:8080`` and registers the
    namespace with a fixed owner and description.
    """
    client = MarquezClient(host="localhost", port=8080)
    return client.create_namespace(
        namespace_name,
        "some_owner",
        "this is a very nice namespace.",
    )
Пример #2
0
 def get_marquez_client(self):
     """Return the cached Marquez client, creating it on first use.

     On first use a client bound to ``self.marquez_namespace`` is built and
     the namespace is registered with the owner ``"default_owner"``.

     The client is stored on the instance only after the namespace call
     returns, so a failed registration is retried on the next call rather
     than silently skipped.
     """
     if not self._marquez_client:
         client = MarquezClient(namespace_name=self.marquez_namespace)
         client.create_namespace(self.marquez_namespace, "default_owner")
         # Cache last: an exception above must not leave a half-initialised
         # client behind.
         self._marquez_client = client
     return self._marquez_client
Пример #3
0
def job_default_ns(job_name):
    """Create a ``BATCH`` job called ``job_name`` and return the response.

    Uses a client for ``localhost:5000`` constructed without an explicit
    namespace, and registers two fixed input and two fixed output datasets.
    """
    client = MarquezClient(host="localhost", port=5000)

    inputs = ['input1a', 'input2a']
    outputs = ['output1a', 'output2a']
    job_type = 'BATCH'
    location = 'https://github.com/wework/jobs/commit/124f'
    return client.create_job(job_name, job_type, location, inputs, outputs)
Пример #4
0
def job_default_ns(job_name):
    """Create a job called ``job_name`` and return the client's response.

    Uses a client for ``localhost:8080`` constructed without an explicit
    namespace; the job gets a fixed location plus two input and two output
    dataset URNs.
    """
    client = MarquezClient(host="localhost", port=8080)

    input_urns = ['input1a', 'input2a']
    output_urns = ['output1a', 'output2a']
    location = 'some_other_location'
    return client.create_job(job_name, location, input_urns, output_urns)
def test_data_in_marquez(wait_for_marquez, init_airflow_db):
    """Trigger ``test_dag_v2`` and check its namespace and job via Marquez.

    After the DAG has been triggered and its state check passes, both the
    ``integration-test`` namespace and the ``test_dag_v2`` job must be
    retrievable through the client.
    """
    dag_id = "test_dag_v2"
    execution_date = "2019-02-01T00:00:00"
    namespace = "integration-test"

    client = MarquezClient(namespace_name=namespace)

    assert trigger_dag(dag_id, execution_date)
    assert check_dag_state(dag_id, execution_date)

    found_namespace = client.get_namespace(namespace)
    assert found_namespace and found_namespace['name'] == namespace

    expected_job = "test_dag_v2"
    found_job = client.get_job(expected_job)
    assert found_job and found_job['name'] == expected_job
Пример #6
0
def test_namespace_from_constructor(clear_env):
    """A namespace passed to the constructor is used even when
    ``MARQUEZ_NAMESPACE`` is set in the environment."""
    os.environ.update(MARQUEZ_NAMESPACE='from_env')

    marquez = MarquezClient(namespace_name='from_constructor')
    assert marquez.namespace == 'from_constructor'

    # TODO: https://github.com/MarquezProject/marquez-python/issues/59
    # NOTE: this wipes the whole process environment, not just the
    # variable set above.
    os.environ.clear()
Пример #7
0
def marquez_client(namespace_name):
    """Build a client for ``localhost:8080`` bound to ``namespace_name``."""
    options = {
        "host": "localhost",
        "namespace_name": namespace_name,
        "port": 8080,
    }
    return MarquezClient(**options)
Пример #8
0
def marquez_client_default_ns():
    """Build a client for ``localhost:8080`` without naming a namespace."""
    client = MarquezClient(port=8080, host="localhost")
    return client
Пример #9
0
def test_timeout_from_constructor(clear_env):
    """``timeout_ms`` given to the constructor is used (as seconds) even
    when ``MARQUEZ_TIMEOUT_MS`` is set in the environment."""
    os.environ.update(MARQUEZ_TIMEOUT_MS='2000')

    marquez = MarquezClient(timeout_ms=3500)
    assert marquez._timeout == 3.5
Пример #10
0
def test_timeout_default(clear_env):
    """A client built with no arguments uses ``DEFAULT_TIMEOUT_MS``,
    stored in seconds."""
    marquez = MarquezClient()
    expected_seconds = DEFAULT_TIMEOUT_MS / 1000.0
    assert marquez._timeout == expected_seconds
Пример #11
0
def test_port_default(clear_env):
    """A client built with no arguments targets the default host and port."""
    marquez = MarquezClient()
    expected_base = f'http://{DEFAULT_HOST}:{DEFAULT_PORT}/api/v1'
    assert marquez._api_base == expected_base
Пример #12
0
def test_host_from_env(clear_env):
    """``MARQUEZ_HOST`` from the environment ends up in the API base URL."""
    os.environ.update(MARQUEZ_HOST='marquez.dev')

    marquez = MarquezClient()
    expected_base = f'http://marquez.dev:8080/api/v1'
    assert marquez._api_base == expected_base
Пример #13
0
def marquez_client(namespace):
    """Build a client for ``localhost:5000`` bound to ``namespace['name']``."""
    namespace_name = namespace['name']
    return MarquezClient(
        host="localhost",
        namespace_name=namespace_name,
        port=5000,
    )
Пример #14
0
 def setUp(self):
     """Attach a ``MarquezClient`` built with no arguments as ``self.client``."""
     client = MarquezClient()
     self.client = client
Пример #15
0
class TestMarquezClient(unittest.TestCase):
    def setUp(self):
        """Attach a ``MarquezClient`` built with no arguments as ``self.client``."""
        client = MarquezClient()
        self.client = client

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_namespace(self, mock_put):
        """``create_namespace`` returns the name, owner and description
        supplied by the patched ``_put``."""
        owner_name = "me"
        description = "my namespace for testing."

        mock_put.return_value = dict(
            name=_NAMESPACE,
            ownerName=owner_name,
            description=description,
        )

        response = self.client.create_namespace(
            _NAMESPACE, owner_name, description)

        expectations = (
            ('name', _NAMESPACE),
            ('ownerName', owner_name),
            ('description', description),
        )
        for key, expected in expectations:
            assert expected == str(response[key])

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_dataset(self, mock_put):
        """``create_dataset`` returns the name and description supplied by
        the patched ``_put``."""
        dataset_name = "my-dataset"
        description = "My dataset for testing."

        # Fields sent with the request.
        fields = [
            {"name": name, "type": kind, "description": text}
            for name, kind, text in (
                ("flight_id", "INTEGER", "flight id"),
                ("flight_name", "VARCHAR", "flight name"),
                ("flight_date", "TIMESTAMP", "flight date"),
            )
        ]

        # Fields echoed back by the mocked response (deliberately not the
        # same as the request fields above).
        returned_fields = [
            {'name': name, 'type': kind, 'description': text}
            for name, kind, text in (
                ('my_date', 'TIMESTAMP', 'my date'),
                ('my_id', 'INTEGER', 'my id'),
                ('my_name', 'VARCHAR', 'my name'),
            )
        ]

        mock_put.return_value = {
            'id': {'namespace': 'my-namespace', 'name': 'my-dataset'},
            'type': 'DB_TABLE',
            'name': 'my-dataset',
            'physicalName': 'public.mytable',
            'createdAt': '2020-08-12T05:46:31.172877Z',
            'updatedAt': '2020-08-12T05:46:31.184934Z',
            'namespace': 'my-namespace',
            'sourceName': 'mydb',
            'fields': returned_fields,
            'tags': [],
            'lastModifiedAt': None,
            'description': 'My dataset for testing.',
        }

        request = dict(
            namespace_name=_NAMESPACE,
            dataset_name=dataset_name,
            dataset_type=DatasetType.DB_TABLE,
            physical_name=dataset_name,
            source_name='my-source',
            description=description,
            run_id=None,
            schema_location=None,
            fields=fields,
            tags=None,
        )
        response = self.client.create_dataset(**request)

        assert str(response['description']) == description
        assert str(response['name']) == dataset_name

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_datasource(self, mock_put):
        source_name = "flight_schedules_db"
        source_type = SourceType.POSTGRESQL
        source_url = "jdbc:postgresql://*****:*****@mock.patch("marquez_client.MarquezClient._put")
    def test_create_job(self, mock_put):
        """``create_job`` returns the id and location supplied by the
        mocked ``_put``.

        ``mock_put`` is the mock whose ``return_value`` stands in for the
        response of the underlying PUT request.
        """
        job_name = "my-job"
        input_dataset = [{
            "namespace": "my-namespace",
            "name": "public.mytable"
        }]
        # NOTE(review): a single dict here, while input_dataset is a list of
        # dicts; the request is mocked so the shape is never validated.
        # Confirm which shape create_job expects.
        output_dataset = {
            "namespace": "my-namespace",
            "name": "public.mytable"
        }

        location = "https://github.com/my-jobs/blob/" \
                   "07f3d2dfc8186cadae9146719e70294a4c7a8ee8"

        context = {"SQL": "SELECT * FROM public.mytable;"}

        mock_put.return_value = {
            "id": {
                "namespace": "my-namespace",
                "name": "my-job"
            },
            "type": "BATCH",
            "name": "my-job",
            "createdAt": "2020-08-12T07:30:55.321059Z",
            "updatedAt": "2020-08-12T07:30:55.333230Z",
            "namespace": "my-namespace",
            "inputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "outputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "location": "https://github.com/my-jobs/blob/"
            "07f3d2dfc8186cadae9146719e70294a4c7a8ee8",
            "context": {
                "SQL": "SELECT * FROM public.mytable;"
            },
            "description": "My first job.",
            "latestRun": None
        }

        response = self.client.create_job(namespace_name=_NAMESPACE,
                                          job_name=job_name,
                                          job_type=JobType.BATCH,
                                          location=location,
                                          input_dataset=input_dataset,
                                          output_dataset=output_dataset,
                                          context=context)

        # Check the raw value: str(...) is never None, so wrapping the id in
        # str() would make this assertion impossible to fail.
        assert response['id'] is not None
        assert str(response['location']) == location

    @mock.patch("marquez_client.MarquezClient._post")
    def test_create_job_run(self, mock_post):
        """``create_job_run`` returns the id, run args and creation time
        supplied by the patched ``_post``."""
        job_name = "my-job"
        run_args = {
            "email": "*****@*****.**",
            "emailOnFailure": "true",
            "emailOnRetry": "true",
            "retries": "1"
        }
        utc_now = datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
        created_at = str(generate(utc_now))

        mock_post.return_value = dict(
            id=str(uuid.uuid4()),
            createdAt=created_at,
            updatedAt='2020-08-12T22:33:02.787228Z',
            nominalStartTime=None,
            nominalEndTime=None,
            state='NEW',
            startedAt=None,
            endedAt=None,
            durationMs=None,
            # Separate copy with the same content as the request arguments.
            run_args=dict(run_args),
        )

        response = self.client.create_job_run(
            namespace_name=_NAMESPACE,
            job_name=job_name,
            nominal_start_time=None,
            nominal_end_time=None,
            run_args=run_args,
            mark_as_running=False,
        )

        assert response['id'] is not None
        assert str(response['run_args']) == str(run_args)
        assert str(response['createdAt']) == created_at

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_start(self, mock_post):
        """``mark_job_run_as_started`` returns the run id and ``RUNNING``
        state supplied by the patched ``_post``."""
        run_id = str(uuid.uuid4())

        mock_post.return_value = dict(
            id=run_id,
            createdAt='2020-08-12T22:36:50.739951Z',
            updatedAt='2020-08-13T17:56:39.516802Z',
            nominalStartTime=None,
            nominalEndTime=None,
            state='RUNNING',
            startedAt='2020-08-13T17:56:39.516802Z',
            endedAt=None,
            durationMs=None,
            args={},
        )

        result = self.client.mark_job_run_as_started(run_id=run_id)

        assert str(result['id']) == run_id
        assert str(result['state']) == RunState.RUNNING.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_completed(self, mock_post):
        """``mark_job_run_as_completed`` returns the run id and
        ``COMPLETED`` state supplied by the patched ``_post``."""
        run_id = str(uuid.uuid4())

        mock_post.return_value = dict(
            id=run_id,
            createdAt='2020-08-12T22:36:50.739951Z',
            updatedAt='2020-08-13T17:56:39.516802Z',
            nominalStartTime=None,
            nominalEndTime=None,
            state='COMPLETED',
            startedAt='2020-08-13T17:56:39.516802Z',
            endedAt=None,
            durationMs=None,
            args={},
        )

        result = self.client.mark_job_run_as_completed(run_id=run_id)

        assert str(result['id']) == run_id
        assert str(result['state']) == RunState.COMPLETED.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_failed(self, mock_post):
        """``mark_job_run_as_failed`` returns the run id and ``FAILED``
        state supplied by the patched ``_post``."""
        run_id = str(uuid.uuid4())

        mock_post.return_value = dict(
            id=run_id,
            createdAt='2020-08-12T22:36:50.739951Z',
            updatedAt='2020-08-13T17:56:39.516802Z',
            nominalStartTime=None,
            nominalEndTime=None,
            state='FAILED',
            startedAt='2020-08-13T17:56:39.516802Z',
            endedAt=None,
            durationMs=None,
            args={},
        )

        result = self.client.mark_job_run_as_failed(run_id=run_id)

        assert str(result['id']) == run_id
        assert str(result['state']) == RunState.FAILED.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_aborted(self, mock_post):
        """``mark_job_run_as_aborted`` returns the run id and ``ABORTED``
        state supplied by the patched ``_post``."""
        run_id = str(uuid.uuid4())

        mock_post.return_value = dict(
            id=run_id,
            createdAt='2020-08-12T22:36:50.739951Z',
            updatedAt='2020-08-13T17:56:39.516802Z',
            nominalStartTime=None,
            nominalEndTime=None,
            state='ABORTED',
            startedAt='2020-08-13T17:56:39.516802Z',
            endedAt=None,
            durationMs=None,
            args={},
        )

        result = self.client.mark_job_run_as_aborted(run_id=run_id)

        assert str(result['id']) == run_id
        assert str(result['state']) == RunState.ABORTED.value
Пример #16
0
def test_namespace_not_found(wait_for_marquez):
    """Fetching the namespace ``not_found`` raises ``errors.APIError``."""
    client = MarquezClient(host=MARQUEZ_HOST, port=MARQUEZ_PORT)

    missing_namespace = "not_found"
    with pytest.raises(errors.APIError):
        client.get_namespace(missing_namespace)
Пример #17
0
def test_timeout(wait_for_marquez, broken_network):
    """With a 1 ms timeout, fetching a namespace raises ``ReadTimeout``."""
    client = MarquezClient(
        host=MARQUEZ_HOST, port=MARQUEZ_PORT, timeout_ms=1)

    namespace_name = "timeout_test"
    with pytest.raises(ReadTimeout):
        client.get_namespace(namespace_name)
Пример #18
0
def test_bad_port(wait_for_marquez):
    """A client on port ``"6000"`` fails with a ``ConnectionError`` whose
    first argument is a ``MaxRetryError``."""
    client = MarquezClient(host=MARQUEZ_HOST, port="6000")
    with pytest.raises(requests.exceptions.ConnectionError) as excinfo:
        client.get_namespace("no_connection")
    root_cause = excinfo.value.args[0]
    assert isinstance(root_cause, MaxRetryError)
Пример #19
0
def test_host_default(clear_env):
    """A client built with no arguments targets ``DEFAULT_HOST`` on
    port 8080."""
    marquez = MarquezClient()
    expected_base = f'http://{DEFAULT_HOST}:8080/api/v1'
    assert marquez._api_base == expected_base
Пример #20
0
def marquez_client():
    """Build a Marquez client for ``localhost:5000``."""
    client = MarquezClient(port=5000, host="localhost")
    return client
Пример #21
0
def test_host_from_constructor(clear_env):
    """A host passed to the constructor is used even when ``MARQUEZ_HOST``
    is set in the environment."""
    os.environ.update(MARQUEZ_HOST='marquez.dev')

    marquez = MarquezClient(host='marquez.staging')
    expected_base = f'http://marquez.staging:8080/api/v1'
    assert marquez._api_base == expected_base
Пример #22
0
def marquez_client_with_timeout():
    """Build a client for ``localhost:5000`` with a 4000 ms timeout."""
    settings = {"host": "localhost", "port": 5000, "timeout_ms": 4000}
    return MarquezClient(**settings)
Пример #23
0
def test_port_from_constructor(clear_env):
    """A port passed to the constructor is used even when ``MARQUEZ_PORT``
    is set in the environment."""
    os.environ.update(MARQUEZ_PORT='5000')

    marquez = MarquezClient(port=5001)
    expected_base = f'http://{DEFAULT_HOST}:5001/api/v1'
    assert marquez._api_base == expected_base
Пример #24
0
 def get_marquez_client(self):
     """Return the cached Marquez client, building a default one on
     first use."""
     if self._marquez_client:
         return self._marquez_client
     self._marquez_client = MarquezClient()
     return self._marquez_client
Пример #25
0
def test_timeout_from_env(clear_env):
    """``MARQUEZ_TIMEOUT_MS`` from the environment becomes the client
    timeout, stored in seconds."""
    os.environ.update(MARQUEZ_TIMEOUT_MS='2000')

    marquez = MarquezClient()
    assert marquez._timeout == 2.0
Пример #26
0
def client():
    """Build a Marquez client for the local instance on port 5000."""
    # The scheme separator was previously typed as "http;//", which is not
    # a valid URL.
    return MarquezClient(url='http://localhost:5000')
Пример #27
0
def test_namespace_default(clear_env):
    """A client built with no arguments uses ``DEFAULT_NAMESPACE_NAME``."""
    marquez = MarquezClient()
    expected_namespace = DEFAULT_NAMESPACE_NAME
    assert marquez.namespace == expected_namespace
Пример #28
0
    'type': 'VARCHAR',
    'tags': [],
    'description': None
}, {
    'name': 'order_placed_on',
    'type': 'TIMESTAMP',
    'tags': [],
    'description': None
}, {
    'name': 'orders_placed',
    'type': 'INT4',
    'tags': [],
    'description': None
}]

# Module-level Marquez client pointed at http://marquez:5000.
client = MarquezClient(url='http://marquez:5000')

# Module-level connection to the "airflow" database on host "postgres".
airflow_db_conn = psycopg2.connect(host="postgres",
                                   database="airflow",
                                   user="******",
                                   password="******")
# Autocommit is switched on for this connection.
airflow_db_conn.autocommit = True


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def wait_for_dag():
    log.info(f"Waiting for DAG '{DAG_ID}'...")

    cur = airflow_db_conn.cursor()
    cur.execute(f"""
        SELECT dag_id, state
Пример #29
0
 def new_client():
     """Build a Marquez client configured from the environment.

     The URL is read from ``MARQUEZ_URL`` (falling back to
     ``DEFAULT_MARQUEZ_URL``) and the API key from ``MARQUEZ_API_KEY``.
     """
     env = os.environ
     url = env.get('MARQUEZ_URL', DEFAULT_MARQUEZ_URL)
     api_key = env.get('MARQUEZ_API_KEY')
     return MarquezClient(url=url, api_key=api_key)
Пример #30
0
class DAG(airflow.models.DAG):
    """Airflow DAG that reports its job and job runs to Marquez.

    Creating a DAG run registers the DAG as a Marquez job, records a job
    run for it and marks that run as running. The DAG callback then marks
    the run as completed or failed. Marquez failures are logged and never
    prevent the underlying Airflow behaviour from running.

    Marquez settings are read from ``default_args``:
    ``marquez_location`` (default ``'unknown'``), ``marquez_input_urns``
    and ``marquez_output_urns`` (both default to an empty list). The
    namespace comes from the ``MARQUEZ_NAMESPACE`` environment variable,
    falling back to ``DEFAULT_NAMESPACE``.
    """
    DEFAULT_NAMESPACE = 'default'
    _job_id_mapping = None
    _marquez_client = None

    def __init__(self, *args, **kwargs):
        """Initialise the Airflow DAG and read the Marquez settings."""
        super().__init__(*args, **kwargs)
        self.marquez_namespace = os.environ.get('MARQUEZ_NAMESPACE') or \
            DAG.DEFAULT_NAMESPACE
        # default_args may be omitted or None; fall back to the documented
        # defaults instead of raising KeyError/AttributeError.
        default_args = kwargs.get('default_args') or {}
        self.marquez_location = default_args.get(
            'marquez_location', 'unknown')
        self.marquez_input_urns = default_args.get(
            'marquez_input_urns', [])
        self.marquez_output_urns = default_args.get(
            'marquez_output_urns', [])
        self._job_id_mapping = JobIdMapping()

    def create_dagrun(self, *args, **kwargs):
        """Create the Airflow DAG run and record a matching Marquez run.

        The Marquez run is reported first (best effort), then the Airflow
        run is created, and finally the Airflow run is mapped to the
        Marquez run id so the callback can find it later.

        Returns:
            The DAG run created by the parent class.
        """
        run_args = "{}"  # TODO extract the run Args from the tasks
        marquez_jobrun_id = None
        try:
            marquez_jobrun_id = self.report_jobrun(run_args,
                                                   kwargs['execution_date'])
            log.info('Successfully recorded job run.',
                     airflow_dag_id=self.dag_id,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace)
        except Exception as e:
            # Best effort: reporting problems must not block the DAG run.
            log.error(f'Failed to record job run: {e}',
                      airflow_dag_id=self.dag_id,
                      marquez_namespace=self.marquez_namespace)

        run = super(DAG, self).create_dagrun(*args, **kwargs)

        if marquez_jobrun_id:
            try:
                self._job_id_mapping.set(
                    JobIdMapping.make_key(run.dag_id, run.run_id),
                    marquez_jobrun_id)
            except Exception as e:
                # Best effort: a lost mapping only affects state reporting.
                log.error(f'Failed job run lookup: {e}',
                          airflow_dag_id=self.dag_id,
                          airflow_run_id=run.run_id,
                          marquez_run_id=marquez_jobrun_id,
                          marquez_namespace=self.marquez_namespace)

        return run

    def handle_callback(self, *args, **kwargs):
        """Report the run's final state to Marquez, then run the parent
        callback. The first positional argument is the DAG run."""
        try:
            self.report_jobrun_change(args[0], **kwargs)
        except Exception as e:
            log.error(f'Failed to record job run state change: {e}',
                      dag_id=self.dag_id)

        return super().handle_callback(*args, **kwargs)

    def report_jobrun(self, run_args, execution_date):
        """Register the job and a new job run with Marquez.

        Args:
            run_args: run arguments passed through to ``create_job_run``.
            execution_date: datetime-like object for the run; used as the
                nominal start time and to compute the nominal end time.

        Returns:
            The Marquez run id from the ``create_job_run`` response, or
            ``None`` when the response carries no ``runId``.
        """
        now_ms = self._now_ms()

        job_name = self.dag_id
        # strftime keeps the start time consistent with the end time below
        # and also works for plain datetime objects, which have no
        # ``format`` method.
        start_time = execution_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        end_time = self.compute_endtime(execution_date)
        if end_time:
            end_time = end_time.strftime("%Y-%m-%dT%H:%M:%SZ")
        marquez_client = self.get_marquez_client()

        marquez_client.create_job(job_name,
                                  self.marquez_location,
                                  self.marquez_input_urns,
                                  self.marquez_output_urns,
                                  description=self.description)
        log.info(f'Successfully recorded job: {job_name}',
                 airflow_dag_id=self.dag_id,
                 marquez_namespace=self.marquez_namespace)

        marquez_jobrun = marquez_client.create_job_run(
            job_name,
            run_args=run_args,
            nominal_start_time=start_time,
            nominal_end_time=end_time)

        marquez_jobrun_id = marquez_jobrun.get('runId')
        if marquez_jobrun_id:
            marquez_client.mark_job_run_as_running(marquez_jobrun_id)
            log.info(f'Successfully recorded job run: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - now_ms))
        else:
            log.warn(f'Run id found not found: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - now_ms))

        return marquez_jobrun_id

    def compute_endtime(self, execution_date):
        """Return the schedule point that follows ``execution_date``."""
        return self.following_schedule(execution_date)

    def report_jobrun_change(self, dagrun, **kwargs):
        """Mark the Marquez run mapped to ``dagrun`` as completed or failed.

        The mapping entry is removed as part of the lookup. The run is
        marked completed when ``kwargs['success']`` is truthy and failed
        otherwise. Nothing is marked or logged when no Marquez run id is
        mapped to the DAG run.
        """
        session = kwargs.get('session')
        marquez_job_run_id = self._job_id_mapping.pop(
            JobIdMapping.make_key(dagrun.dag_id, dagrun.run_id), session)
        if not marquez_job_run_id:
            return

        log.info('Found job run.',
                 airflow_dag_id=dagrun.dag_id,
                 airflow_run_id=dagrun.run_id,
                 marquez_run_id=marquez_job_run_id,
                 marquez_namespace=self.marquez_namespace)

        succeeded = kwargs.get('success')
        if succeeded:
            self.get_marquez_client().mark_job_run_as_completed(
                marquez_job_run_id)
        else:
            self.get_marquez_client().mark_job_run_as_failed(
                marquez_job_run_id)

        # Only claim the state change once a run was actually marked.
        state = 'COMPLETED' if succeeded else 'FAILED'
        log.info(f'Marked job run as {state}.',
                 airflow_dag_id=dagrun.dag_id,
                 airflow_run_id=dagrun.run_id,
                 marquez_run_id=marquez_job_run_id,
                 marquez_namespace=self.marquez_namespace)

    def get_marquez_client(self):
        """Return the cached Marquez client, creating it on first use.

        On first use the namespace is registered with the owner
        ``"default_owner"``. The client is cached only after that call
        returns, so a failed registration is retried on the next call.
        """
        if not self._marquez_client:
            client = MarquezClient(namespace_name=self.marquez_namespace)
            client.create_namespace(self.marquez_namespace, "default_owner")
            self._marquez_client = client
        return self._marquez_client

    @staticmethod
    def _now_ms():
        """Return the current wall-clock time in whole milliseconds."""
        return int(round(time.time() * 1000))