Example #1
    def extract_on_complete(self, task_instance) -> List[StepMetadata]:
        log.debug(f"extract_on_complete({task_instance})")
        context = self.parse_sql_context()
        source = self._source()

        try:
            bigquery_job_id = self._get_xcom_bigquery_job_id(task_instance)
            context['bigquery.job_id'] = bigquery_job_id
            if bigquery_job_id is None:
                raise Exception("Xcom could not resolve BigQuery job id." +
                                "Job may have failed.")
        except Exception as e:
            log.error(f"Cannot retrieve job details from BigQuery.Client. {e}",
                      exc_info=True)
            context['bigquery.extractor.client_error'] = \
                f"{e}: {traceback.format_exc()}"
            return [StepMetadata(
                name=get_job_name(task=self.operator),
                context=context,
                inputs=None,
                outputs=None
            )]

        inputs = None
        outputs = None
        try:
            client = bigquery.Client()
            try:
                job = client.get_job(job_id=bigquery_job_id)
                job_properties_str = json.dumps(job._properties)
                context['bigquery.job_properties'] = job_properties_str

                inputs = self._get_input_from_bq(job, context, source, client)
                outputs = self._get_output_from_bq(job, source, client)
            finally:
                # Ensure client has close() defined, otherwise ignore.
                # NOTE: close() was introduced in python-bigquery v1.23.0
                if hasattr(client, "close"):
                    client.close()
        except Exception as e:
            log.error(f"Cannot retrieve job details from BigQuery.Client. {e}",
                      exc_info=True)
            context['bigquery.extractor.error'] = \
                f"{e}: {traceback.format_exc()}"

        return [StepMetadata(
            name=get_job_name(task=self.operator),
            inputs=inputs,
            outputs=outputs,
            context=context
        )]
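
The try block above hinges on `_get_xcom_bigquery_job_id`, which is not shown. A minimal sketch of that helper, assuming the operator pushes its job id to XCom under the key `job_id` (both the key name and the `task_ids` argument are assumptions, not confirmed by the snippet):

    def _get_xcom_bigquery_job_id(self, task_instance):
        # xcom_pull() returns None when nothing was pushed under this key,
        # which is exactly the case the caller turns into an exception.
        # The key name 'job_id' is an assumption.
        return task_instance.xcom_pull(
            task_ids=task_instance.task_id, key='job_id')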
Example #2
    def extract_on_complete(self, task_instance) -> StepMetadata:
        inputs = [
            Dataset.from_table_schema(self.source, DbTableSchema(
                schema_name='schema',
                table_name=DbTableName('extract_on_complete_input1'),
                columns=[
                    DbColumn(
                        name='field1',
                        type='text',
                        description='',
                        ordinal_position=1
                    ),
                    DbColumn(
                        name='field2',
                        type='text',
                        description='',
                        ordinal_position=2
                    )
                ]
            ))
        ]
        outputs = [
            Dataset.from_table(self.source, "extract_on_complete_output1")
        ]
        return StepMetadata(
            name=get_job_name(task=self.operator),
            inputs=inputs,
            outputs=outputs,
            context={
                "extract_on_complete": "extract_on_complete"
            }
        )
Example #3
    def extract(self) -> List[StepMetadata]:
        # (1) Parse sql statement to obtain input / output tables.
        sql_meta: SqlMeta = SqlParser.parse(self.operator.sql)

        # (2) Default all inputs / outputs to current connection.
        # NOTE: We'll want to look into adding support for the `database`
        # property that is used to override the one defined in the connection.
        conn_id = self.operator.postgres_conn_id
        source = Source(type='POSTGRESQL',
                        name=conn_id,
                        connection_url=get_connection_uri(conn_id))

        # (3) Map input / output tables to dataset objects with source set
        # as the current connection. We need to also fetch the schema for the
        # input tables to format the dataset name as:
        # {schema_name}.{table_name}
        inputs = [
            Dataset.from_table(source=source,
                               table_name=in_table_schema.table_name.name,
                               schema_name=in_table_schema.schema_name)
            for in_table_schema in self._get_table_schemas(sql_meta.in_tables)
        ]
        outputs = [
            Dataset.from_table_schema(source=source,
                                      table_schema=out_table_schema) for
            out_table_schema in self._get_table_schemas(sql_meta.out_tables)
        ]

        return [
            StepMetadata(
                name=f"{self.operator.dag_id}.{self.operator.task_id}",
                inputs=inputs,
                outputs=outputs,
                context={'sql': self.operator.sql})
        ]
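
Step (1) relies on the shape of `SqlMeta`; the snippet only touches `in_tables` and `out_tables`. A minimal sketch of that container as used here (the dataclass form and the element type are assumptions inferred from usage):

from dataclasses import dataclass
from typing import List


@dataclass
class SqlMeta:
    # Tables the parsed statement reads from and writes to. Whether the
    # elements are plain strings or richer table-name objects is an
    # assumption; the snippet only passes them to _get_table_schemas().
    in_tables: List[str]
    out_tables: List[str]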
Example #4
    def _extract_metadata(self, dagrun, task, ti=None):
        extractor = self._get_extractor(task)
        task_info = f'task_type={task.__class__.__name__} ' \
            f'airflow_dag_id={self.dag_id} ' \
            f'task_id={task.task_id} ' \
            f'airflow_run_id={dagrun.run_id} '
        if extractor:
            try:
                self.log.debug(
                    f'Using extractor {extractor.__name__} {task_info}')
                steps = self._extract(extractor, task, ti)

                return add_airflow_info_to(
                    task,
                    steps
                )
            except Exception as e:
                self.log.error(
                    f'Failed to extract metadata {e} {task_info}',
                    exc_info=True)
        else:
            self.log.warning(
                f'Unable to find an extractor. {task_info}')

        return add_airflow_info_to(
            task,
            [StepMetadata(name=self._marquez_job_name(
                self.dag_id, task.task_id))]
        )
Example #5
    def extract(self) -> List[StepMetadata]:
        inputs = [Dataset.from_table(self.source, "extract_input1")]
        outputs = [Dataset.from_table(self.source, "extract_output1")]
        return [
            StepMetadata(name=get_job_name(task=self.operator),
                         inputs=inputs,
                         outputs=outputs,
                         context={"extract": "extract"})
        ]
Example #6
def test_add_airflow_info_to():
    task = DummyOperator(task_id='test.task')
    steps_metadata = [StepMetadata(name='test.task')]

    add_airflow_info_to(task, steps_metadata)
    for step_metadata in steps_metadata:
        assert step_metadata.context['airflow.operator'] == \
            'airflow.operators.dummy_operator.DummyOperator'
        assert step_metadata.context['airflow.version'] == AIRFLOW_VERSION
        assert step_metadata.context['airflow.task_info'] is not None
        assert step_metadata.context['marquez_airflow.version'] == \
            MARQUEZ_AIRFLOW_VERSION
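
The assertions pin down most of the contract of `add_airflow_info_to`. A sketch that would satisfy this test, assuming the context dicts are mutated in place (the value stored under 'airflow.task_info' is a placeholder assumption; the test only checks that it is not None):

def add_airflow_info_to(task, steps_metadata):
    # Reconstructed from the assertions above; only the four context keys
    # the test inspects are filled in.
    operator = f'{task.__class__.__module__}.{task.__class__.__name__}'
    for step in steps_metadata:
        step.context['airflow.operator'] = operator
        step.context['airflow.version'] = AIRFLOW_VERSION
        step.context['airflow.task_info'] = str(vars(task))  # placeholder
        step.context['marquez_airflow.version'] = MARQUEZ_AIRFLOW_VERSION
    return steps_metadata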
Example #7
    def _extract_metadata(self, dagrun, task, task_instance=None) -> StepMetadata:
        extractor = self._get_extractor(task)
        task_info = f'task_type={task.__class__.__name__} ' \
            f'airflow_dag_id={self.dag_id} ' \
            f'task_id={task.task_id} ' \
            f'airflow_run_id={dagrun.run_id} '
        if extractor:
            try:
                self.log.debug(
                    f'Using extractor {extractor.__name__} {task_info}')
                step = self._extract(extractor, task, task_instance)

                if isinstance(step, StepMetadata):
                    return step

                # Compatibility with custom extractors
                if isinstance(step, list):
                    if len(step) == 0:
                        return StepMetadata(
                            name=self._marquez_job_name(self.dag_id, task.task_id)
                        )
                    elif len(step) > 1:
                        self.log.warning(
                            f'Extractor {extractor.__name__} {task_info} '
                            f'returned more than one StepMetadata instance: '
                            f'{step}; all steps except the first will be '
                            f'dropped!'
                        )
                    return step[0]

            except Exception as e:
                self.log.error(
                    f'Failed to extract metadata {e} {task_info}',
                    exc_info=True)
        else:
            self.log.warning(
                f'Unable to find an extractor. {task_info}')

        return StepMetadata(
            name=self._marquez_job_name(self.dag_id, task.task_id)
        )
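
The list branch exists for backward compatibility: older custom extractors (like Examples #1, #3, and #5) return a list of StepMetadata rather than a single instance. A hypothetical extractor of that older shape, to show what the branch has to absorb (the BaseExtractor parent is an assumption):

class LegacyCustomExtractor(BaseExtractor):
    # Hypothetical pre-single-StepMetadata extractor: extract() returns a
    # list, so _extract_metadata() above keeps step[0] and warns about
    # the rest.
    def extract(self) -> List[StepMetadata]:
        return [
            StepMetadata(name='legacy.step_one'),
            StepMetadata(name='legacy.step_two'),  # dropped with a warning
        ]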
Example #8
    def extract_on_complete(self, task_instance) -> Optional[StepMetadata]:
        log.debug(f"extract_on_complete({task_instance})")
        context = self.parse_sql_context()

        try:
            bigquery_job_id = self._get_xcom_bigquery_job_id(task_instance)
            if bigquery_job_id is None:
                raise Exception("Xcom could not resolve BigQuery job id." +
                                "Job may have failed.")
        except Exception as e:
            log.error(f"Cannot retrieve job details from BigQuery.Client. {e}",
                      exc_info=True)
            return StepMetadata(
                name=get_job_name(task=self.operator),
                inputs=None,
                outputs=None,
                run_facets={
                    "bigQuery_error": BigQueryErrorRunFacet(
                        clientError=f"{e}: {traceback.format_exc()}",
                        parserError=context.parser_error
                    )
                }
            )

        inputs = None
        output = None
        run_facets = {}
        try:
            client = bigquery.Client()
            try:
                job = client.get_job(job_id=bigquery_job_id)
                props = job._properties

                run_stat_facet, dataset_stat_facet = self._get_output_statistics(props)

                run_facets.update({
                    "bigQuery_statistics": run_stat_facet
                })

                inputs = self._get_input_from_bq(props, client)
                output = self._get_output_from_bq(props, client)
                if output:
                    output.custom_facets.update({
                        "stats": dataset_stat_facet
                    })

            finally:
                # Ensure client has close() defined, otherwise ignore.
                # NOTE: close() was introduced in python-bigquery v1.23.0
                if hasattr(client, "close"):
                    client.close()
        except Exception as e:
            log.error(f"Cannot retrieve job details from BigQuery.Client. {e}",
                      exc_info=True)
            run_facets.update({
                "bigQuery_error": BigQueryErrorRunFacet(
                    clientError=f"{e}: {traceback.format_exc()}",
                    parserError=context.parser_error
                )
            })

        return StepMetadata(
            name=get_job_name(task=self.operator),
            inputs=inputs,
            outputs=[output] if output else [],
            run_facets=run_facets
        )
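
Both error paths build a `BigQueryErrorRunFacet` from the same two fields. A minimal sketch of that facet as used here (the dataclass form is an assumption; only the field names are confirmed by the call sites):

from dataclasses import dataclass
from typing import Optional


@dataclass
class BigQueryErrorRunFacet:
    # Either side can fail independently, so both fields default to
    # None here; that default is an assumption.
    clientError: Optional[str] = None
    parserError: Optional[str] = None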
Example #9
    def report_task(self, dag_run_id, execution_date, run_args, task,
                    extractor):

        report_job_start_ms = self._now_ms()
        marquez_client = self.get_marquez_client()
        if execution_date:
            start_time = self._to_iso_8601(execution_date)
            end_time = self.compute_endtime(execution_date)
        else:
            start_time = None
            end_time = None

        if end_time:
            end_time = self._to_iso_8601(end_time)

        task_location = None
        try:
            if hasattr(task, 'file_path') and task.file_path:
                task_location = get_location(task.file_path)
            else:
                task_location = get_location(task.dag.fileloc)
        except Exception:
            log.warning('Unable to fetch the location')

        steps_metadata = []
        if extractor:
            try:
                log.info(f'Using extractor {extractor.__name__}',
                         task_type=task.__class__.__name__,
                         airflow_dag_id=self.dag_id,
                         task_id=task.task_id,
                         airflow_run_id=dag_run_id,
                         marquez_namespace=self.marquez_namespace)
                steps_metadata = extractor(task).extract()
            except Exception as e:
                log.error(f'Failed to extract metadata {e}',
                          airflow_dag_id=self.dag_id,
                          task_id=task.task_id,
                          airflow_run_id=dag_run_id,
                          marquez_namespace=self.marquez_namespace)
        else:
            log.warning('Unable to find an extractor.',
                        task_type=task.__class__.__name__,
                        airflow_dag_id=self.dag_id,
                        task_id=task.task_id,
                        airflow_run_id=dag_run_id,
                        marquez_namespace=self.marquez_namespace)

        task_name = f'{self.dag_id}.{task.task_id}'

        # If no extractor found or failed to extract metadata,
        # report the task metadata
        if not steps_metadata:
            steps_metadata = [StepMetadata(task_name)]

        # store all the JobRuns associated with a task
        marquez_jobrun_ids = []

        for step in steps_metadata:
            input_datasets = []
            output_datasets = []

            try:
                input_datasets = self.register_datasets(step.inputs)
            except Exception as e:
                log.error(f'Failed to register inputs: {e}',
                          inputs=str(step.inputs),
                          airflow_dag_id=self.dag_id,
                          task_id=task.task_id,
                          step=step.name,
                          airflow_run_id=dag_run_id,
                          marquez_namespace=self.marquez_namespace)
            try:
                output_datasets = self.register_datasets(step.outputs)
            except Exception as e:
                log.error(f'Failed to register outputs: {e}',
                          outputs=str(step.outputs),
                          airflow_dag_id=self.dag_id,
                          task_id=task.task_id,
                          step=step.name,
                          airflow_run_id=dag_run_id,
                          marquez_namespace=self.marquez_namespace)

            marquez_client.create_job(
                job_name=step.name,
                job_type='BATCH',  # job type
                location=(step.location or task_location),
                input_dataset=input_datasets,
                output_dataset=output_datasets,
                context=step.context,
                description=self.description,
                namespace_name=self.marquez_namespace)
            log.info(f'Successfully recorded job: {step.name}',
                     airflow_dag_id=self.dag_id,
                     marquez_namespace=self.marquez_namespace)

            marquez_jobrun_id = marquez_client.create_job_run(
                step.name,
                run_args=run_args,
                nominal_start_time=start_time,
                nominal_end_time=end_time).get('runId')

            if marquez_jobrun_id:
                marquez_jobrun_ids.append(marquez_jobrun_id)
                marquez_client.mark_job_run_as_started(marquez_jobrun_id)
            else:
                log.error(f'Failed to get run id: {step.name}',
                          airflow_dag_id=self.dag_id,
                          airflow_run_id=dag_run_id,
                          marquez_namespace=self.marquez_namespace)
            log.info(f'Successfully recorded job run: {step.name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - report_job_start_ms))

        # Store the mapping for all the steps associated with a task
        try:
            self._job_id_mapping.set(
                JobIdMapping.make_key(task_name, dag_run_id),
                json.dumps(marquez_jobrun_ids))

        except Exception as e:
            log.error(f'Failed to set id mapping: {e}',
                      airflow_dag_id=self.dag_id,
                      task_id=task.task_id,
                      airflow_run_id=dag_run_id,
                      marquez_run_id=marquez_jobrun_ids,
                      marquez_namespace=self.marquez_namespace)
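
The final mapping step keys on `JobIdMapping.make_key(task_name, dag_run_id)`. A plausible sketch of that helper, assuming a simple composite string key (the exact format is an assumption; it only needs to be unique per task and DAG run):

class JobIdMapping:
    @staticmethod
    def make_key(job_name, run_id):
        # Hypothetical format: any string unique to the (job, run) pair
        # works, since it is only used to store and look up the runIds.
        return f'marquez_id_mapping-{job_name}-{run_id}'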