def extract_on_complete(self, task_instance) -> [StepMetadata]:
    log.debug(f"extract_on_complete({task_instance})")
    context = self.parse_sql_context()
    source = self._source()

    try:
        bigquery_job_id = self._get_xcom_bigquery_job_id(task_instance)
        context['bigquery.job_id'] = bigquery_job_id
        if bigquery_job_id is None:
            raise Exception(
                "Xcom could not resolve BigQuery job id. "
                "Job may have failed."
            )
    except Exception as e:
        log.error(f"Cannot retrieve job details from BigQuery.Client. {e}",
                  exc_info=True)
        context['bigquery.extractor.client_error'] = \
            f"{e}: {traceback.format_exc()}"
        return [StepMetadata(
            name=get_job_name(task=self.operator),
            context=context,
            inputs=None,
            outputs=None
        )]

    inputs = None
    outputs = None
    try:
        client = bigquery.Client()
        try:
            job = client.get_job(job_id=bigquery_job_id)
            job_properties_str = json.dumps(job._properties)
            context['bigquery.job_properties'] = job_properties_str
            inputs = self._get_input_from_bq(job, context, source, client)
            outputs = self._get_output_from_bq(job, source, client)
        finally:
            # Ensure client has close() defined, otherwise ignore.
            # NOTE: close() was introduced in python-bigquery v1.23.0
            if hasattr(client, "close"):
                client.close()
    except Exception as e:
        log.error(f"Cannot retrieve job details from BigQuery.Client. {e}",
                  exc_info=True)
        context['bigquery.extractor.error'] = \
            f"{e}: {traceback.format_exc()}"

    return [StepMetadata(
        name=get_job_name(task=self.operator),
        inputs=inputs,
        outputs=outputs,
        context=context
    )]
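
# For context, the BigQuery job id above is pulled from XCom. A minimal sketch of
# what _get_xcom_bigquery_job_id might look like follows; the 'job_id' XCom key is
# an assumption and not confirmed by the snippet above.
def _get_xcom_bigquery_job_id(self, task_instance):
    # Assumption: the BigQuery operator pushed its job id to XCom under the
    # 'job_id' key for this same task. Returns None if nothing was pushed.
    bigquery_job_id = task_instance.xcom_pull(
        task_ids=task_instance.task_id, key='job_id')
    log.debug(f"bigquery_job_id: {bigquery_job_id}")
    return bigquery_job_id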
def extract_on_complete(self, task_instance) -> StepMetadata:
    inputs = [
        Dataset.from_table_schema(self.source, DbTableSchema(
            schema_name='schema',
            table_name=DbTableName('extract_on_complete_input1'),
            columns=[
                DbColumn(
                    name='field1',
                    type='text',
                    description='',
                    ordinal_position=1
                ),
                DbColumn(
                    name='field2',
                    type='text',
                    description='',
                    ordinal_position=2
                )
            ]
        ))
    ]
    outputs = [
        Dataset.from_table(self.source, "extract_on_complete_output1")
    ]
    return StepMetadata(
        name=get_job_name(task=self.operator),
        inputs=inputs,
        outputs=outputs,
        context={
            "extract_on_complete": "extract_on_complete"
        }
    )
def extract(self) -> [StepMetadata]:
    # (1) Parse sql statement to obtain input / output tables.
    sql_meta: SqlMeta = SqlParser.parse(self.operator.sql)

    # (2) Default all inputs / outputs to current connection.
    # NOTE: We'll want to look into adding support for the `database`
    # property that is used to override the one defined in the connection.
    conn_id = self.operator.postgres_conn_id
    source = Source(
        type='POSTGRESQL',
        name=conn_id,
        connection_url=get_connection_uri(conn_id))

    # (3) Map input / output tables to dataset objects with source set
    # as the current connection. We need to also fetch the schema for the
    # input tables to format the dataset name as:
    # {schema_name}.{table_name}
    inputs = [
        Dataset.from_table(
            source=source,
            table_name=in_table_schema.table_name.name,
            schema_name=in_table_schema.schema_name)
        for in_table_schema in self._get_table_schemas(sql_meta.in_tables)
    ]
    outputs = [
        Dataset.from_table_schema(
            source=source,
            table_schema=out_table_schema)
        for out_table_schema in self._get_table_schemas(sql_meta.out_tables)
    ]

    return [
        StepMetadata(
            name=f"{self.operator.dag_id}.{self.operator.task_id}",
            inputs=inputs,
            outputs=outputs,
            context={'sql': self.operator.sql})
    ]
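
# Hedged usage sketch for the extract() above. The extractor class name
# (PostgresExtractor), connection id, and SQL are illustrative assumptions,
# and running it end-to-end requires a configured Airflow connection plus a
# reachable Postgres database for the schema lookup.
from airflow.operators.postgres_operator import PostgresOperator

task = PostgresOperator(
    task_id='insert_popular_orders',
    postgres_conn_id='food_delivery_db',
    sql="INSERT INTO schema.popular_orders SELECT * FROM schema.orders WHERE cnt > 10;"
)

steps = PostgresExtractor(task).extract()
print(steps[0].inputs)   # expected: dataset(s) resolved for schema.orders
print(steps[0].outputs)  # expected: dataset(s) resolved for schema.popular_orders
print(steps[0].context)  # {'sql': 'INSERT INTO schema.popular_orders ...'}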
def _extract_metadata(self, dagrun, task, ti=None):
    extractor = self._get_extractor(task)
    task_info = f'task_type={task.__class__.__name__} ' \
        f'airflow_dag_id={self.dag_id} ' \
        f'task_id={task.task_id} ' \
        f'airflow_run_id={dagrun.run_id} '
    if extractor:
        try:
            self.log.debug(
                f'Using extractor {extractor.__name__} {task_info}')
            steps = self._extract(extractor, task, ti)
            return add_airflow_info_to(
                task,
                steps
            )
        except Exception as e:
            self.log.error(
                f'Failed to extract metadata {e} {task_info}',
                exc_info=True)
    else:
        self.log.warning(
            f'Unable to find an extractor. {task_info}')

    return add_airflow_info_to(
        task,
        [StepMetadata(name=self._marquez_job_name(
            self.dag_id, task.task_id))]
    )
def extract(self) -> [StepMetadata]:
    inputs = [
        Dataset.from_table(self.source, "extract_input1")
    ]
    outputs = [
        Dataset.from_table(self.source, "extract_output1")
    ]
    return [
        StepMetadata(
            name=get_job_name(task=self.operator),
            inputs=inputs,
            outputs=outputs,
            context={"extract": "extract"})
    ]
def test_add_airflow_info_to():
    task = DummyOperator(task_id='test.task')
    steps_metadata = [StepMetadata(name='test.task')]

    add_airflow_info_to(task, steps_metadata)

    for step_metadata in steps_metadata:
        assert step_metadata.context['airflow.operator'] == \
            'airflow.operators.dummy_operator.DummyOperator'
        assert step_metadata.context['airflow.version'] == AIRFLOW_VERSION
        assert step_metadata.context['airflow.task_info'] is not None
        assert step_metadata.context['marquez_airflow.version'] == \
            MARQUEZ_AIRFLOW_VERSION
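
# The helper exercised by the test above is not shown here. The sketch below is
# reconstructed only from the asserted context keys, so the exact values (in
# particular 'airflow.task_info' and the version constants) are assumptions,
# not the project's real implementation.
from airflow.version import version as AIRFLOW_VERSION

MARQUEZ_AIRFLOW_VERSION = '0.0.0'  # placeholder; really read from the package


def add_airflow_info_to(task, steps_metadata):
    # Annotate each step with the operator's fully-qualified class name,
    # a dump of the task attributes, and the Airflow / integration versions.
    operator = f'{task.__class__.__module__}.{task.__class__.__name__}'
    for step in steps_metadata:
        step.context.update({
            'airflow.operator': operator,
            'airflow.task_info': str(task.__dict__),
            'airflow.version': AIRFLOW_VERSION,
            'marquez_airflow.version': MARQUEZ_AIRFLOW_VERSION,
        })
    return steps_metadata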
def _extract_metadata(self, dagrun, task, task_instance=None) -> StepMetadata:
    extractor = self._get_extractor(task)
    task_info = f'task_type={task.__class__.__name__} ' \
        f'airflow_dag_id={self.dag_id} ' \
        f'task_id={task.task_id} ' \
        f'airflow_run_id={dagrun.run_id} '
    if extractor:
        try:
            self.log.debug(
                f'Using extractor {extractor.__name__} {task_info}')
            step = self._extract(extractor, task, task_instance)
            if isinstance(step, StepMetadata):
                return step

            # Compatibility with custom extractors
            if isinstance(step, list):
                if len(step) == 0:
                    return StepMetadata(
                        name=self._marquez_job_name(self.dag_id, task.task_id)
                    )
                elif len(step) > 1:
                    self.log.warning(
                        f'Extractor {extractor.__name__} {task_info} '
                        f'returned more than one StepMetadata instance: {step} '
                        f'- will drop all steps except the first!'
                    )
                return step[0]
        except Exception as e:
            self.log.error(
                f'Failed to extract metadata {e} {task_info}',
                exc_info=True)
    else:
        self.log.warning(
            f'Unable to find an extractor. {task_info}')

    return StepMetadata(
        name=self._marquez_job_name(self.dag_id, task.task_id)
    )
def extract_on_complete(self, task_instance) -> Optional[StepMetadata]:
    log.debug(f"extract_on_complete({task_instance})")
    context = self.parse_sql_context()

    try:
        bigquery_job_id = self._get_xcom_bigquery_job_id(task_instance)
        if bigquery_job_id is None:
            raise Exception(
                "Xcom could not resolve BigQuery job id. "
                "Job may have failed."
            )
    except Exception as e:
        log.error(f"Cannot retrieve job details from BigQuery.Client. {e}",
                  exc_info=True)
        return StepMetadata(
            name=get_job_name(task=self.operator),
            inputs=None,
            outputs=None,
            run_facets={
                "bigQuery_error": BigQueryErrorRunFacet(
                    clientError=f"{e}: {traceback.format_exc()}",
                    parserError=context.parser_error
                )
            }
        )

    inputs = None
    output = None
    run_facets = {}
    try:
        client = bigquery.Client()
        try:
            job = client.get_job(job_id=bigquery_job_id)
            props = job._properties

            run_stat_facet, dataset_stat_facet = \
                self._get_output_statistics(props)
            run_facets.update({
                "bigQuery_statistics": run_stat_facet
            })

            inputs = self._get_input_from_bq(props, client)
            output = self._get_output_from_bq(props, client)
            if output:
                output.custom_facets.update({
                    "stats": dataset_stat_facet
                })
        finally:
            # Ensure client has close() defined, otherwise ignore.
            # NOTE: close() was introduced in python-bigquery v1.23.0
            if hasattr(client, "close"):
                client.close()
    except Exception as e:
        log.error(f"Cannot retrieve job details from BigQuery.Client. {e}",
                  exc_info=True)
        run_facets.update({
            "bigQuery_error": BigQueryErrorRunFacet(
                clientError=f"{e}: {traceback.format_exc()}",
                parserError=context.parser_error
            )
        })

    return StepMetadata(
        name=get_job_name(task=self.operator),
        inputs=inputs,
        outputs=[output] if output else [],
        run_facets=run_facets
    )
def report_task(self,
                dag_run_id,
                execution_date,
                run_args,
                task,
                extractor):
    report_job_start_ms = self._now_ms()
    marquez_client = self.get_marquez_client()
    if execution_date:
        start_time = self._to_iso_8601(execution_date)
        end_time = self.compute_endtime(execution_date)
    else:
        start_time = None
        end_time = None

    if end_time:
        end_time = self._to_iso_8601(end_time)

    task_location = None
    try:
        if hasattr(task, 'file_path') and task.file_path:
            task_location = get_location(task.file_path)
        else:
            task_location = get_location(task.dag.fileloc)
    except Exception:
        log.warn('Unable to fetch the location')

    steps_metadata = []
    if extractor:
        try:
            log.info(f'Using extractor {extractor.__name__}',
                     task_type=task.__class__.__name__,
                     airflow_dag_id=self.dag_id,
                     task_id=task.task_id,
                     airflow_run_id=dag_run_id,
                     marquez_namespace=self.marquez_namespace)
            steps_metadata = extractor(task).extract()
        except Exception as e:
            log.error(f'Failed to extract metadata {e}',
                      airflow_dag_id=self.dag_id,
                      task_id=task.task_id,
                      airflow_run_id=dag_run_id,
                      marquez_namespace=self.marquez_namespace)
    else:
        log.warn('Unable to find an extractor.',
                 task_type=task.__class__.__name__,
                 airflow_dag_id=self.dag_id,
                 task_id=task.task_id,
                 airflow_run_id=dag_run_id,
                 marquez_namespace=self.marquez_namespace)

    task_name = f'{self.dag_id}.{task.task_id}'

    # If no extractor found or failed to extract metadata,
    # report the task metadata
    if not steps_metadata:
        steps_metadata = [StepMetadata(task_name)]

    # store all the JobRuns associated with a task
    marquez_jobrun_ids = []

    for step in steps_metadata:
        input_datasets = []
        output_datasets = []

        try:
            input_datasets = self.register_datasets(step.inputs)
        except Exception as e:
            log.error(f'Failed to register inputs: {e}',
                      inputs=str(step.inputs),
                      airflow_dag_id=self.dag_id,
                      task_id=task.task_id,
                      step=step.name,
                      airflow_run_id=dag_run_id,
                      marquez_namespace=self.marquez_namespace)
        try:
            output_datasets = self.register_datasets(step.outputs)
        except Exception as e:
            log.error(f'Failed to register outputs: {e}',
                      outputs=str(step.outputs),
                      airflow_dag_id=self.dag_id,
                      task_id=task.task_id,
                      step=step.name,
                      airflow_run_id=dag_run_id,
                      marquez_namespace=self.marquez_namespace)

        marquez_client.create_job(job_name=step.name,
                                  job_type='BATCH',  # job type
                                  location=(step.location or task_location),
                                  input_dataset=input_datasets,
                                  output_dataset=output_datasets,
                                  context=step.context,
                                  description=self.description,
                                  namespace_name=self.marquez_namespace)
        log.info(f'Successfully recorded job: {step.name}',
                 airflow_dag_id=self.dag_id,
                 marquez_namespace=self.marquez_namespace)

        marquez_jobrun_id = marquez_client.create_job_run(
            step.name,
            run_args=run_args,
            nominal_start_time=start_time,
            nominal_end_time=end_time).get('runId')

        if marquez_jobrun_id:
            marquez_jobrun_ids.append(marquez_jobrun_id)
            marquez_client.mark_job_run_as_started(marquez_jobrun_id)
        else:
            log.error(f'Failed to get run id: {step.name}',
                      airflow_dag_id=self.dag_id,
                      airflow_run_id=dag_run_id,
                      marquez_namespace=self.marquez_namespace)
        log.info(f'Successfully recorded job run: {step.name}',
                 airflow_dag_id=self.dag_id,
                 airflow_dag_execution_time=start_time,
                 marquez_run_id=marquez_jobrun_id,
                 marquez_namespace=self.marquez_namespace,
                 duration_ms=(self._now_ms() - report_job_start_ms))

    # Store the mapping for all the steps associated with a task
    try:
        self._job_id_mapping.set(
            JobIdMapping.make_key(task_name, dag_run_id),
            json.dumps(marquez_jobrun_ids))
    except Exception as e:
        log.error(f'Failed to set id mapping : {e}',
                  airflow_dag_id=self.dag_id,
                  task_id=task.task_id,
                  airflow_run_id=dag_run_id,
                  marquez_run_id=marquez_jobrun_ids,
                  marquez_namespace=self.marquez_namespace)