class TestFixtureDummyExtractor(BaseExtractor):
    """Test fixture extractor that reports metadata at extract() time only.

    extract() returns a fixed StepMetadata with one input and one output
    dataset on a dummy source; extract_on_complete() deliberately reports
    nothing.
    """

    operator_class = TestFixtureDummyOperator
    # Shared dummy source all fixture datasets are attached to.
    source = Source(
        type="DummySource",
        name="dummy_source_name",
        connection_url="http://dummy/source/url")

    def __init__(self, operator):
        super().__init__(operator)

    def extract(self) -> StepMetadata:
        # Static metadata: one input and one output dataset on the
        # dummy source, plus a marker context entry.
        return StepMetadata(
            name=get_job_name(task=self.operator),
            inputs=[Dataset.from_table(self.source, "extract_input1")],
            outputs=[Dataset.from_table(self.source, "extract_output1")],
            context={"extract": "extract"})

    def extract_on_complete(self, task_instance) -> StepMetadata:
        # This fixture only reports metadata from extract().
        return None
def test_extract(mock_get_table_schemas):
    """PostgresExtractor.extract() maps the parsed SQL to dataset metadata.

    One input table schema is returned by the mocked lookup; the second
    lookup (output tables) returns nothing.
    """
    mock_get_table_schemas.side_effect = \
        [[DB_TABLE_SCHEMA], NO_DB_TABLE_SCHEMA]

    expected_inputs = [
        Dataset(type=DatasetType.DB_TABLE,
                name=f"{DB_SCHEMA_NAME}.{DB_TABLE_NAME.name}",
                source=Source(
                    type='POSTGRESQL',
                    name=CONN_ID,
                    connection_url=CONN_URI),
                fields=[])
    ]

    expected_context = {
        'sql': SQL,
    }

    # Set the environment variable for the connection
    os.environ[f"AIRFLOW_CONN_{CONN_ID.upper()}"] = CONN_URI

    # FIX: extract() returns a *list* containing a single StepMetadata
    # object (see PostgresExtractor.extract), so it must be unwrapped
    # before asserting on its attributes.
    step_metadata = PostgresExtractor(TASK).extract()[0]

    assert step_metadata.name == f"{DAG_ID}.{TASK_ID}"
    assert step_metadata.inputs == expected_inputs
    assert step_metadata.outputs == []
    assert step_metadata.context == expected_context
def test_extract(mock_get_table_schemas):
    """Verify extract() output for one input table and no output tables."""
    mock_get_table_schemas.side_effect = \
        [[DB_TABLE_SCHEMA], NO_DB_TABLE_SCHEMA]

    postgres_source = Source(
        type='POSTGRESQL',
        name=CONN_ID,
        connection_url=CONN_URI)
    expected_inputs = [
        Dataset(
            type=DatasetType.DB_TABLE,
            name=f"{DB_SCHEMA_NAME}.{DB_TABLE_NAME.name}",
            source=postgres_source,
            fields=[])
    ]
    expected_context = {'sql': SQL}

    # Make the Airflow connection resolvable from the environment.
    os.environ[f"AIRFLOW_CONN_{CONN_ID.upper()}"] = CONN_URI

    # NOTE: When extracting operator metadata, only a single StepMetadata
    # object is returned. We'll want to cleanup the Extractor interface to
    # not return an array.
    step_metadata = PostgresExtractor(TASK).extract()[0]

    assert step_metadata.name == f"{DAG_ID}.{TASK_ID}"
    assert step_metadata.inputs == expected_inputs
    assert step_metadata.outputs == []
    assert step_metadata.context == expected_context
def extract(self) -> [StepMetadata]:
    """Build step metadata for a Postgres operator from its SQL statement.

    Returns a single-element list holding the StepMetadata for the task.
    """
    # Parse the operator's SQL to discover input / output tables.
    sql_meta: SqlMeta = SqlParser.parse(self.operator.sql)

    # Every dataset defaults to the operator's own Postgres connection.
    # NOTE: We'll want to look into adding support for the `database`
    # property that is used to override the one defined in the connection.
    conn_id = self.operator.postgres_conn_id
    source = Source(
        type='POSTGRESQL',
        name=conn_id,
        connection_url=get_connection_uri(conn_id))

    # Input tables need their schema fetched so the dataset name can be
    # formatted as {schema_name}.{table_name}.
    input_schemas = self._get_table_schemas(sql_meta.in_tables)
    inputs = [
        Dataset.from_table(
            source=source,
            table_name=schema.table_name.name,
            schema_name=schema.schema_name)
        for schema in input_schemas
    ]
    output_schemas = self._get_table_schemas(sql_meta.out_tables)
    outputs = [
        Dataset.from_table_schema(source=source, table_schema=schema)
        for schema in output_schemas
    ]

    return [
        StepMetadata(
            name=f"{self.operator.dag_id}.{self.operator.task_id}",
            inputs=inputs,
            outputs=outputs,
            context={'sql': self.operator.sql})
    ]
class TestFixtureDummyExtractorOnComplete(BaseExtractor):
    """Test fixture extractor that reports metadata only on task completion.

    extract() deliberately returns nothing; extract_on_complete() emits a
    fixed StepMetadata with one schema-backed input and one output dataset.
    """

    operator_class = TestFixtureDummyOperator
    # Shared dummy source all fixture datasets are attached to.
    source = Source(
        type="DummySource",
        name="dummy_source_name",
        connection_url="http://dummy/source/url")

    def __init__(self, operator):
        super().__init__(operator)

    def extract(self) -> StepMetadata:
        # Metadata for this fixture is only produced after the task runs.
        return None

    def extract_on_complete(self, task_instance) -> StepMetadata:
        # A two-column input table schema, used to exercise the
        # schema-backed dataset construction path.
        input_schema = DbTableSchema(
            schema_name='schema',
            table_name=DbTableName('extract_on_complete_input1'),
            columns=[
                DbColumn(
                    name='field1',
                    type='text',
                    description='',
                    ordinal_position=1),
                DbColumn(
                    name='field2',
                    type='text',
                    description='',
                    ordinal_position=2),
            ])
        return StepMetadata(
            name=get_job_name(task=self.operator),
            inputs=[Dataset.from_table_schema(self.source, input_schema)],
            outputs=[Dataset.from_table(self.source,
                                        "extract_on_complete_output1")],
            context={"extract_on_complete": "extract_on_complete"})
def _source(self, bq_table) -> Source:
    """Build the Source for *bq_table* via the operator's connection.

    The connection URL embeds the fully-qualified BigQuery table name.
    """
    return Source(
        type="BIGQUERY",
        name=self.operator.bigquery_conn_id,
        connection_url=_BIGQUERY_CONN_URL.format(
            self._bq_table_name(bq_table)))
def _source(self) -> Source:
    """Build a Source describing the operator's BigQuery connection."""
    # The connection URL is the fixed BigQuery URL constant.
    return Source(
        type="BIGQUERY",
        name=self.operator.bigquery_conn_id,
        connection_url=_BIGQUERY_CONN_URL)