Example #1
0
    def _get_input_from_bq(self, properties, client):
        """Build input datasets for the tables referenced by a BigQuery job.

        Returns a list of Dataset objects, or None when the job statistics
        reference no tables. If schema retrieval fails, falls back to
        schema-less datasets built from the table names alone.
        """
        referenced = get_from_nullable_chain(properties, [
            'statistics', 'query', 'referencedTables'
        ])
        if not referenced:
            return None

        table_names = [self._bq_table_name(table) for table in referenced]
        table_sources = [self._source(table) for table in referenced]
        try:
            schemas = self._get_table_schemas(table_names, client)
            return [
                Dataset.from_table_schema(
                    source=src,
                    table_schema=schema
                )
                for schema, src in zip(schemas, table_sources)
            ]
        except Exception as e:
            # Best effort: keep lineage without column-level schema.
            log.warning(f'Could not extract schema from bigquery. {e}')
            return [
                Dataset.from_table(src, name)
                for name, src in zip(table_names, table_sources)
            ]
Example #2
0
    def _get_input_from_bq(self, job, context, source, client):
        """Build input datasets for the tables referenced by a BigQuery job.

        :param job: BigQuery job; its ``_properties`` hold query statistics.
        :param context: mutable mapping; on schema-extraction failure the
            error is recorded under 'bigquery.extractor.bq_schema_error'.
        :param source: Source attached to every returned Dataset.
        :param client: BigQuery client used to fetch table schemas.
        :return: list of Dataset objects, or None when the job statistics
            reference no tables.
        """
        # Walk the nested statistics dict once instead of re-traversing
        # statistics -> query -> referencedTables for every check.
        statistics = job._properties.get('statistics') or {}
        query_stats = statistics.get('query') or {}
        bq_input_tables = query_stats.get('referencedTables')
        if not bq_input_tables:
            return None

        input_table_names = [
            self._bq_table_name(bq_t) for bq_t in bq_input_tables
        ]
        try:
            return [
                Dataset.from_table_schema(source=source,
                                          table_schema=table_schema)
                for table_schema in self._get_table_schemas(
                    input_table_names, client)
            ]
        except Exception as e:
            # log.warn is a deprecated alias of log.warning.
            log.warning(f'Could not extract schema from bigquery. {e}')
            context['bigquery.extractor.bq_schema_error'] = \
                f'{e}: {traceback.format_exc()}'
            # Fall back to name-only datasets so lineage is still emitted.
            return [
                Dataset.from_table(source, table)
                for table in input_table_names
            ]
Example #3
0
 def extract_on_complete(self, task_instance) -> StepMetadata:
     """Return StepMetadata with one schema-backed input and one output."""
     input_columns = [
         DbColumn(name='field1',
                  type='text',
                  description='',
                  ordinal_position=1),
         DbColumn(name='field2',
                  type='text',
                  description='',
                  ordinal_position=2),
     ]
     input_schema = DbTableSchema(
         schema_name='schema',
         table_name=DbTableName('extract_on_complete_input1'),
         columns=input_columns
     )
     return StepMetadata(
         name=get_job_name(task=self.operator),
         inputs=[Dataset.from_table_schema(self.source, input_schema)],
         outputs=[
             Dataset.from_table(self.source, "extract_on_complete_output1")
         ],
         context={"extract_on_complete": "extract_on_complete"}
     )
Example #4
0
    def extract(self) -> 'list[StepMetadata]':
        """Extract lineage metadata for a PostgresOperator task.

        Parses the operator's SQL to find input/output tables, resolves
        them against the operator's Postgres connection, and returns a
        single-element list of StepMetadata.

        NOTE: the previous annotation ``[StepMetadata]`` was a list
        literal, not a valid PEP 484 type; a string annotation is used
        here to avoid adding a typing import.
        """
        # (1) Parse sql statement to obtain input / output tables.
        sql_meta: SqlMeta = SqlParser.parse(self.operator.sql)

        # (2) Default all inputs / outputs to current connection.
        # NOTE: We'll want to look into adding support for the `database`
        # property that is used to override the one defined in the connection.
        conn_id = self.operator.postgres_conn_id
        source = Source(type='POSTGRESQL',
                        name=conn_id,
                        connection_url=get_connection_uri(conn_id))

        # (3) Map input / output tables to dataset objects with source set
        # as the current connection. We need to also fetch the schema for the
        # input tables to format the dataset name as:
        # {schema_name}.{table_name}
        inputs = [
            Dataset.from_table(source=source,
                               table_name=in_table_schema.table_name.name,
                               schema_name=in_table_schema.schema_name)
            for in_table_schema in self._get_table_schemas(sql_meta.in_tables)
        ]
        outputs = [
            Dataset.from_table_schema(source=source,
                                      table_schema=out_table_schema)
            for out_table_schema in self._get_table_schemas(sql_meta.out_tables)
        ]

        return [
            StepMetadata(
                name=f"{self.operator.dag_id}.{self.operator.task_id}",
                inputs=inputs,
                outputs=outputs,
                context={'sql': self.operator.sql})
        ]
Example #5
0
 def extract(self) -> 'list[StepMetadata]':
     """Return a single StepMetadata with fixed input/output datasets.

     NOTE: the previous annotation ``[StepMetadata]`` was a list literal,
     not a valid PEP 484 type; a string annotation avoids a typing import.
     """
     inputs = [Dataset.from_table(self.source, "extract_input1")]
     outputs = [Dataset.from_table(self.source, "extract_output1")]
     return [
         StepMetadata(name=get_job_name(task=self.operator),
                      inputs=inputs,
                      outputs=outputs,
                      context={"extract": "extract"})
     ]
Example #6
0
 def _get_output_from_bq(self, job, source, client):
     """Build the output dataset list for a BigQuery job's destination table.

     Reads configuration.query.destinationTable from the job properties
     (assumed present — raises AttributeError otherwise), fetches its
     schema, and returns a one-element list of Dataset: schema-backed when
     the lookup succeeds, name-only otherwise.
     """
     bq_output_table = job._properties.get('configuration') \
         .get('query') \
         .get('destinationTable')
     output_table_name = self._bq_table_name(bq_output_table)
     table_schema = self._get_table_safely(output_table_name, client)
     if table_schema:
         return [
             Dataset.from_table_schema(source=source,
                                       table_schema=table_schema)
         ]
     else:
         # log.warn is a deprecated alias of log.warning.
         log.warning("Could not resolve output table from bq")
         return [Dataset.from_table(source, output_table_name)]
Example #7
0
def test_extract(mock_get_table_schemas):
    """PostgresExtractor.extract() maps SQL tables to input/output datasets."""
    mock_get_table_schemas.side_effect = \
        [[DB_TABLE_SCHEMA], NO_DB_TABLE_SCHEMA]

    expected_inputs = [
        Dataset(type=DatasetType.DB_TABLE,
                name=f"{DB_SCHEMA_NAME}.{DB_TABLE_NAME.name}",
                source=Source(type='POSTGRESQL',
                              name=CONN_ID,
                              connection_url=CONN_URI),
                fields=[])
    ]

    expected_context = {
        'sql': SQL,
    }

    # Set the environment variable for the connection
    os.environ[f"AIRFLOW_CONN_{CONN_ID.upper()}"] = CONN_URI

    # extract() returns a list of StepMetadata; attribute access on the
    # list itself would raise AttributeError, so take the single element.
    step_metadata = PostgresExtractor(TASK).extract()[0]

    assert step_metadata.name == f"{DAG_ID}.{TASK_ID}"
    assert step_metadata.inputs == expected_inputs
    assert step_metadata.outputs == []
    assert step_metadata.context == expected_context
Example #8
0
def test_extract(mock_get_table_schemas):
    """extract() yields the parsed input dataset and no output datasets."""
    mock_get_table_schemas.side_effect = [
        [DB_TABLE_SCHEMA],
        NO_DB_TABLE_SCHEMA,
    ]

    postgres_source = Source(type='POSTGRESQL',
                             name=CONN_ID,
                             connection_url=CONN_URI)
    expected_inputs = [
        Dataset(type=DatasetType.DB_TABLE,
                name=f"{DB_SCHEMA_NAME}.{DB_TABLE_NAME.name}",
                source=postgres_source,
                fields=[])
    ]
    expected_context = {
        'sql': SQL,
    }

    # Set the environment variable for the connection
    os.environ[f"AIRFLOW_CONN_{CONN_ID.upper()}"] = CONN_URI

    # NOTE: When extracting operator metadata, only a single StepMetadata
    # object is returned. We'll want to cleanup the Extractor interface to
    # not return an array.
    step_metadata = PostgresExtractor(TASK).extract()[0]

    assert step_metadata.name == f"{DAG_ID}.{TASK_ID}"
    assert step_metadata.inputs == expected_inputs
    assert step_metadata.outputs == []
    assert step_metadata.context == expected_context
Example #9
0
    def _get_output_from_bq(self, properties, client) -> Optional[Dataset]:
        """Resolve the job's destination table into an output Dataset.

        Returns None when the job configuration names no destination
        table; otherwise a schema-backed Dataset, or a name-only Dataset
        when the schema lookup fails.
        """
        destination = get_from_nullable_chain(properties, [
            'configuration', 'query', 'destinationTable'
        ])
        if not destination:
            return None

        table_name = self._bq_table_name(destination)
        dataset_source = self._source(destination)
        schema = self._get_table_safely(table_name, client)
        if not schema:
            # Best effort: emit lineage without column-level schema.
            log.warning("Could not resolve output table from bq")
            return Dataset.from_table(dataset_source, table_name)
        return Dataset.from_table_schema(
            source=dataset_source,
            table_schema=schema
        )