Example #1
    def setUp(self):
        # type: () -> None
        super(TestApplication, self).setUp()

        self.application = Application(
            task_id='hive.default.test_table',
            dag_id='event_test',
            exec_date='2018-05-31T00:00:00',
            application_url_template=
            'airflow_host.net/admin/airflow/tree?dag_id={dag_id}')

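        # Expected Application node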
        self.expected_node_result = {
            NODE_KEY:
            'application://gold.airflow/event_test/hive.default.test_table',
            NODE_LABEL: 'Application',
            'application_url':
            'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }

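        # Expected Table -> Application relationship (with its reverse type)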
        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY:
            'application://gold.airflow/event_test/hive.default.test_table',
            RELATION_END_LABEL: 'Application',
            RELATION_TYPE: 'DERIVED_FROM',
            RELATION_REVERSE_TYPE: 'GENERATES'
        }
Example #2
class TestApplication(unittest.TestCase):
    def setUp(self) -> None:
        super(TestApplication, self).setUp()

        self.application = Application(
            task_id='hive.default.test_table',
            dag_id='event_test',
            schema='default',
            table_name='test_table',
            application_url_template=
            'airflow_host.net/admin/airflow/tree?dag_id={dag_id}')

        self.expected_node_result = {
            NODE_KEY:
            'application://gold.airflow/event_test/hive.default.test_table',
            NODE_LABEL: 'Application',
            'application_url':
            'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY:
            'application://gold.airflow/event_test/hive.default.test_table',
            RELATION_END_LABEL: 'Application',
            RELATION_TYPE: 'DERIVED_FROM',
            RELATION_REVERSE_TYPE: 'GENERATES'
        }

    def test_create_next_node(self) -> None:
        next_node = self.application.create_next_node()
        self.assertEqual(next_node, self.expected_node_result)

    def test_create_next_relation(self) -> None:
        next_relation = self.application.create_next_relation()
        self.assertEqual(next_relation, self.expected_relation_result)

    def test_get_table_model_key(self) -> None:
        table = self.application.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_application_model_key(self) -> None:
        application = self.application.get_application_model_key()
        self.assertEqual(application, self.expected_node_result[NODE_KEY])

    def test_create_nodes(self) -> None:
        nodes = self.application.create_nodes()
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], self.expected_node_result)

    def test_create_relation(self) -> None:
        relation = self.application.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(relation[0], self.expected_relation_result)
Example #3
class TestApplication(unittest.TestCase):
    def setUp(self) -> None:
        super(TestApplication, self).setUp()

        self.application = Application(
            task_id='hive.default.test_table',
            dag_id='event_test',
            schema='default',
            table_name='test_table',
            application_url_template=
            'airflow_host.net/admin/airflow/tree?dag_id={dag_id}')

        self.expected_node_result = {
            NODE_KEY:
            'application://gold.airflow/event_test/hive.default.test_table',
            NODE_LABEL: 'Application',
            'application_url':
            'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY:
            'application://gold.airflow/event_test/hive.default.test_table',
            RELATION_END_LABEL: 'Application',
            RELATION_TYPE: 'DERIVED_FROM',
            RELATION_REVERSE_TYPE: 'GENERATES'
        }

    def test_create_next_node(self) -> None:
        next_node = self.application.create_next_node()
        serialized_next_node = neo4_serializer.serialize_node(next_node)
        self.assertEqual(serialized_next_node, self.expected_node_result)

    def test_create_next_node_neptune(self) -> None:
        next_node = self.application.create_next_node()
        serialized_next_node = neptune_serializer.convert_node(next_node)
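        # Neptune bulk-loader format: property headers carry a type/cardinality suffix, e.g. 'name:String(single)'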
        neptune_expected = {
            NEPTUNE_HEADER_ID:
            'application://gold.airflow/event_test/hive.default.test_table',
            NEPTUNE_HEADER_LABEL:
            'Application',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB,
            'application_url:String(single)':
            'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id:String(single)':
            'event_test/hive.default.test_table',
            'name:String(single)':
            'Airflow',
            'description:String(single)':
            'Airflow with id event_test/hive.default.test_table',
        }
        self.assertDictEqual(neptune_expected, serialized_next_node)

    def test_create_next_relation(self) -> None:
        next_relation = self.application.create_next_relation()
        serialized_next_relation = neo4_serializer.serialize_relationship(
            next_relation)
        self.assertEqual(serialized_next_relation,
                         self.expected_relation_result)

    def test_create_next_relation_neptune(self) -> None:

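        # Forward edge: Table -(DERIVED_FROM)-> Application, keyed as '<from>_<to>_<label>'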
        neptune_forward_expected = {
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id='hive://gold.default/test_table',
                to_vertex_id=
                'application://gold.airflow/event_test/hive.default.test_table',
                label='DERIVED_FROM'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            'hive://gold.default/test_table',
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            'application://gold.airflow/event_test/hive.default.test_table',
            NEPTUNE_HEADER_LABEL:
            'DERIVED_FROM',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }

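        # Reverse edge: Application -(GENERATES)-> Table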
        neptune_reversed_expected = {
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id=
                'application://gold.airflow/event_test/hive.default.test_table',
                to_vertex_id='hive://gold.default/test_table',
                label='GENERATES'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            'application://gold.airflow/event_test/hive.default.test_table',
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            'hive://gold.default/test_table',
            NEPTUNE_HEADER_LABEL:
            'GENERATES',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }

        next_relation = self.application.create_next_relation()
        serialized_next_relations = neptune_serializer.convert_relationship(
            next_relation)
        self.assertDictEqual(serialized_next_relations[0],
                             neptune_forward_expected)
        self.assertDictEqual(serialized_next_relations[1],
                             neptune_reversed_expected)

    def test_get_table_model_key(self) -> None:
        table = self.application.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_application_model_key(self) -> None:
        application = self.application.get_application_model_key()
        self.assertEqual(application, self.expected_node_result[NODE_KEY])

    def test_create_nodes(self) -> None:
        nodes = self.application.create_nodes()
        self.assertEqual(len(nodes), 1)
        serialized_next_node = neo4_serializer.serialize_node(nodes[0])
        self.assertEqual(serialized_next_node, self.expected_node_result)

    def test_create_relation(self) -> None:
        relation = self.application.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(neo4_serializer.serialize_relationship(relation[0]),
                         self.expected_relation_result)
Example #4
class TestApplication(unittest.TestCase):

    def setUp(self) -> None:
        super(TestApplication, self).setUp()

        self.application = Application(task_id='hive.default.test_table',
                                       dag_id='event_test',
                                       schema='default',
                                       table_name='test_table',
                                       application_url_template='airflow_host.net/admin/airflow/tree?dag_id={dag_id}')

        self.expected_node_results = [{
            NODE_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
            NODE_LABEL: 'Application',
            'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }]

        self.expected_relation_results = [{
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
            RELATION_END_LABEL: 'Application',
            RELATION_TYPE: 'DERIVED_FROM',
            RELATION_REVERSE_TYPE: 'GENERATES'
        }]

    def test_get_table_model_key(self) -> None:
        table = self.application.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_application_model_key(self) -> None:
        application = self.application.get_application_model_key()
        self.assertEqual(application, self.expected_node_results[0][NODE_KEY])

    def test_create_nodes(self) -> None:
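        # Drain create_next_node() until it returns None; the Application model emits a single node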
        actual = []
        node = self.application.create_next_node()
        while node:
            serialized_next_node = neo4_serializer.serialize_node(node)
            actual.append(serialized_next_node)
            node = self.application.create_next_node()

        self.assertEqual(actual, self.expected_node_results)

    def test_create_nodes_neptune(self) -> None:
        actual = []
        next_node = self.application.create_next_node()
        while next_node:
            serialized_next_node = neptune_serializer.convert_node(next_node)
            actual.append(serialized_next_node)
            next_node = self.application.create_next_node()

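        # This serializer version prefixes the Neptune vertex id with the node label and repeats it under METADATA_KEY_PROPERTY_NAME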
        node_id = 'Application:application://gold.airflow/event_test/hive.default.test_table'
        neptune_expected = [{
            NEPTUNE_HEADER_ID: node_id,
            METADATA_KEY_PROPERTY_NAME: node_id,
            NEPTUNE_HEADER_LABEL: 'Application',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'application_url:String(single)': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id:String(single)': 'event_test/hive.default.test_table',
            'name:String(single)': 'Airflow',
            'description:String(single)': 'Airflow with id event_test/hive.default.test_table',
        }]
        self.assertEqual(neptune_expected, actual)

    def test_create_relation(self) -> None:
        actual = []
        relation = self.application.create_next_relation()
        while relation:
            serialized_relation = neo4_serializer.serialize_relationship(relation)
            actual.append(serialized_relation)
            relation = self.application.create_next_relation()

        self.assertEqual(actual, self.expected_relation_results)

    def test_create_relations_neptune(self) -> None:
        application_id = 'Application:application://gold.airflow/event_test/hive.default.test_table'
        table_id = 'Table:hive://gold.default/test_table'
        neptune_forward_expected = {
            NEPTUNE_HEADER_ID: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                from_vertex_id=table_id,
                to_vertex_id=application_id,
                label='DERIVED_FROM'
            ),
            METADATA_KEY_PROPERTY_NAME: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                from_vertex_id=table_id,
                to_vertex_id=application_id,
                label='DERIVED_FROM'
            ),
            NEPTUNE_RELATIONSHIP_HEADER_FROM: table_id,
            NEPTUNE_RELATIONSHIP_HEADER_TO: application_id,
            NEPTUNE_HEADER_LABEL: 'DERIVED_FROM',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }

        neptune_reversed_expected = {
            NEPTUNE_HEADER_ID: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                from_vertex_id=application_id,
                to_vertex_id=table_id,
                label='GENERATES'
            ),
            METADATA_KEY_PROPERTY_NAME: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                from_vertex_id=application_id,
                to_vertex_id=table_id,
                label='GENERATES'
            ),
            NEPTUNE_RELATIONSHIP_HEADER_FROM: application_id,
            NEPTUNE_RELATIONSHIP_HEADER_TO: table_id,
            NEPTUNE_HEADER_LABEL: 'GENERATES',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        neptune_expected = [[neptune_forward_expected, neptune_reversed_expected]]

        actual = []
        next_relation = self.application.create_next_relation()
        while next_relation:
            serialized_next_relation = neptune_serializer.convert_relationship(next_relation)
            actual.append(serialized_next_relation)
            next_relation = self.application.create_next_relation()

        self.assertEqual(actual, neptune_expected)

    def test_create_records(self) -> None:
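        # create_next_record() yields the application record followed by the application-table mapping record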
        expected_application_record = {
            'rk': 'application://gold.airflow/event_test/hive.default.test_table',
            'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }
        expected_application_table_record = {
            'rk': 'hive://gold.default/test_table',
            'application_rk': 'application://gold.airflow/event_test/hive.default.test_table'
        }
        expected = [expected_application_record, expected_application_table_record]

        actual = []
        record = self.application.create_next_record()
        while record:
            serialized_record = mysql_serializer.serialize_record(record)
            actual.append(serialized_record)
            record = self.application.create_next_record()

        self.assertEqual(expected, actual)
Example #5
    def setUp(self) -> None:
        super(TestApplication, self).setUp()

        self.test_cases = []

        # Explicitly add test case for Airflow to verify backwards compatibility
        airflow_application = Application(
            task_id='hive.default.test_table',
            dag_id='event_test',
            schema='default',
            table_name='test_table',
            application_url_template=
            'airflow_host.net/admin/airflow/tree?dag_id={dag_id}',
        )

        airflow_expected_node_results = [{
            NODE_KEY:
            'application://gold.airflow/event_test/hive.default.test_table',
            NODE_LABEL:
            'Application',
            'application_url':
            'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id':
            'event_test/hive.default.test_table',
            'name':
            'Airflow',
            'description':
            'Airflow with id event_test/hive.default.test_table'
        }]

        airflow_expected_relation_results = [{
            RELATION_START_KEY:
            'hive://gold.default/test_table',
            RELATION_START_LABEL:
            TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY:
            'application://gold.airflow/event_test/hive.default.test_table',
            RELATION_END_LABEL:
            'Application',
            RELATION_TYPE:
            'DERIVED_FROM',
            RELATION_REVERSE_TYPE:
            'GENERATES'
        }]

        airflow_expected_application_record = {
            'rk':
            'application://gold.airflow/event_test/hive.default.test_table',
            'application_url':
            'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }

        airflow_expected_application_table_record = {
            'rk':
            'hive://gold.default/test_table',
            'application_rk':
            'application://gold.airflow/event_test/hive.default.test_table'
        }

        airflow_expected_records = [
            airflow_expected_application_record,
            airflow_expected_application_table_record,
        ]

        self.test_cases.append(
            ApplicationTestCase(
                airflow_application,
                airflow_expected_node_results,
                airflow_expected_relation_results,
                airflow_expected_records,
            ), )

        # Test several non-airflow applications
        AppTestCase = namedtuple('AppTestCase', ['name', 'generates_table'])
        non_airflow_cases = [
            AppTestCase(name='Databricks', generates_table=False),
            AppTestCase(name='Snowflake', generates_table=True),
            AppTestCase(name='EMR', generates_table=False),
        ]

        for case in non_airflow_cases:
            application_type = case.name
            url = f'https://{application_type.lower()}.com/job/1234'
            id = f'{application_type}.hive.test_table'
            description = f'{application_type} application for hive.test_table'
            table_key = TableMetadata.TABLE_KEY_FORMAT.format(
                db='hive',
                cluster='gold',
                schema='default',
                tbl='test_table',
            )

            application = GenericApplication(
                start_label=TableMetadata.TABLE_NODE_LABEL,
                start_key=table_key,
                application_type=application_type,
                application_id=id,
                application_url=url,
                application_description=description,
                app_key_override=
                f'application://{application_type}/hive/test_table',
                generates_resource=case.generates_table,
            )

            expected_node_results = [{
                NODE_KEY: f'application://{application_type}/hive/test_table',
                NODE_LABEL: 'Application',
                'application_url': url,
                'id': id,
                'name': application_type,
                'description': description,
            }]

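            # generates_resource toggles the relation pair between DERIVED_FROM/GENERATES and CONSUMED_BY/CONSUMES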
            expected_relation_results = [{
                RELATION_START_KEY:
                'hive://gold.default/test_table',
                RELATION_START_LABEL:
                TableMetadata.TABLE_NODE_LABEL,
                RELATION_END_KEY:
                f'application://{application_type}/hive/test_table',
                RELATION_END_LABEL:
                'Application',
                RELATION_TYPE: (GenericApplication.DERIVED_FROM_REL_TYPE
                                if case.generates_table else
                                GenericApplication.CONSUMED_BY_REL_TYPE),
                RELATION_REVERSE_TYPE:
                (GenericApplication.GENERATES_REL_TYPE if case.generates_table
                 else GenericApplication.CONSUMES_REL_TYPE),
            }]

            expected_application_record = {
                'rk': f'application://{application_type}/hive/test_table',
                'application_url': url,
                'id': id,
                'name': application_type,
                'description': description,
            }

            expected_application_table_record = {
                'rk':
                'hive://gold.default/test_table',
                'application_rk':
                f'application://{application_type}/hive/test_table'
            }

            expected_records = [
                expected_application_record, expected_application_table_record
            ]

            self.test_cases.append(
                ApplicationTestCase(
                    application,
                    expected_node_results,
                    expected_relation_results,
                    expected_records,
                ), )
Example #6
    def _retrieve_tables(self, dataset):
        # type: (Any) -> Any
        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                table_id = tableRef['tableId']

                # BigQuery tables that have 8 digits as last characters are
                # considered date range tables and are grouped together in the UI.
                # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                if self._is_sharded_table(table_id):
                    # If the last eight characters are digits, we assume the table is of a table date range type
                    # and then we only need one schema definition
                    table_prefix = table_id[:-BigQueryApplicationExtractor.DATE_LENGTH]
                    if table_prefix in self.grouped_tables:
                        # If one table in the date range is processed, then ignore other ones
                        # (it adds too much metadata)
                        continue

                    table_id = table_prefix
                    self.grouped_tables.add(table_prefix)

                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
                    tableId=tableRef['tableId']).execute(num_retries=BigQueryApplicationExtractor.NUM_RETRIES)

                # BigQuery tables also have interesting metadata about partitioning
                # data location (EU/US), mod/create time, etc... Extract that some other time?
                #cols = []
                # Not all tables have schemas
                #if 'schema' in table:
                #    schema = table['schema']
                #    if 'fields' in schema:
                #        total_cols = 0
                #        for column in schema['fields']:
                #            total_cols = self._iterate_over_cols('', column, cols, total_cols + 1)

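                # Emit an Application model for this table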
                table_app = Application(
                    task_id='la_task',
                    dag_id='el_id',
                    application_url_template='el_template',
                    db_name='bigquery',
                    cluster=tableRef['projectId'],
                    schema=tableRef['datasetId'],
                    table_name=table_id,
                    exec_date='20200811'
                )

                yield table_app