def setUp(self) -> None:
    """Build an Airflow-style Application and the node/relation dicts we
    expect its Neo4j serialization to produce.

    Note: modernized the PEP-484 type comment (``# type: () -> None``) to a
    real return annotation, matching the rest of the test suite.
    """
    super(TestApplication, self).setUp()
    # NOTE(review): this variant passes exec_date but no schema/table_name;
    # presumably Application derives the table key from task_id — confirm.
    self.application = Application(
        task_id='hive.default.test_table',
        dag_id='event_test',
        exec_date='2018-05-31T00:00:00',
        application_url_template='airflow_host.net/admin/airflow/tree?dag_id={dag_id}')
    # Expected Neo4j node payload for the application vertex.
    self.expected_node_result = {
        NODE_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
        NODE_LABEL: 'Application',
        'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
        'id': 'event_test/hive.default.test_table',
        'name': 'Airflow',
        'description': 'Airflow with id event_test/hive.default.test_table'
    }
    # Expected Neo4j relationship payload: table DERIVED_FROM application,
    # reversed as application GENERATES table.
    self.expected_relation_result = {
        RELATION_START_KEY: 'hive://gold.default/test_table',
        RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
        RELATION_END_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
        RELATION_END_LABEL: 'Application',
        RELATION_TYPE: 'DERIVED_FROM',
        RELATION_REVERSE_TYPE: 'GENERATES'
    }
class TestApplication(unittest.TestCase):
    """Unit tests for the Application model's Neo4j node/relation output.

    Fix: replaced the deprecated ``assertEquals`` alias (removed in
    Python 3.12) with ``assertEqual`` throughout.
    """

    def setUp(self) -> None:
        super(TestApplication, self).setUp()
        self.application = Application(
            task_id='hive.default.test_table',
            dag_id='event_test',
            schema='default',
            table_name='test_table',
            application_url_template='airflow_host.net/admin/airflow/tree?dag_id={dag_id}')
        # Expected node payload for the application vertex.
        self.expected_node_result = {
            NODE_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
            NODE_LABEL: 'Application',
            'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }
        # Expected relationship payload: table DERIVED_FROM application,
        # reverse direction GENERATES.
        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
            RELATION_END_LABEL: 'Application',
            RELATION_TYPE: 'DERIVED_FROM',
            RELATION_REVERSE_TYPE: 'GENERATES'
        }

    def test_create_next_node(self) -> None:
        """First node emitted matches the expected application node."""
        next_node = self.application.create_next_node()
        self.assertEqual(next_node, self.expected_node_result)

    def test_create_next_relation(self) -> None:
        """First relation emitted matches the expected table<->app relation."""
        next_relation = self.application.create_next_relation()
        self.assertEqual(next_relation, self.expected_relation_result)

    def test_get_table_model_key(self) -> None:
        """Table key is derived from schema/table passed in setUp."""
        table = self.application.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_application_model_key(self) -> None:
        """Application key matches the NODE_KEY of the expected node."""
        application = self.application.get_application_model_key()
        self.assertEqual(application, self.expected_node_result[NODE_KEY])

    def test_create_nodes(self) -> None:
        """Exactly one node is produced and it matches the expectation."""
        nodes = self.application.create_nodes()
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], self.expected_node_result)

    def test_create_relation(self) -> None:
        """Exactly one relation is produced and it matches the expectation."""
        relation = self.application.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(relation[0], self.expected_relation_result)
class TestApplication(unittest.TestCase):
    """Unit tests for Application serialization to Neo4j and Neptune formats.

    Fix: replaced the deprecated ``assertEquals`` alias (removed in
    Python 3.12) with ``assertEqual`` throughout.
    """

    def setUp(self) -> None:
        super(TestApplication, self).setUp()
        self.application = Application(
            task_id='hive.default.test_table',
            dag_id='event_test',
            schema='default',
            table_name='test_table',
            application_url_template='airflow_host.net/admin/airflow/tree?dag_id={dag_id}')
        # Expected Neo4j node payload for the application vertex.
        self.expected_node_result = {
            NODE_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
            NODE_LABEL: 'Application',
            'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }
        # Expected Neo4j relationship: table DERIVED_FROM application,
        # reverse direction GENERATES.
        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
            RELATION_END_LABEL: 'Application',
            RELATION_TYPE: 'DERIVED_FROM',
            RELATION_REVERSE_TYPE: 'GENERATES'
        }

    def test_create_next_node(self) -> None:
        """Neo4j-serialized node matches the expected dict."""
        next_node = self.application.create_next_node()
        serialized_next_node = neo4_serializer.serialize_node(next_node)
        self.assertEqual(serialized_next_node, self.expected_node_result)

    def test_create_next_node_neptune(self) -> None:
        """Neptune bulk-loader node row carries typed property columns."""
        next_node = self.application.create_next_node()
        serialized_next_node = neptune_serializer.convert_node(next_node)
        neptune_expected = {
            NEPTUNE_HEADER_ID: 'application://gold.airflow/event_test/hive.default.test_table',
            NEPTUNE_HEADER_LABEL: 'Application',
            # Timestamp is generated at serialization time, so match anything.
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'application_url:String(single)': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id:String(single)': 'event_test/hive.default.test_table',
            'name:String(single)': 'Airflow',
            'description:String(single)': 'Airflow with id event_test/hive.default.test_table',
        }
        self.assertDictEqual(neptune_expected, serialized_next_node)

    def test_create_next_relation(self) -> None:
        """Neo4j-serialized relation matches the expected dict."""
        next_relation = self.application.create_next_relation()
        serialized_next_relation = neo4_serializer.serialize_relationship(next_relation)
        self.assertEqual(serialized_next_relation, self.expected_relation_result)

    def test_create_next_relation_neptune(self) -> None:
        """Neptune conversion yields forward and reversed edge rows."""
        neptune_forward_expected = {
            NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id='hive://gold.default/test_table',
                to_vertex_id='application://gold.airflow/event_test/hive.default.test_table',
                label='DERIVED_FROM'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM: 'hive://gold.default/test_table',
            NEPTUNE_RELATIONSHIP_HEADER_TO: 'application://gold.airflow/event_test/hive.default.test_table',
            NEPTUNE_HEADER_LABEL: 'DERIVED_FROM',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        neptune_reversed_expected = {
            NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id='application://gold.airflow/event_test/hive.default.test_table',
                to_vertex_id='hive://gold.default/test_table',
                label='GENERATES'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM: 'application://gold.airflow/event_test/hive.default.test_table',
            NEPTUNE_RELATIONSHIP_HEADER_TO: 'hive://gold.default/test_table',
            NEPTUNE_HEADER_LABEL: 'GENERATES',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        next_relation = self.application.create_next_relation()
        serialized_next_relations = neptune_serializer.convert_relationship(next_relation)
        self.assertDictEqual(serialized_next_relations[0], neptune_forward_expected)
        self.assertDictEqual(serialized_next_relations[1], neptune_reversed_expected)

    def test_get_table_model_key(self) -> None:
        """Table key is derived from the schema/table passed in setUp."""
        table = self.application.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_application_model_key(self) -> None:
        """Application key matches the NODE_KEY of the expected node."""
        application = self.application.get_application_model_key()
        self.assertEqual(application, self.expected_node_result[NODE_KEY])

    def test_create_nodes(self) -> None:
        """Exactly one node is produced; serialization matches expectation."""
        nodes = self.application.create_nodes()
        self.assertEqual(len(nodes), 1)
        serialized_next_node = neo4_serializer.serialize_node(nodes[0])
        self.assertEqual(serialized_next_node, self.expected_node_result)

    def test_create_relation(self) -> None:
        """Exactly one relation is produced; serialization matches."""
        relation = self.application.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(neo4_serializer.serialize_relationship(relation[0]),
                         self.expected_relation_result)
class TestApplication(unittest.TestCase):
    """Exercises the Application model's iterator-style serialization APIs
    (Neo4j, Neptune bulk-loader, and MySQL record formats)."""

    def setUp(self) -> None:
        super(TestApplication, self).setUp()
        self.application = Application(
            task_id='hive.default.test_table',
            dag_id='event_test',
            schema='default',
            table_name='test_table',
            application_url_template='airflow_host.net/admin/airflow/tree?dag_id={dag_id}')
        # Full sequence of Neo4j node dicts the model should emit.
        self.expected_node_results = [{
            NODE_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
            NODE_LABEL: 'Application',
            'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }]
        # Full sequence of Neo4j relation dicts the model should emit.
        self.expected_relation_results = [{
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
            RELATION_END_LABEL: 'Application',
            RELATION_TYPE: 'DERIVED_FROM',
            RELATION_REVERSE_TYPE: 'GENERATES'
        }]

    def _collect(self, produce, serialize):
        """Drain a ``create_next_*`` producer, serializing every item."""
        collected = []
        item = produce()
        while item:
            collected.append(serialize(item))
            item = produce()
        return collected

    def test_get_table_model_key(self) -> None:
        """Table key reflects the schema/table given to the constructor."""
        self.assertEqual(self.application.get_table_model_key(),
                         'hive://gold.default/test_table')

    def test_get_application_model_key(self) -> None:
        """Application key equals the NODE_KEY of the first expected node."""
        self.assertEqual(self.application.get_application_model_key(),
                         self.expected_node_results[0][NODE_KEY])

    def test_create_nodes(self) -> None:
        """Draining create_next_node yields exactly the expected Neo4j nodes."""
        serialized = self._collect(self.application.create_next_node,
                                   neo4_serializer.serialize_node)
        self.assertEqual(serialized, self.expected_node_results)

    def test_create_nodes_neptune(self) -> None:
        """Draining create_next_node yields the expected Neptune node rows."""
        serialized = self._collect(self.application.create_next_node,
                                   neptune_serializer.convert_node)
        node_id = 'Application:application://gold.airflow/event_test/hive.default.test_table'
        neptune_expected = [{
            NEPTUNE_HEADER_ID: node_id,
            METADATA_KEY_PROPERTY_NAME: node_id,
            NEPTUNE_HEADER_LABEL: 'Application',
            # Extraction timestamp is generated on the fly; match anything.
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'application_url:String(single)': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id:String(single)': 'event_test/hive.default.test_table',
            'name:String(single)': 'Airflow',
            'description:String(single)': 'Airflow with id event_test/hive.default.test_table',
        }]
        self.assertEqual(neptune_expected, serialized)

    def test_create_relation(self) -> None:
        """Draining create_next_relation yields the expected Neo4j relations."""
        serialized = self._collect(self.application.create_next_relation,
                                   neo4_serializer.serialize_relationship)
        self.assertEqual(serialized, self.expected_relation_results)

    def test_create_relations_neptune(self) -> None:
        """Each relation converts to a forward + reversed Neptune edge pair."""
        application_id = 'Application:application://gold.airflow/event_test/hive.default.test_table'
        table_id = 'Table:hive://gold.default/test_table'
        forward_edge_id = f'DERIVED_FROM:{table_id}_{application_id}'
        reversed_edge_id = f'GENERATES:{application_id}_{table_id}'
        neptune_forward_expected = {
            NEPTUNE_HEADER_ID: forward_edge_id,
            METADATA_KEY_PROPERTY_NAME: forward_edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: table_id,
            NEPTUNE_RELATIONSHIP_HEADER_TO: application_id,
            NEPTUNE_HEADER_LABEL: 'DERIVED_FROM',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        neptune_reversed_expected = {
            NEPTUNE_HEADER_ID: reversed_edge_id,
            METADATA_KEY_PROPERTY_NAME: reversed_edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: application_id,
            NEPTUNE_RELATIONSHIP_HEADER_TO: table_id,
            NEPTUNE_HEADER_LABEL: 'GENERATES',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        serialized = self._collect(self.application.create_next_relation,
                                   neptune_serializer.convert_relationship)
        self.assertEqual(serialized,
                         [[neptune_forward_expected, neptune_reversed_expected]])

    def test_create_records(self) -> None:
        """Draining create_next_record yields the app and link table rows."""
        expected_application_record = {
            'rk': 'application://gold.airflow/event_test/hive.default.test_table',
            'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
            'id': 'event_test/hive.default.test_table',
            'name': 'Airflow',
            'description': 'Airflow with id event_test/hive.default.test_table'
        }
        expected_application_table_record = {
            'rk': 'hive://gold.default/test_table',
            'application_rk': 'application://gold.airflow/event_test/hive.default.test_table'
        }
        serialized = self._collect(self.application.create_next_record,
                                   mysql_serializer.serialize_record)
        self.assertEqual([expected_application_record,
                          expected_application_table_record], serialized)
def setUp(self) -> None:
    """Build ApplicationTestCase fixtures: one legacy Airflow Application
    (backwards compatibility) plus several GenericApplication variants.

    Fix: renamed the local ``id`` variable to ``app_id`` so it no longer
    shadows the ``id`` builtin; hoisted the repeated application-key string
    into ``app_key``.
    """
    super(TestApplication, self).setUp()
    self.test_cases = []

    # Explicitly add test case for Airflow to verify backwards compatibility
    airflow_application = Application(
        task_id='hive.default.test_table',
        dag_id='event_test',
        schema='default',
        table_name='test_table',
        application_url_template='airflow_host.net/admin/airflow/tree?dag_id={dag_id}',
    )
    airflow_expected_node_results = [{
        NODE_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
        NODE_LABEL: 'Application',
        'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
        'id': 'event_test/hive.default.test_table',
        'name': 'Airflow',
        'description': 'Airflow with id event_test/hive.default.test_table'
    }]
    airflow_expected_relation_results = [{
        RELATION_START_KEY: 'hive://gold.default/test_table',
        RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
        RELATION_END_KEY: 'application://gold.airflow/event_test/hive.default.test_table',
        RELATION_END_LABEL: 'Application',
        RELATION_TYPE: 'DERIVED_FROM',
        RELATION_REVERSE_TYPE: 'GENERATES'
    }]
    airflow_expected_application_record = {
        'rk': 'application://gold.airflow/event_test/hive.default.test_table',
        'application_url': 'airflow_host.net/admin/airflow/tree?dag_id=event_test',
        'id': 'event_test/hive.default.test_table',
        'name': 'Airflow',
        'description': 'Airflow with id event_test/hive.default.test_table'
    }
    airflow_expected_application_table_record = {
        'rk': 'hive://gold.default/test_table',
        'application_rk': 'application://gold.airflow/event_test/hive.default.test_table'
    }
    airflow_expected_records = [
        airflow_expected_application_record,
        airflow_expected_application_table_record,
    ]
    self.test_cases.append(
        ApplicationTestCase(
            airflow_application,
            airflow_expected_node_results,
            airflow_expected_relation_results,
            airflow_expected_records,
        ),
    )

    # Test several non-airflow applications
    AppTestCase = namedtuple('AppTestCase', ['name', 'generates_table'])
    non_airflow_cases = [
        AppTestCase(name='Databricks', generates_table=False),
        AppTestCase(name='Snowflake', generates_table=True),
        AppTestCase(name='EMR', generates_table=False),
    ]
    for case in non_airflow_cases:
        application_type = case.name
        url = f'https://{application_type.lower()}.com/job/1234'
        app_id = f'{application_type}.hive.test_table'
        description = f'{application_type} application for hive.test_table'
        app_key = f'application://{application_type}/hive/test_table'
        table_key = TableMetadata.TABLE_KEY_FORMAT.format(
            db='hive',
            cluster='gold',
            schema='default',
            tbl='test_table',
        )
        application = GenericApplication(
            start_label=TableMetadata.TABLE_NODE_LABEL,
            start_key=table_key,
            application_type=application_type,
            application_id=app_id,
            application_url=url,
            application_description=description,
            app_key_override=app_key,
            generates_resource=case.generates_table,
        )
        expected_node_results = [{
            NODE_KEY: app_key,
            NODE_LABEL: 'Application',
            'application_url': url,
            'id': app_id,
            'name': application_type,
            'description': description,
        }]
        # Relation direction depends on whether the app produces or
        # consumes the table.
        expected_relation_results = [{
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
            RELATION_END_KEY: app_key,
            RELATION_END_LABEL: 'Application',
            RELATION_TYPE: (GenericApplication.DERIVED_FROM_REL_TYPE
                            if case.generates_table
                            else GenericApplication.CONSUMED_BY_REL_TYPE),
            RELATION_REVERSE_TYPE: (GenericApplication.GENERATES_REL_TYPE
                                    if case.generates_table
                                    else GenericApplication.CONSUMES_REL_TYPE),
        }]
        expected_application_record = {
            'rk': app_key,
            'application_url': url,
            'id': app_id,
            'name': application_type,
            'description': description,
        }
        expected_application_table_record = {
            'rk': 'hive://gold.default/test_table',
            'application_rk': app_key
        }
        expected_records = [
            expected_application_record,
            expected_application_table_record
        ]
        self.test_cases.append(
            ApplicationTestCase(
                application,
                expected_node_results,
                expected_relation_results,
                expected_records,
            ),
        )
def _retrieve_tables(self, dataset):
    # type: (Any) -> Any
    """Yield one Application per table in *dataset*.

    BigQuery tables whose names end in 8 digits (e.g. ga_sessions_20190101)
    are date-range shards grouped together in the UI; such shards are
    collapsed onto their common prefix and emitted only once.

    Fixes: corrected the type comment (the original ``# type: () -> Any``
    ignored the ``dataset`` parameter), removed bogus inline ``# type: str``
    comments on keyword arguments, removed commented-out dead code, and
    replaced ``yield(table_app)`` with a plain ``yield``.
    """
    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue
        for table in page['tables']:
            table_ref = table['tableReference']
            table_id = table_ref['tableId']

            if self._is_sharded_table(table_id):
                # Only one schema definition is needed per date range, so
                # skip shards whose prefix has already been processed
                # (it adds too much metadata otherwise).
                table_prefix = table_id[:-BigQueryApplicationExtractor.DATE_LENGTH]
                if table_prefix in self.grouped_tables:
                    continue
                table_id = table_prefix
                self.grouped_tables.add(table_prefix)

            # Fetch the full table resource (partitioning, location,
            # create/mod times, schema, ...).
            # NOTE(review): the response is not used below -- presumably a
            # leftover from schema extraction; confirm before removing, as
            # dropping it would also drop the API call.
            table = self.bigquery_service.tables().get(
                projectId=table_ref['projectId'],
                datasetId=table_ref['datasetId'],
                tableId=table_ref['tableId']).execute(
                    num_retries=BigQueryApplicationExtractor.NUM_RETRIES)

            yield Application(
                task_id='la_task',
                dag_id='el_id',
                application_url_template='el_template',
                db_name='bigquery',
                cluster=table_ref['projectId'],
                schema=table_ref['datasetId'],
                table_name=table_id,
                exec_date='20200811',
            )