def test_extraction_with_single_result(self) -> None:
    """Extractor turns the mocked SQL rows into one TableMetadata, then returns None."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        # Table-level fields shared by every column row below.
        table = {
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'a table for testing',
            'cluster': self.conf['extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.CLUSTER_KEY)],
        }
        column_rows = [
            {'col_name': 'col_id1', 'col_type': 'bigint', 'col_description': 'description of id1', 'col_sort_order': 0},
            {'col_name': 'col_id2', 'col_type': 'bigint', 'col_description': 'description of id2', 'col_sort_order': 1},
            {'col_name': 'is_active', 'col_type': 'boolean', 'col_description': None, 'col_sort_order': 2},
            {'col_name': 'source', 'col_type': 'varchar', 'col_description': 'description of source', 'col_sort_order': 3},
            {'col_name': 'etl_created_at', 'col_type': 'timestamp', 'col_description': 'description of etl_created_at', 'col_sort_order': 4},
            {'col_name': 'ds', 'col_type': 'varchar', 'col_description': None, 'col_sort_order': 5},
        ]
        sql_execute.return_value = [self._union(row, table) for row in column_rows]

        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            'postgres', 'MY_CLUSTER', 'test_schema', 'test_table', 'a table for testing',
            [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
             ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
             ColumnMetadata('ds', None, 'varchar', 5)])

        self.assertEqual(expected.__repr__(), actual.__repr__())
        # The single result is exhausted after one extract() call.
        self.assertIsNone(extractor.extract())
def test_col_badge_field(self) -> None:
    """Column badges become Badge nodes and HAS_BADGE/BADGE_FOR relations."""
    self.table_metadata4 = TableMetadata(
        'hive', 'gold', 'test_schema4', 'test_table4', 'test_table4',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0,
                        ['col-badge1', 'col-badge2'])],
        is_view=False, attr1='uri', attr2='attr2')

    # Drain the node iterator, serializing each node as we go.
    actual = []
    node_row = self.table_metadata4.next_node()
    while node_row:
        actual.append(neo4_serializer.serialize_node(node_row))
        node_row = self.table_metadata4.next_node()

    self.assertEqual(actual[4].get('KEY'), 'col-badge1')
    self.assertEqual(actual[5].get('KEY'), 'col-badge2')

    # Drain the relation iterator the same way.
    actual = []
    relation_row = self.table_metadata4.next_relation()
    while relation_row:
        actual.append(neo4_serializer.serialize_relationship(relation_row))
        relation_row = self.table_metadata4.next_relation()

    expected_col_badge_rel1 = {'END_KEY': 'col-badge1', 'START_LABEL': 'Column',
                               'END_LABEL': 'Badge',
                               'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
                               'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}
    expected_col_badge_rel2 = {'END_KEY': 'col-badge2', 'START_LABEL': 'Column',
                               'END_LABEL': 'Badge',
                               'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
                               'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}
    self.assertEqual(actual[4], expected_col_badge_rel1)
    self.assertEqual(actual[5], expected_col_badge_rel2)
def test_tags_populated_from_str(self) -> None:
    """A comma-separated tag string is split into individual Tag nodes and relations."""
    self.table_metadata5 = TableMetadata(
        'hive', 'gold', 'test_schema5', 'test_table5', 'test_table5',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        tags="tag3, tag4")

    # Collect all nodes; tags should appear after table and column nodes.
    actual = []
    node_row = self.table_metadata5.next_node()
    while node_row:
        actual.append(node_row)
        node_row = self.table_metadata5.next_node()

    self.assertEqual(actual[2].get('LABEL'), 'Tag')
    self.assertEqual(actual[2].get('KEY'), 'tag3')
    self.assertEqual(actual[3].get('KEY'), 'tag4')

    # Collect all relations.
    actual = []
    relation_row = self.table_metadata5.next_relation()
    while relation_row:
        actual.append(relation_row)
        relation_row = self.table_metadata5.next_relation()

    # Table -> tag relationships.
    expected_tab_tag_rel3 = {'END_KEY': 'tag3', 'START_LABEL': 'Table',
                             'END_LABEL': 'Tag',
                             'START_KEY': 'hive://gold.test_schema5/test_table5',
                             'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    expected_tab_tag_rel4 = {'END_KEY': 'tag4', 'START_LABEL': 'Table',
                             'END_LABEL': 'Tag',
                             'START_KEY': 'hive://gold.test_schema5/test_table5',
                             'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    self.assertEqual(actual[2], expected_tab_tag_rel3)
    self.assertEqual(actual[3], expected_tab_tag_rel4)
def setUp(self) -> None:
    """Build two single-column tables, their query, and the join fixture."""
    super(TestQueryJoin, self).setUp()
    self.maxDiff = None  # show full diffs on assertion failure

    self.tbl1_col = ColumnMetadata('field', '', '', 0)
    self.left_table_metadata = TableMetadata('hive', 'gold', 'test_schema1',
                                             'test_table1', 'test_table1 desc',
                                             [self.tbl1_col])
    self.tbl2_col = ColumnMetadata('field', '', '', 0)
    self.right_table_metadata = TableMetadata('hive', 'gold', 'test_schema1',
                                              'test_table2', 'test_table2 desc',
                                              [self.tbl2_col])
    self.query_metadata = QueryMetadata(
        sql="select * from table a where a.field > 3",
        tables=[self.left_table_metadata, self.right_table_metadata])
    self.query_join_metadata = QueryJoinMetadata(
        left_table=self.left_table_metadata,
        right_table=self.right_table_metadata,
        left_column=self.tbl1_col,
        right_column=self.tbl2_col,
        join_type='inner join',
        join_operator='=',
        join_sql='test_table1 = join test_table2 on test_tabl1.field = test_table2.field',
        query_metadata=self.query_metadata)
    # Key format: <join type>-<left column key>-<operator>-<right column key>
    self._expected_key = ('inner-join-'
                          'hive://gold.test_schema1/test_table1/field-'
                          '=-'
                          'hive://gold.test_schema1/test_table2/field')
def test_z_custom_sources(self) -> None:
    """A custom description_source yields a Programmatic_Description node."""
    self.custom_source = TableMetadata(
        'hive', 'gold', 'test_schema3', 'test_table4', 'test_table4',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
         ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
         ColumnMetadata('is_active', None, 'boolean', 2),
         ColumnMetadata('source', 'description of source', 'varchar', 3),
         ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
         ColumnMetadata('ds', None, 'varchar', 5)],
        is_view=False, description_source="custom")

    actual = []
    node_row = self.custom_source.next_node()
    while node_row:
        actual.append(node_row)
        node_row = self.custom_source.next_node()

    # Second node is the programmatic description keyed by the custom source.
    expected = {
        'LABEL': 'Programmatic_Description',
        'KEY': 'hive://gold.test_schema3/test_table4/_custom_description',
        'description_source': 'custom',
        'description': 'test_table4',
    }
    self.assertEqual(actual[1], expected)
def test_tags_arent_populated_from_empty_list_and_str(self) -> None:
    """Neither an empty tag list nor an empty tag string produces Tag nodes."""
    self.table_metadata6 = TableMetadata(
        'hive', 'gold', 'test_schema6', 'test_table6', 'test_table6',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        tags=[])
    self.table_metadata7 = TableMetadata(
        'hive', 'gold', 'test_schema7', 'test_table7', 'test_table7',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        tags="")

    # Empty list: no emitted node may carry the Tag label.
    node_row = self.table_metadata6.next_node()
    while node_row:
        self.assertNotEqual(node_row.get('LABEL'), 'Tag')
        node_row = self.table_metadata6.next_node()

    # Empty string: same expectation.
    node_row = self.table_metadata7.next_node()
    while node_row:
        self.assertNotEqual(node_row.get('LABEL'), 'Tag')
        node_row = self.table_metadata7.next_node()
def test_table_attributes(self) -> None:
    """Extra keyword attributes on TableMetadata surface on the table node."""
    self.table_metadata3 = TableMetadata(
        'hive', 'gold', 'test_schema3', 'test_table3', 'test_table3',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
         ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
         ColumnMetadata('is_active', None, 'boolean', 2),
         ColumnMetadata('source', 'description of source', 'varchar', 3),
         ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
         ColumnMetadata('ds', None, 'varchar', 5)],
        is_view=False, attr1='uri', attr2='attr2')

    actual = []
    node_row = self.table_metadata3.next_node()
    while node_row:
        actual.append(node_row)
        node_row = self.table_metadata3.next_node()

    # The first node is the table node carrying the pass-through attrs.
    self.assertEqual(actual[0].get('attr1'), 'uri')
    self.assertEqual(actual[0].get('attr2'), 'attr2')
def test_extraction_with_database_specified(self) -> None:
    """The configured database key ends up in the produced TableMetadata."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        sql_execute.return_value = [{
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'a table for testing',
            'cluster': 'MY_CLUSTER',
            'is_view': 'false',
            'col_name': 'ds',
            'col_type': 'varchar',
            'col_description': None,
            'col_sort_order': 0,
        }]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            self.database_key, 'MY_CLUSTER', 'test_schema', 'test_table',
            'a table for testing',
            [ColumnMetadata('ds', None, 'varchar', 0)])

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_extraction_with_partition_badge(self) -> None:
    """Partition key columns carry the configured badge label."""
    with patch.object(GlueExtractor, '_search_tables') as mock_search:
        mock_search.return_value = [test_table]
        extractor = GlueExtractor()
        extractor.init(conf=ConfigFactory.from_dict({
            GlueExtractor.PARTITION_BADGE_LABEL_KEY: "partition_key",
        }))

        actual = extractor.extract()
        expected = TableMetadata(
            'glue', 'gold', 'test_schema', 'test_table', 'a table for testing',
            [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
             ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
             ColumnMetadata('ds', None, 'varchar', 5),
             # The partition column picks up the badge configured above.
             ColumnMetadata('partition_key1', 'description of partition_key1',
                            'string', 6, ["partition_key"])],
            False)
        self.assertEqual(expected.__repr__(), actual.__repr__())
def setUp(self) -> None:
    """Reset TableMetadata's dedup caches and build two identical fixtures."""
    super(TestTableMetadata, self).setUp()
    # Clear class-level serialization state so tests don't leak into each other.
    TableMetadata.serialized_nodes_keys = set()
    TableMetadata.serialized_rels_keys = set()

    def make_table() -> TableMetadata:
        # Fresh ColumnMetadata instances per call: the two fixtures must not
        # share column objects, matching the original duplicated literals.
        return TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1',
            [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
             ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
             ColumnMetadata('ds', None, 'varchar', 5)])

    self.table_metadata = make_table()
    self.table_metadata2 = make_table()
def test_extraction_one_object(self, mock_salesforce: Any) -> None:
    """Extracting a single configured object yields its TableMetadata, then None."""
    mock_salesforce.return_value = MockSalesForce()
    config_dict: Dict = {
        f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}": ["Account"],
        **self.config,
    }
    conf = ConfigFactory.from_dict(config_dict)

    extractor = SalesForceExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
    result = extractor.extract()
    self.assertIsInstance(result, TableMetadata)

    expected = TableMetadata(
        "salesforce", "gold", "default", "Account", None,
        [ColumnMetadata("Id", "The Account Id", "id", 0, []),
         ColumnMetadata("isDeleted", "Deleted?", "bool", 1, [])],
        False,
        [],
    )
    self.assertEqual(expected.__repr__(), result.__repr__())
    self.assertIsNone(extractor.extract())
def test_extraction_with_single_result(self, mock_connect: MagicMock) -> None:
    """Test extraction with a single table result from the query."""
    mock_connection = MagicMock()
    mock_connect.return_value = mock_connection
    mock_cursor = MagicMock()
    mock_connection.cursor.return_value = mock_cursor
    mock_cursor.execute = MagicMock()
    mock_cursor.description = [
        ['col_name'], ['col_description'], ['col_type'], ['col_sort_order'],
        ['database'], ['cluster'], ['schema'], ['name'], ['description'],
        ['is_view']]

    # Table-level fields appended to every column row (list, not tuple, so
    # `+` concatenation type-checks under flake8).
    table: List[Any] = [
        'DREMIO', 'Production', 'test_schema', 'test_table',
        'a table for testing', 'false']
    expected_input: List[List[Any]] = [
        ['col_id1', 'description of id1', 'number', 0] + table,
        ['col_id2', 'description of id2', 'number', 1] + table,
        ['is_active', None, 'boolean', 2] + table,
        ['source', 'description of source', 'varchar', 3] + table,
        ['etl_created_at', 'description of etl_created_at', 'timestamp_ltz', 4] + table,
        ['ds', None, 'varchar', 5] + table,
    ]
    mock_cursor.execute.return_value = expected_input

    extractor = DremioMetadataExtractor()
    extractor.init(self.conf)
    actual = extractor.extract()
    expected = TableMetadata(
        'DREMIO', 'Production', 'test_schema', 'test_table', 'a table for testing',
        [ColumnMetadata('col_id1', 'description of id1', 'number', 0),
         ColumnMetadata('col_id2', 'description of id2', 'number', 1),
         ColumnMetadata('is_active', None, 'boolean', 2),
         ColumnMetadata('source', 'description of source', 'varchar', 3),
         ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp_ltz', 4),
         ColumnMetadata('ds', None, 'varchar', 5)])

    self.assertEqual(expected.__repr__(), actual.__repr__())
    self.assertIsNone(extractor.extract())
def test_tags_field(self) -> None:
    """Tags become Tag nodes and TAGGED_BY relations; extra attrs pass through."""
    self.table_metadata4 = TableMetadata(
        'hive', 'gold', 'test_schema4', 'test_table4', 'test_table4',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        is_view=False, tags=['tag1', 'tag2'], attr1='uri', attr2='attr2')

    actual = []
    node_row = self.table_metadata4.next_node()
    while node_row:
        actual.append(neo4_serializer.serialize_node(node_row))
        node_row = self.table_metadata4.next_node()

    self.assertEqual(actual[0].get('attr1'), 'uri')
    self.assertEqual(actual[0].get('attr2'), 'attr2')
    self.assertEqual(actual[2].get('LABEL'), 'Tag')
    self.assertEqual(actual[2].get('KEY'), 'tag1')
    self.assertEqual(actual[3].get('KEY'), 'tag2')

    actual = []
    relation_row = self.table_metadata4.next_relation()
    while relation_row:
        actual.append(neo4_serializer.serialize_relationship(relation_row))
        relation_row = self.table_metadata4.next_relation()

    # Table -> tag relationships.
    expected_tab_tag_rel1 = {
        'END_KEY': 'tag1', 'START_LABEL': 'Table', 'END_LABEL': 'Tag',
        'START_KEY': 'hive://gold.test_schema4/test_table4',
        'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    expected_tab_tag_rel2 = {
        'END_KEY': 'tag2', 'START_LABEL': 'Table', 'END_LABEL': 'Tag',
        'START_KEY': 'hive://gold.test_schema4/test_table4',
        'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    self.assertEqual(actual[2], expected_tab_tag_rel1)
    self.assertEqual(actual[3], expected_tab_tag_rel2)
def test_extraction_with_single_result(self) -> None:
    """A single Glue table dict is converted to the expected TableMetadata."""
    with patch.object(GlueExtractor, '_search_tables') as mock_search:
        mock_search.return_value = [{
            'Name': 'test_table',
            'DatabaseName': 'test_schema',
            'Description': 'a table for testing',
            'StorageDescriptor': {
                'Columns': [
                    {'Name': 'col_id1', 'Type': 'bigint', 'Comment': 'description of id1'},
                    {'Name': 'col_id2', 'Type': 'bigint', 'Comment': 'description of id2'},
                    # Columns without 'Comment' map to a None description.
                    {'Name': 'is_active', 'Type': 'boolean'},
                    {'Name': 'source', 'Type': 'varchar', 'Comment': 'description of source'},
                    {'Name': 'etl_created_at', 'Type': 'timestamp', 'Comment': 'description of etl_created_at'},
                    {'Name': 'ds', 'Type': 'varchar'},
                ]
            }
        }]

        extractor = GlueExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            'glue', 'gold', 'test_schema', 'test_table', 'a table for testing',
            [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
             ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
             ColumnMetadata('ds', None, 'varchar', 5)])
        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_extraction_multiple_objects(self, mock_salesforce: Any) -> None:
    """Each configured object yields its own TableMetadata, in order, then None."""
    mock_salesforce.return_value = MockSalesForce()
    config_dict: Dict = {
        f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}": [
            "Account",
            "Profile",
        ],
        **self.config,
    }
    conf = ConfigFactory.from_dict(config_dict)

    extractor = SalesForceExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
    results = [extractor.extract(), extractor.extract()]
    for result in results:
        self.assertIsInstance(result, TableMetadata)

    expecteds = [
        TableMetadata(
            "salesforce", "gold", "default", "Account", None,
            [ColumnMetadata("Id", "The Account Id", "id", 0, []),
             ColumnMetadata("isDeleted", "Deleted?", "bool", 1, [])],
            False, [],
        ),
        TableMetadata(
            "salesforce", "gold", "default", "Profile", None,
            # These columns are sorted alphabetically
            [ColumnMetadata("Business", "Important Bizness", "string", 0, []),
             ColumnMetadata("Id", "The Profile Id", "id", 1, [])],
            False, [],
        ),
    ]
    for result, expected in zip(results, expecteds):
        self.assertEqual(expected.__repr__(), result.__repr__())

    self.assertIsNone(extractor.extract())
def _retrieve_tables(self, dataset):
    # type: () -> Any
    """Yield TableMetadata for every table in the given dataset.

    Pages through the dataset's table list, fetches each table's full
    definition, and flattens its schema fields into ColumnMetadata.
    """
    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            table = self.bigquery_service.tables().get(
                projectId=tableRef['projectId'],
                datasetId=tableRef['datasetId'],
                tableId=tableRef['tableId']).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

            # BigQuery tables also have interesting metadata about partitioning
            # data location (EU/US), mod/create time, etc... Extract that some other time?
            cols = []
            # Fix: not all tables carry a 'schema' key, so guard instead of
            # letting table['schema'] raise KeyError and abort extraction.
            schema = table.get('schema', {})
            if 'fields' in schema:
                total_cols = 0
                for column in schema['fields']:
                    # _iterate_over_cols appends to cols and returns the running count
                    total_cols = self._iterate_over_cols('', column, cols, total_cols + 1)

            table_meta = TableMetadata(
                database='bigquery',
                cluster=tableRef['projectId'],
                schema_name=tableRef['datasetId'],
                name=tableRef['tableId'],
                description=table.get('description', ''),
                columns=cols,
                is_view=table['type'] == 'VIEW')

            yield table_meta
def test_hive_parser_with_failures(self) -> None:
    """When the parser raises, the column falls back to a scalar type and the failure is counted."""
    transformer = ComplexTypeTransformer()
    config = ConfigFactory.from_dict({
        PARSING_FUNCTION: 'databuilder.utils.hive_complex_type_parser.parse_hive_type',
    })
    transformer.init(conf=config)

    column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
    table_metadata = TableMetadata('hive', 'gold', 'test_schema', 'test_table',
                                   'test_table', [column])
    # Fallback expected when parsing fails: the raw type string as a scalar.
    default_scalar_type = ScalarTypeMetadata(name='col1', parent=column,
                                             type_str='array<array<int>>')

    with patch.object(transformer, '_parsing_function') as mock:
        mock.side_effect = MagicMock(side_effect=Exception('Could not parse'))
        result = transformer.transform(table_metadata)

        self.assertEqual(transformer.success_count, 0)
        self.assertEqual(transformer.failure_count, 1)
        for actual in result.columns:
            self.assertEqual(actual.get_type_metadata(), default_scalar_type)
def test_hive_parser_usage(self) -> None:
    """The configured parser builds nested array type metadata and counts a success."""
    transformer = ComplexTypeTransformer()
    config = ConfigFactory.from_dict({
        PARSING_FUNCTION: 'databuilder.utils.hive_complex_type_parser.parse_hive_type',
    })
    transformer.init(conf=config)

    column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
    table_metadata = TableMetadata('hive', 'gold', 'test_schema', 'test_table',
                                   'test_table', [column])
    # Expected structure: outer array<array<int>> wrapping an inner array<int>.
    array_type = ArrayTypeMetadata(name='col1', parent=column,
                                   type_str='array<array<int>>')
    inner_array = ArrayTypeMetadata(name='_inner_', parent=array_type,
                                    type_str='array<int>')
    array_type.array_inner_type = inner_array

    result = transformer.transform(table_metadata)

    for actual in result.columns:
        self.assertTrue(isinstance(actual.get_type_metadata(), TypeMetadata))
        self.assertEqual(actual.get_type_metadata(), array_type)
    self.assertEqual(transformer.success_count, 1)
    self.assertEqual(transformer.failure_count, 0)
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Groups the raw column-level rows by table key and yields one
    TableMetadata per table; table-level fields are read from the last
    row of each group (they are identical across the group's rows).
    """
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for last_row in group:
            columns.append(ColumnMetadata(
                last_row["col_name"],
                last_row["col_description"],
                last_row["col_type"],
                last_row["col_sort_order"],
            ))
        # After the loop, last_row still holds the final row of the group.
        yield TableMetadata(
            self._database,
            self._cluster,
            last_row["schema"],
            last_row["name"],
            last_row["description"],
            columns,
            is_view=bool(last_row["is_view"]),
        )
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Walks every non-system keyspace and yields TableMetadata for each
    table that passes the optional (keyspace, table) filter.
    """
    for keyspace in self._get_keyspaces():
        # Skip Cassandra's internal keyspaces.
        if keyspace.startswith('system'):
            continue

        for table in self._get_tables(keyspace):
            if self._filter and not self._filter(keyspace, table):
                continue

            # Column sort order follows the dict's iteration order.
            columns = [
                ColumnMetadata(column_name, None, column.cql_type, idx)
                for idx, (column_name, column)
                in enumerate(self._get_columns(keyspace, table).items())
            ]

            yield TableMetadata(
                'cassandra',
                self._cluster,
                keyspace,
                table,
                None,
                columns
            )
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    It gets all tables and yields TableMetadata
    :return:
    """
    for row in self._get_raw_extract_iter():
        columns = []
        # enumerate provides the sort order directly instead of the
        # range(len(...)) index loop; dict.get replaces the 'in' ternaries.
        for i, column in enumerate(row['StorageDescriptor']['Columns']):
            columns.append(ColumnMetadata(
                column['Name'],
                column.get('Comment'),
                column['Type'],
                i
            ))

        yield TableMetadata(
            'glue',
            self._cluster,
            row['DatabaseName'],
            row['Name'],
            row.get('Description'),
            columns
        )
def setUp(self) -> None:
    """Build a user, a table and a QueryMetadata fixture with its known hash."""
    self.maxDiff = None  # show full diffs on assertion failure
    super(TestQuery, self).setUp()

    self.user = User(
        first_name='test_first', last_name='test_last',
        full_name='test_first test_last', email='*****@*****.**',
        github_username='******', team_name='test_team', employee_type='FTE',
        manager_email='*****@*****.**', slack_id='slack', is_active=True,
        profile_url='https://profile', updated_at=1, role_name='swe')
    self.table_metadata = TableMetadata(
        'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
         ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
         ColumnMetadata('is_active', None, 'boolean', 2),
         ColumnMetadata('source', 'description of source', 'varchar', 3),
         ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
         ColumnMetadata('ds', None, 'varchar', 5)])
    self.sql = "select * from table"
    self.query_metadata = QueryMetadata(sql=self.sql,
                                        tables=[self.table_metadata],
                                        user=self.user)
    # Precomputed hash of the SQL above, used by the key assertions.
    self._query_hash = 'da44ff72560e593a8eca9ffcee6a2696'
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Groups the raw rows by table key and yields one TableMetadata per
    table; partition columns get the partition badge attached.
    """
    for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            if row['is_partition_col'] == 1:
                # Partition columns are marked with a badge.
                column = ColumnMetadata(row['col_name'], row['col_description'],
                                        row['col_type'], row['col_sort_order'],
                                        [PARTITION_BADGE])
            else:
                column = ColumnMetadata(row['col_name'], row['col_description'],
                                        row['col_type'], row['col_sort_order'])
            columns.append(column)

        # Table-level fields are taken from the last row of the group.
        yield TableMetadata('hive', self._cluster,
                            last_row['schema'],
                            last_row['name'],
                            last_row['description'],
                            columns,
                            is_view=(last_row['is_view'] == 1))
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    It gets all tables and yields TableMetadata
    :return:
    """
    for row in self._get_raw_extract_iter():
        # Regular columns first, then partition keys, numbered consecutively;
        # enumerate replaces the manual `i` counter.
        all_columns = row['StorageDescriptor']['Columns'] + row.get('PartitionKeys', [])
        columns = [
            ColumnMetadata(
                column['Name'],
                column.get('Comment'),
                column['Type'],
                i
            )
            for i, column in enumerate(all_columns)
        ]

        yield TableMetadata(
            'glue',
            self._cluster,
            row['DatabaseName'],
            row['Name'],
            # Table comment may live in 'Description' or in Parameters.comment.
            row.get('Description') or row.get('Parameters', {}).get('comment'),
            columns,
            row.get('TableType') == 'VIRTUAL_VIEW',
        )
def test_create_table_metadata(self) -> None:
    """Scraped schema/table/columns map one-to-one onto TableMetadata fields."""
    scraped = ScrapedTableMetadata(schema="test_schema1", table="test_table1")
    scraped.set_columns([
        ScrapedColumnMetadata(name="a", description=None, data_type="string", sort_order=0),
        ScrapedColumnMetadata(name="b", description=None, data_type="int", sort_order=1),
    ])

    created_metadata = self.dExtractor.create_table_metadata(scraped)
    expected = TableMetadata(
        "test_database", "test_cluster", "test_schema1", "test_table1",
        description=None,
        columns=[ColumnMetadata("a", None, "string", 0),
                 ColumnMetadata("b", None, "int", 1)])

    # Compare via str() since TableMetadata equality is repr-based here.
    self.assertEqual(str(expected), str(created_metadata))
def setUp(self) -> None:
    """Reset dedup caches and build two fixtures sharing one nested-type column."""
    super(TestTableMetadata, self).setUp()
    # Clear class-level serialization state so tests don't leak into each other.
    TableMetadata.serialized_nodes_keys = set()
    TableMetadata.serialized_rels_keys = set()

    # One nested-type column object, shared by both fixtures (same object,
    # matching the original's single instance).
    nested_col = ColumnMetadata(
        'has_nested_type', 'column with nested types',
        'array<array<array<string>>>', 6)
    nested_col.set_column_key('hive://gold.test_schema1/test_table1/has_nested_type')
    nested_col.set_type_metadata(self._set_up_type_metadata(nested_col))

    def make_table() -> TableMetadata:
        return TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1',
            [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
             ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
             ColumnMetadata('ds', None, 'varchar', 5),
             nested_col])

    self.table_metadata = make_table()
    self.table_metadata2 = make_table()
def _retrieve_tables(self, dataset: DatasetRef) -> Any:
    """Yield TableMetadata for each table in *dataset*.

    Sharded (date-range) tables — numeric suffix starting with a date
    string, e.g. ``ga_sessions_20190101`` — are collapsed: only one
    TableMetadata is emitted per shard prefix.
    """
    # Shard prefixes already emitted; later shards of the same prefix are skipped.
    grouped_tables: Set[str] = set([])

    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            table_id = tableRef['tableId']

            # BigQuery tables that have numeric suffix starting with a date string will be
            # considered date range tables.
            # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
            if self._is_sharded_table(table_id):
                # Sharded tables have numeric suffix starting with a date string
                # and then we only need one schema definition
                table_prefix = table_id[:-len(self._get_sharded_table_suffix(table_id))]
                if table_prefix in grouped_tables:
                    # If one table in the date range is processed, then ignore other ones
                    # (it adds too much metadata)
                    continue

                table_id = table_prefix
                grouped_tables.add(table_prefix)

            # Fetch the full table definition (the list entry lacks the schema).
            table = self.bigquery_service.tables().get(
                projectId=tableRef['projectId'],
                datasetId=tableRef['datasetId'],
                tableId=tableRef['tableId']).execute(
                num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

            # BigQuery tables also have interesting metadata about partitioning
            # data location (EU/US), mod/create time, etc... Extract that some other time?
            cols: List[ColumnMetadata] = []
            # Not all tables have schemas
            if 'schema' in table:
                schema = table['schema']
                if 'fields' in schema:
                    total_cols = 0
                    for column in schema['fields']:
                        # TRICKY: this mutates :cols:
                        total_cols = self._iterate_over_cols(
                            '', column, cols, total_cols + 1)

            table_meta = TableMetadata(database='bigquery',
                                       cluster=tableRef['projectId'],
                                       schema=tableRef['datasetId'],
                                       name=table_id,
                                       description=table.get(
                                           'description', ''),
                                       columns=cols,
                                       is_view=table['type'] == 'VIEW')

            yield (table_meta)
def _retrieve_tables(self, dataset):
    # type: () -> Any
    """Yield TableMetadata for each table in the dataset.

    Date-range tables (8-digit date suffix, e.g. ``ga_sessions_20190101``)
    are collapsed onto their common prefix and emitted only once.
    """
    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            table_id = tableRef['tableId']

            # BigQuery tables that have 8 digits as last characters are
            # considered date range tables and are grouped together in the UI.
            # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
            last_eight_chars = table_id[-BigQueryMetadataExtractor.DATE_LENGTH:]
            if last_eight_chars.isdigit():
                # If the last eight characters are digits, we assume the table is of a table date range type
                # and then we only need one schema definition
                table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
                if table_prefix in self.grouped_tables:
                    # If one table in the date range is processed, then ignore other ones
                    # (it adds too much metadata)
                    continue

                table_id = table_prefix
                self.grouped_tables.add(table_prefix)

            table = self.bigquery_service.tables().get(
                projectId=tableRef['projectId'],
                datasetId=tableRef['datasetId'],
                tableId=tableRef['tableId']).execute(
                num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

            # BigQuery tables also have interesting metadata about partitioning
            # data location (EU/US), mod/create time, etc... Extract that some other time?
            cols = []
            # Fix: not all tables carry a 'schema' key, so guard instead of
            # letting table['schema'] raise KeyError and abort extraction.
            schema = table.get('schema', {})
            if 'fields' in schema:
                total_cols = 0
                for column in schema['fields']:
                    total_cols = self._iterate_over_cols(
                        '', column, cols, total_cols + 1)

            table_meta = TableMetadata(database='bigquery',
                                       cluster=tableRef['projectId'],
                                       schema_name=tableRef['datasetId'],
                                       name=table_id,
                                       description=table.get(
                                           'description', ''),
                                       columns=cols,
                                       is_view=table['type'] == 'VIEW')

            yield table_meta
def _retrieve_tables(self, dataset: DatasetRef) -> Any:
    """Yield TableMetadata for each table in *dataset*.

    Sharded (date-range) tables are collapsed onto their prefix; the dedup
    key includes the dataset id, so equal prefixes in different datasets
    are still emitted separately.
    """
    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            table_id = tableRef['tableId']

            # BigQuery tables that have 8 digits as last characters are
            # considered date range tables and are grouped together in the UI.
            # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
            if self._is_sharded_table(table_id):
                # If the last eight characters are digits, we assume the table is of a table date range type
                # and then we only need one schema definition
                table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
                table_id = table_prefix

                sharded_table_key = BigQueryMetadataExtractor.SHARDED_TABLE_KEY_FORMAT.format(
                    dataset_id=tableRef['datasetId'], table_id=table_id)
                if sharded_table_key in self.grouped_tables:
                    # If one table in the date range is processed, then ignore other ones
                    # (it adds too much metadata)
                    continue

                self.grouped_tables.add(sharded_table_key)

            # Fetch the full table definition (the list entry lacks the schema).
            table = self.bigquery_service.tables().get(
                projectId=tableRef['projectId'],
                datasetId=tableRef['datasetId'],
                tableId=tableRef['tableId']).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

            # BigQuery tables also have interesting metadata about partitioning
            # data location (EU/US), mod/create time, etc... Extract that some other time?
            cols: List[ColumnMetadata] = []
            # Not all tables have schemas
            if 'schema' in table:
                schema = table['schema']
                if 'fields' in schema:
                    total_cols = 0
                    for column in schema['fields']:
                        # TRICKY: this mutates :cols:
                        total_cols = self._iterate_over_cols('', column, cols, total_cols + 1)

            table_meta = TableMetadata(
                database='bigquery',
                cluster=tableRef['projectId'],
                schema=tableRef['datasetId'],
                name=table_id,
                description=table.get('description', ''),
                columns=cols,
                is_view=table['type'] == 'VIEW')

            yield (table_meta)
def test_extraction_with_default_conf(self, mock_columns, mock_tables, mock_keyspaces) -> None:
    """With the default config the extractor emits one table, then None."""
    mock_keyspaces.return_value = {'test_schema': None}
    mock_tables.return_value = {'test_table': None}
    # OrderedDict keeps column iteration order deterministic for sort-order checks.
    mock_columns.return_value = OrderedDict([
        ('id', CassandraColumnMetadata(None, 'id', 'int')),
        ('txt', CassandraColumnMetadata(None, 'txt', 'text')),
    ])

    extractor = CassandraExtractor()
    extractor.init(self.default_conf)
    actual = extractor.extract()
    expected = TableMetadata(
        'cassandra', 'gold', 'test_schema', 'test_table', None,
        [ColumnMetadata('id', None, 'int', 0),
         ColumnMetadata('txt', None, 'text', 1)])

    self.assertEqual(expected.__repr__(), actual.__repr__())
    self.assertIsNone(extractor.extract())