def setUp(self) -> None:
    """Build the shared table/column/query fixtures used by the join tests."""
    super().setUp()
    # Display full diffs on assertion failures.
    self.maxDiff = None

    self.tbl1_col = ColumnMetadata('field', '', '', 0)
    self.tbl2_col = ColumnMetadata('field', '', '', 0)
    self.left_table_metadata = TableMetadata(
        'hive', 'gold', 'test_schema1', 'test_table1',
        'test_table1 desc', [self.tbl1_col])
    self.right_table_metadata = TableMetadata(
        'hive', 'gold', 'test_schema1', 'test_table2',
        'test_table2 desc', [self.tbl2_col])

    self.query_metadata = QueryMetadata(
        sql="select * from table a where a.field > 3",
        tables=[self.left_table_metadata, self.right_table_metadata])
    self.query_join_metadata = QueryJoinMetadata(
        left_table=self.left_table_metadata,
        right_table=self.right_table_metadata,
        left_column=self.tbl1_col,
        right_column=self.tbl2_col,
        join_type='inner join',
        join_operator='=',
        join_sql='test_table1 = join test_table2 on test_tabl1.field = test_table2.field',
        query_metadata=self.query_metadata)

    # Expected key layout: <join type>-<left column key>-<operator>-<right column key>
    self._expected_key = '-'.join([
        'inner-join',
        'hive://gold.test_schema1/test_table1/field',
        '=',
        'hive://gold.test_schema1/test_table2/field',
    ])
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Group the raw row-level iterator by table (via itertools.groupby) and
    yield one TableMetadata per table.

    NOTE: groupby only groups consecutive rows, so the raw iterator is
    assumed to already be sorted by the table key.

    :return: Iterator of TableMetadata for the 'hive' database.
    """
    # Fix: drop the unused `key` loop variable and the dead `column = None`
    # assignment (it was unconditionally overwritten in both branches).
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            if row['is_partition_col'] == 1:
                # Add a badge to indicate a partition column.
                column = ColumnMetadata(row['col_name'], row['col_description'],
                                        row['col_type'], row['col_sort_order'],
                                        [PARTITION_BADGE])
            else:
                column = ColumnMetadata(row['col_name'], row['col_description'],
                                        row['col_type'], row['col_sort_order'])
            columns.append(column)
        is_view = last_row['is_view'] == 1
        yield TableMetadata('hive', self._cluster,
                            last_row['schema'],
                            last_row['name'],
                            last_row['description'],
                            columns,
                            is_view=is_view)
def _iterate_over_cols(self, parent, column, cols, total_cols):
    # type: (str, str, List[ColumnMetadata], int) -> int
    """
    Recursively flatten a (possibly nested) BigQuery field schema into `cols`.

    Nested RECORD fields are emitted with dotted names ('parent.field').

    Fix: the original type comment declared `List[ColumnMetadata()]`, which
    instantiates the class inside a type expression and is not a valid type.

    :param parent: dotted prefix of the enclosing record ('' at top level)
    :param column: field schema with 'name', 'type' and optional
        'description'/'fields' keys (declared str in the legacy type
        comment — presumably a dict; the typed variant of this method
        treats it as Dict[str, str])
    :param cols: output list that ColumnMetadata entries are appended to
    :param total_cols: running column counter, used as sort order
    :return: updated column counter
    """
    if len(parent) > 0:
        col_name = '{parent}.{field}'.format(parent=parent, field=column['name'])
    else:
        col_name = column['name']

    if column['type'] == 'RECORD':
        col = ColumnMetadata(
            name=col_name,
            description=column.get('description', ''),
            col_type=column['type'],
            sort_order=total_cols)
        cols.append(col)
        total_cols += 1
        # Recurse into each sub-field, threading the counter through.
        for field in column['fields']:
            total_cols = self._iterate_over_cols(col_name, field, cols, total_cols)
        return total_cols
    else:
        col = ColumnMetadata(
            name=col_name,
            description=column.get('description', ''),
            col_type=column['type'],
            sort_order=total_cols)
        cols.append(col)
        return total_cols + 1
def test_tags_arent_populated_from_empty_list_and_str(self):
    # type: () -> None
    """Neither an empty tag list nor an empty tag string may emit Tag nodes."""
    self.table_metadata6 = TableMetadata(
        'hive', 'gold', 'test_schema6', 'test_table6', 'test_table6',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        tags=[])
    self.table_metadata7 = TableMetadata(
        'hive', 'gold', 'test_schema7', 'test_table7', 'test_table7',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        tags="")

    # Drain every node of both tables and check no 'Tag' label appears.
    for table in (self.table_metadata6, self.table_metadata7):
        node_row = table.next_node()
        while node_row:
            self.assertNotEqual(node_row.get('LABEL'), 'Tag')
            node_row = table.next_node()
def _iterate_over_cols(self,
                       parent: str,
                       column: Dict[str, str],
                       cols: List[ColumnMetadata],
                       total_cols: int) -> int:
    """Recursively flatten a BigQuery field schema into `cols` and return
    the updated running column counter (used as sort order)."""
    # Nested fields get a dotted name rooted at their parent record.
    col_name = f'{parent}.{column["name"]}' if parent else column['name']

    cols.append(ColumnMetadata(
        name=col_name,
        description=column.get('description', ''),
        col_type=column['type'],
        sort_order=total_cols))
    total_cols += 1

    if column['type'] != 'RECORD':
        return total_cols

    for field in column['fields']:
        # TODO field is actually a TableFieldSchema, per
        # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema
        # however it's typed as str, which is incorrect. Work-around by casting.
        field_casted = cast(Dict[str, str], field)
        total_cols = self._iterate_over_cols(col_name, field_casted, cols, total_cols)
    return total_cols
def test_extraction_one_object(self, mock_salesforce: Any) -> None:
    """Extracting a single configured object yields exactly one TableMetadata."""
    mock_salesforce.return_value = MockSalesForce()
    object_names_key = f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}"
    conf = ConfigFactory.from_dict({object_names_key: ["Account"], **self.config})
    mock_salesforce.return_value = MockSalesForce()

    extractor = SalesForceExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    result = extractor.extract()
    self.assertIsInstance(result, TableMetadata)

    expected = TableMetadata(
        "salesforce", "gold", "default", "Account", None,
        [
            ColumnMetadata("Id", "The Account Id", "id", 0, []),
            ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
        ],
        False,
        [],
    )
    self.assertEqual(expected.__repr__(), result.__repr__())

    # A second extract() call must signal exhaustion.
    self.assertIsNone(extractor.extract())
def test_create_table_metadata(self) -> None:
    """create_table_metadata maps scraped schema/table/columns onto TableMetadata."""
    scraped = ScrapedTableMetadata(schema="test_schema1", table="test_table1")
    scraped.set_columns([
        ScrapedColumnMetadata(name="a", description=None, data_type="string", sort_order=0),
        ScrapedColumnMetadata(name="b", description=None, data_type="int", sort_order=1),
    ])

    actual = self.dExtractor.create_table_metadata(scraped)
    expected = TableMetadata(
        "test_database", "test_cluster", "test_schema1", "test_table1",
        description=None,
        columns=[
            ColumnMetadata("a", None, "string", 0),
            ColumnMetadata("b", None, "int", 1),
        ])
    self.assertEqual(str(expected), str(actual))
def test_extraction_multiple_objects(self, mock_salesforce: Any) -> None:
    """Each configured object yields its own TableMetadata, in order."""
    mock_salesforce.return_value = MockSalesForce()
    object_names_key = f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}"
    conf = ConfigFactory.from_dict(
        {object_names_key: ["Account", "Profile"], **self.config})
    mock_salesforce.return_value = MockSalesForce()

    extractor = SalesForceExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    results = [extractor.extract(), extractor.extract()]
    for result in results:
        self.assertIsInstance(result, TableMetadata)

    expecteds = [
        TableMetadata(
            "salesforce", "gold", "default", "Account", None,
            [
                ColumnMetadata("Id", "The Account Id", "id", 0, []),
                ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
            ],
            False,
            [],
        ),
        TableMetadata(
            "salesforce", "gold", "default", "Profile", None,
            [
                # These columns are sorted alphabetically
                ColumnMetadata("Business", "Important Bizness", "string", 0, []),
                ColumnMetadata("Id", "The Profile Id", "id", 1, []),
            ],
            False,
            [],
        ),
    ]
    for result, expected in zip(results, expecteds):
        self.assertEqual(expected.__repr__(), result.__repr__())

    # The extractor must be exhausted after both objects are consumed.
    self.assertIsNone(extractor.extract())
def test_hive_parser_with_failures(self) -> None:
    """A parsing failure falls back to a scalar type and bumps failure_count."""
    transformer = ComplexTypeTransformer()
    transformer.init(conf=ConfigFactory.from_dict({
        PARSING_FUNCTION: 'databuilder.utils.hive_complex_type_parser.parse_hive_type',
    }))

    column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
    table_metadata = TableMetadata('hive', 'gold', 'test_schema', 'test_table',
                                   'test_table', [column])
    # On failure the raw type string is kept as a plain scalar type.
    default_scalar_type = ScalarTypeMetadata(name='col1',
                                             parent=column,
                                             type_str='array<array<int>>')

    with patch.object(transformer, '_parsing_function') as mock:
        mock.side_effect = MagicMock(side_effect=Exception('Could not parse'))
        result = transformer.transform(table_metadata)
        self.assertEqual(transformer.success_count, 0)
        self.assertEqual(transformer.failure_count, 1)
        for actual in result.columns:
            self.assertEqual(actual.get_type_metadata(), default_scalar_type)
def test_transform_array_struct_nested_type(self) -> None:
    """array<struct<...>> parses into an ArrayTypeMetadata wrapping a struct."""
    type_str = 'array<struct<nest1:int,nest2:int>>'
    column = ColumnMetadata('col1', None, type_str, 0)
    column.set_column_key(self.column_key)

    # Build the expected metadata tree by hand.
    expected = ArrayTypeMetadata(name='col1', parent=column, type_str=type_str)
    nested_struct = StructTypeMetadata(name='_inner_',
                                       parent=expected,
                                       type_str='struct<nest1:int,nest2:int>')
    nest1 = ScalarTypeMetadata(name='nest1', parent=nested_struct, type_str='int')
    nest2 = ScalarTypeMetadata(name='nest2', parent=nested_struct, type_str='int')
    expected.array_inner_type = nested_struct
    nested_struct.struct_items = {'nest1': nest1, 'nest2': nest2}
    nest1.sort_order = 0
    nest2.sort_order = 1

    actual = parse_hive_type(column.type, column.name, column)
    self.assertEqual(actual, expected)
def test_transform_union_as_nested_type(self) -> None:
    """uniontype fields inside a struct are treated as opaque scalar types."""
    type_str = ('struct<nest1:uniontype<string,struct<c1:int,c2:string>>,'
                'nest2:uniontype<string,int>>')
    column = ColumnMetadata('col1', None, type_str, 0)
    column.set_column_key(self.column_key)

    expected = StructTypeMetadata(name='col1', parent=column, type_str=type_str)
    nest1 = ScalarTypeMetadata(
        name='nest1',
        parent=expected,
        type_str='uniontype<string,struct<c1:int,c2:string>>')
    nest2 = ScalarTypeMetadata(
        name='nest2',
        parent=expected,
        type_str='uniontype<string,int>')
    expected.struct_items = {'nest1': nest1, 'nest2': nest2}
    nest1.sort_order = 0
    nest2.sort_order = 1

    actual = parse_hive_type(column.type, column.name, column)
    self.assertEqual(actual, expected)
def test_transform_struct_map_array_nested_type(self) -> None:
    """struct containing a map-of-arrays and an array field parses correctly."""
    type_str = 'struct<nest1:map<string,array<int>>,nest2:array<string>>'
    column = ColumnMetadata('col1', None, type_str, 0)
    column.set_column_key(self.column_key)

    expected = StructTypeMetadata(name='col1', parent=column, type_str=type_str)
    nest1_map = MapTypeMetadata(name='nest1',
                                parent=expected,
                                type_str='map<string,array<int>>')
    nest1_key = ScalarTypeMetadata(name='_map_key',
                                   parent=nest1_map,
                                   type_str='string')
    nest1_value = ArrayTypeMetadata(name='_map_value',
                                    parent=nest1_map,
                                    type_str='array<int>')
    nest2_array = ArrayTypeMetadata(name='nest2',
                                    parent=expected,
                                    type_str='array<string>')
    expected.struct_items = {'nest1': nest1_map, 'nest2': nest2_array}
    nest1_map.map_key_type = nest1_key
    nest1_map.map_value_type = nest1_value
    nest1_map.sort_order = 0
    nest2_array.sort_order = 1

    actual = parse_hive_type(column.type, column.name, column)
    self.assertEqual(actual, expected)
def test_transform_map_struct_nested_type(self) -> None:
    """map<string, struct<...>> parses into a MapTypeMetadata with a struct value."""
    type_str = 'map<string,struct<nest1:int,nest2:int>>'
    column = ColumnMetadata('col1', None, type_str, 0)
    column.set_column_key(self.column_key)

    expected = MapTypeMetadata(name='col1', parent=column, type_str=type_str)
    key_type = ScalarTypeMetadata(name='_map_key',
                                  parent=expected,
                                  type_str='string')
    value_struct = StructTypeMetadata(name='_map_value',
                                      parent=expected,
                                      type_str='struct<nest1:int,nest2:int>')
    nest1 = ScalarTypeMetadata(name='nest1', parent=value_struct, type_str='int')
    nest2 = ScalarTypeMetadata(name='nest2', parent=value_struct, type_str='int')
    expected.map_key_type = key_type
    expected.map_value_type = value_struct
    value_struct.struct_items = {'nest1': nest1, 'nest2': nest2}
    nest1.sort_order = 0
    nest2.sort_order = 1

    actual = parse_hive_type(column.type, column.name, column)
    self.assertEqual(actual, expected)
def test_transform_map_type(self) -> None:
    """A map-of-maps parses into nested MapTypeMetadata objects."""
    type_str = 'map<string,map<string,int>>'
    column = ColumnMetadata('col1', None, type_str, 0)
    column.set_column_key(self.column_key)

    expected = MapTypeMetadata(name='col1', parent=column, type_str=type_str)
    outer_key = ScalarTypeMetadata(name='_map_key',
                                   parent=expected,
                                   type_str='string')
    inner_map = MapTypeMetadata(name='_map_value',
                                parent=expected,
                                type_str='map<string,int>')
    inner_key = ScalarTypeMetadata(name='_map_key',
                                   parent=inner_map,
                                   type_str='string')
    inner_value = ScalarTypeMetadata(name='_map_value',
                                     parent=inner_map,
                                     type_str='int')
    expected.map_key_type = outer_key
    expected.map_value_type = inner_map
    inner_map.map_key_type = inner_key
    inner_map.map_value_type = inner_value

    actual = parse_hive_type(column.type, column.name, column)
    self.assertEqual(actual, expected)
def test_tags_populated_from_str(self):
    # type: () -> None
    """A comma-separated tag string yields Tag nodes and TAGGED_BY relations."""
    self.table_metadata5 = TableMetadata(
        'hive', 'gold', 'test_schema5', 'test_table5', 'test_table5',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        tags="tag3, tag4")

    # Drain every node row.
    nodes = []
    while True:
        node_row = self.table_metadata5.next_node()
        if not node_row:
            break
        nodes.append(node_row)

    self.assertEqual(nodes[2].get('LABEL'), 'Tag')
    self.assertEqual(nodes[2].get('KEY'), 'tag3')
    self.assertEqual(nodes[3].get('KEY'), 'tag4')

    # Drain every relation row.
    relations = []
    while True:
        relation_row = self.table_metadata5.next_relation()
        if not relation_row:
            break
        relations.append(relation_row)

    # Table -> Tag relationships.
    table_key = 'hive://gold.test_schema5/test_table5'
    expected_tab_tag_rel3 = {'END_KEY': 'tag3', 'START_LABEL': 'Table',
                             'END_LABEL': 'Tag', 'START_KEY': table_key,
                             'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    expected_tab_tag_rel4 = {'END_KEY': 'tag4', 'START_LABEL': 'Table',
                             'END_LABEL': 'Tag', 'START_KEY': table_key,
                             'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    self.assertEqual(relations[2], expected_tab_tag_rel3)
    self.assertEqual(relations[3], expected_tab_tag_rel4)
def test_hive_parser_usage(self) -> None:
    """A successful parse attaches TypeMetadata to every column and bumps success_count."""
    transformer = ComplexTypeTransformer()
    transformer.init(conf=ConfigFactory.from_dict({
        PARSING_FUNCTION: 'databuilder.utils.hive_complex_type_parser.parse_hive_type',
    }))

    column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
    table_metadata = TableMetadata('hive', 'gold', 'test_schema', 'test_table',
                                   'test_table', [column])

    # Expected parse result: array wrapping an inner array.
    expected = ArrayTypeMetadata(name='col1',
                                 parent=column,
                                 type_str='array<array<int>>')
    expected.array_inner_type = ArrayTypeMetadata(name='_inner_',
                                                  parent=expected,
                                                  type_str='array<int>')

    result = transformer.transform(table_metadata)

    for actual in result.columns:
        self.assertTrue(isinstance(actual.get_type_metadata(), TypeMetadata))
        self.assertEqual(actual.get_type_metadata(), expected)
    self.assertEqual(transformer.success_count, 1)
    self.assertEqual(transformer.failure_count, 0)
def test_col_badge_field(self) -> None:
    """Column badges produce Badge nodes and HAS_BADGE relations."""
    self.table_metadata4 = TableMetadata(
        'hive', 'gold', 'test_schema4', 'test_table4', 'test_table4',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0,
                        ['col-badge1', 'col-badge2'])],
        is_view=False, attr1='uri', attr2='attr2')

    # Drain and serialize all node rows.
    nodes = []
    while True:
        node_row = self.table_metadata4.next_node()
        if not node_row:
            break
        nodes.append(neo4_serializer.serialize_node(node_row))

    self.assertEqual(nodes[4].get('KEY'), 'col-badge1')
    self.assertEqual(nodes[5].get('KEY'), 'col-badge2')

    # Drain and serialize all relation rows.
    relations = []
    while True:
        relation_row = self.table_metadata4.next_relation()
        if not relation_row:
            break
        relations.append(neo4_serializer.serialize_relationship(relation_row))

    column_key = 'hive://gold.test_schema4/test_table4/test_id1'
    expected_col_badge_rel1 = {'END_KEY': 'col-badge1', 'START_LABEL': 'Column',
                               'END_LABEL': 'Badge', 'START_KEY': column_key,
                               'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}
    expected_col_badge_rel2 = {'END_KEY': 'col-badge2', 'START_LABEL': 'Column',
                               'END_LABEL': 'Badge', 'START_KEY': column_key,
                               'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}
    self.assertEqual(relations[4], expected_col_badge_rel1)
    self.assertEqual(relations[5], expected_col_badge_rel2)
def test_extraction_with_database_specified(self):
    # type: () -> None
    """The configured database key is propagated into the extracted TableMetadata."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        # One table with a single column, as returned by the metadata query.
        sql_execute.return_value = [{
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'a table for testing',
            'cluster': 'MY_CLUSTER',
            'is_view': 'false',
            'col_name': 'ds',
            'col_type': 'varchar',
            'col_description': None,
            'col_sort_order': 0,
        }]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()

        expected = TableMetadata(
            self.database_key, 'MY_CLUSTER', 'test_schema', 'test_table',
            'a table for testing',
            [ColumnMetadata('ds', None, 'varchar', 0)])
        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Group the raw row-level iterator by table (via itertools.groupby) and
    yield one TableMetadata per table.
    :return:
    """
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        # Materialize the group so the final row is available for
        # table-level fields after building the column list.
        rows = list(group)
        last_row = rows[-1]
        columns = [
            ColumnMetadata(
                row["col_name"],
                row["col_description"],
                row["col_type"],
                row["col_sort_order"],
            )
            for row in rows
        ]
        yield TableMetadata(
            self._database,
            self._cluster,
            last_row["schema"],
            last_row["name"],
            last_row["description"],
            columns,
            is_view=bool(last_row["is_view"]),
        )
def _get_column_metadata(self, view_original_text):
    # type: (str) -> List[ColumnMetadata]
    """
    Decode the column metadata embedded in a Presto view's
    VIEW_ORIGINAL_TEXT (from the TBLS table).

    Columns are returned in the same order as they appear in the Presto
    CREATE VIEW SQL.

    :param view_original_text: raw VIEW_ORIGINAL_TEXT value
    :return: list of ColumnMetadata
    """
    # Strip the Presto view prefix and suffix wrapping the payload.
    without_prefix = view_original_text.split(
        PrestoViewMetadataExtractor.PRESTO_VIEW_PREFIX, 1)[-1]
    encoded_view_info = without_prefix.rsplit(
        PrestoViewMetadataExtractor.PRESTO_VIEW_SUFFIX, 1)[0]

    # The payload is base64-encoded JSON:
    # https://github.com/prestodb/presto/blob/43bd519052ba4c56ff1f4fc807075637ab5f4f10/presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java#L602-L605
    decoded_view_info = base64.b64decode(encoded_view_info)
    columns = json.loads(decoded_view_info).get('columns')

    return [
        ColumnMetadata(name=column['name'],
                       description=None,
                       col_type=column['type'],
                       sort_order=i)
        for i, column in enumerate(columns)
    ]
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """Yield TableMetadata for every non-system Cassandra table that
    passes the optional keyspace/table filter."""
    for keyspace in self._get_keyspaces():
        # Skip Cassandra's internal keyspaces.
        if keyspace.startswith('system'):
            continue
        for table in self._get_tables(keyspace):
            if self._filter and not self._filter(keyspace, table):
                continue
            columns_dict = self._get_columns(keyspace, table)
            columns = [
                ColumnMetadata(column_name, None, column.cql_type, idx)
                for idx, (column_name, column) in enumerate(columns_dict.items())
            ]
            yield TableMetadata(
                'cassandra',
                self._cluster,
                keyspace,
                table,
                None,
                columns,
            )
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Yield TableMetadata for every Glue table in the raw iterator.

    Fixes: replace the `range(len(...))` index loop with `enumerate`, and
    the `'Comment' in column` / `'Description' in row` ternaries with
    `dict.get`, which returns None for missing keys identically.

    :return: Iterator of TableMetadata for the 'glue' database.
    """
    for row in self._get_raw_extract_iter():
        columns = [
            ColumnMetadata(
                column['Name'],
                column.get('Comment'),  # Comment is optional in Glue
                column['Type'],
                i,
            )
            for i, column in enumerate(row['StorageDescriptor']['Columns'])
        ]
        yield TableMetadata(
            'glue',
            self._cluster,
            row['DatabaseName'],
            row['Name'],
            row.get('Description'),  # Description is optional in Glue
            columns,
        )
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Yield TableMetadata for every Glue table, with partition keys appended
    as trailing columns.

    Fixes: replace the manually maintained `i` counter with `enumerate`,
    and the `'Comment' in column` ternary with `dict.get`, which returns
    None for missing keys identically.

    :return: Iterator of TableMetadata for the 'glue' database.
    """
    for row in self._get_raw_extract_iter():
        # Partition keys follow the regular columns in sort order.
        raw_columns = row['StorageDescriptor']['Columns'] + row.get('PartitionKeys', [])
        columns = [
            ColumnMetadata(
                column['Name'],
                column.get('Comment'),  # Comment is optional in Glue
                column['Type'],
                i,
            )
            for i, column in enumerate(raw_columns)
        ]
        yield TableMetadata(
            'glue',
            self._cluster,
            row['DatabaseName'],
            row['Name'],
            # Prefer the table Description, falling back to the
            # 'comment' table parameter.
            row.get('Description') or row.get('Parameters', {}).get('comment'),
            columns,
            row.get('TableType') == 'VIRTUAL_VIEW',
        )
def _get_column_values(self, manifest_columns: Dict, catalog_columns: Dict) -> List[ColumnMetadata]:
    """
    Combine per-column information from the dbt catalog and manifest files
    into Amundsen `ColumnMetadata` objects.

    :params manifest_columns: column name -> column metadata from manifest.json
    :params catalog_columns: column name -> column metadata from catalog.json
    :returns: A list of `ColumnMetadata` in Amundsen.
    """
    tbl_columns = []
    for col_name, catalog_entry in catalog_columns.items():
        # Skip columns with no catalog content.
        if not catalog_entry:
            continue
        manifest_entry = manifest_columns.get(col_name, {})

        col_desc = manifest_entry.get('description') if self._extract_descriptions else None

        # Only extract column-level tags IF converting to badges; Amundsen
        # does not have column-level tags.
        badges = None
        if self._extract_tags and self._dbt_tag_as == DBT_TAG_AS.BADGE:
            badges = manifest_entry.get('tags')

        tbl_columns.append(ColumnMetadata(
            name=self._default_sanitize(catalog_entry['name']),
            description=col_desc,
            col_type=catalog_entry['type'],
            sort_order=catalog_entry['index'],
            badges=badges,
        ))
    return tbl_columns
def test_transform_invalid_struct_inner_type(self) -> None:
    """Malformed struct inner types must raise ParseException."""
    bad_type_str = ('struct<nest1:varchar(256)å,'
                    'nest2:<derived from deserializer>>')
    column = ColumnMetadata('col1', None, bad_type_str, 0)
    column.set_column_key(self.column_key)

    with self.assertRaises(ParseException):
        parse_hive_type(column.type, column.name, column)
def test_extraction_with_partition_badge(self) -> None:
    """Partition-key columns receive the configured badge label."""
    with patch.object(GlueExtractor, '_search_tables') as mock_search:
        mock_search.return_value = [test_table]

        extractor = GlueExtractor()
        extractor.init(conf=ConfigFactory.from_dict({
            GlueExtractor.PARTITION_BADGE_LABEL_KEY: "partition_key",
        }))
        actual = extractor.extract()

        expected_columns = [
            ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
            ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
            ColumnMetadata('is_active', None, 'boolean', 2),
            ColumnMetadata('source', 'description of source', 'varchar', 3),
            ColumnMetadata('etl_created_at', 'description of etl_created_at',
                           'timestamp', 4),
            ColumnMetadata('ds', None, 'varchar', 5),
            # The partition key column carries the badge.
            ColumnMetadata('partition_key1', 'description of partition_key1',
                           'string', 6, ["partition_key"]),
        ]
        expected = TableMetadata('glue', 'gold', 'test_schema', 'test_table',
                                 'a table for testing', expected_columns, False)
        self.assertEqual(expected.__repr__(), actual.__repr__())
def test_tags_field(self) -> None:
    """A tag list yields Tag nodes plus TAGGED_BY relations, and extra
    attributes land on the table node."""
    self.table_metadata4 = TableMetadata(
        'hive', 'gold', 'test_schema4', 'test_table4', 'test_table4',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        is_view=False, tags=['tag1', 'tag2'], attr1='uri', attr2='attr2')

    # Drain and serialize all node rows.
    nodes = []
    while True:
        node_row = self.table_metadata4.next_node()
        if not node_row:
            break
        nodes.append(neo4_serializer.serialize_node(node_row))

    self.assertEqual(nodes[0].get('attr1'), 'uri')
    self.assertEqual(nodes[0].get('attr2'), 'attr2')
    self.assertEqual(nodes[2].get('LABEL'), 'Tag')
    self.assertEqual(nodes[2].get('KEY'), 'tag1')
    self.assertEqual(nodes[3].get('KEY'), 'tag2')

    # Drain and serialize all relation rows.
    relations = []
    while True:
        relation_row = self.table_metadata4.next_relation()
        if not relation_row:
            break
        relations.append(neo4_serializer.serialize_relationship(relation_row))

    # Table tag relationship
    table_key = 'hive://gold.test_schema4/test_table4'
    expected_tab_tag_rel1 = {
        'END_KEY': 'tag1',
        'START_LABEL': 'Table',
        'END_LABEL': 'Tag',
        'START_KEY': table_key,
        'TYPE': 'TAGGED_BY',
        'REVERSE_TYPE': 'TAG',
    }
    expected_tab_tag_rel2 = {
        'END_KEY': 'tag2',
        'START_LABEL': 'Table',
        'END_LABEL': 'Tag',
        'START_KEY': table_key,
        'TYPE': 'TAGGED_BY',
        'REVERSE_TYPE': 'TAG',
    }
    self.assertEqual(relations[2], expected_tab_tag_rel1)
    self.assertEqual(relations[3], expected_tab_tag_rel2)
def test_transform_no_complex_type(self) -> None:
    """A primitive type parses into a plain ScalarTypeMetadata."""
    column = ColumnMetadata('col1', None, 'int', 0)
    column.set_column_key(self.column_key)
    expected = ScalarTypeMetadata(name='col1', parent=column, type_str='int')

    actual = parse_hive_type(column.type, column.name, column)

    self.assertEqual(actual, expected)
def test_extraction_with_default_conf(self, mock_columns, mock_tables, mock_keyspaces):
    # type: () -> None
    """Default configuration extracts one TableMetadata per Cassandra table."""
    mock_keyspaces.return_value = {'test_schema': None}
    mock_tables.return_value = {'test_table': None}
    # OrderedDict preserves column order -> deterministic sort_order values.
    columns_dict = OrderedDict()
    columns_dict['id'] = CassandraColumnMetadata(None, 'id', 'int')
    columns_dict['txt'] = CassandraColumnMetadata(None, 'txt', 'text')
    mock_columns.return_value = columns_dict

    extractor = CassandraExtractor()
    extractor.init(self.default_conf)
    actual = extractor.extract()

    expected = TableMetadata(
        'cassandra', 'gold', 'test_schema', 'test_table', None,
        [
            ColumnMetadata('id', None, 'int', 0),
            ColumnMetadata('txt', None, 'text', 1),
        ])
    self.assertEqual(expected.__repr__(), actual.__repr__())
    self.assertIsNone(extractor.extract())
def test_extraction_with_multiple_views(self) -> None:
    """Multiple Presto views are extracted in row order with decoded columns."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        columns1 = {'columns': [{'name': 'xyz', 'type': 'varchar'},
                                {'name': 'xyy', 'type': 'double'},
                                {'name': 'aaa', 'type': 'int'},
                                {'name': 'ab', 'type': 'varchar'}]}
        columns2 = {'columns': [{'name': 'xyy', 'type': 'varchar'},
                                {'name': 'ab', 'type': 'double'},
                                {'name': 'aaa', 'type': 'int'},
                                {'name': 'xyz', 'type': 'varchar'}]}

        def encode(payload):
            # Presto stores view column info as base64-encoded JSON.
            return base64.b64encode(json.dumps(payload).encode()).decode("utf-8")

        sql_execute.return_value = [
            {'tbl_id': 2, 'schema': 'test_schema2', 'name': 'test_view2',
             'tbl_type': 'virtual_view', 'view_original_text': encode(columns2)},
            {'tbl_id': 1, 'schema': 'test_schema1', 'name': 'test_view1',
             'tbl_type': 'virtual_view', 'view_original_text': encode(columns1)},
        ]

        extractor = PrestoViewMetadataExtractor()
        extractor.init(self.conf)

        actual_first_view = extractor.extract()
        expected_first_view = TableMetadata(
            'presto', 'gold', 'test_schema2', 'test_view2', None,
            [ColumnMetadata(u'xyy', None, u'varchar', 0),
             ColumnMetadata(u'ab', None, u'double', 1),
             ColumnMetadata(u'aaa', None, u'int', 2),
             ColumnMetadata(u'xyz', None, u'varchar', 3)],
            True)
        self.assertEqual(expected_first_view.__repr__(), actual_first_view.__repr__())

        actual_second_view = extractor.extract()
        expected_second_view = TableMetadata(
            'presto', 'gold', 'test_schema1', 'test_view1', None,
            [ColumnMetadata(u'xyz', None, u'varchar', 0),
             ColumnMetadata(u'xyy', None, u'double', 1),
             ColumnMetadata(u'aaa', None, u'int', 2),
             ColumnMetadata(u'ab', None, u'varchar', 3)],
            True)
        self.assertEqual(expected_second_view.__repr__(), actual_second_view.__repr__())

        self.assertIsNone(extractor.extract())