def test_mysql_serialize(self) -> None:
    col_readers = [ColumnReader(database='db', cluster='gold', schema='scm', table='foo',
                                column='*', user_email='*****@*****.**')]
    table_col_usage = TableColumnUsage(col_readers=col_readers)

    actual = []
    record = table_col_usage.next_record()
    while record:
        actual.append(mysql_serializer.serialize_record(record))
        record = table_col_usage.next_record()

    expected_user = {'rk': '*****@*****.**', 'first_name': '', 'last_name': '', 'full_name': '',
                     'employee_type': '', 'is_active': True, 'updated_at': 0, 'slack_id': '',
                     'github_username': '', 'team_name': '', 'email': '*****@*****.**',
                     'role_name': ''}
    expected_usage = {'table_rk': 'db://gold.scm/foo', 'user_rk': '*****@*****.**',
                      'read_count': 1}
    expected = [expected_user, expected_usage]

    self.assertEqual(expected, actual)
def test_extraction(self) -> None:
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        sql_execute.return_value = [{
            'database': 'gold',
            'schema': 'scm',
            'name': 'foo',
            'user_email': '*****@*****.**',
            'read_count': 1
        }]

        expected = TableColumnUsage(col_readers=[
            ColumnReader(database='snowflake', cluster='gold', schema='scm', table='foo',
                         column='*', user_email='*****@*****.**', read_count=1)
        ])

        extractor = GenericUsageExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_mysql_serialize(self) -> None:
    col_readers = [
        ColumnReader(database='db', cluster='gold', schema='scm', table='foo',
                     column='*', user_email='*****@*****.**')
    ]
    table_col_usage = TableColumnUsage(col_readers=col_readers)

    actual = []
    record = table_col_usage.next_record()
    while record:
        actual.append(mysql_serializer.serialize_record(record))
        record = table_col_usage.next_record()

    expected_user = {'rk': '*****@*****.**', 'email': '*****@*****.**'}
    expected_usage = {
        'table_rk': 'db://gold.scm/foo',
        'user_rk': '*****@*****.**',
        'read_count': 1
    }
    expected = [expected_user, expected_usage]

    self.assertEqual(expected, actual)
def setUp(self) -> None:
    col_readers = [
        ColumnReader(database='db', cluster='gold', schema='scm', table='foo',
                     column='*', user_email='*****@*****.**')
    ]
    self.table_col_usage = TableColumnUsage(col_readers=col_readers)
def transform(self, record):
    # type: (Any) -> Optional[TableColumnUsage]
    SqlToTblColUsageTransformer.total_counts += 1

    stmt = getattr(record, self._sql_stmt_attr)
    email = getattr(record, self._user_email_attr)

    result = []  # type: List[ColumnReader]
    try:
        columns = ColumnUsageProvider.get_columns(query=stmt)
        # LOGGER.info('Statement: {} ---> columns: {}'.format(stmt, columns))
    except Exception:
        SqlToTblColUsageTransformer.failure_counts += 1
        LOGGER.exception('Failed to get column usage from query: {}'.format(stmt))
        return None

    # Dedupe is needed to make it table level. TODO: Remove this once we are at column level
    dedupe_tuples = set()  # type: set
    for col in columns:
        sub_result = self._get_col_readers(table=col.table,
                                           stmt=stmt,
                                           email=email,
                                           dedupe_tuples=dedupe_tuples)
        result.extend(sub_result)

    if not result:
        return None

    return TableColumnUsage(col_readers=result)
def test(self):
    # type: () -> None
    config = ConfigFactory.from_dict({
        SqlToTblColUsageTransformer.DATABASE_NAME: 'database',
        SqlToTblColUsageTransformer.USER_EMAIL_ATTRIBUTE_NAME: 'email',
        SqlToTblColUsageTransformer.SQL_STATEMENT_ATTRIBUTE_NAME: 'statement'
    })

    with patch.object(HiveTableMetadataExtractor, 'extract') as mock_extract,\
            patch.object(HiveTableMetadataExtractor, 'init'):
        mock_extract.side_effect = [
            TableMetadata('hive', 'gold', 'test_schema1', 'test_table1', 'test_table1', [
                ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar', 3),
                ColumnMetadata('etl_created_at', 'description of etl_created_at',
                               'timestamp', 4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ]),
            None
        ]

        transformer = SqlToTblColUsageTransformer()
        transformer.init(config)

        foo = Foo(email='*****@*****.**', statement='SELECT foo, bar FROM test_table1')
        actual = transformer.transform(foo)

        expected = TableColumnUsage(col_readers=[
            ColumnReader(database='database', cluster='gold', schema='test_schema1',
                         table='test_table1', column='*', user_email='*****@*****.**')
        ])
        self.assertEqual(expected.__repr__(), actual.__repr__())
def test_serialize(self):
    # type: () -> None
    col_readers = [
        ColumnReader(database='db', cluster='gold', schema='scm', table='foo',
                     column='*', user_email='*****@*****.**'),
        ColumnReader(database='db', cluster='gold', schema='scm', table='bar',
                     column='*', user_email='*****@*****.**')
    ]
    table_col_usage = TableColumnUsage(col_readers=col_readers)

    node_row = table_col_usage.next_node()
    actual = []
    while node_row:
        actual.append(node_row)
        node_row = table_col_usage.next_node()

    expected = [
        {'email': '*****@*****.**', 'KEY': '*****@*****.**', 'LABEL': 'User'},
        {'email': '*****@*****.**', 'KEY': '*****@*****.**', 'LABEL': 'User'}
    ]
    self.assertEqual(expected, actual)

    rel_row = table_col_usage.next_relation()
    actual = []
    while rel_row:
        actual.append(rel_row)
        rel_row = table_col_usage.next_relation()

    expected = [
        {'read_count:UNQUOTED': 1, 'END_KEY': '*****@*****.**', 'START_LABEL': 'Table',
         'END_LABEL': 'User', 'START_KEY': 'db://gold.scm/foo', 'TYPE': 'READ_BY',
         'REVERSE_TYPE': 'READ'},
        {'read_count:UNQUOTED': 1, 'END_KEY': '*****@*****.**', 'START_LABEL': 'Table',
         'END_LABEL': 'User', 'START_KEY': 'db://gold.scm/bar', 'TYPE': 'READ_BY',
         'REVERSE_TYPE': 'READ'}
    ]
    self.assertEqual(expected, actual)
def extract(self):
    # type: () -> Optional[TableColumnUsage]
    """
    Aggregates read counts per table and user in memory. Table-level
    aggregation is not expected to occupy much memory.
    :return: A TableColumnUsage record, or None if there is nothing to extract
    """
    count_map = {}  # type: Dict[TableColumnUsageTuple, int]
    record = self._extractor.extract()

    count = 0
    while record:
        count += 1
        if count % 1000 == 0:
            LOGGER.info('Aggregated {} records'.format(count))

        tbl_col_usg = self._transformer.transform(record=record)
        record = self._extractor.extract()

        # filtered case
        if not tbl_col_usg:
            continue

        for col_rdr in tbl_col_usg.col_readers:
            key = TableColumnUsageTuple(database=col_rdr.database, cluster=col_rdr.cluster,
                                        schema=col_rdr.schema, table=col_rdr.table,
                                        column=col_rdr.column, email=col_rdr.user_email)
            new_count = count_map.get(key, 0) + col_rdr.read_count
            count_map[key] = new_count

    if not len(count_map):
        return None

    col_readers = []  # type: List[ColumnReader]
    while len(count_map):
        tbl_col_rdr_tuple, count = count_map.popitem()
        col_readers.append(ColumnReader(database=tbl_col_rdr_tuple.database,
                                        cluster=tbl_col_rdr_tuple.cluster,
                                        schema=tbl_col_rdr_tuple.schema,
                                        table=tbl_col_rdr_tuple.table,
                                        column=tbl_col_rdr_tuple.column,
                                        user_email=tbl_col_rdr_tuple.email,
                                        read_count=count))
    return TableColumnUsage(col_readers=col_readers)
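A minimal sketch, not from the source, of the aggregation the extractor performs: the usage key is a hashable named tuple, so duplicate (table, column, user) keys collapse into a single dict entry whose read counts accumulate. The namedtuple below is a local stand-in with the same fields as the real TableColumnUsageTuple, and the email is a made-up value.

from collections import namedtuple

# Stand-in for the real TableColumnUsageTuple; same fields as used above.
TableColumnUsageTuple = namedtuple(
    'TableColumnUsageTuple',
    ['database', 'cluster', 'schema', 'table', 'column', 'email'])

count_map = {}  # Dict[TableColumnUsageTuple, int]
reads = [  # (key, read_count) pairs; the email is hypothetical
    (TableColumnUsageTuple('db', 'gold', 'scm', 'foo', '*', 'user@example.com'), 1),
    (TableColumnUsageTuple('db', 'gold', 'scm', 'foo', '*', 'user@example.com'), 2),
]
for key, read_count in reads:
    count_map[key] = count_map.get(key, 0) + read_count

assert list(count_map.values()) == [3]  # duplicate keys collapsed, counts summed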
def _get_extract_iter(self) -> Iterator[TableColumnUsage]:
    """
    Wraps each row from the raw-level iterator in a table-level
    TableColumnUsage and yields it.
    :return:
    """
    for row in self._get_raw_extract_iter():
        col_readers = []
        # The configured database key becomes the database; the row's
        # 'database' field is used as the cluster.
        col_readers.append(ColumnReader(database=self._database_key,
                                        cluster=row['database'],
                                        schema=row['schema'],
                                        table=row['name'],
                                        column='*',
                                        user_email=row['user_email'],
                                        read_count=row['read_count']))
        yield TableColumnUsage(col_readers=col_readers)
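For reference, a hypothetical raw row and the record the iterator above would yield for it, mirroring the extraction test earlier in this section; the email and the database key are illustrative values only.

row = {'database': 'gold', 'schema': 'scm', 'name': 'foo',
       'user_email': 'user@example.com', 'read_count': 1}
# With self._database_key == 'snowflake', the yielded TableColumnUsage wraps:
#   ColumnReader(database='snowflake', cluster='gold', schema='scm', table='foo',
#                column='*', user_email='user@example.com', read_count=1)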
def test_serialize(self) -> None:
    col_readers = [
        ColumnReader(database='db', cluster='gold', schema='scm', table='foo',
                     column='*', user_email='*****@*****.**')
    ]
    table_col_usage = TableColumnUsage(col_readers=col_readers)

    node_row = table_col_usage.next_node()
    actual = []
    while node_row:
        actual.append(neo4_serializer.serialize_node(node_row))
        node_row = table_col_usage.next_node()

    expected = [{
        'first_name': '',
        'last_name': '',
        'full_name': '',
        'employee_type': '',
        'is_active:UNQUOTED': True,
        'updated_at:UNQUOTED': 0,
        'LABEL': 'User',
        'slack_id': '',
        'KEY': '*****@*****.**',
        'github_username': '',
        'team_name': '',
        'email': '*****@*****.**',
        'role_name': ''
    }]
    self.assertEqual(expected, actual)

    rel_row = table_col_usage.next_relation()
    actual = []
    while rel_row:
        actual.append(neo4_serializer.serialize_relationship(rel_row))
        rel_row = table_col_usage.next_relation()

    expected = [{
        'read_count:UNQUOTED': 1,
        'END_KEY': '*****@*****.**',
        'START_LABEL': 'Table',
        'END_LABEL': 'User',
        'START_KEY': 'db://gold.scm/foo',
        'TYPE': 'READ_BY',
        'REVERSE_TYPE': 'READ'
    }]
    self.assertEqual(expected, actual)
def transform(self, record):
    # type: (Any) -> Optional[TableColumnUsage]
    SqlToTblColUsageTransformer.total_counts += 1

    stmt = getattr(record, self._sql_stmt_attr)
    email = getattr(record, self._user_email_attr)

    result = []  # type: List[ColumnReader]
    try:
        # Run the parse in a worker process so a pathological query cannot
        # hang the transformer indefinitely.
        columns = self._worker_pool.apply_async(ColumnUsageProvider.get_columns,
                                                (stmt, )).get(self._time_out_sec)
        # LOGGER.info('Statement: {} ---> columns: {}'.format(stmt, columns))
    except TimeoutError:
        SqlToTblColUsageTransformer.failure_counts += 1
        LOGGER.exception('Timed out while getting column usage from query: {}'.format(stmt))
        LOGGER.info('Killing the thread.')
        # The timed-out worker keeps running; terminate the pool and build a
        # fresh one so the next statement starts clean.
        self._worker_pool.terminate()
        self._worker_pool = Pool(processes=1)
        LOGGER.info('Killed the thread.')
        return None
    except Exception:
        SqlToTblColUsageTransformer.failure_counts += 1
        if self._log_all_extraction_failures:
            LOGGER.exception('Failed to get column usage from query: {}'.format(stmt))
        return None

    # Dedupe is needed to make it table level. TODO: Remove this once we are at column level
    dedupe_tuples = set()  # type: set
    for col in columns:
        sub_result = self._get_col_readers(table=col.table,
                                           stmt=stmt,
                                           email=email,
                                           dedupe_tuples=dedupe_tuples)
        result.extend(sub_result)

    if not result:
        return None

    return TableColumnUsage(col_readers=result)
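The timeout handling above leans on the standard multiprocessing API: apply_async returns an AsyncResult whose get(timeout) raises multiprocessing.TimeoutError if the worker has not finished, after which the pool must be terminated and rebuilt because the stuck worker keeps running. A minimal standalone sketch of that pattern; slow_parse and the timeout values are hypothetical.

from multiprocessing import Pool, TimeoutError
import time


def slow_parse(stmt):
    # Stand-in for ColumnUsageProvider.get_columns; deliberately exceeds the timeout.
    time.sleep(10)
    return stmt


if __name__ == '__main__':
    pool = Pool(processes=1)
    try:
        pool.apply_async(slow_parse, ('SELECT 1',)).get(2)
    except TimeoutError:
        # get() returned control, but the worker is still busy; terminate the
        # pool and create a fresh one, as the transformer above does.
        pool.terminate()
        pool = Pool(processes=1)
    pool.terminate()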
def transform(self, record: Tuple[TableColumnUsageTuple, int]) -> Optional[TableColumnUsage]:
    if not record:
        return None

    (key, count) = record
    if not isinstance(key, TableColumnUsageTuple):
        raise Exception("BigqueryUsageTransformer expects record of type TableColumnUsageTuple")

    col_readers = [ColumnReader(database=key.database,
                                cluster=key.cluster,
                                schema=key.schema,
                                table=key.table,
                                column=key.column,
                                user_email=key.email,
                                read_count=count)]
    return TableColumnUsage(col_readers=col_readers)
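A hypothetical call to the transformer above: the record is a (TableColumnUsageTuple, count) pair, and every field value is made up. Configuration via init() is omitted here since this transform() does not read instance state.

record = (TableColumnUsageTuple(database='bigquery', cluster='project-id', schema='dataset',
                                table='events', column='*', email='user@example.com'), 7)
usage = BigqueryUsageTransformer().transform(record)
# usage.col_readers[0].read_count == 7
# usage.col_readers[0].user_email == 'user@example.com'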
def test_aggregate(self):
    # type: () -> None
    with patch.object(RegexStrReplaceTransformer, 'init'),\
            patch.object(SqlToTblColUsageTransformer, 'init'),\
            patch.object(RegexStrReplaceTransformer, 'transform'),\
            patch.object(SqlToTblColUsageTransformer, 'transform') as mock_sql_transform:
        raw_extractor = MagicMock()
        mock_raw_extractor = MagicMock()
        raw_extractor.extract = mock_raw_extractor
        raw_extractor.get_scope.return_value = 'foo'

        # Just to iterate 5 times
        mock_raw_extractor.side_effect = ['foo', 'bar', 'foo', 'bar', None]

        conf = ConfigFactory.from_dict({RAW_EXTRACTOR: raw_extractor})
        mock_sql_transform.side_effect = [
            TableColumnUsage(col_readers=[
                ColumnReader(database='database', cluster='gold', schema='test_schema1',
                             table='test_table1', column='*', user_email='*****@*****.**')
            ]),
            TableColumnUsage(col_readers=[
                ColumnReader(database='database', cluster='gold', schema='test_schema1',
                             table='test_table1', column='*', user_email='*****@*****.**',
                             read_count=2)
            ]),
            TableColumnUsage(col_readers=[
                ColumnReader(database='database', cluster='gold', schema='test_schema1',
                             table='test_table2', column='*', user_email='*****@*****.**',
                             read_count=5)
            ]),
            None
        ]

        extractor = TblColUsgAggExtractor()
        extractor.init(conf)
        actual = extractor.extract()

        expected = TableColumnUsage(col_readers=[
            ColumnReader(database='database', cluster='gold', schema='test_schema1',
                         table='test_table1', column='*', user_email='*****@*****.**',
                         read_count=3),
            ColumnReader(database='database', cluster='gold', schema='test_schema1',
                         table='test_table2', column='*', user_email='*****@*****.**',
                         read_count=5)
        ])
        self.assertEqual(expected.__repr__(), actual.__repr__())
class TestTableColumnUsage(unittest.TestCase):

    def setUp(self) -> None:
        col_readers = [
            ColumnReader(database='db', cluster='gold', schema='scm', table='foo',
                         column='*', user_email='*****@*****.**')
        ]
        self.table_col_usage = TableColumnUsage(col_readers=col_readers)

    @no_type_check  # mypy somehow complains about the assignment to the expected dict.
    def test_serialize(self) -> None:
        node_row = self.table_col_usage.next_node()
        actual = []
        while node_row:
            actual.append(neo4_serializer.serialize_node(node_row))
            node_row = self.table_col_usage.next_node()

        expected = [{'first_name': '', 'last_name': '', 'full_name': '', 'employee_type': '',
                     'is_active:UNQUOTED': True, 'updated_at:UNQUOTED': 0, 'LABEL': 'User',
                     'slack_id': '', 'KEY': '*****@*****.**', 'github_username': '',
                     'team_name': '', 'email': '*****@*****.**', 'role_name': ''}]
        self.assertEqual(expected, actual)

        rel_row = self.table_col_usage.next_relation()
        actual = []
        while rel_row:
            actual.append(neo4_serializer.serialize_relationship(rel_row))
            rel_row = self.table_col_usage.next_relation()

        expected = [{'read_count:UNQUOTED': 1, 'END_KEY': '*****@*****.**',
                     'START_LABEL': 'Table', 'END_LABEL': 'User',
                     'START_KEY': 'db://gold.scm/foo', 'TYPE': 'READ_BY',
                     'REVERSE_TYPE': 'READ'}]
        self.assertEqual(expected, actual)

    def test_neptune_serialize(self) -> None:
        rel_row = self.table_col_usage.next_relation()
        actual = []
        while rel_row:
            actual.append(neptune_serializer.convert_relationship(rel_row))
            rel_row = self.table_col_usage.next_relation()

        expected = [[
            {
                NEPTUNE_HEADER_ID: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                    from_vertex_id='Table:db://gold.scm/foo',
                    to_vertex_id='User:[email protected]',
                    label='READ_BY'
                ),
                METADATA_KEY_PROPERTY_NAME: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                    from_vertex_id='Table:db://gold.scm/foo',
                    to_vertex_id='User:[email protected]',
                    label='READ_BY'
                ),
                NEPTUNE_RELATIONSHIP_HEADER_FROM: 'Table:db://gold.scm/foo',
                NEPTUNE_RELATIONSHIP_HEADER_TO: 'User:[email protected]',
                NEPTUNE_HEADER_LABEL: 'READ_BY',
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
                    NEPTUNE_CREATION_TYPE_JOB,
                'read_count:Long(single)': 1
            },
            {
                NEPTUNE_HEADER_ID: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                    from_vertex_id='User:[email protected]',
                    to_vertex_id='Table:db://gold.scm/foo',
                    label='READ'
                ),
                METADATA_KEY_PROPERTY_NAME: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                    from_vertex_id='User:[email protected]',
                    to_vertex_id='Table:db://gold.scm/foo',
                    label='READ'
                ),
                NEPTUNE_RELATIONSHIP_HEADER_FROM: 'User:[email protected]',
                NEPTUNE_RELATIONSHIP_HEADER_TO: 'Table:db://gold.scm/foo',
                NEPTUNE_HEADER_LABEL: 'READ',
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
                    NEPTUNE_CREATION_TYPE_JOB,
                'read_count:Long(single)': 1
            }
        ]]
        self.maxDiff = None
        self.assertListEqual(expected, actual)

    def test_mysql_serialize(self) -> None:
        col_readers = [
            ColumnReader(database='db', cluster='gold', schema='scm', table='foo',
                         column='*', user_email='*****@*****.**')
        ]
        table_col_usage = TableColumnUsage(col_readers=col_readers)

        actual = []
        record = table_col_usage.next_record()
        while record:
            actual.append(mysql_serializer.serialize_record(record))
            record = table_col_usage.next_record()

        expected_user = {'rk': '*****@*****.**', 'first_name': '', 'last_name': '',
                         'full_name': '', 'employee_type': '', 'is_active': True,
                         'updated_at': 0, 'slack_id': '', 'github_username': '',
                         'team_name': '', 'email': '*****@*****.**', 'role_name': ''}
        expected_usage = {'table_rk': 'db://gold.scm/foo', 'user_rk': '*****@*****.**',
                          'read_count': 1}
        expected = [expected_user, expected_usage]

        self.assertEqual(expected, actual)