def test_sql_statement(self) -> None:
     """
     Test Extraction with empty result from query
     """
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         extractor = HiveTableMetadataExtractor()
         extractor.init(self.conf)
         self.assertTrue(self.where_clause_suffix in extractor.sql_stmt)
    def test_extraction_with_empty_query_result(self) -> None:
        """
        Test Extraction with empty result from query
        """
        with patch.object(SQLAlchemyExtractor, '_get_connection'):
            extractor = HiveTableMetadataExtractor()
            extractor.init(self.conf)

            results = extractor.extract()
            self.assertEqual(results, None)
Пример #3
0
 def test_sql_statement(self) -> None:
     """
     Test Extraction with empty result from query
     """
     with patch.object(SQLAlchemyExtractor, '_get_connection'), \
         patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm',
                      return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT):
         extractor = HiveTableMetadataExtractor()
         extractor.init(self.conf)
         self.assertTrue(self.where_clause_suffix in extractor.sql_stmt)
Пример #4
0
    def test_extraction_with_empty_query_result(self) -> None:
        """
        Test Extraction with empty result from query
        """
        with patch.object(SQLAlchemyExtractor, '_get_connection'), \
            patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm',
                         return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT):
            extractor = HiveTableMetadataExtractor()
            extractor.init(self.conf)

            results = extractor.extract()
            self.assertEqual(results, None)
def run_hive_metastore_job():
    where_clause_suffix = textwrap.dedent("""
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.hive_table_metadata.{HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
        where_clause_suffix,
        f'extractor.hive_table_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
        connection_string(),
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=HiveTableMetadataExtractor(),
                                      loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    return job
 def test_hive_sql_statement_with_custom_sql(self) -> None:
     """
     Test Extraction by providing a custom sql
     :return:
     """
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         config_dict = {
             HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY:
             self.where_clause_suffix,
             'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
             'TEST_CONNECTION',
             HiveTableMetadataExtractor.EXTRACT_SQL:
             'select sth for test {where_clause_suffix}'
         }
         conf = ConfigFactory.from_dict(config_dict)
         extractor = HiveTableMetadataExtractor()
         extractor.init(conf)
         self.assertTrue('select sth for test' in extractor.sql_stmt)
Пример #7
0
def create_table_metadata_databuilder_job():
    """
    Launches data builder job that extracts table and column metadata from MySQL Hive metastore database,
    and publishes to Neo4j.
    @param kwargs:
    @return:
    """

    # Adding to where clause to scope schema, filter out temp tables which start with numbers and views
    where_clause_suffix = textwrap.dedent("""
        WHERE d.NAME IN {schemas}
        AND t.TBL_NAME NOT REGEXP '^[0-9]+'
        AND t.TBL_TYPE IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' )
    """).format(schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.hive_table_metadata.{}'.format(HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY):
        where_clause_suffix,
        'extractor.hive_table_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
        connection_string(),
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_CREATE_ONLY_NODES):
        [DESCRIPTION_NODE_LABEL],
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_tag',  # TO-DO unique tag must be added
    })

    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=HiveTableMetadataExtractor(),
                                      loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
    def test_extraction_with_single_result(self):
        # type: () -> None
        with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            table = {'schema': 'test_schema',
                     'name': 'test_table',
                     'description': 'a table for testing'}

            sql_execute.return_value = [
                self._union(
                    {'col_name': 'col_id1',
                     'col_type': 'bigint',
                     'col_description': 'description of id1',
                     'col_sort_order': 0}, table),
                self._union(
                    {'col_name': 'col_id2',
                     'col_type': 'bigint',
                     'col_description': 'description of id2',
                     'col_sort_order': 1}, table),
                self._union(
                    {'col_name': 'is_active',
                     'col_type': 'boolean',
                     'col_description': None,
                     'col_sort_order': 2}, table),
                self._union(
                    {'col_name': 'source',
                     'col_type': 'varchar',
                     'col_description': 'description of source',
                     'col_sort_order': 3}, table),
                self._union(
                    {'col_name': 'etl_created_at',
                     'col_type': 'timestamp',
                     'col_description': 'description of etl_created_at',
                     'col_sort_order': 4}, table),
                self._union(
                    {'col_name': 'ds',
                     'col_type': 'varchar',
                     'col_description': None,
                     'col_sort_order': 5}, table)
            ]

            extractor = HiveTableMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata('hive', 'gold', 'test_schema', 'test_table', 'a table for testing',
                                     [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                                      ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                                      ColumnMetadata('is_active', None, 'boolean', 2),
                                      ColumnMetadata('source', 'description of source', 'varchar', 3),
                                      ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
                                      ColumnMetadata('ds', None, 'varchar', 5)])
            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
Пример #9
0
    def _create_schema_by_table_mapping(self):
        # type: () -> dict
        # TODO: Make extractor generic
        table_metadata_extractor = HiveTableMetadataExtractor()
        table_metadata_extractor.init(
            Scoped.get_scoped_conf(self._conf,
                                   table_metadata_extractor.get_scope()))

        table_to_schema = {}
        table_metadata = table_metadata_extractor.extract()
        while table_metadata:
            # TODO: deal with collision
            table_to_schema[table_metadata.name.lower(
            )] = table_metadata.schema_name.lower()
            table_metadata = table_metadata_extractor.extract()
        return table_to_schema
    def test_extraction_with_multiple_result(self) -> None:
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            table = {
                'schema': 'test_schema1',
                'name': 'test_table1',
                'description': 'test table 1',
                'is_view': 0
            }

            table1 = {
                'schema': 'test_schema1',
                'name': 'test_table2',
                'description': 'test table 2',
                'is_view': 0
            }

            table2 = {
                'schema': 'test_schema2',
                'name': 'test_table3',
                'description': 'test table 3',
                'is_view': 0
            }

            sql_execute.return_value = [
                self._union(
                    {
                        'col_name': 'col_id1',
                        'col_type': 'bigint',
                        'col_description': 'description of col_id1',
                        'col_sort_order': 0,
                        'is_partition_col': 1
                    }, table),
                self._union(
                    {
                        'col_name': 'col_id2',
                        'col_type': 'bigint',
                        'col_description': 'description of col_id2',
                        'col_sort_order': 1,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'is_active',
                        'col_type': 'boolean',
                        'col_description': None,
                        'col_sort_order': 2,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'source',
                        'col_type': 'varchar',
                        'col_description': 'description of source',
                        'col_sort_order': 3,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'etl_created_at',
                        'col_type': 'timestamp',
                        'col_description': 'description of etl_created_at',
                        'col_sort_order': 4,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'ds',
                        'col_type': 'varchar',
                        'col_description': None,
                        'col_sort_order': 5,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'col_name',
                        'col_type': 'varchar',
                        'col_description': 'description of col_name',
                        'col_sort_order': 0,
                        'is_partition_col': 0
                    }, table1),
                self._union(
                    {
                        'col_name': 'col_name2',
                        'col_type': 'varchar',
                        'col_description': 'description of col_name2',
                        'col_sort_order': 1,
                        'is_partition_col': 0
                    }, table1),
                self._union(
                    {
                        'col_name': 'col_id3',
                        'col_type': 'varchar',
                        'col_description': 'description of col_id3',
                        'col_sort_order': 0,
                        'is_partition_col': 0
                    }, table2),
                self._union(
                    {
                        'col_name': 'col_name3',
                        'col_type': 'varchar',
                        'col_description': 'description of col_name3',
                        'col_sort_order': 1,
                        'is_partition_col': 0
                    }, table2)
            ]

            extractor = HiveTableMetadataExtractor()
            extractor.init(self.conf)

            expected = TableMetadata(
                'hive',
                'gold',
                'test_schema1',
                'test_table1',
                'test table 1', [
                    ColumnMetadata('col_id1', 'description of col_id1',
                                   'bigint', 0, ['partition column']),
                    ColumnMetadata('col_id2', 'description of col_id2',
                                   'bigint', 1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5)
                ],
                is_view=False)
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            expected = TableMetadata(
                'hive',
                'gold',
                'test_schema1',
                'test_table2',
                'test table 2', [
                    ColumnMetadata('col_name', 'description of col_name',
                                   'varchar', 0),
                    ColumnMetadata('col_name2', 'description of col_name2',
                                   'varchar', 1)
                ],
                is_view=False)
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            expected = TableMetadata(
                'hive',
                'gold',
                'test_schema2',
                'test_table3',
                'test table 3', [
                    ColumnMetadata('col_id3', 'description of col_id3',
                                   'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3',
                                   'varchar', 1)
                ],
                is_view=False)
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            self.assertIsNone(extractor.extract())
            self.assertIsNone(extractor.extract())
Пример #11
0
    def test_extraction_with_single_result(self) -> None:
        with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection, \
                patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm',
                             return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT):
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            table = {
                'schema': 'test_schema',
                'name': 'test_table',
                'description': 'a table for testing',
                'is_view': 0
            }

            sql_execute.return_value = [
                self._union(
                    {
                        'col_name': 'col_id1',
                        'col_type': 'bigint',
                        'col_description': 'description of id1',
                        'col_sort_order': 0,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'col_id2',
                        'col_type': 'bigint',
                        'col_description': 'description of id2',
                        'col_sort_order': 1,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'is_active',
                        'col_type': 'boolean',
                        'col_description': None,
                        'col_sort_order': 2,
                        'is_partition_col': 1
                    }, table),
                self._union(
                    {
                        'col_name': 'source',
                        'col_type': 'varchar',
                        'col_description': 'description of source',
                        'col_sort_order': 3,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'etl_created_at',
                        'col_type': 'timestamp',
                        'col_description': 'description of etl_created_at',
                        'col_sort_order': 4,
                        'is_partition_col': 0
                    }, table),
                self._union(
                    {
                        'col_name': 'ds',
                        'col_type': 'varchar',
                        'col_description': None,
                        'col_sort_order': 5,
                        'is_partition_col': 0
                    }, table)
            ]

            extractor = HiveTableMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata(
                'hive',
                'gold',
                'test_schema',
                'test_table',
                'a table for testing', [
                    ColumnMetadata('col_id1', 'description of id1', 'bigint',
                                   0),
                    ColumnMetadata('col_id2', 'description of id2', 'bigint',
                                   1),
                    ColumnMetadata('is_active', None, 'boolean', 2,
                                   ['partition column']),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5)
                ],
                is_view=False)

            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())