Пример #1
0
    def __get_tables_by_db(self, database) -> dict:
        """Return column metadata for every table in *database*.

        The result maps a fully-qualified table name
        ("<database>.<schema>.<table>") to the list of its column dicts,
        ordered by schema, table and column position.
        """
        dao = pf.create(key=self._connection_name,
                        configuration={'connection': self._connection_name})
        query_string = f"SELECT DATABASE, SCHEMA, NAME, ATTNAME, FORMAT_TYPE, ATTLEN, ATTNOTNULL, COLDEFAULT " \
                       f"FROM {database}.._V_RELATION_COLUMN " \
                       f"WHERE DATABASE <> 'SYSTEM' AND TYPE = 'TABLE' ORDER BY SCHEMA, NAME, ATTNUM ASC"

        tables = {}
        with dao.connection as conn:
            cursor = conn.cursor()
            cursor.execute(query_string)
            for db_name, schema_name, tbl_name, col_name, col_type, col_len, not_null, col_default in cursor.fetchall():
                qualified_name = f"{db_name}.{schema_name}.{tbl_name}"  # ignoring name collisions across multiple db's for now
                tables.setdefault(qualified_name, []).append({
                    'database': db_name,
                    'schema': schema_name,
                    'name': tbl_name,
                    'columnName': col_name,
                    'columnType': col_type,
                    'columnSize': col_len,
                    'notNull': not_null,
                    'default': col_default
                })

        return tables
Пример #2
0
    def __init__(self, **kwargs):
        """Initialize the state manager and ensure its backing table exists.

        Keyword Args:
            format_date: date format string kept for later use.
            ddl_file: path to a DDL script that creates the state table.
            dao: providah key of the DAO class to instantiate.
            connection: connection/profile name (defaults to 'state_manager').
        """

        self._logger = self._get_logger()
        self._format_date = kwargs.get('format_date')
        self._source: SourceSinkDescriptor = SourceSinkDescriptor()
        self._sink: SourceSinkDescriptor = SourceSinkDescriptor()
        self._job_id = None
        self._run_id = None
        self._manifest_name = None
        self._ddl_file = kwargs.get('ddl_file')

        self._env = kwargs.get('connection', 'state_manager')
        self._conn = pf.create(key=kwargs.get('dao'),
                               configuration={
                                   'connection':
                                   kwargs.get('connection', 'state_manager')
                               })
        # TODO: Pass Table name as args
        # Reflect the current database schema; if the state-manager table is
        # missing, run the DDL file against the connection and reflect again
        # so the new table is visible.
        metadata = schema.MetaData(bind=self._conn.engine)
        metadata.reflect()
        if ProjectConfig.state_manager_table_name(
        ) not in metadata.tables.keys():
            with open(self._ddl_file, 'r') as stream:
                ddl = stream.read()

            with self._conn.connection as conn:
                conn.execute(ddl)

            metadata = schema.MetaData(bind=self._conn.engine)
            metadata.reflect()

        # Handle to the (now guaranteed to exist) state-manager table.
        self._table: schema.Table = metadata.tables[
            ProjectConfig.state_manager_table_name()]
Пример #3
0
    def setUp(self) -> None:
        """Build the Snowflake creator fixture from an in-test app config."""
        snowflake_destination = {
            "type": "SnowflakeCreator",
            "conf": {
                "type": "SnowflakeDAO",
                "profile": "snowflake_knierr_profile"
            }
        }
        self._app_config = {"destinations": {"snowflake": snowflake_destination}}

        self._profiles = {
            'snowflake_knierr_profile': {
                'protocol': 'snowflake',
                'account': '< account name >',
                'role': '< snowflake role >',
                'warehouse': '< snowflake warehouse >',
                'user': '******',
                'password': '******'
            }
        }

        self._sample_query = 'CREATE OR REPLACE TEMPORARY TABLE TEMP (COL1 VARCHAR)'

        self._creator: Creator = providah_pkg_factory.create(
            key=snowflake_destination['type'],
            library='hdc',
            configuration={'conf': snowflake_destination['conf']})
Пример #4
0
    def setUp(self) -> None:
        """Create the HDFS-to-Snowflake mapper fixture with an Avro-style schema."""
        department_schema = {"type": "record", "name": "department",
                             "fields": [{"name": "column1", "type": "string"},
                                        {"name": "column2", "type": "enum"}]}
        resources_schema = {"type": "record", "name": "resources",
                            "fields": [{"name": "column1", "type": "string"},
                                       {"name": "column2", "type": "string"},
                                       {"name": "column3", "type": "string"}]}
        mapper_entry = {
            "type": "HdfsToSnowflake",
            "conf": {
                "report": False,
                "schema": {
                    "department": department_schema,
                    "resources": resources_schema
                }
            }
        }
        self._app_config = {"mappers": {"hdfs": {"snowflake": mapper_entry}}}

        self._mapper: Mapper = providah_pkg_factory.create(
            key=mapper_entry['type'],
            library='hdc',
            configuration={'conf': mapper_entry.get('conf', {"report": False})})
Пример #5
0
    def setUp(self) -> None:
        """Build an HdfsCrawler fixture pointed at a local Documents/Work dir."""
        hdfs_source = {
            "type": "HdfsCrawler",
            "conf": {
                "type": "Hdfs",
                "profile": "hdfs_dummy_profile",
                "dir": Path.home() / "Documents" / "Work",
                "file_format": "csv",
                "partition_depth": 0
            }
        }
        self._app_config = {"sources": {"hdfs": hdfs_source}}

        self._profiles = {
            'hdfs_dummy_profile': {
                'protocol': 'hadoop',
                'user': '******',
            }
        }

        self._crawler: Crawler = providah_pkg_factory.create(
            key=hdfs_source['type'],
            library='hdc',
            configuration={'conf': hdfs_source['conf']})
Пример #6
0
    def obtain_catalog(self) -> pd.DataFrame:
        """Crawl the connected Hive source and return its table catalog.

        Returns:
            pd.DataFrame: one row per table column from the 'default' schema;
            may be empty when the source has no tables.
        """
        # Fix: dropped the `except Exception as e: raise e` wrapper (a no-op
        # that only obscured tracebacks) and the unreachable `return None`
        # that followed it.
        dao: RdbmsDAO = providah_pkg_factory.create(
            key=self._conf['type'].capitalize(),
            library='hdc',
            configuration={'connection': self._conf['profile']})

        df_table_catalog: pd.DataFrame = self._fetch_all(
            dao,
            query_string=HiveCrawler.__template_select_all_tables.
            substitute(schema_name='default',
                       db_name=dao.get_conn_profile_key('database')))

        if not df_table_catalog.empty:
            # This had to be re-applied because the query alias doesnt seem to be working.
            # Please do not remove below mapping.
            df_table_catalog.columns = [
                'DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME',
                'COLUMN_NAME', 'COLUMN_TYPE'
            ]

        return df_table_catalog
Пример #7
0
    def __configure_catalog(cls) -> None:
        """Constructor method that calls factory to create catalog instance."""
        # Read the catalog section of the already-existing configuration file.
        with open(cls.__configuration_path, 'r') as config_stream:
            catalog_conf = yaml.safe_load(config_stream)['configurations']['catalog']

        # Build the catalog from its registered type and configuration.
        cls.__catalog = pf.create(key=catalog_conf['type'],
                                  configuration=catalog_conf['conf'])
Пример #8
0
    def __init__(self, **kwargs):
        """Wire up the crawler, mapper and creator for a source/destination pair.

        Raises HdcError when the source, mapper pair, or destination is not
        registered in the application configuration.
        """
        self._logger = self._get_logger()
        source = kwargs.get('source')
        destination = kwargs.get('destination')
        app_config = file_utils.get_app_config(kwargs.get('app_config', None))

        # Guard-clause style: validate each registration, then create.
        if source not in app_config['sources']:
            raise HdcError(
                message=
                f"{source} not registered in 'sources' in {kwargs.get('app_config') or 'hdc.yml'}"
            )
        source_entry = app_config['sources'][source]
        self._crawler: Crawler = providah_pkg_factory.create(
            key=source_entry['type'],
            library='hdc',
            configuration={'conf': source_entry['conf']})

        if source not in app_config['mappers'] or destination not in app_config['mappers'][source]:
            raise HdcError(
                message=
                f"{source}/{destination} not registered in 'mappers' in {kwargs.get('app_config') or 'hdc.yml'}"
            )
        mapper_entry = app_config['mappers'][source][destination]
        self._mapper: Mapper = providah_pkg_factory.create(
            key=mapper_entry['type'],
            library='hdc',
            configuration={'conf': mapper_entry.get('conf', {"report": False})})

        if destination not in app_config['destinations']:
            raise HdcError(
                message=
                f"{destination} not registered in 'destinations' in {kwargs.get('app_config') or 'hdc.yml'}"
            )
        destination_entry = app_config['destinations'][destination]
        self._creator: Creator = providah_pkg_factory.create(
            key=destination_entry['type'],
            library='hdc',
            configuration={'conf': destination_entry['conf']})
Пример #9
0
    def replicate_structures(self, sql_ddl_list):
        """Execute each DDL statement in *sql_ddl_list* against the target.

        Any DAO/driver exception propagates to the caller unchanged.
        """
        # Fix: removed the bare `except: raise` wrapper — it re-raised
        # everything unchanged, adding no handling while hiding intent.
        dao: RdbmsDAO = providah_pkg_factory.create(
            key=self._conf['type'].capitalize(),
            library='hdc',
            configuration={'connection': self._conf['profile']})
        self._execute_update(dao, sql_ddl_list)
Пример #10
0
    def setUp(self) -> None:
        """Build a NetezzaCrawler fixture with JDBC/ODBC profiles and sample rows."""
        netezza_source = {
            "type": "NetezzaCrawler",
            "conf": {
                "type": "Netezza",
                "profile": "netezza_jdbc"}
        }
        self._app_config = {"sources": {"netezza": netezza_source}}
        self._profiles = {
            'netezza_jdbc': {
                'protocol': 'jdbc',
                'host': '<host>',
                'port': '<port>',
                'database': '<database>',
                'user': '******',
                'password': '******',
                'driver': {
                    'name': '<java class name>',
                    'path': '<jar path>'
                }
            },
            "netezza_odbc": {
                "protocol": "odbc",
                "host": "<host>",
                "port": "<port>",
                "database": "<database_name>",
                "user": "******",
                "password": "******",
                "driver": {"name": "<ODBC Driver Name>"}
            }
        }

        # Cursor-description style metadata for the seven catalog columns.
        self._sample_column_desc = [(name, 'VARCHAR', 100, 100, 0, 0, True)
                                    for name in ('DATABASE', 'SCHEMA_NAME', 'TABLE_NAME',
                                                 'COLUMN_NAME', 'COLUMN_TYPE', 'COLUMN_SIZE',
                                                 'IS_NULL')]

        self._sample_query_result = [('HR', 'FINANCE', 'ACCOUNTS_2020',
                                      'NARRATIVE', 'VARCHAR', 1000,
                                      'NOT NULL')]

        self._crawler: Crawler = providah_pkg_factory.create(key=netezza_source['type'],
                                                             library='hdc',
                                                             configuration={'conf': netezza_source['conf']})
Пример #11
0
    def __configure_reader_methods(cls):
        """Constructor method to populate allowed reader methods"""
        # ----------- create local registry of all writers ---------- #
        # Load configuration
        with open(cls.__configuration_path, 'r') as config_stream:
            configuration = yaml.safe_load(stream=config_stream)['configurations']

        # For each allowed entry, instantiate it via the factory and register
        # its bound .read method keyed by the lower-cased entry name.
        # NOTE(review): this iterates the 'writers' section of the config but
        # fills cls.__readers — confirm the readers/writers asymmetry is
        # intentional and not a copy-paste slip.
        for key, value in configuration['writers'].items():
            if value['conf']['allowed']:
                cls.__readers[key.lower()] = pf.create(key=value['type'].lower(),
                                                       library='dataframez',
                                                       configuration=value['conf']).read
Пример #12
0
    def __init__(self, **kwargs):
        """Look up *source* in the app config and build its crawler.

        Raises HdcError when the source is not registered under 'sources'.
        """
        self._logger = self._get_logger()

        source = kwargs.get('source')
        app_config = file_utils.get_app_config(kwargs.get('app_config', None))

        if source not in app_config['sources']:
            raise HdcError(message=f"{source} not registered in 'sources' in {kwargs.get('app_config') or 'hdc.yml'}")

        source_entry = app_config['sources'][source]
        self._crawler: Crawler = providah_pkg_factory.create(key=source_entry['type'],
                                                             library='hdc',
                                                             configuration={'conf': source_entry['conf']})
Пример #13
0
 def __get_database_names(self) -> list:
     """Return the names of all non-SYSTEM databases on the server.

     Returns:
         list: database name strings.
     """
     dao = pf.create(key=self._connection_name,
                     configuration={'connection': self._connection_name})
     query_string = "SELECT DATABASE FROM _V_DATABASE WHERE DATABASE <> 'SYSTEM'"
     databases = []
     # Fix: initialize before the try block — if dao.connection raised,
     # the finally clause referenced an unbound `cursor` (NameError).
     cursor = None
     try:
         with dao.connection as conn:
             cursor = conn.cursor()
             cursor.execute(query_string)
             result = cursor.fetchall()
             for row in result:
                 databases.append(row[0])
         return databases
     finally:
         if cursor:
             cursor.close()
Пример #14
0
 def __get_schema_names_by_db(self, database) -> list:
     """Return the distinct schema names within *database*.

     Returns:
         list: schema name strings.
     """
     dao = pf.create(key=self._connection_name,
                     configuration={'connection': self._connection_name})
     query_string = f"SELECT DISTINCT SCHEMA FROM {database}.._V_SCHEMA"  # WHERE OBJTYPE = 'TABLE'"
     schemas = []
     # Fix: initialize before the try block — if dao.connection raised,
     # the finally clause referenced an unbound `cursor` (NameError).
     cursor = None
     try:
         with dao.connection as conn:
             cursor = conn.cursor()
             cursor.execute(query_string)
             result = cursor.fetchall()
             for row in result:
                 schemas.append(row[0])
         return schemas
     finally:
         if cursor:
             cursor.close()
Пример #15
0
    def obtain_catalog(self) -> DataFrame:
        """Crawl the configured filesystem directory and return its catalog.

        Config defaults: dir="/user/<profile user>", file_format="csv",
        partition_depth=0.
        """
        # Fix: removed the bare `except: raise` no-op wrapper and the
        # unreachable `return None` that followed it.
        dao: FileSystemDAO = providah_pkg_factory.create(key=self._conf['type'].capitalize(),
                                                         library='hdc',
                                                         configuration={'connection': self._conf['profile']})

        return self._fetch_all(dao,
                               dir_path=self._conf.get("dir", "/".join(
                                   ["/user", dao.get_conn_profile_key("user")])),
                               format=self._conf.get("file_format", "csv"),
                               partition_depth=self._conf.get("partition_depth", 0))
Пример #16
0
    def obtain_catalog(self) -> pd.DataFrame:
        """Crawl every database on the Netezza server and return the combined
        table catalog as a DataFrame.
        """
        # Fix: removed the bare `except: raise` no-op wrapper that only
        # obscured intent; exceptions propagate to the caller unchanged.
        dao: RdbmsDAO = providah_pkg_factory.create(key=self._conf['type'].capitalize(),
                                                    library='hdc',
                                                    configuration={
                                                        'connection': self._conf['profile']})

        # Enumerate databases first, then fetch table metadata for each one.
        df_databases = self._fetch_all(dao, NetezzaCrawler.__select_all_databases)

        df_table_catalog = []
        for db in df_databases['DATABASE'].to_list():
            df_table_catalog.extend(self._fetch_all_list(dao,
                                                         query_string=NetezzaCrawler.__template_select_all_tables
                                                         .substitute(db_name=db)))

        return pd.DataFrame(df_table_catalog)
Пример #17
0
    def setUp(self) -> None:
        """Create the Oracle-to-Snowflake mapper fixture."""
        mapper_entry = {
            "type": "OracleToSnowflake",
            "conf": {"report": False}
        }
        self._app_config = {"mappers": {"oracle": {"snowflake": mapper_entry}}}

        self._mapper: Mapper = providah_pkg_factory.create(
            key=mapper_entry['type'],
            library='hdc',
            configuration={'conf': mapper_entry.get('conf', {"report": False})})
Пример #18
0
    def setUp(self) -> None:
        """Build a HiveCrawler fixture with a dummy JDBC profile and sample data."""
        hive_source = {
            "type": "HiveCrawler",
            "conf": {
                "type": "Hive",
                "profile": "hive_jdbc"}
        }
        self._app_config = {"sources": {"hive": hive_source}}
        self._profiles = {
            'hive_jdbc': {
                'protocol': 'jdbc',
                'user': '******',
                'password': '******',
                'connection_url': '<metastore jdbc connection url>',
                'database': '<hive database to crawl>',
                'driver': {
                    'name': 'Hive.jdbc.HiveDriver',
                    'path': '<jar path>'
                }
            }
        }

        # Cursor-description style metadata for the five catalog columns.
        self._sample_column_desc = [(name, 'VARCHAR', 100, 100, 0, 0, True)
                                    for name in ('DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME',
                                                 'COLUMN_NAME', 'COLUMN_TYPE')]

        self._sample_query_result = [('HR', 'FINANCE', 'ACCOUNTS_2020',
                                      'NARRATIVE', 'VARCHAR')]

        self._crawler: Crawler = providah_pkg_factory.create(key=hive_source['type'],
                                                             library='hdc',
                                                             configuration={'conf': hive_source['conf']})
Пример #19
0
    def obtain_catalog(self) -> pd.DataFrame:
        """Extract the table metadata/catalog from the connected Oracle source.

        Uses the profile's SID when present, otherwise its service name, as
        the database identifier substituted into the catalog query.
        """
        # Fix: dropped the `except Exception as e: raise e` no-op wrapper
        # and the unreachable `return None` after it.
        dao: RdbmsDAO = providah_pkg_factory.create(
            key=self._conf['type'].capitalize(),
            library='hdc',
            configuration={'connection': self._conf['profile']})

        # Prefer the SID; fall back to the service name when no SID is set.
        oracle_database = dao.get_conn_profile_key("sid") if dao.get_conn_profile_key("sid") is not None \
            else dao.get_conn_profile_key("service_name")

        df_table_catalog: pd.DataFrame = self._fetch_all(
            dao,
            query_string=OracleCrawler.__template_select_all_tables.
            substitute(db=oracle_database,
                       user=dao.get_conn_profile_key("user")))

        return df_table_catalog
Пример #20
0
 def build_source(cls, configuration: dict) -> Source:
     """Instantiate a Source from a {'type': ..., 'conf': ...} mapping."""
     return pf.create(key=configuration['type'],
                      configuration=configuration['conf'])
Пример #21
0
 def __init__(self, **kwargs):
     """Build the Catalog instance described by the configuration file."""
     # pylint: disable=unused-argument
     with open(self.__configuration_path, 'r') as config_stream:
         catalog_conf = yaml.safe_load(config_stream)['configurations']['catalog']
     self._catalog: Catalog = pf.create(key=catalog_conf['type'],
                                        configuration=catalog_conf['conf'])
Пример #22
0
# Copyright © 2020 Hashmap, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Something"""
from providah.factories.package_factory import PackageFactory as pf


pf.fill_registry()
Пример #23
0
def start_here():
    """CLI entry point: validate args, configure logging, dispatch a run.

    Supported run modes are 'map' (map source assets to a destination) and
    'catalog' (print the source catalog).

    Raises:
        HdcError: for invalid CLI arguments, a missing app-config file, or a
            runtime failure in the underlying tooling.
    """
    hdc_parser = build_parser()
    cli_args = hdc_parser.parse_args()

    try:
        validate_hdc_cli_args(vars(cli_args))

        app_config: dict = file_utils.get_app_config(cli_args.app_config)

        # User-supplied log settings win; otherwise fall back to the default.
        if cli_args.log_settings is not None:
            logging.config.dictConfig(
                file_utils.yaml_parser(yaml_file_path=cli_args.log_settings))
        else:
            logging.config.dictConfig(
                file_utils.yaml_parser(
                    yaml_file_path=file_utils.get_default_log_config_path()))

        if app_config is None:
            # Fix: the message previously interpolated `app_config`, which is
            # always None in this branch; report the path the user asked for.
            raise HdcError(
                message=f"Could not find file {cli_args.app_config or 'hdc.yml'}")

        if cli_args.run.lower() == 'map':
            try:
                asset_mapper: AssetMapper = providah_pkg_factory.create(
                    key='AssetMapper',
                    library='hdc',
                    configuration={
                        'source': cli_args.source,
                        'destination': cli_args.destination,
                        'app_config': cli_args.app_config
                    })

                if asset_mapper.map_assets():
                    print(
                        f"Successfully mapped the source '{cli_args.source}' to destination '{cli_args.destination}'"
                    )

            except HdcError as hde:
                print(hde)

        elif cli_args.run.lower() == 'catalog':
            try:
                cataloger: Cataloger = providah_pkg_factory.create(
                    key='Cataloger',
                    library='hdc',
                    configuration={
                        'source': cli_args.source,
                        'app_config': cli_args.app_config
                    })

                df_catalog = cataloger.obtain_catalog()

                cataloger.pretty_print(df_catalog)

            except HdcError as hde:
                print(hde)

    except ArgumentTypeError as err:
        hdc_parser.print_usage()
        raise HdcError(message=err)

    except RuntimeError as err:
        raise HdcError(message=err)
Пример #24
0
    def setUp(self) -> None:
        """Build an OracleCrawler fixture with JDBC/ODBC/cx profiles and sample rows."""
        oracle_source = {
            "type": "OracleCrawler",
            "conf": {
                "type": "Oracle",
                "profile": "oracle_local_profile"
            }
        }
        self._app_config = {"sources": {"oracle": oracle_source}}

        self._profiles = {
            'oracle_jdbc': {
                'protocol': 'jdbc',
                'host': 'localhost',
                'port': 1521,
                'sid': 'XEPDB1',
                'user': '******',
                'password': '******',
                'driver': {
                    'name': 'oracle.jdbc.OracleDriver',
                    'path': '<jar path>'
                }
            },
            "oracle_odbc": {
                "protocol": "odbc",
                "host": "<host>",
                "port": "<port>",
                "database": "<database_name>",
                "user": "******",
                "password": "******",
                "driver": {
                    "name": "<ODBC Driver Name>"
                }
            },
            "oracle_cx": {
                "protocol": "cx_oracle",
                "host": "< host >",
                "port": "< port >",
                "sid": "< Oracle instance identifier >",
                "user": "******",
                "password": "******",
                "client_library_dir": "< client_library_dir >"
            }
        }

        # Cursor-description style metadata for the seven catalog columns.
        self._sample_column_desc = [(name, 'VARCHAR', 100, 100, 0, 0, True)
                                    for name in ('DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME',
                                                 'COLUMN_NAME', 'COLUMN_TYPE', 'COLUMN_SIZE',
                                                 'IS_NULL')]

        self._sample_query_result = [('HR', 'FINANCE', 'ACCOUNTS_2020',
                                      'NARRATIVE', 'VARCHAR', 1000, 'NOT NULL')
                                     ]

        self._crawler: Crawler = providah_pkg_factory.create(
            key=oracle_source['type'],
            library='hdc',
            configuration={'conf': oracle_source['conf']})
Пример #25
0
 def build_state_manager(cls, configuration: dict) -> StateManager:
     """Instantiate a StateManager from a {'type': ..., 'conf': ...} mapping."""
     return pf.create(key=configuration['type'],
                      configuration=configuration['conf'])
Пример #26
0
 def build_sink(cls, configuration: dict) -> Sink:
     """Instantiate a Sink from a {'type': ..., 'conf': ...} mapping."""
     return pf.create(key=configuration['type'],
                      configuration=configuration['conf'])
Пример #27
0
 def _initiate_orchestrator(self):
     """Build the orchestrator described by the HDM_MANIFEST config file."""
     manifest = ParseConfig.parse(config_path=os.getenv('HDM_MANIFEST'))
     orchestrator_conf = manifest['orchestrator']
     self._orchestrator: Orchestrator = pf.create(
         key=orchestrator_conf['type'],
         configuration=orchestrator_conf['conf'])