def __get_tables_by_db(self, database) -> dict:
    """Collect column-level metadata for every table in *database*.

    Queries the Netezza catalog view ``_V_RELATION_COLUMN`` and groups the
    result rows by fully-qualified table name.

    :param database: name of the database to inspect
    :return: dict mapping "<database>.<schema>.<table>" to a list of
        column-description dicts
    """
    dao = pf.create(key=self._connection_name,
                    configuration={'connection': self._connection_name})
    query_string = f"SELECT DATABASE, SCHEMA, NAME, ATTNAME, FORMAT_TYPE, ATTLEN, ATTNOTNULL, COLDEFAULT " \
                   f"FROM {database}.._V_RELATION_COLUMN " \
                   f"WHERE DATABASE <> 'SYSTEM' AND TYPE = 'TABLE' ORDER BY SCHEMA, NAME, ATTNUM ASC"
    tables = {}
    # Pre-bind cursor so the finally block is safe even if opening the
    # connection fails before a cursor is created.
    cursor = None
    try:
        with dao.connection as conn:
            cursor = conn.cursor()
            cursor.execute(query_string)
            result = cursor.fetchall()
            for row in result:
                table_name = f"{row[0]}.{row[1]}.{row[2]}"
                # ignoring name collisions across multiple db's for now
                if table_name not in tables:
                    tables[table_name] = []
                column = {
                    'database': row[0],
                    'schema': row[1],
                    'name': row[2],
                    'columnName': row[3],
                    'columnType': row[4],
                    'columnSize': row[5],
                    'notNull': row[6],
                    'default': row[7]
                }
                tables[table_name].append(column)
        return tables
    finally:
        # Close the cursor explicitly, matching the convention used by the
        # sibling catalog queries in this class (which the original omitted,
        # leaking the cursor).
        if cursor:
            cursor.close()
def __init__(self, **kwargs):
    """Initialize the state manager and ensure its backing table exists.

    Keyword Args:
        format_date: date format value (stored; not used in this method).
        ddl_file: path to a SQL DDL file used to create the state table
            when it does not already exist.
        dao: providah key of the DAO class to instantiate.
        connection: connection/profile name (defaults to 'state_manager').
    """
    self._logger = self._get_logger()
    self._format_date = kwargs.get('format_date')
    # Source/sink descriptors start empty; callers populate them later.
    self._source: SourceSinkDescriptor = SourceSinkDescriptor()
    self._sink: SourceSinkDescriptor = SourceSinkDescriptor()
    self._job_id = None
    self._run_id = None
    self._manifest_name = None
    self._ddl_file = kwargs.get('ddl_file')
    self._env = kwargs.get('connection', 'state_manager')
    self._conn = pf.create(key=kwargs.get('dao'), configuration={
        'connection': kwargs.get('connection', 'state_manager')
    })
    # TODO: Pass Table name as args
    metadata = schema.MetaData(bind=self._conn.engine)
    metadata.reflect()
    if ProjectConfig.state_manager_table_name(
    ) not in metadata.tables.keys():
        # Table missing: run the DDL file against the connection, then
        # re-reflect so the newly created table appears in the metadata.
        with open(self._ddl_file, 'r') as stream:
            ddl = stream.read()
        with self._conn.connection as conn:
            conn.execute(ddl)
        metadata = schema.MetaData(bind=self._conn.engine)
        metadata.reflect()
    self._table: schema.Table = metadata.tables[
        ProjectConfig.state_manager_table_name()]
def setUp(self) -> None:
    """Build the in-memory app config/profiles and create the Snowflake creator under test."""
    creator_conf = {
        "type": "SnowflakeDAO",
        "profile": "snowflake_knierr_profile"
    }
    self._app_config = {
        "destinations": {
            "snowflake": {
                "type": "SnowflakeCreator",
                "conf": creator_conf
            }
        }
    }
    self._profiles = {
        'snowflake_knierr_profile': {
            'protocol': 'snowflake',
            'account': '< account name >',
            'role': '< snowflake role >',
            'warehouse': '< snowflake warehouse >',
            'user': '******',
            'password': '******'
        }
    }
    self._sample_query = 'CREATE OR REPLACE TEMPORARY TABLE TEMP (COL1 VARCHAR)'
    destination_entry = self._app_config['destinations']['snowflake']
    self._creator: Creator = providah_pkg_factory.create(
        key=destination_entry['type'],
        library='hdc',
        configuration={'conf': destination_entry['conf']})
def setUp(self) -> None:
    """Construct the mapper configuration and the HdfsToSnowflake mapper under test."""
    department_schema = {
        "type": "record",
        "name": "department",
        "fields": [{"name": "column1", "type": "string"},
                   {"name": "column2", "type": "enum"}]
    }
    resources_schema = {
        "type": "record",
        "name": "resources",
        "fields": [{"name": "column1", "type": "string"},
                   {"name": "column2", "type": "string"},
                   {"name": "column3", "type": "string"}]
    }
    self._app_config = {
        "mappers": {
            "hdfs": {
                "snowflake": {
                    "type": "HdfsToSnowflake",
                    "conf": {
                        "report": False,
                        "schema": {
                            "department": department_schema,
                            "resources": resources_schema
                        }
                    }
                }
            }
        }
    }
    mapper_entry = self._app_config['mappers']['hdfs']['snowflake']
    self._mapper: Mapper = providah_pkg_factory.create(
        key=mapper_entry['type'],
        library='hdc',
        configuration={'conf': mapper_entry.get('conf', {"report": False})})
def setUp(self) -> None:
    """Prepare the HDFS crawler configuration and instantiate the crawler under test."""
    crawler_conf = {
        "type": "Hdfs",
        "profile": "hdfs_dummy_profile",
        "dir": Path.home() / "Documents" / "Work",
        "file_format": "csv",
        "partition_depth": 0
    }
    self._app_config = {
        "sources": {
            "hdfs": {
                "type": "HdfsCrawler",
                "conf": crawler_conf
            }
        }
    }
    self._profiles = {
        'hdfs_dummy_profile': {
            'protocol': 'hadoop',
            'user': '******',
        }
    }
    source_entry = self._app_config['sources']['hdfs']
    self._crawler: Crawler = providah_pkg_factory.create(
        key=source_entry['type'],
        library='hdc',
        configuration={'conf': source_entry['conf']})
def obtain_catalog(self) -> pd.DataFrame:
    """Fetch the table/column catalog of the connected Hive 'default' schema.

    Returns:
        pd.DataFrame: one row per column with columns DATABASE_NAME,
        SCHEMA_NAME, TABLE_NAME, COLUMN_NAME, COLUMN_TYPE; may be empty.
    """
    # The original wrapped this in ``except Exception as e: raise e`` (a
    # no-op re-raise) followed by an unreachable ``return None``; both
    # removed — exceptions now propagate naturally.
    dao: RdbmsDAO = providah_pkg_factory.create(
        key=self._conf['type'].capitalize(),
        library='hdc',
        configuration={'connection': self._conf['profile']})
    df_table_catalog: pd.DataFrame = self._fetch_all(
        dao,
        query_string=HiveCrawler.__template_select_all_tables.substitute(
            schema_name='default',
            db_name=dao.get_conn_profile_key('database')))
    if not df_table_catalog.empty:
        # This had to be re-applied because the query alias doesnt seem to be working.
        # Please do not remove below mapping.
        df_table_catalog.columns = [
            'DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME', 'COLUMN_NAME',
            'COLUMN_TYPE'
        ]
    return df_table_catalog
def __configure_catalog(cls) -> None:
    """Constructor method that calls factory to create catalog instance."""
    # Read the registry configuration file and pull out the catalog section.
    with open(cls.__configuration_path, 'r') as stream:
        parsed = yaml.safe_load(stream)
    registry_configuration = parsed['configurations']['catalog']
    # Instantiate the configured catalog implementation via the factory.
    cls.__catalog = pf.create(key=registry_configuration['type'],
                              configuration=registry_configuration['conf'])
def __init__(self, **kwargs):
    """Resolve crawler, mapper, and creator from the app config for a source/destination pair.

    Raises:
        HdcError: when the source, mapper pairing, or destination is not
            registered in the app config.
    """
    self._logger = self._get_logger()
    source = kwargs.get('source')
    destination = kwargs.get('destination')
    app_config = file_utils.get_app_config(kwargs.get('app_config', None))

    # Guard clause: the source crawler must be registered.
    if source not in app_config['sources']:
        raise HdcError(
            message=
            f"{source} not registered in 'sources' in {kwargs.get('app_config') or 'hdc.yml'}"
        )
    source_entry = app_config['sources'][source]
    self._crawler: Crawler = providah_pkg_factory.create(
        key=source_entry['type'],
        library='hdc',
        configuration={'conf': source_entry['conf']})

    # Guard clause: a mapper must exist for this exact source/destination pair.
    if source not in app_config['mappers'] or destination not in app_config['mappers'][source]:
        raise HdcError(
            message=
            f"{source}/{destination} not registered in 'mappers' in {kwargs.get('app_config') or 'hdc.yml'}"
        )
    mapper_entry = app_config['mappers'][source][destination]
    self._mapper: Mapper = providah_pkg_factory.create(
        key=mapper_entry['type'],
        library='hdc',
        configuration={'conf': mapper_entry.get('conf', {"report": False})})

    # Guard clause: the destination creator must be registered.
    if destination not in app_config['destinations']:
        raise HdcError(
            message=
            f"{destination} not registered in 'destinations' in {kwargs.get('app_config') or 'hdc.yml'}"
        )
    destination_entry = app_config['destinations'][destination]
    self._creator: Creator = providah_pkg_factory.create(
        key=destination_entry['type'],
        library='hdc',
        configuration={'conf': destination_entry['conf']})
def replicate_structures(self, sql_ddl_list):
    """Execute the given DDL statements against the configured destination.

    :param sql_ddl_list: iterable of SQL DDL strings to run
    """
    # The original wrapped this in a no-op ``try/except: raise`` which only
    # added noise; exceptions now propagate naturally.
    dao: RdbmsDAO = providah_pkg_factory.create(
        key=self._conf['type'].capitalize(),
        library='hdc',
        configuration={'connection': self._conf['profile']})
    self._execute_update(dao, sql_ddl_list)
def setUp(self) -> None:
    """Assemble Netezza crawler config, sample fixtures, and the crawler under test."""
    self._app_config = {
        "sources": {
            "netezza": {
                "type": "NetezzaCrawler",
                "conf": {"type": "Netezza", "profile": "netezza_jdbc"}
            }
        }
    }
    jdbc_profile = {
        'protocol': 'jdbc',
        'host': '<host>',
        'port': '<port>',
        'database': '<database>',
        'user': '******',
        'password': '******',
        'driver': {
            'name': '<java class name>',
            'path': '<jar path>'
        }
    }
    odbc_profile = {
        "protocol": "odbc",
        "host": "<host>",
        "port": "<port>",
        "database": "<database_name>",
        "user": "******",
        "password": "******",
        "driver": {"name": "<ODBC Driver Name>"}
    }
    self._profiles = {'netezza_jdbc': jdbc_profile,
                      "netezza_odbc": odbc_profile}
    # DB-API style cursor description: one
    # (name, type, display_size, internal_size, precision, scale, null_ok)
    # tuple per expected catalog column.
    description_columns = ['DATABASE', 'SCHEMA_NAME', 'TABLE_NAME',
                           'COLUMN_NAME', 'COLUMN_TYPE', 'COLUMN_SIZE',
                           'IS_NULL']
    self._sample_column_desc = [(name, 'VARCHAR', 100, 100, 0, 0, True,)
                                for name in description_columns]
    self._sample_query_result = [('HR', 'FINANCE', 'ACCOUNTS_2020',
                                  'NARRATIVE', 'VARCHAR', 1000, 'NOT NULL')]
    netezza_source = self._app_config['sources']['netezza']
    self._crawler: Crawler = providah_pkg_factory.create(
        key=netezza_source['type'],
        library='hdc',
        configuration={'conf': netezza_source['conf']})
def __configure_reader_methods(cls): """Constructor method to populate allowed reader methods""" # ----------- create local registry of all writers ---------- # # Load configuration with open(cls.__configuration_path, 'r') as config_stream: configuration = yaml.safe_load(stream=config_stream)['configurations'] for key, value in configuration['writers'].items(): if value['conf']['allowed']: cls.__readers[key.lower()] = pf.create(key=value['type'].lower(), library='dataframez', configuration=value['conf']).read
def __init__(self, **kwargs):
    """Look up the requested source in the app config and build its crawler.

    Raises:
        HdcError: when the source is not registered under 'sources'.
    """
    self._logger = self._get_logger()
    source = kwargs.get('source')
    app_config = file_utils.get_app_config(kwargs.get('app_config', None))
    # Guard clause: fail fast when the source is unknown.
    if source not in app_config['sources']:
        raise HdcError(message=f"{source} not registered in 'sources' in {kwargs.get('app_config') or 'hdc.yml'}")
    source_entry = app_config['sources'][source]
    self._crawler: Crawler = providah_pkg_factory.create(
        key=source_entry['type'],
        library='hdc',
        configuration={'conf': source_entry['conf']})
def __get_database_names(self) -> list:
    """Return the names of all non-SYSTEM databases visible on the connection.

    :return: list of database name strings
    """
    dao = pf.create(key=self._connection_name,
                    configuration={'connection': self._connection_name})
    query_string = "SELECT DATABASE FROM _V_DATABASE WHERE DATABASE <> 'SYSTEM'"
    databases = []
    # Pre-bind cursor: the original's ``finally`` referenced ``cursor``
    # before it was guaranteed to exist, so a failure while opening the
    # connection raised NameError instead of the real error.
    cursor = None
    try:
        with dao.connection as conn:
            cursor = conn.cursor()
            cursor.execute(query_string)
            result = cursor.fetchall()
            for row in result:
                databases.append(row[0])
            return databases
    finally:
        if cursor:
            cursor.close()
def __get_schema_names_by_db(self, database) -> list:
    """Return the distinct schema names defined in *database*.

    :param database: database whose schemas to list
    :return: list of schema name strings
    """
    dao = pf.create(key=self._connection_name,
                    configuration={'connection': self._connection_name})
    query_string = f"SELECT DISTINCT SCHEMA FROM {database}.._V_SCHEMA"  # WHERE OBJTYPE = 'TABLE'"
    schemas = []
    # Pre-bind cursor so the ``finally`` cannot hit a NameError when opening
    # the connection itself fails before a cursor exists (bug in original).
    cursor = None
    try:
        with dao.connection as conn:
            cursor = conn.cursor()
            cursor.execute(query_string)
            result = cursor.fetchall()
            for row in result:
                schemas.append(row[0])
            return schemas
    finally:
        if cursor:
            cursor.close()
def obtain_catalog(self) -> DataFrame:
    """Crawl the configured filesystem directory and return its file catalog.

    Returns:
        DataFrame: catalog of files found under the configured directory
        (defaults to ``/user/<profile user>``), for the configured file
        format and partition depth.
    """
    # The original wrapped this in a no-op bare ``except: raise`` followed
    # by an unreachable ``return None``; both removed.
    dao: FileSystemDAO = providah_pkg_factory.create(
        key=self._conf['type'].capitalize(),
        library='hdc',
        configuration={'connection': self._conf['profile']})
    # Fall back to the profile user's home directory when 'dir' is absent.
    default_dir = "/".join(["/user", dao.get_conn_profile_key("user")])
    return self._fetch_all(dao,
                           dir_path=self._conf.get("dir", default_dir),
                           format=self._conf.get("file_format", "csv"),
                           partition_depth=self._conf.get("partition_depth", 0))
def obtain_catalog(self) -> pd.DataFrame:
    """Crawl every database on the Netezza connection and return the catalog.

    Returns:
        pd.DataFrame: concatenated table/column catalog rows from all
        databases returned by the database-listing query.
    """
    # The original wrapped this in a no-op bare ``except: raise``; removed.
    dao: RdbmsDAO = providah_pkg_factory.create(
        key=self._conf['type'].capitalize(),
        library='hdc',
        configuration={'connection': self._conf['profile']})
    df_databases = self._fetch_all(dao, NetezzaCrawler.__select_all_databases)
    # Collect rows database-by-database, building one DataFrame at the end.
    rows = []
    for db in df_databases['DATABASE'].to_list():
        rows.extend(self._fetch_all_list(
            dao,
            query_string=NetezzaCrawler.__template_select_all_tables.substitute(db_name=db)))
    return pd.DataFrame(rows)
def setUp(self) -> None:
    """Create the OracleToSnowflake mapper under test from an inline app config."""
    mapper_entry = {
        "type": "OracleToSnowflake",
        "conf": {"report": False}
    }
    self._app_config = {"mappers": {"oracle": {"snowflake": mapper_entry}}}
    self._mapper: Mapper = providah_pkg_factory.create(
        key=mapper_entry['type'],
        library='hdc',
        configuration={'conf': mapper_entry.get('conf', {"report": False})})
def setUp(self) -> None:
    """Assemble Hive crawler config, sample fixtures, and the crawler under test."""
    self._app_config = {
        "sources": {
            "hive": {
                "type": "HiveCrawler",
                "conf": {"type": "Hive", "profile": "hive_jdbc"}
            }
        }
    }
    self._profiles = {
        'hive_jdbc': {
            'protocol': 'jdbc',
            'user': '******',
            'password': '******',
            'connection_url': '<metastore jdbc connection url>',
            'database': '<hive database to crawl>',
            'driver': {
                'name': 'Hive.jdbc.HiveDriver',
                'path': '<jar path>'
            }
        }
    }
    # DB-API style cursor description: one tuple per expected catalog column.
    description_columns = ['DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME',
                           'COLUMN_NAME', 'COLUMN_TYPE']
    self._sample_column_desc = [(name, 'VARCHAR', 100, 100, 0, 0, True,)
                                for name in description_columns]
    self._sample_query_result = [('HR', 'FINANCE', 'ACCOUNTS_2020',
                                  'NARRATIVE', 'VARCHAR')]
    hive_source = self._app_config['sources']['hive']
    self._crawler: Crawler = providah_pkg_factory.create(
        key=hive_source['type'],
        library='hdc',
        configuration={'conf': hive_source['conf']})
def obtain_catalog(self) -> pd.DataFrame:
    """Fetch the table/column catalog for the connected Oracle instance.

    Prefers the profile's ``sid`` as the database identifier and falls back
    to ``service_name`` when no SID is configured.

    Returns:
        pd.DataFrame: table metadata rows for the profile's user/schema.
    """
    # The original wrapped this in ``except Exception as e: raise e`` plus an
    # unreachable ``return None``; both removed. The duplicated
    # get_conn_profile_key("sid") lookup is hoisted into a local.
    dao: RdbmsDAO = providah_pkg_factory.create(
        key=self._conf['type'].capitalize(),
        library='hdc',
        configuration={'connection': self._conf['profile']})
    # Extract the table metadata/catalog from connected Oracle source
    sid = dao.get_conn_profile_key("sid")
    oracle_database = sid if sid is not None \
        else dao.get_conn_profile_key("service_name")
    return self._fetch_all(
        dao,
        query_string=OracleCrawler.__template_select_all_tables.substitute(
            db=oracle_database,
            user=dao.get_conn_profile_key("user")))
def build_source(cls, configuration: dict) -> Source:
    """Instantiate and return the Source described by *configuration*.

    :param configuration: dict with 'type' (factory key) and 'conf' entries
    :return: the constructed Source instance
    """
    return pf.create(key=configuration['type'],
                     configuration=configuration['conf'])
def __init__(self, **kwargs):  # pylint: disable=unused-argument
    """Load the catalog section of the registry config and build the catalog instance."""
    with open(self.__configuration_path, 'r') as stream:
        parsed = yaml.safe_load(stream)
    catalog_conf = parsed['configurations']['catalog']
    self._catalog: Catalog = pf.create(key=catalog_conf['type'],
                                       configuration=catalog_conf['conf'])
# Copyright © 2020 Hashmap, Inc # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Something""" from providah.factories.package_factory import PackageFactory as pf pf.fill_registry()
def start_here():
    """CLI entry point: parse args, configure logging, then run 'map' or 'catalog'.

    Raises:
        HdcError: on argument validation failure, runtime failure, or when
            the app config file cannot be found.
    """
    hdc_parser = build_parser()
    cli_args = hdc_parser.parse_args()
    try:
        validate_hdc_cli_args(vars(cli_args))
        app_config: dict = file_utils.get_app_config(cli_args.app_config)
        # Use the user-supplied logging config when given, else the default.
        if cli_args.log_settings is not None:
            logging.config.dictConfig(
                file_utils.yaml_parser(yaml_file_path=cli_args.log_settings))
        else:
            logging.config.dictConfig(
                file_utils.yaml_parser(
                    yaml_file_path=file_utils.get_default_log_config_path()))
        if app_config is not None:
            if cli_args.run.lower() == 'map':
                # 'map' builds an AssetMapper and replicates source
                # structures into the destination.
                try:
                    asset_mapper: AssetMapper = providah_pkg_factory.create(
                        key='AssetMapper',
                        library='hdc',
                        configuration={
                            'source': cli_args.source,
                            'destination': cli_args.destination,
                            'app_config': cli_args.app_config
                        })
                    if asset_mapper.map_assets():
                        print(
                            f"Successfully mapped the source '{cli_args.source}' to destination '{cli_args.destination}'"
                        )
                except HdcError as hde:
                    print(hde)
            elif cli_args.run.lower() == 'catalog':
                # 'catalog' only crawls the source and pretty-prints the
                # resulting catalog.
                try:
                    cataloger: Cataloger = providah_pkg_factory.create(
                        key='Cataloger',
                        library='hdc',
                        configuration={
                            'source': cli_args.source,
                            'app_config': cli_args.app_config
                        })
                    df_catalog = cataloger.obtain_catalog()
                    cataloger.pretty_print(df_catalog)
                except HdcError as hde:
                    print(hde)
        else:
            raise HdcError(message=f"Could not find file {app_config}")
    except ArgumentTypeError as err:
        hdc_parser.print_usage()
        raise HdcError(message=err)
    except RuntimeError as err:
        raise HdcError(message=err)
def setUp(self) -> None:
    """Assemble Oracle crawler config, connection profiles, fixtures, and the crawler."""
    self._app_config = {
        "sources": {
            "oracle": {
                "type": "OracleCrawler",
                "conf": {
                    "type": "Oracle",
                    "profile": "oracle_local_profile"
                }
            }
        }
    }
    jdbc_profile = {
        'protocol': 'jdbc',
        'host': 'localhost',
        'port': 1521,
        'sid': 'XEPDB1',
        'user': '******',
        'password': '******',
        'driver': {
            'name': 'oracle.jdbc.OracleDriver',
            'path': '<jar path>'
        }
    }
    odbc_profile = {
        "protocol": "odbc",
        "host": "<host>",
        "port": "<port>",
        "database": "<database_name>",
        "user": "******",
        "password": "******",
        "driver": {
            "name": "<ODBC Driver Name>"
        }
    }
    cx_profile = {
        "protocol": "cx_oracle",
        "host": "< host >",
        "port": "< port >",
        "sid": "< Oracle instance identifier >",
        "user": "******",
        "password": "******",
        "client_library_dir": "< client_library_dir >"
    }
    self._profiles = {
        'oracle_jdbc': jdbc_profile,
        "oracle_odbc": odbc_profile,
        "oracle_cx": cx_profile
    }
    # DB-API style cursor description tuples for the expected catalog columns.
    description_columns = ['DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME',
                           'COLUMN_NAME', 'COLUMN_TYPE', 'COLUMN_SIZE',
                           'IS_NULL']
    self._sample_column_desc = [(name, 'VARCHAR', 100, 100, 0, 0, True,)
                                for name in description_columns]
    self._sample_query_result = [('HR', 'FINANCE', 'ACCOUNTS_2020',
                                  'NARRATIVE', 'VARCHAR', 1000, 'NOT NULL')]
    oracle_source = self._app_config['sources']['oracle']
    self._crawler: Crawler = providah_pkg_factory.create(
        key=oracle_source['type'],
        library='hdc',
        configuration={'conf': oracle_source['conf']})
def build_state_manager(cls, configuration: dict) -> StateManager:
    """Instantiate and return the StateManager described by *configuration*.

    :param configuration: dict with 'type' (factory key) and 'conf' entries
    :return: the constructed StateManager instance
    """
    return pf.create(key=configuration['type'],
                     configuration=configuration['conf'])
def build_sink(cls, configuration: dict) -> Sink:
    """Instantiate and return the Sink described by *configuration*.

    :param configuration: dict with 'type' (factory key) and 'conf' entries
    :return: the constructed Sink instance
    """
    return pf.create(key=configuration['type'],
                     configuration=configuration['conf'])
def _initiate_orchestrator(self):
    """Build the orchestrator declared in the HDM manifest's 'orchestrator' section."""
    manifest = ParseConfig.parse(config_path=os.getenv('HDM_MANIFEST'))
    orchestrator_config = manifest['orchestrator']
    self._orchestrator: Orchestrator = pf.create(
        key=orchestrator_config['type'],
        configuration=orchestrator_config['conf'])