def main():
    for pipe in db.get_pipes('data'):
        # yaml.load without a Loader is deprecated; safe_load is sufficient here
        metadata = yaml.safe_load(pipe['comment'])
        # skip pipes with no metadata as well, otherwise metadata['blob'] would
        # raise on None
        if not metadata or metadata.get('type') != 'Azure':
            log.info(f"{pipe['name']} is not an Azure pipe, and will be skipped.")
            continue

        blob_name = metadata['blob']
        account_name = metadata['account']
        pipe_name = pipe['name']
        table = metadata['target']

        sas_token_envar = 'AZURE_SAS_TOKEN_' + metadata.get('suffix', '')
        if sas_token_envar in environ:
            encrypted_sas_token = environ.get(sas_token_envar)
        elif 'encrypted_sas_token' in metadata:
            encrypted_sas_token = metadata['encrypted_sas_token']
        else:
            log.info(f"{pipe['name']} has no Azure auth, skipping.")
            continue

        sas_token = vault.decrypt_if_encrypted(encrypted_sas_token)
        log.info(f"Now working on pipe {pipe_name}")

        endpoint_suffix = metadata.get('endpoint_suffix', 'core.windows.net')
        block_blob_service = BlockBlobService(
            account_name=account_name,
            sas_token=sas_token,
            endpoint_suffix=endpoint_suffix
        )
        files = block_blob_service.list_blobs(blob_name)

        # only ingest files created after the newest already-loaded record
        newest_time = get_timestamp(table)
        if newest_time:
            new_files = [
                StagedFile(file.name, None)
                for file in files
                if file.properties.creation_time > newest_time
            ]
        else:
            new_files = [StagedFile(file.name, None) for file in files]
        log.info(new_files)

        # Proxy object that abstracts the Snowpipe REST API
        ingest_manager = SimpleIngestManager(
            account=environ.get('SNOWFLAKE_ACCOUNT'),
            host=f'{environ.get("SNOWFLAKE_ACCOUNT")}.snowflakecomputing.com',
            user=environ.get('SA_USER'),
            pipe=f'SNOWALERT.DATA.{pipe_name}',
            private_key=load_pkb_rsa(PRIVATE_KEY, PRIVATE_KEY_PASSWORD))

        if new_files:
            try:
                response = ingest_manager.ingest_files(new_files)
                log.info(response)
            except Exception as e:
                log.error(e)
                return
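Neither load_pkb_rsa nor get_timestamp is defined in this snippet. As a point of reference, here is a minimal sketch of what a helper like load_pkb_rsa might do, assuming PRIVATE_KEY holds encrypted PEM text and PRIVATE_KEY_PASSWORD its passphrase (the two names come from the snippet; the body is an assumption):

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.serialization import (
    Encoding, NoEncryption, PrivateFormat, load_pem_private_key)

def load_pkb_rsa(pem_text: str, password: str) -> str:
    # Hypothetical implementation: decrypt the PEM key and re-serialize it
    # as unencrypted PKCS#8 PEM text, the form SimpleIngestManager accepts.
    key = load_pem_private_key(
        pem_text.encode(),
        password.encode() if password else None,
        default_backend())
    return key.private_bytes(
        Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')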
def test_simple_ingest(connection_ctx, test_util):
    param = connection_ctx['param']
    pipe_name = '{}.{}.TEST_SIMPLE_INGEST_PIPE'.format(param['database'], param['schema'])
    private_key = test_util.read_private_key()
    print(private_key)
    cur = connection_ctx['cnx'].cursor()
    test_file = os.path.join(test_util.get_data_dir(), 'test_file.csv')
    cur.execute(
        'create or replace table TEST_SIMPLE_INGEST_TABLE(c1 number, c2 string)'
    )
    cur.execute('create or replace stage TEST_SIMPLE_INGEST_STAGE')
    # PUT compresses the file on upload by default, hence the .gz suffix below
    cur.execute('put file://{} @TEST_SIMPLE_INGEST_STAGE'.format(test_file))
    cur.execute(
        'create or replace pipe {0} as copy into TEST_SIMPLE_INGEST_TABLE '
        'from @TEST_SIMPLE_INGEST_STAGE'.format(pipe_name))
    ingest_manager = SimpleIngestManager(account=param['account'],
                                         user=param['user'],
                                         private_key=private_key,
                                         pipe=pipe_name,
                                         scheme=param['protocol'],
                                         host=param['host'],
                                         port=param['port'])
    staged_files = [StagedFile('test_file.csv.gz', None)]
    resp = ingest_manager.ingest_files(staged_files)
    assert resp['responseCode'] == 'SUCCESS'

    # poll the load history for up to two minutes
    start_polling_time = time.time()
    while time.time() - start_polling_time < 120:
        history_resp = ingest_manager.get_history()
        if len(history_resp['files']) == 1:
            assert history_resp['files'][0]['path'] == 'test_file.csv.gz'
            return
        else:
            # wait for 20 seconds
            time.sleep(20)
    # timed out without the file appearing in the history
    assert False
class SnowpipeApi:
    def __init__(self, config: Config, pipe: str) -> None:
        self.logger = logging.getLogger(self.__class__.__name__)
        self.config = config
        self.ingest_manager = SimpleIngestManager(
            account=self.config.account,
            host=self.config.url,
            user=self.config.user,
            private_key=self.config.private_key_pem,
            pipe=pipe,
        )

    def report(self, recent_seconds: Optional[int] = None) -> None:
        request_id = uuid.uuid4()
        self.logger.debug(f'request_id: {request_id}')
        body = self.ingest_manager.get_history(
            recent_seconds=recent_seconds, request_id=request_id)
        print(json.dumps(body, indent=4))

    def history(self, start_time: datetime, end_time: Optional[datetime] = None) -> None:
        if not start_time:
            raise ValueError('start_time must be defined')
        request_id = uuid.uuid4()
        self.logger.debug(f'request_id: {request_id}')
        body = self.ingest_manager.get_history_range(
            start_time_inclusive=start_time.isoformat(),
            end_time_exclusive=end_time.isoformat() if end_time else None,
            request_id=request_id)
        print(json.dumps(body, indent=4))

    def ingest(self, files: Iterable[str]) -> None:
        if not files:
            raise ValueError('files must be defined')
        request_id = uuid.uuid4()
        self.logger.debug(f'request_id: {request_id}')
        staged_files = [StagedFile(name, None) for name in files]
        body = self.ingest_manager.ingest_files(
            staged_files=staged_files,
            request_id=request_id,
        )
        print(json.dumps(body, indent=4))
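A hypothetical invocation of this class, assuming a Config object that exposes account, url, user, and private_key_pem as the constructor reads them (all values below are placeholders):

config = Config(
    account='my_account',
    url='my_account.snowflakecomputing.com',
    user='INGEST_USER',
    private_key_pem=private_key_text,  # unencrypted PKCS#8 PEM text
)
api = SnowpipeApi(config, pipe='MY_DB.MY_SCHEMA.MY_PIPE')
api.ingest(['data/part-0001.csv.gz'])  # queue one staged file
api.report(recent_seconds=600)         # print recent load history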
def ingest(table_name, options):
    base_name = re.sub(r'_CONNECTION$', '', table_name)
    storage_account = options['storage_account']
    sas_token = vault.decrypt_if_encrypted(options['sas_token'])
    suffix = options['suffix']
    container_name = options['container_name']
    snowflake_account = options['snowflake_account']
    sa_user = options['sa_user']
    database = options['database']

    block_blob_service = BlockBlobService(account_name=storage_account,
                                          sas_token=sas_token,
                                          endpoint_suffix=suffix)

    db.execute(f"select SYSTEM$PIPE_FORCE_RESUME('DATA.{base_name}_PIPE');")

    last_loaded = db.fetch_latest(f'data.{table_name}', 'loaded_on')
    log.info(f"Last loaded time is {last_loaded}")

    blobs = block_blob_service.list_blobs(container_name)
    new_files = [
        StagedFile(b.name, None)
        for b in blobs
        if (last_loaded is None or b.properties.creation_time > last_loaded)
    ]
    log.info(f"Found {len(new_files)} files to ingest")

    # Proxy object that abstracts the Snowpipe REST API
    ingest_manager = SimpleIngestManager(
        account=snowflake_account,
        host=f'{snowflake_account}.snowflakecomputing.com',
        user=sa_user,
        pipe=f'{database}.data.{base_name}_PIPE',
        private_key=load_pkb_rsa(PRIVATE_KEY, PRIVATE_KEY_PASSWORD))

    if len(new_files) > 0:
        for file_group in groups_of(4999, new_files):
            response = ingest_manager.ingest_files(file_group)
            log.info(response)
            yield len(file_group)
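groups_of is not shown here; judging from its usage, it chunks the file list so no single ingest_files call exceeds the Snowpipe REST API's per-request file limit (hence groups of 4999). A minimal sketch of such a helper:

from itertools import islice

def groups_of(n, iterable):
    # Yield successive lists of at most n items from iterable.
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk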
# NOTE: the opening of the `with` block was cut off in the original; the key
# path here is illustrative.
with open('rsa_key.p8', 'rb') as pem_in:
    pemlines = pem_in.read()
    private_key_obj = load_pem_private_key(
        pemlines,
        os.environ['PRIVATE_KEY_PASSPHRASE'].encode(),
        default_backend())

private_key_text = private_key_obj.private_bytes(
    Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')

# Assume the public key has been registered in Snowflake:
# private key in PEM format
# List of files in the stage specified in the pipe definition
file_list = ['departments.csv']

ingest_manager = SimpleIngestManager(
    account='<>',
    host='<>.<>.snowflakecomputing.com',
    user='******',
    pipe='dataload_db.dataload.load_dept_pipe',
    private_key=private_key_text)

# Same list of files, but wrapped into a class
staged_file_list = []
for file_name in file_list:
    print(file_name)
    staged_file_list.append(StagedFile(file_name, None))

for staged_file in staged_file_list:
    print(staged_file)

try:
    resp = ingest_manager.ingest_files(staged_file_list)
    print(resp)
except HTTPError as e:
    # HTTP error, may need to retry (the except clause was missing in the
    # original; this mirrors the sibling examples)
    logger.error(e)
# NOTE: the `open(...)` call was truncated in the original; the key path here
# is illustrative.
with open('rsa_key.p8', 'rb') as pem_in:
    pemlines = pem_in.read()
    private_key_obj = load_pem_private_key(
        pemlines,
        os.environ['PRIVATE_KEY_PASSPHRASE'].encode(),
        default_backend())

private_key_text = private_key_obj.private_bytes(
    Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')

# Assume the public key has been registered in Snowflake:
# private key in PEM format
# List of files in the stage specified in the pipe definition
file_list = ['/CSKT_TEXT_2020-01-15-10-17.parquet']

ingest_manager = SimpleIngestManager(
    account='storaenso',
    host='storaenso.west-europe.azure.snowflakecomputing.com',
    user='******',
    pipe='SOURCE_SYSTEM_TEST.SAPRAW.TEST_PIPE_1',
    private_key=private_key_text)

# Same list of files, but wrapped into a class
staged_file_list = []
for file_name in file_list:
    staged_file_list.append(StagedFile(file_name, None))

try:
    resp = ingest_manager.ingest_files(staged_file_list)
except HTTPError as e:
    # HTTP error, may need to retry
    logger.error(e)
    exit(1)

# This means Snowflake has received the files and will start loading
def load_via_snowpipe(self, s3_key, stream):
    """Performs data transfer from the stage to Snowflake using Snowpipe."""

    def _generate_pipe_name(dbname, schema_table_name):
        stripped_db_name = dbname.replace('"', '')
        stripped_table_name = schema_table_name.replace('"', '')
        return f"{stripped_db_name}.{stripped_table_name}_s3_pipe"

    def _generate_pipe_args(pipe_name, schema_table_name, columns_with_trans):
        pipe_args = dict(
            pipe_name=pipe_name,
            db_name=self.connection_config['dbname'],
            obj_name=schema_table_name,
            stage=self.connection_config['stage'],
            file_format=self.connection_config['file_format'],
            cols=', '.join([c['name'] for c in columns_with_trans]),
        )
        return pipe_args

    def _load_private_key():
        # connection_config is a dict, so use .get() rather than getattr()
        key_path = self.connection_config.get('private_key_path', '/rsa_key.p8')
        password = self.connection_config.get('private_key_password', None)
        with open(key_path, 'rb') as pem_in:
            private_key_obj = load_pem_private_key(
                pem_in.read(),
                password=password.encode() if password else None,
                backend=default_backend())
        private_key_text = private_key_obj.private_bytes(
            Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')
        return private_key_text

    def _increment_value(exponentially=False):
        # current + previous walks the Fibonacci numbers, so the waits are
        # either 2**fib(n) seconds (exponential) or fib(n) + 30 seconds
        previous = 0
        current = 1
        while True:
            yield 2 ** (current + previous) if exponentially \
                else current + previous + 30
            current = current + previous
            previous = current - previous

    self.logger.info("Loading data using Snowpipe.")

    # Get the list of columns with types and transformations
    columns_with_trans = [
        {
            "name": safe_column_name(name),
            "trans": column_trans(schema)
        }
        for (name, schema) in self.flatten_schema.items()
    ]

    schema_table_name = self.table_name(stream, False)
    db_name = self.connection_config['dbname']
    pipe_name = _generate_pipe_name(db_name, schema_table_name)
    pipe_args = _generate_pipe_args(pipe_name, schema_table_name, columns_with_trans)

    create_pipe_sql = """create pipe {pipe_name} as
        copy into {db_name}.{obj_name} ({cols})
        from @{db_name}.{stage}
        file_format = (format_name = {db_name}.{file_format});""".format(**pipe_args)
    drop_pipe_sql = f"drop pipe if exists {pipe_name};"

    # Create the snowpipe
    try:
        self.logger.debug("Creating snowpipe - %s.", pipe_name)
        # primary key found in records, raise a warning
        if len(self.stream_schema_message['key_properties']) > 0:
            self.logger.warning(
                "Primary key %s found in the data stream. Snowpipe cannot be "
                "used to consolidate records based upon keys; it can only copy "
                "data. Please refer to the docs for further details",
                self.stream_schema_message['key_properties'])
        # primary key not present in the records, perform copy
        self.query(create_pipe_sql)
    except ProgrammingError as error:
        self.logger.error(
            "An error was encountered while creating the snowpipe, %s", error)

    # Private key encryption is required to perform Snowpipe data transfers
    private_key_text = _load_private_key()

    ingest_manager = SimpleIngestManager(
        account=self.connection_config['account'].split('.')[0],
        host=self.connection_config['account'] + '.snowflakecomputing.com',
        user=self.connection_config['user'],
        pipe=pipe_name,
        scheme='https',
        port=443,
        private_key=private_key_text)

    # List of files, but wrapped into a class
    staged_file_list = [StagedFile(s3_key, None)]

    # Ingest files using snowpipe
    retries = self.connection_config.get('max_retry', 5)
    wait_time = _increment_value(exponentially=True)
    while True:
        try:
            self.logger.debug(
                "Starting to ingest file via snowpipe, retries left %s", retries)
            resp = ingest_manager.ingest_files(staged_file_list)
            self.logger.info(
                "Snowpipe has received the files and will now start loading: %s",
                resp['responseCode'])
            break
        except HTTPError as e:
            # HTTP error: wait and retry, exit if it still fails
            self.logger.error(e)
            time.sleep(next(wait_time))
            retries -= 1
            if not retries:
                self.logger.critical(
                    "Max retry limit reached, failed to load data using snowpipe")
                sys.exit(1)

    # Wait a while for the transfer to complete, then drop the pipe
    wait_time = _increment_value()
    while True:
        history_resp = ingest_manager.get_history()
        if len(history_resp['files']) > 0:
            self.logger.info(
                '''Ingest Report for snowpipe : %s
                STATUS: %s
                rowsInserted(rowsParsed): %s(%s)''',
                history_resp['pipe'],
                history_resp['completeResult'],
                history_resp['files'][0]['rowsInserted'],
                history_resp['files'][0]['rowsParsed'])
            self.query(drop_pipe_sql)
            break
        else:
            self.logger.debug('waiting for snowpipe to transfer data...')
            time.sleep(next(wait_time))
# load config
# contains user, keypassphrase, account, host and pipe attributes
parser = SafeConfigParser()  # plain ConfigParser in modern Python
parser.read('config.ini')
pipe = dict(parser.items("keyrus_snowflake_pipe"))

# add private key to dictionary (and drop keypassphrase)
with open('/home/dom/.ssh/sys_mlb_snowpipe_key.p8', 'rb') as key:
    private_key_obj = load_pem_private_key(
        key.read(),
        password=pipe.pop('keypassphrase').encode(),
        backend=default_backend())

# SimpleIngestManager expects the key as PEM text, not a key object
pipe['private_key'] = private_key_obj.private_bytes(
    Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')

# ingest manager
ingest_manager = SimpleIngestManager(**pipe)

# list of files to ingest
file_list = ['game_log_2017_04.csv']
staged_file_list = []
for file_name in file_list:
    staged_file_list.append(StagedFile(file_name, None))

# submit ingestion request
try:
    resp = ingest_manager.ingest_files(staged_file_list)
    log.info(f"Ingest Manager Response: {resp['responseCode']}")
except (IngestResponseError, HTTPError) as e:
    log.error(e)
    exit(1)
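The shape of config.ini is not shown; judging by the keys the snippet reads and the kwargs it unpacks into SimpleIngestManager, it presumably looks something like this (all values are placeholders):

[keyrus_snowflake_pipe]
user = INGEST_USER
keypassphrase = my-passphrase
account = my_account
host = my_account.snowflakecomputing.com
pipe = MY_DB.MY_SCHEMA.MY_PIPE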
# NOTE: the start of this snippet was cut off; the key-loading block is
# reconstructed along the lines of the sibling examples (path illustrative).
with open('rsa_key.p8', 'rb') as pem_in:
    pemlines = pem_in.read()
    private_key_obj = load_pem_private_key(
        pemlines,
        os.environ['PRIVATE_KEY_PASSPHRASE'].encode(),
        backend=default_backend())

private_key_text = private_key_obj.private_bytes(
    Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')

# Assume the public key has been registered in Snowflake:
# private key in PEM format
# List of files in the stage specified in the pipe definition
file_list = [
    '@%diamonds/diamonds_snowpipe_output0.csv.gz',
    '@%diamonds/diamonds_snowpipe_output1.csv.gz',
    '@%diamonds/diamonds_snowpipe_output2.csv.gz'
]

ingest_manager = SimpleIngestManager(
    account='firn',
    host='firn.ap-southeast-2.snowflakecomputing.com',
    user='******',
    pipe='OLIVER_DB.PUBLIC.DIAMONDS_PIPE',
    private_key=private_key_text)

# Same list of files, but wrapped into a class
staged_file_list = []
for file_name in file_list:
    staged_file_list.append(StagedFile(file_name, None))

try:
    resp = ingest_manager.ingest_files(staged_file_list)
except HTTPError as e:
    # HTTP error, may need to retry
    logger.error(e)
    exit(1)

# This means Snowflake has received the files and will start loading
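The snippet ends once the request is accepted; to confirm the load actually completed, one could poll get_history, much as the test example above does. A small, hypothetical follow-up:

import time

# Poll the load history until the submitted files appear or we time out.
deadline = time.time() + 120
while time.time() < deadline:
    history = ingest_manager.get_history()
    if history['files']:
        print(history['files'])
        break
    time.sleep(20)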