def main():
    """Ingest new Azure blob files for every configured 'data' pipe via Snowpipe.

    For each pipe whose comment metadata marks it as an Azure pipe, lists the
    blobs in the configured container, keeps those created after the newest
    already-loaded row in the target table, and submits them to the Snowpipe
    REST API.
    """
    for pipe in db.get_pipes('data'):
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; pipe comments are presumably operator-controlled,
        # but prefer yaml.safe_load — confirm before changing.
        metadata = yaml.load(pipe['comment'])

        # BUG FIX: previously `if metadata and ...` let a pipe with empty or
        # unparseable metadata fall through and crash on metadata['blob'].
        # Skip both no-metadata and non-Azure pipes.
        if not metadata or metadata.get('type') != 'Azure':
            log.info(f"{pipe['name']} is not an Azure pipe, and will be skipped.")
            continue

        blob_name = metadata['blob']
        account_name = metadata['account']
        pipe_name = pipe['name']
        table = metadata['target']

        # Prefer a SAS token from the environment; fall back to pipe metadata.
        sas_token_envar = 'AZURE_SAS_TOKEN_' + metadata.get('suffix', '')
        if sas_token_envar in environ:
            encrypted_sas_token = environ.get(sas_token_envar)
        elif 'encrypted_sas_token' in metadata:
            encrypted_sas_token = metadata['encrypted_sas_token']
        else:
            log.info(f"{pipe['name']} has no azure auth")
            continue

        sas_token = vault.decrypt_if_encrypted(encrypted_sas_token)
        log.info(f"Now working on pipe {pipe_name}")

        endpoint_suffix = metadata.get('endpoint_suffix', 'core.windows.net')
        block_blob_service = BlockBlobService(
            account_name=account_name,
            sas_token=sas_token,
            endpoint_suffix=endpoint_suffix,
        )
        files = block_blob_service.list_blobs(blob_name)

        # Only ingest blobs newer than the last loaded timestamp; if the
        # table is empty (no timestamp), ingest everything.
        newest_time = get_timestamp(table)
        new_files = [
            StagedFile(file.name, None)
            for file in files
            if not newest_time or file.properties.creation_time > newest_time
        ]
        log.info(new_files)

        # Proxy object that abstracts the Snowpipe REST API
        ingest_manager = SimpleIngestManager(
            account=environ.get('SNOWFLAKE_ACCOUNT'),
            host=f'{environ.get("SNOWFLAKE_ACCOUNT")}.snowflakecomputing.com',
            user=environ.get('SA_USER'),
            pipe=f'SNOWALERT.DATA.{pipe_name}',
            private_key=load_pkb_rsa(PRIVATE_KEY, PRIVATE_KEY_PASSWORD),
        )

        if len(new_files) > 0:
            try:
                response = ingest_manager.ingest_files(new_files)
                log.info(response)
            except Exception as e:
                # Abort the whole run on an ingest failure, as before.
                log.error(e)
                return
def ingest(self, files: Iterable[str]) -> None:
    """Wrap the given file names as staged files, submit them to Snowpipe,
    and print the JSON response body.

    Raises ValueError when *files* is empty.
    """
    if not files:
        raise ValueError('files must be defined')

    request_id = uuid.uuid4()
    self.logger.debug(f'request_id: {request_id}')

    # Snowpipe expects StagedFile objects, not bare names.
    staged = []
    for name in files:
        staged.append(StagedFile(name, None))

    response_body = self.ingest_manager.ingest_files(
        staged_files=staged,
        request_id=request_id,
    )
    print(json.dumps(response_body, indent=4))
def test_simple_ingest(connection_ctx, test_util):
    """End-to-end Snowpipe check: create table/stage/pipe, stage one file,
    ingest it, then poll history (up to ~2 minutes) until it shows up."""
    param = connection_ctx['param']
    pipe_name = '{}.{}.TEST_SIMPLE_INGEST_PIPE'.format(param['database'], param['schema'])
    private_key = test_util.read_private_key()
    print(private_key)

    cursor = connection_ctx['cnx'].cursor()
    test_file = os.path.join(test_util.get_data_dir(), 'test_file.csv')

    # Build the target table, stage the CSV, and point a pipe at the stage.
    cursor.execute(
        'create or replace table TEST_SIMPLE_INGEST_TABLE(c1 number, c2 string)'
    )
    cursor.execute('create or replace stage TEST_SIMPLE_INGEST_STAGE')
    cursor.execute('put file://{} @TEST_SIMPLE_INGEST_STAGE'.format(test_file))
    cursor.execute(
        'create or replace pipe {0} as copy into TEST_SIMPLE_INGEST_TABLE '
        'from @TEST_SIMPLE_INGEST_STAGE'.format(pipe_name))

    ingest_manager = SimpleIngestManager(account=param['account'],
                                         user=param['user'],
                                         private_key=private_key,
                                         pipe=pipe_name,
                                         scheme=param['protocol'],
                                         host=param['host'],
                                         port=param['port'])

    # `put` gzips on upload, hence the .gz suffix.
    resp = ingest_manager.ingest_files([StagedFile('test_file.csv.gz', None)])
    assert resp['responseCode'] == 'SUCCESS'

    # Poll load history for up to two minutes.
    deadline = time.time() + 120
    while time.time() < deadline:
        history_resp = ingest_manager.get_history()
        if len(history_resp['files']) != 1:
            # wait for 20 seconds
            time.sleep(20)
            continue
        assert history_resp['files'][0]['path'] == 'test_file.csv.gz'
        return

    # File never appeared in history within the time budget.
    assert False
def ingest(table_name, options):
    """Generator: resume the table's pipe, find blobs newer than the latest
    loaded row, ingest them in batches via Snowpipe, and yield each batch size.
    """
    base_name = re.sub(r'_CONNECTION$', '', table_name)
    sas_token = vault.decrypt_if_encrypted(options['sas_token'])

    blob_service = BlockBlobService(account_name=options['storage_account'],
                                    sas_token=sas_token,
                                    endpoint_suffix=options['suffix'])

    # Make sure the pipe is running before we hand it files.
    db.execute(f"select SYSTEM$PIPE_FORCE_RESUME('DATA.{base_name}_PIPE');")

    last_loaded = db.fetch_latest(f'data.{table_name}', 'loaded_on')
    log.info(f"Last loaded time is {last_loaded}")

    # Keep only blobs created after the last load (or all, on first run).
    new_files = []
    for blob in blob_service.list_blobs(options['container_name']):
        if last_loaded is None or blob.properties.creation_time > last_loaded:
            new_files.append(StagedFile(blob.name, None))
    log.info(f"Found {len(new_files)} files to ingest")

    # Proxy object that abstracts the Snowpipe REST API
    snowflake_account = options['snowflake_account']
    database = options['database']
    ingest_manager = SimpleIngestManager(
        account=snowflake_account,
        host=f'{snowflake_account}.snowflakecomputing.com',
        user=options['sa_user'],
        pipe=f'{database}.data.{base_name}_PIPE',
        private_key=load_pkb_rsa(PRIVATE_KEY, PRIVATE_KEY_PASSWORD))

    if len(new_files) > 0:
        # Snowpipe caps files per request; batch below the limit.
        for batch in groups_of(4999, new_files):
            response = ingest_manager.ingest_files(batch)
            log.info(response)
            yield len(batch)
# List of files in the stage specified in the pipe definition
file_list = ['departments.csv']

ingest_manager = SimpleIngestManager(
    account='<>',
    host='<>.<>.snowflakecomputing.com',
    user='******',
    pipe='dataload_db.dataload.load_dept_pipe',
    private_key=private_key_text)

# List of files, but wrapped into a class
staged_file_list = []
for file_name in file_list:
    print(file_name)
    staged_file_list.append(StagedFile(file_name, None))

for staged_file in staged_file_list:
    print(staged_file)

try:
    resp = ingest_manager.ingest_files(staged_file_list)
    print(resp)
except HTTPError as e:
    # HTTP error, may need to retry
    logger.error(e)
    exit(1)

# This means Snowflake has received file and will start loading
assert (resp['responseCode'] == 'SUCCESS')
def load_via_snowpipe(self, s3_key, stream):
    """Perform data transfer from the stage to Snowflake using Snowpipe.

    Creates a per-table pipe, submits the staged file for ingestion (with
    retry/backoff on HTTP errors), waits for the load to appear in pipe
    history, then drops the pipe.

    :param s3_key: key of the already-staged file to ingest
    :param stream: tap stream name used to derive the target table
    """

    def _generate_pipe_name(dbname, schema_table_name):
        # Identifier quoting is not valid inside a pipe name.
        stripped_db_name = dbname.replace('"', '')
        stripped_table_name = schema_table_name.replace('"', '')
        return f"{stripped_db_name}.{stripped_table_name}_s3_pipe"

    def _generate_pipe_args(pipe_name, schema_table_name, columns_with_trans):
        # Substitution values for the CREATE PIPE statement template.
        pipe_args = dict(
            pipe_name=pipe_name,
            db_name=self.connection_config['dbname'],
            obj_name=schema_table_name,
            stage=self.connection_config['stage'],
            file_format=self.connection_config['file_format'],
            cols=', '.join([c['name'] for c in columns_with_trans]),
        )
        return pipe_args

    def _load_private_key():
        # BUG FIX: connection_config is a dict (it is subscripted with []
        # everywhere else in this method), so the previous
        # getattr(self.connection_config, ...) ALWAYS returned the default,
        # silently ignoring any configured key path/password. Use .get().
        key_path = self.connection_config.get("private_key_path", "/rsa_key.p8")
        password = self.connection_config.get("private_key_password", None)
        # NOTE(review): load_pem_private_key expects bytes for a non-None
        # password — confirm callers store it as bytes, or encode here.
        with open(key_path, 'rb') as pem_in:
            private_key_obj = load_pem_private_key(
                pem_in.read(),
                password=password,
                backend=default_backend())
        private_key_text = private_key_obj.private_bytes(
            Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')
        return private_key_text

    def _increment_value(exponentially=False):
        # Fibonacci-driven wait times: 2**fib seconds when exponential,
        # otherwise fib + 30 seconds.
        previous = 0
        current = 1
        while True:
            yield 2**(current + previous) if exponentially \
                else current + previous + 30
            current = current + previous
            previous = current - previous

    self.logger.info("Loading data using Snowpipe.")

    # Get list of columns with types and transformation
    columns_with_trans = [{
        "name": safe_column_name(name),
        "trans": column_trans(schema)
    } for (name, schema) in self.flatten_schema.items()]

    schema_table_name = self.table_name(stream, False)
    db_name = self.connection_config['dbname']
    pipe_name = _generate_pipe_name(db_name, schema_table_name)
    pipe_args = _generate_pipe_args(pipe_name, schema_table_name,
                                    columns_with_trans)

    create_pipe_sql = """create pipe {pipe_name} as
        copy into {db_name}.{obj_name} ({cols})
        from @{db_name}.{stage}
        file_format = (format_name = {db_name}.{file_format}
        );""".format(**pipe_args)
    drop_pipe_sql = f"drop pipe if exists {pipe_name};"

    # Create snowpipe
    try:
        self.logger.debug("Creating snowpipe - %s.", pipe_name)
        # primary key in records found, raise warning
        if len(self.stream_schema_message['key_properties']) > 0:
            self.logger.warning(
                "Primary key %s found in the data stream. Snowpipe can not be used to "
                "consolidate records based upon keys. It can just copy data. "
                "Please refer the docs for further details",
                self.stream_schema_message['key_properties'])
        # primary key not present in the records, perform copy
        self.query(create_pipe_sql)
    except ProgrammingError as error:
        # Best-effort: the pipe may already exist; log and continue.
        self.logger.error(
            "An error was encountered while creating the snowpipe, %s", error)

    # Private key encryption required to perform snowpipe data transfer
    private_key_text = _load_private_key()

    ingest_manager = SimpleIngestManager(
        account=self.connection_config['account'].split('.')[0],
        host=self.connection_config['account'] + '.snowflakecomputing.com',
        user=self.connection_config['user'],
        pipe=pipe_name,
        scheme='https',
        port=443,
        private_key=private_key_text)

    # List of files, but wrapped into a class
    staged_file_list = [StagedFile(s3_key, None)]

    # Ingest files using snowpipe; retry with exponential backoff on HTTP errors.
    retries = self.connection_config.get('max_retry', 5)
    wait_time = _increment_value(exponentially=True)
    while True:
        try:
            self.logger.debug(
                "Starting to ingest file via snowpipe, retries left %s", retries)
            resp = ingest_manager.ingest_files(staged_file_list)
            # (typo fix: was "recived")
            self.logger.info(
                "Snowpipe has received the files and will now start loading: %s",
                resp['responseCode'])
            break
        except HTTPError as e:
            # HTTP error, wait and retry, exit if still fails
            self.logger.error(e)
            time.sleep(next(wait_time))
            retries -= 1
            if not retries:
                # BUG FIX: was self.logger.critcal — AttributeError on the
                # failure path, masking the real error.
                self.logger.critical(
                    "Max retry limit reached, Failed to load data using snowpipe"
                )
                sys.exit(1)

    # Needs to wait for a while to perform transfer, delete pipe after transfer
    wait_time = _increment_value()
    while True:
        history_resp = ingest_manager.get_history()
        if len(history_resp['files']) > 0:
            self.logger.info(
                '''Ingest Report for snowpipe : %s
                STATUS: %s
                rowsInserted(rowsParsed): %s(%s)''',
                history_resp['pipe'],
                history_resp['completeResult'],
                history_resp['files'][0]['rowsInserted'],
                history_resp['files'][0]['rowsParsed'])
            self.query(drop_pipe_sql)
            break
        else:
            self.logger.debug('waiting for snowpipe to transfer data...')
            time.sleep(next(wait_time))