def mock_helper(command, *args, **kwargs):
    if '_get_schema_columns' in command:
        raise ProgrammingError(
            "Information schema query returned too much data. Please repeat query with more "
            "selective predicates.", 90030)
    else:
        return original_execute(command, *args, **kwargs)
def mock_helper(command, *args, **kwargs):
    if '_get_schema_columns' in command:
        # Creating exception exactly how SQLAlchemy does
        raise DBAPIError.instance(
            '''
            SELECT /* sqlalchemy:_get_schema_columns */
                   ic.table_name,
                   ic.column_name,
                   ic.data_type,
                   ic.character_maximum_length,
                   ic.numeric_precision,
                   ic.numeric_scale,
                   ic.is_nullable,
                   ic.column_default,
                   ic.is_identity,
                   ic.comment
              FROM information_schema.columns ic
             WHERE ic.table_schema='schema_name'
             ORDER BY ic.ordinal_position''',
            {'table_schema': 'TESTSCHEMA'},
            ProgrammingError(
                "Information schema query returned too much data. Please repeat query with more "
                "selective predicates.", 90030),
            Error,
            hide_parameters=False,
            connection_invalidated=False,
            dialect=SnowflakeDialect(),
            ismulti=None
        )
    else:
        return original_execute(command, *args, **kwargs)
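# A minimal sketch, not taken from the snippets above, of how a mock_helper like the
# ones defined here might be wired in with unittest.mock so that only the SQLAlchemy
# `_get_schema_columns` query fails while every other statement still executes.
# The engine URL and the choice to patch `inspector.bind.execute` are assumptions
# for illustration (SQLAlchemy 1.x style).
from unittest import mock

from sqlalchemy import create_engine, inspect

engine = create_engine('snowflake://user:password@account/db/schema')  # placeholder URL
inspector = inspect(engine)
original_execute = inspector.bind.execute  # real execute used by the pass-through branch

with mock.patch.object(inspector.bind, 'execute', side_effect=mock_helper):
    # This reflection call now hits the simulated "too much data" ProgrammingError;
    # any other statement is delegated to original_execute unchanged.
    inspector.get_columns('users', schema='TESTSCHEMA')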
def test_load_json_objects_to_snowflake_no_existing_table(mock_sf_connection):
    # Mock the Snowflake connection, cursor, and fetchone method.
    mock_cursor = mock_sf_connection.cursor()
    mock_fetchone = mock.Mock(side_effect=ProgrammingError("does not exist"))
    mock_cursor.fetchone = mock_fetchone

    with Flow("test") as f:
        snowflake.load_ga_data_to_snowflake(
            sf_credentials={},
            sf_database="test_database",
            sf_schema="test_schema",
            sf_table="test_table",
            sf_role="test_role",
            sf_warehouse="test_warehouse",
            sf_storage_integration="test_storage_integration",
            bq_dataset="test_dataset",
            gcs_url="gs://test-location",
            date="2020-01-01",
        )

    state = f.run()
    assert state.is_successful()

    mock_cursor.execute.assert_has_calls([
        mock.call(
            "\n SELECT 1 FROM test_database.test_schema.test_table\n WHERE session:date='2020-01-01'\n AND ga_view_id='test_dataset'\n "
        ),  # noqa
        mock.call(
            '\n CREATE TABLE IF NOT EXISTS test_database.test_schema.test_table (\n id number autoincrement start 1 increment 1,\n load_time timestamp_ltz default current_timestamp(),\n ga_view_id string,\n session VARIANT\n );\n '
        ),  # noqa
        mock.call(
            "\n CREATE OR REPLACE STAGE test_database.test_schema.test_table_stage\n URL = 'gcs://test-location'\n STORAGE_INTEGRATION = test_storage_integration\n FILE_FORMAT = (TYPE = JSON);\n "
        ),  # noqa
        mock.call(
            "\n COPY INTO test_database.test_schema.test_table (ga_view_id, session)\n FROM (\n SELECT\n 'test_dataset',\n t.$1\n FROM @test_database.test_schema.test_table_stage t\n )\n PATTERN='.*'\n FORCE=False\n "
        ),  # noqa
    ])
def test_load_s3_data_to_snowflake_no_existing_table(mock_sf_connection):
    # Mock the Snowflake connection, cursor, and fetchone method.
    mock_cursor = mock_sf_connection.cursor()
    mock_fetchone = mock.Mock(side_effect=ProgrammingError("does not exist"))
    mock_cursor.fetchone = mock_fetchone

    task = snowflake.load_s3_data_to_snowflake
    task.run(
        date="2020-01-01",
        date_property='date',
        sf_credentials={},
        sf_database="test_database",
        sf_schema="test_schema",
        sf_table="test_table",
        sf_role="test_role",
        sf_warehouse="test_warehouse",
        sf_storage_integration_name="test_storage_integration",
        s3_url="s3://edx-test/test/",
        file="test_file.csv",
        pattern=".*",
    )

    mock_cursor.execute.assert_has_calls([
        mock.call(
            "\n SELECT 1 FROM test_database.test_schema.test_table\n WHERE date(PROPERTIES:date)=date('2020-01-01')\n "
        ),  # noqa
        mock.call(
            '\n CREATE TABLE IF NOT EXISTS test_database.test_schema.test_table (\n ID NUMBER AUTOINCREMENT START 1 INCREMENT 1,\n LOAD_TIME TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),\n ORIGIN_FILE_NAME VARCHAR(16777216),\n ORIGIN_FILE_LINE NUMBER(38,0),\n ORIGIN_STR VARCHAR(16777216),\n PROPERTIES VARIANT\n );\n '
        ),  # noqa
        mock.call(
            "\n CREATE STAGE IF NOT EXISTS test_database.test_schema.test_table_stage\n URL = 's3://edx-test/test/'\n STORAGE_INTEGRATION = test_storage_integration\n FILE_FORMAT = (TYPE='JSON', STRIP_OUTER_ARRAY=TRUE);\n "
        ),  # noqa
        mock.call(
            "\n COPY INTO test_database.test_schema.test_table (origin_file_name, origin_file_line, origin_str, properties)\n FROM (\n SELECT\n metadata$filename,\n metadata$file_row_number,\n t.$1,\n CASE\n WHEN CHECK_JSON(t.$1) IS NULL THEN t.$1\n ELSE NULL\n END\n FROM @test_database.test_schema.test_table_stage t\n )\n FILES = ( 'test_file.csv' )\n PATTERN = '.*'\n FORCE=False\n "
        ),  # noqa
    ])
def test_export_snowflake_table_to_s3_with_exception(mock_sf_connection):
    mock_cursor = mock_sf_connection.cursor()
    mock_execute = mock.Mock(side_effect=ProgrammingError(
        'Files already existing at the unload destination'))
    mock_cursor.execute = mock_execute

    task = snowflake.export_snowflake_table_to_s3
    with pytest.raises(
            signals.FAIL,
            match="Files already exist. Use overwrite option to force unloading."):
        task.run(
            sf_credentials={},
            sf_database="test_database",
            sf_schema="test_schema",
            sf_table="test_table",
            sf_role="test_role",
            sf_warehouse="test_warehouse",
            sf_storage_integration="test_storage_integration",
            s3_path="s3://edx-test/test/",
            overwrite=False,
        )
def test_load_json_objects_to_snowflake_error_on_table_exist_check(mock_sf_connection):
    # Mock the Snowflake connection, cursor, and fetchone method.
    mock_cursor = mock_sf_connection.cursor()
    mock_fetchone = mock.Mock(side_effect=ProgrammingError())
    mock_cursor.fetchone = mock_fetchone

    with Flow("test") as f:
        snowflake.load_ga_data_to_snowflake(
            sf_credentials={},
            sf_database="test_database",
            sf_schema="test_schema",
            sf_table="test_table",
            sf_role="test_role",
            sf_warehouse="test_warehouse",
            sf_storage_integration="test_storage_integration",
            bq_dataset="test_dataset",
            gcs_url="gs://test-location",
            date="2020-01-01",
        )

    with raise_on_exception():
        with pytest.raises(ProgrammingError):
            f.run()
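# The tests above all receive a `mock_sf_connection` fixture that is never shown.
# A rough sketch of what it could look like, assuming the module under test obtains
# its connection through a `create_snowflake_connection` helper; both the helper
# name and the patch target "prefect_utils.snowflake.create_snowflake_connection"
# are hypothetical.
from unittest import mock

import pytest


@pytest.fixture
def mock_sf_connection():
    connection = mock.MagicMock()
    # Repeated cursor() calls on a MagicMock return the same child mock, so the
    # tests can inspect the exact cursor object the task code used.
    with mock.patch("prefect_utils.snowflake.create_snowflake_connection",
                    return_value=connection):
        yield connection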
def write_pandas(conn: 'SnowflakeConnection',
                 df: 'pandas.DataFrame',
                 table_name: str,
                 database: Optional[str] = None,
                 schema: Optional[str] = None,
                 chunk_size: Optional[int] = None,
                 compression: str = 'gzip',
                 on_error: str = 'abort_statement',
                 parallel: int = 4
                 ) -> Tuple[bool, int, int,
                            Sequence[Tuple[str, str, int, int, int, int,
                                           Optional[str], Optional[int],
                                           Optional[int], Optional[str]]]]:
    """
    Allows users to most efficiently write back a pandas DataFrame to Snowflake by dumping the DataFrame
    into Parquet files, uploading them and finally copying their data into the table. Returns the COPY INTO
    command's results to verify ingestion.

    Returns whether all files were ingested correctly, number of chunks uploaded, and number of rows ingested
    with all of the COPY INTO command's output for debugging purposes.

    :Example:

        import pandas
        from snowflake.connector.pandas_tools import write_pandas

        df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance'])
        success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers')

    @param conn: connection to be used to communicate with Snowflake
    @param df: Dataframe we'd like to write back
    @param table_name: Table name where we want to insert into
    @param database: Database schema and table is in, if not provided the default one will be used
    @param schema: Schema table is in, if not provided the default one will be used
    @param chunk_size: Number of elements to be inserted once, if not provided all elements will be dumped once
    @param compression: The compression used on the Parquet files, can only be gzip, or snappy. Gzip gives
        supposedly a better compression, while snappy is faster. Use whichever is more appropriate.
    @param on_error: Action to take when COPY INTO statements fail, default follows documentation at:
        https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions
    @param parallel: Number of threads to be used when uploading chunks, default follows documentation at:
        https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters
    @return: tuple of whether all chunks were ingested correctly, # of chunks, # of ingested rows, and ingest's output
    """
    if database is not None and schema is None:
        raise ProgrammingError(
            "Schema has to be provided to write_pandas when a database is provided"
        )
    # This dictionary maps the compression algorithm to Snowflake put copy into command type
    # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet
    compression_map = {'gzip': 'auto', 'snappy': 'snappy'}
    if compression not in compression_map.keys():
        raise ProgrammingError(
            "Invalid compression '{}', only acceptable values are: {}".format(
                compression, compression_map.keys()))
    location = (('"' + database + '".') if database else ''
                + ('"' + schema + '".') if schema else ''
                + ('"' + table_name + '"'))
    if chunk_size is None:
        chunk_size = len(df)
    cursor = conn.cursor()
    stage_name = None  # Forward declaration
    while True:
        try:
            stage_name = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(5))
            cursor.execute(
                'create temporary stage /* Python:snowflake.connector.pandas_tools.write_pandas() */ '
                '"{stage_name}"'.format(stage_name=stage_name),
                _is_internal=True).fetchall()
            break
        except ProgrammingError as pe:
            if pe.msg.endswith('already exists.'):
                continue
            raise

    with TemporaryDirectory() as tmp_folder:
        for i, chunk in chunk_helper(df, chunk_size):
            chunk_path = os.path.join(tmp_folder, 'file{}.txt'.format(i))
            # Dump chunk into parquet file
            chunk.to_parquet(chunk_path, compression=compression)
            # Upload parquet file
            cursor.execute(
                'PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ '
                'file://{path} @"{stage_name}" PARALLEL={parallel}'.format(
                    path=chunk_path, stage_name=stage_name, parallel=parallel),
                _is_internal=True)
            # Remove chunk file
            os.remove(chunk_path)

    copy_results = cursor.execute((
        'COPY INTO {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ '
        'FROM @"{stage_name}" FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression}) '
        'MATCH_BY_COLUMN_NAME=CASE_SENSITIVE PURGE=TRUE ON_ERROR={on_error}'
    ).format(location=location,
             stage_name=stage_name,
             compression=compression_map[compression],
             on_error=on_error),
        _is_internal=True).fetchall()
    cursor.close()
    return (all((e[1] == 'LOADED' for e in copy_results)),
            len(copy_results),
            sum((e[3] for e in copy_results)),
            copy_results)
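# All write_pandas variants in this section rely on a chunk_helper generator that is
# not shown. The sketch below is consistent with how it is used here (yielding a
# chunk index plus a slice of the DataFrame of at most chunk_size rows); the real
# helper lives in snowflake.connector.pandas_tools and may differ in detail.
def chunk_helper(lst, n):
    """Yield (chunk_index, chunk) pairs of at most n rows each."""
    for i in range(0, len(lst), n):
        yield int(i / n), lst[i:i + n]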
def write_pandas(conn: 'SnowflakeConnection',
                 df: 'pandas.DataFrame',
                 table_name: str,
                 database: Optional[str] = None,
                 schema: Optional[str] = None,
                 chunk_size: Optional[int] = None,
                 compression: str = 'gzip',
                 on_error: str = 'abort_statement',
                 parallel: int = 4,
                 quote_identifiers: bool = True
                 ) -> Tuple[bool, int, int,
                            Sequence[Tuple[str, str, int, int, int, int,
                                           Optional[str], Optional[int],
                                           Optional[int], Optional[str]]]]:
    """Allows users to most efficiently write back a pandas DataFrame to Snowflake.

    It works by dumping the DataFrame into Parquet files, uploading them and finally copying their data into the table.

    Returns whether all files were ingested correctly, number of chunks uploaded, and number of rows ingested
    with all of the COPY INTO command's output for debugging purposes.

    Example usage:
        import pandas
        from snowflake.connector.pandas_tools import write_pandas

        df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance'])
        success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers')

    Args:
        conn: Connection to be used to communicate with Snowflake.
        df: Dataframe we'd like to write back.
        table_name: Table name where we want to insert into.
        database: Database schema and table is in, if not provided the default one will be used (Default value = None).
        schema: Schema table is in, if not provided the default one will be used (Default value = None).
        chunk_size: Number of elements to be inserted once, if not provided all elements will be dumped once
            (Default value = None).
        compression: The compression used on the Parquet files, can only be gzip, or snappy. Gzip gives supposedly a
            better compression, while snappy is faster. Use whichever is more appropriate (Default value = 'gzip').
        on_error: Action to take when COPY INTO statements fail, default follows documentation at:
            https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions
            (Default value = 'abort_statement').
        parallel: Number of threads to be used when uploading chunks, default follows documentation at:
            https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4).
        quote_identifiers: By default, identifiers, specifically database, schema, table and column names
            (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting.
            I.e. identifiers will be coerced to uppercase by Snowflake. (Default value = True)

    Returns:
        Returns the COPY INTO command's results to verify ingestion in the form of a tuple of whether all chunks were
        ingested correctly, # of chunks, # of ingested rows, and ingest's output.
    """
    if database is not None and schema is None:
        raise ProgrammingError(
            "Schema has to be provided to write_pandas when a database is provided"
        )
    # This dictionary maps the compression algorithm to Snowflake put copy into command type
    # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet
    compression_map = {'gzip': 'auto', 'snappy': 'snappy'}
    if compression not in compression_map.keys():
        raise ProgrammingError(
            "Invalid compression '{}', only acceptable values are: {}".format(
                compression, compression_map.keys()))
    if quote_identifiers:
        location = ((('"' + database + '".') if database else '')
                    + (('"' + schema + '".') if schema else '')
                    + ('"' + table_name + '"'))
    else:
        location = ((database + '.' if database else '')
                    + (schema + '.' if schema else '')
                    + (table_name))
    if chunk_size is None:
        chunk_size = len(df)
    cursor = conn.cursor()
    stage_name = None  # Forward declaration
    while True:
        try:
            stage_name = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(5))
            create_stage_sql = (
                'create temporary stage /* Python:snowflake.connector.pandas_tools.write_pandas() */ '
                '"{stage_name}"').format(stage_name=stage_name)
            logger.debug("creating stage with '{}'".format(create_stage_sql))
            cursor.execute(create_stage_sql, _is_internal=True).fetchall()
            break
        except ProgrammingError as pe:
            if pe.msg.endswith('already exists.'):
                continue
            raise

    with TemporaryDirectory() as tmp_folder:
        for i, chunk in chunk_helper(df, chunk_size):
            chunk_path = os.path.join(tmp_folder, 'file{}.txt'.format(i))
            # Dump chunk into parquet file
            chunk.to_parquet(chunk_path, compression=compression)
            # Upload parquet file
            upload_sql = (
                'PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ '
                '\'file://{path}\' @"{stage_name}" PARALLEL={parallel}'
            ).format(path=chunk_path.replace('\\', '\\\\').replace('\'', '\\\''),
                     stage_name=stage_name,
                     parallel=parallel)
            logger.debug("uploading files with '{}'".format(upload_sql))
            cursor.execute(upload_sql, _is_internal=True)
            # Remove chunk file
            os.remove(chunk_path)

    if quote_identifiers:
        columns = '"' + '","'.join(list(df.columns)) + '"'
    else:
        columns = ','.join(list(df.columns))

    # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
    # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
    parquet_columns = '$1:' + ',$1:'.join(df.columns)
    copy_into_sql = (
        'COPY INTO {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ '
        '({columns}) '
        'FROM (SELECT {parquet_columns} FROM @"{stage_name}") '
        'FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression}) '
        'PURGE=TRUE ON_ERROR={on_error}').format(
            location=location,
            columns=columns,
            parquet_columns=parquet_columns,
            stage_name=stage_name,
            compression=compression_map[compression],
            on_error=on_error)
    logger.debug("copying into with '{}'".format(copy_into_sql))
    copy_results = cursor.execute(copy_into_sql, _is_internal=True).fetchall()
    cursor.close()
    return (all(e[1] == 'LOADED' for e in copy_results),
            len(copy_results),
            sum(e[3] for e in copy_results),
            copy_results)
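# A small, self-contained illustration (not from the source) of what the
# quote_identifiers branch above produces for the COPY INTO target: quoting
# preserves case, while unquoted identifiers are upper-cased by Snowflake.
def build_location(database, schema, table_name, quote_identifiers=True):
    if quote_identifiers:
        return ((('"' + database + '".') if database else '')
                + (('"' + schema + '".') if schema else '')
                + ('"' + table_name + '"'))
    return ((database + '.' if database else '')
            + (schema + '.' if schema else '')
            + table_name)


assert build_location('MyDb', 'MySchema', 'MyTable') == '"MyDb"."MySchema"."MyTable"'
assert build_location('MyDb', 'MySchema', 'MyTable', quote_identifiers=False) == 'MyDb.MySchema.MyTable'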
def write_pandas(
    conn: SnowflakeConnection,
    df: pandas.DataFrame,
    table_name: str,
    database: str | None = None,
    schema: str | None = None,
    chunk_size: int | None = None,
    compression: str = "gzip",
    on_error: str = "abort_statement",
    parallel: int = 4,
    quote_identifiers: bool = True,
    auto_create_table: bool = False,
    create_temp_table: bool = False,
) -> tuple[
    bool,
    int,
    int,
    Sequence[
        tuple[
            str,
            str,
            int,
            int,
            int,
            int,
            str | None,
            int | None,
            int | None,
            str | None,
        ]
    ],
]:
    """Allows users to most efficiently write back a pandas DataFrame to Snowflake.

    It works by dumping the DataFrame into Parquet files, uploading them and finally copying their data into the table.

    Returns whether all files were ingested correctly, number of chunks uploaded, and number of rows ingested
    with all of the COPY INTO command's output for debugging purposes.

    Example usage:
        import pandas
        from snowflake.connector.pandas_tools import write_pandas

        df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance'])
        success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers')

    Args:
        conn: Connection to be used to communicate with Snowflake.
        df: Dataframe we'd like to write back.
        table_name: Table name where we want to insert into.
        database: Database schema and table is in, if not provided the default one will be used (Default value = None).
        schema: Schema table is in, if not provided the default one will be used (Default value = None).
        chunk_size: Number of elements to be inserted once, if not provided all elements will be dumped once
            (Default value = None).
        compression: The compression used on the Parquet files, can only be gzip, or snappy. Gzip gives supposedly a
            better compression, while snappy is faster. Use whichever is more appropriate (Default value = 'gzip').
        on_error: Action to take when COPY INTO statements fail, default follows documentation at:
            https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions
            (Default value = 'abort_statement').
        parallel: Number of threads to be used when uploading chunks, default follows documentation at:
            https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4).
        quote_identifiers: By default, identifiers, specifically database, schema, table and column names
            (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting.
            I.e. identifiers will be coerced to uppercase by Snowflake. (Default value = True)
        auto_create_table: When true, will automatically create a table with corresponding columns for each column in
            the passed in DataFrame. The table will not be created if it already exists
        create_temp_table: Will make the auto-created table as a temporary table

    Returns:
        Returns the COPY INTO command's results to verify ingestion in the form of a tuple of whether all chunks were
        ingested correctly, # of chunks, # of ingested rows, and ingest's output.
    """
    if database is not None and schema is None:
        raise ProgrammingError(
            "Schema has to be provided to write_pandas when a database is provided"
        )
    # This dictionary maps the compression algorithm to Snowflake put copy into command type
    # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet
    compression_map = {"gzip": "auto", "snappy": "snappy"}
    if compression not in compression_map.keys():
        raise ProgrammingError(
            "Invalid compression '{}', only acceptable values are: {}".format(
                compression, compression_map.keys()
            )
        )
    if quote_identifiers:
        location = (
            (('"' + database + '".') if database else "")
            + (('"' + schema + '".') if schema else "")
            + ('"' + table_name + '"')
        )
    else:
        location = (
            (database + "." if database else "")
            + (schema + "." if schema else "")
            + (table_name)
        )
    if chunk_size is None:
        chunk_size = len(df)
    cursor = conn.cursor()
    stage_name = None  # Forward declaration
    while True:
        try:
            stage_name = "".join(
                random.choice(string.ascii_lowercase) for _ in range(5)
            )
            create_stage_sql = (
                "create temporary stage /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
                '"{stage_name}"'
            ).format(stage_name=stage_name)
            logger.debug(f"creating stage with '{create_stage_sql}'")
            cursor.execute(create_stage_sql, _is_internal=True).fetchall()
            break
        except ProgrammingError as pe:
            if pe.msg.endswith("already exists."):
                continue
            raise

    with TemporaryDirectory() as tmp_folder:
        for i, chunk in chunk_helper(df, chunk_size):
            chunk_path = os.path.join(tmp_folder, f"file{i}.txt")
            # Dump chunk into parquet file
            chunk.to_parquet(chunk_path, compression=compression)
            # Upload parquet file
            upload_sql = (
                "PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
                "'file://{path}' @\"{stage_name}\" PARALLEL={parallel}"
            ).format(
                path=chunk_path.replace("\\", "\\\\").replace("'", "\\'"),
                stage_name=stage_name,
                parallel=parallel,
            )
            logger.debug(f"uploading files with '{upload_sql}'")
            cursor.execute(upload_sql, _is_internal=True)
            # Remove chunk file
            os.remove(chunk_path)

    if quote_identifiers:
        columns = '"' + '","'.join(list(df.columns)) + '"'
    else:
        columns = ",".join(list(df.columns))

    if auto_create_table:
        file_format_name = None
        while True:
            try:
                file_format_name = (
                    '"'
                    + "".join(random.choice(string.ascii_lowercase) for _ in range(5))
                    + '"'
                )
                file_format_sql = (
                    f"CREATE FILE FORMAT {file_format_name} "
                    f"/* Python:snowflake.connector.pandas_tools.write_pandas() */ "
                    f"TYPE=PARQUET COMPRESSION={compression_map[compression]}"
                )
                logger.debug(f"creating file format with '{file_format_sql}'")
                cursor.execute(file_format_sql, _is_internal=True)
                break
            except ProgrammingError as pe:
                if pe.msg.endswith("already exists."):
                    continue
                raise
        infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@\"{stage_name}\"', file_format=>'{file_format_name}'))"
        logger.debug(f"inferring schema with '{infer_schema_sql}'")
        column_type_mapping = dict(
            cursor.execute(infer_schema_sql, _is_internal=True).fetchall()
        )
        # Infer schema can return the columns out of order depending on the chunking we do when uploading
        # so we have to iterate through the dataframe columns to make sure we create the table with its
        # columns in order
        quote = '"' if quote_identifiers else ""
        create_table_columns = ", ".join(
            [f"{quote}{c}{quote} {column_type_mapping[c]}" for c in df.columns]
        )
        create_table_sql = (
            f"CREATE {'TEMP ' if create_temp_table else ''}TABLE IF NOT EXISTS {location} "
            f"({create_table_columns})"
            f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
        )
        logger.debug(f"auto creating table with '{create_table_sql}'")
        cursor.execute(create_table_sql, _is_internal=True)
        drop_file_format_sql = f"DROP FILE FORMAT IF EXISTS {file_format_name}"
        logger.debug(f"dropping file format with '{drop_file_format_sql}'")
        cursor.execute(drop_file_format_sql, _is_internal=True)

    # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
    # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
    if quote_identifiers:
        parquet_columns = "$1:" + ",$1:".join(f'"{c}"' for c in df.columns)
    else:
        parquet_columns = "$1:" + ",$1:".join(df.columns)
    copy_into_sql = (
        "COPY INTO {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
        "({columns}) "
        'FROM (SELECT {parquet_columns} FROM @"{stage_name}") '
        "FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression}) "
        "PURGE=TRUE ON_ERROR={on_error}"
    ).format(
        location=location,
        columns=columns,
        parquet_columns=parquet_columns,
        stage_name=stage_name,
        compression=compression_map[compression],
        on_error=on_error,
    )
    logger.debug(f"copying into with '{copy_into_sql}'")
    copy_results = cursor.execute(copy_into_sql, _is_internal=True).fetchall()
    cursor.close()
    return (
        all(e[1] == "LOADED" for e in copy_results),
        len(copy_results),
        sum(int(e[3]) for e in copy_results),
        copy_results,
    )