Example #1
def pd_writer(table: pandas.io.sql.SQLTable,
              conn: Union['sqlalchemy.engine.Engine',
                          'sqlalchemy.engine.Connection'], keys: Iterable,
              data_iter: Iterable) -> None:
    """
    This is a wrapper on top of write_pandas to make it compatible with the to_sql method in pandas.
    :Example:

    import pandas as pd
    from snowflake.connector.pandas_tools import pd_writer

    sf_connector_version_df = pd.DataFrame([('snowflake-connector-python', '1.0')], columns=['NAME', 'NEWEST_VERSION'])
    sf_connector_version_df.to_sql('driver_versions', engine, index=False, method=pd_writer)

    @param table: Pandas package's table object
    @param conn: SQLAlchemy engine object to talk to Snowflake
    @param keys: Column names that we are trying to insert
    @param data_iter: Iterator over the rows
    @return: None
    """
    sf_connection = conn.connection.connection
    df = pandas.DataFrame(data_iter, columns=keys)
    write_pandas(
        conn=sf_connection,
        df=df,
        # Note: Our sqlalchemy connector creates tables case insensitively
        table_name=table.name.upper(),
        schema=table.schema)
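# Usage sketch (hedged): the engine below is built with snowflake-sqlalchemy's URL
# helper; every connection parameter is a placeholder, not a value from this project.
# Passing pd_writer as `method` makes pandas delegate the load to write_pandas
# (PUT + COPY INTO) instead of issuing row-by-row INSERTs.
import pandas as pd
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL
from snowflake.connector.pandas_tools import pd_writer

engine = create_engine(URL(
    account="my_account", user="my_user", password="my_password",   # placeholders
    database="MY_DB", schema="PUBLIC", warehouse="MY_WH",            # placeholders
))

versions_df = pd.DataFrame([("snowflake-connector-python", "1.0")],
                           columns=["NAME", "NEWEST_VERSION"])
versions_df.to_sql("driver_versions", engine, index=False, method=pd_writer)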
def test_auto_create_table_similar_column_names(
    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
):
    """Tests that column names differing only in case do not cause issues when auto-creating a table."""
    table_name = random_string(5, "numbas_")
    df_data = [(10, 11), (20, 21)]

    df = pandas.DataFrame(df_data, columns=["number", "Number"])
    select_sql = f'SELECT * FROM "{table_name}"'
    drop_sql = f'DROP TABLE IF EXISTS "{table_name}"'
    with conn_cnx() as cnx:
        try:
            success, nchunks, nrows, _ = write_pandas(cnx,
                                                      df,
                                                      table_name,
                                                      quote_identifiers=True,
                                                      auto_create_table=True)

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                assert (
                    row["number"],
                    row["Number"],
                ) in df_data
        finally:
            cnx.execute_string(drop_sql)
def pd_writer(table: 'pandas.io.sql.SQLTable',
              conn: Union['sqlalchemy.engine.Engine', 'sqlalchemy.engine.Connection'],
              keys: Iterable,
              data_iter: Iterable,
              quote_identifiers: bool = True) -> None:
    """This is a wrapper on top of write_pandas to make it compatible with to_sql method in pandas.

        Example usage:
            import pandas as pd
            from snowflake.connector.pandas_tools import pd_writer

            sf_connector_version_df = pd.DataFrame([('snowflake-connector-python', '1.0')], columns=['NAME', 'NEWEST_VERSION'])
            sf_connector_version_df.to_sql('driver_versions', engine, index=False, method=pd_writer)

            # to use quote_identifiers=False
            from functools import partial
            sf_connector_version_df.to_sql(
                'driver_versions', engine, index=False, method=partial(pd_writer, quote_identifiers=False))

    Args:
        table: Pandas package's table object.
        conn: SQLAlchemy engine object to talk to Snowflake.
        keys: Column names that we are trying to insert.
        data_iter: Iterator over the rows.
        quote_identifiers: if True (default), quote identifiers passed to Snowflake. If False, identifiers are not
            quoted (and typically coerced to uppercase by Snowflake).
    """
    sf_connection = conn.connection.connection
    df = pandas.DataFrame(data_iter, columns=keys)
    write_pandas(conn=sf_connection,
                 df=df,
                 # Note: Our sqlalchemy connector creates tables case insensitively
                 table_name=table.name.upper(),
                 schema=table.schema,
                 quote_identifiers=quote_identifiers)
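# Sketch (hedged): with a raw snowflake.connector connection you can skip SQLAlchemy
# and call write_pandas directly, which is what pd_writer does under the hood.
# Connection parameters below are placeholders.
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

cnx = snowflake.connector.connect(
    account="my_account", user="my_user", password="my_password",   # placeholders
    database="MY_DB", schema="PUBLIC", warehouse="MY_WH",            # placeholders
)
try:
    versions_df = pd.DataFrame([("snowflake-connector-python", "1.0")],
                               columns=["NAME", "NEWEST_VERSION"])
    # quote_identifiers=True keeps the given casing; with False, Snowflake folds
    # unquoted identifiers to uppercase.
    success, nchunks, nrows, _ = write_pandas(
        cnx, versions_df, "driver_versions", quote_identifiers=True)
finally:
    cnx.close()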
Example #4
def test_default_value_insertion(
    conn_cnx: Callable[..., Generator["SnowflakeConnection", None, None]],
    quote_identifiers: bool,
):
    """Tests whether default values can be successfully inserted with the pandas writeback."""
    table_name = "users"
    df_data = [("Mark", 10), ("Luke", 20)]

    # Create a DataFrame containing data about customers
    df = pandas.DataFrame(df_data, columns=["name", "balance"])
    # The SQL below assumes quote_identifiers is True; if it is False, the double quotes are removed below
    create_sql = """CREATE OR REPLACE TABLE "{}"
                 ("name" STRING, "balance" INT,
                 "id" varchar(36) default uuid_string(),
                 "ts" timestamp_ltz default current_timestamp)""".format(
        table_name)
    select_sql = 'SELECT * FROM "{}"'.format(table_name)
    drop_sql = 'DROP TABLE IF EXISTS "{}"'.format(table_name)
    if not quote_identifiers:
        create_sql = create_sql.replace('"', "")
        select_sql = select_sql.replace('"', "")
        drop_sql = drop_sql.replace('"', "")
    with conn_cnx() as cnx:  # type: SnowflakeConnection
        cnx.execute_string(create_sql)
        try:
            success, nchunks, nrows, _ = write_pandas(
                cnx, df, table_name, quote_identifiers=quote_identifiers)

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                assert (row["id" if quote_identifiers else "ID"]
                        is not None)  # ID (UUID String)
                assert len(row["id" if quote_identifiers else "ID"]) == 36
                assert (row["ts" if quote_identifiers else "TS"]
                        is not None)  # TS (Current Timestamp)
                assert isinstance(row["ts" if quote_identifiers else "TS"],
                                  datetime)
                assert (
                    row["name" if quote_identifiers else "NAME"],
                    row["balance" if quote_identifiers else "BALANCE"],
                ) in df_data
        finally:
            cnx.execute_string(drop_sql)
Example #5
def test_resultbatches_pandas_functionality(conn_cnx):
    """Fetch ArrowResultBatches as pandas dataframes and check its result."""
    rowcount = 100000
    expected_df = pandas.DataFrame(data={"A": range(rowcount)})
    with conn_cnx() as con:
        with con.cursor() as cur:
            cur.execute(
                f"select seq4() a from table(generator(rowcount => {rowcount}));"
            )
            assert cur._result_set.total_row_index() == rowcount
            result_batches = cur.get_result_batches()
            assert len(result_batches) > 1
    tables = itertools.chain.from_iterable(
        list(b.create_iter(iter_unit=TABLE_UNIT)) for b in result_batches)
    final_df = pyarrow.concat_tables(tables).to_pandas()
    assert numpy.array_equal(expected_df, final_df)
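# Sketch (hedged): ResultBatch objects also expose to_pandas(), so the same check
# can be written without going through pyarrow explicitly. `cur` is assumed to be
# a cursor that has already executed the query.
import pandas

batches = cur.get_result_batches()
# Each batch can be materialized independently (e.g. in separate workers) and
# concatenated afterwards.
final_df = pandas.concat((batch.to_pandas() for batch in batches),
                         ignore_index=True)
print(final_df.shape)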
def test_special_name_quoting(
    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
    auto_create_table: bool,
):
    """Tests whether special column names get quoted as expected."""
    table_name = "users"
    df_data = [("Mark", 10), ("Luke", 20)]

    df = pandas.DataFrame(df_data, columns=["00name", "bAlance"])
    create_sql = (f'CREATE OR REPLACE TABLE "{table_name}"'
                  '("00name" STRING, "bAlance" INT, "id" INT AUTOINCREMENT)')
    select_sql = f'SELECT * FROM "{table_name}"'
    drop_sql = f'DROP TABLE IF EXISTS "{table_name}"'
    with conn_cnx() as cnx:  # type: SnowflakeConnection
        if not auto_create_table:
            cnx.execute_string(create_sql)
        try:
            success, nchunks, nrows, _ = write_pandas(
                cnx,
                df,
                table_name,
                quote_identifiers=True,
                auto_create_table=auto_create_table,
            )

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                # The auto create table functionality does not auto-create an incrementing ID
                if not auto_create_table:
                    assert row["id"] in (1, 2)
                assert (
                    row["00name"],
                    row["bAlance"],
                ) in df_data
        finally:
            cnx.execute_string(drop_sql)
def test_autoincrement_insertion(
    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
    quote_identifiers: bool,
):
    """Tests whether default values can be successfully inserted with the pandas writeback."""
    table_name = "users"
    df_data = [("Mark", 10), ("Luke", 20)]

    # Create a DataFrame containing data about customers
    df = pandas.DataFrame(df_data, columns=["name", "balance"])
    # The SQL below assumes quote_identifiers is True; if it is False, the double quotes are removed below
    create_sql = ('CREATE OR REPLACE TABLE "{}"'
                  '("name" STRING, "balance" INT, "id" INT AUTOINCREMENT)'
                  ).format(table_name)
    select_sql = f'SELECT * FROM "{table_name}"'
    drop_sql = f'DROP TABLE IF EXISTS "{table_name}"'
    if not quote_identifiers:
        create_sql = create_sql.replace('"', "")
        select_sql = select_sql.replace('"', "")
        drop_sql = drop_sql.replace('"', "")
    with conn_cnx() as cnx:  # type: SnowflakeConnection
        cnx.execute_string(create_sql)
        try:
            success, nchunks, nrows, _ = write_pandas(
                cnx, df, table_name, quote_identifiers=quote_identifiers)

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                assert row["id" if quote_identifiers else "ID"] in (1, 2)
                assert (
                    row["name" if quote_identifiers else "NAME"],
                    row["balance" if quote_identifiers else "BALANCE"],
                ) in df_data
        finally:
            cnx.execute_string(drop_sql)
Example #8
def fetch_pandas(conn_cnx, sql, row_count, col_count, method='one'):
    """Tests that parameters can be customized.

    Args:
        conn_cnx: Connection object.
        sql: SQL command for execution.
        row_count: Number of total rows combining all dataframes.
        col_count: Number of columns in dataframe.
        method: If method is 'batch', we fetch dataframes in batch. If method is 'one', we fetch a single dataframe
            containing all data (Default value = 'one').
    """
    assert row_count > 0, '# of rows should be larger than 0'
    assert col_count > 0, '# of columns should be larger than 0'

    with conn_cnx() as cnx_row:
        with conn_cnx() as cnx_table:
            # fetch dataframe by fetching row by row
            cursor_row = cnx_row.cursor()
            cursor_row.execute(SQL_ENABLE_ARROW)
            cursor_row.execute(sql)

            # build dataframe
            # note: its execution time will differ from `pd.read_sql()` via SQLAlchemy, which most people use;
            # further perf testing can be done separately
            start_time = time.time()
            rows = 0
            if method == 'one':
                df_old = pd.DataFrame(cursor_row.fetchall(), columns=['c{}'.format(i) for i in range(col_count)])
            else:
                print("use fetchmany")
                while True:
                    dat = cursor_row.fetchmany(10000)
                    if not dat:
                        break
                    else:
                        df_old = pd.DataFrame(dat, columns=['c{}'.format(i) for i in range(col_count)])
                        rows += df_old.shape[0]
            end_time = time.time()
            print('The original way took {}s'.format(end_time - start_time))
            cursor_row.close()

            # fetch dataframe with new arrow support
            cursor_table = cnx_table.cursor()
            cursor_table.execute(SQL_ENABLE_ARROW)
            cursor_table.execute(sql)

            # build dataframe
            total_rows, total_batches = 0, 0
            start_time = time.time()
            if method == 'one':
                df_new = cursor_table.fetch_pandas_all()
                total_rows = df_new.shape[0]
            else:
                for df_new in cursor_table.fetch_pandas_batches():
                    total_rows += df_new.shape[0]
                    total_batches += 1
            end_time = time.time()
            print('new way (fetching {}) took {}s'.format(method, end_time - start_time))
            if method == 'batch':
                print('new way has # of batches : {}'.format(total_batches))
            cursor_table.close()
            assert total_rows == row_count, 'there should be {} rows, but {} rows'.format(row_count, total_rows)

            # verify the correctness
            # only do it when fetch one dataframe
            if method == 'one':
                assert df_old.shape == df_new.shape, \
                    'shapes are not equal: old dataframe is {}, new dataframe is {}'.format(
                        df_old.shape, df_new.shape)

                for i in range(row_count):
                    col_old = df_old.iloc[i]
                    col_new = df_new.iloc[i]
                    for j, (c_old, c_new) in enumerate(zip(col_old, col_new)):
                        assert c_old == c_new, \
                            'row {}, column {}: old value is {}, new value is {}, values are not equal'.format(
                                i, j, c_old, c_new)
            else:
                assert rows == total_rows, 'the number of rows are not equal {} vs {}'.format(rows, total_rows)
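# Minimal usage sketch (hedged) of the two Arrow-based fetch APIs compared above;
# connection parameters are placeholders.
import snowflake.connector

cnx = snowflake.connector.connect(
    account="my_account", user="my_user", password="my_password",   # placeholders
)
try:
    cur = cnx.cursor()
    cur.execute("select seq4() as c0 from table(generator(rowcount => 100000))")
    df_all = cur.fetch_pandas_all()              # one DataFrame holding every row
    print(df_all.shape)

    cur.execute("select seq4() as c0 from table(generator(rowcount => 100000))")
    for df_chunk in cur.fetch_pandas_batches():  # one DataFrame per result chunk
        print(df_chunk.shape)
finally:
    cnx.close()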
Example #9
    pandas = None
    write_pandas = None

from ...lazy_var import LazyVar

MYPY = False
if MYPY:  # from typing import TYPE_CHECKING once 3.5 is deprecated
    from snowflake.connector import SnowflakeConnection

sf_connector_version_data = [
    ('snowflake-connector-python', '1.2.23'),
    ('snowflake-sqlalchemy', '1.1.1'),
    ('snowflake-connector-go', '0.0.1'),
    ('snowflake-go', '1.0.1'),
    ('snowflake-odbc', '3.12.3'),
]

sf_connector_version_df = LazyVar(lambda: pandas.DataFrame(
    sf_connector_version_data, columns=['name', 'newest_version']))


@pytest.mark.parametrize('chunk_size', [5, 4, 3, 2, 1])
@pytest.mark.parametrize('compression', ['gzip', 'snappy'])
# Note: since the file will be too small to chunk, this only tests the put command's syntax
@pytest.mark.parametrize('parallel', [4, 99])
@pytest.mark.parametrize('quote_identifiers', [True, False])
def test_write_pandas(conn_cnx: Callable[..., Generator['SnowflakeConnection',
                                                        None, None]],
                      db_parameters: Dict[str, str], compression: str,
                      parallel: int, chunk_size: int, quote_identifiers: bool):
    num_of_chunks = math.ceil(len(sf_connector_version_data) / chunk_size)

    with conn_cnx(user=db_parameters['user'],
                  account=db_parameters['account'],
def test_all_pandas_types(
    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
):
    table_name = random_string(5, "all_types_")
    datetime_with_tz = datetime(
        1997, 6, 3, 14, 21, 32, 0, tzinfo=timezone(timedelta(hours=+10)))
    datetime_with_ntz = datetime(1997, 6, 3, 14, 21, 32, 0)
    df_data = [
        (1, 1.1, "1string1", True, datetime_with_tz, datetime_with_ntz),
        (2, 2.2, "2string2", False, datetime_with_tz, datetime_with_ntz),
    ]
    df_data_no_timestamps = [(
        row[0],
        row[1],
        row[2],
        row[3],
    ) for row in df_data]

    df = pandas.DataFrame(
        df_data,
        columns=[
            "int", "float", "string", "bool", "timestamp_tz", "timestamp_ntz"
        ],
    )

    select_sql = f'SELECT * FROM "{table_name}"'
    drop_sql = f'DROP TABLE IF EXISTS "{table_name}"'
    with conn_cnx() as cnx:
        try:
            success, nchunks, nrows, _ = write_pandas(cnx,
                                                      df,
                                                      table_name,
                                                      quote_identifiers=True,
                                                      auto_create_table=True)

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                assert (
                    row["int"],
                    row["float"],
                    row["string"],
                    row["bool"],
                ) in df_data_no_timestamps
                # TODO: Schema detection on the server-side has bugs dealing with timestamp_ntz and timestamp_tz.
                #  After the bugs are fixed, change the assertion to `data[0]["tm_tz"] == datetime_with_tz`
                #  and `data[0]["tm_ntz"] == datetime_with_ntz`,
                #  JIRA https://snowflakecomputing.atlassian.net/browse/SNOW-524865
                #  JIRA https://snowflakecomputing.atlassian.net/browse/SNOW-359205
                #  JIRA https://snowflakecomputing.atlassian.net/browse/SNOW-507644
                assert row["timestamp_tz"] is not None
                assert row["timestamp_ntz"] is not None
        finally:
            cnx.execute_string(drop_sql)
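# End-to-end sketch (hedged) of auto_create_table outside the test harness;
# connection parameters are placeholders.
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

cnx = snowflake.connector.connect(
    account="my_account", user="my_user", password="my_password",   # placeholders
    database="MY_DB", schema="PUBLIC", warehouse="MY_WH",            # placeholders
)
try:
    df = pd.DataFrame({"int": [1, 2], "float": [1.1, 2.2],
                       "string": ["a", "b"], "bool": [True, False]})
    # With auto_create_table=True, write_pandas infers column types from the
    # DataFrame and creates the table before loading it.
    success, nchunks, nrows, _ = write_pandas(
        cnx, df, "ALL_TYPES_DEMO", quote_identifiers=True, auto_create_table=True)
finally:
    cnx.close()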