def test_merge_upsert(sql, caplog):
    table_name = "##test_merge_upsert"
    dataframe = pd.DataFrame({"ColumnA": [3, 4]})
    sql.create.table_from_dataframe(table_name, dataframe, primary_key="index")

    # delete, but keep in SQL since upserting
    dataframe = dataframe[dataframe.index != 0].copy()

    # update
    dataframe.loc[dataframe.index == 1, "ColumnA"] = 5

    # insert
    dataframe = pd.concat(
        [
            dataframe,
            pd.DataFrame([6], columns=["ColumnA"], index=pd.Index([2], name="_index")),
        ]
    )

    # merge values into table, using the SQL primary key that came from the dataframe's index
    dataframe = sql.merge.merge(table_name, dataframe, upsert=True)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert dataframe.equals(result.loc[[1, 2]])
    assert result.loc[0].equals(pd.Series([3], dtype="UInt8", index=["ColumnA"]))
    assert "_time_update" not in result.columns
    assert "_time_insert" not in result.columns

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 1
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]

def test_update_primary_key(sql, caplog):
    table_name = "##test_update_primary_key"
    dataframe = pd.DataFrame(
        {"ColumnA": [1, 2], "ColumnB": ["a", "b"], "ColumnC": [3, 4]}
    )
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # update values in table, using the SQL primary key that came from the dataframe's index
    dataframe["ColumnC"] = [5, 6]
    updated = sql.update.update(table_name, dataframe=dataframe[["ColumnC"]])
    dataframe["ColumnC"] = updated["ColumnC"]

    # test result
    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert dataframe.equals(result[dataframe.columns])
    assert "_time_update" not in result.columns
    assert "_time_insert" not in result.columns

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 1
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]

def test_update_composite_pk(sql, caplog):
    table_name = "##test_update_composite_pk"
    dataframe = pd.DataFrame(
        {"ColumnA": [1, 2], "ColumnB": ["a", "b"], "ColumnC": [3, 4]}
    )
    dataframe = dataframe.set_index(keys=["ColumnA", "ColumnB"])
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # update values in table, using the composite primary key that came from the dataframe's index
    dataframe["ColumnC"] = [5, 6]
    updated = sql.update.update(table_name, dataframe)

    # test result
    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result.equals(updated)

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 1
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]

def test_merge_two_match_columns(sql, caplog):
    table_name = "##test_merge_two_match_columns"
    dataframe = pd.DataFrame(
        {"State": ["A", "B"], "ColumnA": [3, 4], "ColumnB": ["a", "b"]}
    )
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # delete
    dataframe = dataframe[dataframe.index != 0]

    # update
    dataframe.loc[dataframe.index == 1, "ColumnA"] = 5

    # insert
    dataframe = pd.concat(
        [
            dataframe,
            pd.DataFrame(
                {"State": ["C"], "ColumnA": [6], "ColumnB": ["d"]},
                index=pd.Index([2], name="_index"),
            ),
        ]
    )

    # merge values into table, matching on both the primary key that came from the dataframe's index and State
    dataframe = sql.merge_meta.merge(
        table_name, dataframe, match_columns=["_index", "State"]
    )

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert all(result["_time_update"].notna() == [True, False])
    assert all(result["_time_insert"].notna() == [False, True])

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 3
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )

def test_insert_include_metadata_timestamps(sql, caplog):
    table_name = "##test_insert_include_metadata_timestamps"

    # sample data
    dataframe = pd.DataFrame({"_bit": pd.Series([1, 0, None], dtype="boolean")})

    # create table
    sql.create.table(table_name, columns={"_bit": "BIT"})

    # insert data
    dataframe = sql.insert_meta.insert(table_name, dataframe)

    # test result
    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert all(result["_time_insert"].notna())
    assert result["_bit"].equals(dataframe["_bit"])

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 1
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert (
        caplog.record_tuples[0][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )

def test_sample(sql, data, caplog):
    # create cursor to perform operations
    cursor = sql.cursor()
    cursor.fast_executemany = True

    # get table schema for setting input data types and sizes
    schema, dataframe = conversion.get_schema(
        connection=sql, table_name="##test_conversion"
    )

    # only schema_name.table_name can be specified
    with pytest.raises(ValueError):
        conversion.get_schema(
            connection=sql, table_name="ServerName.dbo.##test_conversion"
        )

    # dynamic SQL object names
    table = dynamic.escape(cursor, "##test_conversion")
    columns = dynamic.escape(cursor, data.columns)

    # prepare values of dataframe for insert
    dataframe, values = conversion.prepare_values(schema, data)

    # prepare cursor for input data types and sizes
    cursor = conversion.prepare_cursor(schema, dataframe, cursor)

    # issue insert statement
    insert = ", ".join(columns)
    params = ", ".join(["?"] * len(columns))
    statement = f"""
    INSERT INTO {table} (
        {insert}
    ) VALUES (
        {params}
    )
    """
    cursor.executemany(statement, values)

    # read data, ordering by the id column that is only included to ensure sorting
    columns = ", ".join([x for x in data.columns])
    statement = f"SELECT {columns} FROM {table} ORDER BY id ASC"
    result = conversion.read_values(statement, schema, connection=sql)

    # compare result to insert, comparing to dataframe as values may have changed during insert preparation
    assert result.equals(dataframe.set_index(keys="id"))

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 2
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.conversion"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert (
        caplog.record_tuples[0][2]
        == "Nanosecond precision for dataframe columns ['_time'] will be rounded as SQL data type 'time' allows 7 max decimal places."
    )
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.conversion"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == "Nanosecond precision for dataframe columns ['_datetime2'] will be rounded as SQL data type 'datetime2' allows 7 max decimal places."
    )

def test_insert_alter_primary_key(sql, caplog):
    # initial insert
    table_name = "##test_insert_alter_primary_key"
    dataframe = pd.DataFrame(
        {
            "ColumnA": [0, 1, 2, 3],
            "ColumnB": [0, 1, 2, 3],
            "ColumnC": ["a", "b", "c", "d"],
        }
    ).set_index(keys=["ColumnA", "ColumnB"])
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    schema, _ = conversion.get_schema(sql.connection, table_name)
    _, dtypes = conversion.sql_spec(schema, dataframe)
    assert dtypes == {
        "ColumnA": "tinyint",
        "ColumnB": "tinyint",
        "ColumnC": "varchar(1)",
    }
    assert schema.at["ColumnA", "pk_seq"] == 1
    assert schema.at["ColumnB", "pk_seq"] == 2
    assert pd.isna(schema.at["ColumnC", "pk_seq"])

    # insert that alters the primary key
    new = pd.DataFrame(
        {
            "ColumnA": [256, 257, 258, 259],
            "ColumnB": [4, 5, 6, 7],
            "ColumnC": ["e", "f", "g", "h"],
        }
    ).set_index(keys=["ColumnA", "ColumnB"])
    new = sql.insert.insert(table_name, new)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result.equals(pd.concat([dataframe, new]))
    _, dtypes = conversion.sql_spec(schema, new)
    assert dtypes == {
        "ColumnA": "smallint",
        "ColumnB": "tinyint",
        "ColumnC": "varchar(1)",
    }
    assert schema.at["ColumnA", "pk_seq"] == 1
    assert schema.at["ColumnB", "pk_seq"] == 2
    assert pd.isna(schema.at["ColumnC", "pk_seq"])

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 2
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Altering column 'ColumnA' in table '{table_name}' to data type 'smallint' with 'is_nullable=False'."
    )

def test_read_values_errors(sql):
    schema, _ = conversion.get_schema(connection=sql, table_name="##test_conversion")

    # error for a column missing in schema definition
    with pytest.raises(AttributeError):
        conversion.read_values(
            statement="SELECT * FROM ##test_conversion",
            schema=schema[schema.index != "id"],
            connection=sql,
        )

    # error for primary key missing from query statement
    with pytest.raises(KeyError):
        conversion.read_values(
            statement="SELECT _bit FROM ##test_conversion",
            schema=schema,
            connection=sql,
        )

def test_insert_singles(sql):
    table_name = "##test_insert_singles"

    # create table
    columns = {
        "ColumnA": "TINYINT",
        "ColumnB": "INT",
        "ColumnC": "DATE",
    }
    sql.create.table(table_name, columns)
    schema, _ = conversion.get_schema(sql.connection, table_name)

    # single value
    dataframe = pd.DataFrame({"ColumnA": [1]})
    dataframe = sql.insert.insert(table_name, dataframe)
    result = conversion.read_values(
        f"SELECT ColumnA FROM {table_name}", schema, sql.connection
    )
    assert all(result["ColumnA"] == [1])

    # single column
    dataframe = pd.DataFrame({"ColumnB": [2, 3, 4]})
    dataframe = sql.insert.insert(table_name, dataframe)
    result = conversion.read_values(
        f"SELECT ColumnB FROM {table_name}", schema, sql.connection
    )
    assert result["ColumnB"].equals(pd.Series([pd.NA, 2, 3, 4], dtype="Int32"))

    # single column of dates
    dataframe = pd.DataFrame(
        {"ColumnC": ["06-22-2021", "06-22-2021"]}, dtype="datetime64[ns]"
    )
    dataframe = sql.insert.insert(table_name, dataframe)
    result = conversion.read_values(
        f"SELECT ColumnC FROM {table_name}", schema, sql.connection
    )
    assert result["ColumnC"].equals(
        pd.Series(
            [pd.NA, pd.NA, pd.NA, pd.NA, "06-22-2021", "06-22-2021"],
            dtype="datetime64[ns]",
        )
    )

def test_insert_alter_column(sql, caplog):
    table_name = "##test_insert_alter_column"
    sql.create.table(
        table_name,
        columns={"ColumnA": "TINYINT", "ColumnB": "VARCHAR(1)", "ColumnC": "TINYINT"},
    )
    dataframe = pd.DataFrame({"ColumnA": [1], "ColumnB": ["aaa"], "ColumnC": [100000]})
    dataframe = sql.insert_meta.insert(table_name, dataframe=dataframe)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert all(result["_time_insert"].notna())

    _, dtypes = conversion.sql_spec(schema, dataframe)
    assert dtypes == {
        "ColumnA": "tinyint",
        "ColumnB": "varchar(3)",
        "ColumnC": "int",
        "_time_insert": "datetime2",
    }

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 3
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert (
        caplog.record_tuples[0][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Altering column 'ColumnB' in table '{table_name}' to data type 'varchar(3)' with 'is_nullable=True'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Altering column 'ColumnC' in table '{table_name}' to data type 'int' with 'is_nullable=True'."
    )

def test_insert_add_and_alter_column(sql, caplog):
    table_name = "##test_insert_add_and_alter_column"
    dataframe = pd.DataFrame({"ColumnA": [0, 1, 2, 3], "ColumnB": [0, 1, 2, 3]})
    dataframe = sql.create_meta.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    new = pd.DataFrame(
        {
            "ColumnA": [4, 5, 6, 7],
            "ColumnB": [256, 257, 258, 259],
            "ColumnC": [0, 1, 2, 3],
        },
        index=[4, 5, 6, 7],
    )
    new.index.name = "_index"
    new = sql.insert_meta.insert(table_name, new)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[new.columns].equals(pd.concat([dataframe, new]))
    assert all(result["_time_insert"].notna())

    _, dtypes = conversion.sql_spec(schema, dataframe)
    assert dtypes == {
        "_index": "tinyint",
        "ColumnA": "tinyint",
        "ColumnB": "smallint",
        "_time_insert": "datetime2",
        "ColumnC": "tinyint",
    }

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 3
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column 'ColumnC' in table '{table_name}' with data type 'tinyint'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Altering column 'ColumnB' in table '{table_name}' to data type 'smallint' with 'is_nullable=False'."
    )

def test_merge_alter_column(sql, caplog):
    table_name = "##test_merge_alter_column"
    dataframe = pd.DataFrame({"ColumnA": [1, 2], "ColumnB": ["a", "b"]})
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # merge using the SQL primary key that came from the dataframe's index
    dataframe = dataframe[dataframe.index != 0]
    dataframe["ColumnA"] = dataframe["ColumnA"].astype("Int64")
    dataframe.loc[1, "ColumnA"] = 10000
    dataframe.loc[1, "ColumnB"] = "bbbbb"
    dataframe = sql.merge_meta.merge(table_name, dataframe)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert all(result["_time_update"].notna())
    assert all(result["_time_insert"].isna())

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 5
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[3][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[3][1] == logging.WARNING
    assert (
        caplog.record_tuples[3][2]
        == f"Altering column 'ColumnA' in table '{table_name}' to data type 'smallint' with 'is_nullable=False'."
    )
    assert caplog.record_tuples[4][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[4][1] == logging.WARNING
    assert (
        caplog.record_tuples[4][2]
        == f"Altering column 'ColumnB' in table '{table_name}' to data type 'varchar(5)' with 'is_nullable=False'."
    )

def test_update_two_match_columns(sql, caplog):
    table_name = "##test_update_two_match_columns"
    dataframe = pd.DataFrame(
        {"ColumnA": [1, 2], "ColumnB": ["a", "b"], "ColumnC": [3, 4]}
    )
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="sql"
    )

    # update values in table, using the primary key created in SQL and ColumnA
    schema, _ = conversion.get_schema(sql.connection, table_name)
    dataframe = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    dataframe["ColumnC"] = [5, 6]
    updated = sql.update_meta.update(
        table_name, dataframe, match_columns=["_pk", "ColumnA"]
    )

    # test result
    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert updated.equals(result[updated.columns])
    assert result["_time_update"].notna().all()

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 2
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )

def test_update_alter_column(sql, caplog):
    table_name = "##test_update_alter_column"
    dataframe = pd.DataFrame(
        {"ColumnA": [1, 2], "ColumnB": ["a", "b"], "ColumnC": [0, 0]}
    )
    sql.create.table_from_dataframe(table_name, dataframe, primary_key=None)

    # update using ColumnA
    dataframe["ColumnB"] = ["aaa", "bbb"]
    dataframe["ColumnC"] = [256, 256]
    updated = sql.update_meta.update(table_name, dataframe, match_columns=["ColumnA"])
    dataframe[["ColumnB", "ColumnC"]] = updated[["ColumnB", "ColumnC"]]

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert result["_time_update"].notna().all()

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 4
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Altering column 'ColumnB' in table '{table_name}' to data type 'varchar(3)' with 'is_nullable=False'."
    )
    assert caplog.record_tuples[3][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[3][1] == logging.WARNING
    assert (
        caplog.record_tuples[3][2]
        == f"Altering column 'ColumnC' in table '{table_name}' to data type 'smallint' with 'is_nullable=False'."
    )

def test_merge_add_column(sql, caplog):
    table_name = "##test_merge_add_column"
    dataframe = pd.DataFrame({"ColumnA": [1, 2]})
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # merge using the SQL primary key that came from the dataframe's index
    dataframe = dataframe[dataframe.index != 0]
    dataframe["NewColumn"] = [3]
    dataframe = sql.merge_meta.merge(table_name, dataframe)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert all(result["_time_update"].notna())
    assert all(result["_time_insert"].isna())

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 4
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[3][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[3][1] == logging.WARNING
    assert (
        caplog.record_tuples[3][2]
        == f"Creating column 'NewColumn' in table '{table_name}' with data type 'tinyint'."
    )

def test_merge_create_table(sql, caplog):
    table_name = "##test_merge_create_table"
    dataframe = pd.DataFrame(
        {"_pk": [1, 2], "ColumnA": [5, 6], "ColumnB": ["06/22/2021", "2023-08-31"]}
    )
    dataframe = sql.merge_meta.merge(table_name, dataframe, match_columns=["_pk"])

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert all(result["_time_update"].isna())
    assert all(result["_time_insert"].notna())

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 4
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert caplog.record_tuples[0][2] == f"Creating table '{table_name}'."
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert "Created table" in caplog.record_tuples[1][2]
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[3][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[3][1] == logging.WARNING
    assert (
        caplog.record_tuples[3][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )

def test_insert_composite_pk(sql):
    table_name = "##test_insert_composite_pk"

    columns = {
        "ColumnA": "TINYINT",
        "ColumnB": "VARCHAR(5)",
        "ColumnC": "BIGINT",
    }
    sql.create.table(table_name, columns, primary_key_column=["ColumnA", "ColumnB"])

    dataframe = pd.DataFrame({"ColumnA": [1], "ColumnB": ["12345"], "ColumnC": [1]})
    dataframe = sql.insert.insert(table_name, dataframe)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert all(result.index == pd.MultiIndex.from_tuples([(1, "12345")]))
    assert all(result["ColumnC"] == 1)

def test_insert_create_table(sql, caplog):
    table_name = "##test_insert_create_table"
    dataframe = pd.DataFrame(
        {"ColumnA": [1, 2, 3], "ColumnB": ["06/22/2021", "06-22-2021", "2021-06-22"]}
    )
    dataframe = sql.insert_meta.insert(table_name, dataframe=dataframe)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    expected = pd.DataFrame(
        {
            "ColumnA": pd.Series([1, 2, 3], dtype="UInt8"),
            "ColumnB": pd.Series(
                [pd.Timestamp(year=2021, month=6, day=22)] * 3,
                dtype="datetime64[ns]",
            ),
        }
    ).set_index(keys="ColumnA")
    assert result[expected.columns].equals(expected)
    assert all(result["_time_insert"].notna())

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 3
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert caplog.record_tuples[0][2] == f"Creating table '{table_name}'."
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[1][2]
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )

def test_merge_override_timestamps(sql, caplog):
    table_name = "##test_merge_override_timestamps"
    dataframe = pd.DataFrame({"ColumnA": [3, 4]})
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # update
    dataframe.loc[dataframe.index == 1, "ColumnA"] = 5

    # merge values into table, using the SQL primary key that came from the dataframe's index
    dataframe = sql.merge.merge(
        table_name, dataframe, include_metadata_timestamps=True
    )

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert all(result["_time_update"].notna() == [True, True])
    assert all(result["_time_insert"].notna() == [False, False])

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 3
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )

def test_insert_add_column(sql, caplog):
    table_name = "##test_insert_add_column"
    sql.create.table(table_name, columns={"ColumnA": "TINYINT"})

    dataframe = pd.DataFrame({"ColumnA": [1], "ColumnB": [2], "ColumnC": ["zzz"]})
    dataframe = sql.insert_meta.insert(table_name, dataframe=dataframe)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert all(result["_time_insert"].notna())

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 3
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert (
        caplog.record_tuples[0][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column 'ColumnB' in table '{table_name}' with data type 'tinyint'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column 'ColumnC' in table '{table_name}' with data type 'varchar(3)'."
    )

def test_merge_composite_pk(sql, caplog):
    table_name = "##test_merge_composite_pk"
    dataframe = pd.DataFrame(
        {"State": ["A", "B"], "ColumnA": [3, 4], "ColumnB": ["a", "b"]}
    ).set_index(keys=["State", "ColumnA"])
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # delete
    dataframe = dataframe[dataframe.index != ("A", 3)].copy()

    # update
    dataframe.loc[dataframe.index == ("B", 4), "ColumnB"] = "c"

    # insert
    dataframe = pd.concat(
        [
            dataframe,
            pd.DataFrame(
                {"State": ["C"], "ColumnA": [6], "ColumnB": ["d"]}
            ).set_index(keys=["State", "ColumnA"]),
        ]
    )

    dataframe = sql.merge.merge(table_name, dataframe)

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert result[dataframe.columns].equals(dataframe)
    assert "_time_update" not in result
    assert "_time_insert" not in result

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 1
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]

def test_update_override_timestamps(sql, caplog):
    table_name = "##test_update_override_timestamps"
    dataframe = pd.DataFrame(
        {"ColumnA": [1, 2], "ColumnB": ["a", "b"], "ColumnC": [3, 4]}
    )
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # update values in table, using the SQL primary key that came from the dataframe's index
    dataframe["ColumnC"] = [5, 6]
    updated = sql.update.update(
        table_name, dataframe=dataframe[["ColumnC"]], include_metadata_timestamps=True
    )
    dataframe["ColumnC"] = updated["ColumnC"]

    # test result
    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert dataframe.equals(result[dataframe.columns])
    assert result["_time_update"].notna().all()

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 2
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )

def test_insert_dataframe(sql, caplog):
    table_name = "##test_insert_dataframe"

    # sample data
    dataframe = pd.DataFrame(
        {
            "_bit": pd.Series([1, 0, None], dtype="boolean"),
            "_tinyint": pd.Series([0, 255, None], dtype="UInt8"),
            "_smallint": pd.Series([-(2**15), 2**15 - 1, None], dtype="Int16"),
            "_int": pd.Series([-(2**31), 2**31 - 1, None], dtype="Int32"),
            "_bigint": pd.Series([-(2**63), 2**63 - 1, None], dtype="Int64"),
            "_float": pd.Series([-(1.79**308), 1.79**308, None], dtype="float"),
            "_time": pd.Series(
                ["00:00:00.0000000", "23:59:59.9999999", None], dtype="timedelta64[ns]"
            ),
            "_date": pd.Series(
                [
                    (pd.Timestamp.min + pd.Timedelta(days=1)).date(),
                    pd.Timestamp.max.date(),
                    None,
                ],
                dtype="datetime64[ns]",
            ),
            "_datetime2": pd.Series(
                [pd.Timestamp.min, pd.Timestamp.max, None], dtype="datetime64[ns]"
            ),
            "_varchar": pd.Series(["a", "bbb", None], dtype="string"),
            "_nvarchar": pd.Series(
                ["100\N{DEGREE SIGN}F", "company name\N{REGISTERED SIGN}", None],
                dtype="string",
            ),
        }
    )

    # create table
    columns = {
        "_time_insert": "DATETIME2",
        "_bit": "BIT",
        "_tinyint": "TINYINT",
        "_smallint": "SMALLINT",
        "_int": "INT",
        "_bigint": "BIGINT",
        "_float": "FLOAT",
        "_time": "TIME",
        "_date": "DATE",
        "_datetime2": "DATETIME2",
        "_varchar": "VARCHAR",
        "_nvarchar": "NVARCHAR",
    }
    columns["_varchar"] = (
        columns["_varchar"] + "(" + str(dataframe["_varchar"].str.len().max()) + ")"
    )
    columns["_nvarchar"] = (
        columns["_nvarchar"] + "(" + str(dataframe["_nvarchar"].str.len().max()) + ")"
    )
    sql.create.table(table_name, columns)

    # insert data
    dataframe = sql.insert_meta.insert(table_name, dataframe)

    # test result
    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert all(result["_time_insert"].notna())
    assert dataframe.equals(result[result.columns.drop("_time_insert")])

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 1
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.conversion"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert (
        caplog.record_tuples[0][2]
        == "Nanosecond precision for dataframe columns ['_datetime2'] will be rounded as SQL data type 'datetime2' allows 7 max decimal places."
    )

def test_merge_non_pk_column(sql, caplog):
    table_name = "##test_merge_non_pk_column"
    dataframe = pd.DataFrame(
        {"State": ["A", "B"], "ColumnA": [3, 4], "ColumnB": ["a", "b"]}
    )
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key=None
    )

    # delete
    dataframe = dataframe[dataframe.index != 0]
    dataframe = dataframe.reset_index(drop=True)

    # update
    dataframe.loc[dataframe.index == 1, "ColumnA"] = 5

    # insert
    dataframe = pd.concat(
        [
            dataframe,
            pd.DataFrame(
                {"State": ["C"], "ColumnA": [6], "ColumnB": ["d"]},
                index=pd.Index([1], name="_index"),
            ),
        ]
    )

    # merge values into table, using a single column that is not the primary key
    dataframe = sql.merge_meta.merge(table_name, dataframe, match_columns=["State"])

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name} ORDER BY _time_update DESC",
        schema,
        sql.connection,
    )
    assert result[dataframe.columns].equals(dataframe)

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 3
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )

def test_merge_two_delete_requires(sql, caplog):
    table_name = "##test_merge_two_delete_requires"
    dataframe = pd.DataFrame(
        {
            "State1": ["A", "B", "B"],
            "State2": ["X", "Y", "Z"],
            "ColumnA": [3, 4, 4],
            "ColumnB": ["a", "b", "b"],
        },
        index=[0, 1, 2],
    )
    dataframe.index.name = "_pk"
    dataframe = sql.create.table_from_dataframe(
        table_name, dataframe, primary_key="index"
    )

    # delete 2 records
    dataframe = dataframe[dataframe.index == 1].copy()

    # update
    dataframe.loc[dataframe.index == 1, ["ColumnA", "ColumnB"]] = [5, "c"]

    # insert
    dataframe.index.name = "_pk"
    dataframe = pd.concat(
        [
            dataframe,
            pd.DataFrame(
                {"State1": ["C"], "State2": ["Z"], "ColumnA": [6], "ColumnB": ["d"]},
                index=pd.Index([3], name="_pk"),
            ),
        ]
    )

    # merge values into table, using the primary key that came from the dataframe's index
    # also require a match on State1 and State2 to prevent a record from being deleted
    dataframe = sql.merge_meta.merge(
        table_name,
        dataframe,
        match_columns=["_pk"],
        delete_requires=["State1", "State2"],
    )

    schema, _ = conversion.get_schema(sql.connection, table_name)
    result = conversion.read_values(
        f"SELECT * FROM {table_name}", schema, sql.connection
    )
    assert all(
        result.loc[[1, 3], ["State1", "State2", "ColumnA", "ColumnB"]] == dataframe
    )
    assert all(
        result.loc[0, ["State1", "State2", "ColumnA", "ColumnB"]]
        == pd.Series(
            ["A", "X", 3, "a"], index=["State1", "State2", "ColumnA", "ColumnB"]
        )
    )
    assert all(result["_time_update"].notna() == [False, True, False])
    assert all(result["_time_insert"].notna() == [False, False, True])

    # assert warnings raised by logging after all other tasks
    assert len(caplog.record_tuples) == 3
    assert caplog.record_tuples[0][0] == "mssql_dataframe.core.create"
    assert caplog.record_tuples[0][1] == logging.WARNING
    assert f"Created table: {table_name}" in caplog.record_tuples[0][2]
    assert caplog.record_tuples[1][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[1][1] == logging.WARNING
    assert (
        caplog.record_tuples[1][2]
        == f"Creating column '_time_update' in table '{table_name}' with data type 'datetime2'."
    )
    assert caplog.record_tuples[2][0] == "mssql_dataframe.core.write._exceptions"
    assert caplog.record_tuples[2][1] == logging.WARNING
    assert (
        caplog.record_tuples[2][2]
        == f"Creating column '_time_insert' in table '{table_name}' with data type 'datetime2'."
    )

def table(
    self,
    table_name: str,
    column_names: list = None,
    where: str = None,
    limit: int = None,
    order_column: str = None,
    order_direction: Literal[None, "ASC", "DESC"] = None,
) -> pd.DataFrame:
    """Select data from SQL into a dataframe.

    Parameters
    ----------
    table_name (str) : name of table to select data from
    column_names (list|str, default=None) : list of columns to select, or None to select all
    where (str, default=None) : where clause filter to apply
    limit (int, default=None) : select a limited number of records only
    order_column (str, default=None) : order results by column
    order_direction (str, default=None) : order direction

    Returns
    -------
    dataframe (pandas.DataFrame) : tabular data from select statement

    Examples
    --------
    A sample table to read, created from a dataframe.

    >>> df = pd.DataFrame(
    ...     {
    ...         "ColumnA": [5, 6, 7],
    ...         "ColumnB": [5, 6, None],
    ...         "ColumnC": [pd.NA, 6, 7],
    ...         "ColumnD": ["06-22-2021", "06-22-2021", pd.NaT],
    ...         "ColumnE": ["a", "b", None],
    ...     }, index = ["xxx", "yyy", "zzz"]
    ... )
    >>> df = create.table_from_dataframe('##ExampleRead', df, primary_key='index')

    Select the entire table. The primary key is set as the dataframe's index.

    >>> query = read.table('##ExampleRead')

    Select specific columns.

    >>> query = read.table('##ExampleRead', column_names=['ColumnA','ColumnB'])

    Select using conditions grouped by parentheses while applying a limit and order.

    >>> query = read.table('##ExampleRead', where="(ColumnB>4 AND ColumnC IS NOT NULL) OR ColumnE IS NULL", limit=5, order_column='ColumnB', order_direction='DESC')
    """
    # get table schema for conversion to pandas
    schema, _ = conversion.get_schema(self._connection, table_name)

    # always read in primary key columns for dataframe index
    primary_key_columns = list(
        schema.loc[schema["pk_seq"].notna(), "pk_seq"]
        .sort_values(ascending=True)
        .index
    )

    # dynamic table and column names, and column_names development
    table_name = dynamic.escape(self._connection.cursor(), table_name)
    if column_names is None:
        column_names = "*"
    else:
        if isinstance(column_names, str):
            column_names = [column_names]
        elif isinstance(column_names, pd.Index):
            column_names = list(column_names)
        column_names = primary_key_columns + column_names
        column_names = list(set(column_names))
        missing = [x for x in column_names if x not in schema.index]
        if len(missing) > 0:
            raise custom_errors.SQLColumnDoesNotExist(
                f"Column does not exist in table {table_name}:", missing
            )
        column_names = dynamic.escape(self._connection.cursor(), column_names)
        column_names = "\n,".join(column_names)

    # format optional where_statement
    if where is None:
        where_statement, where_args = ("", None)
    else:
        where_statement, where_args = dynamic.where(self._connection.cursor(), where)

    # format optional limit
    if limit is None:
        limit = ""
    elif not isinstance(limit, int):
        raise ValueError("limit must be an integer")
    else:
        limit = "TOP(" + str(limit) + ")"

    # format optional order
    options = [None, "ASC", "DESC"]
    if (order_column is None and order_direction is not None) or (
        order_column is not None and order_direction is None
    ):
        raise ValueError("order_column and order_direction must both be specified")
    elif order_direction not in options:
        raise ValueError("order direction must be one of: " + str(options))
    elif order_column is not None:
        order = (
            "ORDER BY "
            + dynamic.escape(self._connection.cursor(), order_column)
            + " "
            + order_direction
        )
    else:
        order = ""

    # select values
    statement = f"""
    SELECT {limit}
        {column_names}
    FROM
        {table_name}
    {where_statement}
    {order}
    """

    # read sql query
    dataframe = conversion.read_values(statement, schema, self._connection, where_args)

    return dataframe