def test_values_conversion(_): engine = create_engine('sqlite:///') row = { 'id': 0, 'pd_interval': pd.Interval(left=0, right=5), 'nan': np.nan, 'nat': pd.NaT, 'none': None, 'pd_na': getattr(pd, 'NA', None), 'ts': pd.Timestamp('2021-01-01') } df = pd.DataFrame([row]).set_index('id') with engine.connect() as connection: pse = PandasSpecialEngine(connection=connection, df=df, table_name=TableNames.NO_TABLE) values = pse._get_values_to_insert() converted_row = values[0] assert len(row) == len(converted_row) # iterate over the keys of the row and get the index (after conversion we just have a list) for ix, k in enumerate(row): v = row[k] v_converted = converted_row[ix] if k == 'id': assert v_converted == v elif k == 'pd_interval': assert isinstance(v_converted, str) elif k == 'ts': assert isinstance(v_converted, datetime.datetime) # we should receive all null likes here else: assert pd.isna(v) assert isinstance(v_converted, SqlaNull)
def _setup_objects(self, pse: PandasSpecialEngine): """ Handles optional pre-upsert operations: 1. creating the PostgreSQL schema 2. creating the table 3. adding missing columns 4. altering columns data types (if needed) when these columns are empty in the db but not in the df """ # create schema if not exists # IMPORTANT! `pse.schema` and not `schema` # -> With postgres None will be set to `public` if self.create_schema and pse.schema is not None: pse.create_schema_if_not_exists() # create table if not exists if self.create_table: pse.create_table_if_not_exists() # change dtype of empty columns in db if self.adapt_dtype_of_empty_db_columns and pse.table_exists(): pse.adapt_dtype_of_empty_db_columns() # add new columns from frame if self.add_new_columns and pse.table_exists(): pse.add_new_columns()
def execute_yield(self, connectable: Connectable, if_row_exists: str, chunksize: int): """ Same as `execute` but for each chunk upserted yields a `sqlalchemy.engine.cursor.LegacyCursorResult` object with which we can for instance retrieve the number of updated rows """ with TransactionHandler(connectable=connectable) as trans: # setup pse = PandasSpecialEngine(connection=trans.connection, df=self.df, table_name=self.table_name, schema=self.schema, dtype=self.dtype) self._setup_objects(pse=pse) # upsert # make sure to return an empty generator in case of an empty DataFrame # for consistent data types (thanks to https://stackoverflow.com/a/13243870 !) if len(self.df) == 0: return yield for result in pse.upsert_yield(if_row_exists=if_row_exists, chunksize=chunksize): yield result
def test_bad_chunksize(engine, schema, bad_chunksize_value): df = pd.DataFrame({'test': [0]}) df.index.name = 'id' with sync_async_connect_switch(engine) as connection: pse = PandasSpecialEngine(connection=connection, schema=schema, table_name=TableNames.NO_TABLE, df=df) with pytest.raises(ValueError) as excinfo: pse._create_chunks(values=[0], chunksize=bad_chunksize_value) assert "integer strictly above 0" in str(excinfo.value)
async def test_schema_creation_async(engine, schema): # overwrite schema schema = schema_for_testing_creation # schema may already exist before testing if 'postgres' in engine.dialect.dialect_description: drop_schema(engine=engine, schema=schema) # then try to create a schema from a PandasSpecialEngine instance dummy_df = pd.DataFrame(index=pd.Index(data=[0], name='id')) async with engine.connect() as connection: try: pse = PandasSpecialEngine(connection=connection, schema=schema, table_name=TableNames.NO_TABLE, df=dummy_df) # this should raise HasNoSchemaSystemException # if we are not on a postgres engine assert not await pse.aschema_exists() await pse.acreate_schema_if_not_exists() await connection.commit() assert await pse.aschema_exists() except Exception as e: if pse._db_type == 'postgres' or not isinstance( e, HasNoSchemaSystemException): raise e finally: if pse._db_type == 'postgres': drop_schema(engine=engine, schema=schema)
def test_error_index_level_named(engine, schema): df = pd.DataFrame({'test': [0]}) with pytest.raises(UnnamedIndexLevelsException) as excinfo: with sync_async_connect_switch(engine) as connection: PandasSpecialEngine(connection=connection, schema=schema, table_name=TableNames.NO_TABLE, df=df) assert "All index levels must be named" in str(excinfo.value)
def test_non_unique_index(engine, schema): df = pd.DataFrame(index=pd.Index(data=[0, 0], name='ix')) with pytest.raises(DuplicateValuesInIndexException) as excinfo: with sync_async_connect_switch(engine) as connection: PandasSpecialEngine(connection=connection, schema=schema, table_name=TableNames.NO_TABLE, df=df) assert "The index must be unique" in str(excinfo.value)
def execute(self, connectable: Connectable, if_row_exists: str, chunksize: int) -> None: """ Handles the actual upsert operation. """ with TransactionHandler(connectable=connectable) as trans: # setup pse = PandasSpecialEngine(connection=trans.connection, df=self.df, table_name=self.table_name, schema=self.schema, dtype=self.dtype) self._setup_objects(pse=pse) # upsert if len(self.df) == 0: return pse.upsert(if_row_exists=if_row_exists, chunksize=chunksize)
async def test_change_column_type_if_column_empty_async( engine, schema, caplog, new_empty_column_value=None): print(new_empty_column_value) # store arguments we will use for multiple PandasSpecialEngine instances table_name = TableNames.CHANGE_EMPTY_COL_TYPE common_kwargs = dict(schema=schema, table_name=table_name) common_kwargs['dtype'] = { 'profileid': VARCHAR(5) } if 'mysql' in engine.dialect.dialect_description else None # json like will not work for sqlalchemy < 1.4 # also skip sqlite as it does not support such alteration json_like = isinstance(new_empty_column_value, (dict, list)) if json_like and not _sqla_gt14(): pytest.skip('JSON like values will not work for sqlalchemy < 1.4') elif 'sqlite' in engine.dialect.dialect_description: pytest.skip('such column alteration is not possible with SQlite') # create our example table df = pd.DataFrame({ 'profileid': ['foo'], 'empty_col': [None] }).set_index('profileid') async with engine.connect() as connection: pse = PandasSpecialEngine(connection=connection, df=df, **common_kwargs) await pse.acreate_table_if_not_exists() await connection.commit() assert await pse.atable_exists() # recreate an instance of PandasSpecialEngine with a new df (so the model gets refreshed) # the line below is a "hack" to set any type of element as a column value # without pandas trying to broadcast it. This is useful when passing a list or such df['empty_col'] = df.index.map(lambda x: new_empty_column_value) async with engine.connect() as connection: pse = PandasSpecialEngine(connection=connection, df=df, **common_kwargs) with caplog.at_level(logging.INFO, logger='pangres'): await pse.aadapt_dtype_of_empty_db_columns() assert len(caplog.records) == 1 assert 'Changed type of column empty_col' in caplog.text caplog.clear()
async def aexecute_yield(self, async_connectable, if_row_exists: str, chunksize: int): async with TransactionHandler(connectable=async_connectable) as trans: # setup pse = PandasSpecialEngine(connection=trans.connection, df=self.df, table_name=self.table_name, schema=self.schema, dtype=self.dtype) await self._asetup_objects(pse=pse) # upsert if len(self.df) == 0: return yield # IMPORTANT! NO `await` async for result in pse.aupsert_yield(if_row_exists=if_row_exists, chunksize=chunksize): yield result
def test_detect_db_type(_, connection_string, expected): # there are some engines that we will not be able to create because they are asynchronous # and this requires sqlalchemy >= 1.4 try: engine = create_sync_or_async_engine(connection_string) except NotImplementedError as e: pytest.skip(str(e)) except ModuleNotFoundError as e: pytest.skip( f'Could not find module for connection string: {connection_string}. Error was: {e}' ) assert PandasSpecialEngine._detect_db_type(connectable=engine) == expected
def test_table_attr(engine, schema): # generate a somewhat complex table model via the _TestsExampleTable class df = _TestsExampleTable.create_example_df(nb_rows=10) table_name = TableNames.NO_TABLE with sync_async_connect_switch(engine) as connection: pse = PandasSpecialEngine(connection=connection, schema=schema, table_name=table_name, df=df) # make sure columns and table name match expected_cols = list(df.index.names) + df.columns.tolist() assert all((col in pse.table.columns for col in expected_cols)) assert pse.table.name == table_name
def test_repr(engine, schema): dummy_df = pd.DataFrame(index=pd.Index(data=[0], name='id')) with sync_async_connect_switch(engine) as connection: pse = PandasSpecialEngine(connection=connection, schema=schema, table_name=TableNames.NO_TABLE, df=dummy_df) # make sure it is printable without errors txt = str(pse) print(txt) # test some strings we expect to find in the repr for s in ('PandasSpecialEngine', 'id ', 'hexid', 'connection', 'schema', 'table', 'SQLalchemy table model'): assert s in txt
async def aexecute(self, async_connectable, if_row_exists: str, chunksize: int) -> None: async with TransactionHandler(connectable=async_connectable) as trans: # setup pse = PandasSpecialEngine(connection=trans.connection, df=self.df, table_name=self.table_name, schema=self.schema, dtype=self.dtype) await self._asetup_objects(pse=pse) # upsert if len(self.df) == 0: return await pse.aupsert(if_row_exists=if_row_exists, chunksize=chunksize)
async def test_table_creation_async(engine, schema): dtype = { 'profileid': VARCHAR(5) } if 'mysql' in engine.dialect.dialect_description else None df = _TestsExampleTable.create_example_df(nb_rows=10) async with engine.connect() as connection: pse = PandasSpecialEngine(connection=connection, schema=schema, dtype=dtype, table_name=TableNames.TABLE_CREATION, df=df) assert not await pse.atable_exists() await pse.acreate_table_if_not_exists() await connection.commit() assert await pse.atable_exists()
def test_duplicated_names(engine, schema, option): df = pd.DataFrame({'test': [0]}) if option == 'index and column collision': df.index.name = 'test' elif option == 'columns duplicated': df.index.name = 'ix' df = df[['test', 'test']] elif option == 'index duplicated': df = df.set_index(['test', 'test']) else: raise AssertionError(f'Unexpected value for param `option`: {option}') with pytest.raises(DuplicateLabelsException) as excinfo: with sync_async_connect_switch(engine) as connection: PandasSpecialEngine(connection=connection, schema=schema, table_name=TableNames.NO_TABLE, df=df) assert "Found duplicates across index and columns" in str(excinfo.value)
async def test_add_new_columns_async(engine, schema, axis=None): # store arguments we will use for multiple PandasSpecialEngine instances table_name = TableNames.ADD_NEW_COLUMN common_kwargs = dict(schema=schema, table_name=table_name) common_kwargs['dtype'] = { 'profileid': VARCHAR(5) } if 'mysql' in engine.dialect.dialect_description else None # create our example table df = _TestsExampleTable.create_example_df(nb_rows=10) async with engine.connect() as connection: pse = PandasSpecialEngine(connection=connection, df=df, **common_kwargs) await pse.acreate_table_if_not_exists() await connection.commit() assert await pse.atable_exists() # we need to recreate an instance of PandasSpecialEngine # so that a new table model with the new columns is created then add columns async with engine.connect() as connection: # error message if we get unexpected values for "axis" # or we make a typo in our if/elif statements err_msg = f'Expected axis to be one of index, columns. Got {axis}' # add a new index level or new columns (no JSON ones, # it's not supported by sqlalchemy compilers :( ) if axis == 'index': df['new_index_col'] = 'foo' df.set_index('new_index_col', append=True, inplace=True) elif axis == 'column': df = df.assign( new_text_col='test', new_int_col=0, new_float_col=1.1, new_bool_col=False, new_dt_col=pd.Timestamp('2020-01-01'), # create this col for later empty_col=None) else: raise AssertionError(err_msg) # recreate PandasSpecialEngine pse = PandasSpecialEngine(connection=connection, df=df, **common_kwargs) # check if we get an error when trying to add an index level if axis == 'index': with pytest.raises(MissingIndexLevelInSqlException) as exc_info: await pse.aadd_new_columns() assert 'Cannot add' in str(exc_info.value) return elif axis == 'column': await pse.aadd_new_columns() await connection.commit() else: raise AssertionError(err_msg) # check if the columns were correctly added # since we issued a return for 'index' earlier # the axis must now be 'columns' assert axis == 'column' # check the columns where added sync_engine = async_engine_to_sync_engine(engine) with sync_engine.connect() as connection: ns = get_table_namespace(schema=schema, table_name=table_name) df_db = pd.read_sql(text(f'SELECT * FROM {ns} LIMIT 0;'), con=connection, index_col='profileid') assert set(df.columns) == set(df_db.columns)