def test_values_conversion(_):
    engine = create_engine('sqlite:///')
    row = {
        'id': 0,
        'pd_interval': pd.Interval(left=0, right=5),
        'nan': np.nan,
        'nat': pd.NaT,
        'none': None,
        'pd_na': getattr(pd, 'NA', None),
        'ts': pd.Timestamp('2021-01-01')
    }
    df = pd.DataFrame([row]).set_index('id')
    with engine.connect() as connection:
        pse = PandasSpecialEngine(connection=connection,
                                  df=df,
                                  table_name=TableNames.NO_TABLE)
        values = pse._get_values_to_insert()
    converted_row = values[0]
    assert len(row) == len(converted_row)
    # iterate over the keys of the row and track each key's position (after conversion we just have a list of values)
    for ix, k in enumerate(row):
        v = row[k]
        v_converted = converted_row[ix]
        if k == 'id':
            assert v_converted == v
        elif k == 'pd_interval':
            assert isinstance(v_converted, str)
        elif k == 'ts':
            assert isinstance(v_converted, datetime.datetime)
        # all remaining values are null-likes and should have been converted to SQL NULL
        else:
            assert pd.isna(v)
            assert isinstance(v_converted, SqlaNull)
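
For reference, a minimal sketch of the per-value conversion this test expects. The real logic lives in `_get_values_to_insert`; this standalone version is an assumption, not pangres' implementation:

import pandas as pd
from sqlalchemy import null

def convert_value_sketch(v):
    # None, NaN, NaT and pd.NA all answer True to pd.isna -> SQL NULL
    if pd.isna(v):
        return null()
    # intervals have no portable SQL type -> store their string form
    if isinstance(v, pd.Interval):
        return str(v)
    # pandas Timestamps become stdlib datetimes
    if isinstance(v, pd.Timestamp):
        return v.to_pydatetime()
    return v

assert isinstance(convert_value_sketch(pd.NaT), type(null()))
assert convert_value_sketch(pd.Interval(0, 5)) == '(0, 5]'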
Example 2
    def _setup_objects(self, pse: PandasSpecialEngine):
        """
        Handles optional pre-upsert operations:
        1. creating the PostgreSQL schema
        2. creating the table
        3. adding missing columns
        4. altering column data types (if needed) when these columns are
           empty in the db but not in the df
        """
        # create the schema if it does not exist
        # IMPORTANT! use `pse.schema` and not the original `schema`:
        # with PostgreSQL, PandasSpecialEngine replaces a schema of None
        # with `public`
        if self.create_schema and pse.schema is not None:
            pse.create_schema_if_not_exists()

        # create table if not exists
        if self.create_table:
            pse.create_table_if_not_exists()

        # change dtype of empty columns in db
        if self.adapt_dtype_of_empty_db_columns and pse.table_exists():
            pse.adapt_dtype_of_empty_db_columns()

        # add new columns from frame
        if self.add_new_columns and pse.table_exists():
            pse.add_new_columns()
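
For context, these four flags appear to mirror keyword arguments of pangres' public `upsert` entry point. A hedged usage sketch (the exact signature is assumed, not taken from this snippet):

import pandas as pd
from sqlalchemy import create_engine
from pangres import upsert  # public entry point; exact signature assumed

engine = create_engine('sqlite:///')
df = pd.DataFrame({'val': [1]}, index=pd.Index(['a'], name='id'))
upsert(con=engine, df=df, table_name='example', if_row_exists='update',
       create_schema=False,    # -> pse.create_schema_if_not_exists()
       create_table=True,      # -> pse.create_table_if_not_exists()
       add_new_columns=True,   # -> pse.add_new_columns()
       adapt_dtype_of_empty_db_columns=False)  # -> pse.adapt_dtype_of_empty_db_columns()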
Example 3
    def execute_yield(self, connectable: Connectable, if_row_exists: str,
                      chunksize: int):
        """
        Same as `execute` but for each chunk upserted yields a
        `sqlalchemy.engine.cursor.LegacyCursorResult` object with which
        we can for instance retrieve the number of updated rows
        """
        with TransactionHandler(connectable=connectable) as trans:

            # setup
            pse = PandasSpecialEngine(connection=trans.connection,
                                      df=self.df,
                                      table_name=self.table_name,
                                      schema=self.schema,
                                      dtype=self.dtype)
            self._setup_objects(pse=pse)

            # upsert
            # make sure to return an empty generator in case of an empty DataFrame
            # so that the return type stays consistent (thanks to https://stackoverflow.com/a/13243870 !)
            if len(self.df) == 0:
                return
                yield
            for result in pse.upsert_yield(if_row_exists=if_row_exists,
                                           chunksize=chunksize):
                yield result
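
The `return` immediately followed by an unreachable `yield` above is the standard idiom for a generator that yields nothing: the `yield` never runs, but its mere presence makes Python treat the function as a generator. A standalone illustration:

def empty_gen():
    return
    yield  # unreachable, but turns this function into a generator

assert list(empty_gen()) == []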
def test_bad_chunksize(engine, schema, bad_chunksize_value):
    df = pd.DataFrame({'test': [0]})
    df.index.name = 'id'
    with sync_async_connect_switch(engine) as connection:
        pse = PandasSpecialEngine(connection=connection,
                                  schema=schema,
                                  table_name=TableNames.NO_TABLE,
                                  df=df)
        with pytest.raises(ValueError) as excinfo:
            pse._create_chunks(values=[0], chunksize=bad_chunksize_value)
        assert "integer strictly above 0" in str(excinfo.value)
async def test_schema_creation_async(engine, schema):
    # overwrite schema
    schema = schema_for_testing_creation

    # schema may already exist before testing
    if 'postgres' in engine.dialect.dialect_description:
        drop_schema(engine=engine, schema=schema)

    # then try to create a schema from a PandasSpecialEngine instance
    dummy_df = pd.DataFrame(index=pd.Index(data=[0], name='id'))
    async with engine.connect() as connection:
        try:
            pse = PandasSpecialEngine(connection=connection,
                                      schema=schema,
                                      table_name=TableNames.NO_TABLE,
                                      df=dummy_df)
            # this should raise HasNoSchemaSystemException
            # if we are not on a postgres engine
            assert not await pse.aschema_exists()
            await pse.acreate_schema_if_not_exists()
            await connection.commit()
            assert await pse.aschema_exists()
        except Exception as e:
            if pse._db_type == 'postgres' or not isinstance(
                    e, HasNoSchemaSystemException):
                raise e
        finally:
            if pse._db_type == 'postgres':
                drop_schema(engine=engine, schema=schema)
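
A hedged sketch of the DDL that `create_schema_if_not_exists` presumably emits on PostgreSQL (the helper below is hypothetical; systems without schema support are expected to raise instead, cf. `HasNoSchemaSystemException` above):

from sqlalchemy import text

def create_schema_sketch(connection, schema: str) -> None:
    # PostgreSQL supports this DDL natively
    connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS {schema}'))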
def test_error_index_level_named(engine, schema):
    df = pd.DataFrame({'test': [0]})
    with pytest.raises(UnnamedIndexLevelsException) as excinfo:
        with sync_async_connect_switch(engine) as connection:
            PandasSpecialEngine(connection=connection,
                                schema=schema,
                                table_name=TableNames.NO_TABLE,
                                df=df)
    assert "All index levels must be named" in str(excinfo.value)
def test_non_unique_index(engine, schema):
    df = pd.DataFrame(index=pd.Index(data=[0, 0], name='ix'))
    with pytest.raises(DuplicateValuesInIndexException) as excinfo:
        with sync_async_connect_switch(engine) as connection:
            PandasSpecialEngine(connection=connection,
                                schema=schema,
                                table_name=TableNames.NO_TABLE,
                                df=df)
    assert "The index must be unique" in str(excinfo.value)
Example 8
    def execute(self, connectable: Connectable, if_row_exists: str,
                chunksize: int) -> None:
        """
        Handles the actual upsert operation.
        """
        with TransactionHandler(connectable=connectable) as trans:
            # setup
            pse = PandasSpecialEngine(connection=trans.connection,
                                      df=self.df,
                                      table_name=self.table_name,
                                      schema=self.schema,
                                      dtype=self.dtype)
            self._setup_objects(pse=pse)

            # upsert
            if len(self.df) == 0:
                return
            pse.upsert(if_row_exists=if_row_exists, chunksize=chunksize)
async def test_change_column_type_if_column_empty_async(
        engine, schema, caplog, new_empty_column_value=None):
    print(new_empty_column_value)
    # store arguments we will use for multiple PandasSpecialEngine instances
    table_name = TableNames.CHANGE_EMPTY_COL_TYPE
    common_kwargs = dict(schema=schema, table_name=table_name)
    common_kwargs['dtype'] = {
        'profileid': VARCHAR(5)
    } if 'mysql' in engine.dialect.dialect_description else None

    # json like will not work for sqlalchemy < 1.4
    # also skip sqlite as it does not support such alteration
    json_like = isinstance(new_empty_column_value, (dict, list))
    if json_like and not _sqla_gt14():
        pytest.skip('JSON like values will not work for sqlalchemy < 1.4')
    elif 'sqlite' in engine.dialect.dialect_description:
        pytest.skip('such column alteration is not possible with SQLite')

    # create our example table
    df = pd.DataFrame({
        'profileid': ['foo'],
        'empty_col': [None]
    }).set_index('profileid')
    async with engine.connect() as connection:
        pse = PandasSpecialEngine(connection=connection,
                                  df=df,
                                  **common_kwargs)
        await pse.acreate_table_if_not_exists()
        await connection.commit()
        assert await pse.atable_exists()

    # recreate an instance of PandasSpecialEngine with a new df (so the model gets refreshed)
    # the line below is a "hack" to set any type of object as a column value
    # without pandas trying to broadcast it (useful when passing a list or a dict)
    df['empty_col'] = df.index.map(lambda x: new_empty_column_value)
    async with engine.connect() as connection:
        pse = PandasSpecialEngine(connection=connection,
                                  df=df,
                                  **common_kwargs)
        with caplog.at_level(logging.INFO, logger='pangres'):
            await pse.aadapt_dtype_of_empty_db_columns()
        assert len(caplog.records) == 1
        assert 'Changed type of column empty_col' in caplog.text
    caplog.clear()
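
The `df.index.map(...)` trick above deserves isolating: assigning a list directly is interpreted element-wise (one entry per row), whereas mapping over the index stores the whole object in every cell. A standalone illustration:

import pandas as pd

df = pd.DataFrame(index=pd.Index(['x'], name='profileid'))
value = ['some', 'list']

# df['col'] = value  # would fail: 2 list entries for 1 row
df['col'] = df.index.map(lambda _: value)
assert df.loc['x', 'col'] == ['some', 'list']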
Example 10
    async def aexecute_yield(self, async_connectable, if_row_exists: str,
                             chunksize: int):
        async with TransactionHandler(connectable=async_connectable) as trans:
            # setup
            pse = PandasSpecialEngine(connection=trans.connection,
                                      df=self.df,
                                      table_name=self.table_name,
                                      schema=self.schema,
                                      dtype=self.dtype)
            await self._asetup_objects(pse=pse)

            # upsert
            if len(self.df) == 0:
                return
                yield
            # IMPORTANT! NO `await` here: `aupsert_yield` returns an async
            # generator, which we consume with `async for`
            async for result in pse.aupsert_yield(if_row_exists=if_row_exists,
                                                  chunksize=chunksize):
                yield result
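
The `NO await` comment is the crux: `aupsert_yield` returns an async generator object, which is consumed with `async for`; awaiting it instead would raise a TypeError. The pattern in isolation:

import asyncio

async def agen():
    for i in range(3):
        yield i  # makes this an async generator function

async def main():
    # note: no `await agen()` -- we iterate the returned async generator
    results = [x async for x in agen()]
    assert results == [0, 1, 2]

asyncio.run(main())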
Example 11
def test_detect_db_type(_, connection_string, expected):
    # some engines cannot be created here because they are asynchronous,
    # which requires sqlalchemy >= 1.4
    try:
        engine = create_sync_or_async_engine(connection_string)
    except NotImplementedError as e:
        pytest.skip(str(e))
    except ModuleNotFoundError as e:
        pytest.skip(
            f'Could not find module for connection string: {connection_string}. Error was: {e}'
        )
    assert PandasSpecialEngine._detect_db_type(connectable=engine) == expected
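
A hedged sketch of what such detection can look like, based on the dialect name sqlalchemy exposes on every engine and connection (the real `_detect_db_type` mapping is assumed):

from sqlalchemy import create_engine

def detect_db_type_sketch(connectable) -> str:
    # `dialect.name` is e.g. 'postgresql', 'mysql' or 'sqlite'
    name = connectable.dialect.name
    return 'postgres' if name == 'postgresql' else name

assert detect_db_type_sketch(create_engine('sqlite:///')) == 'sqlite'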
Example 12
def test_table_attr(engine, schema):
    # generate a somewhat complex table model via the _TestsExampleTable class
    df = _TestsExampleTable.create_example_df(nb_rows=10)
    table_name = TableNames.NO_TABLE
    with sync_async_connect_switch(engine) as connection:
        pse = PandasSpecialEngine(connection=connection,
                                  schema=schema,
                                  table_name=table_name,
                                  df=df)
        # make sure columns and table name match
        expected_cols = list(df.index.names) + df.columns.tolist()
        assert all((col in pse.table.columns for col in expected_cols))
        assert pse.table.name == table_name
Example 13
def test_repr(engine, schema):
    dummy_df = pd.DataFrame(index=pd.Index(data=[0], name='id'))
    with sync_async_connect_switch(engine) as connection:
        pse = PandasSpecialEngine(connection=connection,
                                  schema=schema,
                                  table_name=TableNames.NO_TABLE,
                                  df=dummy_df)
        # make sure it is printable without errors
        txt = str(pse)
        print(txt)
        # test some strings we expect to find in the repr
        for s in ('PandasSpecialEngine', 'id ', 'hexid', 'connection',
                  'schema', 'table', 'SQLalchemy table model'):
            assert s in txt
Example 14
    async def aexecute(self, async_connectable, if_row_exists: str,
                       chunksize: int) -> None:
        async with TransactionHandler(connectable=async_connectable) as trans:
            # setup
            pse = PandasSpecialEngine(connection=trans.connection,
                                      df=self.df,
                                      table_name=self.table_name,
                                      schema=self.schema,
                                      dtype=self.dtype)
            await self._asetup_objects(pse=pse)

            # upsert
            if len(self.df) == 0:
                return
            await pse.aupsert(if_row_exists=if_row_exists, chunksize=chunksize)
Example 15
async def test_table_creation_async(engine, schema):
    dtype = {
        'profileid': VARCHAR(5)
    } if 'mysql' in engine.dialect.dialect_description else None
    df = _TestsExampleTable.create_example_df(nb_rows=10)
    async with engine.connect() as connection:
        pse = PandasSpecialEngine(connection=connection,
                                  schema=schema,
                                  dtype=dtype,
                                  table_name=TableNames.TABLE_CREATION,
                                  df=df)
        assert not await pse.atable_exists()
        await pse.acreate_table_if_not_exists()
        await connection.commit()
        assert await pse.atable_exists()
Example 16
def test_duplicated_names(engine, schema, option):
    df = pd.DataFrame({'test': [0]})
    if option == 'index and column collision':
        df.index.name = 'test'
    elif option == 'columns duplicated':
        df.index.name = 'ix'
        df = df[['test', 'test']]
    elif option == 'index duplicated':
        df = df.set_index(['test', 'test'])
    else:
        raise AssertionError(f'Unexpected value for param `option`: {option}')

    with pytest.raises(DuplicateLabelsException) as excinfo:
        with sync_async_connect_switch(engine) as connection:
            PandasSpecialEngine(connection=connection,
                                schema=schema,
                                table_name=TableNames.NO_TABLE,
                                df=df)
    assert "Found duplicates across index and columns" in str(excinfo.value)
Example 17
async def test_add_new_columns_async(engine, schema, axis=None):
    # store arguments we will use for multiple PandasSpecialEngine instances
    table_name = TableNames.ADD_NEW_COLUMN
    common_kwargs = dict(schema=schema, table_name=table_name)
    common_kwargs['dtype'] = {
        'profileid': VARCHAR(5)
    } if 'mysql' in engine.dialect.dialect_description else None

    # create our example table
    df = _TestsExampleTable.create_example_df(nb_rows=10)
    async with engine.connect() as connection:
        pse = PandasSpecialEngine(connection=connection,
                                  df=df,
                                  **common_kwargs)
        await pse.acreate_table_if_not_exists()
        await connection.commit()
        assert await pse.atable_exists()

    # we need to recreate an instance of PandasSpecialEngine so that
    # a new table model including the new columns is created, then add the columns
    async with engine.connect() as connection:
        # error message in case we get an unexpected value for `axis`
        # or make a typo in our if/elif statements
        err_msg = f'Expected axis to be one of "index", "column". Got {axis}'
        # add a new index level or new columns (no JSON ones,
        # those are not supported by sqlalchemy's compilers)
        if axis == 'index':
            df['new_index_col'] = 'foo'
            df.set_index('new_index_col', append=True, inplace=True)
        elif axis == 'column':
            df = df.assign(
                new_text_col='test',
                new_int_col=0,
                new_float_col=1.1,
                new_bool_col=False,
                new_dt_col=pd.Timestamp('2020-01-01'),
                # create this col for later
                empty_col=None)
        else:
            raise AssertionError(err_msg)

        # recreate PandasSpecialEngine
        pse = PandasSpecialEngine(connection=connection,
                                  df=df,
                                  **common_kwargs)

        # check if we get an error when trying to add an index level
        if axis == 'index':
            with pytest.raises(MissingIndexLevelInSqlException) as exc_info:
                await pse.aadd_new_columns()
            assert 'Cannot add' in str(exc_info.value)
            return
        elif axis == 'column':
            await pse.aadd_new_columns()
            await connection.commit()
        else:
            raise AssertionError(err_msg)

    # since we issued a return for 'index' earlier,
    # the axis must now be 'column'
    assert axis == 'column'
    # check that the columns were correctly added
    sync_engine = async_engine_to_sync_engine(engine)
    with sync_engine.connect() as connection:
        ns = get_table_namespace(schema=schema, table_name=table_name)
        df_db = pd.read_sql(text(f'SELECT * FROM {ns} LIMIT 0;'),
                            con=connection,
                            index_col='profileid')
        assert set(df.columns) == set(df_db.columns)
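
The `LIMIT 0` query above is a cheap way to read back only a table's column layout; in isolation:

import pandas as pd
from sqlalchemy import create_engine, text

engine = create_engine('sqlite:///')
with engine.begin() as connection:
    connection.execute(text('CREATE TABLE demo (id INTEGER PRIMARY KEY, val TEXT)'))

with engine.connect() as connection:
    # no rows are fetched, but the frame still carries the column names
    df_db = pd.read_sql(text('SELECT * FROM demo LIMIT 0;'), con=connection)
    assert set(df_db.columns) == {'id', 'val'}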