Example #1
def test_log_output(_, caplog, test_level):
    txt = 'TESTING LOG OUTPUT AND LEVEL'
    with caplog.at_level(logging.INFO):
        log(txt, level=test_level)
    assert len(caplog.records) == 1
    assert caplog.records[0].levelname == log_level_values_to_names[test_level]
    assert txt in caplog.text
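
The fixture-injected `_` and `test_level` parameters come from the test module's setup, and `log_level_values_to_names` maps numeric log levels back to their names. A mapping along the following lines would satisfy the assertion (an assumed shape, not necessarily the test module's exact definition):

# Assumed shape of log_level_values_to_names (the real mapping lives in the test module):
import logging

log_level_values_to_names = {
    logging.DEBUG: 'DEBUG',
    logging.INFO: 'INFO',
    logging.WARNING: 'WARNING',
    logging.ERROR: 'ERROR',
    logging.CRITICAL: 'CRITICAL',
}
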
Example #2
    def add_new_columns(self):
        """
        Adds columns present in df but not in the SQL table
        for the given instance of PandasSpecialEngine.

        Notes
        -----
        Sadly, it seems that we cannot create JSON columns.
        """
        # create deepcopies of the columns because we are going to unbind
        # them from the table model (otherwise alembic would think we are adding
        # a column that already exists in the database)
        cols_to_add = [deepcopy(col) for col in self.table.columns
                       if col.name not in self.get_db_columns_names()]
        # check columns are not index levels
        if any((c.name in self.df.index.names for c in cols_to_add)):
            raise ValueError(('Cannot add any column that is part of the df index!\n'
                              "You'll have to update your table primary key or change your "
                              "df index"))
        
        with self.engine.connect() as con:
            ctx = MigrationContext.configure(con)
            op = Operations(ctx)
            for col in cols_to_add:
                col.table = None  # Important! unbind the column from the table
                op.add_column(self.table.name, col, schema=self.schema)
                log(f"Added column {col} (type: {col.type}) in table {self.table.name} "
                    f'(schema="{self.schema}")')
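
The column addition goes through alembic's `MigrationContext`/`Operations` pair rather than raw DDL. A minimal standalone sketch of that pattern, assuming an in-memory SQLite database and a hypothetical table named `example`:

# Minimal sketch of the alembic pattern used above (in-memory SQLite, hypothetical table).
import sqlalchemy as sa
from alembic.migration import MigrationContext
from alembic.operations import Operations

engine = sa.create_engine('sqlite://')
with engine.begin() as con:
    con.execute(sa.text('CREATE TABLE example (id INTEGER PRIMARY KEY)'))
    ctx = MigrationContext.configure(con)
    op = Operations(ctx)
    # add a column that is not yet present in the database table
    op.add_column('example', sa.Column('new_col', sa.String(50)))
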
Example #3
    def _get_values_to_insert(self):
        """
        Gets the values to be inserted from the pandas DataFrame
        defined in the given instance of PandasSpecialEngine
        into the corresponding SQL table.

        Returns
        -------
        values : list
            Values from the df attribute that may have been converted
            for SQL compatibility, e.g. pd.Timestamp will be converted
            to datetime.datetime objects.
        """
        # this seems to be the most reliable way to unpack
        # the DataFrame. For instance using df.to_dict(orient='records')
        # can introduce types such as numpy integer which we'd have to deal with
        values = self.df.reset_index().values.tolist()
        for i in range(len(values)):
            row = values[i]
            for j in range(len(row)):
                val = row[j]
                # replace pd.Timestamp with datetime.datetime
                if isinstance(val, pd.Timestamp):
                    values[i][j] = val.to_pydatetime()
                # check if na unless it is list like
                elif not pd.api.types.is_list_like(val) and pd.isna(val):
                    values[i][j] = null()
                # cast pd.Interval to str
                elif isinstance(val, pd.Interval):
                    log('found pd.Interval objects, they will be cast to str',
                        level=logging.WARNING)
                    values[i][j] = str(val)
        return values
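
A small illustration of those conversions outside the class, applying the same checks to a toy DataFrame (the data here is for illustration only):

# Illustration of the conversions above on a toy DataFrame (not part of pangres).
import pandas as pd
from sqlalchemy import null

df = pd.DataFrame({'ts': [pd.Timestamp('2021-01-01'), pd.NaT],
                   'interval': [pd.Interval(0, 1), pd.Interval(1, 2)]})
values = df.reset_index().values.tolist()
for i, row in enumerate(values):
    for j, val in enumerate(row):
        if isinstance(val, pd.Timestamp):
            values[i][j] = val.to_pydatetime()      # pd.Timestamp -> datetime.datetime
        elif not pd.api.types.is_list_like(val) and pd.isna(val):
            values[i][j] = null()                   # NaN/NaT -> SQL NULL
        elif isinstance(val, pd.Interval):
            values[i][j] = str(val)                 # pd.Interval -> str
print(values)
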
Example #4
    def adapt_dtype_of_empty_db_columns(self):
        """
        Changes the data types of empty columns in the SQL table defined
        in the given instance of a PandasSpecialEngine.

        This should only happen in case of data type mismatches,
        i.e. for columns where the sqlalchemy table model built from df
        and the model reflected from the SQL table have different data types.
        """
        empty_db_columns = self.get_empty_columns()
        db_table = self.get_db_table_schema()
        # if column does not have value in db and there are values
        # in the frame then change the column type if needed
        for col in empty_db_columns:
            # check if the column also exists in df
            if col.name not in self.df.columns:
                continue
            # check same type
            orig_type = db_table.columns[col.name].type.compile(self.engine.dialect)
            dest_type = self.table.columns[col.name].type.compile(self.engine.dialect)
            # remove character count e.g. "VARCHAR(50)" -> "VARCHAR" 
            orig_type = RE_CHARCOUNT_COL_TYPE.sub('', orig_type)
            dest_type = RE_CHARCOUNT_COL_TYPE.sub('', dest_type)
            # if same type or we want to insert TEXT instead of JSON continue
            # (JSON is not supported on some DBs so it's normal to have TEXT instead)
            if ((orig_type == dest_type) or
                ((orig_type == 'JSON') and (dest_type == 'TEXT'))):
                continue
            # grab the col/index from the df
            # so we can check if there are any values
            if col.name in self.df.index.names:
                df_col = self.df.index.get_level_values(col.name)
            else:
                df_col = self.df[col.name]
            if df_col.notna().any():
                # raise error if we have to modify the dtype but we have a SQlite engine
                # (SQLite does not support data type alteration)
                if self._db_type == 'sqlite':
                    raise ValueError('SQlite does not support column data type alteration!')
                with self.engine.connect() as con:
                    ctx = MigrationContext.configure(con)
                    op = Operations(ctx)
                    new_col = self.table.columns[col.name]
                    # check if postgres (in which case we have to use "using" syntax
                    # to alter columns data types)
                    if self._db_type == 'postgres':
                        escaped_col = str(new_col.compile(dialect=self.engine.dialect))
                        compiled_type = new_col.type.compile(dialect=self.engine.dialect)
                        alter_kwargs = {'postgresql_using':f'{escaped_col}::{compiled_type}'}
                    else:
                        alter_kwargs = {}
                    op.alter_column(table_name=self.table.name,
                                    column_name=new_col.name,
                                    type_=new_col.type,
                                    schema=self.schema,
                                    **alter_kwargs)
                    log(f"Changed type of column {new_col.name} "
                        f"from {col.type} to {new_col.type} "
                        f'in table {self.table.name} (schema="{self.schema}")')
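
`RE_CHARCOUNT_COL_TYPE` is defined elsewhere in pangres; a regex along these lines reproduces the behavior the comment above describes (an assumption, not necessarily the library's exact pattern):

# Assumed equivalent of RE_CHARCOUNT_COL_TYPE: strips a parenthesized length,
# e.g. "VARCHAR(50)" -> "VARCHAR".
import re

RE_CHARCOUNT_COL_TYPE = re.compile(r'\(\d+\)')
assert RE_CHARCOUNT_COL_TYPE.sub('', 'VARCHAR(50)') == 'VARCHAR'
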
Example #5
def test_custom_level(_, caplog):
    base_text = 'TESTING CUSTOM LOGGER LEVEL'
    with custom_logger(logger_level=logging.WARNING):
        # test lower log level -> should be hidden
        with caplog.at_level(logging.INFO):
            log(base_text + ' | THIS LOG SHOULD BE HIDDEN', level=logging.INFO)
        assert caplog.text == ''

        # test greater than or equal level
        expected = base_text + ' | THIS LOG SHOULD BE DISPLAYED'
        with caplog.at_level(logging.INFO):
            log(expected, level=logging.WARNING)
        assert len(caplog.records) == 1
        assert expected in caplog.text
Example #6
@contextmanager  # from contextlib; needed so `with custom_logger(...)` works (see the test above)
def custom_logger(logger_level):
    # kick out any already initialized logger before setting the environment variable,
    # not only from the `loggers` dict but also from the logging module, as its handlers
    # would still be present and cause duplicate output
    loggers.pop('pangres', None)
    logger = logging.getLogger('pangres')
    while logger.hasHandlers():
        try:
            logger.removeHandler(logger.handlers[0])
        except IndexError:
            break

    os.environ['PANGRES_LOG_LEVEL'] = str(logger_level)
    # make sure a logger is available (the log function initializes one if not present)
    log('DUMMY LOG', level=logging.DEBUG)
    try:
        yield
    finally:
        os.environ.pop('PANGRES_LOG_LEVEL', None)
Example #7
def fix_psycopg2_bad_cols(
        df: pd.DataFrame,
        replacements={
            '%': '',
            '(': '',
            ')': ''
        }) -> pd.DataFrame:
    """
    Replaces '%', '(' and ')' (characters that won't play nicely or even
    at all with psycopg2) in column and index names in a deep copy of df.
    This is a workaround for the unresolved issue
    described here: https://github.com/psycopg/psycopg2/issues/167

    **IMPORTANT**:
    You will need to apply the same changes in the database as
    well if the SQL table already exists for a given DataFrame.
    Otherwise you will, for instance, end up with both a column
    "total_%" and a column "total_" in your SQL table.

    Parameters
    ----------
    df : pd.DataFrame
    replacements : dict {'%':str, '(':str, ')':str}, default {'%':'', '(':'', ')':''}
        The keys '%', '(' and ')' are mandatory.
        There cannot be any extra keys.

    Returns
    -------
    new_df : pd.DataFrame

    Examples
    --------
    * fix bad col/index names with default replacements (empty string for '(', ')' and '%')
    >>> import pandas as pd
    >>> df = pd.DataFrame({'test()':[0],
    ...                    'foo()%':[0]}).set_index('test()')
    >>> print(df.to_markdown())
    |   test() |   foo()% |
    |---------:|---------:|
    |        0 |        0 |

    >>> df_fixed = fix_psycopg2_bad_cols(df)
    >>> print(df_fixed.to_markdown())
    |   test |   foo |
    |-------:|------:|
    |      0 |     0 |

    * fix bad col/index names with custom replacements - you MUST provide replacements for '(', ')' and '%'!
    >>> import pandas as pd
    >>> df = pd.DataFrame({'test()':[0],
    ...                    'foo()%':[0]}).set_index('test()')
    >>> print(df.to_markdown())
    |   test() |   foo()% |
    |---------:|---------:|
    |        0 |        0 |

    >>> df_fixed = fix_psycopg2_bad_cols(df, replacements={'%':'percent', '(':'', ')':''})
    >>> print(df_fixed.to_markdown())
    |   test |   foopercent |
    |-------:|-------------:|
    |      0 |            0 |
    """
    # verify all index levels are named
    index_names = list(df.index.names)
    if any([ix_name is None for ix_name in index_names]):
        raise IndexError("All index levels must be named!")
    # verify duplicated columns
    fields = list(df.index.names) + df.columns.tolist()
    if len(set(fields)) != len(fields):
        raise ValueError(("There cannot be duplicated names amongst "
                          "index levels and/or columns!"))
    # verify replacements arg
    expected_keys = ('%', '(', ')')
    if ((not isinstance(replacements, dict))
            or (set(replacements.keys()) - set(expected_keys) != set())
            or (len(replacements) != len(expected_keys))):
        raise TypeError(
            f'replacements must be a dict containing the following keys (and none other): {expected_keys}'
        )
    if not all((isinstance(v, str) for v in replacements.values())):
        raise TypeError('The values of replacements must all be strings')
    # replace bad col names
    translator = {ord(k): v for k, v in replacements.items()}
    new_df = df.copy(deep=True)
    renamer = lambda col: col.translate(translator) if isinstance(col, str) else col
    new_df = new_df.rename(columns=renamer).rename_axis(index=renamer)
    # check columns are unique after renaming
    fields = list(new_df.index.names) + new_df.columns.tolist()
    if len(set(fields)) != len(fields):
        raise ValueError("Columns/index are not unique after renaming!")
    # compare columns (and index)
    before = df.reset_index().columns.tolist()
    after = new_df.reset_index().columns.tolist()
    for i, j in zip(before, after):
        if isinstance(i, str) and isinstance(j, str) and i != j:
            log(f'Renamed column/index "{i}" to "{j}"')
    return new_df
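
The renaming itself relies on `str.translate` with a mapping keyed by Unicode code points, which is why the translator is built with `ord`. A tiny illustration of that mechanism on its own:

# The core renaming mechanism: str.translate with an ord-keyed mapping.
replacements = {'%': 'percent', '(': '', ')': ''}
translator = {ord(k): v for k, v in replacements.items()}
print('foo()%'.translate(translator))  # foopercent
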
Example #8
def test_bad_log_level(_):
    with pytest.raises(ValueError) as exc_info:
        log('test', level=5)
    assert 'not a valid log level' in str(exc_info.value)
Example #9
def adjust_chunksize(con: Connectable, df: pd.DataFrame, chunksize: int):
    """
    Checks if given `chunksize` is appropriate for upserting rows in given database using
    given DataFrame.
    The `chunksize` represents the number of rows you would like to upsert in a chunk when
    using the `pangres.upsert` function.

    Some databases have limitations on the number of SQL parameters and we need one parameter
    per value for upserting data.
    E.g. a DataFrame of 5 **columns+index levels** and 4 rows will require 5*4=20 SQL parameters.

    This function will check the database type (e.g. SQlite) and the number of **columns+index levels**
    to determine if the `chunksize` does not exceed limits and propose a lower one if it does.
    Otherwise the same `chunksize` that you gave as input is returned.

    This function currently takes into account max parameters limitations for the following cases:
    * sqlite (32766 max for version >= 3.22.0 otherwise 999)
    * asyncpg (32767 max)

    If you know about more parameter limitations relevant for this library (PostgreSQL, MySQL, SQlite
    or other databases that I have not tested with this library but that you managed to get working),
    please contact me.

    Parameters
    ----------
    con
        sqlalchemy Engine or Connection
    df
        DataFrame you would wish to upsert
    chunksize
        Size of chunks you would wish to use for upserting (represents the number of rows
        in each chunk)

    Raises
    ------
    TooManyColumnsForUpsertException
        When a DataFrame has more columns+index levels than the maximum number of allowed SQL variables
        for a SQL query for given database.
        In such a case even inserting row by row would not be possible because we would already
        have too many variables.
        For more information you can for instance google "SQLITE_MAX_VARIABLE_NUMBER".

    Examples
    --------
    >>> import pandas as pd
    >>> from sqlalchemy import create_engine
    >>>
    >>> # config (this assumes you have SQlite version >= 3.22.0)
    >>> engine = create_engine("sqlite://")
    >>>
    >>> # some df we want to upsert
    >>> df = pd.DataFrame({'name':['Albert']}).rename_axis(index='profileid')
    >>> print(df.to_markdown())
    |   profileid | name   |
    |------------:|:-------|
    |           0 | Albert |

    >>> # adjust chunksize: 100,000 is too big of a chunksize in general for given database
    >>> # SQlite only allows 32766 parameters (values) at once maximum in a query
    >>> # since we have two columns (technically 1 column + 1 index level)
    >>> # we can only upsert in chunks of FLOOR(32766/2) rows maximum which is 16383
    >>> adjust_chunksize(con=engine, df=df, chunksize=100_000)
    16383
    """
    validate_chunksize_param(chunksize=chunksize)

    # get maximum number of parameters depending on the database
    dialect = con.dialect.dialect_description
    if 'sqlite' in dialect:
        maximum = 32766 if _sqlite_gt3_32_0() else 999
    elif 'asyncpg' in dialect:
        maximum = 32767
    else:
        maximum = None

    # simple case we can solve early
    if maximum is None:
        return chunksize

    # adjust chunksize
    new_chunksize = floor(maximum / (len(df.columns) + df.index.nlevels))
    if new_chunksize < 1:
        raise TooManyColumnsForUpsertException(
            'The df has more columns+index levels than the maximum number of allowed parameters '
            'per query for the given database (we could not even upsert row by row).'
        )
    if chunksize > new_chunksize:
        log(
            f'Reduced chunksize from {chunksize} to {new_chunksize} due '
            f'to max variable restriction of given dialect (max {maximum} for dialect {dialect}).',
            level=logging.INFO)
        chunksize = new_chunksize
    return chunksize
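
The arithmetic from the docstring example, spelled out on its own (the limit assumes SQLite >= 3.22.0):

# chunksize arithmetic from the example above: SQLite >= 3.22.0 allows 32766 parameters
# per query, the df has 1 column + 1 index level, so floor(32766 / 2) = 16383 rows per chunk.
from math import floor

max_params = 32766
columns_plus_index_levels = 1 + 1
print(floor(max_params / columns_plus_index_levels))  # 16383
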
Example #10
    def upsert(self, if_row_exists, chunksize=10000):
        """
        Generates and executes an upsert statement (insert update or
        insert ignore depending on `if_row_exists`)
        for the given instance of PandasSpecialEngine.

        The values of df will be upserted with different sqlalchemy
        methods depending on the dialect (e.g. using
        sqlalchemy.dialects.postgresql.insert for postgres).
        See more information under pangres.upsert.

        Parameters
        ----------
        if_row_exists : {'ignore', 'update'}
            If 'ignore', rows where the primary key matches are left
            untouched.
            If 'update', rows where the primary key matches are updated
            using what's available in df.
            In both cases new rows are inserted for primary keys that
            are not yet present in the table.
        chunksize : int > 0, default 10000
            Number of values to be inserted at once,
            an integer strictly above zero.
        """
        # VERIFY ARGUMENTS
        if if_row_exists not in ('ignore', 'update'):
            raise ValueError('if_row_exists must be "ignore" or "update"')
        # convert values if needed
        values = self._get_values_to_insert()
        # recalculate chunksize for sqlite
        if self._db_type == 'sqlite':
            # to circumvent SQlite's max of 999 parameters we have to
            # make sure chunksize is not too high. Tables with more than 999
            # columns cannot be handled at all because even single row
            # inserts would already require too many variables.
            # (you can google "SQLITE_MAX_VARIABLE_NUMBER" for more info)
            new_chunksize = floor(999 / len(self.table.columns))
            if new_chunksize < 1:
                # case > 999 columns
                err = ('Updating SQlite tables with more than 999 columns is '
                       'not supported due to max variables restriction (999 max). '
                       'If you know a way around that please let me know '
                       '(e.g. post a GitHub issue)!')
                raise NotImplementedError(err)
            if chunksize > new_chunksize:
                log(f'Reduced chunksize from {chunksize} to {new_chunksize} due '
                    'to SQlite max variable restriction (max 999).',
                    level=logging.WARNING)
                chunksize = new_chunksize
        # create chunks
        chunks = self._create_chunks(values=values, chunksize=chunksize)
        upsert_funcs = {"postgres":postgres_upsert,
                        "mysql":mysql_upsert,
                        "sqlite":sqlite_upsert,
                        "other":sqlite_upsert}
        upsert_func = upsert_funcs[self._db_type]
        with self.engine.connect() as con:
            for chunk in chunks:
                upsert = upsert_func(engine=self.engine,
                                     connection=con,
                                     table=self.table,
                                     values=chunk,
                                     if_row_exists=if_row_exists)
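
`_create_chunks` is defined elsewhere in PandasSpecialEngine; a helper along these lines would produce the row chunks the loop consumes (a sketch of the idea, not the library's code):

# Sketch of a chunking helper equivalent to what the method expects (hypothetical).
def create_chunks(values, chunksize):
    """Yield successive slices of `values`, each with at most `chunksize` rows."""
    for start in range(0, len(values), chunksize):
        yield values[start:start + chunksize]

# e.g. list(create_chunks([[1], [2], [3]], chunksize=2)) == [[[1], [2]], [[3]]]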