def test_log_output(_, caplog, test_level):
    txt = 'TESTING LOG OUTPUT AND LEVEL'
    with caplog.at_level(logging.INFO):
        log(txt, level=test_level)
    assert len(caplog.records) == 1
    assert caplog.records[0].levelname == log_level_values_to_names[test_level]
    assert txt in caplog.text
def add_new_columns(self):
    """
    Adds columns present in df but not in the SQL table
    for given instance of PandasSpecialEngine.

    Notes
    -----
    Sadly, it seems that we cannot create JSON columns.
    """
    # create deepcopies of the columns because we are going to unbind
    # them from the table model (otherwise alembic would think we add
    # a column that already exists in the database)
    cols_to_add = [deepcopy(col) for col in self.table.columns
                   if col.name not in self.get_db_columns_names()]
    # check columns are not index levels
    if any((c.name in self.df.index.names for c in cols_to_add)):
        raise ValueError(('Cannot add any column that is part of the df index!\n'
                          "You'll have to update your table primary key or change your "
                          "df index"))

    with self.engine.connect() as con:
        ctx = MigrationContext.configure(con)
        op = Operations(ctx)
        for col in cols_to_add:
            col.table = None  # Important! unbind the column from its table
            op.add_column(self.table.name, col, schema=self.schema)
            log(f"Added column {col} (type: {col.type}) in table {self.table.name} "
                f'(schema="{self.schema}")')
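
# Standalone sketch (not part of the library) of the alembic "ad-hoc operations"
# pattern used by `add_new_columns` above: a MigrationContext is configured from an
# open connection and wrapped in an Operations object, which can then emit DDL such
# as ADD COLUMN outside of a migration script. The engine, table and column names
# below are made up for illustration.
def _demo_alembic_add_column():  # illustration only, not part of pangres
    import sqlalchemy as sa
    from alembic.migration import MigrationContext
    from alembic.operations import Operations

    engine = sa.create_engine('sqlite://')
    with engine.connect() as con:
        con.execute(sa.text('CREATE TABLE example (id INTEGER PRIMARY KEY)'))
        ctx = MigrationContext.configure(con)
        op = Operations(ctx)
        # the column must not be bound to an existing Table object,
        # which is why `add_new_columns` sets `col.table = None` first
        op.add_column('example', sa.Column('new_col', sa.Text()))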
def _get_values_to_insert(self):
    """
    Gets the values to be inserted from the pandas DataFrame
    defined in given instance of PandasSpecialEngine
    to the corresponding SQL table.

    Returns
    -------
    values : list
        Values from the df attribute that may have been converted
        for SQL compatibility e.g. pd.Timestamp will be converted
        to datetime.datetime objects.
    """
    # this seems to be the most reliable way to unpack
    # the DataFrame. For instance using df.to_dict(orient='records')
    # can introduce types such as numpy integer which we'd have to deal with
    values = self.df.reset_index().values.tolist()
    for i in range(len(values)):
        row = values[i]
        for j in range(len(row)):
            val = row[j]
            # replace pd.Timestamp with datetime.datetime
            if isinstance(val, pd.Timestamp):
                values[i][j] = val.to_pydatetime()
            # check if na unless it is list like
            elif not pd.api.types.is_list_like(val) and pd.isna(val):
                values[i][j] = null()
            # cast pd.Interval to str
            elif isinstance(val, pd.Interval):
                log('found pd.Interval objects, they will be cast to str',
                    level=logging.WARNING)
                values[i][j] = str(val)
    return values
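
# Minimal sketch (not part of the library) of the cell-level conversions that
# `_get_values_to_insert` applies: pd.Timestamp -> datetime.datetime,
# scalar NA -> sqlalchemy.null(), pd.Interval -> str. The data below is made up.
def _demo_value_conversions():  # illustration only, not part of pangres
    import datetime
    import pandas as pd
    from sqlalchemy import null

    row = [pd.Timestamp('2021-01-01 12:00'), float('nan'), pd.Interval(0, 5)]
    converted = []
    for val in row:
        if isinstance(val, pd.Timestamp):
            converted.append(val.to_pydatetime())
        elif not pd.api.types.is_list_like(val) and pd.isna(val):
            converted.append(null())
        elif isinstance(val, pd.Interval):
            converted.append(str(val))
        else:
            converted.append(val)
    assert isinstance(converted[0], datetime.datetime)
    assert isinstance(converted[2], str)
    return converted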
def adapt_dtype_of_empty_db_columns(self):
    """
    Changes the data types of empty columns in the SQL table defined
    in given instance of a PandasSpecialEngine.

    This should only happen in case of data type mismatches.
    This means with columns for which the sqlalchemy table
    model for df and the model for the SQL table have different data types.
    """
    empty_db_columns = self.get_empty_columns()
    db_table = self.get_db_table_schema()
    # if column does not have value in db and there are values
    # in the frame then change the column type if needed
    for col in empty_db_columns:
        # check if the column also exists in df
        if col.name not in self.df.columns:
            continue
        # check same type
        orig_type = db_table.columns[col.name].type.compile(self.engine.dialect)
        dest_type = self.table.columns[col.name].type.compile(self.engine.dialect)
        # remove character count e.g. "VARCHAR(50)" -> "VARCHAR"
        orig_type = RE_CHARCOUNT_COL_TYPE.sub('', orig_type)
        dest_type = RE_CHARCOUNT_COL_TYPE.sub('', dest_type)
        # if same type or we want to insert TEXT instead of JSON continue
        # (JSON is not supported on some DBs so it's normal to have TEXT instead)
        if ((orig_type == dest_type) or
                ((orig_type == 'JSON') and (dest_type == 'TEXT'))):
            continue
        # grab the col/index from the df
        # so we can check if there are any values
        if col.name in self.df.index.names:
            df_col = self.df.index.get_level_values(col.name)
        else:
            df_col = self.df[col.name]
        if df_col.notna().any():
            # raise error if we have to modify the dtype but we have a SQlite engine
            # (SQLite does not support data type alteration)
            if self._db_type == 'sqlite':
                raise ValueError('SQlite does not support column data type alteration!')
            with self.engine.connect() as con:
                ctx = MigrationContext.configure(con)
                op = Operations(ctx)
                new_col = self.table.columns[col.name]
                # check if postgres (in which case we have to use "using" syntax
                # to alter columns data types)
                if self._db_type == 'postgres':
                    escaped_col = str(new_col.compile(dialect=self.engine.dialect))
                    compiled_type = new_col.type.compile(dialect=self.engine.dialect)
                    alter_kwargs = {'postgresql_using': f'{escaped_col}::{compiled_type}'}
                else:
                    alter_kwargs = {}
                op.alter_column(table_name=self.table.name,
                                column_name=new_col.name,
                                type_=new_col.type,
                                schema=self.schema,
                                **alter_kwargs)
                log(f"Changed type of column {new_col.name} "
                    f"from {col.type} to {new_col.type} "
                    f'in table {self.table.name} (schema="{self.schema}")')
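
# Standalone sketch (not part of the library) of the PostgreSQL branch of
# `adapt_dtype_of_empty_db_columns`: altering a column's type via alembic with the
# dialect-specific "USING" clause. The connection, table and column names are
# placeholders; this only shows the shape of the call.
def _demo_alter_column_postgres(con):  # illustration only, not part of pangres
    import sqlalchemy as sa
    from alembic.migration import MigrationContext
    from alembic.operations import Operations

    ctx = MigrationContext.configure(con)
    op = Operations(ctx)
    # e.g. change an empty TEXT column to TIMESTAMP; PostgreSQL requires an
    # explicit cast, which alembic forwards via the `postgresql_using` kwarg
    op.alter_column(table_name='example',
                    column_name='some_col',
                    type_=sa.TIMESTAMP(),
                    postgresql_using='some_col::timestamp')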
def test_custom_level(_, caplog):
    base_text = 'TESTING CUSTOM LOGGER LEVEL'
    with custom_logger(logger_level=logging.WARNING):
        # test lower log level -> should be hidden
        with caplog.at_level(logging.INFO):
            log(base_text + ' | THIS LOG SHOULD BE HIDDEN', level=logging.INFO)
        assert caplog.text == ''

        # test greater than or equal level
        expected = base_text + ' | THIS LOG SHOULD BE DISPLAYED'
        with caplog.at_level(logging.INFO):
            log(expected, level=logging.WARNING)
        assert len(caplog.records) == 1
        assert expected in caplog.text
@contextmanager
def custom_logger(logger_level):
    # kick out any already initialized logger before setting the environment variable,
    # not only from the `loggers` dict but also from the logging module as the handlers
    # will still be present and cause duplicates
    loggers.pop('pangres', None)
    logger = logging.getLogger('pangres')
    while logger.hasHandlers():
        try:
            logger.removeHandler(logger.handlers[0])
        except IndexError:
            break
    os.environ['PANGRES_LOG_LEVEL'] = str(logger_level)
    # make sure a logger is available (initialized if not present by the log function)
    log('DUMMY LOG', level=logging.DEBUG)
    try:
        yield
    finally:
        os.environ.pop('PANGRES_LOG_LEVEL', None)
def fix_psycopg2_bad_cols(df: pd.DataFrame,
                          replacements={'%': '', '(': '', ')': ''}) -> pd.DataFrame:
    """
    Replaces '%', '(' and ')' (characters that won't play nicely or even
    at all with psycopg2) in column and index names in a deep copy of df.
    This is a workaround for the unresolved issue described here:
    https://github.com/psycopg/psycopg2/issues/167

    **IMPORTANT**:
    You will need to apply the same changes in the database as well
    if the SQL table already exists for a given DataFrame.
    Otherwise you will for instance end up with a column "total_%" and
    "total_" in your SQL table.

    Parameters
    ----------
    df : pd.DataFrame
    replacements : dict {'%':str, '(':str, ')':str}, default {'%':'', '(':'', ')':''}
        The keys '%', '(' and ')' are mandatory.
        There cannot be any extra keys.

    Returns
    -------
    new_df : pd.DataFrame

    Examples
    --------
    * fix bad col/index names with default replacements (empty string for '(', ')' and '%')

    >>> import pandas as pd
    >>> df = pd.DataFrame({'test()':[0],
    ...                    'foo()%':[0]}).set_index('test()')
    >>> print(df.to_markdown())
    |   test() |   foo()% |
    |---------:|---------:|
    |        0 |        0 |

    >>> df_fixed = fix_psycopg2_bad_cols(df)
    >>> print(df_fixed.to_markdown())
    |   test |   foo |
    |-------:|------:|
    |      0 |     0 |

    * fix bad col/index names with custom replacements - you MUST provide replacements for '(', ')' and '%'!

    >>> import pandas as pd
    >>> df = pd.DataFrame({'test()':[0],
    ...                    'foo()%':[0]}).set_index('test()')
    >>> print(df.to_markdown())
    |   test() |   foo()% |
    |---------:|---------:|
    |        0 |        0 |

    >>> df_fixed = fix_psycopg2_bad_cols(df, replacements={'%':'percent', '(':'', ')':''})
    >>> print(df_fixed.to_markdown())
    |   test |   foopercent |
    |-------:|-------------:|
    |      0 |            0 |
    """
    # verify all index levels are named
    index_names = list(df.index.names)
    if any([ix_name is None for ix_name in index_names]):
        raise IndexError("All index levels must be named!")

    # verify there are no duplicated names amongst index levels and columns
    fields = list(df.index.names) + df.columns.tolist()
    if len(set(fields)) != len(fields):
        raise ValueError(("There cannot be duplicated names amongst "
                          "index levels and/or columns!"))

    # verify replacements arg
    expected_keys = ('%', '(', ')')
    if ((not isinstance(replacements, dict)) or
            (set(replacements.keys()) - set(expected_keys) != set()) or
            (len(replacements) != len(expected_keys))):
        raise TypeError(f'replacements must be a dict containing the following keys (and none other): {expected_keys}')
    if not all((isinstance(v, str) for v in replacements.values())):
        raise TypeError('The values of replacements must all be strings')

    # replace bad col names
    translator = {ord(k): v for k, v in replacements.items()}
    new_df = df.copy(deep=True)
    renamer = lambda col: col.translate(translator) if isinstance(col, str) else col
    new_df = new_df.rename(columns=renamer).rename_axis(index=renamer)

    # check columns are unique after renaming
    fields = list(new_df.index.names) + new_df.columns.tolist()
    if len(set(fields)) != len(fields):
        raise ValueError("Columns/index are not unique after renaming!")

    # log any renamed column/index by comparing names before and after
    before = df.reset_index().columns.tolist()
    after = new_df.reset_index().columns.tolist()
    for i, j in zip(before, after):
        if isinstance(i, str) and isinstance(j, str) and i != j:
            log(f'Renamed column/index "{i}" to "{j}"')
    return new_df
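
# Small sketch (not part of the library) of the renaming mechanism used above:
# str.translate with a mapping keyed by code points (hence the ord() calls),
# which replaces every occurrence of '%', '(' and ')' in a single pass.
def _demo_translate_bad_chars():  # illustration only, not part of pangres
    replacements = {'%': '', '(': '', ')': ''}
    translator = {ord(k): v for k, v in replacements.items()}
    assert 'foo()%'.translate(translator) == 'foo'
    assert 'total_%'.translate(translator) == 'total_'
    return translator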
def test_bad_log_level(_):
    with pytest.raises(ValueError) as exc_info:
        log('test', level=5)
    assert 'not a valid log level' in str(exc_info.value)
def adjust_chunksize(con: Connectable, df: pd.DataFrame, chunksize: int):
    """
    Checks if given `chunksize` is appropriate for upserting rows in given database
    using given DataFrame.
    The `chunksize` represents the number of rows you would like to upsert in a chunk
    when using the `pangres.upsert` function.

    Some databases have limitations on the number of SQL parameters and we need one
    parameter per value for upserting data.
    E.g. a DataFrame of 5 **columns+index levels** and 4 rows will require 5*4=20
    SQL parameters.

    This function will check the database type (e.g. SQlite) and the number of
    **columns+index levels** to determine if the `chunksize` does not exceed limits
    and propose a lower one if it does.
    Otherwise the same `chunksize` that you gave as input is returned.

    This function currently takes into account max parameters limitations for the
    following cases:
    * sqlite (32766 max for version >= 3.32.0 otherwise 999)
    * asyncpg (32767 max)

    If you know about more parameter limitations relevant for this library
    (PostgreSQL, MySQL, SQlite or other databases I have not tested with this library
    that you managed to have working), please contact me.

    Parameters
    ----------
    con
        sqlalchemy Engine or Connection
    df
        DataFrame you would wish to upsert
    chunksize
        Size of chunks you would wish to use for upserting
        (represents the number of rows in each chunk)

    Raises
    ------
    TooManyColumnsForUpsertException
        When a DataFrame has more columns+index levels than the maximum number of
        allowed SQL variables for a SQL query for given database.
        In such a case even inserting row by row would not be possible because
        we would already have too many variables.
        For more information you can for instance google "SQLITE_MAX_VARIABLE_NUMBER".

    Examples
    --------
    >>> from sqlalchemy import create_engine
    >>>
    >>> # config (this assumes you have SQlite version >= 3.32.0)
    >>> engine = create_engine("sqlite://")
    >>>
    >>> # some df we want to upsert
    >>> df = pd.DataFrame({'name':['Albert']}).rename_axis(index='profileid')
    >>> print(df.to_markdown())
    |   profileid | name   |
    |------------:|:-------|
    |           0 | Albert |

    >>> # adjust chunksize: 100,000 is too big of a chunksize in general for given database
    >>> # SQlite only allows 32766 parameters (values) at once maximum in a query
    >>> # since we have two columns (technically 1 column + 1 index level)
    >>> # we can only upsert in chunks of FLOOR(32766/2) rows maximum which is 16383
    >>> adjust_chunksize(con=engine, df=df, chunksize=100_000)
    16383
    """
    validate_chunksize_param(chunksize=chunksize)

    # get maximum number of parameters depending on the database
    dialect = con.dialect.dialect_description
    if 'sqlite' in dialect:
        maximum = 32766 if _sqlite_gt3_32_0() else 999
    elif 'asyncpg' in dialect:
        maximum = 32767
    else:
        maximum = None

    # simple case we can solve early
    if maximum is None:
        return chunksize

    # adjust chunksize
    new_chunksize = floor(maximum / (len(df.columns) + df.index.nlevels))
    if new_chunksize < 1:
        raise TooManyColumnsForUpsertException(
            'The df has more columns+index levels than the maximum number of allowed '
            'parameters for given database for a query (we could not even upsert row by row).')
    if chunksize > new_chunksize:
        log(f'Reduced chunksize from {chunksize} to {new_chunksize} due '
            f'to max variable restriction of given dialect (max {maximum} for dialect {dialect}).',
            level=logging.INFO)
        chunksize = new_chunksize
    return chunksize
def upsert(self, if_row_exists, chunksize=10000):
    """
    Generates and executes an upsert (insert update or
    insert ignore depending on :if_row_exists:) statement
    for given instance of PandasSpecialEngine.

    The values of df will be upserted with different sqlalchemy
    methods depending on the dialect (e.g. using
    sqlalchemy.dialects.postgresql.insert for postgres).
    See more information under pangres.upsert.

    Parameters
    ----------
    if_row_exists : {'ignore', 'update'}
        If 'ignore' where the primary key matches nothing is updated.
        If 'update' where the primary key matches the values are updated
        using what's available in df.
        In both cases rows are inserted for non primary keys.
    chunksize : int > 0, default 10000
        Number of values to be inserted at once,
        an integer strictly above zero.
    """
    # VERIFY ARGUMENTS
    if if_row_exists not in ('ignore', 'update'):
        raise ValueError('if_row_exists must be "ignore" or "update"')
    # convert values if needed
    values = self._get_values_to_insert()

    # recalculate chunksize for sqlite
    if self._db_type == 'sqlite':
        # to circumvent the max of 999 parameters for sqlite we have to
        # make sure chunksize is not too high. Also I don't know how to
        # deal with tables that have more than 999 columns because even
        # with single row inserts it's already too many variables.
        # (you can google "SQLITE_MAX_VARIABLE_NUMBER" for more info)
        new_chunksize = floor(999 / len(self.table.columns))
        if new_chunksize < 1:
            # case > 999 columns
            err = ('Updating SQlite tables with more than 999 columns is '
                   'not supported due to max variables restriction (999 max). '
                   'If you know a way around that please let me know '
                   '(e.g. post a GitHub issue)!')
            raise NotImplementedError(err)
        if chunksize > new_chunksize:
            log(f'Reduced chunksize from {chunksize} to {new_chunksize} due '
                'to SQlite max variable restriction (max 999).',
                level=logging.WARNING)
            chunksize = new_chunksize

    # create chunks
    chunks = self._create_chunks(values=values, chunksize=chunksize)
    upsert_funcs = {"postgres": postgres_upsert,
                    "mysql": mysql_upsert,
                    "sqlite": sqlite_upsert,
                    "other": sqlite_upsert}
    upsert_func = upsert_funcs[self._db_type]
    with self.engine.connect() as con:
        for chunk in chunks:
            upsert = upsert_func(engine=self.engine,
                                 connection=con,
                                 table=self.table,
                                 values=chunk,
                                 if_row_exists=if_row_exists)
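
# Minimal sketch (not part of the library) of how rows can be split into chunks of
# at most `chunksize` rows before being handed to the dialect-specific upsert
# function. The real `self._create_chunks` is not shown in this section, so this
# is only an assumption about its behaviour, written for illustration.
def _demo_create_chunks(values, chunksize):  # illustration only, not part of pangres
    return [values[i:i + chunksize] for i in range(0, len(values), chunksize)]

# e.g. _demo_create_chunks([[1], [2], [3], [4], [5]], chunksize=2)
# -> [[[1], [2]], [[3], [4]], [[5]]]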