def diff_df(df1: pd.DataFrame, df2: pd.DataFrame, keys: List[Col], ignores: List[Col]) -> Tuple[DiffDict, Status]: """Find diffs as a DiffDict of changes from df1 to df2.""" dd: DiffDict = {'adds': None, 'mods': [], 'retires': None} dim_status = compare_dims(df1, df2, True, False) if dim_status != OK(): return dd, dim_status df1_, df2_, retire_df, new_df = symm_diff_df(df1, df2, keys) mods, dim_status = find_mods(df1_, df2_, keys, ignores) if dim_status != OK(): return dd, dim_status return {'adds': new_df, 'mods': mods, 'retires': retire_df}, OK()
def gen_pk_uniq(pk: List[Col], uniq: List[Col]) -> Tuple[str, Status]: """Generate Primary Key or Unique substring.""" s_pk = '' if not pk else 'PRIMARY KEY({})'.format(', '.join(pk)) s_uniq = '' if not uniq else 'UNIQUE({})'.format(', '.join(uniq)) s = ('' if not s_pk and not s_uniq else f'{s_pk}\n' if s_pk and not s_uniq else f'{s_uniq}\n' if s_uniq and not s_pk else f'{s_pk},\n{s_uniq}\n') return s, OK()
def query_df(cur: Cursor, q: str) -> Tuple[pd.DataFrame, Status]: """Execute SQL query string and return result as DataFrame.""" rows, status = query(cur, q, True) if status != OK() or not rows: logger.error(f'Query {q} returned nvalid status or no data') return pd.DataFrame(), status df = pd.DataFrame(rows[1:], columns=rows[0]) return df, status
def gen_cols(cols: List[SchemaCol]) -> Tuple[str, Status]: """Generate columns substring.""" status: Status if not cols: return '', OK() s, status = '', OK() for col in cols: name, dtype, pk, uniq, not_null = col if pk and uniq: s, status = '', Error(f'Col {name} specified as both PK and Uniq') break dtype_ = dtype_to_str(dtype) s += f'{name} {dtype_}' s += ' PRIMARY KEY' if pk else ' UNIQUE' if uniq else '' s += ' NOT NULL,\n' if not_null else ',\n' return s, status
def gen_create_stmt(td: TableDef) -> Tuple[str, Status]: """Translate TableDef into Create Table statement.""" status: Status create, s1 = gen_create(td['if_not_exists'], td['name']) cols, s2 = gen_cols(td['cols']) fks, s3 = gen_fks(td['fks']) pk_uniq, s4 = gen_pk_uniq(td['pk'], td['uniq']) if all(s == OK() for s in (s1, s2, s3, s4)): spec = strip_comma(f'{cols}{fks}{pk_uniq}') stmt = f'{create}({spec});' status = OK() else: stmt = '' msg = ' | '.join(s.msg for s in (s1, s2, s3, s4) if s != OK()) status = Error(msg) return stmt, status
def create(cur: Cursor, stmt: str) -> Status: """Create table.""" status: Status try: cur.execute(stmt) status = OK() logger.info(f'Create statement executed: {stmt}') except Exception as e: logger.error(f'Create statement exception: {stmt}; {e}') status = Error(str(e)) return status
def validate_insert(cur: Cursor, table: str, rows: Rows, schema_cast: bool) -> Tuple[SqliteSchema, RowsPair]: """Validate insertion cols and optionally try casting to schema dtypes.""" q = f'SELECT sql FROM sqlite_master WHERE type="table" and name="{table}"' ret, status = query(cur, q) if not ret or status != OK(): msg = f'Schema validation query {q} failed' logger.error(msg) return [], ([], Error(msg)) schema = parse_schema(ret[0][0]) if len(schema) != len(rows[0]): msg = f'Insertion validation error: {table} has ' msg += f'{len(schema)} cols vs input {len(rows[0])} cols' logger.error(msg) return [], ([], Error(msg)) return (schema, apply_schema(schema, rows) if schema_cast else (rows, OK()))
def __connect__(self) -> None: """Establish DB connection.""" if self.db_type is DB_Type.SQLITE: self.conn = sqlite3.connect(self.db_name) self.cur = self.conn.cursor() self.status = OK() self.conn.execute('PRAGMA foreign_keys = 1') logger.info('Connected to Sqlite DB: {}'.format(self.db_name)) else: self.status = self.INVALID_STATUS logger.error('DB conn failed: {}'.format(self.INVALID_STATUS.msg))
def close(conn) -> Status: """Close DB connection object.""" status: Status try: conn.close() status = OK() logger.info('Closed DB conn.') except Exception as e: status = Error(str(e)) logger.error('Failed to close DB conn: {}'.format(str(e))) return status
def compare_dims(df1: pd.DataFrame, df2: pd.DataFrame, cols: bool = True, rows: bool = False) -> Status: """Compare DF dimensions for compatibility.""" status: Status cols1, cols2 = len(df1.columns), len(df2.columns) rows1, rows2 = len(df1), len(df2) if not cols and not rows: status = OK() elif cols and not rows: status = (OK() if cols1 == cols2 else Error(f'Cols mismatch: {cols1} vs {cols2}')) elif rows and not cols: status = (OK() if rows1 == rows2 else Error(f'Rows mismatch: {rows1} vs {rows2}')) else: status = (OK() if rows1 == rows2 and cols1 == cols2 else Error('Matrix mismatch: ' + f'{rows1} x {cols1} vs {rows2} x {cols2}')) return status
def find_mods(df1: pd.DataFrame, df2: pd.DataFrame, keys: List[Col], ignores: List[Col] = []) -> Tuple[List[Mod], Status]: """Find all changes as Mods from df1 to df2.""" diff_cols = [ col for col in df1.columns if col not in keys and col not in ignores ] dim_status = compare_dims(df1, df2, True, True) if dim_status != OK(): return [], dim_status f = lambda x: x.sort_values(by=keys).reset_index(drop=True) df1_, df2_ = f(df1), f(df2) mask = (df1_ != df2_).any(axis=1) df = df1_[mask].merge(df2_[mask], on=keys, suffixes=['_old', '_new']) pairs = [ gen_mod(t._asdict(), keys, diff_cols) for t in df.itertuples(index=False) ] return [pair for pair in pairs if pair[1]], OK()
def insert(conn: Conn, cur: Cursor, table: str, rows: Rows, schema_cast: bool = True) -> Status: """Attempt to execute SQL insertion into specified table.""" status: Status schema, (rows_, v) = validate_insert(cur, table, rows, schema_cast) if v != OK(): return v try: cols = ','.join(name for name, _ in schema) vals = ','.join('?' * len(schema)) i = f'INSERT INTO {table}({cols}) VALUES ({vals})' cur.executemany(i, rows_) conn.commit() status = OK() logger.info('Insertion to {} executed: {}'.format(table, i)) except Exception as e: status = Error(str(e)) logger.error('Insertion exception for {}: {}'.format(i, str(e))) return status
def apply_schema(schema: SqliteSchema, rows: Rows) -> RowsPair: """Attempt to cast rows to primitive types in schema.""" status: Status = OK() try: rows_ = [] for row in rows: row_ = [cast(elem) for elem, (_, cast) in zip(row, schema)] rows_.append(row_) except Exception as e: msg = f'Insertion validation error: exception while casting {row}: {e}' status = Error(msg) logger.error(msg) return rows_, status
def query(cur: Cursor, q: str, hdr: bool = False) -> RowsPair: """Execute SQL query string.""" status: Status try: result = cur.execute(q) if hdr: cols = [d[0] for d in result.description] rows = [cols] + [list(row) for row in result.fetchall()] else: rows = [list(row) for row in result.fetchall()] status = OK() logger.info(f'Query executed: {q}') except Exception as e: logger.error(f'Query exception: {q}; {e}') rows, status = [], Error(str(e)) return rows, status
def gen_fks(fks: List[SchemaForeignKey]) -> Tuple[str, Status]: """Generate Foreign Keys substring.""" status: Status s, status = '', OK() for fk in fks: fk_cols, fk_ref_cols = fk['cols'], fk['ref_cols'] if len(fk_cols) != len(fk_ref_cols): s, status = '', Error( f'Length mismatch {fk_cols} vs {fk_ref_cols}') break cols, ref_cols = ', '.join(fk_cols), ', '.join(fk_ref_cols) ref_table = fk['ref_table'] s += f'FOREIGN KEY({cols}) REFERENCES {ref_table}({ref_cols}),\n' return f'{s}', status
def gen_create(if_not_exists: bool, name: str) -> Tuple[str, Status]: """Generate Create substring.""" s = (f'CREATE TABLE {name}' if if_not_exists is False else f'CREATE TABLE IF NOT EXISTS {name}') return s, OK()