def query_and_decrypt_data(
    db: sqlalchemy.engine.base.Engine,
    env_aead: tink.aead.KmsEnvelopeAead,
    table_name: str,
) -> None:
    with db.connect() as conn:
        # Execute the query and fetch all results
        recent_votes = conn.execute(
            f"SELECT team, time_cast, voter_email FROM {table_name} "
            "ORDER BY time_cast DESC LIMIT 5"
        ).fetchall()

        print("Team\tEmail\tTime Cast")
        for row in recent_votes:
            team = row[0]
            # Postgres pads CHAR fields with spaces. These need to be removed
            # before decrypting.
            aad = team.rstrip()
            # Use the envelope AEAD primitive to decrypt the email, using the
            # team name as associated data. Encryption with associated data
            # ensures authenticity (who the sender is) and integrity (the data
            # has not been tampered with) of that data, but not its secrecy.
            # (See RFC 5116 for more info.)
            email = env_aead.decrypt(row[2], aad.encode()).decode()
            time_cast = row[1]
            # Print recent votes
            print(f"{team}\t{email}\t{time_cast}")

def usage_file_is_processed(
    file_name: str, db_engine: sqlalchemy.engine.base.Engine
) -> bool:
    """
    Verifies whether the given file has been previously processed.

    Parameters
    ----------
    file_name: str
        Name of the file
    db_engine: sqlalchemy.engine.base.Engine

    Returns
    -------
    bool
        True if the file has been processed, False if it has not.
    """
    _create_usage_table_if_it_does_not_exist(db_engine)
    # Bind file_name as a parameter instead of interpolating it into the SQL,
    # which would be vulnerable to injection.
    query = sqlalchemy.text(
        f"SELECT EXISTS(SELECT 1 FROM {USAGE_TABLE_NAME} "
        "WHERE FILE_NAME = :file_name) AS 'exists'"
    )
    db_item = {"exists": False}
    with db_engine.connect() as con:
        result: Union[ResultProxy, None] = con.execute(
            query, file_name=file_name)
        db_item = _map_single_result_to_dict(result)
    return db_item["exists"] == 1

def encrypt_and_insert_data(
    db: sqlalchemy.engine.base.Engine,
    env_aead: tink.aead.KmsEnvelopeAead,
    table_name: str,
    team: str,
    email: str,
) -> None:
    time_cast = datetime.datetime.now(tz=datetime.timezone.utc)

    # Verify that the team is one of the allowed options before encrypting
    if team != "TABS" and team != "SPACES":
        logger.error(f"Invalid team specified: {team}")
        return

    # Use the envelope AEAD primitive to encrypt the email, using the team
    # name as associated data. Encryption with associated data ensures
    # authenticity (who the sender is) and integrity (the data has not been
    # tampered with) of that data, but not its secrecy. (See RFC 5116 for
    # more info.)
    encrypted_email = env_aead.encrypt(email.encode(), team.encode())

    # Preparing a statement beforehand helps protect against SQL injection.
    stmt = sqlalchemy.text(
        f"INSERT INTO {table_name} (time_cast, team, voter_email)"
        " VALUES (:time_cast, :team, :voter_email)"
    )

    # Using a with statement ensures that the connection is always released
    # back into the pool at the end of the statement (even if an error occurs)
    with db.connect() as conn:
        conn.execute(
            stmt, time_cast=time_cast, team=team, voter_email=encrypted_email)
    print(f"Vote successfully cast for '{team}' at time {time_cast}!")

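# --- Usage sketch (illustrative, not part of the original sample) ---
# Both tink-based functions above expect a KmsEnvelopeAead. A minimal sketch
# of constructing one with tink's GCP KMS integration follows; key_uri and
# credentials are placeholders you must supply, and the "votes" table name is
# assumed.
def _example_envelope_aead_roundtrip(db: sqlalchemy.engine.base.Engine,
                                     key_uri: str, credentials: str) -> None:
    from tink import aead
    from tink.integration import gcpkms

    aead.register()
    remote_aead = gcpkms.GcpKmsClient(key_uri, credentials).get_aead(key_uri)
    env_aead = aead.KmsEnvelopeAead(aead.aead_key_templates.AES256_GCM,
                                    remote_aead)

    encrypt_and_insert_data(db, env_aead, "votes", "TABS", "voter@example.com")
    query_and_decrypt_data(db, env_aead, "votes")
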
def _table_to_csv(
    engine: sqlalchemy.engine.base.Engine, table_name: str, file_path: str
):
    # newline="" prevents the csv module from writing blank lines on Windows
    with open(file_path, 'w', newline='') as fh:
        outcsv = csv.writer(fh)
        with engine.connect() as con:
            records = con.execute(f"SELECT * FROM {table_name}")
            # Header row first, then all data rows
            outcsv.writerow(records.keys())
            outcsv.writerows(records)

def getUserById(id: int, engine: sa.engine.base.Engine) -> Optional[User]:
    # Bind the id as a parameter instead of interpolating it into the SQL
    with engine.connect() as con:
        rows = list(con.execute(
            sa.text(f"select * from {User.__tablename__} where id = :id"),
            id=id))
    if len(rows) == 0:
        return None
    return User.from_row_to_obj(rows[0])

def getUserByEmail(email: str, engine: sa.engine.base.Engine) -> Optional[User]:
    ## should check email validity
    # Bind the email as a parameter instead of interpolating it into the SQL,
    # which would be vulnerable to injection.
    with engine.connect() as con:
        rows = list(con.execute(
            sa.text(f"select * from {User.__tablename__} where email = :email"),
            email=email))
    if len(rows) == 0:
        return None
    return User.from_row_to_obj(rows[0])

def create_tables(config: list, engine: sqlalchemy.engine.base.Engine):
    # Use a context manager so the connection is released back to the pool
    with engine.connect() as con:
        for table in config:
            name = table.get('name')
            schema = table.get('schema')
            con.execute(f"DROP TABLE IF EXISTS {name}")
            con.execute(f"CREATE TABLE {name} ({schema})")

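# A usage sketch for create_tables. The config shape (dicts with 'name' and
# 'schema' keys) follows from the function body, but these table definitions
# are hypothetical.
def _example_create_tables(engine: sqlalchemy.engine.base.Engine) -> None:
    config = [
        {"name": "users", "schema": "id INTEGER PRIMARY KEY, email TEXT"},
        {"name": "votes", "schema": "id INTEGER PRIMARY KEY, team TEXT"},
    ]
    create_tables(config, engine)
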
def last_sync_date(
        sync_db: sqlalchemy.engine.base.Engine) -> Optional[datetime]:
    with sync_db.connect() as con:
        try:
            usage_df = read_sql("SELECT asOfDate FROM Usage", con)
            if usage_df["asOfDate"].count() == 0:
                return None
            return date_parse(usage_df["asOfDate"].max())  # type: ignore
        except OperationalError:
            logger.debug("No Usage table yet")
            return None

def build_from_engine(self, engine: sqlalchemy.engine.base.Engine) -> dict:
    self.setup(engine)

    # Create a connection to the database
    conn = engine.connect()

    # Record the time spent
    duration = {
        'querying': {},
    }

    # Sample each relation
    self.relations = {}
    sampling_method = {
        True: 'SYSTEM',
        False: 'BERNOULLI'
    }[self.block_sampling]

    for rel_name in self.rel_names:
        # Sample the relation if the number of rows is high enough
        query = 'SELECT * FROM {}'.format(rel_name)

        # Add a sampling statement if the sampling ratio is lower than 1
        sampling_ratio = max(self.sampling_ratio,
                             self.min_rows / self.rel_cards[rel_name])
        if sampling_ratio < 1:
            # Make sure there won't be fewer samples than the minimum number
            # of allowed rows
            query += ' TABLESAMPLE {} ({}) REPEATABLE ({})'.format(
                sampling_method, sampling_ratio * 100, self.seed)

        date_atts = [
            att for att, typ in self.att_types[rel_name].items()
            if typ == 'date'
        ]

        tic = time.time()
        rel = pd.read_sql_query(sql=query, con=conn, parse_dates=date_atts)
        duration['querying'][rel_name] = time.time() - tic

        # Convert the datetimes to ISO formatted strings
        for att in date_atts:
            rel[att] = rel[att].map(lambda x: x.isoformat())

        # Strip the whitespace from the string columns
        for att in rel.columns:
            if rel[att].dtype == 'object':
                rel[att] = rel[att].str.rstrip()

        # Store the relation
        self.relations[rel_name] = rel

    # Close the connection to the database
    conn.close()

    return duration

def delete_table(engine: sa.engine.base.Engine) -> bool:
    # Use a context manager so the connection is released back to the pool
    with engine.connect() as conn:
        try:
            conn.execute(f"drop table {User.__tablename__}")
            return True
        except sa.exc.OperationalError as err:
            if f"no such table: {User.__tablename__}" in err._message():
                if engine.echo:
                    print(f"[!] Table '{User.__tablename__}' does not exist!")
                return False
            raise

def temp_ids_con(engine: sa.engine.base.Engine, ids: set):
    """Create a database connection that makes temp.ids available as a
    single-column temp table
    """
    # Note: assumes ids is non-empty; an empty set would produce an invalid
    # INSERT statement.
    with engine.connect() as con:
        rows = ", ".join([f"({sql_clause_format(id)})" for id in ids])
        queries = [
            "DROP TABLE IF EXISTS temp.ids",
            "CREATE TEMP TABLE temp.ids(id INTEGER)",
            f"INSERT INTO temp.ids(id) VALUES {rows}",
        ]
        for query in queries:
            con.execute(query)
        yield con

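# temp_ids_con is written as a generator, so one natural way to consume it is
# to wrap it with contextlib.contextmanager. A usage sketch, assuming a
# hypothetical "items" table to filter against:
def _example_temp_ids_query(engine: sa.engine.base.Engine) -> list:
    from contextlib import contextmanager

    with contextmanager(temp_ids_con)(engine, {1, 2, 3}) as con:
        return con.execute(
            "SELECT * FROM items WHERE id IN (SELECT id FROM temp.ids)"
        ).fetchall()
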
def cleanup_after_sync(resource_name: str,
                       sync_db: sqlalchemy.engine.base.Engine):
    """
    Delete sync temporary tables if they exist

    Parameters
    ----------
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections
    """
    with sync_db.connect() as con:
        con.execute(f"DROP TABLE IF EXISTS Sync_{resource_name}")
        con.execute(f"DROP TABLE IF EXISTS Unmatched_{resource_name}")

def _create_sync_table_from_resource_df(
    resource_df: DataFrame,
    identity_columns: List[str],
    resource_name: str,
    sync_db: sqlalchemy.engine.base.Engine,
):
    """
    Take fetched data and push to a new temporary sync table. Includes hash
    and tentative extractor CreateDate/LastModifiedDates.

    Parameters
    ----------
    resource_df: DataFrame
        a DataFrame with current fetched data.
    identity_columns: List[str]
        a List of the identity columns for the resource dataframe.
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections
    """
    with sync_db.connect() as con:
        # ensure sync table exists; column ordering must be identical to the
        # regular table
        con.execute(f"DROP TABLE IF EXISTS Sync_{resource_name}")
        con.execute(
            f"""
            CREATE TABLE IF NOT EXISTS Sync_{resource_name} (
                {SYNC_COLUMNS_SQL}
            )
            """
        )

    sync_df: DataFrame = resource_df.copy()
    sync_df = add_hash_and_json_to(sync_df)

    # add (possibly composite) primary key, sorting for consistent ordering
    add_sourceid_to(sync_df, identity_columns)

    now: datetime = datetime.now()
    sync_df["CreateDate"] = now
    sync_df["LastModifiedDate"] = now
    sync_df["SyncNeeded"] = 1
    sync_df = sync_df[SYNC_COLUMNS]
    sync_df.set_index("SourceId", inplace=True)

    # push to temporary sync table
    sync_df.to_sql(
        f"Sync_{resource_name}", sync_db, if_exists="append",
        index=True, chunksize=1000
    )

def setup(self, engine: sqlalchemy.engine.base.Engine):
    # Retrieve the metadata to know what tables and joins are available
    metadata = tools.get_metadata(engine)
    self.rel_names = tuple(metadata.tables.keys())

    # Create a connection to the database
    conn = engine.connect()

    # Retrieve relation cardinalities. Binding a tuple to "IN :rel_names"
    # relies on the DBAPI (e.g. psycopg2) adapting the tuple to a SQL list.
    self.rel_cards = {}
    query = '''
        SELECT relname, reltuples
        FROM pg_class
        WHERE relname IN :rel_names
    '''
    rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
    for (rel_name, card) in rows:
        self.rel_cards[rel_name] = card

    # Retrieve attribute cardinalities and number of nulls. pg_stats reports
    # a negative n_distinct as a fraction of the row count, hence the sign
    # check below.
    self.att_cards = defaultdict(dict)
    self.null_fracs = defaultdict(dict)
    query = '''
        SELECT tablename, attname, n_distinct, null_frac
        FROM pg_stats
        WHERE tablename IN :rel_names
    '''
    rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
    for (rel_name, att_name, card, null_frac) in rows:
        self.att_cards[rel_name][att_name] = (
            -card * self.rel_cards[rel_name] if card < 0 else card)
        self.null_fracs[rel_name][att_name] = null_frac

    # Retrieve the type of each attribute
    self.att_types = defaultdict(dict)
    query = '''
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_name IN :rel_names
    '''
    rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
    for (rel_name, att_name, att_type) in rows:
        self.att_types[rel_name][att_name] = att_type

    # Close the connection to the database
    conn.close()

def sync_to_db_without_cleanup(
    resource_df: DataFrame,
    identity_columns: List[str],
    resource_name: str,
    sync_db: sqlalchemy.engine.base.Engine,
):
    """
    Take fetched data and sync with database. Creates tables when necessary,
    but ok if temporary tables are there to start. Does not delete temporary
    tables when finished.

    Parameters
    ----------
    resource_df: DataFrame
        a DataFrame with current fetched data
    identity_columns: List[str]
        a List of the identity columns for the resource dataframe.
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections

    Returns
    -------
    DataFrame
        a DataFrame with current fetched data and reconciled
        CreateDate/LastModifiedDate
    """
    assert (
        Series(identity_columns).isin(resource_df.columns).all()
    ), "Identity columns missing from dataframe"

    # In certain cases we can end up with duplicate records, for example in
    # Canvas when a course belongs to a sub-account. De-duplicate the
    # DataFrame based on the identity_columns.
    resource_df.drop_duplicates(subset=identity_columns, inplace=True)

    _create_sync_table_from_resource_df(
        resource_df, identity_columns, resource_name, sync_db)

    with sync_db.connect() as con:
        _ensure_main_table_exists(resource_name, con)
        _create_unmatched_records_temp_table(resource_name, con)
        _get_true_create_dates_for_unmatched_records(resource_name, con)
        _update_resource_table_with_changes(resource_name, con)
        result_df: DataFrame = _update_dataframe_with_true_dates(
            resource_df, identity_columns, resource_name, con)

    return result_df

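# A sketch of the full sync flow implied above: sync_to_db_without_cleanup
# leaves the Sync_/Unmatched_ temporary tables in place, so cleanup_after_sync
# (defined earlier) runs afterwards. The "Courses" resource name and the "id"
# identity column are illustrative.
def _example_full_sync(resource_df: DataFrame,
                       sync_db: sqlalchemy.engine.base.Engine) -> DataFrame:
    result_df = sync_to_db_without_cleanup(
        resource_df, ["id"], "Courses", sync_db)
    cleanup_after_sync("Courses", sync_db)
    return result_df
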
def _table_exist(
    table_name: str,
    db_engine: sqlalchemy.engine.base.Engine,
) -> bool:
    with db_engine.connect() as con:
        result: Union[ResultProxy, None] = con.execute(
            """
            SELECT name FROM sqlite_master
            WHERE type='table' AND name=?;
            """,
            (table_name,),
        )
        if result is None:
            return False
        return result.first() is not None

def load_features(engine: sqlalchemy.engine.base.Engine, processed_data,
                  processed_labels):
    processed_data.to_sql('features', con=engine, index_label='user_id',
                          if_exists='replace', method=psql_insert_copy)

    processed_labels = pd.DataFrame({'is_fraudster': processed_labels},
                                    index=processed_labels.index)
    processed_labels.to_sql('labels', con=engine, index_label='user_id',
                            if_exists='replace', method=psql_insert_copy)

    with engine.connect() as con:
        con.execute('ALTER TABLE features ADD PRIMARY KEY (user_id);')
        con.execute('ALTER TABLE labels ADD PRIMARY KEY (user_id);')

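# load_features relies on a psql_insert_copy callable that is not shown here.
# One possible definition, adapted from the pandas documentation's COPY-based
# insert recipe (valid for PostgreSQL via psycopg2 only):
def psql_insert_copy(table, conn, keys, data_iter):
    import csv
    import io

    # Get the raw psycopg2 connection underlying the SQLAlchemy connection
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        # Serialize the rows to an in-memory CSV buffer
        s_buf = io.StringIO()
        csv.writer(s_buf).writerows(data_iter)
        s_buf.seek(0)

        columns = ", ".join(f'"{k}"' for k in keys)
        table_name = (f"{table.schema}.{table.name}"
                      if table.schema else table.name)
        # Stream the buffer through COPY for fast bulk insertion
        cur.copy_expert(
            f"COPY {table_name} ({columns}) FROM STDIN WITH CSV", s_buf)
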
def _drop_everything(engine: sqlalchemy.engine.base.Engine):
    """(On a live db) drops all foreign key constraints before dropping all
    tables. Workaround for SQLAlchemy not doing DROP ... CASCADE for
    drop_all() (https://github.com/pallets/flask-sqlalchemy/issues/722)
    """
    con = engine.connect()
    trans = con.begin()
    inspector = Inspector.from_engine(engine)

    # We need to re-create a minimal metadata with only the required things to
    # successfully emit drop constraints and tables commands for postgres
    # (based on the actual schema of the running instance)
    meta = MetaData()
    tables = []
    all_fkeys = []

    for view_name in inspector.get_view_names():
        con.execute(
            "DROP MATERIALIZED VIEW IF EXISTS {} CASCADE".format(view_name))

    for table_name in inspector.get_table_names():
        fkeys = []
        for fkey in inspector.get_foreign_keys(table_name):
            if not fkey["name"]:
                continue
            fkeys.append(ForeignKeyConstraint((), (), name=fkey["name"]))
        tables.append(Table(table_name, meta, *fkeys))
        all_fkeys.extend(fkeys)

    for fkey in all_fkeys:
        con.execute(DropConstraint(fkey))
    for table in tables:
        con.execute(DropTable(table))

    trans.commit()
    Base.metadata.drop_all(engine)

def request_all_usage_as_df(
    resource: Optional[Resource],
    sync_db: sqlalchemy.engine.base.Engine,
    env_start_date: str,
    env_end_date: str,
) -> DataFrame:
    usage_df: DataFrame = request_latest_usage_as_df(
        resource, start_date(sync_db, env_start_date), end_date(env_end_date))
    if usage_df.empty:
        return usage_df

    usage_df.to_sql(
        "Usage", sync_db, if_exists="append", index=False, chunksize=500)

    # remove duplicates - keep only the most recent row per email/asOfDate
    with sync_db.connect() as con:
        con.execute("DELETE FROM Usage "
                    "WHERE rowid NOT IN (SELECT MAX(rowid) "
                    "FROM Usage "
                    "GROUP BY email, asOfDate)")

    return usage_df

def create_user(info: dict, engine: sa.engine.base.Engine) -> bool:
    user_table = User.get_table_obj(engine)
    ins = user_table.insert()
    ## TODO: incomplete-fields check, integrity hashing, email/username/password
    ## format validation, etc.
    try:
        with engine.connect() as conn:
            conn.execute(
                ins.values(
                    name=info['name'],
                    username=info['username'],
                    email=info['email'],
                    password=bcrypt.hashpw(info['password'].encode(),
                                           bcrypt.gensalt()).decode(),
                    role=info['role'],
                ))
        return True
    except sa.exc.IntegrityError as err:
        if engine.echo:
            print(f"[!] {err._message()}")
        return False

def build_from_engine(self, engine: sqlalchemy.engine.base.Engine) -> dict:
    self.setup(engine)

    # Create a connection to the database
    conn = engine.connect()

    # Record the time spent
    duration = {'querying': {}, 'parameters': {}}

    # Create histograms per attribute
    self.histograms = {}
    self.n_in_bin = {}
    sampling_method = {
        True: 'SYSTEM',
        False: 'BERNOULLI'
    }[self.block_sampling]

    for rel_name in self.rel_names:
        self.histograms[rel_name] = {}
        self.n_in_bin[rel_name] = {}
        rel_card = self.rel_cards[rel_name]

        # Sample the relation if the number of rows is high enough
        query = 'SELECT * FROM {}'.format(rel_name)

        # Add a sampling statement if the sampling ratio is lower than 1
        sampling_ratio = max(self.sampling_ratio, self.min_rows / rel_card)
        if sampling_ratio < 1:
            # Make sure there won't be fewer samples than the minimum number
            # of allowed rows
            query += ' TABLESAMPLE {} ({}) REPEATABLE ({})'.format(
                sampling_method, sampling_ratio * 100, self.seed)

        date_atts = [
            att for att, typ in self.att_types[rel_name].items()
            if typ == 'date'
        ]

        tic = time.time()
        rel = pd.read_sql_query(sql=query, con=conn, parse_dates=date_atts)
        duration['querying'][rel_name] = time.time() - tic

        # Convert the datetimes to ISO formatted strings
        for att in date_atts:
            rel[att] = rel[att].map(lambda x: x.isoformat())

        # Strip the whitespace from the string columns
        for att in rel.columns:
            if rel[att].dtype == 'object':
                rel[att] = rel[att].str.rstrip()

        # Blacklist ID-like columns, string columns, and columns whose values
        # are (almost) all distinct or null
        blacklist = [
            att for att in rel.columns
            if '_id' in att or 'id_' in att or att == 'id' or '_sk' in att
            or self.att_types[rel_name][att] == 'character varying'
            or round(rel_card * self.null_fracs[rel_name][att] +
                     self.att_cards[rel_name][att]) == rel_card
        ]

        # Create one histogram per attribute
        tic = time.time()
        for att in set(rel.columns) - set(blacklist):
            rel[att], self.n_in_bin[rel_name][att] = tools.discretize_series(
                rel[att], n_mcv=self.n_mcv, n_bins=self.n_bins)
            self.histograms[rel_name][att] = distribution.Distribution(
                on=att, by=None)
            self.histograms[rel_name][att].build_from_df(
                rel, types=self.att_types[rel_name])
        duration['parameters'][rel_name] = time.time() - tic

    # Close the connection to the database
    conn.close()

    return duration

def update_on_table(df: pd.DataFrame, keys: update_key_type,
                    values: update_key_type, table_name: str,
                    engine: sa.engine.base.Engine, schema: str) -> int:
    """
    :param df: a dataframe with data that needs to be updated. Must have
        columns to be used as keys and some for values
    :param keys: the set of columns to use as keys, i.e. update when matched
    :param values: the set of columns to update, i.e. set when matched
    :param table_name: a table name as in util_function
    :param engine: the sqlalchemy engine for the database
    :param schema: a schema of interest - None if the default schema of the
        database is ok
    :return: the number of records updated
    """
    # get table
    tbl = util_function(table_name, engine, schema)

    # change nan to None, and rename columns so that we can easily bindparam
    df_ = df.copy()
    df_.columns = [f"{el.lower()}_updt" for el in df_.columns]
    groups = toolz.partition_all(
        CHUNK_SIZE,
        df_.where(pd.notnull(df_), None).to_dict(orient='records'))

    if not isinstance(keys, tuple) and not isinstance(keys, dict):
        raise BadArgumentType(
            "keys and values must either be both tuples or both dicts", None)

    # create where clause and update statement
    update_statement: dml.Update
    if isinstance(keys, tuple):
        if not isinstance(values, tuple):
            raise BadArgumentType(
                "keys and values must either be both tuples or both dicts",
                None)
        where = [
            tbl.c[el] == sa.bindparam(f"{el.lower()}_updt") for el in keys
        ]
        update_statement = tbl.update().where(sa.and_(*where)).values(
            dict((a, sa.bindparam(f"{a.lower()}_updt")) for a in values))

    if isinstance(keys, dict):
        if not isinstance(values, dict):
            raise BadArgumentType(
                "keys and values must either be both tuples or both dicts",
                None)
        where = [
            tbl.c[k] == sa.bindparam(f"{v.lower()}_updt")
            for k, v in keys.items()
        ]
        update_statement = tbl.update().where(sa.and_(*where)).values(
            dict((k, sa.bindparam(f"{v.lower()}_updt"))
                 for k, v in values.items()))

    # update
    count, last_successful_update = 0, None
    with engine.connect() as connection:
        for group in groups:
            try:
                result = connection.execute(update_statement, group)
                last_successful_update = group[-1]
                count += result.rowcount
            except exc.OperationalError:
                # wait, then retry the chunk once before giving up
                time.sleep(2)
                try:
                    result = connection.execute(update_statement, group)
                    last_successful_update = group[-1]
                    count += result.rowcount
                except exc.OperationalError as e:
                    raise OperationalError(
                        f"Failed to update records. Last successful "
                        f"update: {last_successful_update}", e)
    return count

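# A usage sketch for update_on_table; the table and column names are
# hypothetical. Rows are matched on "id" and their "status" column is
# overwritten with the dataframe's values.
def _example_update_on_table(engine: sa.engine.base.Engine) -> int:
    df = pd.DataFrame({"id": [1, 2], "status": ["active", "inactive"]})
    return update_on_table(df, keys=("id",), values=("status",),
                           table_name="accounts", engine=engine, schema=None)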