def _maybe_create_entity_date_table(self):
    if self.replace or not table_exists(self.entity_date_table_name, self.db_engine):
        logger.spam(f"Creating entity_date table {self.entity_date_table_name}")
        self.db_engine.execute(f"drop table if exists {self.entity_date_table_name}")
        self.db_engine.execute(
            f"""create table {self.entity_date_table_name} (
                entity_id integer,
                as_of_date timestamp,
                {DEFAULT_ACTIVE_STATE} boolean
            )
            """
        )
        logger.spam(
            f"Creating indices on entity_id and as_of_date for entity_date table {self.entity_date_table_name}"
        )
        self.db_engine.execute(
            f"create index on {self.entity_date_table_name} (entity_id, as_of_date)"
        )
    else:
        logger.notice(
            f"Not dropping and recreating entity_date {self.entity_date_table_name} table because "
            f"replace flag was set to False and table was found to exist"
        )
def test_materialized_from_obj_maybe_materialize(db_engine_with_events_table):
    from_obj = FromObj(
        from_obj="events",
        name="myquery",
        knowledge_date_column="event_date",
    )
    from_obj.should_materialize = lambda: True
    from_obj.maybe_materialize(db_engine_with_events_table)
    assert table_exists(from_obj.table, db_engine_with_events_table)
def table_should_exist(table_name, db_engine):
    """Ensures that the table exists in the given database

    Args:
        table_name (string) A table name (with schema)
        db_engine (sqlalchemy.engine)

    Raises: ValueError if the table does not exist
    """
    if not table_exists(table_name, db_engine):
        raise ValueError("{} table does not exist".format(table_name))
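# A hypothetical usage sketch (not from the source): table_should_exist as a
# precondition guard before querying. `count_rows` and the table name are
# illustrative; `db_engine` is assumed to be a configured sqlalchemy engine.
def count_rows(table_name, db_engine):
    table_should_exist(table_name, db_engine)  # raises ValueError if missing
    return db_engine.execute(
        "select count(*) from {}".format(table_name)
    ).scalar()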
def _maybe_create_cohort_table(self):
    if self.replace or not table_exists(self.cohort_table_name, self.db_engine):
        self.db_engine.execute(f"drop table if exists {self.cohort_table_name}")
        self.db_engine.execute(
            f"""create table {self.cohort_table_name} (
                entity_id integer,
                as_of_date timestamp,
                {DEFAULT_ACTIVE_STATE} boolean
            )
            """
        )
        logging.info("Created cohort table")
    else:
        logging.info(
            "Not dropping and recreating cohort table because "
            "replace flag was set to False and table was found to exist"
        )
def _create_labels_table(self, labels_table_name):
    if self.replace or not table_exists(labels_table_name, self.db_engine):
        self.db_engine.execute(f"drop table if exists {labels_table_name}")
        self.db_engine.execute(
            f"""create table {labels_table_name} (
                entity_id int,
                as_of_date date,
                label_timespan interval,
                label_name varchar,
                label_type varchar,
                label smallint
            )"""
        )
    else:
        logger.notice(
            f"Not dropping and recreating {labels_table_name} table because "
            f"replace flag was set to False and table was found to exist"
        )
def _create_and_populate_entity_date_table_from_labels(self):
    """Create an entity_date table by storing all distinct
    entity-id/as-of-date pairs from the labels table
    """
    self._maybe_create_entity_date_table()
    logger.spam(
        f"Populating entity_date table {self.entity_date_table_name} "
        f"from labels table {self.labels_table_name}"
    )
    if not table_exists(self.labels_table_name, self.db_engine):
        logger.warning("Labels table does not exist, cannot populate entity-dates")
        return
    self.db_engine.execute(
        f"""insert into {self.entity_date_table_name}
        select distinct entity_id, as_of_date
        from {self.labels_table_name}
        """
    )
def _create_labels_table(self, labels_table_name):
    if self.replace or not table_exists(labels_table_name, self.db_engine):
        self.db_engine.execute("drop table if exists {}".format(labels_table_name))
        self.db_engine.execute(
            """
            create table {} (
                entity_id int,
                as_of_date date,
                label_timespan interval,
                label_name varchar(30),
                label_type varchar(30),
                label int
            )""".format(labels_table_name)
        )
    else:
        logging.info(
            "Not dropping and recreating table because "
            "replace flag was set to False and table was found to exist"
        )
def upgrade_if_clean(dburl):
    """Upgrade the database only if the results schema hasn't been created yet.

    Raises: ValueError if the database results schema version does not equal the code's version
    """
    alembic_cfg = alembic_config(dburl)
    engine = create_engine(dburl)
    script_ = script.ScriptDirectory.from_config(alembic_cfg)
    if not table_exists('results_schema_versions', engine):
        logger.info(
            "No results_schema_versions table exists, which means that this installation "
            "is fresh. Upgrading db."
        )
        upgrade_db(dburl=dburl)
        return
    with engine.begin() as conn:
        current_revision = conn.execute(
            'select version_num from results_schema_versions limit 1'
        ).scalar()
        logger.debug("Database's triage_metadata schema version is %s", current_revision)
        triage_head = script_.get_current_head()
        logger.debug("Code's triage_metadata schema version is %s", triage_head)
        database_is_ahead = not any(
            migration.revision == current_revision
            for migration in script_.walk_revisions()
        )
        if database_is_ahead:
            raise ValueError(
                f"Your database's results schema version, {current_revision}, is not a known "
                "revision to this version of Triage. Usually, this happens if you use a branch "
                "with a new results schema version and upgrade the database to that version. "
                "To use this version of Triage, you will likely need to check out that branch "
                f"and downgrade to {triage_head}"
            )
        elif current_revision != triage_head:
            raise ValueError(
                f"Your database's results schema revision, {current_revision}, is out of date "
                "for this version of Triage. However, your database can be upgraded to this "
                "revision. If you would like to upgrade your database from the console, and "
                "you've installed Triage, you may execute `triage db upgrade`. "
                "If the `triage` command is unavailable (because you are running Triage directly "
                "from a repository checkout), then `manage alembic upgrade head`. "
                "The database changes may take a long time on a heavily populated database. "
                "Otherwise, you can also downgrade your Triage version to match your database."
            )
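# Hedged sketch (not from the source) of wiring upgrade_if_clean into an
# entrypoint so schema drift fails fast at startup. The DB URL is
# illustrative and error handling is deliberately minimal.
if __name__ == "__main__":
    import sys
    try:
        upgrade_if_clean("postgresql://user:pass@localhost/triage_db")
    except ValueError as exc:
        sys.exit(str(exc))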
def _generate_imp_table_tasks_for(self, aggregation, drop_preagg=True):
    """Generate SQL statements for preparing, populating, and finalizing
    imputations, for each feature group table in the given aggregation.

    Requires the existence of the underlying feature and aggregation tables
    defined in `_generate_agg_table_tasks_for()`.

    Args:
        aggregation (collate.SpacetimeAggregation)
        drop_preagg: boolean to specify dropping pre-imputation tables

    Returns: (dict) of structure {
        'prepare': list of commands to prepare table for population
        'inserts': list of commands to populate table
        'finalize': list of commands to finalize table after population
    }
    """
    table_tasks = OrderedDict()
    imp_tbl_name = self._clean_table_name(aggregation.get_table_name(imputed=True))
    if not self.replace and self._table_exists(imp_tbl_name):
        logging.info("Skipping imputation table creation for %s", imp_tbl_name)
        table_tasks[imp_tbl_name] = {}
        return table_tasks
    if not aggregation.state_table:
        logging.warning(
            "No state table defined in aggregation, cannot create imputation table for %s",
            imp_tbl_name,
        )
        table_tasks[imp_tbl_name] = {}
        return table_tasks
    if not table_exists(aggregation.state_table, self.db_engine):
        logging.warning(
            "State table %s does not exist, cannot create imputation table for %s",
            aggregation.state_table,
            imp_tbl_name,
        )
        table_tasks[imp_tbl_name] = {}
        return table_tasks
    # execute query to find columns with null values and create lists of columns
    # that do and do not need imputation when creating the imputation table
    with self.db_engine.begin() as conn:
        results = conn.execute(aggregation.find_nulls())
        null_counts = results.first().items()
    impute_cols = [col for (col, val) in null_counts if val > 0]
    nonimpute_cols = [col for (col, val) in null_counts if val == 0]
    # table tasks for imputed aggregation table; most of the work is done here
    # by collate's get_impute_create()
    table_tasks[imp_tbl_name] = {
        "prepare": [
            aggregation.get_drop(imputed=True),
            aggregation.get_impute_create(
                impute_cols=impute_cols, nonimpute_cols=nonimpute_cols
            ),
        ],
        "inserts": [],
        "finalize": [self._aggregation_index_query(aggregation, imputed=True)],
    }
    logging.info("Created table tasks for imputation: %s", imp_tbl_name)
    # cleanup: drop the group-level and aggregation tables, leaving only the
    # imputation table, if drop_preagg=True
    if drop_preagg:
        drops = aggregation.get_drops()
        table_tasks[imp_tbl_name]["finalize"] += list(drops.values()) + [
            aggregation.get_drop()
        ]
        logging.info("Added drop table cleanup tasks: %s", imp_tbl_name)
    return table_tasks
def generate_all_dates(self, as_of_dates, cohort_table_name, cohort_hash):
    logger.spam("Creating protected groups table")
    table_is_new = False
    if not table_exists(self.protected_groups_table_name, self.db_engine):
        self.db_engine.execute(
            f"""
            create table if not exists {self.protected_groups_table_name} (
                entity_id int,
                as_of_date date,
                {', '.join([str(col) + " varchar" for col in self.attribute_columns])},
                cohort_hash text
            )"""
        )
        logger.debug(f"Protected groups table {self.protected_groups_table_name} created")
        table_is_new = True
    else:
        logger.debug(f"Protected groups table {self.protected_groups_table_name} exists")
    if self.replace:
        self.db_engine.execute(
            f"delete from {self.protected_groups_table_name} where cohort_hash = '{cohort_hash}'"
        )
        logger.debug(
            f"Removed from {self.protected_groups_table_name} all rows from cohort {cohort_hash}"
        )
    logger.spam(f"Creating protected_groups for {len(as_of_dates)} as of dates")
    for as_of_date in as_of_dates:
        if not self.replace:
            logger.spam(
                f"Looking for existing protected_groups for as of date {as_of_date}"
            )
            any_existing_rows = list(
                self.db_engine.execute(
                    f"""select 1 from {self.protected_groups_table_name}
                    where as_of_date = '{as_of_date}'
                    and cohort_hash = '{cohort_hash}'
                    limit 1
                    """
                )
            )
            if len(any_existing_rows) == 1:
                logger.debug("Since nonzero existing protected_groups found, skipping")
                continue
        logger.debug(f"Generating protected groups for as of date {as_of_date}")
        self.generate(
            start_date=as_of_date,
            cohort_table_name=cohort_table_name,
            cohort_hash=cohort_hash,
        )
    if table_is_new:
        self.db_engine.execute(
            f"create index on {self.protected_groups_table_name} (cohort_hash, as_of_date)"
        )
    nrows = self.db_engine.execute(
        "select count(*) from {}".format(self.protected_groups_table_name)
    ).scalar()
    if nrows == 0:
        logger.warning("Done creating protected_groups, but no rows in protected_groups table!")
    else:
        logger.success(
            f"Protected groups stored in the table "
            f"{self.protected_groups_table_name} successfully"
        )
        logger.spam(f"Protected groups table has {nrows} rows")
def test_table_exists(self):
    self.engine.execute("create table incidents (col1 varchar)")
    assert dbreflect.table_exists("incidents", self.engine)
    assert not dbreflect.table_exists("compliments", self.engine)
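# A minimal sketch of a table_exists helper like the one under test, assuming
# SQLAlchemy 1.4+ (Inspector.has_table); the real dbreflect implementation
# may differ. Handles an optional "schema.table" qualification.
from sqlalchemy import inspect

def table_exists(table_name, db_engine):
    schema, _, table = table_name.rpartition(".")
    return inspect(db_engine).has_table(table, schema=schema or None)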
def generate_all_dates(self, as_of_dates, cohort_table_name, cohort_hash):
    table_is_new = False
    if not table_exists(self.protected_groups_table_name, self.db_engine):
        self.db_engine.execute(
            """
            create table if not exists {} (
                entity_id int,
                as_of_date date,
                {},
                cohort_hash text
            )""".format(
                self.protected_groups_table_name,
                ", ".join([str(col) + " varchar(60)" for col in self.attribute_columns]),
            )
        )
        table_is_new = True
    else:
        logging.info(
            "Not dropping and recreating protected groups table because "
            "replace flag was set to False and table was found to exist"
        )
    if self.replace:
        self.db_engine.execute(
            f"delete from {self.protected_groups_table_name} where cohort_hash = %s",
            cohort_hash,
        )
    logging.info("Creating protected_groups for %s as of dates", len(as_of_dates))
    for as_of_date in as_of_dates:
        if not self.replace:
            logging.info(
                "Looking for existing protected_groups for as of date %s", as_of_date
            )
            any_existing_rows = list(
                self.db_engine.execute(
                    f"""select 1 from {self.protected_groups_table_name}
                    where as_of_date = '{as_of_date}'
                    and cohort_hash = '{cohort_hash}'
                    limit 1
                    """
                )
            )
            if len(any_existing_rows) == 1:
                logging.info("Since nonzero existing protected_groups found, skipping")
                continue
        logging.info("Generating protected groups for as of date %s", as_of_date)
        self.generate(
            start_date=as_of_date,
            cohort_table_name=cohort_table_name,
            cohort_hash=cohort_hash,
        )
    if table_is_new:
        self.db_engine.execute(
            f"create index on {self.protected_groups_table_name} (cohort_hash, as_of_date)"
        )
    nrows = self.db_engine.execute(
        "select count(*) from {}".format(self.protected_groups_table_name)
    ).scalar()
    if nrows == 0:
        logging.warning("Done creating protected_groups, but no rows in protected_groups table!")
    else:
        logging.info(
            "Done creating protected_groups table %s: rows: %s",
            self.protected_groups_table_name,
            nrows,
        )