Example #1
0
    def _maybe_create_entity_date_table(self):
        """Recreate the entity_date table when replace is set or it is missing.

        When the table already exists and ``self.replace`` is False, the
        table is left untouched.
        """
        keep_existing = (not self.replace) and table_exists(
            self.entity_date_table_name, self.db_engine)
        if keep_existing:
            logger.notice(
                f"Not dropping and recreating entity_date {self.entity_date_table_name} table because "
                f"replace flag was set to False and table was found to exist")
        else:
            # Drop any stale copy, rebuild the schema, then add the lookup index.
            logger.spam(
                f"Creating entity_date table {self.entity_date_table_name}")
            self.db_engine.execute(
                f"drop table if exists {self.entity_date_table_name}")
            self.db_engine.execute(
                f"""create table {self.entity_date_table_name} (
                    entity_id integer,
                    as_of_date timestamp,
                    {DEFAULT_ACTIVE_STATE} boolean
                )
                """)
            logger.spam(
                f"Creating indices on entity_id and as_of_date for entity_date table {self.entity_date_table_name}"
            )
            self.db_engine.execute(
                f"create index on {self.entity_date_table_name} (entity_id, as_of_date)"
            )
Example #2
0
def test_materialized_from_obj_maybe_materialize(db_engine_with_events_table):
    """A FromObj forced to materialize should create its backing table."""
    engine = db_engine_with_events_table
    from_obj = FromObj(
        from_obj="events",
        name="myquery",
        knowledge_date_column='event_date',
    )
    # Bypass the materialization heuristic: always materialize.
    from_obj.should_materialize = lambda: True
    from_obj.maybe_materialize(engine)
    assert table_exists(from_obj.table, engine)
Example #3
0
def table_should_exist(table_name, db_engine):
    """Ensures that the table exists in the given database

    Args:
        table_name (string) A table name (with schema)
        db_engine (sqlalchemy.engine)

    Raises: ValueError if the table does not exist
    """
    if table_exists(table_name, db_engine):
        return
    raise ValueError(f"{table_name} table does not exist")
Example #4
0
 def _maybe_create_cohort_table(self):
     """Recreate the cohort table unless it exists and replace is False."""
     keep_existing = (not self.replace) and table_exists(
         self.cohort_table_name, self.db_engine)
     if keep_existing:
         logging.info("Not dropping and recreating cohort table because "
                      "replace flag was set to False and table was found to exist")
     else:
         # Drop and rebuild the cohort schema from scratch.
         self.db_engine.execute(f"drop table if exists {self.cohort_table_name}")
         self.db_engine.execute(
             f"""create table {self.cohort_table_name} (
                 entity_id integer,
                 as_of_date timestamp,
                 {DEFAULT_ACTIVE_STATE} boolean
             )
             """
         )
         logging.info("Created cohort table")
Example #5
0
 def _create_labels_table(self, labels_table_name):
     """Drop and recreate the labels table unless it exists and replace is off."""
     if not self.replace and table_exists(labels_table_name, self.db_engine):
         logger.notice(
             f"Not dropping and recreating {labels_table_name} table because "
             f"replace flag was set to False and table was found to exist")
         return
     # Rebuild from scratch: remove any stale copy, then create the schema.
     self.db_engine.execute(f"drop table if exists {labels_table_name}")
     self.db_engine.execute(f"""
             create table {labels_table_name} (
             entity_id int,
             as_of_date date,
             label_timespan interval,
             label_name varchar,
             label_type varchar,
             label smallint
             )""")
Example #6
0
    def _create_and_populate_entity_date_table_from_labels(self):
        """Fill the entity_date table with every distinct entity-id/as-of-date
        pair found in the labels table, creating the table first if needed.
        """
        self._maybe_create_entity_date_table()
        logger.spam(
            f"Populating entity_date table {self.entity_date_table_name} from labels table {self.labels_table_name}"
        )
        # Nothing to copy if the labels table was never built.
        if not table_exists(self.labels_table_name, self.db_engine):
            logger.warning(
                "Labels table does not exist, cannot populate entity-dates")
            return
        insert_query = f"""insert into {self.entity_date_table_name}
            select distinct entity_id, as_of_date
            from {self.labels_table_name}
            """
        self.db_engine.execute(insert_query)
Example #7
0
 def _create_labels_table(self, labels_table_name):
     """Create the labels table from scratch unless it already exists and
     the replace flag is off."""
     if not self.replace and table_exists(labels_table_name, self.db_engine):
         logging.info(
             "Not dropping and recreating table because "
             "replace flag was set to False and table was found to exist")
         return
     self.db_engine.execute(
         "drop table if exists {}".format(labels_table_name))
     self.db_engine.execute("""
             create table {} (
             entity_id int,
             as_of_date date,
             label_timespan interval,
             label_name varchar(30),
             label_type varchar(30),
             label int
         )""".format(labels_table_name))
Example #8
0
def upgrade_if_clean(dburl):
    """Upgrade the database only if the results schema hasn't been created yet.

    Raises: ValueError if the database results schema version does not equal the code's version
    """
    cfg = alembic_config(dburl)
    db_engine = create_engine(dburl)
    migration_script = script.ScriptDirectory.from_config(cfg)
    # A fresh install has no version table at all: just upgrade.
    if not table_exists('results_schema_versions', db_engine):
        logger.info(
            "No results_schema_versions table exists, which means that this installation "
            "is fresh. Upgrading db.")
        upgrade_db(dburl=dburl)
        return
    with db_engine.begin() as conn:
        db_revision = conn.execute(
            'select version_num from results_schema_versions limit 1').scalar()
        logger.debug("Database's triage_metadata schema version is %s",
                     db_revision)
        code_revision = migration_script.get_current_head()
        logger.debug("Code's triage_metadata schema version is %s",
                     code_revision)
        known_revisions = {
            migration.revision
            for migration in migration_script.walk_revisions()
        }
        # The database is "ahead" if its revision is unknown to this checkout.
        if db_revision not in known_revisions:
            raise ValueError(
                f"Your database's results schema version, {db_revision}, is not a known "
                "revision to this version of Triage. Usually, this happens if you use a branch "
                "with a new results schema version and upgrade the database to that version. "
                "To use this version of Triage, you will likely need to check out that branch "
                f"and downgrade to {code_revision}", )
        if db_revision != code_revision:
            raise ValueError(
                f"Your database's results schema revision, {db_revision}, is out of date "
                "for this version of Triage. However, your database can be upgraded to this "
                "revision. If you would like to upgrade your database from the console, and "
                "you've installed Triage, you may execute `triage db upgrade`. "
                "If the `triage` command is unavailable, (because you are running Triage directly "
                " from a repository checkout), then `manage alembic upgrade head`. "
                "The database changes may take a long time on a heavily populated database. "
                "Otherwise, you can also downgrade your Triage version to match your database."
            )
Example #9
0
    def _generate_imp_table_tasks_for(self, aggregation, drop_preagg=True):
        """Build the SQL task list that prepares, populates, and finalizes
        the imputation table for one aggregation.

        Requires the existence of the underlying feature and aggregation
        tables defined in `_generate_agg_table_tasks_for()`.

        Args:
            aggregation (collate.SpacetimeAggregation)
            drop_preagg: boolean to specify dropping pre-imputation
                tables

        Returns: (dict) of structure {
                'prepare': list of commands to prepare table for population
                'inserts': list of commands to populate table
                'finalize': list of commands to finalize table after population
            }

        """
        tasks = OrderedDict()
        imputed_table = self._clean_table_name(aggregation.get_table_name(imputed=True))

        # Return an empty task entry whenever the imputation table cannot
        # (or need not) be built.
        if not self.replace and self._table_exists(imputed_table):
            logging.info("Skipping imputation table creation for %s", imputed_table)
            tasks[imputed_table] = {}
            return tasks
        if not aggregation.state_table:
            logging.warning(
                "No state table defined in aggregation, cannot create imputation table for %s",
                imputed_table,
            )
            tasks[imputed_table] = {}
            return tasks
        if not table_exists(aggregation.state_table, self.db_engine):
            logging.warning(
                "State table %s does not exist, cannot create imputation table for %s",
                aggregation.state_table,
                imputed_table,
            )
            tasks[imputed_table] = {}
            return tasks

        # Ask the database which columns contain nulls, then split them into
        # columns that need imputation and columns that are already complete.
        with self.db_engine.begin() as conn:
            null_counts = conn.execute(aggregation.find_nulls()).first().items()
        impute_cols = []
        nonimpute_cols = []
        for col, null_count in null_counts:
            if null_count > 0:
                impute_cols.append(col)
            elif null_count == 0:
                nonimpute_cols.append(col)

        # Most of the work is delegated to collate's get_impute_create().
        tasks[imputed_table] = {
            "prepare": [
                aggregation.get_drop(imputed=True),
                aggregation.get_impute_create(
                    impute_cols=impute_cols, nonimpute_cols=nonimpute_cols
                ),
            ],
            "inserts": [],
            "finalize": [self._aggregation_index_query(aggregation, imputed=True)],
        }
        logging.info("Created table tasks for imputation: %s", imputed_table)

        if drop_preagg:
            # Cleanup: drop the group-level and aggregation tables, leaving
            # only the imputation table.
            cleanup = list(aggregation.get_drops().values()) + [aggregation.get_drop()]
            tasks[imputed_table]["finalize"] += cleanup
            logging.info("Added drop table cleanup tasks: %s", imputed_table)

        return tasks
Example #10
0
    def generate_all_dates(self, as_of_dates, cohort_table_name, cohort_hash):
        """Create (if needed) and populate the protected-groups table for each as-of date.

        Args:
            as_of_dates (list) dates to generate protected-groups rows for
            cohort_table_name (string) table holding the cohort to join against
            cohort_hash (string) identifier of the cohort configuration, used
                to scope deletes and existence checks to this cohort

        Side effects: may create the protected-groups table, delete this
        cohort's rows when replace is set, insert rows via self.generate(),
        and index a freshly created table.
        """
        logger.spam("Creating protected groups table")
        table_is_new = False
        if not table_exists(self.protected_groups_table_name, self.db_engine):
            # Table and column names come from trusted configuration, so
            # direct interpolation here is intentional (not user input).
            self.db_engine.execute(f"""
                create table if not exists {self.protected_groups_table_name} (
                entity_id int,
                as_of_date date,
                {', '.join([str(col) + " varchar" for col in self.attribute_columns])},
                cohort_hash text
                )""")
            logger.debug(
                f"Protected groups table {self.protected_groups_table_name} created"
            )
            table_is_new = True
        else:
            logger.debug(
                f"Protected groups table {self.protected_groups_table_name} exists"
            )

        if self.replace:
            self.db_engine.execute(
                f"delete from {self.protected_groups_table_name} where cohort_hash = '{cohort_hash}'"
            )
            logger.debug(
                f"Removed from {self.protected_groups_table_name} all rows from cohort {cohort_hash}"
            )

        logger.spam(
            f"Creating protected_groups for {len(as_of_dates)} as of dates", )

        for as_of_date in as_of_dates:
            if not self.replace:
                # BUG FIX: this message (and the "Generating..." one below)
                # was missing the f-prefix, so the literal text "{as_of_date}"
                # was logged instead of the actual date.
                logger.spam(
                    f"Looking for existing protected_groups for as of date {as_of_date}"
                )
                any_existing_rows = list(
                    self.db_engine.execute(
                        f"""select 1 from {self.protected_groups_table_name}
                    where as_of_date = '{as_of_date}'
                    and cohort_hash = '{cohort_hash}'
                    limit 1
                    """))
                if len(any_existing_rows) == 1:
                    logger.debug(
                        "Since nonzero existing protected_groups found, skipping"
                    )
                    continue

            logger.debug(
                f"Generating protected groups for as of date {as_of_date} ")
            self.generate(start_date=as_of_date,
                          cohort_table_name=cohort_table_name,
                          cohort_hash=cohort_hash)
        if table_is_new:
            # Index only tables created in this run, after bulk population.
            self.db_engine.execute(
                f"create index on {self.protected_groups_table_name} (cohort_hash, as_of_date)"
            )
        nrows = self.db_engine.execute("select count(*) from {}".format(
            self.protected_groups_table_name)).scalar()
        if nrows == 0:
            logger.warning(
                "Done creating protected_groups, but no rows in protected_groups table!"
            )
        else:
            logger.success(f"Protected groups stored in the table "
                           f"{self.protected_groups_table_name} successfully")
            logger.spam(f"Protected groups table has {nrows} rows")
Example #11
0
 def test_table_exists(self):
     """table_exists reports presence only for tables that were created."""
     engine = self.engine
     engine.execute("create table incidents (col1 varchar)")
     assert dbreflect.table_exists("incidents", engine)
     assert not dbreflect.table_exists("compliments", engine)
Example #12
0
    def generate_all_dates(self, as_of_dates, cohort_table_name, cohort_hash):
        """Create (if needed) and populate the protected-groups table for each as-of date.

        Args:
            as_of_dates (list): dates to generate protected-groups rows for
            cohort_table_name (str): table holding the cohort to join against
            cohort_hash (str): identifier of the cohort configuration, used to
                scope deletes and existence checks to this cohort

        Side effects: may create the protected-groups table, delete this
        cohort's rows when replace is set, insert rows via self.generate(),
        and index a freshly created table.
        """
        table_is_new = False
        if not table_exists(self.protected_groups_table_name, self.db_engine):
            # Table/column names are interpolated directly; presumably they come
            # from trusted configuration rather than user input — verify.
            self.db_engine.execute(
                """
                create table if not exists {} (
                entity_id int,
                as_of_date date,
                {},
                cohort_hash text
            )""".format(
                    self.protected_groups_table_name,
                    ", ".join([str(col) + " varchar(60)" for col in self.attribute_columns])
                )
            )
            table_is_new = True
        else:
            # NOTE(review): this message is misleading — the branch is taken
            # simply because the table exists; the replace flag is not checked here.
            logging.info("Not dropping and recreating protected groups table because "
                         "replace flag was set to False and table was found to exist")
        if self.replace:
            # Parameterized delete removes only this cohort's rows.
            self.db_engine.execute(
                f'delete from {self.protected_groups_table_name} where cohort_hash = %s',
                cohort_hash
            )

        logging.info(
            "Creating protected_groups for %s as of dates",
            len(as_of_dates)
        )
        for as_of_date in as_of_dates:
            if not self.replace:
                # Without replace, skip dates already populated for this cohort.
                logging.info(
                    "Looking for existing protected_groups for as of date %s",
                    as_of_date
                )
                any_existing_rows = list(self.db_engine.execute(
                    f"""select 1 from {self.protected_groups_table_name}
                    where as_of_date = '{as_of_date}'
                    and cohort_hash = '{cohort_hash}'
                    limit 1
                    """
                ))
                if len(any_existing_rows) == 1:
                    logging.info("Since nonzero existing protected_groups found, skipping")
                    continue

            logging.info(
                "Generating protected groups for as of date %s ",
                as_of_date
            )
            self.generate(
                start_date=as_of_date,
                cohort_table_name=cohort_table_name,
                cohort_hash=cohort_hash
            )
        if table_is_new:
            # Index only tables created in this run, after bulk population.
            self.db_engine.execute(f"create index on {self.protected_groups_table_name} (cohort_hash, as_of_date)")
        nrows = self.db_engine.execute(
            "select count(*) from {}".format(self.protected_groups_table_name)
        ).scalar()
        if nrows == 0:
            logging.warning("Done creating protected_groups, but no rows in protected_groups table!")
        else:
            logging.info("Done creating protected_groups table %s: rows: %s", self.protected_groups_table_name, nrows)