Пример #1
0
    def generate_sparse_table(self, as_of_dates):
        """Build and index the sparse states table for the given as_of_dates.

        Converts the object's input table into the sparse states table,
        creates an index on (entity_id, as_of_date), verifies that the table
        holds data, and logs its row count.

        Args:
            as_of_dates (list of datetime.dates) Dates to include in the sparse
                state table

        Raises:
            ValueError: if the sparse table has no data after being populated
        """
        logging.debug("Generating sparse table using as_of_dates: %s",
                      as_of_dates)
        self._create_and_populate_sparse_table(as_of_dates)

        # Read once after population; every later step targets this table.
        engine = self.db_engine
        table_name = self.sparse_table_name

        index_statement = "create index on {} (entity_id, as_of_date)".format(
            table_name)
        engine.execute(index_statement)
        logging.info(
            "Indices created on entity_id and as_of_date for sparse state table"
        )

        # The emptiness check deliberately stays after the index statement,
        # so the index has already been issued when this error is raised.
        populated = table_has_data(table_name, engine)
        if not populated:
            raise ValueError(self._empty_table_message(as_of_dates))

        logging.info("Sparse states table generated at %s", table_name)
        logging.info("Generating stats on %s", table_name)
        row_total = table_row_count(table_name, engine)
        logging.info("Row count of %s: %s", table_name, row_total)
Пример #2
0
    def generate_all_labels(self, labels_table, as_of_dates, label_timespans):
        """Generate labels for every (as_of_date, label_timespan) combination.

        Creates the labels table via ``_create_labels_table`` and then calls
        ``generate`` once per combination. When ``self.replace`` is falsy, a
        combination that already has a row for ``self.label_name`` is skipped.
        Afterwards an index on (entity_id, as_of_date) is added and the table
        is checked for emptiness and for duplicate rows.

        Args:
            labels_table: name of the labels table; interpolated directly
                into the SQL statements issued here
            as_of_dates: sized collection of as-of dates to generate labels for
            label_timespans: sized collection of label timespans; each one is
                cast with ``::interval`` in the existing-label lookup

        Raises:
            ValueError: if the labels table ends up with no rows, or if it
                contains duplicates over (entity_id, as_of_date,
                label_timespan, label_name, label_type)
        """
        self._create_labels_table(labels_table)
        logger.spam(
            f"Creating labels for {len(as_of_dates)} as of dates and {len(label_timespans)} label timespans"
        )

        # Lazily yields pairs in the same order as a nested loop would:
        # all timespans for the first date, then all for the second, etc.
        date_timespan_pairs = (
            (as_of_date, label_timespan)
            for as_of_date in as_of_dates
            for label_timespan in label_timespans
        )

        for as_of_date, label_timespan in date_timespan_pairs:
            if not self.replace:
                logger.spam(
                    f"Looking for existing labels for as of date {as_of_date} and label timespan {label_timespan}"
                )
                # NOTE(review): values are interpolated straight into the SQL
                # text; confirm that callers only pass trusted values.
                existence_sql = f"""select 1 from {labels_table}
                        where as_of_date = '{as_of_date}'
                        and label_timespan = '{label_timespan}'::interval
                        and label_name = '{self.label_name}'
                        limit 1
                        """
                existing_rows = list(self.db_engine.execute(existence_sql))
                # "limit 1" caps the result at a single row, so a length of
                # one means labels already exist for this combination.
                if len(existing_rows) == 1:
                    logger.spam(
                        "Since nonzero existing labels found, skipping")
                    continue

            logger.debug(
                f"Generating labels for as of date {as_of_date} and label timespan {label_timespan}",
            )
            self.generate(
                start_date=as_of_date,
                label_timespan=label_timespan,
                labels_table=labels_table,
            )

        index_sql = f"create index on {labels_table} (entity_id, as_of_date)"
        self.db_engine.execute(index_sql)
        logger.spam("Added index to labels table")

        row_total = table_row_count(labels_table, self.db_engine)
        if row_total == 0:
            logger.warning(
                f"Done creating labels, but no rows in {labels_table} table!")
            raise ValueError(f"{labels_table} is empty!")

        # Columns that together are expected to identify a label row uniquely.
        uniqueness_columns = [
            'entity_id',
            'as_of_date',
            'label_timespan',
            'label_name',
            'label_type',
        ]
        has_duplicates = table_has_duplicates(
            labels_table, uniqueness_columns, self.db_engine)
        if has_duplicates:
            raise ValueError(f"Duplicates found in {labels_table}!")

        logger.debug(f"Labels table generated at {labels_table}")
        logger.spam(f"Row count of {labels_table}: {row_total}")