예제 #1
0
def run_coalescer(cfg: dict,
                  tables: List[str],
                  period: str,
                  run_once: bool,
                  logger: Logger,
                  no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs it once and returns, or keeps running it periodically, depending
    on the value of run_once. After every run the statistics returned by
    do_coalesce are written to the 'sqCoalescer' table via the DB engine.

    The process exits with status 1 if the schema cannot be loaded or if
    the period cannot be turned into a sleep interval.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param period: str, the string of how periodically the coalescer runs,
                   Examples are '1h', '1d' etc.
    :param run_once: bool, True if you want the coalescer to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller rec
    :returns: Nothing
    :rtype: none

    """

    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)

    sleep_time = 0
    if not run_once:
        now = datetime.now()
        nextrun = parse(period, settings={'PREFER_DATES_FROM': 'future'})
        # NOTE(review): assumes parse() signals an unparseable period by
        # returning None (as dateparser.parse does) - fail with a clear
        # message rather than a TypeError on the subtraction below.
        if nextrun is None:
            logger.error(f'Aborting. Unable to parse period: {period}')
            print(f'ERROR: Aborting. Unable to parse period: {period}')
            sys.exit(1)
        # timedelta.seconds only holds the sub-day remainder, so a period
        # such as '1d' would give a sleep time of 0; total_seconds() covers
        # the whole interval. Clamp at 0 because sleep() rejects negatives.
        sleep_time = max(int((nextrun - now).total_seconds()), 0)
        logger.info(f'Got sleep time of {sleep_time} secs')

    while True:
        # Reset on every pass so that a failed run neither hits an unbound
        # name nor rewrites the statistics of the previous run.
        stats = []
        try:
            stats = do_coalesce(cfg, tables, period, logger, no_sqpoller)
        except Exception:
            logger.exception('Coalescer aborted. Continuing')

        # Write the self stats
        df = pd.DataFrame([asdict(x) for x in stats])
        if not df.empty:
            df['sqvers'] = coalescer_schema.version
            df['version'] = SUZIEQ_VERSION
            df['active'] = True
            df['namespace'] = ''
            pqdb.write('sqCoalescer', 'pandas', df, True,
                       coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep(sleep_time)
예제 #2
0
    def migrate(self, table_name: str, schema: SchemaForTable) -> None:
        """Migrate the stored data of a table to the current schema version.

        Every stored sqvers that differs from schema.version, and for which
        get_migrate_fn() returns a routine, is processed file by file: the
        file is read, passed through the routine, given default values for
        any schema field it lacks, and written out under the same path with
        sqvers=<old> replaced by sqvers=<current>. Each source file is
        removed once written, and the old version's directory is removed
        after all of its files have been handled.

        :param table_name: str, The name of the table to migrate
        :param schema: SchemaForTable, the current schema
        :returns: None
        :rtype: None
        """

        target_vers = schema.version
        type_defaults = self._get_default_vals()
        arrow_schema = schema.get_arrow_schema()
        field_types = dict(zip(arrow_schema.names, arrow_schema.types))

        for old_vers in self._get_avail_sqvers(table_name, True):
            if old_vers == target_vers:
                continue

            converter = get_migrate_fn(table_name, old_vers, target_vers)
            if not converter:
                # No migration routine for this version pair
                continue

            dataset = self._get_cp_dataset(table_name, True, old_vers,
                                           'all', '', '')
            for src_file in dataset.files:
                pieces = src_file.split('namespace=')
                if len(pieces) < 2:
                    # Don't convert data not in our template
                    continue
                namespace = pieces[1].split('/')[0]

                # These two values come from the path and the version being
                # migrated, so set them before handing the frame over.
                olddf = pd.read_parquet(src_file)
                olddf['sqvers'] = old_vers
                olddf['namespace'] = namespace
                newdf = converter(olddf)

                # Ensure all fields are present
                present = newdf.columns
                for field, ftype in field_types.items():
                    if field not in present:
                        newdf[field] = type_defaults.get(ftype, '')

                # NOTE(review): drop() returns a new frame and the result is
                # not assigned, so newdf still has both columns when it is
                # written. Left untouched as the arrow schema used below may
                # expect them - confirm before changing.
                newdf.drop(columns=['namespace', 'sqvers'])

                dest_file = src_file.replace(f'sqvers={old_vers}',
                                             f'sqvers={target_vers}')
                dest_dir = os.path.dirname(dest_file)
                if not os.path.exists(dest_dir):
                    os.makedirs(dest_dir, exist_ok=True)

                table = pa.Table.from_pandas(
                    newdf,
                    schema=schema.get_arrow_schema(),
                    preserve_index=False)
                pq.write_to_dataset(table,
                                    dest_file,
                                    version="2.0",
                                    compression="ZSTD",
                                    row_group_size=100000)
                self.logger.debug(
                    f'Migrated {src_file} version {old_vers}->{target_vers}'
                )
                os.remove(src_file)

            # All files of the old version are done; remove what is left
            table_dir = self._get_table_directory(table_name, True)
            rmtree(f'{table_dir}/sqvers={old_vers}', ignore_errors=True)