Exemplo n.º 1
0
def coalescer_main():
    """Command-line entry point: parse options and run the coalescer.

    Loads the Suzieq config named by --config, runs the single-instance
    check on ``<coalesce-directory>/.sq-coalescer.pid``, derives the period
    string and the table list from the command line and the config, and
    calls run_coalescer with them. The descriptor returned by the
    single-instance check is truncated, unlocked and closed on the way out,
    even if run_coalescer raises.

    Exit status: 1 if the config cannot be loaded, errno.EBUSY if the
    single-instance check fails, 0 otherwise.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--service-only",
        type=str,
        help="Only run this space separated list of services",
    )
    parser.add_argument(
        "-x",
        "--exclude-services",
        type=str,
        help="Exclude running this space separated list of services",
    )

    parser.add_argument("-c",
                        "--config",
                        default=f'{os.getenv("HOME")}/.suzieq/suzieq-cfg.yml',
                        type=str,
                        help="alternate config file")
    parser.add_argument(
        "--run-once",
        default=False,
        help='Run the coalescer once and exit',
        action='store_true',
    )
    parser.add_argument(
        "-p",
        "--period",
        type=str,
        help=('Override the period specified in config file with this. '
              'Format is <period><h|d|w|y>. 1h is 1 hour, 2w is 2 weeks etc.'))
    # Hidden option: its help text is suppressed from --help output
    parser.add_argument("--no-sqpoller",
                        action='store_true',
                        help=argparse.SUPPRESS)

    userargs = parser.parse_args()

    cfg = load_sq_config(config_file=userargs.config)
    if not cfg:
        print(f'Invalid Suzieq config file {userargs.config}')
        sys.exit(1)

    logfile, loglevel = get_log_file_level('coalescer', cfg,
                                           '/tmp/sq-coalescer.log')
    logger = init_logger('suzieq.coalescer', logfile, loglevel, False)

    # Ensure we're the only coalescer working on this directory
    coalesce_dir = cfg.get('coalescer', {})\
                      .get('coalesce-directory',
                           f'{cfg.get("data-directory")}/coalesced')

    fd = ensure_single_instance(f'{coalesce_dir}/.sq-coalescer.pid', False)
    if not fd:
        print('ERROR: Another coalescer process present')
        logger.error('Another coalescer process present')
        sys.exit(errno.EBUSY)

    if userargs.run_once:
        # With --run-once an empty period is passed on; --period is not used
        timestr = ''
    elif not userargs.period:
        timestr = cfg.get('coalescer', {'period': '1h'}).get('period', '1h')
    else:
        timestr = userargs.period

    schemas = Schema(cfg.get('schema-directory'))
    if userargs.service_only or userargs.exclude_services:
        tables = [
            x for x in schemas.tables()
            if (schemas.type_for_table(x) != "derivedRecord")
        ]
        # Split each option once instead of once per candidate table
        if userargs.service_only:
            wanted = set(userargs.service_only.split())
            tables = [x for x in tables if x in wanted]
        if userargs.exclude_services:
            unwanted = set(userargs.exclude_services.split())
            tables = [x for x in tables if x not in unwanted]
    else:
        # No filtering requested: pass an empty list through
        # (presumably run_coalescer treats this as "all tables" - confirm)
        tables = []

    try:
        run_coalescer(cfg, tables, timestr, userargs.run_once, logger,
                      userargs.no_sqpoller or False)
    finally:
        # Clean up the lock file however run_coalescer ended, so a failure
        # in the run never skips the truncate/unlock/close sequence.
        try:
            os.truncate(fd, 0)
        finally:
            # Unlock/close is best effort; the descriptor may already be gone
            try:
                fcntl.flock(fd, fcntl.LOCK_UN)
                os.close(fd)
            except OSError:
                pass

    sys.exit(0)
Exemplo n.º 2
0
    def coalesce(self,
                 tables: List[str] = [],
                 period: str = '',
                 ign_sqpoller: bool = False) -> None:
        """Coalesce all the resource parquet files in specified folder.

        This routine does not run periodically. It runs once and returns.

        :param tables: List[str], List of specific tables to coalesce, empty
                       for all. Names not present in the schema, and tables
                       of type derivedRecord, are dropped. The list passed
                       in is never modified.
        :param period: str, coalescing period as <number><h|d|w>, e.g. 1h or
                       2w. When empty, the period from the coalescer section
                       of the config is used, defaulting to 1h.
        :param ign_sqpoller: True if its OK to ignore the absence of sqpoller
                             to coalesce
        :returns: coalesce statistics list, one per table processed, or None
                  if the period is invalid or sqPoller is required but is
                  not among the tables
        :rtype: List[SqCoalesceStats]
        """

        infolder = self.cfg['data-directory']
        outfolder = self._get_table_directory('', True)  # root folder
        archive_folder = self.cfg.get('coalescer', {}) \
                                 .get('archive-directory',
                                      f'{infolder}/_archived')

        if not period:
            # Same config section as archive-directory above
            period = self.cfg.get('coalescer', {}).get('period', '1h')
        schemas = Schema(self.cfg.get('schema-directory'))
        state = SqCoalesceState(self.logger, period)

        state.logger = self.logger

        # Split "<number><unit>" into its parts; only the number can fail
        # to parse, so only that is guarded.
        try:
            timeint = int(period[:-1])
        except ValueError:
            self.logger.error(f'Invalid time, {period}')
            return
        time_unit = period[-1]

        # Trying to be complete here. The ignore prefixes assume you have
        # coalescers across multiple time periods running, and so we need to
        # ignore the files created by the longer time period coalescings. In
        # other words, weekly coalescer should ignore monthly and yearly
        # coalesced files, monthly coalescer should ignore yearly coalesced
        # files and so on.
        if time_unit == 'h':
            run_int = timedelta(hours=timeint)
            state.prefix = 'sqc-h-'
            state.ign_pfx = ['.', '_', 'sqc-']
        elif time_unit == 'd':
            run_int = timedelta(days=timeint)
            if timeint > 364:
                state.prefix = 'sqc-y-'
                state.ign_pfx = ['.', '_', 'sqc-y-']
            elif timeint > 29:
                state.prefix = 'sqc-m-'
                state.ign_pfx = ['.', '_', 'sqc-m-', 'sqc-y-']
            else:
                state.prefix = 'sqc-d-'
                state.ign_pfx = [
                    '.', '_', 'sqc-d-', 'sqc-w-', 'sqc-m-', 'sqc-y-'
                ]
        elif time_unit == 'w':
            run_int = timedelta(weeks=timeint)
            state.prefix = 'sqc-w-'
            state.ign_pfx = ['.', '_', 'sqc-w-', 'sqc-m-', 'sqc-y-']
        else:
            self.logger.error(f'Invalid unit for period, {time_unit}, '
                              'must be one of h/d/w')
            # Without a valid unit there is no run interval to work with
            return

        state.period = run_int
        # Create list of tables to coalesce.
        # TODO: Verify that we're only coalescing parquet tables here
        if tables:
            known_tables = set(schemas.tables())
            tables = [
                x for x in tables
                if x in known_tables
                and schemas.type_for_table(x) != "derivedRecord"
            ]
        else:
            tables = [
                x for x in schemas.tables()
                if schemas.type_for_table(x) != "derivedRecord"
            ]
        if 'sqPoller' not in tables and not ign_sqpoller:
            # This is an error. sqPoller keeps track of discontinuities
            # among other things.
            self.logger.error(
                'No sqPoller data, cannot compute discontinuities')
            return

        # We want sqPoller to be first to compute discontinuities; when the
        # caller asked to ignore it, it is left out of the run altogether.
        with suppress(ValueError):
            tables.remove('sqPoller')
        if not ign_sqpoller:
            tables.insert(0, 'sqPoller')

        stats = []
        for entry in tables:
            table_outfolder = f'{outfolder}/{entry}'
            table_infolder = f'{infolder}//{entry}'
            if archive_folder:
                table_archive_folder = f'{archive_folder}/{entry}'
            else:
                table_archive_folder = None
            state.current_df = pd.DataFrame()
            state.dbeng = self
            state.schema = SchemaForTable(entry, schemas, None)
            if not os.path.isdir(table_infolder):
                self.logger.info(f'No input records to coalesce for {entry}')
                continue
            # Baseline so the failure path always has a start time for this
            # table, even when the error happens before coalescing begins.
            start = time()
            try:
                if not os.path.isdir(table_outfolder):
                    os.makedirs(table_outfolder)
                if (table_archive_folder
                        and not os.path.isdir(table_archive_folder)):
                    os.makedirs(table_archive_folder, exist_ok=True)
                # Migrate the data if needed
                self.logger.debug(f'Migrating data for {entry}')
                self.migrate(entry, state.schema)
                # (same message is logged again once migrate has returned)
                self.logger.debug(f'Migrating data for {entry}')
                # Reported duration on success covers only the coalescing
                start = time()
                coalesce_resource_table(table_infolder, table_outfolder,
                                        table_archive_folder, entry, state)
                end = time()
                self.logger.info(
                    f'coalesced {state.wrfile_count} files/{state.wrrec_count} '
                    f'records of {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(end - start), state.wrfile_count,
                        state.wrrec_count,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))
            except Exception:
                # Record the failure with zero counts and carry on with the
                # remaining tables.
                self.logger.exception(f'Unable to coalesce table {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(time() - start), 0, 0,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))

        return stats
Exemplo n.º 3
0
                                False)
    if not fd:
        print(f'ERROR: Another coalescer process present')
        logger.error(f'Another coalescer process present')
        sys.exit(errno.EBUSY)

    if userargs.run_once:
        timestr = ''
    elif not userargs.period:
        timestr = cfg.get('coalescer', {'period': '1h'}).get('period', '1h')
    else:
        timestr = userargs.period

    schemas = Schema(cfg.get('schema-directory'))
    if userargs.service_only or userargs.exclude_services:
        tables = [x for x in schemas.tables()
                  if (schemas.type_for_table(x) != "derivedRecord")]
        if userargs.service_only:
            tables = [x for x in tables if x in userargs.service_only.split()]
        if userargs.exclude_services:
            tables = [x for x in tables
                      if x not in userargs.exclude_services.split()]
    else:
        tables = []

    run_coalescer(cfg, tables, timestr, userargs.run_once,
                  logger, userargs.no_sqpoller or False)
    os.truncate(fd, 0)
    try:
        fcntl.flock(fd, fcntl.LOCK_UN)
        os.close(fd)