예제 #1
0
    def __init__(self, engine_name: str = '',
                 hostname: typing.List[str] = None,
                 start_time: str = '', end_time: str = '',
                 view: str = '', namespace: typing.List[str] = None,
                 columns: typing.List[str] = None,
                 context=None, table: str = '', config_file=None) -> None:
        """Set up the query object for a table.

        :param engine_name: str, name of the analysis engine; when empty
                            the engine recorded in the context is used
        :param hostname: List[str], hostnames to filter on; falls back to
                         the context's hostname, then to an empty list
        :param start_time: str, start of the time window; falls back to
                           the context's start_time
        :param end_time: str, end of the time window; falls back to the
                         context's end_time
        :param view: str, view to use; when neither this nor the context
                     provides one it is 'all' if both start and end time
                     are set and 'latest' otherwise
        :param namespace: List[str], namespaces to filter on; falls back
                          to the context's namespace, then to an empty list
        :param columns: List[str], columns to return, ['default'] if unset
        :param context: existing context to reuse; when not given a new
                        SqContext is built from the loaded config. A given
                        context that lacks a config or engine is completed
                        in place
        :param table: str, name of the table this object queries
        :param config_file: config file handed to load_sq_config whenever
                            the config has to be loaded here
        :raises ValueError: if no analysis engine could be selected
        """

        if not context:
            self.ctxt = SqContext(cfg=load_sq_config(validate=True,
                                                     config_file=config_file),
                                  engine=engine_name)
            self.ctxt.schemas = Schema(self.ctxt.cfg["schema-directory"])
        else:
            self.ctxt = context
            if not self.ctxt.cfg:
                self.ctxt.cfg = load_sq_config(validate=True,
                                               config_file=config_file)
                self.ctxt.schemas = Schema(self.ctxt.cfg["schema-directory"])
            if not self.ctxt.engine:
                self.ctxt.engine = engine_name

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()
        self._convert_args = {}

        # Explicit arguments win over whatever the context carries
        self.namespace = namespace or self.ctxt.namespace or []
        self.hostname = hostname or self.ctxt.hostname or []
        self.start_time = start_time or self.ctxt.start_time
        self.end_time = end_time or self.ctxt.end_time

        view = view or self.ctxt.view

        if self.start_time and self.end_time and not view:
            self.view = 'all'
        else:
            self.view = view or 'latest'

        self.columns = columns or ['default']
        self._unique_def_column = ['hostname']

        # Bind the attribute up front: without this, having no engine name
        # anywhere surfaced as an AttributeError on the check below rather
        # than the intended ValueError
        self.engine = None
        if engine_name:
            self.engine = get_sqengine(engine_name, self._table)(self)
        elif self.ctxt.engine:
            self.engine = get_sqengine(self.ctxt.engine, self._table)(self)

        if not self.engine:
            raise ValueError('Unknown analysis engine')

        self.summarize_df = pd.DataFrame()

        # One list object per attribute. A chained "a = b = []" would make
        # the attributes aliases of a single list, so an in-place change to
        # one would silently show up in the other
        self._addnl_filter = []
        self._addnl_fields = []
        self._valid_get_args = []
        self._valid_assert_args = []
        self._valid_arg_vals = []
        self._valid_find_args = []
        self._valid_summarize_args = []
예제 #2
0
    def on_connected(self, *args, **kwargs):
        """Load the Suzieq configuration and fill in the shared context.

        Prints the version and exits when the -V flag is set. Otherwise
        loads the configuration (from the file given on the command line,
        if any), exits with status 1 when none is found, and stores the
        config, schemas and UX engine on the context. When the engine is
        'rest', the REST server address, port, transport and API key are
        copied from the 'rest' section of the config as well.
        """
        if self._args.V:
            print_version()
            sys.exit(0)
        if self._args.config:
            self.ctxt.cfg = load_sq_config(validate=True,
                                           config_file=self._args.config)
        else:
            self.ctxt.cfg = load_sq_config(validate=True)

        if not self.ctxt.cfg:
            print('ERROR: No suzieq configuration found')
            print('Create a suzieq-cfg.yml under the homedir or current dir')
            print('OR pass a path to the config file via -c argument')
            sys.exit(1)
        self.ctxt.schemas = Schema(self.ctxt.cfg["schema-directory"])
        cfg = self.ctxt.cfg
        self.ctxt.engine = cfg.get('ux', {}).get('engine', 'pandas')
        if self.ctxt.engine == 'rest':
            # See if we can extract the REST info from the REST part
            restcfg = cfg.get('rest', {})
            self.ctxt.rest_server_ip = restcfg.get('address', '127.0.0.1')

            # The port used to be read from the 'address' key, so it ended
            # up holding the server address whenever one was configured
            rest_port = restcfg.get('port', '80')
            self.ctxt.rest_server_port = rest_port
            # Historical misspelling of the attribute; still set so that
            # any reader of the old name keeps working
            self.ctxt.reset_server_port = rest_port

            # YAML yields a real boolean for "no-https: false", which never
            # equals the string 'False'; normalise both spellings before
            # deciding on the transport
            no_https = restcfg.get('no-https', False)
            if isinstance(no_https, str):
                no_https = no_https.strip().lower() != 'false'
            if no_https:
                self.ctxt.transport = 'http'
            else:
                self.ctxt.transport = 'https'
            self.ctxt.rest_api_key = restcfg.get('API_KEY', '')
예제 #3
0
def run_coalescer(cfg: dict,
                  tables: List[str],
                  periodstr: str,
                  run_once: bool,
                  logger: Logger,
                  no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs it once and returns or periodically depending on the
    value of run_once. It also writes out the coalescer records
    as a parquet file.

    The process exits with status 1 if the schema cannot be loaded or if
    periodstr fails validation. A failure of an individual coalescing pass
    is logged and the loop carries on.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param periodstr: str, the string of how periodically the coalescer
                      runs. Examples are '1h', '1d' etc.
    :param run_once: bool, True if you want the coalescer to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller rec
    :returns: Nothing
    :rtype: none

    """

    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:  # pylint: disable=broad-except
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)

    status, errmsg = validate_periodstr(periodstr)
    if not status:
        logger.error(errmsg)
        print(f'ERROR: {errmsg}')
        sys.exit(1)

    while True:
        # Reset on every pass: a failed pass must neither leave the name
        # unbound (first pass) nor rewrite the previous pass's statistics
        stats = None
        try:
            stats = do_coalesce(cfg, tables, periodstr, logger, no_sqpoller)
        except Exception:  # pylint: disable=broad-except
            logger.exception('Coalescer aborted. Continuing')

        # Write the coalescer's own statistics
        if stats:
            df = pd.DataFrame([asdict(x) for x in stats])
            if not df.empty:
                df['sqvers'] = coalescer_schema.version
                df['version'] = SUZIEQ_VERSION
                df['active'] = True
                df['namespace'] = ''
                pqdb.write('sqCoalescer', 'pandas', df, True,
                           coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep_time = get_sleep_time(periodstr)
        sleep(sleep_time)
예제 #4
0
def test_transform(input_file):
    '''Test transformation is captured by coalescer.

    Applies the record transformations listed in the input file to a copy
    of the data directory, coalesces the result, checks that the 'tables'
    output is the same before and after coalescing, and finally runs the
    checks listed in the file's verify section. The copy is cleaned up
    whether or not the checks pass.

    :param input_file: transformation description, handed to Yaml2Class
    '''
    to_transform = Yaml2Class(input_file)

    try:
        data_directory = to_transform.transform.data_directory
    except AttributeError:
        print('Invalid transformation file, no data directory')
        pytest.fail('AttributeError', pytrace=True)

    #  Make a copy of the data directory
    temp_dir, tmpfile = _coalescer_init(data_directory)

    # Everything below can fail an assertion; the finally clause makes
    # sure the copy is removed in that case too
    try:
        cfg = load_sq_config(config_file=tmpfile.name)
        schemas = Schema(cfg['schema-directory'])

        # pylint: disable=no-member
        _apply_transform_records(to_transform.transform.transform, cfg,
                                 schemas, tmpfile.name)

        # Now we coalesce and verify it works
        pre_table_df = get_sqobject('tables')(config_file=tmpfile.name).get()
        do_coalesce(cfg, None)
        _verify_coalescing(temp_dir)

        post_table_df = get_sqobject('tables')(config_file=tmpfile.name).get()
        assert_df_equal(pre_table_df, post_table_df, None)

        # Run additional tests on the coalesced data
        _check_coalesced_data(to_transform.transform.verify, tmpfile.name)
    finally:
        _coalescer_cleanup(temp_dir, tmpfile)


def _apply_transform_records(transforms, cfg, schemas, config_file):
    '''Apply every record of every transformation and write the result.

    :param transforms: the transformation entries from the input file
    :param cfg: the loaded Suzieq config
    :param schemas: Schema object built from the config
    :param config_file: path of the config file used to read the tables
    '''
    # pylint: disable=too-many-nested-blocks
    for ele in transforms:
        query_str_list = []
        # Each transformation has a record => write's happen per record
        for record in ele.record:
            changed_fields = set()
            new_df = pd.DataFrame()
            tables = [x for x in dir(record) if not x.startswith('_')]
            for table in tables:
                # Lets read the data in now that we know the table
                tblobj = get_sqobject(table)
                pq_db = get_sqdb_engine(cfg, table, None, None)
                columns = schemas.fields_for_table(table)
                mod_df = tblobj(config_file=config_file).get(columns=columns)

                for key in getattr(record, table):
                    query_str = key.match
                    chg_df = pd.DataFrame()
                    if query_str != "all":
                        try:
                            chg_df = mod_df.query(query_str) \
                                           .reset_index(drop=True)
                        except Exception as ex:  # pylint: disable=broad-except
                            # A query that cannot be evaluated is a broken
                            # test input; fail with the reason spelled out
                            pytest.fail(
                                f'Query {query_str!r} on {table} failed: {ex}')
                        query_str_list.append(query_str)
                    else:
                        chg_df = mod_df

                    _process_transform_set(key.set, chg_df, changed_fields)
                    if new_df.empty:
                        new_df = chg_df
                    elif not chg_df.empty:
                        new_df = pd.concat([new_df, chg_df])

                if new_df.empty:
                    continue

                # Write the records now
                _write_verify_transform(new_df, table, pq_db,
                                        SchemaForTable(table,
                                                       schemas), config_file,
                                        query_str_list, changed_fields)


def _check_coalesced_data(verify, config_file):
    '''Run the checks of the verify section against the coalesced data.

    Each check reads a table (optionally within a time window and with a
    query applied) and asserts on the result: empty when 'assertempty' is
    present, the given rows/columns when 'shape' is present ('*' skips a
    dimension), and non-empty otherwise.

    :param verify: the verify entries from the input file
    :param config_file: path of the config file used to read the tables
    '''
    for ele in verify:
        table = [x for x in dir(ele) if not x.startswith('_')][0]
        tblobj = get_sqobject(table)

        for tst in getattr(ele, table):
            start_time = tst.test.get('start-time', '')
            end_time = tst.test.get('end-time', '')

            columns = tst.test.get('columns', ['default'])
            df = tblobj(config_file=config_file,
                        start_time=start_time,
                        end_time=end_time).get(columns=columns)
            if not df.empty and 'query' in tst.test:
                query_str = tst.test['query']
                df = df.query(query_str).reset_index(drop=True)

            if 'assertempty' in tst.test:
                assert (df.empty)
            elif 'shape' in tst.test:
                shape = tst.test['shape'].split()
                if shape[0] != '*':
                    assert (int(shape[0]) == df.shape[0])
                if shape[1] != '*':
                    assert (int(shape[1]) == df.shape[1])
            else:
                assert (not df.empty)
예제 #5
0
    async def init_services(self) -> List[Service]:
        """Instantiate Service objects and prepare
        them for running. This function should be called before
        scheduling the service for running.

        Every '*.yml' file in the service directory is read. A file is
        skipped when it is empty, names a service that is not in the
        requested list, lacks the 'service' or 'apply' keys, has no
        matching schema, or describes a 'derivedRecord' table.

        Returns:
            List[Service]: the list of the initialized service instances.
            None is returned, and the service list is left untouched, when
            a device type asks to copy from a device type that the same
            file does not define.
        """
        services = []
        svc_classes = Service.get_plugins()

        # Bound up front so the assignments further down cannot hit an
        # unbound name should the schema object evaluate as false
        poller_schema = None
        poller_schema_version = None

        schemas = Schema(self.schema_dir)
        if schemas:
            poller_schema = schemas.get_arrow_schema('sqPoller')
            poller_schema_version = SchemaForTable('sqPoller', schemas).version

        # Read the available services and iterate over them, discarding
        # the ones we do not need to instantiate
        svc_desc_files = Path(self.service_directory).glob('*.yml')

        for filename in svc_desc_files:
            with open(filename, 'r') as f:
                svc_def = yaml.safe_load(f.read())

            if not svc_def:
                logger.warning(f'Skip empty service file: {filename}')
                continue

            if svc_def.get('service') not in self.svcs_list:
                logger.warning(
                    f"Ignoring unspecified service {svc_def.get('service')}"
                )
                continue

            if 'service' not in svc_def or 'apply' not in svc_def:
                logger.error(
                    'Ignoring invalid service file definition.'
                    f"'service' and 'apply' keywords: {filename}"
                )
                continue

            period = svc_def.get('period', self.default_interval)
            for nos, cmds_desc in svc_def['apply'].items():

                # Check if the current nos copies from another
                if isinstance(cmds_desc, dict) and 'copy' in cmds_desc:
                    newval = svc_def['apply'].get(cmds_desc['copy'], None)
                    if not newval:
                        logger.error(
                            f"No device type {cmds_desc['copy']} to copy from,"
                            f"for {nos} for service {svc_def['service']}"
                        )
                        # NOTE(review): this abandons the whole
                        # initialization and hands None to the caller
                        # instead of a list - confirm callers expect that
                        return
                    cmds_desc = newval

                # Update the command description adding the
                # specification for the output parsing
                if isinstance(cmds_desc, list):
                    for subele in cmds_desc:
                        self._parse_nos_version(filename, svc_def, nos, subele)
                else:
                    self._parse_nos_version(filename, svc_def, nos, cmds_desc)

            try:
                schema = SchemaForTable(svc_def['service'], schema=schemas)
            except Exception:  # pylint: disable=broad-except
                logger.error(f"No matching schema for {svc_def['service']}")
                continue

            if schema.type == 'derivedRecord':
                # These are not real services and so ignore them
                continue

            # Valid service definition, add it to list
            # if the service has not a dedicated class, we will use the
            # default implementation
            class_to_use = svc_classes.get(svc_def['service'], Service)
            service = class_to_use(
                svc_def['service'],
                svc_def['apply'],
                period,
                svc_def.get('type', 'state'),
                svc_def.get('keys', []),
                svc_def.get('ignore-fields', []),
                schema,
                self.output_queue,
                self.run_mode
            )
            service.poller_schema = poller_schema
            service.poller_schema_version = poller_schema_version
            logger.info(f'Service {service.name} added')
            services.append(service)

        # Once done set the service list and return its content
        self._services = services
        return self._services
예제 #6
0
def coalescer_main():
    """Command-line entry point of the coalescer.

    Parses the command line, loads the Suzieq config, sets up logging,
    makes sure no other coalescer holds the pid file, works out the period
    and the list of tables, and hands over to run_coalescer. The pid file
    is cleared and unlocked on the way out, including when the run ends
    through an exception or sys.exit.

    Exits with status 1 on an invalid config or when the -s/-x options
    leave no table to coalesce, with errno.EBUSY when another coalescer is
    present, and with 0 after a normal run.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--service-only",
        type=str,
        help="Only run this space separated list of services",
    )
    parser.add_argument(
        "-x",
        "--exclude-services",
        type=str,
        help="Exclude running this space separated list of services",
    )

    parser.add_argument("-c",
                        "--config",
                        type=str,
                        help="alternate config file")
    parser.add_argument(
        "--run-once",
        default=False,
        help='Run the coalescer once and exit',
        action='store_true',
    )
    parser.add_argument(
        "-p",
        "--period",
        type=str,
        help=('Override the period specified in config file with this. '
              'Format is <period><m|h|d|w>. 1h is 1 hour, 2w is 2 weeks etc.'))
    parser.add_argument("--no-sqpoller",
                        action='store_true',
                        help=argparse.SUPPRESS)

    userargs = parser.parse_args()

    cfg = load_sq_config(config_file=userargs.config)
    if not cfg:
        print(f'Invalid Suzieq config file {userargs.config}')
        sys.exit(1)

    logfile, loglevel, logsize, log_stdout = get_log_params(
        'coalescer', cfg, '/tmp/sq-coalescer.log')
    logger = init_logger('suzieq.coalescer', logfile, loglevel, logsize,
                         log_stdout)

    # Ensure we're the only compacter
    coalesce_dir = cfg.get('coalescer', {})\
        .get('coalesce-directory',
             f'{cfg.get("data-directory")}/coalesced')

    fd = ensure_single_instance(f'{coalesce_dir}/.sq-coalescer.pid', False)
    if not fd:
        print('ERROR: Another coalescer process present')
        logger.error('Another coalescer process present')
        sys.exit(errno.EBUSY)

    try:
        # Command line wins over the config file, which wins over '1h'
        timestr = userargs.period or cfg.get('coalescer', {}).get('period',
                                                                  '1h')

        schemas = Schema(cfg.get('schema-directory'))
        if userargs.service_only or userargs.exclude_services:
            tables = [
                x for x in schemas.tables()
                if (schemas.type_for_table(x) != "derivedRecord")
            ]
            if userargs.service_only:
                wanted = set(userargs.service_only.split())
                tables = [x for x in tables if x in wanted]
            if userargs.exclude_services:
                unwanted = set(userargs.exclude_services.split())
                tables = [x for x in tables if x not in unwanted]
            if not tables:
                # The no-filter case below passes an empty list, so an
                # empty list cannot also stand for "the filters matched
                # nothing"; stop rather than hand over that ambiguity
                errmsg = 'No tables left to coalesce after applying -s/-x'
                print(f'ERROR: {errmsg}')
                logger.error(errmsg)
                sys.exit(1)
        else:
            tables = []

        run_coalescer(cfg, tables, timestr, userargs.run_once, logger,
                      userargs.no_sqpoller)
    finally:
        # Best-effort release of the single-instance pid file
        try:
            os.truncate(fd, 0)
            fcntl.flock(fd, fcntl.LOCK_UN)
            os.close(fd)
        except OSError:
            pass

    sys.exit(0)
예제 #7
0
        pa.list_(pa.int64()): [],
    }

    with concurrent.futures.ProcessPoolExecutor(max_workers=None) as thread:
        threads = {thread.submit(convert_file, item,
                                 output_dir, sqschema, defaults, arrow_schema)
                   for item in files}
        for future in concurrent.futures.as_completed(threads):
            try:
                _ = future.result()
            except Exception:
                logger.exception(f'Exception occcurred with {future}')


# Script entry point: convert the parquet files of one table.
# Expects three positional arguments: input dir, output dir, schema dir.
if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Usage: convert_parquet <input dir> <output_dir> <schema_dir>')
        sys.exit(1)

    input_dir = Path(sys.argv[1])
    output_dir = sys.argv[2]
    schemas = Schema(sys.argv[3])
    # The last component of the input directory is used as the table name
    # when looking up the schema
    service = input_dir.parts[-1]
    svc_schema = SchemaForTable(service, schema=schemas)
    arrow_schema = svc_schema.get_arrow_schema()
    sqschema = svc_schema.get_raw_schema()

    logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
    # Bound at module level: the conversion code above reports failures
    # through the global name "logger"
    logger = logging.getLogger('sq-converter')
    convert_dir(input_dir, output_dir, sqschema, arrow_schema)
예제 #8
0
파일: parquetdb.py 프로젝트: skg-net/suzieq
    def coalesce(self,
                 tables: List[str] = None,
                 period: str = '',
                 ign_sqpoller: bool = False) -> Optional[List]:
        """Coalesce all the resource parquet files in specified folder.

        This routine does not run periodically. It runs once and returns.

        :param tables: List[str], List of specific tables to coalesce,
                       empty for all
        :param period: str, coalescing period as <number><m|h|d|w>; when
                       empty the 'period' of the 'coalescer' config section
                       is used, or '1h' if that is not set either
        :param ign_sqpoller: True if its OK to ignore the absence of sqpoller
                             to coalesce
        :returns: coalesce statistics list, one per table that had an input
                  folder; None if the period is invalid or if sqPoller is
                  required but not among the tables
        :rtype: Optional[List[SqCoalesceStats]]
        """

        infolder = self.cfg['data-directory']
        outfolder = self._get_table_directory('', True)  # root folder
        archive_folder = self.cfg.get('coalescer', {}) \
            .get('archive-directory',
                 f'{infolder}/_archived')

        if not period:
            # Same 'coalescer' section as the archive directory above
            period = self.cfg.get('coalescer', {}).get('period', '1h')
        schemas = Schema(self.cfg.get('schema-directory'))
        state = SqCoalesceState(self.logger, period)

        state.logger = self.logger
        # Trying to be complete here. The ignore prefixes assume you have
        # coalescers across multiple time periods running, and so we need
        # to ignore the files created by the longer time period coalescers.
        # In other words, weekly coalescer should ignore monthly and yearly
        # coalesced files, monthly coalescer should ignore yearly coalescer
        # and so on.
        try:
            timeint = int(period[:-1])
            time_unit = period[-1]
            if time_unit == 'm':
                run_int = timedelta(minutes=timeint)
                state.prefix = 'sqc-m-'
                state.ign_pfx = ['.', '_', 'sqc-']
            elif time_unit == 'h':
                run_int = timedelta(hours=timeint)
                state.prefix = 'sqc-h-'
                state.ign_pfx = [
                    '.', '_', 'sqc-y-', 'sqc-d-', 'sqc-w-', 'sqc-M-'
                ]
            elif time_unit == 'd':
                run_int = timedelta(days=timeint)
                if timeint > 364:
                    state.prefix = 'sqc-y-'
                    state.ign_pfx = ['.', '_', 'sqc-y-']
                elif timeint > 29:
                    state.prefix = 'sqc-M-'
                    state.ign_pfx = ['.', '_', 'sqc-M-', 'sqc-y-']
                else:
                    state.prefix = 'sqc-d-'
                    state.ign_pfx = [
                        '.', '_', 'sqc-m-', 'sqc-d-', 'sqc-w-', 'sqc-M-',
                        'sqc-y-'
                    ]
            elif time_unit == 'w':
                run_int = timedelta(weeks=timeint)
                state.prefix = 'sqc-w-'
                state.ign_pfx = ['.', '_', 'sqc-w-', 'sqc-m-', 'sqc-y-']
            else:
                self.logger.error(f'Invalid unit for period, {time_unit}, '
                                  'must be one of m/h/d/w')
                # No interval could be derived; going on would only fail
                # on the unbound run_int below
                return None
        except ValueError:
            self.logger.error(f'Invalid time, {period}')
            return None

        state.period = run_int
        # Create list of tables to coalesce.
        # TODO: Verify that we're only coalescing parquet tables here
        known_tables = list(schemas.tables())
        if tables:
            unknown = [x for x in tables if x not in known_tables]
            if unknown:
                self.logger.warning(
                    f'Ignoring tables without a schema: {unknown}')
            tables = [
                x for x in tables
                if x in known_tables and (
                    schemas.type_for_table(x) != "derivedRecord")
            ]
        else:
            tables = [
                x for x in known_tables
                if schemas.type_for_table(x) != "derivedRecord"
            ]
        if 'sqPoller' not in tables and not ign_sqpoller:
            # This is an error. sqPoller keeps track of discontinuities
            # among other things.
            self.logger.error(
                'No sqPoller data, cannot compute discontinuities')
            return None

        # We want sqPoller to be first to compute discontinuities
        with suppress(ValueError):
            tables.remove('sqPoller')
        if not ign_sqpoller:
            tables.insert(0, 'sqPoller')

        # We've forced the sqPoller to be always the first table to coalesce
        stats = []
        for entry in tables:
            table_outfolder = f'{outfolder}/{entry}'
            table_infolder = f'{infolder}//{entry}'
            if archive_folder:
                table_archive_folder = f'{archive_folder}/{entry}'
            else:
                table_archive_folder = None
            state.current_df = pd.DataFrame()
            state.dbeng = self
            state.schema = SchemaForTable(entry, schemas, None)
            if not os.path.isdir(table_infolder):
                self.logger.info(f'No input records to coalesce for {entry}')
                continue
            # Give the failure path a start time of its own, so it never
            # reads an unbound name or a value left over from another table
            start = time()
            try:
                if not os.path.isdir(table_outfolder):
                    os.makedirs(table_outfolder)
                if (table_archive_folder
                        and not os.path.isdir(table_archive_folder)):
                    os.makedirs(table_archive_folder, exist_ok=True)
                # Migrate the data if needed
                self.logger.debug(f'Migrating data for {entry}')
                self.migrate(entry, state.schema)
                self.logger.debug(f'Migrating data for {entry}')
                # Only the coalescing itself is timed on success
                start = time()
                coalesce_resource_table(table_infolder, table_outfolder,
                                        table_archive_folder, entry, state)
                end = time()
                self.logger.info(f'coalesced {state.wrfile_count} '
                                 f'files/{state.wrrec_count} '
                                 f'records of {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(end - start), state.wrfile_count,
                        state.wrrec_count,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))
            except Exception:  # pylint: disable=broad-except
                self.logger.exception(f'Unable to coalesce table {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(time() - start), 0, 0,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))

        return stats