def on_connected(self, *args, **kwargs):
    if self._args.config:
        self.cfg = load_sq_config(validate=True,
                                  config_file=self._args.config)
    else:
        self.cfg = load_sq_config(validate=True)

    if not self.cfg:
        sys.exit(1)

    self.schemas = Schema(self.cfg["schema-directory"])
def run_coalescer(cfg: dict, tables: List[str], period: str, run_once: bool,
                  logger: Logger, no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs it once and returns or periodically depending on the value of
    run_once. It also writes out the coalescer records as a parquet file.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param period: str, the string of how periodically the poller runs,
                   Examples are '1h', '1d' etc.
    :param run_once: bool, True if you want the poller to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller rec
    :returns: Nothing
    :rtype: None
    """
    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)
    if not run_once:
        now = datetime.now()
        nextrun = parse(period, settings={'PREFER_DATES_FROM': 'future'})
        sleep_time = (nextrun - now).seconds
        logger.info(f'Got sleep time of {sleep_time} secs')
    while True:
        stats = []              # ensure stats is bound if do_coalesce raises
        try:
            stats = do_coalesce(cfg, tables, period, logger, no_sqpoller)
        except Exception:
            logger.exception('Coalescer aborted. Continuing')
        # Write out the coalescer's own stats
        df = pd.DataFrame([asdict(x) for x in stats])
        if not df.empty:
            df['sqvers'] = coalescer_schema.version
            df['version'] = SUZIEQ_VERSION
            df['active'] = True
            df['namespace'] = ''
            pqdb.write('sqCoalescer', 'pandas', df, True,
                       coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep(sleep_time)
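# Usage sketch (not from the source): a single coalescing pass over all
# tables, wired up with the same helpers the surrounding code uses. The
# config path and log-file default are assumptions.
def _run_coalescer_once_example():
    cfg = load_sq_config(config_file='~/.suzieq/suzieq-cfg.yml')
    logfile, loglevel = get_log_file_level('coalescer', cfg,
                                           '/tmp/sq-coalescer.log')
    logger = init_logger('suzieq.coalescer', logfile, loglevel, False)
    # An empty tables list means "coalesce every coalescible table"
    run_coalescer(cfg, tables=[], period='1h', run_once=True, logger=logger)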
def __init__(self, engine, config_file=None):
    self.cfg = load_sq_config(config_file=config_file)
    self.schemas = Schema(self.cfg['schema-directory'])

    self.namespace = ''
    self.hostname = ''
    self.start_time = ''
    self.end_time = ''
    self.exec_time = ''
    self.engine = engine
    self.sort_fields = []
def __init__(self, engine="pandas"): self.cfg = load_sq_config(validate=False) self.schemas = Schema(self.cfg["schema-directory"]) self.namespace = "" self.hostname = "" self.start_time = "" self.end_time = "" self.exec_time = "" self.engine_name = engine self.sort_fields = [] self.engine = get_sqengine(engine) super().__init__()
def __init__(self, engine):
    self.cfg = load_sq_config(validate=False)
    self.schemas = Schema(self.cfg['schema-directory'])

    self.namespace = ''
    self.hostname = ''
    self.start_time = ''
    self.end_time = ''
    self.exec_time = ''
    self.sort_fields = []

    # Honor the caller's engine choice; the previous code hardcoded
    # 'pandas' and silently ignored the engine argument
    self.engine = get_sqengine(engine)
    if not self.engine:
        # We really should define our own error
        raise ValueError
def on_connected(self, *args, **kwargs):
    if self._args.config:
        self.cfg = load_sq_config(validate=False,
                                  config_file=self._args.config)
    else:
        # Without the else branch, self.cfg is unset when no config file
        # is passed and the Schema lookup below raises
        self.cfg = load_sq_config(validate=False)

    self.schemas = Schema(self.cfg["schema-directory"])
async def init_services(svc_dir: str, schema_dir: str, queue, svclist: list,
                        def_interval: int, run_once: str):
    """Process service definitions by reading each file in svc dir"""

    svcs_list = []
    schemas = defaultdict(dict)

    # Load up all the service definitions we can find
    svc_classes = {}
    for i in walk_packages(path=[dirname(getfile(Service))]):
        for mbr in getmembers(
                importlib.import_module('suzieq.poller.services.' + i.name),
                isclass):
            if mbr[0] == "Service" or not mbr[0].endswith("Service"):
                continue
            svc_classes[i.name] = mbr[1]
            svc_classes[mbr[0]] = mbr[1]

    if not isdir(svc_dir):
        logger.error("services directory not a directory: {}".format(svc_dir))
        return svcs_list

    if not isdir(schema_dir):
        logger.error("schema directory not a directory: {}".format(schema_dir))
        return svcs_list
    else:
        schemas = Schema(schema_dir)

    if schemas:
        poller_schema = schemas.get_arrow_schema("sqPoller")
        poller_schema_version = SchemaForTable('sqPoller', schemas).version

    for root, _, filenames in walk(svc_dir):
        for filename in filenames:
            if filename.endswith(".yml"):
                with open(root + "/" + filename, "r") as f:
                    svc_def = yaml.safe_load(f.read())

                if svc_def.get('service') not in svclist:
                    logger.warning(
                        f'Ignoring unspecified service '
                        f'{svc_def.get("service")}')
                    continue

                if "service" not in svc_def or "apply" not in svc_def:
                    logger.error(
                        'Ignoring invalid service file definition. Need '
                        'both "service" and "apply" keywords: {}'.format(
                            filename))
                    continue

                period = svc_def.get("period", def_interval)

                for elem, val in svc_def["apply"].items():
                    if "copy" in val:
                        newval = svc_def["apply"].get(val["copy"], None)
                        if not newval:
                            logger.error("No device type {} to copy from for "
                                         "{} for service {}".format(
                                             val["copy"], elem,
                                             svc_def["service"]))
                            continue
                        val = newval

                    if (("command" not in val) or
                            ((isinstance(val['command'], list) and
                              not all('textfsm' in x or 'normalize' in x
                                      for x in val['command'])) or
                             (not isinstance(val['command'], list) and
                              ("normalize" not in val and
                               "textfsm" not in val)))):
                        logger.error(
                            "Ignoring invalid service file "
                            'definition. Need both "command" and '
                            '"normalize/textfsm" keywords: {}, {}'.format(
                                filename, val))
                        continue

                    if "textfsm" in val:
                        # We may have already visited this element and
                        # parsed the textfsm file. Check for this
                        if val["textfsm"] and isinstance(
                                val["textfsm"], textfsm.TextFSM):
                            continue
                        tfsm_file = svc_dir + "/" + val["textfsm"]
                        if not isfile(tfsm_file):
                            logger.error("Textfsm file {} not found. "
                                         "Ignoring service".format(tfsm_file))
                            continue
                        with open(tfsm_file, "r") as f:
                            tfsm_template = textfsm.TextFSM(f)
                            val["textfsm"] = tfsm_template
                    elif isinstance(val['command'], list):
                        for subelem in val['command']:
                            if 'textfsm' in subelem:
                                if subelem["textfsm"] and isinstance(
                                        subelem["textfsm"], textfsm.TextFSM):
                                    continue
                                tfsm_file = svc_dir + "/" + subelem["textfsm"]
                                if not isfile(tfsm_file):
                                    logger.error(
                                        "Textfsm file {} not found. "
                                        "Ignoring service".format(tfsm_file))
                                    continue
                                with open(tfsm_file, "r") as f:
                                    tfsm_template = textfsm.TextFSM(f)
                                    subelem["textfsm"] = tfsm_template
                    else:
                        tfsm_template = None

                try:
                    schema = SchemaForTable(svc_def['service'],
                                            schema=schemas)
                except Exception:
                    logger.error(
                        f"No matching schema for {svc_def['service']}")
                    continue

                if schema.type == "derivedRecord":
                    # These are not real services and so ignore them
                    continue

                # Valid service definition, add it to list
                if svc_def["service"] in svc_classes:
                    service = svc_classes[svc_def["service"]](
                        svc_def["service"],
                        svc_def["apply"],
                        period,
                        svc_def.get("type", "state"),
                        svc_def.get("keys", []),
                        svc_def.get("ignore-fields", []),
                        schema,
                        queue,
                        run_once,
                    )
                else:
                    service = Service(svc_def["service"], svc_def["apply"],
                                      period, svc_def.get("type", "state"),
                                      svc_def.get("keys", []),
                                      svc_def.get("ignore-fields", []),
                                      schema, queue, run_once)

                service.poller_schema = poller_schema
                service.poller_schema_version = poller_schema_version

                logger.info("Service {} added".format(service.name))
                svcs_list.append(service)

    return svcs_list
def create_context():
    config = load_sq_config(config_file=create_dummy_config_file())
    context = NubiaSuzieqContext()
    context.cfg = config
    context.schemas = Schema(config["schema-directory"])
    return context
def test_transform(input_file):
    to_transform = Yaml2Class(input_file)

    try:
        data_directory = to_transform.transform.data_directory
    except AttributeError:
        print('Invalid transformation file, no data directory')
        pytest.fail('AttributeError', pytrace=True)

    # Make a copy of the data directory
    temp_dir, tmpfile = _coalescer_init(data_directory)

    cfg = load_sq_config(config_file=tmpfile.name)
    schemas = Schema(cfg['schema-directory'])

    for ele in to_transform.transform.transform:
        query_str_list = []
        # Each transformation has a record => writes happen per record
        for record in ele.record:
            changed_fields = set()
            new_df = pd.DataFrame()
            tables = [x for x in dir(record) if not x.startswith('_')]
            for table in tables:
                # Let's read the data in now that we know the table
                tblobj = get_sqobject(table)
                pq_db = get_sqdb_engine(cfg, table, None, None)
                columns = schemas.fields_for_table(table)
                mod_df = tblobj(config_file=tmpfile.name).get(columns=columns)

                for key in getattr(record, table):
                    query_str = key.match
                    chg_df = pd.DataFrame()
                    if query_str != "all":
                        try:
                            chg_df = mod_df.query(query_str) \
                                .reset_index(drop=True)
                        except Exception as ex:
                            assert (not ex)
                        query_str_list.append(query_str)
                    else:
                        chg_df = mod_df

                    _process_transform_set(key.set, chg_df, changed_fields)
                    if new_df.empty:
                        new_df = chg_df
                    elif not chg_df.empty:
                        new_df = pd.concat([new_df, chg_df])

                if new_df.empty:
                    continue

                # Write the records now
                _write_verify_transform(new_df, table, pq_db,
                                        SchemaForTable(table, schemas),
                                        tmpfile.name, query_str_list,
                                        changed_fields)

    # Now we coalesce and verify it works
    from suzieq.sqobjects.tables import TablesObj

    pre_table_df = TablesObj(config_file=tmpfile.name).get()
    do_coalesce(cfg, None)
    _verify_coalescing(temp_dir)

    post_table_df = TablesObj(config_file=tmpfile.name).get()
    assert_df_equal(pre_table_df, post_table_df, None)

    # Run additional tests on the coalesced data
    for ele in to_transform.transform.verify:
        table = [x for x in dir(ele) if not x.startswith('_')][0]
        tblobj = get_sqobject(table)

        for tst in getattr(ele, table):
            start_time = tst.test.get('start-time', '')
            end_time = tst.test.get('end-time', '')

            columns = tst.test.get('columns', ['default'])
            df = tblobj(config_file=tmpfile.name, start_time=start_time,
                        end_time=end_time).get(columns=columns)
            if not df.empty and 'query' in tst.test:
                query_str = tst.test['query']
                df = df.query(query_str).reset_index(drop=True)

            if 'assertempty' in tst.test:
                assert (df.empty)
            elif 'shape' in tst.test:
                shape = tst.test['shape'].split()
                if shape[0] != '*':
                    assert (int(shape[0]) == df.shape[0])
                if shape[1] != '*':
                    assert (int(shape[1]) == df.shape[1])
            else:
                assert (not df.empty)

    _coalescer_cleanup(temp_dir, tmpfile)
def coalesce(self, tables: List[str] = [], period: str = '',
             ign_sqpoller: bool = False) -> List[SqCoalesceStats]:
    """Coalesce all the resource parquet files in specified folder.

    This routine does not run periodically. It runs once and returns.

    :param tables: List[str], List of specific tables to coalesce,
                   empty for all
    :param period: str, coalescing period, needed for various internal
                   stuff
    :param ign_sqpoller: True if its OK to ignore the absence of sqpoller
                         to coalesce
    :returns: coalesce statistics list, one per table
    :rtype: List[SqCoalesceStats]
    """
    infolder = self.cfg['data-directory']
    outfolder = self._get_table_directory('', True)  # root folder
    archive_folder = self.cfg.get('coalescer', {}) \
        .get('archive-directory', f'{infolder}/_archived')

    if not period:
        period = self.cfg.get('coalescer', {
            'period': '1h'
        }).get('period', '1h')
    schemas = Schema(self.cfg.get('schema-directory'))
    state = SqCoalesceState(self.logger, period)

    state.logger = self.logger
    # Trying to be complete here. The ignore prefixes assume you have
    # coalescers across multiple time periods running, and so we need to
    # ignore the files created by the longer-period coalescing runs. In
    # other words, the weekly coalescer should ignore monthly and yearly
    # coalesced files, the monthly coalescer should ignore yearly
    # coalesced files, and so on.
    try:
        timeint = int(period[:-1])
        time_unit = period[-1]
        if time_unit == 'h':
            run_int = timedelta(hours=timeint)
            state.prefix = 'sqc-h-'
            state.ign_pfx = ['.', '_', 'sqc-']
        elif time_unit == 'd':
            run_int = timedelta(days=timeint)
            if timeint > 364:
                state.prefix = 'sqc-y-'
                state.ign_pfx = ['.', '_', 'sqc-y-']
            elif timeint > 29:
                state.prefix = 'sqc-m-'
                state.ign_pfx = ['.', '_', 'sqc-m-', 'sqc-y-']
            else:
                state.prefix = 'sqc-d-'
                state.ign_pfx = ['.', '_', 'sqc-d-', 'sqc-w-', 'sqc-m-',
                                 'sqc-y-']
        elif time_unit == 'w':
            run_int = timedelta(weeks=timeint)
            state.prefix = 'sqc-w-'
            state.ign_pfx = ['.', '_', 'sqc-w-', 'sqc-m-', 'sqc-y-']
        else:
            logging.error(f'Invalid unit for period, {time_unit}, '
                          'must be one of h/d/w')
            return
    except ValueError:
        logging.error(f'Invalid time, {period}')
        return

    state.period = run_int
    # Create list of tables to coalesce.
    # TODO: Verify that we're only coalescing parquet tables here
    if tables:
        tables = [x for x in tables
                  if schemas.tables() and
                  (schemas.type_for_table(x) != "derivedRecord")]
    else:
        tables = [x for x in schemas.tables()
                  if schemas.type_for_table(x) != "derivedRecord"]
    if 'sqPoller' not in tables and not ign_sqpoller:
        # This is an error. sqPoller keeps track of discontinuities
        # among other things.
        self.logger.error(
            'No sqPoller data, cannot compute discontinuities')
        return
    else:
        # We want sqPoller to be first to compute discontinuities
        with suppress(ValueError):
            tables.remove('sqPoller')
        if not ign_sqpoller:
            tables.insert(0, 'sqPoller')

    # We've forced the sqPoller to be always the first table to coalesce
    stats = []
    for entry in tables:
        table_outfolder = f'{outfolder}/{entry}'
        table_infolder = f'{infolder}/{entry}'
        if archive_folder:
            table_archive_folder = f'{archive_folder}/{entry}'
        else:
            table_archive_folder = None
        state.current_df = pd.DataFrame()
        state.dbeng = self
        state.schema = SchemaForTable(entry, schemas, None)
        if not os.path.isdir(table_infolder):
            self.logger.info(f'No input records to coalesce for {entry}')
            continue
        try:
            if not os.path.isdir(table_outfolder):
                os.makedirs(table_outfolder)
            if (table_archive_folder and
                    not os.path.isdir(table_archive_folder)):
                os.makedirs(table_archive_folder, exist_ok=True)
            # Migrate the data if needed
            self.logger.debug(f'Migrating data for {entry}')
            self.migrate(entry, state.schema)
            self.logger.debug(f'Migrated data for {entry}')

            start = time()
            coalesce_resource_table(table_infolder, table_outfolder,
                                    table_archive_folder, entry, state)
            end = time()
            self.logger.info(
                f'coalesced {state.wrfile_count} files/'
                f'{state.wrrec_count} records of {entry}')
            stats.append(SqCoalesceStats(
                entry, period, int(end - start),
                state.wrfile_count, state.wrrec_count,
                int(datetime.now(tz=timezone.utc).timestamp() * 1000)))
        except Exception:
            # end/start may be unbound if the failure happened before the
            # coalesce ran, so report a zero duration
            self.logger.exception(f'Unable to coalesce table {entry}')
            stats.append(SqCoalesceStats(
                entry, period, 0, 0, 0,
                int(datetime.now(tz=timezone.utc).timestamp() * 1000)))

    return stats
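# Usage sketch (not from the source): invoking coalesce() directly on the
# parquet DB engine obtained via get_sqdb_engine, much as run_coalescer's
# do_coalesce path does. The table names and period are assumptions.
def _coalesce_example(cfg: dict, logger: Logger):
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)
    stats = pqdb.coalesce(tables=['sqPoller', 'interfaces'], period='1h')
    for rec in stats or []:
        logger.info(f'coalesced: {rec}')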
def coalescer_main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--service-only",
        type=str,
        help="Only run this space separated list of services",
    )
    parser.add_argument(
        "-x",
        "--exclude-services",
        type=str,
        help="Exclude running this space separated list of services",
    )
    parser.add_argument(
        "-c",
        "--config",
        default=f'{os.getenv("HOME")}/.suzieq/suzieq-cfg.yml',
        type=str,
        help="alternate config file")
    parser.add_argument(
        "--run-once",
        default=False,
        help='Run the coalescer once and exit',
        action='store_true',
    )
    parser.add_argument(
        "-p",
        "--period",
        type=str,
        help=('Override the period specified in config file with this. '
              'Format is <period><h|d|w|y>. 1h is 1 hour, 2w is 2 weeks etc.'))
    parser.add_argument("--no-sqpoller", action='store_true',
                        help=argparse.SUPPRESS)

    userargs = parser.parse_args()

    cfg = load_sq_config(config_file=userargs.config)
    if not cfg:
        print(f'Invalid Suzieq config file {userargs.config}')
        sys.exit(1)

    logfile, loglevel = get_log_file_level('coalescer', cfg,
                                           '/tmp/sq-coalescer.log')
    logger = init_logger('suzieq.coalescer', logfile, loglevel, False)

    # Ensure we're the only coalescer instance running
    coalesce_dir = cfg.get('coalescer', {})\
        .get('coalesce-directory', f'{cfg.get("data-directory")}/coalesced')

    fd = ensure_single_instance(f'{coalesce_dir}/.sq-coalescer.pid', False)
    if not fd:
        print('ERROR: Another coalescer process present')
        logger.error('Another coalescer process present')
        sys.exit(errno.EBUSY)

    if userargs.run_once:
        timestr = ''
    elif not userargs.period:
        timestr = cfg.get('coalescer', {'period': '1h'}).get('period', '1h')
    else:
        timestr = userargs.period

    schemas = Schema(cfg.get('schema-directory'))
    if userargs.service_only or userargs.exclude_services:
        tables = [x for x in schemas.tables()
                  if (schemas.type_for_table(x) != "derivedRecord")]
        if userargs.service_only:
            tables = [x for x in tables
                      if x in userargs.service_only.split()]
        if userargs.exclude_services:
            tables = [x for x in tables
                      if x not in userargs.exclude_services.split()]
    else:
        tables = []

    run_coalescer(cfg, tables, timestr, userargs.run_once, logger,
                  userargs.no_sqpoller or False)
    os.truncate(fd, 0)
    try:
        fcntl.flock(fd, fcntl.LOCK_UN)
        os.close(fd)
    except OSError:
        pass

    sys.exit(0)
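# Example invocation (the sq-coalescer entry-point name is an assumption;
# the flags come from the parser above): one pass over two services only,
# with an alternate config file.
#
#   sq-coalescer -c /tmp/suzieq-cfg.yml -s 'device interfaces' --run-once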
    # The column check must use the same (misspelled) source column name
    # that the rename maps from, and rename must target columns, not the
    # index
    if 'notifcnReason' in df.columns:
        df.rename(columns={'notifcnReason': 'notificnReason'},
                  inplace=True)

    pq.write_to_dataset(
        table,
        root_path=output_dir,
        partition_cols=partition_cols,
        version="2.0",
        compression='ZSTD',
        row_group_size=100000,
    )

    logger.info(f'Wrote converted {input_dir}')


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Usage: convert_parquet <input dir> <output_dir> <schema_dir>')
        sys.exit(1)

    input_dir = Path(sys.argv[1])
    output_dir = sys.argv[2]
    schemas = Schema(sys.argv[3])

    service = input_dir.parts[-1]
    svc_schema = SchemaForTable(service, schema=schemas)

    logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
    logger = logging.getLogger('sq-converter')

    convert_dir(input_dir, output_dir, svc_schema)
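# Usage sketch (not from the source): driving convert_dir programmatically
# instead of via the command line above. All paths and the service name are
# hypothetical.
schemas = Schema('/tmp/schema')
svc_schema = SchemaForTable('interfaces', schema=schemas)
convert_dir(Path('/data/old/interfaces'), '/data/new', svc_schema)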