def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame: """Get the list of top/bottom entries of "what" field""" if "columns" in kwargs: columns = kwargs["columns"] del kwargs["columns"] else: columns = ["default"] # if self._valid_get_args: # self._valid_get_args += ['what', 'n', 'reverse'] # This raises exceptions if it fails try: self.validate_get_input(**kwargs) except Exception as error: df = pd.DataFrame({'error': [f'{error}']}) return df table_schema = SchemaForTable(self._table, self.all_schemas) columns = table_schema.get_display_fields(columns) if what not in columns: self._addnl_fields.append(what) return self.engine.top(what=what, n=n, reverse=reverse, **kwargs)
def __init__(self, engine_name: str = 'pandas', hostname: typing.List[str] = [], start_time: str = '', end_time: str = '', view: str = 'latest', namespace: typing.List[str] = [], columns: typing.List[str] = ['default'], context=None, table: str = '', config_file=None) -> None: if context is None: self.ctxt = SqContext(engine_name, config_file) else: self.ctxt = context if not self.ctxt: self.ctxt = SqContext(engine_name) self._cfg = self.ctxt.cfg self._schema = SchemaForTable(table, self.ctxt.schemas) self._table = table self._sort_fields = self._schema.key_fields() if not namespace and self.ctxt.namespace: self.namespace = self.ctxt.namespace else: self.namespace = namespace if not hostname and self.ctxt.hostname: self.hostname = self.ctxt.hostname else: self.hostname = hostname if not start_time and self.ctxt.start_time: self.start_time = self.ctxt.start_time else: self.start_time = start_time if not end_time and self.ctxt.end_time: self.end_time = self.ctxt.end_time else: self.end_time = end_time if not view and self.ctxt.view: self.view = self.ctxt.view else: self.view = view self.columns = columns if engine_name and engine_name != '': self.engine = get_sqengine(engine_name, self._table)(self._table, self) elif self.ctxt.engine: self.engine = get_sqengine(self.ctxt.engine, self._table)(self._table, self) if not self.engine: raise ValueError('Unknown analysis engine') self._addnl_filter = None self._addnl_fields = [] self._valid_get_args = None self._valid_assert_args = None self._valid_arg_vals = None
def run_coalescer(cfg: dict, tables: List[str], period: str, run_once: bool,
                  logger: Logger, no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs once and returns, or runs periodically, depending on the value of
    run_once. It also writes out the coalescer records as a parquet file.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param period: str, how often the coalescer runs, e.g. '1h', '1d'
    :param run_once: bool, True if you want the coalescer to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller rec
    :returns: Nothing
    :rtype: none
    """
    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)

    if not run_once:
        now = datetime.now()
        nextrun = parse(period, settings={'PREFER_DATES_FROM': 'future'})
        sleep_time = (nextrun - now).seconds
        logger.info(f'Got sleep time of {sleep_time} secs')

    while True:
        stats = []
        try:
            stats = do_coalesce(cfg, tables, period, logger, no_sqpoller)
        except Exception:
            logger.exception('Coalescer aborted. Continuing')

        # Write the stats
        df = pd.DataFrame([asdict(x) for x in stats])
        if not df.empty:
            df['sqvers'] = coalescer_schema.version
            df['version'] = SUZIEQ_VERSION
            df['active'] = True
            df['namespace'] = ''
            pqdb.write('sqCoalescer', 'pandas', df, True,
                       coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep(sleep_time)
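For context, a minimal sketch of how run_coalescer() might be driven from a small wrapper, assuming the load_sq_config() helper used elsewhere in this codebase; the config path and table list below are illustrative, not taken from the original code.

import logging
import sys

if __name__ == '__main__':
    # Illustrative config path; substitute your own suzieq config
    cfg = load_sq_config(config_file='~/.suzieq/suzieq-cfg.yml')
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logger = logging.getLogger('sq-coalescer')
    # Run a single coalescing pass over a couple of tables
    run_coalescer(cfg, ['sqPoller', 'bgp'], '1h', run_once=True, logger=logger)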
def convert_dir(input_dir: str, output_dir: str, svcschema: SchemaForTable):
    """Convert the data into a single file and write it out"""

    defaults = {
        pa.string(): "",
        pa.int32(): 0,
        pa.int64(): 0,
        pa.float32(): 0.0,
        pa.float64(): 0.0,
        pa.date64(): 0.0,
        pa.bool_(): False,
        pa.list_(pa.string()): ['-'],
        pa.list_(pa.int64()): [],
    }

    df = pd.read_parquet(input_dir, use_legacy_dataset=True)

    # Fix up the field renamed between versions before filling/casting so the
    # schema operations below see the new name
    if 'notifcnReason' in df.columns:
        df = df.rename(columns={'notifcnReason': 'notificnReason'})

    sqschema = svcschema.get_raw_schema()
    arrow_schema = svcschema.get_arrow_schema()

    for column in filter(lambda x: x['name'] not in df.columns, sqschema):
        df[column['name']] = column.get('default', defaults[column['type']])

    # convert all dtypes to whatever is desired
    for column in df.columns:
        if column in arrow_schema:
            df[column] = df[column].astype(
                arrow_schema.field(column).type.to_pandas_dtype())

    # If the original ifname was saved, restore it and drop the renamed
    # field, as this model is no longer necessary
    if 'origIfname' in df.columns:
        if 'ifname' in df.columns:
            df = df.drop(columns=['ifname']) \
                   .rename(columns={'origIfname': 'ifname'})
        elif 'oif' in df.columns:
            df = df.drop(columns=['oif']) \
                   .rename(columns={'origIfname': 'oif'})

    table = pa.Table.from_pandas(df, schema=arrow_schema,
                                 preserve_index=False)
    partition_cols = svcschema.get_partition_columns()

    pq.write_to_dataset(
        table,
        root_path=output_dir,
        partition_cols=partition_cols,
        version="2.0",
        compression='ZSTD',
        row_group_size=100000,
    )

    logger.info(f'Wrote converted {input_dir}')
def get_table_info(self, table, **kwargs):
    sch = SchemaForTable(table, schema=self.schemas)
    key_fields = sch.key_fields()
    # You can't use view from user because we need to see all the data
    # to compute what's required.
    kwargs.pop('view', None)
    all_time_df = self._get_table_info(table, view='all', **kwargs)
    times = all_time_df['timestamp'].unique()
    ret = {'first_time': all_time_df.timestamp.min(),
           'latest_time': all_time_df.timestamp.max(),
           'intervals': len(times),
           'all rows': len(all_time_df),
           'namespaces': self._unique_or_zero(all_time_df, 'namespace'),
           'devices': self._unique_or_zero(all_time_df, 'hostname')}
    return ret
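Illustrative only: how the dictionary returned by get_table_info() might be consumed. 'engine' stands in for whatever object exposes this method, and the namespace value is made up.

info = engine.get_table_info('bgp', namespace=['mydc'])
print(f"{info['devices']} devices, {info['intervals']} snapshots, "
      f"{info['all rows']} rows between "
      f"{info['first_time']} and {info['latest_time']}")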
def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
    """Get the list of top/bottom entries of "what" field"""

    if "columns" in kwargs:
        columns = kwargs["columns"]
        del kwargs["columns"]
    else:
        columns = ["default"]

    table_schema = SchemaForTable(self._table, self.all_schemas)
    columns = table_schema.get_display_fields(columns)

    if what == "numChanges" and what not in columns:
        self._addnl_nbr_fields.append(what)

    return self.engine_obj.top(what=what, n=n, reverse=reverse, **kwargs)
def get(self, **kwargs): """Replacing the original interface name in returned result""" addnl_fields = kwargs.pop('addnl_fields', []) columns = kwargs.get('columns', ['default']) vrf = kwargs.pop('vrf', None) peer = kwargs.pop('peer', None) hostname = kwargs.pop('hostname', None) drop_cols = ['origPeer', 'peerHost'] addnl_fields.extend(['origPeer']) sch = SchemaForTable(self.iobj.table, self.schemas) fields = sch.get_display_fields(columns) for col in ['peerIP', 'updateSource', 'state', 'namespace', 'vrf', 'peer', 'hostname']: if col not in fields: addnl_fields.append(col) drop_cols.append(col) df = super().get(addnl_fields=addnl_fields, **kwargs) if df.empty: return df query_str = build_query_str([], sch, vrf=vrf, peer=peer, hostname=hostname) if 'peer' in df.columns: df['peer'] = np.where(df['origPeer'] != "", df['origPeer'], df['peer']) if 'peerHostname' in df.columns: mdf = self._get_peer_matched_df(df) drop_cols = [x for x in drop_cols if x in mdf.columns] drop_cols.extend(list(mdf.filter(regex='_y'))) else: mdf = df if query_str: return mdf.query(query_str).drop(columns=drop_cols, errors='ignore') else: return mdf.drop(columns=drop_cols, errors='ignore')
def describe(self, **kwargs):
    """Describes the fields for a given table"""

    table = kwargs.get('table', '')

    try:
        sch = SchemaForTable(table, self.schemas)
    except ValueError:
        sch = None
    if not sch:
        df = pd.DataFrame(
            {'error': [f'ERROR: incorrect table name {table}']})
        return df

    entries = [{'name': x['name'],
                'type': x['type'],
                'key': x.get('key', ''),
                'display': x.get('display', '')}
               for x in sch.get_raw_schema()]
    df = pd.DataFrame.from_dict(entries).sort_values('name')

    return df
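A hedged usage sketch: describe() returns one row per schema field with its name, type, key position and display position. 'obj' stands in for the engine object that exposes this method; the table name is just an example.

fields_df = obj.describe(table='bgp')
print(fields_df[['name', 'type', 'key', 'display']].head())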
def coalesce(self, tables: List[str] = [], period: str = '', ign_sqpoller: bool = False) -> None: """Coalesce all the resource parquet files in specified folder. This routine does not run periodically. It runs once and returns. :param tables: List[str], List of specific tables to coalesce, empty for all :param period: str, coalescing period, needed for various internal stuff :param ign_sqpoller: True if its OK to ignore the absence of sqpoller to coalesce :returns: coalesce statistics list, one per table :rtype: SqCoalesceStats """ infolder = self.cfg['data-directory'] outfolder = self._get_table_directory('', True) # root folder archive_folder = self.cfg.get('coalescer', {}) \ .get('archive-directory', f'{infolder}/_archived') if not period: period = self.cfg.get('coalesceer', { 'period': '1h' }).get('period', '1h') schemas = Schema(self.cfg.get('schema-directory')) state = SqCoalesceState(self.logger, period) state.logger = self.logger # Trying to be complete here. the ignore prefixes assumes you have coalesceers # across multiple time periods running, and so we need to ignore the files # created by the longer time period coalesceions. In other words, weekly # coalesceer should ignore monthly and yearly coalesced files, monthly # coalesceer should ignore yearly coalesceer and so on. try: timeint = int(period[:-1]) time_unit = period[-1] if time_unit == 'h': run_int = timedelta(hours=timeint) state.prefix = 'sqc-h-' state.ign_pfx = ['.', '_', 'sqc-'] elif time_unit == 'd': run_int = timedelta(days=timeint) if timeint > 364: state.prefix = 'sqc-y-' state.ign_pfx = ['.', '_', 'sqc-y-'] elif timeint > 29: state.prefix = 'sqc-m-' state.ign_pfx = ['.', '_', 'sqc-m-', 'sqc-y-'] else: state.prefix = 'sqc-d-' state.ign_pfx = [ '.', '_', 'sqc-d-', 'sqc-w-', 'sqc-m-', 'sqc-y-' ] elif time_unit == 'w': run_int = timedelta(weeks=timeint) state.prefix = 'sqc-w-' state.ign_pfx = ['.', '_', 'sqc-w-', 'sqc-m-', 'sqc-y-'] else: logging.error(f'Invalid unit for period, {time_unit}, ' 'must be one of h/d/w') except ValueError: logging.error(f'Invalid time, {period}') return state.period = run_int # Create list of tables to coalesce. # TODO: Verify that we're only coalescing parquet tables here if tables: tables = [ x for x in tables if schemas.tables() and ( schemas.type_for_table(x) != "derivedRecord") ] else: tables = [ x for x in schemas.tables() if schemas.type_for_table(x) != "derivedRecord" ] if 'sqPoller' not in tables and not ign_sqpoller: # This is an error. sqPoller keeps track of discontinuities # among other things. 
self.logger.error( 'No sqPoller data, cannot compute discontinuities') return else: # We want sqPoller to be first to compute discontinuities with suppress(ValueError): tables.remove('sqPoller') if not ign_sqpoller: tables.insert(0, 'sqPoller') # We've forced the sqPoller to be always the first table to coalesce stats = [] for entry in tables: table_outfolder = f'{outfolder}/{entry}' table_infolder = f'{infolder}//{entry}' if archive_folder: table_archive_folder = f'{archive_folder}/{entry}' else: table_archive_folder = None state.current_df = pd.DataFrame() state.dbeng = self state.schema = SchemaForTable(entry, schemas, None) if not os.path.isdir(table_infolder): self.logger.info(f'No input records to coalesce for {entry}') continue try: if not os.path.isdir(table_outfolder): os.makedirs(table_outfolder) if (table_archive_folder and not os.path.isdir(table_archive_folder)): os.makedirs(table_archive_folder, exist_ok=True) # Migrate the data if needed self.logger.debug(f'Migrating data for {entry}') self.migrate(entry, state.schema) self.logger.debug(f'Migrating data for {entry}') start = time() coalesce_resource_table(table_infolder, table_outfolder, table_archive_folder, entry, state) end = time() self.logger.info( f'coalesced {state.wrfile_count} files/{state.wrrec_count} ' f'records of {entry}') stats.append( SqCoalesceStats( entry, period, int(end - start), state.wrfile_count, state.wrrec_count, int(datetime.now(tz=timezone.utc).timestamp() * 1000))) except Exception: self.logger.exception(f'Unable to coalesce table {entry}') stats.append( SqCoalesceStats( entry, period, int(end - start), 0, 0, int(datetime.now(tz=timezone.utc).timestamp() * 1000))) return stats
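A hedged sketch of driving coalesce() directly on a parquet DB engine. Normally it is reached via do_coalesce(); calling it this way, and the config path used, are assumptions for illustration only.

import logging

cfg = load_sq_config(config_file='~/.suzieq/suzieq-cfg.yml')  # illustrative path
logger = logging.getLogger('sq-coalescer')
pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)
stats = pqdb.coalesce(tables=['bgp', 'ospfNbr'], period='1h', ign_sqpoller=True)
for entry in stats:
    # SqCoalesceStats is a dataclass (asdict() is applied to it elsewhere above)
    print(asdict(entry))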
def get_valid_df(self, table, **kwargs) -> pd.DataFrame: if not self.ctxt.engine: print("Specify an analysis engine using set engine command") return pd.DataFrame(columns=["namespace", "hostname"]) sch = SchemaForTable(table, schema=self.schemas) phy_table = sch.get_phy_table_for_table() columns = kwargs.pop('columns', ['default']) addnl_fields = kwargs.pop('addnl_fields', []) view = kwargs.pop('view', self.iobj.view) active_only = kwargs.pop('active_only', True) fields = sch.get_display_fields(columns) key_fields = sch.key_fields() drop_cols = [] if 'timestamp' not in fields: fields.append('timestamp') if 'active' not in fields + addnl_fields: addnl_fields.append('active') drop_cols.append('active') for fld in key_fields: if fld not in fields + addnl_fields: addnl_fields.insert(0, fld) drop_cols.append(fld) for f in addnl_fields: if f not in fields: # timestamp is always the last field fields.insert(-1, f) for dt in [self.iobj.start_time, self.iobj.end_time]: if dt: try: parse(dt) except (ValueError, ParserError) as e: print(f"invalid time {dt}: {e}") return pd.DataFrame() table_df = self.ctxt.engine.get_table_df( self.cfg, table=phy_table, start_time=self.iobj.start_time, end_time=self.iobj.end_time, columns=fields, view=view, key_fields=key_fields, **kwargs) if not table_df.empty: if view == 'latest' and active_only: table_df = table_df.query('active') \ .drop(columns=drop_cols) else: table_df.drop(columns=drop_cols, inplace=True) if 'timestamp' in table_df.columns: table_df['timestamp'] = pd.to_datetime( table_df.timestamp.astype(str), unit="ms") return table_df
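A tiny standalone illustration of the column assembly done above: additional fields are spliced in while 'timestamp' stays the last entry, because fields.insert(-1, f) inserts just before the final element.

fields = ['namespace', 'hostname', 'peer', 'timestamp']
for f in ['active', 'vrf']:
    if f not in fields:
        fields.insert(-1, f)
print(fields)   # ['namespace', 'hostname', 'peer', 'active', 'vrf', 'timestamp']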
def _get_combined_df(self, **kwargs): """OSPF has info divided across multiple tables. Get a single one""" columns = kwargs.pop('columns', ['default']) state = kwargs.pop('state', '') addnl_fields = kwargs.pop('addnl_fields', self.iobj._addnl_fields) addnl_nbr_fields = self.iobj._addnl_nbr_fields cols = SchemaForTable('ospf', schema=self.schemas) \ .get_display_fields(columns) if columns == ['default']: cols.append('timestamp') ifschema = SchemaForTable('ospfIf', schema=self.schemas) nbrschema = SchemaForTable('ospfNbr', schema=self.schemas) if (columns != ['default']) and (columns != ['*']): ifkeys = ifschema.key_fields() nbrkeys = nbrschema.key_fields() if_flds = ifschema.fields nbr_flds = nbrschema.fields ifcols = ifkeys nbrcols = nbrkeys for fld in columns: if fld in if_flds and fld not in ifcols: ifcols.append(fld) elif fld in nbr_flds and fld not in nbrcols: nbrcols.append(fld) else: ifcols = ifschema.get_display_fields(columns) nbrcols = nbrschema.get_display_fields(columns) if state == "full": query_str = 'adjState == "full" or adjState == "passive"' elif state == "other": query_str = 'adjState != "full" and adjState != "passive"' elif state == "passive": query_str = 'adjState == "passive"' else: query_str = '' df = self.get_valid_df('ospfIf', addnl_fields=addnl_fields, columns=ifcols, **kwargs) nbr_df = self.get_valid_df('ospfNbr', addnl_fields=addnl_nbr_fields, columns=nbrcols, **kwargs) if nbr_df.empty: return nbr_df merge_cols = [ x for x in ['namespace', 'hostname', 'ifname'] if x in nbr_df.columns ] # Merge the two tables df = df.merge(nbr_df, on=merge_cols, how='left') if columns == ['*']: df = df.drop(columns=['area_y', 'instance_y', 'vrf_y', 'areaStub_y', 'timestamp_y']) \ .rename(columns={ 'instance_x': 'instance', 'areaStub_x': 'areaStub', 'area_x': 'area', 'vrf_x': 'vrf', 'state_x': 'ifState', 'state_y': 'adjState', 'sqvers_x': 'sqvers', 'active_x': 'active', 'timestamp_x': 'timestamp'}) else: df = df.rename( columns={ 'vrf_x': 'vrf', 'area_x': 'area', 'state_x': 'ifState', 'state_y': 'adjState', 'timestamp_x': 'timestamp' }) df = df.drop(list(df.filter(regex='_y$')), axis=1) \ .fillna({'peerIP': '-', 'numChanges': 0, 'lastChangeTime': 0}) # Fill the adjState column with passive if passive if 'passive' in df.columns: df.loc[df['adjState'].isnull(), 'adjState'] = df['passive'] df.loc[df['adjState'].eq(True), 'adjState'] = 'passive' df.loc[df['adjState'].eq(False), 'adjState'] = 'fail' df.drop(columns=['passive'], inplace=True) df.bfill(axis=0, inplace=True) # Move the timestamp column to the end if query_str: return df.query(query_str)[cols] return df[cols]
def _get_combined_df(self, **kwargs): """OSPF has info divided across multiple tables. Get a single one""" columns = kwargs.pop('columns', ['default']) state = kwargs.pop('state', '') addnl_fields = kwargs.pop('addnl_fields', self.iobj._addnl_fields) addnl_nbr_fields = self.iobj._addnl_nbr_fields user_query = kwargs.pop('query_str', '') cols = SchemaForTable('ospf', schema=self.schemas) \ .get_display_fields(columns) if columns == ['default']: cols.append('timestamp') ifschema = SchemaForTable('ospfIf', schema=self.schemas) nbrschema = SchemaForTable('ospfNbr', schema=self.schemas) if (columns != ['default']) and (columns != ['*']): ifkeys = ifschema.key_fields() nbrkeys = nbrschema.key_fields() if_flds = ifschema.fields nbr_flds = nbrschema.fields ifcols = ifkeys nbrcols = nbrkeys for fld in columns: if fld in if_flds and fld not in ifcols: ifcols.append(fld) elif fld in nbr_flds and fld not in nbrcols: nbrcols.append(fld) else: ifcols = ifschema.get_display_fields(columns) nbrcols = nbrschema.get_display_fields(columns) if state == "full": query_str = 'adjState == "full" or adjState == "passive"' elif state == "other": query_str = 'adjState != "full" and adjState != "passive"' elif state == "passive": query_str = 'adjState == "passive"' else: query_str = '' df = self.get_valid_df('ospfIf', addnl_fields=addnl_fields, columns=ifcols, **kwargs) nbr_df = self.get_valid_df('ospfNbr', addnl_fields=addnl_nbr_fields, columns=nbrcols, **kwargs) if nbr_df.empty: return df merge_cols = [ x for x in ['namespace', 'hostname', 'ifname'] if x in nbr_df.columns ] # Merge the two tables df = df.merge(nbr_df, on=merge_cols, how='left') # This is because some NOS have the ipAddress in nbr table and some in # interface table. Nbr table wins over interface table if present if 'ipAddress_y' in df: df['ipAddress'] = np.where(df['ipAddress_y'] == "", df['ipAddress_x'], df['ipAddress_y']) df['ipAddress'] = np.where(df['ipAddress'], df['ipAddress'], df['ipAddress_x']) if columns == ['*']: df = df.drop(columns=['area_y', 'instance_y', 'vrf_y', 'ipAddress_x', 'ipAddress_y', 'areaStub_y', 'timestamp_y'], errors='ignore') \ .rename(columns={ 'instance_x': 'instance', 'areaStub_x': 'areaStub', 'area_x': 'area', 'vrf_x': 'vrf', 'state_x': 'ifState', 'state_y': 'adjState', 'sqvers_x': 'sqvers', 'active_x': 'active', 'timestamp_x': 'timestamp'}) else: df = df.rename( columns={ 'vrf_x': 'vrf', 'area_x': 'area', 'state_x': 'ifState', 'state_y': 'adjState', 'timestamp_x': 'timestamp' }) df = df.drop(list(df.filter(regex='_y$')), axis=1) \ .drop('ipAddress_x', axis=1, errors='ignore') \ .fillna({'peerIP': '-', 'numChanges': 0, 'lastChangeTime': 0}) # Fill the adjState column with passive if passive if 'passive' in df.columns: df.loc[df['adjState'].isnull(), 'adjState'] = df['passive'] df.loc[df['adjState'].eq(True), 'adjState'] = 'passive' df.loc[df['adjState'].eq(False), 'adjState'] = 'fail' df.drop(columns=['passive'], inplace=True) df.bfill(axis=0, inplace=True) if 'peerHostname' in columns or (columns in [['*'], ['default']]): nfdf = df.query('adjState != "full"').reset_index() nfdf['peerHostname'] = '' newdf = df.query('adjState == "full"').reset_index() \ .drop('peerHostname', axis=1, errors='ignore') if not newdf.empty: newdf['matchIP'] = newdf.ipAddress.str.split('/').str[0] newdf = newdf.merge(newdf[['namespace', 'hostname', 'vrf', 'matchIP']], left_on=['namespace', 'vrf', 'peerIP'], right_on=['namespace', 'vrf', 'matchIP'], suffixes=["", "_y"]) \ .rename(columns={'hostname_y': 'peerHostname'}) \ 
.drop_duplicates(subset=['namespace', 'hostname', 'vrf', 'ifname']) \ .drop(columns=['matchIP', 'matchIP_y'], errors='ignore') if newdf.empty: newdf = df.query('adjState == "full"').reset_index() newdf['peerHostname'] = '' final_df = pd.concat([nfdf, newdf]) else: final_df = df else: final_df = df if query_str: final_df = final_df.query(query_str).reset_index(drop=True) if user_query and not final_df.empty: final_df = self._handle_user_query_str(final_df, user_query) # Move the timestamp column to the end return final_df[cols]
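A toy, self-contained illustration (not real Suzieq data) of the suffix handling used in the merges above: overlapping columns come back with _x/_y suffixes, the interface-table values keep their names, and the neighbor state becomes adjState before the leftover _y columns are dropped.

import pandas as pd

if_df = pd.DataFrame({'hostname': ['leaf01'], 'ifname': ['swp1'],
                      'state': ['up'], 'area': ['0.0.0.0']})
nbr_df = pd.DataFrame({'hostname': ['leaf01'], 'ifname': ['swp1'],
                       'state': ['full'], 'area': ['0.0.0.0']})
df = if_df.merge(nbr_df, on=['hostname', 'ifname'], how='left')
df = df.rename(columns={'state_x': 'ifState', 'state_y': 'adjState',
                        'area_x': 'area'})
df = df.drop(list(df.filter(regex='_y$')), axis=1)
print(df)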
def test_transform(input_file): to_transform = Yaml2Class(input_file) try: data_directory = to_transform.transform.data_directory except AttributeError: print('Invalid transformation file, no data directory') pytest.fail('AttributeError', pytrace=True) # Make a copy of the data directory temp_dir, tmpfile = _coalescer_init(data_directory) cfg = load_sq_config(config_file=tmpfile.name) schemas = Schema(cfg['schema-directory']) for ele in to_transform.transform.transform: query_str_list = [] # Each transformation has a record => write's happen per record for record in ele.record: changed_fields = set() new_df = pd.DataFrame() tables = [x for x in dir(record) if not x.startswith('_')] for table in tables: # Lets read the data in now that we know the table tblobj = get_sqobject(table) pq_db = get_sqdb_engine(cfg, table, None, None) columns = schemas.fields_for_table(table) mod_df = tblobj(config_file=tmpfile.name).get(columns=columns) for key in getattr(record, table): query_str = key.match chg_df = pd.DataFrame() if query_str != "all": try: chg_df = mod_df.query(query_str) \ .reset_index(drop=True) except Exception as ex: assert (not ex) query_str_list.append(query_str) else: chg_df = mod_df _process_transform_set(key.set, chg_df, changed_fields) if new_df.empty: new_df = chg_df elif not chg_df.empty: new_df = pd.concat([new_df, chg_df]) if new_df.empty: continue # Write the records now _write_verify_transform(new_df, table, pq_db, SchemaForTable(table, schemas), tmpfile.name, query_str_list, changed_fields) # Now we coalesce and verify it works from suzieq.sqobjects.tables import TablesObj pre_table_df = TablesObj(config_file=tmpfile.name).get() do_coalesce(cfg, None) _verify_coalescing(temp_dir) post_table_df = TablesObj(config_file=tmpfile.name).get() assert_df_equal(pre_table_df, post_table_df, None) # Run additional tests on the coalesced data for ele in to_transform.transform.verify: table = [x for x in dir(ele) if not x.startswith('_')][0] tblobj = get_sqobject(table) for tst in getattr(ele, table): start_time = tst.test.get('start-time', '') end_time = tst.test.get('end-time', '') columns = tst.test.get('columns', ['default']) df = tblobj(config_file=tmpfile.name, start_time=start_time, end_time=end_time).get(columns=columns) if not df.empty and 'query' in tst.test: query_str = tst.test['query'] df = df.query(query_str).reset_index(drop=True) if 'assertempty' in tst.test: assert (df.empty) elif 'shape' in tst.test: shape = tst.test['shape'].split() if shape[0] != '*': assert (int(shape[0]) == df.shape[0]) if shape[1] != '*': assert (int(shape[1]) == df.shape[1]) else: assert (not df.empty) _coalescer_cleanup(temp_dir, tmpfile)
def __init__(self, engine_name: str = '', hostname: typing.List[str] = [], start_time: str = '', end_time: str = '', view: str = 'latest', namespace: typing.List[str] = [], columns: typing.List[str] = ['default'], context=None, table: str = '', config_file=None) -> None: if context is None: self.ctxt = SqContext(engine_name, config_file) else: self.ctxt = context if not self.ctxt: self.ctxt = SqContext(engine_name) self._cfg = self.ctxt.cfg self._schema = SchemaForTable(table, self.ctxt.schemas) self._table = table self._sort_fields = self._schema.key_fields() if not namespace and self.ctxt.namespace: self.namespace = self.ctxt.namespace else: self.namespace = namespace if not hostname and self.ctxt.hostname: self.hostname = self.ctxt.hostname else: self.hostname = hostname if not start_time and self.ctxt.start_time: self.start_time = self.ctxt.start_time else: self.start_time = start_time if not end_time and self.ctxt.end_time: self.end_time = self.ctxt.end_time else: self.end_time = end_time if not view and self.ctxt.view: self.view = self.ctxt.view else: self.view = view self.columns = columns if engine_name and engine_name != '': self.engine = get_sqengine(engine_name) else: self.engine = self.ctxt.engine if self._table: self.engine_obj = self.engine.get_object(self._table, self) else: self.engine_obj = None self._addnl_filter = None self._addnl_fields = []
def get(self, **kwargs): """Replacing the original interface name in returned result""" addnl_fields = kwargs.pop('addnl_fields', []) columns = kwargs.get('columns', ['default']) vrf = kwargs.pop('vrf', None) peer = kwargs.pop('peer', None) hostname = kwargs.pop('hostname', None) user_query = kwargs.pop('query_str', None) drop_cols = ['origPeer', 'peerHost'] addnl_fields.extend(['origPeer']) sch = SchemaForTable(self.iobj.table, self.schemas) fields = sch.get_display_fields(columns) for col in [ 'peerIP', 'updateSource', 'state', 'namespace', 'vrf', 'peer', 'hostname' ]: if col not in fields: addnl_fields.append(col) drop_cols.append(col) try: df = super().get(addnl_fields=addnl_fields, **kwargs) except KeyError as ex: if ('afi' in str(ex)) or ('safi' in str(ex)): df = pd.DataFrame({ 'error': [f'ERROR: Migrate BGP data first using sq-coalescer'] }) return df if df.empty: return df if 'afiSafi' in columns or (columns == ['*']): df['afiSafi'] = df['afi'] + ' ' + df['safi'] query_str = build_query_str([], sch, vrf=vrf, peer=peer, hostname=hostname) if 'peer' in df.columns: df['peer'] = np.where(df['origPeer'] != "", df['origPeer'], df['peer']) # Convert old data into new 2.0 data format if 'peerHostname' in df.columns: mdf = self._get_peer_matched_df(df) drop_cols = [x for x in drop_cols if x in mdf.columns] drop_cols.extend(list(mdf.filter(regex='_y'))) else: mdf = df mdf = self._handle_user_query_str(mdf, user_query) if query_str: return mdf.query(query_str).drop(columns=drop_cols, errors='ignore') else: return mdf.drop(columns=drop_cols, errors='ignore')
if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Usage: convert_parquet <input dir> <output_dir> <schema_dir>')
        sys.exit(1)

    input_dir = Path(sys.argv[1])
    output_dir = sys.argv[2]
    schemas = Schema(sys.argv[3])
    service = input_dir.parts[-1]
    svc_schema = SchemaForTable(service, schema=schemas)
    logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
    logger = logging.getLogger('sq-converter')
    convert_dir(input_dir, output_dir, svc_schema)
def aver(self, **kwargs): """Assert that the OSPF state is OK""" kwargs.pop('columns', []) columns = [ "namespace", "hostname", "vrf", "ifname", "routerId", "helloTime", "deadTime", "passive", "ipAddress", "isUnnumbered", "areaStub", "networkType", "timestamp", "area", "nbrCount", ] # we have to not filter hostname at this point because we need to # understand neighbor relationships orig_hostname = kwargs.pop('hostname', '') ospf_df = self.get_valid_df("ospfIf", columns=columns, **kwargs) if ospf_df.empty: return pd.DataFrame(columns=columns) ospf_df["assertReason"] = [[] for _ in range(len(ospf_df))] df = (ospf_df[ospf_df["routerId"] != ""].groupby( ["routerId", "namespace"], as_index=False)[[ "hostname", "namespace" ]].agg(lambda x: x.unique().tolist())).dropna(how='any') # df is a dataframe with each row containing the routerId and the # corresponding list of hostnames with that routerId. In a good # configuration, the list must have exactly one entry ospf_df['assertReason'] = (ospf_df.merge( df, on=["routerId"], how="outer").apply( lambda x: ["duplicate routerId {}".format(x["hostname_y"])] if len(x['hostname_y']) != 1 else [], axis=1)) # Now peering match lldpobj = LldpObj(context=self.ctxt) lldp_df = lldpobj.get(namespace=kwargs.get("namespace", ""), hostname=kwargs.get("hostname", ""), ifname=kwargs.get("ifname", ""), columns=[ "namespace", "hostname", "ifname", "peerHostname", "peerIfname", "peerMacaddr" ]) if lldp_df.empty: ospf_df = ospf_df[~(ospf_df.ifname.str.contains('loopback') | ospf_df.ifname.str.contains('Vlan'))] ospf_df['assertReason'] = 'No LLDP peering info' ospf_df['assert'] = 'fail' return ospf_df[[ 'namespace', 'hostname', 'vrf', 'ifname', 'assertReason', 'assert' ]] # Create a single massive DF with fields populated appropriately use_cols = [ "namespace", "routerId", "hostname", "vrf", "ifname", "helloTime", "deadTime", "passive", "ipAddress", "areaStub", "isUnnumbered", "networkType", "area", "timestamp", ] int_df = ospf_df[use_cols].merge(lldp_df, on=["namespace", "hostname", "ifname"]) \ .dropna(how="any") # filter by hostname now if orig_hostname: ospfschema = SchemaForTable('ospf', schema=self.schemas) hq = build_query_str([], ospfschema, hostname=orig_hostname) ospf_df = ospf_df.query(hq) if int_df.empty: # Weed out the loopback and SVI interfaces as they have no LLDP peers ospf_df = ospf_df[~(ospf_df.ifname.str.contains('loopback') | ospf_df.ifname.str.contains('Vlan'))] ospf_df['assertReason'] = 'No LLDP peering info' ospf_df['assert'] = 'fail' return ospf_df[[ 'namespace', 'hostname', 'vrf', 'ifname', 'assertReason', 'assert' ]] ospf_df = ospf_df.merge(int_df, left_on=["namespace", "hostname", "ifname"], right_on=["namespace", "peerHostname", "peerIfname"]) \ .dropna(how="any") # Now start comparing the various parameters ospf_df["assertReason"] += ospf_df.apply( lambda x: ["subnet mismatch"] if ((x["isUnnumbered_x"] != x["isUnnumbered_y"]) and (IPv4Network(x["ipAddress_x"], strict=False) != IPv4Network( x["ipAddress_y"], strict=False))) else [], axis=1, ) ospf_df["assertReason"] += ospf_df.apply( lambda x: ["area mismatch"] if (x["area_x"] != x["area_y"] and x[ "areaStub_x"] != x["areaStub_y"]) else [], axis=1, ) ospf_df["assertReason"] += ospf_df.apply( lambda x: ["Hello timers mismatch"] if x["helloTime_x"] != x["helloTime_y"] else [], axis=1, ) ospf_df["assertReason"] += ospf_df.apply( lambda x: ["Dead timer mismatch"] if x["deadTime_x"] != x["deadTime_y"] else [], axis=1, ) ospf_df["assertReason"] += ospf_df.apply( lambda x: ["network type mismatch"] 
if x["networkType_x"] != x["networkType_y"] else [], axis=1, ) ospf_df["assertReason"] += ospf_df.apply( lambda x: ["passive config mismatch"] if x["passive_x"] != x["passive_y"] else [], axis=1, ) ospf_df["assertReason"] += ospf_df.apply( lambda x: ["vrf mismatch"] if x["vrf_x"] != x["vrf_y"] else [], axis=1, ) # Fill up a single assert column now indicating pass/fail ospf_df['assert'] = ospf_df.apply( lambda x: 'pass' if not len(x['assertReason']) else 'fail', axis=1) return (ospf_df.rename( index=str, columns={ "hostname_x": "hostname", "ifname_x": "ifname", "vrf_x": "vrf", }, )[[ "namespace", "hostname", "ifname", "vrf", "assert", "assertReason", "timestamp" ]].explode(column='assertReason').fillna({'assertReason': '-'}))
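A toy illustration of the per-row assert pattern used above: mismatch reasons are accumulated as lists, then collapsed into a pass/fail verdict. The data is made up and only one check is shown.

import pandas as pd

df = pd.DataFrame({'helloTime_x': [10, 10], 'helloTime_y': [10, 30]})
df['assertReason'] = df.apply(
    lambda x: ['Hello timers mismatch']
    if x['helloTime_x'] != x['helloTime_y'] else [], axis=1)
df['assert'] = df.apply(
    lambda x: 'pass' if not len(x['assertReason']) else 'fail', axis=1)
print(df[['assertReason', 'assert']])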
async def init_services(svc_dir: str, schema_dir: str, queue, svclist: list, def_interval: int, run_once: str): """Process service definitions by reading each file in svc dir""" svcs_list = [] schemas = defaultdict(dict) # Load up all the service definitions we can find svc_classes = {} for i in walk_packages(path=[dirname(getfile(Service))]): for mbr in getmembers( importlib.import_module('suzieq.poller.services.' + i.name), isclass): if mbr[0] == "Service" or not mbr[0].endswith("Service"): continue svc_classes[i.name] = mbr[1] svc_classes[mbr[0]] = mbr[1] if not isdir(svc_dir): logger.error("services directory not a directory: {}".format(svc_dir)) return svcs_list if not isdir(schema_dir): logger.error("schema directory not a directory: {}".format(svc_dir)) return svcs_list else: schemas = Schema(schema_dir) if schemas: poller_schema = schemas.get_arrow_schema("sqPoller") poller_schema_version = SchemaForTable('sqPoller', schemas).version for root, _, filenames in walk(svc_dir): for filename in filenames: if filename.endswith(".yml"): with open(root + "/" + filename, "r") as f: svc_def = yaml.safe_load(f.read()) if svc_def.get('service') not in svclist: logger.warning( f'Ignoring unspecified service {svc_def.get("service")}' ) continue if "service" not in svc_def or "apply" not in svc_def: logger.error('Ignoring invalid service file definition. \ Need both "service" and "apply" keywords: {}'.format( filename)) continue period = svc_def.get("period", def_interval) for elem, val in svc_def["apply"].items(): if "copy" in val: newval = svc_def["apply"].get(val["copy"], None) if not newval: logger.error("No device type {} to copy from for " "{} for service {}".format( val["copy"], elem, svc_def["service"])) continue val = newval if (("command" not in val) or ((isinstance(val['command'], list) and not all('textfsm' in x or 'normalize' in x for x in val['command'])) or (not isinstance(val['command'], list) and ("normalize" not in val and "textfsm" not in val)))): logger.error( "Ignoring invalid service file " 'definition. Need both "command" and ' '"normalize/textfsm" keywords: {}, {}'.format( filename, val)) continue if "textfsm" in val: # We may have already visited this element and parsed # the textfsm file. Check for this if val["textfsm"] and isinstance( val["textfsm"], textfsm.TextFSM): continue tfsm_file = svc_dir + "/" + val["textfsm"] if not isfile(tfsm_file): logger.error("Textfsm file {} not found. Ignoring" " service".format(tfsm_file)) continue with open(tfsm_file, "r") as f: tfsm_template = textfsm.TextFSM(f) val["textfsm"] = tfsm_template elif (isinstance(val['command'], list)): for subelem in val['command']: if 'textfsm' in subelem: if subelem["textfsm"] and isinstance( subelem["textfsm"], textfsm.TextFSM): continue tfsm_file = svc_dir + "/" + subelem["textfsm"] if not isfile(tfsm_file): logger.error( "Textfsm file {} not found. 
Ignoring" " service".format(tfsm_file)) continue with open(tfsm_file, "r") as f: tfsm_template = textfsm.TextFSM(f) subelem["textfsm"] = tfsm_template else: tfsm_template = None try: schema = SchemaForTable(svc_def['service'], schema=schemas) except Exception: logger.error( f"No matching schema for {svc_def['service']}") continue if schema.type == "derivedRecord": # These are not real services and so ignore them continue # Valid service definition, add it to list if svc_def["service"] in svc_classes: service = svc_classes[svc_def["service"]]( svc_def["service"], svc_def["apply"], period, svc_def.get("type", "state"), svc_def.get("keys", []), svc_def.get("ignore-fields", []), schema, queue, run_once, ) else: service = Service(svc_def["service"], svc_def["apply"], period, svc_def.get("type", "state"), svc_def.get("keys", []), svc_def.get("ignore-fields", []), schema, queue, run_once) service.poller_schema = poller_schema service.poller_schema_version = poller_schema_version logger.info("Service {} added".format(service.name)) svcs_list.append(service) return svcs_list
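A hedged sketch of what a parsed service definition is expected to look like after yaml.safe_load(), reconstructed purely from the keys the code above checks ('service', 'apply', 'period', 'keys', 'command', 'normalize', 'copy'); the device types, command string and normalize mapping are illustrative, not a real Suzieq service file.

svc_def = {
    'service': 'device',
    'period': 15,
    'keys': [],
    'apply': {
        'eos': {
            'command': 'show version',
            # illustrative normalize mapping
            'normalize': 'modelName: model, version: version',
        },
        'linux': {
            'copy': 'eos',   # reuse the definition of another device type
        },
    },
}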
def get_valid_df(self, table, **kwargs) -> pd.DataFrame: if not self.ctxt.engine: print("Specify an analysis engine using set engine command") return pd.DataFrame(columns=["namespace", "hostname"]) sch = SchemaForTable(table, schema=self.schemas) phy_table = sch.get_phy_table_for_table() columns = kwargs.pop('columns', ['default']) addnl_fields = kwargs.pop('addnl_fields', []) view = kwargs.pop('view', self.iobj.view) active_only = kwargs.pop('active_only', True) query_str = kwargs.pop('query_str', '') # The REST API provides the query_str enclosed in ". Strip that if query_str: if query_str.startswith('"') and query_str.endswith('"'): query_str = query_str[1:-1] fields = sch.get_display_fields(columns) key_fields = sch.key_fields() drop_cols = [] if columns == ['*']: drop_cols.append('sqvers') if 'timestamp' not in fields: fields.append('timestamp') if 'active' not in fields+addnl_fields: addnl_fields.append('active') drop_cols.append('active') for fld in key_fields: if fld not in fields+addnl_fields: addnl_fields.insert(0, fld) drop_cols.append(fld) for f in addnl_fields: if f not in fields: # timestamp is always the last field fields.insert(-1, f) if self.iobj.start_time: try: start_time = dateparser.parse( self.iobj.start_time.replace('last night', 'yesterday')) \ .timestamp()*1000 except Exception as e: print(f"ERROR: invalid time {self.iobj.start_time}: {e}") return pd.DataFrame() else: start_time = '' if self.iobj.start_time and not start_time: # Something went wrong with our parsing print(f"ERROR: unable to parse {self.iobj.start_time}") return pd.DataFrame() if self.iobj.end_time: try: end_time = dateparser.parse( self.iobj.end_time.replace('last night', 'yesterday')) \ .timestamp()*1000 except Exception as e: print(f"ERROR: invalid time {self.iobj.end_time}: {e}") return pd.DataFrame() else: end_time = '' if self.iobj.end_time and not end_time: # Something went wrong with our parsing print(f"ERROR: Unable to parse {self.iobj.end_time}") return pd.DataFrame() table_df = self._dbeng.read( phy_table, 'pandas', start_time=start_time, end_time=end_time, columns=fields, view=view, key_fields=key_fields, **kwargs ) if not table_df.empty: if view == 'latest' and active_only: table_df = table_df.query('active') \ .drop(columns=drop_cols) else: table_df.drop(columns=drop_cols, inplace=True) if 'timestamp' in table_df.columns: table_df['timestamp'] = humanize_timestamp( table_df.timestamp, self.cfg.get('analyzer', {}) .get('timezone', None)) if query_str: return table_df.query(query_str) else: return table_df
def get_table_df(self, cfg, schemas, **kwargs) -> pd.DataFrame: """Use Pandas instead of Spark to retrieve the data""" MAX_FILECNT_TO_READ_FOLDER = 10000 self.cfg = cfg table = kwargs.pop("table") start = kwargs.pop("start_time") end = kwargs.pop("end_time") view = kwargs.pop("view") sort_fields = kwargs.pop("sort_fields") ign_key_fields = kwargs.pop("ign_key", []) addnl_fields = kwargs.pop("addnl_fields", []) for f in ['active', 'timestamp']: if f not in addnl_fields: addnl_fields.append(f) sch = SchemaForTable(table, schema=schemas) phy_table = sch.get_phy_table_for_table() folder = self._get_table_directory(phy_table) # Restrict to a single DC if thats whats asked if "namespace" in kwargs: v = kwargs["namespace"] if v: if not isinstance(v, list): folder += "/namespace={}/".format(v) fcnt = self.get_filecnt(folder) if fcnt == 0: return pd.DataFrame() # We are going to hard code use_get_files until we have some autoamted testing use_get_files = False # use_get_files = ( # (fcnt > MAX_FILECNT_TO_READ_FOLDER and view == "latest") or # start or end # ) if use_get_files: # Switch to more efficient method when there are lotsa files # Reduce I/O since that is the worst drag key_fields = [] if len(kwargs.get("namespace", [])) > 1: del kwargs["namespace"] files = get_latest_files(folder, start, end, view) else: # ign_key_fields contains key fields that are not partition cols key_fields = [i for i in sch.key_fields() if i not in ign_key_fields] filters = self.build_pa_filters(start, end, key_fields, **kwargs) if "columns" in kwargs: columns = kwargs["columns"] del kwargs["columns"] else: columns = ["default"] fields = sch.get_display_fields(columns) for f in addnl_fields: if f not in fields: fields.append(f) # Create the filter to select only specified columns query_str = "" prefix = "" addnl_filter = kwargs.pop('add_filter', None) for f, v in kwargs.items(): if not v or f in key_fields or f in ["groupby"]: continue if isinstance(v, str): if v.startswith('!'): v = v[1:] op = '!=' else: op = '==' query_str += "{} {}{}'{}' ".format(prefix, f, op, v) prefix = "and" else: query_str += "{} {}=={} ".format(prefix, f, v) prefix = "and" # Add the ignored fields back to key fields to ensure we # do the drop_duplicates correctly below incl reading reqd cols key_fields.extend(ign_key_fields) # Handle the case where key fields are missing from display fields fldset = set(fields) kfldset = set(key_fields) add_flds = kfldset.difference(fldset) if add_flds: fields.extend(list(add_flds)) if addnl_filter: # This is for special cases that are specific to an object if not query_str: query_str = addnl_filter else: query_str += ' and {}'.format(addnl_filter) # Restore the folder to what it needs to be folder = self._get_table_directory(phy_table) if use_get_files: if not query_str: query_str = "active == True" pdf_list = [] with Executor(max_workers=8) as exe: jobs = [ exe.submit(self.read_pq_file, f, fields, query_str) for f in files ] pdf_list = [job.result() for job in jobs] if pdf_list: final_df = pd.concat(pdf_list) else: final_df = pd.DataFrame(columns=fields) elif view == "latest": if not query_str: # Make up a dummy query string to avoid if/then/else query_str = "timestamp != 0" try: final_df = ( pa.ParquetDataset( folder, filters=filters or None, validate_schema=False ) .read(columns=fields) .to_pandas(split_blocks=True, self_destruct=True) .query(query_str) .drop_duplicates(subset=key_fields, keep="last") .query("active == True") ) except pa.lib.ArrowInvalid: return pd.DataFrame(columns=fields) else: if not 
query_str: # Make up a dummy query string to avoid if/then/else query_str = 'timestamp != "0"' try: final_df = ( pa.ParquetDataset( folder, filters=filters or None, validate_schema=False ) .read(columns=fields) .to_pandas() .query(query_str) ) except pa.lib.ArrowInvalid: return pd.DataFrame(columns=fields) if 'active' not in columns: final_df.drop(columns=['active'], axis=1, inplace=True) fields.remove('active') final_df = df_timestamp_to_datetime(final_df) if sort_fields and all(x in sort_fields for x in fields): return final_df[fields].sort_values(by=sort_fields) else: return final_df[fields]
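A small standalone example of the filter style built by the keyword loop above and how pandas applies it; the data and keyword values are made up.

import pandas as pd

df = pd.DataFrame({'hostname': ['leaf01', 'leaf02'],
                   'vrf': ['default', 'evpn-vrf'],
                   'mtu': [9216, 1500]})
# kwargs = {'hostname': 'leaf01', 'vrf': '!evpn-vrf', 'mtu': 9216} would yield:
query_str = " hostname=='leaf01' and vrf!='evpn-vrf' and mtu==9216 "
print(df.query(query_str))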
def get_valid_df(self, table, **kwargs) -> pd.DataFrame: if not self.ctxt.engine: print("Specify an analysis engine using set engine command") return pd.DataFrame(columns=["namespace", "hostname"]) sch = SchemaForTable(table, schema=self.schemas) phy_table = sch.get_phy_table_for_table() columns = kwargs.pop('columns', ['default']) addnl_fields = kwargs.pop('addnl_fields', []) view = kwargs.pop('view', self.iobj.view) active_only = kwargs.pop('active_only', True) fields = sch.get_display_fields(columns) key_fields = sch.key_fields() drop_cols = [] if columns == ['*']: drop_cols.append('sqvers') aug_fields = sch.get_augmented_fields() if 'timestamp' not in fields: fields.append('timestamp') if 'active' not in fields + addnl_fields: addnl_fields.append('active') drop_cols.append('active') # Order matters. Don't put this before the missing key fields insert for f in aug_fields: dep_fields = sch.get_parent_fields(f) addnl_fields += dep_fields for fld in key_fields: if fld not in fields + addnl_fields: addnl_fields.insert(0, fld) drop_cols.append(fld) for f in addnl_fields: if f not in fields: # timestamp is always the last field fields.insert(-1, f) if self.iobj.start_time: try: start_time = dateparser.parse( self.iobj.start_time.replace('last night', 'yesterday')) \ .timestamp()*1000 except Exception as e: print(f"ERROR: invalid time {self.iobj.start_time}: {e}") return pd.DataFrame() else: start_time = '' if self.iobj.start_time and not start_time: # Something went wrong with our parsing print(f"ERROR: unable to parse {self.iobj.start_time}") return pd.DataFrame() if self.iobj.end_time: try: end_time = dateparser.parse( self.iobj.end_time.replace('last night', 'yesterday')) \ .timestamp()*1000 except Exception as e: print(f"ERROR: invalid time {self.iobj.end_time}: {e}") return pd.DataFrame() else: end_time = '' if self.iobj.end_time and not end_time: # Something went wrong with our parsing print(f"ERROR: Unable to parse {self.iobj.end_time}") return pd.DataFrame() table_df = self._dbeng.read(phy_table, 'pandas', start_time=start_time, end_time=end_time, columns=fields, view=view, key_fields=key_fields, **kwargs) if not table_df.empty: if view == "all" or not active_only: table_df.drop(columns=drop_cols, inplace=True) else: table_df = table_df.query('active') \ .drop(columns=drop_cols) if 'timestamp' in table_df.columns and not table_df.empty: table_df['timestamp'] = humanize_timestamp( table_df.timestamp, self.cfg.get('analyzer', {}).get('timezone', None)) return table_df
def migrate(self, table_name: str, schema: SchemaForTable) -> None: """Migrates the data for the table specified to latest version :param table_name: str, The name of the table to migrate :param schema: SchemaForTable, the current schema :returns: None :rtype: """ current_vers = schema.version defvals = self._get_default_vals() arrow_schema = schema.get_arrow_schema() schema_def = dict(zip(arrow_schema.names, arrow_schema.types)) for sqvers in self._get_avail_sqvers(table_name, True): if sqvers != current_vers: migrate_rtn = get_migrate_fn(table_name, sqvers, current_vers) if migrate_rtn: dataset = self._get_cp_dataset(table_name, True, sqvers, 'all', '', '') for item in dataset.files: try: namespace = item.split('namespace=')[1] \ .split('/')[0] except IndexError: # Don't convert data not in our template continue df = pd.read_parquet(item) df['sqvers'] = sqvers df['namespace'] = namespace newdf = migrate_rtn(df) cols = newdf.columns # Ensure all fields are present for field in schema_def: if field not in cols: newdf[field] = defvals.get( schema_def[field], '') newdf.drop(columns=['namespace', 'sqvers']) newitem = item.replace(f'sqvers={sqvers}', f'sqvers={current_vers}') newdir = os.path.dirname(newitem) if not os.path.exists(newdir): os.makedirs(newdir, exist_ok=True) table = pa.Table.from_pandas( newdf, schema=schema.get_arrow_schema(), preserve_index=False) pq.write_to_dataset(table, newitem, version="2.0", compression="ZSTD", row_group_size=100000) self.logger.debug( f'Migrated {item} version {sqvers}->{current_vers}' ) os.remove(item) rmtree( f'{self._get_table_directory(table_name, True)}/sqvers={sqvers}', ignore_errors=True) return
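A hedged sketch of the shape of a routine returned by get_migrate_fn(): it receives the DataFrame read from an old-version file and returns one reshaped for the newer schema. The column rename and version string are illustrative, not a real Suzieq migration.

def _bgp_migrate_example(df: pd.DataFrame) -> pd.DataFrame:
    # hypothetical old->new column rename
    if 'peerAsn' in df.columns and 'peerAs' not in df.columns:
        df = df.rename(columns={'peerAsn': 'peerAs'})
    df['sqvers'] = '2.0'   # stamp the target schema version (illustrative)
    return df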
class SqObject(object): def __init__(self, engine_name: str = 'pandas', hostname: typing.List[str] = [], start_time: str = '', end_time: str = '', view: str = 'latest', namespace: typing.List[str] = [], columns: typing.List[str] = ['default'], context=None, table: str = '', config_file=None) -> None: if context is None: self.ctxt = SqContext(engine_name, config_file) else: self.ctxt = context if not self.ctxt: self.ctxt = SqContext(engine_name) self._cfg = self.ctxt.cfg self._schema = SchemaForTable(table, self.ctxt.schemas) self._table = table self._sort_fields = self._schema.key_fields() if not namespace and self.ctxt.namespace: self.namespace = self.ctxt.namespace else: self.namespace = namespace if not hostname and self.ctxt.hostname: self.hostname = self.ctxt.hostname else: self.hostname = hostname if not start_time and self.ctxt.start_time: self.start_time = self.ctxt.start_time else: self.start_time = start_time if not end_time and self.ctxt.end_time: self.end_time = self.ctxt.end_time else: self.end_time = end_time if not view and self.ctxt.view: self.view = self.ctxt.view else: self.view = view self.columns = columns if engine_name and engine_name != '': self.engine = get_sqengine(engine_name, self._table)(self._table, self) elif self.ctxt.engine: self.engine = get_sqengine(self.ctxt.engine, self._table)(self._table, self) if not self.engine: raise ValueError('Unknown analysis engine') self._addnl_filter = None self._addnl_fields = [] self._valid_get_args = None self._valid_assert_args = None self._valid_arg_vals = None @property def all_schemas(self): return self.ctxt.schemas @property def schema(self): return self._schema @property def cfg(self): return self._cfg @property def table(self): return self._table def _check_input_for_valid_args( self, good_arg_list, **kwargs, ): if not good_arg_list: return # add standard args that are always good_arg_list = good_arg_list + (['namespace', 'addnl_fields']) for arg in kwargs.keys(): if arg not in good_arg_list: raise AttributeError( f"argument {arg} not supported for this command") def _check_input_for_valid_vals(self, good_arg_val_list, **kwargs): '''Check if the input is valid for the arg, if possible''' if not good_arg_val_list: return for arg in kwargs.keys(): if arg in good_arg_val_list: if kwargs[arg] not in good_arg_val_list[arg]: raise AttributeError( f"invalid value {kwargs[arg]} for argument {arg}") def validate_get_input(self, **kwargs): self._check_input_for_valid_args(self._valid_get_args + ['columns'], **kwargs) self._check_input_for_valid_vals(self._valid_arg_vals, **kwargs) def validate_assert_input(self, **kwargs): self._check_input_for_valid_args(self._valid_assert_args, **kwargs) def get(self, **kwargs) -> pd.DataFrame: if not self._table: raise NotImplementedError if not self.ctxt.engine: raise AttributeError('No analysis engine specified') if self._addnl_filter: kwargs['add_filter'] = self._addnl_filter # This raises exceptions if it fails try: self.validate_get_input(**kwargs) except Exception as error: df = pd.DataFrame({'error': [f'{error}']}) return df return self.engine.get(**kwargs) def summarize(self, namespace=[], hostname=[], query_str='') -> pd.DataFrame: if self.columns != ["default"]: self.summarize_df = pd.DataFrame({ 'error': ['ERROR: You cannot specify columns with summarize'] }) return self.summarize_df if not self._table: raise NotImplementedError if not self.ctxt.engine: raise AttributeError('No analysis engine specified') return self.engine.summarize(namespace=namespace, hostname=hostname, 
query_str=query_str) def unique(self, **kwargs) -> pd.DataFrame: if not self._table: raise NotImplementedError if not self.ctxt.engine: raise AttributeError('No analysis engine specified') columns = kwargs.pop('columns', self.columns) return self.engine.unique(**kwargs, columns=columns) def analyze(self, **kwargs): raise NotImplementedError def aver(self, **kwargs): raise NotImplementedError def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame: """Get the list of top/bottom entries of "what" field""" if "columns" in kwargs: columns = kwargs["columns"] del kwargs["columns"] else: columns = ["default"] # if self._valid_get_args: # self._valid_get_args += ['what', 'n', 'reverse'] # This raises exceptions if it fails try: self.validate_get_input(**kwargs) except Exception as error: df = pd.DataFrame({'error': [f'{error}']}) return df table_schema = SchemaForTable(self._table, self.all_schemas) columns = table_schema.get_display_fields(columns) if what not in columns: self._addnl_fields.append(what) return self.engine.top(what=what, n=n, reverse=reverse, **kwargs) def humanize_fields(self, df: pd.DataFrame, subset=None) -> pd.DataFrame: '''Humanize the fields for human consumption. Individual classes will implement the right transofmations. This routine is just a placeholder for all those with nothing to modify. ''' return df
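An illustrative call of top() through a table object, using the get_sqobject() helper seen in the tests above; the config path, field name and namespace are assumptions.

bgp = get_sqobject('bgp')(config_file='~/.suzieq/suzieq-cfg.yml')
print(bgp.top(what='estdTime', n=5, namespace=['mydc']))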
class SqObject(object): def __init__(self, engine_name: str = '', hostname: typing.List[str] = [], start_time: str = '', end_time: str = '', view: str = 'latest', namespace: typing.List[str] = [], columns: typing.List[str] = ['default'], context=None, table: str = '', config_file=None) -> None: if context is None: self.ctxt = SqContext(engine_name, config_file) else: self.ctxt = context if not self.ctxt: self.ctxt = SqContext(engine_name) self._cfg = self.ctxt.cfg self._schema = SchemaForTable(table, self.ctxt.schemas) self._table = table self._sort_fields = self._schema.key_fields() if not namespace and self.ctxt.namespace: self.namespace = self.ctxt.namespace else: self.namespace = namespace if not hostname and self.ctxt.hostname: self.hostname = self.ctxt.hostname else: self.hostname = hostname if not start_time and self.ctxt.start_time: self.start_time = self.ctxt.start_time else: self.start_time = start_time if not end_time and self.ctxt.end_time: self.end_time = self.ctxt.end_time else: self.end_time = end_time if not view and self.ctxt.view: self.view = self.ctxt.view else: self.view = view self.columns = columns if engine_name and engine_name != '': self.engine = get_sqengine(engine_name) else: self.engine = self.ctxt.engine if self._table: self.engine_obj = self.engine.get_object(self._table, self) else: self.engine_obj = None self._addnl_filter = None self._addnl_fields = [] @property def all_schemas(self): return self.ctxt.schemas @property def schema(self): return self._schema @property def cfg(self): return self._cfg @property def table(self): return self._table def validate_input(self, **kwargs): """Dummy validate input""" return def get(self, **kwargs) -> pd.DataFrame: if not self._table: raise NotImplementedError if not self.ctxt.engine: raise AttributeError('No analysis engine specified') if self._addnl_filter: kwargs['add_filter'] = self._addnl_filter # This raises exceptions if it fails try: self.validate_input(**kwargs) except Exception as error: df = pd.DataFrame({'error': [f'{error}']}) return df return self.engine_obj.get(**kwargs) def summarize(self, namespace='') -> pd.DataFrame: if not self._table: raise NotImplementedError if not self.ctxt.engine: raise AttributeError('No analysis engine specified') return self.engine_obj.summarize(namespace=namespace) def unique(self, **kwargs) -> pd.DataFrame: if not self._table: raise NotImplementedError if not self.ctxt.engine: raise AttributeError('No analysis engine specified') return self.engine_obj.unique(**kwargs) def analyze(self, **kwargs): raise NotImplementedError def aver(self, **kwargs): raise NotImplementedError def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame: """Get the list of top/bottom entries of "what" field""" if "columns" in kwargs: columns = kwargs["columns"] del kwargs["columns"] else: columns = ["default"] table_schema = SchemaForTable(self._table, self.all_schemas) columns = table_schema.get_display_fields(columns) if what not in columns: self._addnl_fields.append(what) return self.engine_obj.top(what=what, n=n, reverse=reverse, **kwargs)