Example #1
def run_coalescer(cfg: dict,
                  tables: List[str],
                  periodstr: str,
                  run_once: bool,
                  logger: Logger,
                  no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs it once and returns or periodically depending on the
    value of run_once. It also writes out the coalescer records
    as a parquet file.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param periodstr: str, the string of how periodically the poller runs,
                      Examples are '1h', '1d' etc.
    :param run_once: bool, True if you want the poller to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller rec
    :returns: Nothing
    :rtype: None

    """

    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)

    status, errmsg = validate_periodstr(periodstr)
    if not status:
        logger.error(errmsg)
        print(f'ERROR: {errmsg}')
        sys.exit(1)

    while True:
        stats = []
        try:
            stats = do_coalesce(cfg, tables, periodstr, logger, no_sqpoller)
        except Exception:  # pylint: disable=broad-except
            logger.exception('Coalescer aborted. Continuing')
        # Write the self-stats
        if stats:
            df = pd.DataFrame([asdict(x) for x in stats])
            if not df.empty:
                df['sqvers'] = coalescer_schema.version
                df['version'] = SUZIEQ_VERSION
                df['active'] = True
                df['namespace'] = ''
                pqdb.write('sqCoalescer', 'pandas', df, True,
                           coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep_time = get_sleep_time(periodstr)
        sleep(sleep_time)
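
A minimal usage sketch for run_coalescer, not part of the original source:
it performs a single coalescing pass and exits. The import path for
load_sq_config is an assumption and the table list is illustrative.

import logging

from suzieq.shared.utils import load_sq_config  # path is an assumption

# run_coalescer is the function defined above
cfg = load_sq_config(validate=True)
logger = logging.getLogger('suzieq.coalescer')
run_coalescer(cfg, tables=['routes', 'interfaces'], periodstr='1h',
              run_once=True, logger=logger)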
Example #2
    def describe(self, **kwargs):
        """Describes the fields for a given table"""

        table = kwargs.get('table', self.table)

        cols = kwargs.get('columns', ['default'])
        if cols not in [['default'], ['*']]:
            df = pd.DataFrame(
                {'error': ['ERROR: cannot specify column names for describe']})
            return df

        try:
            sch = SchemaForTable(table, self.all_schemas)
        except ValueError:
            sch = None
        if not sch:
            df = pd.DataFrame(
                {'error': [f'ERROR: incorrect table name {table}']})
            return df

        entries = [{'name': x['name'], 'type': x['type'],
                    'key': x.get('key', ''),
                    'display': x.get('display', ''),
                    'description': x.get('description', '')}
                   for x in sch.get_raw_schema()]
        df = pd.DataFrame.from_dict(entries).sort_values('name')

        query_str = kwargs.get('query_str', '')
        if query_str:
            return df.query(query_str).reset_index(drop=True)

        return df
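
A hedged example of calling describe() to list a table's fields and filter
them with a pandas query string; the module path and the table name are
assumptions, not taken from the original.

from suzieq.sqobjects import get_sqobject  # path is an assumption

bgp_obj = get_sqobject('bgp')()            # 'bgp' is an illustrative table
fields_df = bgp_obj.describe(query_str='key != ""')
print(fields_df[['name', 'type', 'description']])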
Example #3
    def __init__(self, engine_name: str = '',
                 hostname: typing.List[str] = None,
                 start_time: str = '', end_time: str = '',
                 view: str = '', namespace: typing.List[str] = None,
                 columns: typing.List[str] = None,
                 context=None, table: str = '', config_file=None) -> None:

        if not context:
            self.ctxt = SqContext(cfg=load_sq_config(validate=True,
                                                     config_file=config_file),
                                  engine=engine_name)
            self.ctxt.schemas = Schema(self.ctxt.cfg["schema-directory"])
        else:
            self.ctxt = context
            if not self.ctxt.cfg:
                self.ctxt.cfg = load_sq_config(validate=True,
                                               config_file=config_file)
                self.ctxt.schemas = Schema(self.ctxt.cfg["schema-directory"])
            if not self.ctxt.engine:
                self.ctxt.engine = engine_name

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()
        self._convert_args = {}

        self.namespace = namespace or self.ctxt.namespace or []
        self.hostname = hostname or self.ctxt.hostname or []
        self.start_time = start_time or self.ctxt.start_time
        self.end_time = end_time or self.ctxt.end_time

        view = view or self.ctxt.view

        if self.start_time and self.end_time and not view:
            self.view = 'all'
        else:
            self.view = view or 'latest'

        self.columns = columns or ['default']
        self._unique_def_column = ['hostname']

        if engine_name:
            self.engine = get_sqengine(engine_name, self._table)(self)
        elif self.ctxt.engine:
            self.engine = get_sqengine(self.ctxt.engine, self._table)(self)
        else:
            self.engine = None

        if not self.engine:
            raise ValueError('Unknown analysis engine')

        self.summarize_df = pd.DataFrame()

        self._addnl_filter = self._addnl_fields = []
        self._valid_get_args = self._valid_assert_args = []
        self._valid_arg_vals = self._valid_find_args = []
        self._valid_summarize_args = []
Example #4
def convert_dir(input_dir: str, output_dir: str, svcschema: SchemaForTable):
    """Convert the data into a single file and write it out"""

    defaults = {
        pa.string(): "",
        pa.int32(): 0,
        pa.int64(): 0,
        pa.float32(): 0.0,
        pa.float64(): 0.0,
        pa.date64(): 0.0,
        pa.bool_(): False,
        pa.list_(pa.string()): ['-'],
        pa.list_(pa.int64()): [],
    }

    df = pd.read_parquet(input_dir, use_legacy_dataset=True)

    # Handle the renamed notification field up front, before missing
    # schema columns are filled in, so we don't end up with both spellings
    if 'notifcnReason' in df.columns:
        df = df.rename(columns={'notifcnReason': 'notificnReason'})

    sqschema = svcschema.get_raw_schema()
    arrow_schema = svcschema.get_arrow_schema()

    for column in filter(lambda x: x['name'] not in df.columns, sqschema):
        df[column['name']] = column.get('default', defaults[column['type']])

    # convert all dtypes to whatever is desired
    for column in df.columns:
        if column in arrow_schema:
            df[column] = df[column].astype(
                arrow_schema.field(column).type.to_pandas_dtype())

    # If there's the original ifname saved up, then eliminate this unnecessary
    # field as this model is no longer necessary

    if 'origIfname' in df.columns:
        if 'ifname' in df.columns:
            df = df.drop(columns=['ifname']) \
                   .rename(columns={'origIfname': 'ifname'})
        elif 'oif' in df.columns:
            df = df.drop(columns=['oif']) \
                   .rename(columns={'origIfname': 'oif'})

    table = pa.Table.from_pandas(df, schema=arrow_schema, preserve_index=False)
    partition_cols = svcschema.get_partition_columns()

    pq.write_to_dataset(
        table,
        root_path=output_dir,
        partition_cols=partition_cols,
        version="2.0",
        compression='ZSTD',
        row_group_size=100000,
    )

    logger.info(f'Wrote converted {input_dir}')
Example #5
    def _field_exists(self, table_schema: SchemaForTable, field: str) -> bool:
        """Check if a field exists in the schema

        Args:
            table_schema (SchemaForTable): The schema for the table
            field (str): the field name we're checking for

        Returns:
            bool: True if the field exists, False otherwise
        """
        return bool(table_schema.field(field))
Example #6
    def top(self, what: str = '', count: int = 5, reverse: bool = False,
            **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""

        columns = kwargs.get('columns', ['default'])
        # This raises ValueError if it fails
        self.validate_columns(columns)

        if not what:
            raise ValueError('Must specify what field to get top for')
        # if self._valid_get_args:
        #     self._valid_get_args += ['what', 'n', 'reverse']
        # This raises exceptions if it fails
        try:
            self.validate_get_input(**kwargs)
        except (ValueError, AttributeError) as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        # This raises ValueError if it fails
        table_schema = SchemaForTable(self._table, self.all_schemas)
        if not self._field_exists(table_schema, what):
            raise ValueError(
                f"Field {what} does not exist in table {self.table}")

        columns = table_schema.get_display_fields(columns)

        ftype = table_schema.field(what).get('type', 'str')
        if ftype not in ['long', 'double', 'float', 'int', 'timestamp',
                         'timedelta64[s]']:
            return pd.DataFrame({'error':
                                 [f'{what} not numeric; top can be used with'
                                  f' numeric fields only']})

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine.top(what=what, count=count, reverse=reverse,
                               **kwargs)
Example #7
    def validate_columns(self, columns: typing.List[str]) -> bool:
        """Validate that the provided columns are valid for the table

        Args:
            columns (List[str]): list of columns

        Returns:
            bool: True if columns are valid
        Raises:
            ValueError: if columns are invalid
        """

        if columns in [['default'], ['*']]:
            return True

        table_schema = SchemaForTable(self._table, self.all_schemas)
        invalid_columns = [x for x in columns if x not in table_schema.fields]
        if invalid_columns:
            raise ValueError(f"Invalid columns specified: {invalid_columns}")
        return True
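
Per the docstring above, invalid column names raise ValueError rather than
returning an error DataFrame. A small sketch of the calling pattern used by
get() and top(); the helper name below is hypothetical.

import pandas as pd

def _columns_error_df(obj, columns) -> pd.DataFrame:
    """Hypothetical helper: turn a ValueError into an error DataFrame"""
    try:
        obj.validate_columns(columns)
        return pd.DataFrame()
    except ValueError as error:
        return pd.DataFrame({'error': [f'{error}']})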
Example #8
def test_transform(input_file):
    '''Test transformation is captured by coalescer'''
    to_transform = Yaml2Class(input_file)

    try:
        data_directory = to_transform.transform.data_directory
    except AttributeError:
        print('Invalid transformation file, no data directory')
        pytest.fail('AttributeError', pytrace=True)

    #  Make a copy of the data directory
    temp_dir, tmpfile = _coalescer_init(data_directory)

    cfg = load_sq_config(config_file=tmpfile.name)
    schemas = Schema(cfg['schema-directory'])

    # pylint: disable=too-many-nested-blocks, no-member
    for ele in to_transform.transform.transform:
        query_str_list = []
        # Each transformation has a record => writes happen per record
        for record in ele.record:
            changed_fields = set()
            new_df = pd.DataFrame()
            tables = [x for x in dir(record) if not x.startswith('_')]
            for table in tables:
                # Let's read the data in now that we know the table
                tblobj = get_sqobject(table)
                pq_db = get_sqdb_engine(cfg, table, None, None)
                columns = schemas.fields_for_table(table)
                mod_df = tblobj(config_file=tmpfile.name).get(columns=columns)

                for key in getattr(record, table):
                    query_str = key.match
                    chg_df = pd.DataFrame()
                    if query_str != "all":
                        try:
                            chg_df = mod_df.query(query_str) \
                                           .reset_index(drop=True)
                        except Exception as ex:  # pylint: disable=broad-except
                            assert (not ex)
                        query_str_list.append(query_str)
                    else:
                        chg_df = mod_df

                    _process_transform_set(key.set, chg_df, changed_fields)
                    if new_df.empty:
                        new_df = chg_df
                    elif not chg_df.empty:
                        new_df = pd.concat([new_df, chg_df])

                if new_df.empty:
                    continue

                # Write the records now
                _write_verify_transform(new_df, table, pq_db,
                                        SchemaForTable(table,
                                                       schemas), tmpfile.name,
                                        query_str_list, changed_fields)

    # Now we coalesce and verify it works
    pre_table_df = get_sqobject('tables')(config_file=tmpfile.name).get()
    do_coalesce(cfg, None)
    _verify_coalescing(temp_dir)

    post_table_df = get_sqobject('tables')(config_file=tmpfile.name).get()
    assert_df_equal(pre_table_df, post_table_df, None)

    # Run additional tests on the coalesced data
    for ele in to_transform.transform.verify:
        table = [x for x in dir(ele) if not x.startswith('_')][0]
        tblobj = get_sqobject(table)

        for tst in getattr(ele, table):
            start_time = tst.test.get('start-time', '')
            end_time = tst.test.get('end-time', '')

            columns = tst.test.get('columns', ['default'])
            df = tblobj(config_file=tmpfile.name,
                        start_time=start_time,
                        end_time=end_time).get(columns=columns)
            if not df.empty and 'query' in tst.test:
                query_str = tst.test['query']
                df = df.query(query_str).reset_index(drop=True)

            if 'assertempty' in tst.test:
                assert (df.empty)
            elif 'shape' in tst.test:
                shape = tst.test['shape'].split()
                if shape[0] != '*':
                    assert (int(shape[0]) == df.shape[0])
                if shape[1] != '*':
                    assert (int(shape[1]) == df.shape[1])
            else:
                assert (not df.empty)

    _coalescer_cleanup(temp_dir, tmpfile)
Example #9
    async def init_services(self) -> List[Service]:
        """Instantiate Service objects and prepare
        them for running. This function should be called before
        scheduling the service for running.

        Returns:
            List[Service]: the list of the initialized service instances
        """
        services = []
        svc_classes = Service.get_plugins()

        schemas = Schema(self.schema_dir)
        if schemas:
            poller_schema = schemas.get_arrow_schema('sqPoller')
            poller_schema_version = SchemaForTable('sqPoller', schemas).version

        # Read the available services and iterate over them, discarding
        # the ones we do not need to instantiate
        svc_desc_files = Path(self.service_directory).glob('*.yml')

        for filename in svc_desc_files:
            with open(filename, 'r') as f:
                svc_def = yaml.safe_load(f.read())

            if not svc_def:
                logger.warning(f'Skip empty service file: {filename}')
                continue

            if svc_def.get('service') not in self.svcs_list:
                logger.warning(
                    f"Ignoring unspecified service {svc_def.get('service')}"
                )
                continue

            if 'service' not in svc_def or 'apply' not in svc_def:
                logger.error(
                    'Ignoring invalid service file definition, missing '
                    f"'service' and/or 'apply' keywords: {filename}"
                )
                continue

            period = svc_def.get('period', self.default_interval)
            for nos, cmds_desc in svc_def['apply'].items():

                # Check if the current NOS copies from another
                if isinstance(cmds_desc, dict) and 'copy' in cmds_desc:
                    newval = svc_def['apply'].get(cmds_desc['copy'], None)
                    if not newval:
                        logger.error(
                            f"No device type {cmds_desc['copy']} to copy "
                            f"from, for {nos} for service "
                            f"{svc_def['service']}"
                        )
                        return
                    cmds_desc = newval

                # Update the command description adding the
                # specification for the output parsing
                if isinstance(cmds_desc, list):
                    for subele in cmds_desc:
                        self._parse_nos_version(filename, svc_def, nos, subele)
                else:
                    self._parse_nos_version(filename, svc_def, nos, cmds_desc)

            try:
                schema = SchemaForTable(svc_def['service'], schema=schemas)
            except Exception:  # pylint: disable=broad-except
                logger.error(f"No matching schema for {svc_def['service']}")
                continue

            if schema.type == 'derivedRecord':
                # These are not real services and so ignore them
                continue

            # Valid service definition, add it to the list.
            # If the service does not have a dedicated class, we use the
            # default implementation
            class_to_use = svc_classes.get(svc_def['service'], Service)
            service = class_to_use(
                svc_def['service'],
                svc_def['apply'],
                period,
                svc_def.get('type', 'state'),
                svc_def.get('keys', []),
                svc_def.get('ignore-fields', []),
                schema,
                self.output_queue,
                self.run_mode
            )
            service.poller_schema = poller_schema
            service.poller_schema_version = poller_schema_version
            logger.info(f'Service {service.name} added')
            services.append(service)

        # Once done set the service list and return its content
        self._services = services
        return self._services
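
For reference, a hypothetical minimal service definition shaped the way the
loop above expects it after yaml.safe_load(): 'service' and 'apply' are
mandatory, 'period' is optional, and a NOS entry may 'copy' another NOS's
command description. The NOS names and the command are placeholders.

svc_def = {
    'service': 'device',                     # must appear in self.svcs_list
    'period': 15,                            # optional, else default_interval
    'keys': ['hostname'],
    'apply': {
        'eos': {'command': 'show version'},  # per-NOS command description
        'nxos': {'copy': 'eos'},             # reuse the 'eos' description
    },
}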
Example #10
        pa.list_(pa.int64()): [],
    }

    with concurrent.futures.ProcessPoolExecutor(max_workers=None) as executor:
        futures = {executor.submit(convert_file, item, output_dir, sqschema,
                                   defaults, arrow_schema)
                   for item in files}
        for future in concurrent.futures.as_completed(futures):
            try:
                _ = future.result()
            except Exception:  # pylint: disable=broad-except
                logger.exception(f'Exception occurred with {future}')


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Usage: convert_parquet <input dir> <output_dir> <schema_dir>')
        sys.exit(1)

    input_dir = Path(sys.argv[1])
    output_dir = sys.argv[2]
    schemas = Schema(sys.argv[3])
    service = input_dir.parts[-1]
    svc_schema = SchemaForTable(service, schema=schemas)
    arrow_schema = svc_schema.get_arrow_schema()
    sqschema = svc_schema.get_raw_schema()

    logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
    logger = logging.getLogger('sq-converter')
    convert_dir(input_dir, output_dir, sqschema, arrow_schema)
Example #11
    def _get_combined_df(self, **kwargs):
        """OSPF has info divided across multiple tables. Get a single one"""

        columns = kwargs.pop('columns', ['default'])
        state = kwargs.pop('state', '')
        addnl_fields = kwargs.pop('addnl_fields', self.iobj.addnl_fields)
        addnl_nbr_fields = getattr(
            self.iobj, '_addnl_nbr_fields', ['state'])
        user_query = kwargs.pop('query_str', '')
        hostname = kwargs.pop('hostname', [])

        cols = self.schema.get_display_fields(columns)

        if columns == ['*']:
            cols.remove('sqvers')

        ifschema = SchemaForTable('ospfIf', schema=self.all_schemas)
        nbrschema = SchemaForTable('ospfNbr', schema=self.all_schemas)

        if columns not in [['default'], ['*']]:
            ifkeys = ifschema.key_fields()
            nbrkeys = nbrschema.key_fields()
            if_flds = ifschema.fields
            nbr_flds = nbrschema.fields

            ifcols = ifkeys
            nbrcols = nbrkeys
            for fld in columns:
                if fld in if_flds and fld not in ifcols:
                    ifcols.append(fld)
                elif fld in nbr_flds and fld not in nbrcols:
                    nbrcols.append(fld)
        else:
            ifcols = ifschema.get_display_fields(columns)
            nbrcols = nbrschema.get_display_fields(columns)

        state_query_dict = {
            'full': '(adjState == "full" or adjState == "passive")',
            'passive': '(adjState == "passive")',
            'other': '(adjState != "full" and adjState != "passive")',
            '!full': '(adjState != "full")',
            '!passive': '(adjState != "passive")',
            '!other': '(adjState == "full" or adjState == "passive")',
        }

        if state:
            query_str = state_query_dict.get(state, '')
            cond_prefix = ' and '
        else:
            query_str = ''
            cond_prefix = ''

        host_query_str = build_query_str([], ifschema, ignore_regex=False,
                                         hostname=hostname)
        if host_query_str:
            query_str += f'{cond_prefix}{host_query_str}'

        df = self.get_valid_df('ospfIf', addnl_fields=addnl_fields,
                               columns=ifcols, **kwargs)
        nbr_df = self.get_valid_df('ospfNbr', addnl_fields=addnl_nbr_fields,
                                   columns=nbrcols, **kwargs)
        if nbr_df.empty:
            return df

        merge_cols = [x for x in ['namespace', 'hostname', 'ifname']
                      if x in nbr_df.columns]
        # Merge the two tables
        df = df.merge(nbr_df, on=merge_cols, how='left')

        # This is because some NOS have the ipAddress in nbr table and some in
        # interface table. Nbr table wins over interface table if present
        if 'ipAddress_y' in df:
            df['ipAddress'] = np.where(
                df['ipAddress_y'] == "",
                df['ipAddress_x'], df['ipAddress_y'])
            df['ipAddress'] = np.where(df['ipAddress'], df['ipAddress'],
                                       df['ipAddress_x'])

        if columns == ['*']:
            df = df.drop(columns=['area_y', 'instance_y', 'vrf_y',
                                  'ipAddress_x', 'ipAddress_y', 'areaStub_y',
                                  'sqvers_x', 'timestamp_y'],
                         errors='ignore') \
                .rename(columns={
                    'instance_x': 'instance', 'areaStub_x': 'areaStub',
                    'area_x': 'area', 'vrf_x': 'vrf',
                    'state_x': 'ifState', 'state_y': 'adjState',
                    'active_x': 'active', 'timestamp_x': 'timestamp'})
        else:
            df = df.rename(columns={'vrf_x': 'vrf', 'area_x': 'area',
                                    'state_x': 'ifState',
                                    'state_y': 'adjState',
                                    'timestamp_x': 'timestamp'})
            df = df.drop(list(df.filter(regex='_y$')), axis=1) \
                   .drop(['ipAddress_x'], axis=1, errors='ignore') \
                   .fillna({'peerIP': '-', 'numChanges': 0,
                            'lastChangeTime': 0})

        if df.empty:
            return df

        if 'lastChangeTime' in df.columns:
            df['lastChangeTime'] = np.where(df.lastChangeTime == '-',
                                            0, df.lastChangeTime)
        # Fill the adjState column with passive if passive
        if 'passive' in df.columns:
            df.loc[df['adjState'].isnull(), 'adjState'] = df['passive']
            df.loc[df['adjState'].eq(True), 'adjState'] = 'passive'
            df.loc[df['adjState'].eq(False), 'adjState'] = 'fail'
            df.loc[df['adjState'] == 'passive', 'peerIP'] = ''
            df.loc[df['adjState'] == 'passive', 'peerRouterId'] = ''

            df.drop(columns=['passive'], inplace=True)

        df.bfill(axis=0, inplace=True)

        if 'peerHostname' in columns or (columns in [['*'], ['default']]):
            nfdf = df.query('adjState != "full"').reset_index()
            nfdf['peerHostname'] = ''
            newdf = df.query('adjState == "full"').reset_index() \
                .drop('peerHostname', axis=1, errors='ignore')
            if not newdf.empty:
                newdf['matchIP'] = newdf.ipAddress.str.split('/').str[0]
                newdf = newdf.merge(newdf[['namespace', 'hostname', 'vrf',
                                           'matchIP']],
                                    left_on=['namespace', 'vrf', 'peerIP'],
                                    right_on=['namespace', 'vrf', 'matchIP'],
                                    suffixes=["", "_y"]) \
                    .rename(columns={'hostname_y': 'peerHostname'}) \
                    .drop_duplicates(subset=['namespace', 'hostname',
                                             'vrf', 'ifname']) \
                    .drop(columns=['matchIP', 'matchIP_y', 'timestamp_y'],
                          errors='ignore')

                if newdf.empty:
                    newdf = df.query('adjState == "full"').reset_index()
                    newdf['peerHostname'] = ''
                final_df = pd.concat([nfdf, newdf])
            else:
                final_df = df
        else:
            final_df = df.drop(list(df.filter(regex='_y$')), axis=1) \
                         .rename(columns={'timestamp_x': 'timestamp'})

        if query_str:
            final_df = final_df.query(query_str).reset_index(drop=True)

        if user_query and not final_df.empty:
            final_df = self._handle_user_query_str(final_df, user_query)
        # Reorder to the selected display columns before returning
        return final_df[cols]
Example #12
class SqObject(SqPlugin):
    '''The base class for accessing the backend independent of the engine'''

    def __init__(self, engine_name: str = '',
                 hostname: typing.List[str] = None,
                 start_time: str = '', end_time: str = '',
                 view: str = '', namespace: typing.List[str] = None,
                 columns: typing.List[str] = None,
                 context=None, table: str = '', config_file=None) -> None:

        if not context:
            self.ctxt = SqContext(cfg=load_sq_config(validate=True,
                                                     config_file=config_file),
                                  engine=engine_name)
            self.ctxt.schemas = Schema(self.ctxt.cfg["schema-directory"])
        else:
            self.ctxt = context
            if not self.ctxt.cfg:
                self.ctxt.cfg = load_sq_config(validate=True,
                                               config_file=config_file)
                self.ctxt.schemas = Schema(self.ctxt.cfg["schema-directory"])
            if not self.ctxt.engine:
                self.ctxt.engine = engine_name

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()
        self._convert_args = {}

        self.namespace = namespace or self.ctxt.namespace or []
        self.hostname = hostname or self.ctxt.hostname or []
        self.start_time = start_time or self.ctxt.start_time
        self.end_time = end_time or self.ctxt.end_time

        view = view or self.ctxt.view

        if self.start_time and self.end_time and not view:
            self.view = 'all'
        else:
            self.view = view or 'latest'

        self.columns = columns or ['default']
        self._unique_def_column = ['hostname']

        if engine_name:
            self.engine = get_sqengine(engine_name, self._table)(self)
        elif self.ctxt.engine:
            self.engine = get_sqengine(self.ctxt.engine, self._table)(self)
        else:
            self.engine = None

        if not self.engine:
            raise ValueError('Unknown analysis engine')

        self.summarize_df = pd.DataFrame()

        self._addnl_filter = self._addnl_fields = []
        self._valid_get_args = self._valid_assert_args = []
        self._valid_arg_vals = self._valid_find_args = []
        self._valid_summarize_args = []

    @property
    def all_schemas(self):
        '''Return the set of all schemas of tables supported'''
        return self.ctxt.schemas

    @property
    def schema(self):
        '''Return table-specific schema'''
        return self._schema

    @property
    def cfg(self):
        '''Return general suzieq config'''
        return self._cfg

    @property
    def table(self):
        '''Return the table served by this object'''
        return self._table

    @property
    def addnl_fields(self):
        '''Return the additional fields field'''
        return self._addnl_fields

    @property
    def sort_fields(self):
        '''Return default list of fields to sort by'''
        return self._sort_fields

    def _check_input_for_valid_args(self, good_arg_list, **kwargs):
        '''Check that the provided set of kwargs is valid for the table'''
        if not good_arg_list:
            return

        # add standard args that are always valid
        good_arg_list = good_arg_list + ['namespace', 'addnl_fields']

        for arg in kwargs:
            if arg not in good_arg_list:
                raise AttributeError(
                    f"argument {arg} not supported for this command")

    def _check_input_for_valid_vals(self, good_arg_val_list, **kwargs):
        '''Check if the input is valid for the arg, if possible'''

        if not good_arg_val_list:
            return

        for arg, val in kwargs.items():
            if arg in good_arg_val_list:
                if val not in good_arg_val_list[arg]:
                    raise ValueError(
                        f"invalid value {val} for argument {arg}")

    def validate_get_input(self, **kwargs):
        '''Validate the values of the get function'''
        self._check_input_for_valid_args(
            self._valid_get_args + ['columns'], **kwargs)
        self._check_input_for_valid_vals(self._valid_arg_vals, **kwargs)

    def validate_assert_input(self, **kwargs):
        '''Validate the values of the assert function'''
        self._check_input_for_valid_args(self._valid_assert_args, **kwargs)

    def validate_summarize_input(self, **kwargs):
        '''Validate the values of the summarize function'''
        self._check_input_for_valid_args(self._valid_get_args, **kwargs)

    def validate_columns(self, columns: typing.List[str]) -> bool:
        """Validate that the provided columns are valid for the table

        Args:
            columns (List[str]): list of columns

        Returns:
            bool: True if columns are valid
        Raises:
            ValueError: if columns are invalid
        """

        if columns in [['default'], ['*']]:
            return True

        table_schema = SchemaForTable(self._table, self.all_schemas)
        invalid_columns = [x for x in columns if x not in table_schema.fields]
        if invalid_columns:
            raise ValueError(f"Invalid columns specified: {invalid_columns}")
        return True

    def get(self, **kwargs) -> pd.DataFrame:
        '''Return the data for this table given a set of attributes'''
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        if self._addnl_filter:
            kwargs['add_filter'] = self._addnl_filter

        # This raises exceptions if it fails
        try:
            self.validate_get_input(**kwargs)
        except (AttributeError, ValueError) as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        if 'columns' not in kwargs:
            kwargs['columns'] = self.columns or ['default']

        # This raises ValueError if it fails
        self.validate_columns(kwargs.get('columns', []))

        for k, v in self._convert_args.items():
            if v and k in kwargs:
                val = kwargs[k]
                newval = []
                if isinstance(val, list):
                    for ele in val:
                        ele = v(ele)
                        newval.append(ele)
                    kwargs[k] = newval
                elif isinstance(val, str):
                    kwargs[k] = v(val)

        return self.engine.get(**kwargs)

    def summarize(self, **kwargs) -> pd.DataFrame:
        '''Summarize the data from specific table'''
        if self.columns != ["default"]:
            self.summarize_df = pd.DataFrame(
                {'error':
                 ['ERROR: You cannot specify columns with summarize']})
            return self.summarize_df

        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        self.validate_summarize_input(**kwargs)

        return self.engine.summarize(**kwargs)

    def unique(self, **kwargs) -> pd.DataFrame:
        '''Identify unique values and value counts for a column in table'''
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        columns = kwargs.pop('columns', self.columns)

        if columns is None or columns == ['default']:
            columns = self._unique_def_column

        if len(columns) > 1 or columns == ['*']:
            raise ValueError('Specify a single column with unique')

        # This raises ValueError if it fails
        self.validate_columns(columns)
        self._check_input_for_valid_vals(self._valid_arg_vals, **kwargs)
        return self.engine.unique(**kwargs, columns=columns)

    def aver(self, **kwargs):
        '''Assert one or more checks on table'''
        if self._valid_assert_args:
            return self._assert_if_supported(**kwargs)

        raise NotImplementedError

    def top(self, what: str = '', count: int = 5, reverse: bool = False,
            **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""

        columns = kwargs.get('columns', ['default'])
        # This raises ValueError if it fails
        self.validate_columns(columns)

        if not what:
            raise ValueError('Must specify what field to get top for')
        # if self._valid_get_args:
        #     self._valid_get_args += ['what', 'n', 'reverse']
        # This raises exceptions if it fails
        try:
            self.validate_get_input(**kwargs)
        except (ValueError, AttributeError) as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        # This raises ValueError if it fails
        table_schema = SchemaForTable(self._table, self.all_schemas)
        if not self._field_exists(table_schema, what):
            raise ValueError(
                f"Field {what} does not exist in table {self.table}")

        columns = table_schema.get_display_fields(columns)

        ftype = table_schema.field(what).get('type', 'str')
        if ftype not in ['long', 'double', 'float', 'int', 'timestamp',
                         'timedelta64[s]']:
            return pd.DataFrame({'error':
                                 [f'{what} not numeric; top can be used with'
                                  f' numeric fields only']})

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine.top(what=what, count=count, reverse=reverse,
                               **kwargs)

    def describe(self, **kwargs):
        """Describes the fields for a given table"""

        table = kwargs.get('table', self.table)

        cols = kwargs.get('columns', ['default'])
        if cols not in [['default'], ['*']]:
            df = pd.DataFrame(
                {'error': ['ERROR: cannot specify column names for describe']})
            return df

        try:
            sch = SchemaForTable(table, self.all_schemas)
        except ValueError:
            sch = None
        if not sch:
            df = pd.DataFrame(
                {'error': [f'ERROR: incorrect table name {table}']})
            return df

        entries = [{'name': x['name'], 'type': x['type'],
                    'key': x.get('key', ''),
                    'display': x.get('display', ''),
                    'description': x.get('description', '')}
                   for x in sch.get_raw_schema()]
        df = pd.DataFrame.from_dict(entries).sort_values('name')

        query_str = kwargs.get('query_str', '')
        if query_str:
            return df.query(query_str).reset_index(drop=True)

        return df

    def get_table_info(self, table: str, **kwargs) -> pd.DataFrame:
        """Get some basic stats about the table from the database

        Args:
            table (str): The table to get stats for

        Returns:
            pd.DataFrame: A dataframe with the stats
        """
        # This raises ValueError if it fails
        self.validate_columns(kwargs.get('columns', ['default']))

        return self.engine.get_table_info(table, **kwargs)

    def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
        '''Humanize the fields for human consumption.

        Individual classes will implement the right transformations. This
        routine is just a placeholder for all those with nothing to modify.
        '''
        if 'timestamp' in df.columns and not df.empty:
            df['timestamp'] = humanize_timestamp(df.timestamp,
                                                 self.cfg.get('analyzer', {})
                                                 .get('timezone', None))

        return df

    def _field_exists(self, table_schema: SchemaForTable, field: str) -> bool:
        """Check if a field exists in the schema

        Args:
            table_schema (SchemaForTable): The schema for the table
            field (str): the field name we're checking for

        Returns:
            bool: True if the field exists, False otherwise
        """
        return bool(table_schema.field(field))

    def _assert_if_supported(self, **kwargs):
        '''Common sqobj routine for a table that supports asserts

           Do not call this routine directly
        '''

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')
        try:
            self.validate_assert_input(**kwargs)
        except AttributeError as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        if self.columns in [['*'], ['default']]:
            req_cols = None
        else:
            req_cols = self.schema.get_display_fields(self.columns)
            if not req_cols:
                # Till we add a schema object for assert columns,
                # this will have to do
                req_cols = self.columns

        df = self.engine.aver(**kwargs)
        if not df.empty and req_cols:

            req_col_set = set(req_cols)
            got_col_set = set(df.columns)
            diff_cols = req_col_set - got_col_set
            if diff_cols:
                return pd.DataFrame(
                    {'error': [f'columns {list(diff_cols)} not in dataframe']})

            if 'assert' not in req_cols:
                req_cols.append('assert')

            df = df[req_cols]

        return df
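
As a hedged illustration of how this base class is typically specialized, a
table-specific object mostly just fixes the table name and the argument
allow-lists; the subclass name, table and argument list below are examples,
not taken from the original.

class DeviceObj(SqObject):
    '''Illustrative subclass for a hypothetical "device" table'''

    def __init__(self, **kwargs) -> None:
        super().__init__(table='device', **kwargs)
        self._valid_get_args = ['namespace', 'hostname', 'columns',
                                'query_str']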
Example #13
    if 'notifcnReason' in df.columns:
        df = df.rename(columns={'notifcnReason': 'notificnReason'})

    pq.write_to_dataset(
        table,
        root_path=output_dir,
        partition_cols=partition_cols,
        version="2.0",
        compression='ZSTD',
        row_group_size=100000,
    )

    logger.info(f'Wrote converted {input_dir}')


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Usage: convert_parquet <input dir> <output_dir> <schema_dir>')
        sys.exit(1)

    input_dir = Path(sys.argv[1])
    output_dir = sys.argv[2]
    schemas = Schema(sys.argv[3])
    service = input_dir.parts[-1]
    svc_schema = SchemaForTable(service, schema=schemas)

    logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
    logger = logging.getLogger('sq-converter')
    convert_dir(input_dir, output_dir, svc_schema)
Example #14
    def get_valid_df(self, table: str, **kwargs) -> pd.DataFrame:
        """The heart of the engine: retrieving the data from the backing store

        Args:
            table (str): Name of the table to retrieve the data for

        Returns:
            pd.DataFrame: The data as a pandas dataframe
        """
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        # Thanks to things like OSPF, we cannot use self.schema here
        sch = SchemaForTable(table, self.all_schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)
        hostname = kwargs.pop('hostname', [])

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if columns == ['*']:
            drop_cols.append('sqvers')

        aug_fields = sch.get_augmented_fields()

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields + addnl_fields:
            addnl_fields.append('active')
            if view != 'all':
                drop_cols.append('active')

        # Order matters. Don't put this before the missing key fields insert
        for f in aug_fields:
            dep_fields = sch.get_parent_fields(f)
            addnl_fields += dep_fields

        for fld in key_fields:
            if fld not in fields + addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        if self.iobj.start_time:
            try:
                start_time = int(
                    dateparser.parse(
                        self.iobj.start_time.replace(
                            'last night', 'yesterday')).timestamp() * 1000)
            except Exception:
                # pylint: disable=raise-missing-from
                raise ValueError(
                    f"unable to parse start-time: {self.iobj.start_time}")
        else:
            start_time = ''

        if self.iobj.start_time and not start_time:
            # Something went wrong with our parsing
            # pylint: disable=raise-missing-from
            raise ValueError(
                f"unable to parse start-time: {self.iobj.start_time}")

        if self.iobj.end_time:
            try:
                end_time = int(
                    dateparser.parse(
                        self.iobj.end_time.replace(
                            'last night', 'yesterday')).timestamp() * 1000)
            except Exception:
                # pylint: disable=raise-missing-from
                raise ValueError(
                    f"unable to parse end-time: {self.iobj.end_time}")
        else:
            end_time = ''

        if self.iobj.end_time and not end_time:
            # Something went wrong with our parsing
            # pylint: disable=raise-missing-from
            raise ValueError(f"unable to parse end-time: {self.iobj.end_time}")

        table_df = self._dbeng.read(phy_table,
                                    'pandas',
                                    start_time=start_time,
                                    end_time=end_time,
                                    columns=fields,
                                    view=view,
                                    key_fields=key_fields,
                                    **kwargs)

        if not table_df.empty:
            # hostname may not have been filtered if using regex
            if hostname:
                hdf_list = []
                for hn in hostname:
                    if hn.startswith('~'):
                        hn = hn[1:]
                    df1 = table_df.query(f"hostname.str.match('{hn}')")
                    if not df1.empty:
                        hdf_list.append(df1)

                if hdf_list:
                    table_df = pd.concat(hdf_list)
                else:
                    return pd.DataFrame(columns=table_df.columns.tolist())

            if view == "all" or not active_only:
                table_df.drop(columns=drop_cols, inplace=True)
            else:
                table_df = table_df.query('active') \
                    .drop(columns=drop_cols)
            if 'timestamp' in table_df.columns and not table_df.empty:
                table_df['timestamp'] = humanize_timestamp(
                    table_df.timestamp,
                    self.cfg.get('analyzer', {}).get('timezone', None))

        return table_df
Example #15
    def migrate(self, table_name: str, schema: SchemaForTable) -> None:
        """Migrates the data for the table specified to latest version

        :param table_name: str, The name of the table to migrate
        :param schema: SchemaForTable, the current schema
        :returns: None
        :rtype: None
        """

        current_vers = schema.version
        defvals = self._get_default_vals()
        arrow_schema = schema.get_arrow_schema()
        schema_def = dict(zip(arrow_schema.names, arrow_schema.types))

        # pylint: disable=too-many-nested-blocks
        for sqvers in self._get_avail_sqvers(table_name, True):
            if sqvers != current_vers:
                migrate_rtn = get_migrate_fn(table_name, sqvers, current_vers)
                if migrate_rtn:
                    dataset = self._get_cp_dataset(table_name, True, sqvers,
                                                   'all', '', '')
                    for item in dataset.files:
                        try:
                            namespace = item.split('namespace=')[1] \
                                .split('/')[0]
                        except IndexError:
                            # Don't convert data not in our template
                            continue

                        df = pd.read_parquet(item)
                        df['sqvers'] = sqvers
                        df['namespace'] = namespace
                        newdf = migrate_rtn(df)

                        cols = newdf.columns
                        # Ensure all fields are present
                        for field in schema_def:
                            if field not in cols:
                                newdf[field] = defvals.get(
                                    schema_def[field], '')

                        newdf.drop(columns=['namespace', 'sqvers'])

                        newitem = item.replace(f'sqvers={sqvers}',
                                               f'sqvers={current_vers}')
                        newdir = os.path.dirname(newitem)
                        if not os.path.exists(newdir):
                            os.makedirs(newdir, exist_ok=True)

                        table = pa.Table.from_pandas(
                            newdf,
                            schema=schema.get_arrow_schema(),
                            preserve_index=False)
                        pq.write_to_dataset(table,
                                            newitem,
                                            version="2.0",
                                            compression="ZSTD",
                                            row_group_size=100000)
                        self.logger.debug(f'Migrated {item} version {sqvers}->'
                                          f'{current_vers}')
                        os.remove(item)

                    rmtree(
                        f'{self._get_table_directory(table_name, True)}/'
                        f'sqvers={sqvers}',
                        ignore_errors=True)
Example #16
    def coalesce(self,
                 tables: List[str] = None,
                 period: str = '',
                 ign_sqpoller: bool = False) -> Optional[List]:
        """Coalesce all the resource parquet files in specified folder.

        This routine does not run periodically. It runs once and returns.

        :param tables: List[str], List of specific tables to coalesce,
                       empty for all
        :param period: str, coalescing period, needed for various internal
                       stuff
        :param ign_sqpoller: True if it's OK to coalesce even when sqPoller
                             records are absent
        :returns: list of coalescing statistics, one per table, or None
        :rtype: Optional[List[SqCoalesceStats]]
        """

        infolder = self.cfg['data-directory']
        outfolder = self._get_table_directory('', True)  # root folder
        archive_folder = self.cfg.get('coalescer', {}) \
            .get('archive-directory',
                 f'{infolder}/_archived')

        if not period:
            period = self.cfg.get('coalescer', {
                'period': '1h'
            }).get('period', '1h')
        schemas = Schema(self.cfg.get('schema-directory'))
        state = SqCoalesceState(self.logger, period)

        state.logger = self.logger
        # Trying to be complete here. The ignore prefixes assume you have
        # coalescers running across multiple time periods, and so we need
        # to ignore the files created by the longer-period runs. In other
        # words, the weekly coalescer should ignore monthly and yearly
        # coalesced files, the monthly coalescer should ignore the yearly
        # coalescer's files, and so on.
        try:
            timeint = int(period[:-1])
            time_unit = period[-1]
            if time_unit == 'm':
                run_int = timedelta(minutes=timeint)
                state.prefix = 'sqc-m-'
                state.ign_pfx = ['.', '_', 'sqc-']
            elif time_unit == 'h':
                run_int = timedelta(hours=timeint)
                state.prefix = 'sqc-h-'
                state.ign_pfx = [
                    '.', '_', 'sqc-y-', 'sqc-d-', 'sqc-w-', 'sqc-M-'
                ]
            elif time_unit == 'd':
                run_int = timedelta(days=timeint)
                if timeint > 364:
                    state.prefix = 'sqc-y-'
                    state.ign_pfx = ['.', '_', 'sqc-y-']
                elif timeint > 29:
                    state.prefix = 'sqc-M-'
                    state.ign_pfx = ['.', '_', 'sqc-M-', 'sqc-y-']
                else:
                    state.prefix = 'sqc-d-'
                    state.ign_pfx = [
                        '.', '_', 'sqc-m-', 'sqc-d-', 'sqc-w-', 'sqc-M-',
                        'sqc-y-'
                    ]
            elif time_unit == 'w':
                run_int = timedelta(weeks=timeint)
                state.prefix = 'sqc-w-'
                state.ign_pfx = ['.', '_', 'sqc-w-', 'sqc-m-', 'sqc-y-']
            else:
                logging.error(f'Invalid unit for period, {time_unit}, '
                              'must be one of m/h/d/w')
                return None
        except ValueError:
            logging.error(f'Invalid time, {period}')
            return None

        state.period = run_int
        # Create list of tables to coalesce.
        # TODO: Verify that we're only coalescing parquet tables here
        if tables:
            tables = [
                x for x in tables if x in schemas.tables() and (
                    schemas.type_for_table(x) != "derivedRecord")
            ]
        else:
            tables = [
                x for x in schemas.tables()
                if schemas.type_for_table(x) != "derivedRecord"
            ]
        if 'sqPoller' not in tables and not ign_sqpoller:
            # This is an error. sqPoller keeps track of discontinuities
            # among other things.
            self.logger.error(
                'No sqPoller data, cannot compute discontinuities')
            return None
        else:
            # We want sqPoller to be first to compute discontinuities
            with suppress(ValueError):
                tables.remove('sqPoller')
            if not ign_sqpoller:
                tables.insert(0, 'sqPoller')

        # We've forced the sqPoller to be always the first table to coalesce
        stats = []
        for entry in tables:
            table_outfolder = f'{outfolder}/{entry}'
            table_infolder = f'{infolder}/{entry}'
            if archive_folder:
                table_archive_folder = f'{archive_folder}/{entry}'
            else:
                table_archive_folder = None
            state.current_df = pd.DataFrame()
            state.dbeng = self
            state.schema = SchemaForTable(entry, schemas, None)
            if not os.path.isdir(table_infolder):
                self.logger.info(f'No input records to coalesce for {entry}')
                continue
            start = time()
            try:
                if not os.path.isdir(table_outfolder):
                    os.makedirs(table_outfolder)
                if (table_archive_folder
                        and not os.path.isdir(table_archive_folder)):
                    os.makedirs(table_archive_folder, exist_ok=True)
                # Migrate the data if needed
                self.logger.debug(f'Migrating data for {entry}')
                self.migrate(entry, state.schema)
                self.logger.debug(f'Migrated data for {entry}')
                start = time()
                coalesce_resource_table(table_infolder, table_outfolder,
                                        table_archive_folder, entry, state)
                end = time()
                self.logger.info(f'coalesced {state.wrfile_count} '
                                 f'files/{state.wrrec_count} '
                                 f'records of {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(end - start), state.wrfile_count,
                        state.wrrec_count,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))
            except Exception:  # pylint: disable=broad-except
                self.logger.exception(f'Unable to coalesce table {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(time() - start), 0, 0,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))

        return stats
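
A hedged sketch of invoking coalesce() directly for a one-shot run, assuming
the method lives on the db engine object returned by get_sqdb_engine (as the
other examples use it); the module paths below are assumptions.

from suzieq.db import get_sqdb_engine            # path is an assumption
from suzieq.shared.utils import load_sq_config   # path is an assumption

cfg = load_sq_config(validate=True)
dbeng = get_sqdb_engine(cfg, 'sqCoalescer', None, None)
stats = dbeng.coalesce(tables=['routes'], period='1h', ign_sqpoller=False)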