def clean_up_before_shutdown():
    global baskerville_engine, logger
    if not logger:
        logger = get_logger('clean_up_before_shutdown')
    logger.info('Just a sec, finishing up...')
    if baskerville_engine:
        logger.info('Finishing up Baskerville...')
        baskerville_engine.finish_up()
    for each in PROCESS_LIST:
        each.terminate()
        each.join()
        logger.info(f'Stopped {each.name}...')
def __init__(self, engine_conf):
    self.all_features = engine_conf.all_features
    self.extra_features = engine_conf.extra_features
    self.active_features = None
    self.active_feature_names = None
    self.updateable_active_features = None
    self.active_columns = None
    self.update_feature_cols = None
    self.column_renamings = None
    self.pre_group_by_calculations = None
    self.post_group_by_calculations = None
    self.logger = get_logger(
        self.__class__.__name__,
        logging_level=engine_conf.log_level,
        output_file=engine_conf.logpath
    )
def __init__(self, db_conf, engine_conf, clean_up):
    self.runtime = None
    # todo: does not belong here anymore - see feature manager
    self.active_features = None
    self.step_to_action = None
    self.remaining_steps = None
    self.logs_df = None
    self.db_conf = db_conf
    self.engine_conf = engine_conf
    self.all_features = self.engine_conf.all_features
    self.clean_up = clean_up
    self.db_url = get_jdbc_url(self.db_conf)
    self.logger = get_logger(
        self.__class__.__name__,
        logging_level=self.engine_conf.log_level,
        output_file=self.engine_conf.logpath
    )
def __init__(self, run_type, conf, register_metrics=True):
    super(BaskervilleAnalyticsEngine, self).__init__(conf)
    self.run_type = run_type
    self.pipeline = None
    self.performance_stats = None

    # set config's logger
    BaskervilleConfig.set_logger(
        conf['engine']['log_level'], conf['engine']['logpath']
    )
    self.config = BaskervilleConfig(self.config).validate()

    self.register_metrics = (
        self.config.engine.metrics and register_metrics
    )

    self.logger = get_logger(
        self.__class__.__name__,
        logging_level=conf['engine']['log_level'],
        output_file=conf['engine']['logpath']
    )
def __init__(self, cache_config, table_name, columns_to_keep,
             expire_if_longer_than=3600, logger=None,
             session_getter=get_spark_session,
             group_by_fields=('target', 'ip'),
             format_='parquet', path='request_set_cache'):
    self.__cache = None
    self.__persistent_cache = None
    self.schema = None
    self.cache_config = cache_config
    self.table_name = table_name
    self.columns_to_keep = columns_to_keep
    self.expire_if_longer_than = expire_if_longer_than
    self.logger = logger if logger else get_logger(self.__class__.__name__)
    self.session_getter = session_getter
    self.group_by_fields = group_by_fields
    self.format_ = format_
    self.storage_level = StorageLevel.CUSTOM
    self.column_renamings = {
        'first_ever_request': 'start',
        'old_subset_count': 'subset_count',
        'old_features': 'features',
        'old_num_requests': 'num_requests',
    }
    self._count = 0
    self._last_updated = datetime.datetime.utcnow()
    self._changed = False

    self.file_manager = FileManager(path, self.session_getter())
    self.file_name = os.path.join(
        path, f'{self.__class__.__name__}.{self.format_}')
    self.temp_file_name = os.path.join(
        path, f'{self.__class__.__name__}temp.{self.format_}')

    if self.file_manager.path_exists(self.file_name):
        self.file_manager.delete_path(self.file_name)
    if self.file_manager.path_exists(self.temp_file_name):
        self.file_manager.delete_path(self.temp_file_name)
def main():
    """
    Parse the Baskerville command line arguments and start the engine.
    :return:
    """
    global baskerville_engine, logger
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "pipeline",
        help="Pipeline to use: es, rawlog, or kafka",
    )
    parser.add_argument(
        "-s", "--simulate",
        dest="simulate",
        action="store_true",
        help="Simulate a real-time run using kafka",
    )
    parser.add_argument(
        "-e", "--startexporter",
        dest="start_exporter",
        action="store_true",
        help="Start the Baskerville Prometheus exporter at the port "
             "specified in the configuration",
    )
    parser.add_argument(
        "-t", "--testmodel",
        dest="test_model",
        help="Add a test model in the models table",
        default=False,
        action="store_true"
    )
    parser.add_argument(
        "-c", "--conf",
        action="store",
        dest="conf_file",
        default=os.path.join(src_dir, '..', 'conf', 'baskerville.yaml'),
        help="Path to config file"
    )

    args = parser.parse_args()
    conf = parse_config(path=args.conf_file)

    baskerville_engine = BaskervilleAnalyticsEngine(
        args.pipeline, conf, register_metrics=args.start_exporter
    )
    logger = get_logger(
        __name__,
        logging_level=baskerville_engine.config.engine.log_level,
        output_file=baskerville_engine.config.engine.logpath
    )

    # start simulation if specified
    if args.simulate:
        spark = None
        if baskerville_engine.config.engine.use_spark:
            from baskerville.spark import get_spark_session
            spark = get_spark_session()  # baskerville.pipeline.spark
        logger.info('Starting simulation...')
        run_simulation(baskerville_engine.config, spark)

    # start the Baskerville Prometheus exporter if specified
    if args.start_exporter:
        if not baskerville_engine.config.engine.metrics:
            raise RuntimeError('Cannot start exporter without metrics config')
        port = baskerville_engine.config.engine.metrics.port
        start_http_server(port)
        logger.info(f'Starting Baskerville Exporter at '
                    f'http://localhost:{port}')

    # populate with test data if specified
    if args.test_model:
        add_model_to_database(conf['database'])

    for p in PROCESS_LIST[::-1]:
        print(f"{p.name} starting...")
        p.start()

    logger.info('Starting Baskerville Engine...')
    baskerville_engine.run()
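# Hedged CLI sketch (the entry-point module name below is an assumption;
# adjust it to this repository's actual script):
#
#   python -m baskerville.main kafka -e -c conf/baskerville.yaml
#   python -m baskerville.main rawlog -t
#
# The positional argument picks the pipeline (es, rawlog or kafka); -s runs
# the kafka simulation, -e starts the Prometheus exporter on the configured
# port, -t inserts a test model row, and -c points at the YAML config file.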
def set_logger(cls, log_level='INFO', log_path='baskerville.log'):
    global logger
    logger = get_logger(
        cls.__name__,
        logging_level=log_level,
        output_file=log_path
    )
# LICENSE file in the root directory of this source tree.

import json
import os
import warnings
from datetime import datetime
from functools import wraps

import dateutil

from baskerville.util.enums import ModelEnum
from baskerville.util.helpers import get_logger, get_default_data_path, \
    SerializableMixin
from dateutil.tz import tzutc
from baskerville.features import FEATURES

logger = get_logger(__name__)


class ConfigError(Exception, SerializableMixin):
    """
    Custom Error to be used in the configuration error report
    """

    def __init__(self, message, fields, exception_type=ValueError):
        if isinstance(fields, str):
            fields = [fields]
        self.args = message, fields, exception_type.__name__

    def __str__(self):
        m, f, e = self.args
        return f'({e}, `field(s)`: {",".join(f)}){m} '
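# Hedged usage sketch (not part of the original module) showing how
# ConfigError renders its message; the field names below are illustrative:
#
#   err = ConfigError('value is required', ['log_level', 'logpath'])
#   str(err)  # -> "(ValueError, `field(s)`: log_level,logpath)value is required "
#
# A single field name may be passed as a plain string, and a different
# exception_type (e.g. TypeError) changes the reported exception name.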
def __init__(self):
    self.__registry = {}
    self.logger = get_logger(self.__class__.__name__)
def maintain_db():
    """
    Runs the partitioning and archive scripts
    :return:
    """
    # todo: this can fail silently
    baskerville_root = os.environ.get(
        'BASKERVILLE_ROOT', '../../../../baskerville'
    )
    # we need the current config for the database details
    config = parse_config(path=f'{baskerville_root}/conf/baskerville.yaml')
    logger = get_logger(
        __name__,
        logging_level=config['engine']['log_level'],
        output_file=config['engine']['logpath']
    )
    db_config = DatabaseConfig(config['database']).validate()

    if db_config.maintenance.partition_by != 'week':
        raise NotImplementedError(
            f'Partition by {db_config.maintenance.partition_by} '
            f'is not yet implemented'
        )

    # maintenance runs every Sunday, so now should be Sunday night;
    # move to the start of Monday
    now = datetime.utcnow()
    y, w, _ = now.isocalendar()
    partition_start_week = isoweek.Week(y, w + 1)
    start = datetime.combine(
        partition_start_week.monday(), datetime.min.time()
    )
    end = datetime.combine(
        partition_start_week.sunday(), datetime.max.time()
    )
    logger.info(f'Data Partition Start : {start}')

    diy = get_days_in_year(end.year)
    latest_archive_date = end - timedelta(days=diy)
    latest_archive_year, latest_archive_week, _ = \
        latest_archive_date.isocalendar()
    print(latest_archive_week, latest_archive_year)

    if latest_archive_week > 1:
        latest_archive_week = latest_archive_week - 1
    else:
        latest_archive_week = isoweek.Week.last_week_of_year(
            latest_archive_year - 1
        ).week
        latest_archive_year = latest_archive_year - 1

    week = isoweek.Week(latest_archive_year, latest_archive_week)
    print(week)

    db_config.maintenance.data_partition.since = start
    db_config.maintenance.data_partition.until = (
        start + timedelta(days=6)
    ).replace(
        hour=23, minute=59, second=59
    )
    db_config.maintenance.data_archive.since = datetime.combine(
        week.monday(), datetime.min.time()
    )
    db_config.maintenance.data_archive.until = datetime.combine(
        week.sunday(), datetime.max.time()
    )
    print(db_config.maintenance.data_partition)
    print(db_config.maintenance.data_archive)

    # get sql scripts
    partition_sql = get_temporal_partitions(db_config.maintenance)
    archive_sql = get_archive_script(
        latest_archive_date - timedelta(weeks=1),
        latest_archive_date
    )
    logger.debug(partition_sql)
    logger.debug(archive_sql)

    session, engine = set_up_db(db_config.__dict__, create=False)

    try:
        # create partitions
        session.execute(partition_sql)
        session.commit()
        print('Partitioning done')
        # detach partitions older than a year and attach them to the
        # archive table
        session.execute(archive_sql)
        session.commit()
        print('Archive done')
    except SQLAlchemyError as e:
        traceback.print_exc()
        session.rollback()
        logger.error(f'Error executing maintenance: {e}')
    finally:
        session.close()
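# Minimal standalone sketch (not part of the original script) of the
# week-window arithmetic used in maintain_db; it assumes only the
# `isoweek` package and the standard library, and the run date is
# illustrative.
from datetime import datetime, timedelta

import isoweek

run_date = datetime(2020, 3, 1)              # a hypothetical Sunday run
year, week, _ = run_date.isocalendar()
next_week = isoweek.Week(year, week + 1)     # isoweek normalizes year rollover
since = datetime.combine(next_week.monday(), datetime.min.time())
until = (since + timedelta(days=6)).replace(hour=23, minute=59, second=59)
print(since, until)                          # Monday 00:00:00 .. Sunday 23:59:59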