Example #1
    def connect_to_db(self):
        """
        This connection to the db will live for the lifetime of the
        bothound_tools instance and will be used to save data back to the db
        """

        self.session, self.engine = set_up_db(self.conf.__dict__)
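
Every example here revolves around set_up_db, which returns a (session,
engine) pair. For orientation, a minimal sketch of a typical call; the
connection values are placeholders, and the config keys follow the example
dicts in Example #8's docstring:

from baskerville.db import set_up_db

db_cfg = {
    'name': 'baskerville',   # placeholder database name
    'user': 'user',          # placeholder credentials
    'password': 'pass',
    'host': '127.0.0.1',
    'port': 5432,
    'type': 'postgres',
}
session, engine = set_up_db(db_cfg)
try:
    pass  # query/update through the session here
finally:
    session.close()
    engine.dispose()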
Example #2
def delete_by(past_to_current_ids, db_conf):
    """
    Delete the old request_sets (that refer to the same target ip pairs as the
    current request_sets)
    :param list[tuple[int, int]] past_to_current_ids: (past_id, current_id)
    pairs; the request_sets matching the past ids are deleted
    :param str db_conf: the JSON-serialized database configuration
    :return: True if everything went well, False otherwise
    :rtype: bool
    """
    import json
    from baskerville.db import set_up_db
    from baskerville.db.models import RequestSet

    db_conf = json.loads(db_conf)
    session, engine = set_up_db(db_conf, False)

    try:
        session.query(RequestSet).filter(
            RequestSet.id.in_([x[0] for x in past_to_current_ids])
        ).delete(synchronize_session=False)
        session.commit()
    except Exception:
        import traceback
        traceback.print_exc()
        session.rollback()  # do we want to rollback here?
        session.close()
        engine.dispose()
        return False

    session.close()
    engine.dispose()

    return True
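
delete_by expects the configuration as a JSON string (handy for shipping it
to worker processes), so a caller serializes it first. A hypothetical
invocation; the ids and connection values are made up:

import json

db_conf = json.dumps({
    'name': 'baskerville', 'user': 'user', 'password': 'pass',
    'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
})
# delete the old request_sets 1 and 2, superseded by 10 and 20
ok = delete_by([(1, 10), (2, 20)], db_conf)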
Example #3
def add_model_to_database(database_config):
    """
    Load the test model and save it in the database
    :param dict[str, T] database_config:
    :return:
    """
    import os
    import json
    from datetime import datetime
    from dateutil.tz import tzutc
    from baskerville.db import set_up_db
    from baskerville.db.models import Model
    from baskerville.models.config import DatabaseConfig
    from baskerville.models.anomaly_model_sklearn import AnomalyModelSklearn
    # assumed import path for the helper that returns the data directory
    from baskerville.util.helpers import get_default_data_path

    global logger
    path = os.path.join(get_default_data_path(), 'samples', 'models', 'AnomalyModel')
    logger.info(f'Loading test model from: {path}')
    model = AnomalyModelSklearn()
    model.load(path=path)

    db_cfg = DatabaseConfig(database_config).validate()
    session, _ = set_up_db(db_cfg.__dict__, partition=False)

    db_model = Model()
    db_model.algorithm = 'baskerville.models.anomaly_model_sklearn.AnomalyModelSklearn'
    db_model.created_at = datetime.now(tz=tzutc())
    db_model.parameters = json.dumps(model.get_params())
    db_model.classifier = bytearray(path.encode('utf8'))
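    # note: classifier stores the utf-8 bytes of the model's path on disk,
    # not the pickled model itself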

    # save to db
    session.add(db_model)
    session.commit()
    session.close()
Example #4
def update_request_set_by_id(id, stop, features, prediction, subset_count,
                             num_requests, db_conf):
    """
    Updates a RequestSet by id
    :param int id: the request_set id
    :param datetime.datetime stop: current subset's stop datetime
    :param dict features: latest features
    :param float prediction: the current prediction
    :param int subset_count: the subset count
    :param int num_requests: the current subset's num_requests
    :param str db_conf: the JSON-serialized database configuration
    :return: True if the update succeeded, False otherwise
    :rtype: bool
    """
    import json
    import traceback
    from baskerville.db import set_up_db
    from baskerville.db.models import RequestSet

    db_conf = json.loads(db_conf)
    session, engine = set_up_db(db_conf, False)

    stop = stop.replace(tzinfo=tzutc())
    values = {
        "stop": stop,
        "features": features,
        "prediction": prediction,
        "subset_count": subset_count,
        "num_requests": num_requests,
    }
    try:
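        # the commented-out blocks below are alternative update paths (a
        # Core-level UPDATE via engine.connect() and bulk_update_mappings)
        # kept for reference; the ORM query-level update is the one in use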
        # with engine.connect() as conn:
        #     stmt = RequestSet.__table__.update(). \
        #         values(values).where(RequestSet.id == id)
        #     conn.execute(stmt)
        #
        # session.bulk_update_mappings(RequestSet, [values])
        session.query(RequestSet).filter(RequestSet.id == id).update(values)
        session.flush()
        session.commit()
    except Exception:
        traceback.print_exc()
        session.rollback()
        session.close()
        engine.dispose()
        return False

    session.close()
    engine.dispose()

    return True
Example #5
def bulk_update_request_sets(id, stop, features, prediction, subset_count,
                             num_requests, total_seconds, db_conf):
    """
    Bulk-updates RequestSets: the arguments are parallel lists, one entry
    per request_set, zipped row by row with zip_longest.
    :param str db_conf: the JSON-serialized database configuration
    :return: True if everything went well, False otherwise
    :rtype: bool
    """
    import json
    import traceback
    from itertools import zip_longest
    from baskerville.db import set_up_db
    from baskerville.db.models import RequestSet

    db_conf = json.loads(db_conf)

    # https://stackoverflow.com/questions/13125236/sqlalchemy-psycopg2-and-postgresql-copy
    # https://www.endpoint.com/blog/2014/04/11/speeding-up-saving-millions-of-orm

    session, engine = set_up_db(db_conf, False)
    req_per_row = list(
        zip_longest(id, stop, features, prediction, subset_count, num_requests,
                    total_seconds))
    try:
        for request_set in req_per_row:
            values = {
                "stop": request_set[1],
                "features": request_set[2],
                "prediction": request_set[3],
                "subset_count": request_set[4],
                "num_requests": request_set[5],
                "total_seconds": request_set[6]
            }
            session.query(RequestSet).filter(
                RequestSet.id == request_set[0]).update(values)
        session.flush()
        session.commit()
    except Exception:
        traceback.print_exc()
        session.rollback()
        session.close()
        engine.dispose()
        return False

    session.close()
    engine.dispose()

    return True
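
Because the rows are zipped with zip_longest, a shorter list is padded with
None, which would overwrite those columns with NULL; callers should pass
lists of equal length. A hypothetical call, reusing the serialized db_conf
from the sketch after Example #2:

from datetime import datetime

ok = bulk_update_request_sets(
    id=[1, 2],
    stop=[datetime.utcnow(), datetime.utcnow()],
    features=[{'f1': 0.1}, {'f1': 0.7}],
    prediction=[0.0, 1.0],
    subset_count=[3, 4],
    num_requests=[120, 98],
    total_seconds=[60.0, 55.5],
    db_conf=db_conf,
)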
Example #6
def cross_reference_misp(ip, db_conf):
    """
    Cross-references an IP with the MISP attributes stored in the database;
    a match is labeled as malicious.
    :param str ip: the IP to look up
    :param str db_conf: the JSON-serialized database configuration
    :return: (label, attribute id), or (None, None) if the IP is unknown
    """
    import json
    from baskerville.db import set_up_db
    from baskerville.db.models import Attribute
    # assumed import path for LabelEnum, which the original snippet uses
    # without importing
    from baskerville.util.enums import LabelEnum

    db_conf = json.loads(db_conf)
    session, engine = set_up_db(db_conf, False)

    attribute = session.query(Attribute).filter(Attribute.value == ip).first()

    label = None
    id_attribute = None
    if attribute:
        label = LabelEnum.malicious.value
        id_attribute = attribute.id

    session.close()
    engine.dispose()

    return label, id_attribute
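
A hypothetical lookup: an IP present in the MISP attributes comes back
labeled malicious, an unknown one as (None, None). The IP below is a
made-up documentation address:

label, id_attribute = cross_reference_misp('203.0.113.7', db_conf)
if label is not None:
    print(f'malicious, attribute id: {id_attribute}')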
Example #7
def maintain_db():
    """
    Runs the partitioning and archive scripts
    :return:
    """
    # todo: this can fail silently
    baskerville_root = os.environ.get(
        'BASKERVILLE_ROOT', '../../../../baskerville'
    )
    # we need the current config for the database details
    config = parse_config(path=f'{baskerville_root}/conf/baskerville.yaml')
    logger = get_logger(
        __name__,
        logging_level=config['engine']['log_level'],
        output_file=config['engine']['logpath']
    )
    db_config = DatabaseConfig(config['database']).validate()

    if db_config.maintenance.partition_by != 'week':
        raise NotImplementedError(
            f'Partition by {db_config.maintenance.partition_by} '
            f'is not yet implemented'
        )

    # maintenance runs every Sunday night, so move the partition window
    # to the start of the coming Monday
    now = datetime.utcnow()
    y, w, _ = now.isocalendar()
    partition_start_week = isoweek.Week(y, w + 1)
    start = datetime.combine(
        partition_start_week.monday(), datetime.min.time()
    )
    end = datetime.combine(
        partition_start_week.sunday(), datetime.max.time()
    )

    logger.info(f'Data Partition Start : {start}')

    diy = get_days_in_year(end.year)
    latest_archive_date = end - timedelta(days=diy)
    latest_archive_year, latest_archive_week, _ = latest_archive_date.isocalendar()
    print(latest_archive_week, latest_archive_year)

    if latest_archive_week > 1:
        latest_archive_week = latest_archive_week - 1
    else:
        latest_archive_week = isoweek.Week.last_week_of_year(
            latest_archive_year-1
        ).week
        latest_archive_year = latest_archive_year - 1
    week = isoweek.Week(latest_archive_year, latest_archive_week)

    print(week)

    db_config.maintenance.data_partition.since = start
    db_config.maintenance.data_partition.until = (
        start + timedelta(days=6)
    ).replace(
        hour=23, minute=59, second=59
    )

    db_config.maintenance.data_archive.since = datetime.combine(
        week.monday(), datetime.min.time()
    )
    db_config.maintenance.data_archive.until = datetime.combine(
        week.sunday(), datetime.max.time()
    )

    print(db_config.maintenance.data_partition)
    print(db_config.maintenance.data_archive)

    # get sql scripts
    partition_sql = get_temporal_partitions(db_config.maintenance)

    archive_sql = get_archive_script(
        latest_archive_date - timedelta(weeks=1),
        latest_archive_date
    )

    logger.debug(partition_sql)
    logger.debug(archive_sql)
    session, engine = set_up_db(db_config.__dict__, create=False)

    try:
        # create partitions
        session.execute(partition_sql)
        session.commit()
        print('Partitioning done')
        # detach partitions over a year and attach them to the archive table
        session.execute(archive_sql)
        session.commit()
        print('Archive done')

    except SQLAlchemyError as e:
        traceback.print_exc()
        session.rollback()
        logger.error(f'Error executing maintenance: {e}')
    finally:
        session.close()
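
To make the week arithmetic above concrete, a small worked example with an
illustrative date:

from datetime import datetime
import isoweek

now = datetime(2021, 6, 6)          # a Sunday; ISO calendar (2021, 22, 7)
y, w, _ = now.isocalendar()
next_week = isoweek.Week(y, w + 1)  # ISO week 23
print(next_week.monday())           # 2021-06-07 -> partition window start
print(next_week.sunday())           # 2021-06-13 -> partition window end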
Example #8
def model_transfer(input_db_cfg,
                   output_db_cfg,
                   model_id=None,
                   truncate_out_table=False):
    """
    Transfers models between databases. The databases can live on the same
    server or on different servers; use ssh tunneling for better performance
    in the latter case.
    :param dict[str, T] input_db_cfg: the database configuration for the db
    to get data from, e.g.
    input_db_cfg = {
        'name': 'baskerville_training',
        'user': '******',
        'password': '******',
        'host': '127.0.0.1',
        'port': 5432,
        'type': 'postgres',
    }
    :param dict[str, T] output_db_cfg: the database configuration for the db to
    store data to, e.g.
    output_db_cfg = {
        'name': 'baskerville_production',
        'user': '******',
        'password': '******',
        'host': '127.0.0.1',
        'port': 5432,
        'type': 'postgres',
    }
    :param int model_id: specify an id to get only one model
    :param bool truncate_out_table: if True, truncate the models table on
    the target database and restart its identity sequence before copying
    :return: None
    """
    import traceback
    from sqlalchemy import asc
    from baskerville.models.config import DatabaseConfig
    from baskerville.db import set_up_db
    from baskerville.db.models import Model

    in_db_cfg = DatabaseConfig(input_db_cfg)
    out_db_cfg = DatabaseConfig(output_db_cfg)

    in_session, _ = set_up_db(in_db_cfg.__dict__)
    out_session, _ = set_up_db(out_db_cfg.__dict__)

    if truncate_out_table:
        if input('Are you sure you want to truncate table data?(y/n)') in [
                'Y', 'y'
        ]:
            print('Truncating data...')
            out_session.execute(
                f'TRUNCATE TABLE {Model.__tablename__} RESTART IDENTITY')
            out_session.commit()
        else:
            print('Skipping data truncation...')

    models_in = in_session.query(Model)
    if model_id:
        models_in = models_in.filter_by(id=model_id)
    models_in = models_in.order_by(asc(Model.id)).all()

    print(f'{len(models_in)} models in in_db: {in_db_cfg.name}')

    try:
        for model in models_in:
            print(f'Getting model with id: {model.id}')
            model_out = Model()
            model_out.scaler = model.scaler
            model_out.classifier = model.classifier
            model_out.features = model.features
            model_out.algorithm = model.algorithm
            model_out.analysis_notebook = model.analysis_notebook
            model_out.created_at = model.created_at
            model_out.f1_score = model.f1_score
            model_out.n_training = model.n_training
            model_out.n_testing = model.n_testing
            model_out.notes = model.notes
            model_out.parameters = model.parameters
            model_out.precision = model.precision
            model_out.recall = model.recall
            model_out.request_sets = model.request_sets
            out_session.add(model_out)
            out_session.commit()
    except Exception:
        traceback.print_exc()
        out_session.rollback()
    finally:
        in_session.close()
        out_session.close()
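
A hypothetical invocation, copying a single model from a training database
to a production one (credentials and the model id are placeholders):

model_transfer(
    input_db_cfg={
        'name': 'baskerville_training', 'user': 'user', 'password': 'pass',
        'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
    },
    output_db_cfg={
        'name': 'baskerville_production', 'user': 'user', 'password': 'pass',
        'host': '127.0.0.1', 'port': 5432, 'type': 'postgres',
    },
    model_id=42,              # copy only this model; None copies all
    truncate_out_table=False,
)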