Example #1
0
File: db.py — Project: janschmid/PeekabooAV
    def analysis2db(self, sample):
        """
        Persist an analysis task in the analysis journal of the database.

        :param sample: The sample object for this analysis task.
        """
        with self.__lock:
            session = self.__Session()

            # assemble the journal entry for this analysis run
            entry = AnalysisJournal()
            entry.job_hash = sample.get_job_hash()
            entry.cuckoo_job_id = sample.job_id
            entry.filename = sample.get_filename()
            entry.analyses_time = datetime.strptime(
                sample.analyses_time, "%Y%m%dT%H%M%S")

            result_row = PeekabooDatabase.__get_or_create(
                session, AnalysisResult, name=sample.get_result().name)

            # NOTE: We cannot determine if a known sample is inProgress again.
            sample_info = PeekabooDatabase.__get(
                session, SampleInfo,
                sha256sum=sample.sha256sum,
                file_extension=sample.file_extension)
            if sample_info is None:
                sample_info = PeekabooDatabase.__create(
                    SampleInfo,
                    sha256sum=sample.sha256sum,
                    file_extension=sample.file_extension,
                    result=result_row)

            entry.sample = sample_info
            session.add(entry)
            try:
                session.commit()
            except SQLAlchemyError as error:
                session.rollback()
                raise PeekabooDatabaseError(
                    'Failed to add analysis task to the database: %s' % error
                )
            finally:
                session.close()
Example #2
0
    async def analysis_add(self, sample):
        """
        Add an analysis task to the analysis journal in the database.

        @param sample: The sample object for this analysis task.
        @returns: ID of the newly created analysis task (also updated
                  in the sample)
        """
        info = SampleInfo(
            state=sample.state,
            identity=await sample.identity,
            analysis_time=datetime.now(),
            result=sample.result,
            reason=sample.reason)

        job_id = None
        attempt, delay = 1, 0
        while attempt <= self.retries:
            async with self.__session_factory() as session:
                session.add(info)
                try:
                    # flush so the database assigns the primary key and we
                    # can read it back before committing
                    await session.flush()
                    job_id = info.id
                    await session.commit()
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt, 'adding analysis')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to add analysis task to the database: %s' %
                            error)

            await asyncio.sleep(delay)

        sample.update_id(job_id)
        return job_id
Example #3
0
File: db.py — Project: 5l1v3r1/PeekabooAV
    def clear_stale_in_flight_samples(self):
        """
        Clear all in-flight markers that are too old and therefore stale. This
        detects instances which are locked up, crashed or shut down.

        @return: True if any stale markers were cleared (or tracking is
                 disabled altogether), False otherwise.
        @raises PeekabooDatabaseError: on database errors during deletion.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        session = self.__session()

        # select stale markers of *all* instances, i.e. those whose start time
        # lies further in the past than the staleness threshold
        query = session.query(InFlightSample).filter(
            InFlightSample.start_time <= datetime.utcnow() -
            timedelta(seconds=self.stale_in_flight_threshold))
        logger.debug(
            'Clearing database of all stale in-flight samples '
            '(%d seconds)', self.stale_in_flight_threshold)

        try:
            # the loop triggers the query, so only do it if debugging is
            # enabled
            if logger.isEnabledFor(logging.DEBUG):
                # obviously there's a race between logging and actual delete
                # here, use with caution, compare with actual number of markers
                # cleared below before relying on it for debugging
                for stale in query:
                    logger.debug('Stale in-flight marker to clear: %s', stale)

            # delete() is not queued and goes to the DB before commit()
            cleared = query.delete()
            session.commit()
            if cleared > 0:
                # logger.warn() is deprecated - use warning() as the asyncio
                # variant of this method already does
                logger.warning('%d stale in-flight samples cleared.', cleared)
        except SQLAlchemyError as error:
            session.rollback()
            raise PeekabooDatabaseError('Unable to clear the database of '
                                        'stale in-flight samples: %s' % error)
        finally:
            session.close()

        return cleared > 0
Example #4
0
    def analysis_journal_fetch_journal(self, sample):
        """
        Fetch information stored in the database about a given sample object.

        @param sample: The sample object of which the information shall be
                       fetched from the database.
        @return: A sorted list of (analysis_time, result, reason) of the
                 requested sample.
        """
        # all finished analyses of identical payloads other than this very
        # job, ignoring failed runs, ordered oldest first
        statement = (
            sqlalchemy.sql.expression.select(
                SampleInfo.analysis_time, SampleInfo.result, SampleInfo.reason)
            .where(SampleInfo.id != sample.id)
            .where(SampleInfo.result != Result.failed)
            .filter_by(
                state=JobState.FINISHED,
                sha256sum=sample.sha256sum,
                file_extension=sample.file_extension)
            .order_by(SampleInfo.analysis_time))

        sample_journal = None
        attempt, delay = 1, 0
        while attempt <= self.retries:
            with self.__session() as session:
                try:
                    sample_journal = session.execute(statement).all()
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt, 'fetching analysis journal')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to fetch analysis journal from the database: %s'
                            % error)

            time.sleep(delay)

        return sample_journal
Example #5
0
File: db.py — Project: 5l1v3r1/PeekabooAV
 def _init_db(self):
     """
     Initialize the Peekaboo database by creating all tables and writing
     meta information to the '_meta' table.
     """
     Base.metadata.create_all(self.__engine)

     # record version information about this installation
     meta = PeekabooMetadata()
     meta.peekaboo_version = __version__
     meta.db_schema_version = DB_SCHEMA_VERSION
     # TODO: Get Cuckoo version.
     meta.cuckoo_version = '2.0'

     session = self.__session()
     session.add(meta)
     try:
         session.commit()
     except SQLAlchemyError as error:
         session.rollback()
         raise PeekabooDatabaseError('Cannot initialize the database: %s' %
                                     error)
     finally:
         session.close()
Example #6
0
 def clear_in_progress(self):
     """ Remove all samples with the result 'inProgress'. """
     session = self.__Session()

     # look up the result row representing 'inProgress'
     in_progress_result = PeekabooDatabase.__get(
         session, AnalysisResult, name='inProgress')

     # queue deletion of every journal entry belonging to a sample that
     # still carries that result
     stuck_samples = session.query(SampleInfo).filter_by(
         result=in_progress_result).all()
     for stuck in stuck_samples:
         session.query(AnalysisJournal).filter_by(sample=stuck).delete()

     try:
         session.commit()
         logger.debug('Cleared the database from "inProgress" entries.')
     except SQLAlchemyError as error:
         session.rollback()
         raise PeekabooDatabaseError(
             'Unable to clear the database from "inProgress" entries: %s' %
             error)
     finally:
         session.close()
Example #7
0
    async def analysis_retrieve(self, job_id):
        """
        Fetch information stored in the database about a given sample object.

        @param job_id: ID of the analysis to retrieve
        @type job_id: int
        @return: reason and result for the given analysis task
        """
        # only finished analyses carry a valid result and reason
        statement = sqlalchemy.sql.expression.select(
            SampleInfo.reason, SampleInfo.result).filter_by(
                id=job_id, state=JobState.FINISHED)

        result = None
        attempt, delay = 1, 0
        while attempt <= self.retries:
            async with self.__session_factory() as session:
                try:
                    result = (await session.execute(statement)).first()
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt, 'retrieving analysis result')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to retrieve analysis from the database: %s'
                            % error)

            await asyncio.sleep(delay)

        return result
Example #8
0
    async def clear_stale_in_flight_samples(self):
        """
        Clear all in-flight markers that are too old and therefore stale. This
        detects instances which are locked up, crashed or shut down.

        @return: True if any stale markers were cleared (or tracking is
                 disabled altogether), False otherwise.
        @raises PeekabooDatabaseError: on persistent database errors.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        logger.debug(
            'Clearing database of all stale in-flight samples '
            '(%d seconds)', self.stale_in_flight_threshold)

        def clear_statement(statement_class):
            # stale markers of *all* instances: start time further in the past
            # than the staleness threshold
            return statement_class(InFlightSample).where(
                InFlightSample.start_time <= datetime.utcnow() -
                timedelta(seconds=self.stale_in_flight_threshold))

        delete_statement = clear_statement(sqlalchemy.sql.expression.delete)
        select_statement = clear_statement(sqlalchemy.sql.expression.select)

        attempt = 1
        # initialise delay so the backoff sleep below can never hit an unbound
        # name, mirroring the other retry loops in this class
        delay = 0
        cleared = 0
        while attempt <= self.retries:
            async with self.__session_factory() as session:
                try:
                    # only do the query if debugging is enabled
                    if logger.isEnabledFor(logging.DEBUG):
                        # obviously there's a race between logging and actual
                        # delete here, use with caution, compare with actual
                        # number of markers cleared below before relying on it
                        # for debugging
                        markers = await session.execute(select_statement)
                        for stale in markers:
                            logger.debug('Stale in-flight marker to clear: %s',
                                         stale)

                    markers = await session.execute(delete_statement)
                    await session.commit()

                    cleared = markers.rowcount
                    if cleared > 0:
                        logger.warning('%d stale in-flight samples cleared.',
                                       cleared)

                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt,
                        'clearing the database of stale in-flight samples')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Unable to clear the database of stale in-flight '
                            'samples: %s' % error)

            await asyncio.sleep(delay)

        return cleared > 0
Example #9
0
    async def mark_sample_in_flight(self,
                                    sample,
                                    instance_id=None,
                                    start_time=None):
        """
        Mark a sample as in flight, i.e. being worked on by an instance.

        @param sample: The sample to mark as in flight.
        @param instance_id: (optionally) The ID of the instance that is
                            handling this sample. Default: Us.
        @param start_time: Override the time the marker was placed for
                           debugging purposes.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        # fall back to our own instance id and the current time
        instance_id = self.instance_id if instance_id is None else instance_id
        start_time = datetime.utcnow() if start_time is None else start_time

        marker = InFlightSample(identity=await sample.identity,
                                instance_id=instance_id,
                                start_time=start_time)

        attempt, delay = 1, 0
        while attempt <= self.retries:
            # a new session needs to be constructed on each attempt
            async with self.__session_factory() as session:
                # try to mark this sample as in flight in an atomic insert
                # operation (modulo possible deadlocks with various RDBMS)
                session.add(marker)

                try:
                    await session.commit()
                    logger.debug('%d: Marked sample in flight', sample.id)
                    return True
                # duplicate primary key == entry already exists
                except IntegrityError:
                    await session.rollback()
                    logger.debug(
                        '%d: Sample is already in flight on another '
                        'instance', sample.id)
                    return False
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt,
                        'marking sample %d in flight' % sample.id)

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            '%d: Unable to mark sample as in flight: %s' %
                            (sample.id, error))

            await asyncio.sleep(delay)

        return False
Example #10
0
    def __init__(self,
                 db_url,
                 instance_id=0,
                 stale_in_flight_threshold=15 * 60,
                 log_level=logging.WARNING):
        """
        Initialize the Peekaboo database handler.

        @param db_url: An RFC 1738 URL that points to the database.
        @param instance_id: A positive, unique ID differentiating this Peekaboo
                            instance from any other instance using the same
                            database for concurrency coordination. Value of 0
                            means that we're alone and have no other instances
                            to worry about.
        @param stale_in_flight_threshold: Number of seconds after which an
                                          in-flight marker is considered stale
                                          and deleted or ignored.
        @param log_level: Overrides the log level of the database modules. The
                          idea is for the database to be silent by default and
                          only emit log messages if switched on explicitly and
                          independently of the Peekaboo log level.
        @raises PeekabooDatabaseError: if the configured backend is unknown or
                                       none of its asyncio drivers could be
                                       loaded.
        """
        logging.getLogger('sqlalchemy.engine').setLevel(log_level)
        logging.getLogger('sqlalchemy.pool').setLevel(log_level)
        # aiosqlite picks up the global log level unconditionally so we need to
        # override it as well and explicitly
        logging.getLogger('aiosqlite').setLevel(log_level)

        # <backend>[+<driver>]:// -> <backend>
        url_parts = db_url.split(':')
        scheme_parts = url_parts[0].split('+')
        backend = scheme_parts[0]

        engine_kwargs = {}
        if backend == 'sqlite':
            # sqlite reacts badly to parallel modifications of the same
            # database: serialise all access through a pool holding a single
            # connection and disable sqlite's own busy wait since we do our
            # own retrying
            engine_kwargs.update(
                dict(poolclass=sqlalchemy.pool.AsyncAdaptedQueuePool,
                     pool_size=1,
                     max_overflow=0,
                     connect_args={'timeout': 0}))

        # if there is no driver specified or its a known non-asyncio driver,
        # try to find to a known-good asyncio driver
        sync_drivers = {
            'sqlite': ['pysqlite'],
            'mysql': ['mysqldb', 'pymysql'],
            'postgresql':
            ['psycopg2', 'pg8000', 'psycopg2cffi', 'pypostgresql', 'pygresql'],
        }

        asyncio_drivers = {
            'sqlite': ['aiosqlite'],
            'mysql': ['asyncmy', 'aiomysql'],
            'postgresql': ['asyncpg'],
        }

        backend_async_drivers = asyncio_drivers.get(backend)

        # if there seems to be a driver specified, look more closely
        if len(scheme_parts) > 1:
            driver = scheme_parts[1]

            backend_sync_drivers = sync_drivers.get(backend)
            if (backend_sync_drivers is not None
                    and driver in backend_sync_drivers):
                logger.warning(
                    'Configuration specifies a synchronous database driver '
                    '"%s". Please update your configuration to use an '
                    'asynchronous driver, preferably out of: %s', driver,
                    backend_async_drivers)
            # fix: guard against an unknown backend here - a plain
            # "driver not in None" membership test would raise a TypeError
            elif (backend_async_drivers is None
                    or driver not in backend_async_drivers):
                logger.warning(
                    'Configuration specifies unknown asynchronous driver "%s". '
                    'Trying to use anyway.', driver)
                backend_async_drivers = [driver]

        # an unknown backend without an explicit driver override leaves us
        # nothing to try - fail with a clear error instead of a TypeError when
        # iterating None below
        if backend_async_drivers is None:
            raise PeekabooDatabaseError(
                'Unknown database backend configured: %s' % backend)

        self.__engine = None
        for driver in backend_async_drivers:
            scheme = f'{backend}+{driver}'
            db_url = ':'.join([scheme] + url_parts[1:])

            try:
                logger.debug('Trying SQLAlchemy backend+driver "%s"', scheme)
                self.__engine = sqlalchemy.ext.asyncio.create_async_engine(
                    db_url, **engine_kwargs)
            except ModuleNotFoundError:
                # driver module not installed - try the next candidate
                continue

            logger.info(
                'Using "%s" SQLAlchemy backend+driver for '
                'database accesses', scheme)
            break

        if self.__engine is None:
            raise PeekabooDatabaseError(
                f'None of the drivers for backend "{backend}" could be found: '
                f'{backend_async_drivers}')

        self.__session_factory = sessionmaker(
            bind=self.__engine, class_=sqlalchemy.ext.asyncio.AsyncSession)

        self.instance_id = instance_id
        self.stale_in_flight_threshold = stale_in_flight_threshold
        self.retries = 5
        # ultra-simple quadratic backoff:
        # attempt 1: 10 * 2**(1) == 10-20msecs
        # attempt 2: 10 * 2**(2) == 20-40msecs
        # attempt 3: 10 * 2**(3) == 40-80msecs
        # attempt 4: 10 * 2**(4) == 80-160msecs
        self.deadlock_backoff_base = 10
        self.connect_backoff_base = 2000
Example #11
0
    def mark_sample_in_flight(self, sample, instance_id=None, start_time=None):
        """
        Mark a sample as in flight, i.e. being worked on by an instance.

        @param sample: The sample to mark as in flight.
        @param instance_id: (optionally) The ID of the instance that is
                            handling this sample. Default: Us.
        @param start_time: Override the time the marker was placed for
                           debugging purposes.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        # fall back to our own instance id and the current time
        instance_id = self.instance_id if instance_id is None else instance_id
        start_time = datetime.utcnow() if start_time is None else start_time

        sha256sum = sample.sha256sum
        marker = InFlightSample(sha256sum=sha256sum,
                                instance_id=instance_id,
                                start_time=start_time)

        attempt = 1
        locked = False
        while attempt <= self.retries:
            # a new session needs to be constructed on each attempt
            session = self.__session()

            # try to mark this sample as in flight in an atomic insert
            # operation (modulo possible deadlocks with various RDBMS)
            session.add(marker)

            try:
                session.commit()
                locked = True
                logger.debug('Marked sample %s as in flight', sha256sum)
            # duplicate primary key == entry already exists
            except IntegrityError:
                session.rollback()
                logger.debug(
                    'Sample %s is already in flight on another '
                    'instance', sha256sum)
            except (OperationalError, DBAPIError, SQLAlchemyError) as error:
                session.rollback()

                attempt = self.was_transient_error(
                    error, attempt,
                    'marking sample %s as in flight' % sha256sum)
                # a positive attempt count means retry, anything else is a
                # permanent failure
                if attempt > 0:
                    continue

                raise PeekabooDatabaseError(
                    'Unable to mark sample as in flight: %s' % error)
            finally:
                session.close()

            return locked

        return False
Example #12
0
    def __init__(self,
                 db_url,
                 instance_id=0,
                 stale_in_flight_threshold=15 * 60,
                 log_level=logging.WARNING,
                 async_driver=None):
        """
        Initialize the Peekaboo database handler.

        @param db_url: An RFC 1738 URL that points to the database.
        @param instance_id: A positive, unique ID differentiating this Peekaboo
                            instance from any other instance using the same
                            database for concurrency coordination. Value of 0
                            means that we're alone and have no other instances
                            to worry about.
        @param stale_in_flight_threshold: Number of seconds after which an in
                                          flight marker is considered stale and
                                          deleted or ignored.
        @param log_level: Overrides the log level of the database modules. The
                          idea is for the database to be silent by default and
                          only emit log messages if switched on explicitly and
                          independently of the Peekaboo log level.
        @param async_driver: last resort override of the asyncio driver
                             auto-detection
        @raises PeekabooDatabaseError: if the backend is unknown, no asyncio
                                       driver could be loaded or the schema
                                       could not be created in the database
        """
        logging.getLogger('sqlalchemy.engine').setLevel(log_level)
        logging.getLogger('sqlalchemy.pool').setLevel(log_level)
        # aiosqlite picks up the global log level unconditionally so we need to
        # override it as well and explicitly
        logging.getLogger('aiosqlite').setLevel(log_level)

        # <backend>[+<driver>]:// -> <backend>
        backend = db_url.split(':')[0].split('+')[0]

        connect_args = {}
        if backend == 'sqlite':
            # disable sqlite's own busy wait; retrying is done by our callers
            connect_args['timeout'] = 0

        # synchronous engine with thread-scoped sessions for the classic
        # (non-asyncio) code paths
        self.__engine = create_engine(db_url,
                                      future=True,
                                      connect_args=connect_args)
        session_factory = sessionmaker(bind=self.__engine)
        self.__session = scoped_session(session_factory)
        self.__lock = threading.RLock()

        # known-good asyncio drivers per backend, tried in listed order
        asyncio_drivers = {
            'sqlite': ['aiosqlite'],
            'mysql': ['asyncmy', 'aiomysql'],
            'postgresql': ['asyncpg'],
        }

        if async_driver is not None:
            # explicit override short-circuits auto-detection
            drivers = [async_driver]
        else:
            drivers = asyncio_drivers.get(backend)
            if drivers is None:
                raise PeekabooDatabaseError(
                    'Unknown database backend configured: %s' % backend)

        async_engine = None
        async_db_url = None
        for driver in drivers:
            # replace backend and driver with our asyncio alternative
            scheme = "%s+%s" % (backend, driver)
            async_db_url = ':'.join([scheme] + db_url.split(':')[1:])

            try:
                async_engine = sqlalchemy.ext.asyncio.create_async_engine(
                    async_db_url, connect_args=connect_args)
            except ModuleNotFoundError:
                # driver module not installed - try the next candidate
                continue

            logger.debug(
                'Auto-detected %s SQLAlchemy backend+driver for '
                'asyncio database accesses', scheme)
            break

        if async_engine is None:
            raise PeekabooDatabaseError(
                'None of the asyncio drivers for backend %s could be '
                'found: %s' % (backend, drivers))

        self.__async_session_factory = sessionmaker(
            bind=async_engine, class_=sqlalchemy.ext.asyncio.AsyncSession)
        # no scoping necessary as we're not using asyncio across threads

        # special handling for sqlite: since it does not respond well to
        # multiple modify operations in parallel to the same database, we
        # serialise them through a QueuePool with only one connection
        self.__async_session_factory_modify = self.__async_session_factory
        if backend in ['sqlite']:
            async_engine_modify = sqlalchemy.ext.asyncio.create_async_engine(
                async_db_url,
                poolclass=sqlalchemy.pool.AsyncAdaptedQueuePool,
                pool_size=1,
                max_overflow=0)
            self.__async_session_factory_modify = sessionmaker(
                bind=async_engine_modify,
                class_=sqlalchemy.ext.asyncio.AsyncSession)

        self.instance_id = instance_id
        self.stale_in_flight_threshold = stale_in_flight_threshold
        self.retries = 5
        # ultra-simple quadratic backoff:
        # attempt 1: 10 * 2**(1) == 10-20msecs
        # attempt 2: 10 * 2**(2) == 20-40msecs
        # attempt 3: 10 * 2**(3) == 40-80msecs
        # attempt 4: 10 * 2**(4) == 80-160msecs
        self.deadlock_backoff_base = 10
        self.connect_backoff_base = 2000

        # create the schema, retrying transient errors since the database
        # server may still be starting up
        attempt = 1
        delay = 0
        while attempt <= self.retries:
            with self.__lock:
                try:
                    Base.metadata.create_all(self.__engine)
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    attempt, delay = self.was_transient_error(
                        error, attempt, 'create metadata')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to create schema in database: %s' % error)

            time.sleep(delay)