def analysis2db(self, sample): """ Save an analysis task to the analysis journal in the database. :param sample: The sample object for this analysis task. """ with self.__lock: session = self.__Session() analysis = AnalysisJournal() analysis.job_hash = sample.get_job_hash() analysis.cuckoo_job_id = sample.job_id analysis.filename = sample.get_filename() analysis.analyses_time = datetime.strptime(sample.analyses_time, "%Y%m%dT%H%M%S") analysis_result = PeekabooDatabase.__get_or_create( session, AnalysisResult, name=sample.get_result().name ) # NOTE: We cannot determine if a known sample is inProgress again. s = PeekabooDatabase.__get( session, SampleInfo, sha256sum=sample.sha256sum, file_extension=sample.file_extension, ) if s is None: s = PeekabooDatabase.__create( SampleInfo, sha256sum=sample.sha256sum, file_extension=sample.file_extension, result=analysis_result ) analysis.sample = s session.add(analysis) try: session.commit() except SQLAlchemyError as e: session.rollback() raise PeekabooDatabaseError( 'Failed to add analysis task to the database: %s' % e ) finally: session.close()
async def analysis_add(self, sample): """ Add an analysis task to the analysis journal in the database. @param sample: The sample object for this analysis task. @returns: ID of the newly created analysis task (also updated in the sample) """ sample_info = SampleInfo(state=sample.state, identity=await sample.identity, analysis_time=datetime.now(), result=sample.result, reason=sample.reason) job_id = None attempt = 1 delay = 0 while attempt <= self.retries: async with self.__session_factory() as session: session.add(sample_info) try: # flush to retrieve the automatically assigned primary # key value await session.flush() job_id = sample_info.id await session.commit() break except (OperationalError, DBAPIError, SQLAlchemyError) as error: await session.rollback() attempt, delay = self.was_transient_error( error, attempt, 'adding analysis') if attempt < 0: raise PeekabooDatabaseError( 'Failed to add analysis task to the database: %s' % error) await asyncio.sleep(delay) sample.update_id(job_id) return job_id
def clear_stale_in_flight_samples(self): """ Clear all in-flight markers that are too old and therefore stale. This detects instances which are locked up, crashed or shut down. """ # an instance id of 0 denotes that we're alone and don't need to track # in-flight samples in the database if self.instance_id == 0: return True session = self.__session() # delete only the locks of a specific instance query = session.query(InFlightSample).filter( InFlightSample.start_time <= datetime.utcnow() - timedelta(seconds=self.stale_in_flight_threshold)) logger.debug( 'Clearing database of all stale in-flight samples ' '(%d seconds)', self.stale_in_flight_threshold) try: # the loop triggers the query, so only do it if debugging is # enabled if logger.isEnabledFor(logging.DEBUG): # obviously there's a race between logging and actual delete # here, use with caution, compare with actual number of markers # cleared below before relying on it for debugging for stale in query: logger.debug('Stale in-flight marker to clear: %s', stale) # delete() is not queued and goes to the DB before commit() cleared = query.delete() session.commit() if cleared > 0: logger.warn('%d stale in-flight samples cleared.', cleared) except SQLAlchemyError as error: session.rollback() raise PeekabooDatabaseError('Unable to clear the database of ' 'stale in-flight samples: %s' % error) finally: session.close() return cleared > 0
def analysis_journal_fetch_journal(self, sample): """ Fetch information stored in the database about a given sample object. @param sample: The sample object of which the information shall be fetched from the database. @return: A sorted list of (analysis_time, result, reason) of the requested sample. """ statement = sqlalchemy.sql.expression.select( SampleInfo.analysis_time, SampleInfo.result, SampleInfo.reason).where(SampleInfo.id != sample.id).where( SampleInfo.result != Result.failed).filter_by( state=JobState.FINISHED, sha256sum=sample.sha256sum, file_extension=sample.file_extension).order_by( SampleInfo.analysis_time) sample_journal = None attempt = 1 delay = 0 while attempt <= self.retries: with self.__session() as session: try: sample_journal = session.execute(statement).all() break except (OperationalError, DBAPIError, SQLAlchemyError) as error: session.rollback() attempt, delay = self.was_transient_error( error, attempt, 'fetching analysis journal') if attempt < 0: raise PeekabooDatabaseError( 'Failed to fetch analysis journal from the database: %s' % error) time.sleep(delay) return sample_journal
def _init_db(self): """ Initializes the Peekaboo database by creating tables and writing meta information to the '_meta' table. """ Base.metadata.create_all(self.__engine) meta = PeekabooMetadata() meta.peekaboo_version = __version__ meta.db_schema_version = DB_SCHEMA_VERSION # TODO: Get Cuckoo version. meta.cuckoo_version = '2.0' session = self.__session() session.add(meta) try: session.commit() except SQLAlchemyError as error: session.rollback() raise PeekabooDatabaseError('Cannot initialize the database: %s' % error) finally: session.close()
def clear_in_progress(self): """ Remove all samples with the result 'inProgress'. """ session = self.__Session() in_progress = PeekabooDatabase.__get(session, AnalysisResult, name='inProgress') in_progress_samples = session.query(SampleInfo).filter_by( result=in_progress).all() for in_progress_sample in in_progress_samples: session.query(AnalysisJournal).filter_by( sample=in_progress_sample).delete() try: session.commit() logger.debug('Cleared the database from "inProgress" entries.') except SQLAlchemyError as e: session.rollback() raise PeekabooDatabaseError( 'Unable to clear the database from "inProgress" entries: %s' % e) finally: session.close()
async def analysis_retrieve(self, job_id): """ Fetch information stored in the database about a given sample object. @param job_id: ID of the analysis to retrieve @type job_id: int @return: reason and result for the given analysis task """ statement = sqlalchemy.sql.expression.select( SampleInfo.reason, SampleInfo.result).filter_by(id=job_id, state=JobState.FINISHED) result = None attempt = 1 delay = 0 while attempt <= self.retries: async with self.__session_factory() as session: try: proxy = await session.execute(statement) result = proxy.first() break except (OperationalError, DBAPIError, SQLAlchemyError) as error: await session.rollback() attempt, delay = self.was_transient_error( error, attempt, 'retrieving analysis result') if attempt < 0: raise PeekabooDatabaseError( 'Failed to retrieve analysis from the database: %s' % error) await asyncio.sleep(delay) return result
async def clear_stale_in_flight_samples(self): """ Clear all in-flight markers that are too old and therefore stale. This detects instances which are locked up, crashed or shut down. """ # an instance id of 0 denotes that we're alone and don't need to track # in-flight samples in the database if self.instance_id == 0: return True logger.debug( 'Clearing database of all stale in-flight samples ' '(%d seconds)', self.stale_in_flight_threshold) def clear_statement(statement_class): # delete only the locks of a specific instance return statement_class(InFlightSample).where( InFlightSample.start_time <= datetime.utcnow() - timedelta(seconds=self.stale_in_flight_threshold)) delete_statement = clear_statement(sqlalchemy.sql.expression.delete) select_statement = clear_statement(sqlalchemy.sql.expression.select) attempt = 1 cleared = 0 while attempt <= self.retries: async with self.__session_factory() as session: try: # only do the query if debugging is enabled if logger.isEnabledFor(logging.DEBUG): # obviously there's a race between logging and actual # delete here, use with caution, compare with actual # number of markers cleared below before relying on it # for debugging markers = await session.execute(select_statement) for stale in markers: logger.debug('Stale in-flight marker to clear: %s', stale) markers = await session.execute(delete_statement) await session.commit() cleared = markers.rowcount if cleared > 0: logger.warning('%d stale in-flight samples cleared.', cleared) break except (OperationalError, DBAPIError, SQLAlchemyError) as error: await session.rollback() attempt, delay = self.was_transient_error( error, attempt, 'clearing the database of stale in-flight samples') if attempt < 0: raise PeekabooDatabaseError( 'Unable to clear the database of stale in-flight ' 'samples: %s' % error) await asyncio.sleep(delay) return cleared > 0
async def mark_sample_in_flight(self, sample, instance_id=None, start_time=None): """ Mark a sample as in flight, i.e. being worked on by an instance. @param sample: The sample to mark as in flight. @param instance_id: (optionally) The ID of the instance that is handling this sample. Default: Us. @param start_time: Override the time the marker was placed for debugging purposes. """ # an instance id of 0 denotes that we're alone and don't need to track # in-flight samples in the database if self.instance_id == 0: return True # use our own instance id if none is given if instance_id is None: instance_id = self.instance_id if start_time is None: start_time = datetime.utcnow() in_flight_marker = InFlightSample(identity=await sample.identity, instance_id=instance_id, start_time=start_time) attempt = 1 delay = 0 while attempt <= self.retries: # a new session needs to be constructed on each attempt async with self.__session_factory() as session: # try to mark this sample as in flight in an atomic insert # operation (modulo possible deadlocks with various RDBMS) session.add(in_flight_marker) try: await session.commit() logger.debug('%d: Marked sample in flight', sample.id) return True # duplicate primary key == entry already exists except IntegrityError: await session.rollback() logger.debug( '%d: Sample is already in flight on another ' 'instance', sample.id) return False except (OperationalError, DBAPIError, SQLAlchemyError) as error: await session.rollback() attempt, delay = self.was_transient_error( error, attempt, 'marking sample %d in flight' % sample.id) if attempt < 0: raise PeekabooDatabaseError( '%d: Unable to mark sample as in flight: %s' % (sample.id, error)) await asyncio.sleep(delay) return False
def __init__(self, db_url, instance_id=0, stale_in_flight_threshold=15 * 60, log_level=logging.WARNING): """ Initialize the Peekaboo database handler. @param db_url: An RFC 1738 URL that points to the database. @param instance_id: A positive, unique ID differentiating this Peekaboo instance from any other instance using the same database for concurrency coordination. Value of 0 means that we're alone and have no other instances to worry about. @param stale_in_flight_threshold: Number of seconds after which a in flight marker is considered stale and deleted or ignored. @param log_level: Overrides the log level of the database modules. The idea is for the database to be silent by default and only emit log messages if switched on explictly and independently of the Peekaboo log level. """ logging.getLogger('sqlalchemy.engine').setLevel(log_level) logging.getLogger('sqlalchemy.pool').setLevel(log_level) # aiosqlite picks up the global log level unconditionally so we need to # override it as well and explicitly logging.getLogger('aiosqlite').setLevel(log_level) # <backend>[+<driver>]:// -> <backend> url_parts = db_url.split(':') scheme_parts = url_parts[0].split('+') backend = scheme_parts[0] engine_kwargs = {} if backend == 'sqlite': engine_kwargs.update( dict(poolclass=sqlalchemy.pool.AsyncAdaptedQueuePool, pool_size=1, max_overflow=0, connect_args={'timeout': 0})) # if there is no driver specified or its a known non-asyncio driver, # try to find to a known-good asyncio driver sync_drivers = { 'sqlite': ['pysqlite'], 'mysql': ['mysqldb', 'pymysql'], 'postgresql': ['psycopg2', 'pg8000', 'psycopg2cffi', 'pypostgresql', 'pygresql'], } asyncio_drivers = { 'sqlite': ['aiosqlite'], 'mysql': ['asyncmy', 'aiomysql'], 'postgresql': ['asyncpg'], } backend_async_drivers = asyncio_drivers.get(backend) # if there seems to be a driver specified, look more closely if len(scheme_parts) > 1: driver = scheme_parts[1] backend_sync_drivers = sync_drivers.get(backend) if (backend_sync_drivers is not None and driver in backend_sync_drivers): logger.warning( 'Configuration specifies a synchronous database driver ' '"%s". Please update your configuration to use an ' 'asynchronous driver, preferably out of: %s', driver, backend_async_drivers) elif driver not in backend_async_drivers: logger.warning( 'Configuration specifies unknown asynchronous driver "%s". ' 'Trying to use anyway.', driver) backend_async_drivers = [driver] self.__engine = None for driver in backend_async_drivers: scheme = f'{backend}+{driver}' db_url = ':'.join([scheme] + url_parts[1:]) try: logger.debug('Trying SQLAlchemy backend+driver "%s"', scheme) self.__engine = sqlalchemy.ext.asyncio.create_async_engine( db_url, **engine_kwargs) except ModuleNotFoundError: continue logger.info( 'Using "%s" SQLAlchemy backend+driver for ' 'database accesses', scheme) break if self.__engine is None: raise PeekabooDatabaseError( f'None of the drivers for backend "{backend}" could be found: ' f'{backend_async_drivers}') self.__session_factory = sessionmaker( bind=self.__engine, class_=sqlalchemy.ext.asyncio.AsyncSession) self.instance_id = instance_id self.stale_in_flight_threshold = stale_in_flight_threshold self.retries = 5 # ultra-simple quadratic backoff: # attempt 1: 10 * 2**(1) == 10-20msecs # attempt 2: 10 * 2**(2) == 20-40msecs # attempt 3: 10 * 2**(3) == 40-80msecs # attempt 4: 10 * 2**(4) == 80-160msecs self.deadlock_backoff_base = 10 self.connect_backoff_base = 2000
def mark_sample_in_flight(self, sample, instance_id=None, start_time=None): """ Mark a sample as in flight, i.e. being worked on by an instance. @param sample: The sample to mark as in flight. @param instance_id: (optionally) The ID of the instance that is handling this sample. Default: Us. @param start_time: Override the time the marker was placed for debugging purposes. """ # an instance id of 0 denotes that we're alone and don't need to track # in-flight samples in the database if self.instance_id == 0: return True # use our own instance id if none is given if instance_id is None: instance_id = self.instance_id if start_time is None: start_time = datetime.utcnow() sha256sum = sample.sha256sum in_flight_marker = InFlightSample(sha256sum=sha256sum, instance_id=instance_id, start_time=start_time) attempt = 1 locked = False while attempt <= self.retries: # a new session needs to be constructed on each attempt session = self.__session() # try to mark this sample as in flight in an atomic insert # operation (modulo possible deadlocks with various RDBMS) session.add(in_flight_marker) try: session.commit() locked = True logger.debug('Marked sample %s as in flight', sha256sum) # duplicate primary key == entry already exists except IntegrityError: session.rollback() logger.debug( 'Sample %s is already in flight on another ' 'instance', sha256sum) except (OperationalError, DBAPIError, SQLAlchemyError) as error: session.rollback() attempt = self.was_transient_error( error, attempt, 'marking sample %s as in flight' % sha256sum) if attempt > 0: continue raise PeekabooDatabaseError( 'Unable to mark sample as in flight: %s' % error) finally: session.close() return locked return False
def __init__(self, db_url, instance_id=0, stale_in_flight_threshold=15 * 60, log_level=logging.WARNING, async_driver=None): """ Initialize the Peekaboo database handler. @param db_url: An RFC 1738 URL that points to the database. @param instance_id: A positive, unique ID differentiating this Peekaboo instance from any other instance using the same database for concurrency coordination. Value of 0 means that we're alone and have no other instances to worry about. @param stale_in_flight_threshold: Number of seconds after which a in flight marker is considered stale and deleted or ignored. @param log_level: Overrides the log level of the database modules. The idea is for the database to be silent by default and only emit log messages if switched on explictly and independently of the Peekaboo log level. @param async_driver: last resort override of the asyncio driver auto-detection """ logging.getLogger('sqlalchemy.engine').setLevel(log_level) logging.getLogger('sqlalchemy.pool').setLevel(log_level) # aiosqlite picks up the global log level unconditionally so we need to # override it as well and explicitly logging.getLogger('aiosqlite').setLevel(log_level) # <backend>[+<driver>]:// -> <backend> backend = db_url.split(':')[0].split('+')[0] connect_args = {} if backend == 'sqlite': connect_args['timeout'] = 0 self.__engine = create_engine(db_url, future=True, connect_args=connect_args) session_factory = sessionmaker(bind=self.__engine) self.__session = scoped_session(session_factory) self.__lock = threading.RLock() asyncio_drivers = { 'sqlite': ['aiosqlite'], 'mysql': ['asyncmy', 'aiomysql'], 'postgresql': ['asyncpg'], } if async_driver is not None: drivers = [async_driver] else: drivers = asyncio_drivers.get(backend) if drivers is None: raise PeekabooDatabaseError( 'Unknown database backend configured: %s' % backend) async_engine = None async_db_url = None for driver in drivers: # replace backend and driver with our asyncio alternative scheme = "%s+%s" % (backend, driver) async_db_url = ':'.join([scheme] + db_url.split(':')[1:]) try: async_engine = sqlalchemy.ext.asyncio.create_async_engine( async_db_url, connect_args=connect_args) except ModuleNotFoundError: continue logger.debug( 'Auto-detected %s SQLAlchemy backend+driver for ' 'asyncio database accesses', scheme) break if async_engine is None: raise PeekabooDatabaseError( 'None of the asyncio drivers for backend %s could be ' 'found: %s' % (backend, drivers)) self.__async_session_factory = sessionmaker( bind=async_engine, class_=sqlalchemy.ext.asyncio.AsyncSession) # no scoping necessary as we're not using asyncio across threads # special handling for sqlite: since it does not respond well to # multiple modify operations in parallel to the same database, we # serialise them through a QueuePool with only one connection self.__async_session_factory_modify = self.__async_session_factory if backend in ['sqlite']: async_engine_modify = sqlalchemy.ext.asyncio.create_async_engine( async_db_url, poolclass=sqlalchemy.pool.AsyncAdaptedQueuePool, pool_size=1, max_overflow=0) self.__async_session_factory_modify = sessionmaker( bind=async_engine_modify, class_=sqlalchemy.ext.asyncio.AsyncSession) self.instance_id = instance_id self.stale_in_flight_threshold = stale_in_flight_threshold self.retries = 5 # ultra-simple quadratic backoff: # attempt 1: 10 * 2**(1) == 10-20msecs # attempt 2: 10 * 2**(2) == 20-40msecs # attempt 3: 10 * 2**(3) == 40-80msecs # attempt 4: 10 * 2**(4) == 80-160msecs self.deadlock_backoff_base = 10 self.connect_backoff_base = 2000 attempt = 1 delay = 0 while attempt <= self.retries: with self.__lock: try: Base.metadata.create_all(self.__engine) break except (OperationalError, DBAPIError, SQLAlchemyError) as error: attempt, delay = self.was_transient_error( error, attempt, 'create metadata') if attempt < 0: raise PeekabooDatabaseError( 'Failed to create schema in database: %s' % error) time.sleep(delay)