def handle_task(self, task: ServiceTask) -> None:
    try:
        self._task = Task(task)
        self.log.info(
            f"[{self._task.sid}] Starting task for file: {self._task.sha256} ({self._task.type})"
        )
        self._task.start(
            self.service_attributes.default_result_classification,
            self.service_attributes.version, self.get_tool_version())

        self.ontologies = defaultdict(list)
        request = ServiceRequest(self._task)
        self.execute(request)
        self._attach_service_meta_ontology(request)
        self._success()
    except RuntimeError as re:
        if is_recoverable_runtime_error(re):
            new_ex = exceptions.RecoverableError(
                "Service trying to use a threadpool during shutdown")
            self._handle_execute_failure(
                new_ex, exceptions.get_stacktrace_info(re))
        else:
            self._handle_execute_failure(
                re, exceptions.get_stacktrace_info(re))
    except Exception as ex:
        self._handle_execute_failure(ex, exceptions.get_stacktrace_info(ex))
    finally:
        self._cleanup()
def delete(self, path, location='all'):
    for t in self.slice(location):
        try:
            t.delete(path)
        except Exception as ex:  # pylint: disable=W0703
            trace = get_stacktrace_info(ex)
            self.log.info('Transport problem: %s', trace)
def redispatch(self, name, sid, srl, service, reason, now):
    entry = None
    try:
        entry = self.entries[sid][srl]
    except KeyError:
        return False

    try:
        stage = self.service_manager.stage_by_name(service.name)
        d = getattr(entry, name)[stage]
        c = entry.completed_services[stage]
        if service.name in c or d and service.name in d:
            return False

        log.info("%s for %s: %s/%s", reason, service.name, sid, srl)
        self.dispatch(service, entry, now)
        return True
    except Exception as ex:  # pylint: disable=W0703
        trace = get_stacktrace_info(ex)
        log.error("Couldn't redispatch to %s for %s/%s: %s",
                  service.name, sid, srl, trace)

        response = Task(deepcopy(entry.task.raw))
        response.watermark(service.name, '')
        response.nonrecoverable_failure(trace)

        self.storage_queue.push({
            'type': 'error',
            'name': service.name,
            'response': response,
        })
        return False
def close(self):
    for t in self.transports:
        try:
            t.close()
        except Exception as ex:
            trace = get_stacktrace_info(ex)
            self.log.warning('Transport problem: %s', trace)
def select(*queues, **kw):
    timeout = kw.get('timeout', 0)
    if len(queues) < 1:
        raise TypeError('At least one queue must be specified')
    if any([type(q) != NamedQueue for q in queues]):
        raise TypeError('Only NamedQueues supported')

    c = queues[0].c
    # TODO: Can we compare two StrictRedis instances for equality?
    # (Queues are back to having their own StrictRedis instance).
    # if any([q.c != c for q in queues[1:]]):
    #     raise ValueError('All queues must share a client')

    if not timeout:
        response = retry_call(c.blpop, [q.name for q in queues], timeout)
    else:
        try:
            response = c.blpop([q.name for q in queues], timeout)
        except redis.ConnectionError as ex:
            trace = get_stacktrace_info(ex)
            log.warning('Redis connection error (4): %s', trace)
            time.sleep(timeout)
            response = None

    if not response:
        return response

    return response[0], json.loads(response[1])
def get(self, path, location='any'):
    for t in self.slice(location):
        try:
            if t.exists(path):
                return t.get(path)
        except Exception as ex:
            trace = get_stacktrace_info(ex)
            self.log.warning('Transport problem: %s', trace)
def put_batch(self, local_remote_tuples, location='all'):
    failed_tuples = []
    for (src_path, dst_path) in local_remote_tuples:
        try:
            self.put(src_path, dst_path, location)
        except Exception as ex:  # pylint: disable=W0703
            trace = get_stacktrace_info(ex)
            failed_tuples.append((src_path, dst_path, trace))
    return failed_tuples
def upload_batch(self, local_remote_tuples, location='all'):
    failed_tuples = []
    for (src_path, dst_path) in local_remote_tuples:
        try:
            self.upload(src_path, dst_path, location)
        except Exception as ex:
            trace = get_stacktrace_info(ex)
            failed_tuples.append((src_path, dst_path, trace))
    return failed_tuples
def delete(self, path: str, location='all'):
    with elasticapm.capture_span(name='delete', span_type='filestore', labels={'path': path}):
        for t in self.slice(location):
            try:
                t.delete(path)
            except Exception as ex:
                trace = get_stacktrace_info(ex)
                self.log.info('Transport problem: %s', trace)
def exists(self, path):
    path = self.normalize(path)
    self.log.debug('Checking for existence of %s', path)
    size = None
    try:
        size = self.ftp.size(path)
    except ftplib.error_perm as e:
        trace = get_stacktrace_info(e)
        # If the file doesn't exist we get a 550.
        if not e.message.startswith('550'):  # pylint:disable=E1101
            raise
    return size is not None
def listen(self):
    while True:
        self._connect()
        try:
            i = self.p.listen()
            v = next(i)
            if isinstance(v, dict) and v.get('type', '') != 'subscribe':
                yield (v)
        except redis.ConnectionError as ex:
            trace = get_stacktrace_info(ex)
            log.warning('Redis connection error (1): %s', trace)
            self._connected = False
def unpush(self, num=1):
    if num < 0:
        return []
    try:
        return [
            json.loads(s[21:])
            for s in retry_call(self.t, args=[self.name, num])
        ]
    except redis.ConnectionError as ex:
        trace = get_stacktrace_info(ex)
        log.warning('Redis connection error (6): %s', trace)
        return []
def exists(self, path, location='any') -> list[Transport]:
    transports = []
    for t in self.slice(location):
        try:
            if t.exists(path):
                transports.append(t)
                if location == 'any':
                    break
        except Exception as ex:
            trace = get_stacktrace_info(ex)
            self.log.warning('Transport problem: %s', trace)
    return transports
def pop(self, num=1):
    num -= 1
    if num < 0:
        return []
    try:
        return [
            decode(s[21:])
            for s in retry_call(self.r, args=[self.name, num])
        ]
    except redis.ConnectionError as ex:
        trace = get_stacktrace_info(ex)
        log.warning('Redis connection error (5): %s', trace)
        return []
def handle_task(self, task: ServiceTask) -> None: try: self._task = Task(task) self.log.info(f"Starting task: {self._task.sid}/{self._task.sha256} ({self._task.type})") self._task.start(self.service_attributes.default_result_classification, self.service_attributes.version, self.get_tool_version()) request = ServiceRequest(self._task) self.execute(request) self._success() except Exception as ex: self._handle_execute_failure(ex, exceptions.get_stacktrace_info(ex)) finally: self._cleanup()
def serve_forever(self):
    try:
        # Inject a message onto the controller queue.
        self.run()
    except KeyboardInterrupt:
        self.log.info('Shutting down due to signal.')
        self.stop()
    except RemoteShutdownInterrupt as ri:
        msg = 'Shutting down due to remote command: %s' % ri
        self.log.info(msg)
        self.stop()
    except Exception as ex:
        msg = 'Shutting down due to unhandled exception: %s' % get_stacktrace_info(ex)
        self.log.error(msg)
        self.stop()
def download(self, src_path, dest_path, location='any'):
    successful = False
    transports = []
    download_errors = []
    for t in self.slice(location):
        try:
            t.download(src_path, dest_path)
            transports.append(t)
            successful = True
            break
        except Exception as ex:  # pylint: disable=W0703
            download_errors.append((str(t), get_stacktrace_info(ex)))

    if not successful:
        raise FileStoreException('No transport succeeded:\n%s' %
                                 pprint.pformat(download_errors))
    return transports
def check_timeouts(self, now=None):
    if not now:
        now = time.time()

    # Make sure the right amount of time has elapsed since our last check.
    if now - self.last_check < config.system.update_interval:
        return

    self.last_check = now

    with self.lock:
        try:
            self.process_timeouts('acknowledged_services', now,
                                  'Ack timeout', self.ack_timeout)
            self.process_timeouts('completed_services', now,
                                  'Service timeout', self.service_timeout)

            timeouts = self.child_timeout
            for k, v in timeouts.iteritems():
                start = 0
                timeouts[k] = []
                for t in v:
                    # Timeouts are added to the end of their list so
                    # when we reach the first non-timed out timeout,
                    # we are done.
                    if t.time >= now:
                        break

                    # Timeouts remain active (so that we don't have
                    # to scan for them when removing tasks that have
                    # completed). So it is possible for a timeout to
                    # refer to an id that no longer exists.
                    submission = self.entries.get(t.sid, None)
                    if submission:
                        entry = submission.get(t.srl, None)
                        if entry:
                            if entry.extracted_children.pop(t.data, None):
                                log.info('Child %s of parent %s timed out',
                                         t.data, t.srl)
                                UpdateEntry(entry, now)

                    start += 1

                # Remove processed timeouts.
                timeouts[k] = v[start:] + timeouts[k]
        except Exception as ex:  # pylint: disable=W0703
            trace = get_stacktrace_info(ex)
            log.error('Problem processing timeouts: %s', trace)
def serve_forever(self):
    try:
        # Listen for our shutdown signal
        signal.signal(signal.SIGINT, self._stop_signal_handler)

        # Inject a message onto the agents queue.
        self.run()
    except KeyboardInterrupt:
        self.log.info('Shutting down due to KeyboardInterrupt.')
        self.stop()
    except RemoteShutdownInterrupt as ri:
        msg = 'Shutting down due to remote command: %s' % ri
        self.log.info(msg)
        self.stop()
    except Exception as ex:
        msg = 'Shutting down due to unhandled exception: %s' % get_stacktrace_info(ex)
        self.log.error(msg)
        self.stop()
def process(self, msg):
    func = None
    task = Task.wrap(msg)

    if not msg:
        log.warning("Got 'None' msg")
        return

    try:
        func = self.__getattribute__(task.state)
    except AttributeError:
        log.warning('Unknown message type: %s', task.state)

    try:
        func(task)
    except Exception as ex:  # pylint: disable=W0703
        trace = get_stacktrace_info(ex)
        log.error('Problem processing %s: %s', pformat(task.raw), trace)
def retry(raw, scan_key, sha256, ex):  # df node def
    current_time = now()

    notice = Notice(raw)
    retries = notice.get('retries', 0) + 1

    if retries > max_retries:
        trace = ''
        if ex and type(ex) != FileStoreException:
            trace = ': ' + get_stacktrace_info(ex)
        logger.error('Max retries exceeded for %s%s', sha256, trace)
        dupq.delete(dup_prefix + scan_key)
    elif expired(current_time - seconds(notice.get('ts', current_time)), 0):
        logger.info('No point retrying expired submission for %s', sha256)
        dupq.delete(dup_prefix + scan_key)  # df pull delete
    else:
        logger.info('Requeuing %s (%s)', sha256, ex or 'unknown')
        notice.set('retries', retries)
        notice.set('retry_at', now(retry_delay))

        retryq.push(notice.raw)  # df push push
def pop(self, name, blocking=True, timeout=0):
    if blocking:
        if not timeout:
            response = retry_call(self.c.blpop, name, timeout)
        else:
            try:
                response = self.c.blpop(name, timeout)
            except redis.ConnectionError as ex:
                trace = get_stacktrace_info(ex)
                log.warning('Redis connection error (2): %s', trace)
                time.sleep(timeout)
                response = None
    else:
        response = retry_call(self.c.lpop, name)

    if not response:
        return response

    if blocking:
        return json.loads(response[1])
    else:
        return json.loads(response)
def retry(self, task, scan_key, ex):
    current_time = now()

    retries = task.retries + 1

    if retries > _max_retries:
        trace = ''
        if ex:
            trace = ': ' + get_stacktrace_info(ex)
        self.log.error(
            f'[{task.ingest_id} :: {task.sha256}] Max retries exceeded {trace}'
        )
        self.duplicate_queue.delete(_dup_prefix + scan_key)
    elif self.expired(current_time - task.ingest_time.timestamp(), 0):
        self.log.info(
            f'[{task.ingest_id} :: {task.sha256}] No point retrying expired submission'
        )
        self.duplicate_queue.delete(_dup_prefix + scan_key)
    else:
        self.log.info(
            f'[{task.ingest_id} :: {task.sha256}] Requeuing ({ex or "unknown"})'
        )
        task.retries = retries
        self.retry_queue.push(int(now(_retry_delay)), task.json())
def try_run(self, volatile=False):
    ingester = self.ingester
    logger = self.log

    time_mark, cpu_mark = time.time(), time.process_time()

    while self.running:
        # noinspection PyBroadException
        try:
            self.heartbeat()
            ingester.counter.increment_execution_time(
                'cpu_seconds', time.process_time() - cpu_mark)
            ingester.counter.increment_execution_time(
                'busy_seconds', time.time() - time_mark)

            # Check if there is room for more submissions
            length = ingester.scanning.length()
            if length >= ingester.config.core.ingester.max_inflight:
                time.sleep(0.1)
                time_mark, cpu_mark = time.time(), time.process_time()
                continue

            raw = ingester.unique_queue.pop()
            if not raw:
                time.sleep(0.1)
                time_mark, cpu_mark = time.time(), time.process_time()
                continue

            # Start timing 'busy' time, we reset this above after the sleeps so that the sleeps
            # don't get counted as busy
            time_mark, cpu_mark = time.time(), time.process_time()

            # Start of ingest message
            if self.apm_client:
                self.apm_client.begin_transaction('ingest_msg')

            task = IngestTask(raw)

            # noinspection PyBroadException
            if any(len(file.sha256) != 64 for file in task.submission.files):
                logger.error("Malformed entry on submission queue: %s", task.ingest_id)

                # End of ingest message (invalid_hash)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'invalid_hash')
                continue

            # If between the initial ingestion and now the drop/whitelist status
            # of this submission has changed, then drop it now
            if ingester.drop(task):
                # End of ingest message (dropped)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'dropped')
                continue

            if ingester.is_whitelisted(task):
                # End of ingest message (whitelisted)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'whitelisted')
                continue

            # Check if this file has been previously processed.
            pprevious, previous, score, scan_key = None, False, None, None
            if not task.submission.params.ignore_cache:
                pprevious, previous, score, scan_key = ingester.check(task)
            else:
                scan_key = ingester.stamp_filescore_key(task)

            # If it HAS been previously processed, we are dealing with a resubmission
            # finalize will decide what to do, and put the task back in the queue
            # rewritten properly if we are going to run it again
            if previous:
                if not task.submission.params.services.resubmit and not pprevious:
                    logger.warning(f"No psid for what looks like a resubmission of "
                                   f"{task.submission.files[0].sha256}: {scan_key}")
                ingester.finalize(pprevious, previous, score, task)

                # End of ingest message (finalized)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'finalized')
                continue

            # We have decided this file is worth processing

            # Add the task to the scanning table, this is atomic across all submit
            # workers, so if it fails, someone beat us to the punch, record the file
            # as a duplicate then.
            if not ingester.scanning.add(scan_key, task.as_primitives()):
                logger.debug('Duplicate %s', task.submission.files[0].sha256)
                ingester.counter.increment('duplicates')
                ingester.duplicate_queue.push(_dup_prefix + scan_key, task.as_primitives())

                # End of ingest message (duplicate)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'duplicate')
                continue

            # We have managed to add the task to the scan table, so now we go
            # ahead with the submission process
            try:
                ingester.submit(task)

                # End of ingest message (submitted)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'submitted')
                continue
            except Exception as _ex:
                # For some reason (contained in `ex`) we have failed the submission
                # The rest of this function is error handling/recovery
                ex = _ex
                traceback = _ex.__traceback__

            ingester.counter.increment('error')

            should_retry = True
            if isinstance(ex, CorruptedFileStoreException):
                logger.error("Submission for file '%s' failed due to corrupted filestore: %s"
                             % (task.sha256, str(ex)))
                should_retry = False
            elif isinstance(ex, DataStoreException):
                trace = exceptions.get_stacktrace_info(ex)
                logger.error("Submission for file '%s' failed due to data store error:\n%s"
                             % (task.sha256, trace))
            elif not isinstance(ex, FileStoreException):
                trace = exceptions.get_stacktrace_info(ex)
                logger.error("Submission for file '%s' failed: %s" % (task.sha256, trace))

            task = IngestTask(ingester.scanning.pop(scan_key))
            if not task:
                logger.error('No scanning entry for %s', task.sha256)

                # End of ingest message (no_scan_entry)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'no_scan_entry')
                continue

            if not should_retry:
                # End of ingest message (cannot_retry)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'cannot_retry')
                continue

            ingester.retry(task, scan_key, ex)

            # End of ingest message (retry)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_submit', 'retried')

            if volatile:
                raise ex.with_traceback(traceback)

        except Exception:
            logger.exception("Unexpected error")

            # End of ingest message (exception)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_submit', 'exception')

            if volatile:
                raise
def handle_submit(self):
    time_mark, cpu_mark = time.time(), time.process_time()

    while self.running:
        # noinspection PyBroadException
        try:
            self.counter.increment_execution_time('cpu_seconds',
                                                  time.process_time() - cpu_mark)
            self.counter.increment_execution_time('busy_seconds',
                                                  time.time() - time_mark)

            # Check if there is room for more submissions
            length = self.scanning.length()
            if length >= self.config.core.ingester.max_inflight:
                self.sleep(0.1)
                time_mark, cpu_mark = time.time(), time.process_time()
                continue

            raw = self.unique_queue.blocking_pop(timeout=3)
            time_mark, cpu_mark = time.time(), time.process_time()
            if not raw:
                continue

            # Start of ingest message
            if self.apm_client:
                self.apm_client.begin_transaction('ingest_msg')

            task = IngestTask(raw)

            # Check if we need to drop a file for capacity reasons, but only if the
            # number of files in flight is already over 80%
            if length >= self.config.core.ingester.max_inflight * 0.8 and self.drop(task):
                # End of ingest message (dropped)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'dropped')
                continue

            if self.is_whitelisted(task):
                # End of ingest message (whitelisted)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'whitelisted')
                continue

            # Check if this file has been previously processed.
            pprevious, previous, score, scan_key = None, None, None, None
            if not task.submission.params.ignore_cache:
                pprevious, previous, score, scan_key = self.check(task)
            else:
                scan_key = self.stamp_filescore_key(task)

            # If it HAS been previously processed, we are dealing with a resubmission
            # finalize will decide what to do, and put the task back in the queue
            # rewritten properly if we are going to run it again
            if previous:
                if not task.submission.params.services.resubmit and not pprevious:
                    self.log.warning(f"No psid for what looks like a resubmission of "
                                     f"{task.submission.files[0].sha256}: {scan_key}")
                self.finalize(pprevious, previous, score, task)

                # End of ingest message (finalized)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'finalized')
                continue

            # We have decided this file is worth processing

            # Add the task to the scanning table, this is atomic across all submit
            # workers, so if it fails, someone beat us to the punch, record the file
            # as a duplicate then.
            if not self.scanning.add(scan_key, task.as_primitives()):
                self.log.debug('Duplicate %s', task.submission.files[0].sha256)
                self.counter.increment('duplicates')
                self.duplicate_queue.push(_dup_prefix + scan_key, task.as_primitives())

                # End of ingest message (duplicate)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'duplicate')
                continue

            # We have managed to add the task to the scan table, so now we go
            # ahead with the submission process
            try:
                self.submit(task)

                # End of ingest message (submitted)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'submitted')
                continue
            except Exception as _ex:
                # For some reason (contained in `ex`) we have failed the submission
                # The rest of this function is error handling/recovery
                ex = _ex
                # traceback = _ex.__traceback__

            self.counter.increment('error')

            should_retry = True
            if isinstance(ex, CorruptedFileStoreException):
                self.log.error("Submission for file '%s' failed due to corrupted "
                               "filestore: %s" % (task.sha256, str(ex)))
                should_retry = False
            elif isinstance(ex, DataStoreException):
                trace = exceptions.get_stacktrace_info(ex)
                self.log.error("Submission for file '%s' failed due to "
                               "data store error:\n%s" % (task.sha256, trace))
            elif not isinstance(ex, FileStoreException):
                trace = exceptions.get_stacktrace_info(ex)
                self.log.error("Submission for file '%s' failed: %s" % (task.sha256, trace))

            task = IngestTask(self.scanning.pop(scan_key))
            if not task:
                self.log.error('No scanning entry for %s', task.sha256)

                # End of ingest message (no_scan_entry)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'no_scan_entry')
                continue

            if not should_retry:
                # End of ingest message (cannot_retry)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'cannot_retry')
                continue

            self.retry(task, scan_key, ex)

            # End of ingest message (retry)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_submit', 'retried')

        except Exception:
            self.log.exception("Unexpected error")

            # End of ingest message (exception)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_submit', 'exception')
def submitter():  # df node def
    client = forge.get_submission_service()
    datastore = forge.get_datastore()

    while running:
        try:
            raw = submissionq.pop(timeout=1)  # df pull pop
            if not raw:
                continue

            # noinspection PyBroadException
            try:
                sha256 = raw['sha256']
            except Exception:  # pylint: disable=W0703
                logger.exception("Malformed entry on submission queue:")
                continue

            if not sha256:
                logger.error("Malformed entry on submission queue: %s", raw)
                continue

            notice = Notice(raw)
            if drop(notice):  # df push calls
                continue

            if is_whitelisted(notice):  # df push calls
                continue

            pprevious, previous, score = None, False, None
            if not notice.get('ignore_cache', False):
                pprevious, previous, score, scan_key = check(datastore, notice)

            if previous:
                if not notice.get('resubmit_to', []) and not pprevious:
                    logger.warning("No psid for what looks like a resubmission of %s: %s",
                                   sha256, scan_key)
                finalize(pprevious, previous, score, notice)  # df push calls
                continue

            with ScanLock(scan_key):
                if scanning.exists(scan_key):
                    logger.debug('Duplicate %s', sha256)
                    ingester_counts.increment('ingest.duplicates')
                    dupq.push(dup_prefix + scan_key, notice.raw)  # df push push
                    continue

                scanning.add(scan_key, notice.raw)  # df push add

            ex = return_exception(submit, client, notice)
            if not ex:
                continue

            ingester_counts.increment('ingest.error')

            should_retry = True
            tex = type(ex)
            if tex == FileStoreException:
                ex = tex("Problem with file: %s" % sha256)
            elif tex == CorruptedFileStoreException:
                logger.error("Submission failed due to corrupted filestore: %s" % ex.message)
                should_retry = False
            else:
                trace = get_stacktrace_info(ex)
                logger.error("Submission failed: %s", trace)

            raw = scanning.pop(scan_key)
            if not raw:
                logger.error('No scanning entry for %s', sha256)
                continue

            if not should_retry:
                continue

            retry(raw, scan_key, sha256, ex)

            if tex == riak.RiakError:
                raise ex  # pylint: disable=E0702

        except Exception:  # pylint:disable=W0703
            logger.exception("Unexpected error")