def resubmit_submission_for_analysis(sid, *args, **kwargs):
    """
    Resubmit a submission for analysis with the exact same parameters as before

    Variables:
    sid         => Submission ID to re-submit

    Arguments:
    None

    Data Block:
    None

    Result example:
    # Submission message object as a json dictionary
    """
    user = kwargs['user']
    submission = STORAGE.submission.get(sid, as_obj=False)

    if submission:
        if not Classification.is_accessible(user['classification'], submission['classification']):
            return make_api_response(
                "", "You are not allowed to re-submit a submission that you don't have access to", 403)

        submission_params = submission['params']
        submission_params['classification'] = submission['classification']
    else:
        return make_api_response({}, "Submission %s does not exists." % sid, status_code=404)

    submission_params['submitter'] = user['uname']
    submission_params['description'] = "Resubmit %s for analysis" % ", ".join(
        [x['name'] for x in submission["files"]])

    try:
        submission_obj = Submission({
            "files": submission["files"],
            "params": submission_params
        })
    except (ValueError, KeyError) as e:
        return make_api_response("", err=str(e), status_code=400)

    with forge.get_filestore() as f_transport:
        try:
            submit_result = SubmissionClient(
                datastore=STORAGE, filestore=f_transport, config=config).submit(submission_obj)
        except SubmissionException as e:
            return make_api_response("", err=str(e), status_code=400)

    return make_api_response(submit_result.as_primitives())
class Ingester: """Internal interface to the ingestion queues.""" def __init__(self, datastore, logger, classification=None, redis=None, persistent_redis=None, metrics_name='ingester'): self.datastore = datastore self.log = logger # Cache the user groups self.cache_lock = threading.RLock( ) # TODO are middle man instances single threaded now? self._user_groups = {} self._user_groups_reset = time.time() // HOUR_IN_SECONDS self.cache = {} self.notification_queues = {} self.whitelisted = {} self.whitelisted_lock = threading.RLock() # Create a config cache that will refresh config values periodically self.config = forge.CachedObject(forge.get_config) # Module path parameters are fixed at start time. Changing these involves a restart self.is_low_priority = load_module_by_path( self.config.core.ingester.is_low_priority) self.get_whitelist_verdict = load_module_by_path( self.config.core.ingester.get_whitelist_verdict) self.whitelist = load_module_by_path( self.config.core.ingester.whitelist) # Constants are loaded based on a non-constant path, so has to be done at init rather than load constants = forge.get_constants(self.config) self.priority_value = constants.PRIORITIES self.priority_range = constants.PRIORITY_RANGES self.threshold_value = constants.PRIORITY_THRESHOLDS # Connect to the redis servers self.redis = redis or get_client( host=self.config.core.redis.nonpersistent.host, port=self.config.core.redis.nonpersistent.port, private=False, ) self.persistent_redis = persistent_redis or get_client( host=self.config.core.redis.persistent.host, port=self.config.core.redis.persistent.port, private=False, ) # Classification engine self.ce = classification or forge.get_classification() # Metrics gathering factory self.counter = MetricsFactory(metrics_type='ingester', schema=Metrics, redis=self.redis, config=self.config, name=metrics_name) # State. The submissions in progress are stored in Redis in order to # persist this state and recover in case we crash. self.scanning = Hash('m-scanning-table', self.persistent_redis) # Input. The dispatcher creates a record when any submission completes. self.complete_queue = NamedQueue(_completeq_name, self.redis) # Internal. Dropped entries are placed on this queue. # self.drop_queue = NamedQueue('m-drop', self.persistent_redis) # Input. An external process places submission requests on this queue. self.ingest_queue = NamedQueue(INGEST_QUEUE_NAME, self.persistent_redis) # Output. Duplicate our input traffic into this queue so it may be cloned by other systems self.traffic_queue = CommsQueue('submissions', self.redis) # Internal. Unique requests are placed in and processed from this queue. self.unique_queue = PriorityQueue('m-unique', self.persistent_redis) # Internal, delay queue for retrying self.retry_queue = PriorityQueue('m-retry', self.persistent_redis) # Internal, timeout watch queue self.timeout_queue = PriorityQueue('m-timeout', self.redis) # Internal, queue for processing duplicates # When a duplicate file is detected (same cache key => same file, and same # submission parameters) the file won't be ingested normally, but instead a reference # will be written to a duplicate queue. Whenever a file is finished, in the complete # method, not only is the original ingestion finalized, but all entries in the duplicate queue # are finalized as well. This has the effect that all concurrent ingestion of the same file # are 'merged' into a single submission to the system. self.duplicate_queue = MultiQueue(self.persistent_redis) # Output. 
submissions that should have alerts generated self.alert_queue = NamedQueue(ALERT_QUEUE_NAME, self.persistent_redis) # Utility object to help submit tasks to dispatching self.submit_client = SubmissionClient(datastore=self.datastore, redis=self.redis) def get_groups_from_user(self, username: str) -> List[str]: # Reset the group cache at the top of each hour if time.time() // HOUR_IN_SECONDS > self._user_groups_reset: self._user_groups = {} self._user_groups_reset = time.time() // HOUR_IN_SECONDS # Get the groups for this user if not known if username not in self._user_groups: user_data = self.datastore.user.get(username) if user_data: self._user_groups[username] = user_data.groups else: self._user_groups[username] = [] return self._user_groups[username] def ingest(self, task: IngestTask): self.log.info( f"[{task.ingest_id} :: {task.sha256}] Task received for processing" ) # Load a snapshot of ingest parameters as of right now. max_file_size = self.config.submission.max_file_size param = task.params self.counter.increment('bytes_ingested', increment_by=task.file_size) self.counter.increment('submissions_ingested') if any(len(file.sha256) != 64 for file in task.submission.files): self.log.error( f"[{task.ingest_id} :: {task.sha256}] Invalid sha256, skipped") self.send_notification(task, failure="Invalid sha256", logfunc=self.log.warning) return # Clean up metadata strings, since we may delete some, iterate on a copy of the keys for key in list(task.submission.metadata.keys()): value = task.submission.metadata[key] meta_size = len(value) if meta_size > self.config.submission.max_metadata_length: self.log.info( f'[{task.ingest_id} :: {task.sha256}] ' f'Removing {key} from metadata because value is too big') task.submission.metadata.pop(key) if task.file_size > max_file_size and not task.params.ignore_size and not task.params.never_drop: task.failure = f"File too large ({task.file_size} > {max_file_size})" self._notify_drop(task) self.counter.increment('skipped') self.log.error( f"[{task.ingest_id} :: {task.sha256}] {task.failure}") return # Set the groups from the user, if they aren't already set if not task.params.groups: task.params.groups = self.get_groups_from_user( task.params.submitter) # Check if this file is already being processed pprevious, previous, score = None, False, None if not param.ignore_cache: pprevious, previous, score, _ = self.check(task) # Assign priority. low_priority = self.is_low_priority(task) priority = param.priority if priority < 0: priority = self.priority_value['medium'] if score is not None: priority = self.priority_value['low'] for level, threshold in self.threshold_value.items(): if score >= threshold: priority = self.priority_value[level] break elif low_priority: priority = self.priority_value['low'] # Reduce the priority by an order of magnitude for very old files. current_time = now() if priority and self.expired( current_time - task.submission.time.timestamp(), 0): priority = (priority / 10) or 1 param.priority = priority # Do this after priority has been assigned. # (So we don't end up dropping the resubmission). 
if previous: self.counter.increment('duplicates') self.finalize(pprevious, previous, score, task) return if self.drop(task): self.log.info(f"[{task.ingest_id} :: {task.sha256}] Dropped") return if self.is_whitelisted(task): self.log.info(f"[{task.ingest_id} :: {task.sha256}] Whitelisted") return self.unique_queue.push(priority, task.as_primitives()) def check(self, task: IngestTask): key = self.stamp_filescore_key(task) with self.cache_lock: result = self.cache.get(key, None) if result: self.counter.increment('cache_hit_local') self.log.info( f'[{task.ingest_id} :: {task.sha256}] Local cache hit') else: result = self.datastore.filescore.get(key) if result: self.counter.increment('cache_hit') self.log.info( f'[{task.ingest_id} :: {task.sha256}] Remote cache hit') else: self.counter.increment('cache_miss') return None, False, None, key with self.cache_lock: self.cache[key] = result current_time = now() age = current_time - result.time errors = result.errors if self.expired(age, errors): self.log.info( f"[{task.ingest_id} :: {task.sha256}] Cache hit dropped, cache has expired" ) self.counter.increment('cache_expired') self.cache.pop(key, None) self.datastore.filescore.delete(key) return None, False, None, key elif self.stale(age, errors): self.log.info( f"[{task.ingest_id} :: {task.sha256}] Cache hit dropped, cache is stale" ) self.counter.increment('cache_stale') return None, False, result.score, key return result.psid, result.sid, result.score, key def stale(self, delta: float, errors: int): if errors: return delta >= self.config.core.ingester.incomplete_stale_after_seconds else: return delta >= self.config.core.ingester.stale_after_seconds @staticmethod def stamp_filescore_key(task: IngestTask, sha256=None): if not sha256: sha256 = task.submission.files[0].sha256 key = task.scan_key if not key: key = task.params.create_filescore_key(sha256) task.scan_key = key return key def completed(self, sub): """Invoked when notified that a submission has completed.""" # There is only one file in the submissions we have made sha256 = sub.files[0].sha256 scan_key = sub.params.create_filescore_key(sha256) raw = self.scanning.pop(scan_key) psid = sub.params.psid score = sub.max_score sid = sub.sid if not raw: # Some other worker has already popped the scanning queue? self.log.warning( f"[{sub.metadata.get('ingest_id', 'unknown')} :: {sha256}] " f"Submission completed twice") return scan_key task = IngestTask(raw) task.submission.sid = sid errors = sub.error_count file_count = sub.file_count self.counter.increment('submissions_completed') self.counter.increment('files_completed', increment_by=file_count) self.counter.increment('bytes_completed', increment_by=task.file_size) with self.cache_lock: fs = self.cache[scan_key] = FileScore({ 'expiry_ts': now(self.config.core.ingester.cache_dtl * 24 * 60 * 60), 'errors': errors, 'psid': psid, 'score': score, 'sid': sid, 'time': now(), }) self.datastore.filescore.save(scan_key, fs) self.finalize(psid, sid, score, task) def exhaust() -> Iterable[IngestTask]: while True: res = self.duplicate_queue.pop(_dup_prefix + scan_key, blocking=False) if res is None: break res = IngestTask(res) res.submission.sid = sid yield res # You may be tempted to remove the assignment to dups and use the # value directly in the for loop below. That would be a mistake. # The function finalize may push on the duplicate queue which we # are pulling off and so condensing those two lines creates a # potential infinite loop. 
dups = [dup for dup in exhaust()] for dup in dups: self.finalize(psid, sid, score, dup) return scan_key def send_notification(self, task: IngestTask, failure=None, logfunc=None): if logfunc is None: logfunc = self.log.info if failure: task.failure = failure failure = task.failure if failure: logfunc("%s: %s", failure, str(task.json())) if not task.submission.notification.queue: return note_queue = _notification_queue_prefix + task.submission.notification.queue threshold = task.submission.notification.threshold if threshold is not None and task.score is not None and task.score < threshold: return q = self.notification_queues.get(note_queue, None) if not q: self.notification_queues[note_queue] = q = NamedQueue( note_queue, self.persistent_redis) q.push(task.as_primitives()) def expired(self, delta: float, errors) -> bool: if errors: return delta >= self.config.core.ingester.incomplete_expire_after_seconds else: return delta >= self.config.core.ingester.expire_after def drop(self, task: IngestTask) -> bool: priority = task.params.priority sample_threshold = self.config.core.ingester.sampling_at dropped = False if priority <= _min_priority: dropped = True else: for level, rng in self.priority_range.items(): if rng[0] <= priority <= rng[1] and level in sample_threshold: dropped = must_drop(self.unique_queue.count(*rng), sample_threshold[level]) break if not dropped: if task.file_size > self.config.submission.max_file_size or task.file_size == 0: dropped = True if task.params.never_drop or not dropped: return False task.failure = 'Skipped' self._notify_drop(task) self.counter.increment('skipped') return True def _notify_drop(self, task: IngestTask): self.send_notification(task) c12n = task.params.classification expiry = now_as_iso(86400) sha256 = task.submission.files[0].sha256 self.datastore.save_or_freshen_file(sha256, {'sha256': sha256}, expiry, c12n, redis=self.redis) def is_whitelisted(self, task: IngestTask): reason, hit = self.get_whitelist_verdict(self.whitelist, task) hit = {x: dotdump(safe_str(y)) for x, y in hit.items()} sha256 = task.submission.files[0].sha256 if not reason: with self.whitelisted_lock: reason = self.whitelisted.get(sha256, None) if reason: hit = 'cached' if reason: if hit != 'cached': with self.whitelisted_lock: self.whitelisted[sha256] = reason task.failure = "Whitelisting due to reason %s (%s)" % (dotdump( safe_str(reason)), hit) self._notify_drop(task) self.counter.increment('whitelisted') return reason def submit(self, task: IngestTask): self.submit_client.submit( submission_obj=task.submission, completed_queue=_completeq_name, ) self.timeout_queue.push(int(now(_max_time)), task.scan_key) self.log.info( f"[{task.ingest_id} :: {task.sha256}] Submitted to dispatcher for analysis" ) def retry(self, task, scan_key, ex): current_time = now() retries = task.retries + 1 if retries > _max_retries: trace = '' if ex: trace = ': ' + get_stacktrace_info(ex) self.log.error( f'[{task.ingest_id} :: {task.sha256}] Max retries exceeded {trace}' ) self.duplicate_queue.delete(_dup_prefix + scan_key) elif self.expired(current_time - task.ingest_time.timestamp(), 0): self.log.info( f'[{task.ingest_id} :: {task.sha256}] No point retrying expired submission' ) self.duplicate_queue.delete(_dup_prefix + scan_key) else: self.log.info( f'[{task.ingest_id} :: {task.sha256}] Requeuing ({ex or "unknown"})' ) task.retries = retries self.retry_queue.push(int(now(_retry_delay)), task.json()) def finalize(self, psid, sid, score, task: IngestTask): self.log.info(f"[{task.ingest_id} :: 
{task.sha256}] Completed") if psid: task.params.psid = psid task.score = score task.submission.sid = sid selected = task.params.services.selected resubmit_to = task.params.services.resubmit resubmit_selected = determine_resubmit_selected(selected, resubmit_to) will_resubmit = resubmit_selected and should_resubmit(score) if will_resubmit: task.extended_scan = 'submitted' task.params.psid = None if self.is_alert(task, score): self.log.info( f"[{task.ingest_id} :: {task.sha256}] Notifying alerter " f"to {'update' if will_resubmit else 'create'} an alert") self.alert_queue.push(task.as_primitives()) self.send_notification(task) if will_resubmit: self.log.info( f"[{task.ingest_id} :: {task.sha256}] Resubmitted for extended analysis" ) task.params.psid = sid task.submission.sid = None task.params.services.resubmit = [] task.scan_key = None task.params.services.selected = resubmit_selected self.unique_queue.push(task.params.priority, task.as_primitives()) def is_alert(self, task: IngestTask, score): if not task.params.generate_alert: return False if score < self.threshold_value['critical']: return False return True
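# --- Illustrative sketch (not part of the Ingester) ------------------------------------
# ingest() above picks a priority for unprioritized tasks from the cached score: start
# at 'low' and bump up to the first PRIORITY_THRESHOLDS level the score reaches,
# otherwise fall back to 'medium'. The constant values below are assumptions for
# demonstration only; the real ones come from forge.get_constants(config).
PRIORITIES = {'low': 100, 'medium': 200, 'high': 300, 'critical': 400}      # assumed values
PRIORITY_THRESHOLDS = {'critical': 500, 'high': 100}                        # assumed values


def pick_priority(requested: int, cached_score=None, low_priority_source=False) -> int:
    """Mirror of the priority selection logic in Ingester.ingest()."""
    priority = requested
    if priority < 0:
        priority = PRIORITIES['medium']
        if cached_score is not None:
            priority = PRIORITIES['low']
            for level, threshold in PRIORITY_THRESHOLDS.items():
                if cached_score >= threshold:
                    priority = PRIORITIES[level]
                    break
        elif low_priority_source:
            priority = PRIORITIES['low']
    return priority


assert pick_priority(-1, cached_score=650) == PRIORITIES['critical']
assert pick_priority(-1, cached_score=10) == PRIORITIES['low']
assert pick_priority(150) == 150  # explicit priorities are left untouched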
def resubmit_for_dynamic(sha256, *args, **kwargs):
    """
    Resubmit a file for dynamic analysis

    Variables:
    sha256         => Resource locator (SHA256)

    Arguments (Optional):
    copy_sid    => Mimic the attributes of this SID.
    name        => Name of the file for the submission

    Data Block:
    None

    Result example:
    # Submission message object as a json dictionary
    """
    user = kwargs['user']
    copy_sid = request.args.get('copy_sid', None)
    name = request.args.get('name', sha256)

    if copy_sid:
        submission = STORAGE.submission.get(copy_sid, as_obj=False)
    else:
        submission = None

    if submission:
        if not Classification.is_accessible(user['classification'], submission['classification']):
            return make_api_response(
                "", "You are not allowed to re-submit a submission that you don't have access to", 403)

        submission_params = submission['params']
        submission_params['classification'] = submission['classification']
    else:
        submission_params = ui_to_submission_params(
            STORAGE.user_settings.get(user['uname'], as_obj=False))

    with forge.get_filestore() as f_transport:
        if not f_transport.exists(sha256):
            return make_api_response(
                {}, "File %s cannot be found on the server therefore it cannot be resubmitted." % sha256,
                status_code=404)

        files = [{'name': name, 'sha256': sha256}]

        submission_params['submitter'] = user['uname']
        if 'priority' not in submission_params:
            submission_params['priority'] = 500
        submission_params['description'] = "Resubmit %s for Dynamic Analysis" % name

        if "Dynamic Analysis" not in submission_params['services']['selected']:
            submission_params['services']['selected'].append("Dynamic Analysis")

        try:
            submission_obj = Submission({
                "files": files,
                "params": submission_params
            })
        except (ValueError, KeyError) as e:
            return make_api_response("", err=str(e), status_code=400)

        try:
            submit_result = SubmissionClient(
                datastore=STORAGE, filestore=f_transport, config=config).submit(submission_obj)
        except SubmissionException as e:
            return make_api_response("", err=str(e), status_code=400)

    return make_api_response(submit_result.as_primitives())
def submit(**kwargs): """ Submit a single file, sha256 or url for analysis Note 1: If you are submitting a sh256 or a URL, you must use the application/json encoding and one of sha256 or url parameters must be included in the data block. Note 2: If you are submitting a file directly, you have to use multipart/form-data encoding this was done to reduce the memory footprint and speedup file transfers ** Read documentation of mime multipart standard if your library does not support it** The multipart/form-data for sending binary has two parts: - The first part contains a JSON dump of the optional params and uses the name 'json' - The last part conatins the file binary, uses the name 'bin' and includes a filename Variables: None Arguments: None Data Block (SHA256 or URL): { // REQUIRED: One of the two following "sha256": "123...DEF", # SHA256 hash of the file already in the datastore "url": "http://...", # Url to fetch the file from // OPTIONAL VALUES "name": "file.exe", # Name of the file to scan otherwise the sha256 or base file of the url "metadata": { # Submission metadata "key": val, # Key/Value pair metadata values }, "params": { # Submission parameters "key": val, # Key/Value pair for params that different then defaults }, # Default params can be fetch at /api/v3/user/submission_params/<user>/ } Data Block (Binary): --0b34a3c50d3c02dd804a172329a0b2aa <-- Randomly generated boundary for this http request Content-Disposition: form-data; name="json" <-- JSON data blob part (only previous optional values valid) {"metadata": {"hello": "world"}} --0b34a3c50d3c02dd804a172329a0b2aa <-- Switch to next part, file part Content-Disposition: form-data; name="bin"; filename="name_of_the_file_to_scan.bin" <BINARY DATA OF THE FILE TO SCAN... DOES NOT NEED TO BE ENCODDED> --0b34a3c50d3c02dd804a172329a0b2aa-- <-- End of HTTP transmission Result example: <Submission message object as a json dictionary> """ user = kwargs['user'] quota_error = check_submission_quota(user) if quota_error: return make_api_response("", quota_error, 503) out_dir = os.path.join(TEMP_SUBMIT_DIR, get_random_id()) with forge.get_filestore() as f_transport: try: # Get data block and binary blob if 'multipart/form-data' in request.content_type: if 'json' in request.values: data = json.loads(request.values['json']) else: data = {} binary = request.files['bin'] name = data.get("name", binary.filename) sha256 = None url = None elif 'application/json' in request.content_type: data = request.json binary = None sha256 = data.get('sha256', None) url = data.get('url', None) name = data.get( "name", None) or sha256 or os.path.basename(url) or None else: return make_api_response({}, "Invalid content type", 400) if data is None: return make_api_response({}, "Missing data block", 400) if not name: return make_api_response({}, "Filename missing", 400) name = os.path.basename(name) if not name: return make_api_response({}, "Invalid filename", 400) # Create task object if "ui_params" in data: s_params = ui_to_submission_params(data['ui_params']) else: s_params = ui_to_submission_params( STORAGE.user_settings.get(user['uname'], as_obj=False)) if not s_params: s_params = get_default_user_settings(user) s_params.update(data.get("params", {})) if 'groups' not in s_params: s_params['groups'] = user['groups'] s_params['quota_item'] = True s_params['submitter'] = user['uname'] if not s_params['description']: s_params['description'] = "Inspection of file: %s" % name if not Classification.is_accessible(user['classification'], s_params['classification']): 
return make_api_response( {}, "You cannot start a scan with higher " "classification then you're allowed to see", 400) # Prepare the output directory try: os.makedirs(out_dir) except Exception: pass out_file = os.path.join(out_dir, name) # Get the output file extra_meta = {} if not binary: if sha256: if f_transport.exists(sha256): f_transport.download(sha256, out_file) else: return make_api_response( {}, "SHA256 does not exist in our datastore", 404) else: if url: if not config.ui.allow_url_submissions: return make_api_response( {}, "URL submissions are disabled in this system", 400) try: safe_download(url, out_file) extra_meta['submitted_url'] = url except FileTooBigException: return make_api_response( {}, "File too big to be scanned.", 400) except InvalidUrlException: return make_api_response( {}, "Url provided is invalid.", 400) except ForbiddenLocation: return make_api_response( {}, "Hostname in this URL cannot be resolved.", 400) else: return make_api_response( {}, "Missing file to scan. No binary, sha256 or url provided.", 400) else: with open(out_file, "wb") as my_file: my_file.write(binary.read()) try: metadata = flatten(data.get('metadata', {})) metadata.update(extra_meta) submission_obj = Submission({ "files": [], "metadata": metadata, "params": s_params }) except (ValueError, KeyError) as e: return make_api_response("", err=str(e), status_code=400) # Submit the task to the system try: result = SubmissionClient(datastore=STORAGE, filestore=f_transport, config=config).submit( submission_obj, local_files=[out_file], cleanup=False) except SubmissionException as e: return make_api_response("", err=str(e), status_code=400) return make_api_response(result.as_primitives()) finally: try: # noinspection PyUnboundLocalVariable os.unlink(out_file) except Exception: pass try: shutil.rmtree(out_dir, ignore_errors=True) except Exception: pass
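# --- Hedged client-side sketch ----------------------------------------------------------
# The submit() docstring above describes two encodings: multipart/form-data with a
# 'json' part and a 'bin' file part, or application/json with a sha256/url. The endpoint
# path below is an assumption for this sketch and authentication is omitted; only the
# part names and data-block fields come from the docstring.
import json
import requests

SUBMIT_URL = "https://assemblyline.example.com/api/v4/submit/"  # assumed route

# 1) Direct file upload: JSON blob in the 'json' part, raw bytes in the 'bin' part.
with open("sample.exe", "rb") as fh:
    resp = requests.post(
        SUBMIT_URL,
        data={"json": json.dumps({"name": "sample.exe",
                                  "metadata": {"source": "manual"},
                                  "params": {"ignore_cache": True}})},
        files={"bin": ("sample.exe", fh)},
    )

# 2) Hash/URL submission: plain JSON body, one of 'sha256' or 'url' is required.
resp = requests.post(SUBMIT_URL, json={"sha256": "0" * 64, "name": "sample.exe"})
print(resp.json())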
def start_ui_submission(ui_sid, **kwargs): """ Start UI submission. Starts processing after files where uploaded to the server. Variables: ui_sid => UUID for the current UI file upload Arguments: None Data Block (REQUIRED): Dictionary of UI specific user settings Result example: { 'started': True, # Has the submission started processing? 'sid' : "c7668cfa-...-c4132285142e" # Submission ID } """ user = kwargs['user'] quota_error = check_submission_quota(user) if quota_error: return make_api_response("", quota_error, 503) ui_params = request.json ui_params['groups'] = kwargs['user']['groups'] ui_params['quota_item'] = True ui_params['submitter'] = user['uname'] if not Classification.is_accessible(user['classification'], ui_params['classification']): return make_api_response({ "started": False, "sid": None }, "You cannot start a scan with higher " "classification then you're allowed to see", 403) request_files = [] request_dirs = [] fnames = [] try: flist = glob.glob(TEMP_DIR + ui_sid + "*") if len(flist) > 0: # Generate file list for fpath in flist: request_dirs.append(fpath) files = os.listdir(fpath) for myfile in files: request_files.append(os.path.join(fpath, myfile)) if myfile not in fnames: fnames.append(myfile) if not ui_params['description']: ui_params['description'] = "Inspection of file%s: %s" % ( { True: "s", False: "" }[len(fnames) > 1], ", ".join(fnames)) # Submit to dispatcher try: submission_obj = Submission({ "files": [], "params": ui_to_submission_params(ui_params) }) except (ValueError, KeyError) as e: return make_api_response("", err=str(e), status_code=400) with forge.get_filestore() as f_transport: try: result = SubmissionClient(datastore=STORAGE, filestore=f_transport, config=config).submit( submission_obj, local_files=request_files, cleanup=False) except SubmissionException as e: return make_api_response("", err=str(e), status_code=400) return make_api_response({"started": True, "sid": result.sid}) else: return make_api_response({ "started": False, "sid": None }, "No files where found for ID %s. " "Try again..." % ui_sid, 404) finally: # Remove files for myfile in request_files: try: os.unlink(myfile) except Exception: pass # Remove dirs for fpath in request_dirs: try: os.rmdir(fpath) except Exception: pass
def start_ui_submission(ui_sid, **kwargs): """ Start UI submission. Starts processing after files where uploaded to the server. Variables: ui_sid => UUID for the current UI file upload Arguments: None Data Block (REQUIRED): Dictionary of UI specific user settings Result example: { 'started': True, # Has the submission started processing? 'sid' : "c7668cfa-...-c4132285142e" # Submission ID } """ user = kwargs['user'] ui_params = request.json ui_params['groups'] = kwargs['user']['groups'] ui_params['quota_item'] = True ui_params['submitter'] = user['uname'] if not Classification.is_accessible(user['classification'], ui_params['classification']): return make_api_response({ "started": False, "sid": None }, "You cannot start a scan with higher " "classification then you're allowed to see", 403) quota_error = check_submission_quota(user) if quota_error: return make_api_response("", quota_error, 503) submit_result = None submitted_file = None try: # Download the file from the cache with forge.get_cachestore("flowjs", config) as cache: ui_sid = get_cache_name(ui_sid) if cache.exists(ui_sid): target_dir = os.path.join(TEMP_DIR, ui_sid) os.makedirs(target_dir, exist_ok=True) target_file = os.path.join(target_dir, ui_params.pop('filename', ui_sid)) if os.path.exists(target_file): os.unlink(target_file) # Save the reconstructed file cache.download(ui_sid, target_file) submitted_file = target_file # Submit the file if submitted_file is not None: with open(submitted_file, 'rb') as fh: if is_cart(fh.read(256)): meta = get_metadata_only(submitted_file) if meta.get('al', {}).get('type', 'unknown') == 'archive/bundle/al': try: submission = import_bundle(submitted_file, allow_incomplete=True, identify=IDENTIFY) except Exception as e: return make_api_response("", err=str(e), status_code=400) return make_api_response({ "started": True, "sid": submission['sid'] }) if not ui_params['description']: ui_params[ 'description'] = f"Inspection of file: {os.path.basename(submitted_file)}" # Submit to dispatcher try: params = ui_to_submission_params(ui_params) # Enforce maximum DTL if config.submission.max_dtl > 0: params['ttl'] = min(int( params['ttl']), config.submission.max_dtl) if int( params['ttl']) else config.submission.max_dtl submission_obj = Submission({"files": [], "params": params}) except (ValueError, KeyError) as e: return make_api_response("", err=str(e), status_code=400) try: submit_result = SubmissionClient( datastore=STORAGE, filestore=FILESTORE, config=config, identify=IDENTIFY).submit(submission_obj, local_files=[submitted_file]) submission_received(submission_obj) except SubmissionException as e: return make_api_response("", err=str(e), status_code=400) return make_api_response({ "started": True, "sid": submit_result.sid }) else: return make_api_response({ "started": False, "sid": None }, "No files where found for ID %s. " "Try again..." % ui_sid, 404) finally: if submit_result is None: decrement_submission_quota(user) # Remove file if os.path.exists(submitted_file): os.unlink(submitted_file) # Remove dir if os.path.exists(target_dir) and os.path.isdir(target_dir): os.rmdir(target_dir)
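# --- Minimal sketch of the bundle-vs-file decision above --------------------------------
# start_ui_submission() short-circuits to import_bundle() when the uploaded file is a
# CaRT-wrapped Assemblyline bundle. This helper mirrors that check; the 'cart' module
# path is assumed here, while is_cart/get_metadata_only are the same helpers used above.
from cart import get_metadata_only, is_cart


def looks_like_al_bundle(path: str) -> bool:
    with open(path, 'rb') as fh:
        if not is_cart(fh.read(256)):
            return False
    meta = get_metadata_only(path)
    return meta.get('al', {}).get('type', 'unknown') == 'archive/bundle/al'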
def resubmit_for_dynamic(sha256, *args, **kwargs): """ Resubmit a file for dynamic analysis Variables: sha256 => Resource locator (SHA256) Arguments (Optional): copy_sid => Mimic the attributes of this SID. name => Name of the file for the submission Data Block: None Result example: # Submission message object as a json dictionary """ user = kwargs['user'] quota_error = check_submission_quota(user) if quota_error: return make_api_response("", quota_error, 503) file_info = STORAGE.file.get(sha256, as_obj=False) if not file_info: return make_api_response( {}, f"File {sha256} cannot be found on the server therefore it cannot be resubmitted.", status_code=404) if not Classification.is_accessible(user['classification'], file_info['classification']): return make_api_response( "", "You are not allowed to re-submit a file that you don't have access to", 403) submit_result = None try: copy_sid = request.args.get('copy_sid', None) name = safe_str(request.args.get('name', sha256)) if copy_sid: submission = STORAGE.submission.get(copy_sid, as_obj=False) else: submission = None if submission: if not Classification.is_accessible(user['classification'], submission['classification']): return make_api_response( "", "You are not allowed to re-submit a submission that you don't have access to", 403) submission_params = submission['params'] submission_params['classification'] = submission['classification'] expiry = submission['expiry_ts'] else: submission_params = ui_to_submission_params( load_user_settings(user)) submission_params['classification'] = file_info['classification'] expiry = file_info['expiry_ts'] # Calculate original submit time if submission_params['ttl'] and expiry: submit_time = epoch_to_iso( iso_to_epoch(expiry) - submission_params['ttl'] * 24 * 60 * 60) else: submit_time = None if not FILESTORE.exists(sha256): return make_api_response( {}, "File %s cannot be found on the server therefore it cannot be resubmitted." % sha256, status_code=404) files = [{'name': name, 'sha256': sha256, 'size': file_info['size']}] submission_params['submitter'] = user['uname'] submission_params['quota_item'] = True if 'priority' not in submission_params: submission_params['priority'] = 500 submission_params[ 'description'] = "Resubmit %s for Dynamic Analysis" % name if "Dynamic Analysis" not in submission_params['services']['selected']: submission_params['services']['selected'].append( "Dynamic Analysis") try: submission_obj = Submission({ "files": files, "params": submission_params, "time": submit_time }) except (ValueError, KeyError) as e: return make_api_response("", err=str(e), status_code=400) submit_result = SubmissionClient( datastore=STORAGE, filestore=FILESTORE, config=config, identify=IDENTIFY).submit(submission_obj) submission_received(submission_obj) return make_api_response(submit_result.as_primitives()) except SubmissionException as e: return make_api_response("", err=str(e), status_code=400) finally: if submit_result is None: decrement_submission_quota(user)
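# --- Worked example of the submit-time reconstruction above -----------------------------
# Both resubmit endpoints rebuild the original submission time by walking the stored
# expiry back by ttl days (expiry - ttl * 24 * 60 * 60 seconds). This is a standard-library
# equivalent of the epoch_to_iso/iso_to_epoch helpers, for illustration only.
from datetime import datetime, timedelta, timezone


def original_submit_time(expiry_iso: str, ttl_days: int) -> str:
    expiry = datetime.fromisoformat(expiry_iso.replace("Z", "+00:00"))
    return (expiry - timedelta(days=ttl_days)).astimezone(timezone.utc).isoformat()


# A submission with a 15-day TTL that expires 2024-01-16 was submitted on 2024-01-01.
print(original_submit_time("2024-01-16T00:00:00Z", 15))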
def resubmit_submission_for_analysis(sid, *args, **kwargs): """ Resubmit a submission for analysis with the exact same parameters as before Variables: sid => Submission ID to re-submit Arguments: None Data Block: None Result example: # Submission message object as a json dictionary """ user = kwargs['user'] quota_error = check_submission_quota(user) if quota_error: return make_api_response("", quota_error, 503) submit_result = None try: submission = STORAGE.submission.get(sid, as_obj=False) if submission: if not Classification.is_accessible(user['classification'], submission['classification']): return make_api_response( "", "You are not allowed to re-submit a submission that you don't have access to", 403) submission_params = submission['params'] submission_params['classification'] = submission['classification'] else: return make_api_response({}, "Submission %s does not exists." % sid, status_code=404) submission_params['submitter'] = user['uname'] submission_params['quota_item'] = True submission_params[ 'description'] = "Resubmit %s for analysis" % ", ".join( [x['name'] for x in submission["files"]]) # Calculate original submit time if submission_params['ttl'] and submission['expiry_ts']: submit_time = epoch_to_iso( iso_to_epoch(submission['expiry_ts']) - submission_params['ttl'] * 24 * 60 * 60) else: submit_time = None try: submission_obj = Submission({ "files": submission["files"], "metadata": submission['metadata'], "params": submission_params, "time": submit_time }) except (ValueError, KeyError) as e: return make_api_response("", err=str(e), status_code=400) submit_result = SubmissionClient( datastore=STORAGE, filestore=FILESTORE, config=config, identify=IDENTIFY).submit(submission_obj) submission_received(submission_obj) return make_api_response(submit_result.as_primitives()) except SubmissionException as e: return make_api_response("", err=str(e), status_code=400) finally: if submit_result is None: decrement_submission_quota(user)
class Ingester(ThreadedCoreBase): def __init__(self, datastore=None, logger=None, classification=None, redis=None, persistent_redis=None, metrics_name='ingester', config=None): super().__init__('assemblyline.ingester', logger, redis=redis, redis_persist=persistent_redis, datastore=datastore, config=config) # Cache the user groups self.cache_lock = threading.RLock() self._user_groups = {} self._user_groups_reset = time.time() // HOUR_IN_SECONDS self.cache = {} self.notification_queues = {} self.whitelisted = {} self.whitelisted_lock = threading.RLock() # Module path parameters are fixed at start time. Changing these involves a restart self.is_low_priority = load_module_by_path( self.config.core.ingester.is_low_priority) self.get_whitelist_verdict = load_module_by_path( self.config.core.ingester.get_whitelist_verdict) self.whitelist = load_module_by_path( self.config.core.ingester.whitelist) # Constants are loaded based on a non-constant path, so has to be done at init rather than load constants = forge.get_constants(self.config) self.priority_value: dict[str, int] = constants.PRIORITIES self.priority_range: dict[str, Tuple[int, int]] = constants.PRIORITY_RANGES self.threshold_value: dict[str, int] = constants.PRIORITY_THRESHOLDS # Classification engine self.ce = classification or forge.get_classification() # Metrics gathering factory self.counter = MetricsFactory(metrics_type='ingester', schema=Metrics, redis=self.redis, config=self.config, name=metrics_name) # State. The submissions in progress are stored in Redis in order to # persist this state and recover in case we crash. self.scanning = Hash('m-scanning-table', self.redis_persist) # Input. The dispatcher creates a record when any submission completes. self.complete_queue = NamedQueue(COMPLETE_QUEUE_NAME, self.redis) # Input. An external process places submission requests on this queue. self.ingest_queue = NamedQueue(INGEST_QUEUE_NAME, self.redis_persist) # Output. Duplicate our input traffic into this queue so it may be cloned by other systems self.traffic_queue = CommsQueue('submissions', self.redis) # Internal. Unique requests are placed in and processed from this queue. self.unique_queue = PriorityQueue('m-unique', self.redis_persist) # Internal, delay queue for retrying self.retry_queue = PriorityQueue('m-retry', self.redis_persist) # Internal, timeout watch queue self.timeout_queue: PriorityQueue[str] = PriorityQueue( 'm-timeout', self.redis) # Internal, queue for processing duplicates # When a duplicate file is detected (same cache key => same file, and same # submission parameters) the file won't be ingested normally, but instead a reference # will be written to a duplicate queue. Whenever a file is finished, in the complete # method, not only is the original ingestion finalized, but all entries in the duplicate queue # are finalized as well. This has the effect that all concurrent ingestion of the same file # are 'merged' into a single submission to the system. self.duplicate_queue = MultiQueue(self.redis_persist) # Output. 
submissions that should have alerts generated self.alert_queue = NamedQueue(ALERT_QUEUE_NAME, self.redis_persist) # Utility object to help submit tasks to dispatching self.submit_client = SubmissionClient(datastore=self.datastore, redis=self.redis) if self.config.core.metrics.apm_server.server_url is not None: self.log.info( f"Exporting application metrics to: {self.config.core.metrics.apm_server.server_url}" ) elasticapm.instrument() self.apm_client = elasticapm.Client( server_url=self.config.core.metrics.apm_server.server_url, service_name="ingester") else: self.apm_client = None def try_run(self): threads_to_maintain = { 'Retries': self.handle_retries, 'Timeouts': self.handle_timeouts } threads_to_maintain.update({ f'Complete_{n}': self.handle_complete for n in range(COMPLETE_THREADS) }) threads_to_maintain.update( {f'Ingest_{n}': self.handle_ingest for n in range(INGEST_THREADS)}) threads_to_maintain.update( {f'Submit_{n}': self.handle_submit for n in range(SUBMIT_THREADS)}) self.maintain_threads(threads_to_maintain) def handle_ingest(self): cpu_mark = time.process_time() time_mark = time.time() # Move from ingest to unique and waiting queues. # While there are entries in the ingest queue we consume chunk_size # entries at a time and move unique entries to uniqueq / queued and # duplicates to their own queues / waiting. while self.running: self.counter.increment_execution_time( 'cpu_seconds', time.process_time() - cpu_mark) self.counter.increment_execution_time('busy_seconds', time.time() - time_mark) message = self.ingest_queue.pop(timeout=1) cpu_mark = time.process_time() time_mark = time.time() if not message: continue # Start of ingest message if self.apm_client: self.apm_client.begin_transaction('ingest_msg') try: if 'submission' in message: # A retried task task = IngestTask(message) else: # A new submission sub = MessageSubmission(message) task = IngestTask(dict( submission=sub, ingest_id=sub.sid, )) task.submission.sid = None # Reset to new random uuid # Write all input to the traffic queue self.traffic_queue.publish( SubmissionMessage({ 'msg': sub, 'msg_type': 'SubmissionIngested', 'sender': 'ingester', }).as_primitives()) except (ValueError, TypeError) as error: self.counter.increment('error') self.log.exception( f"Dropped ingest submission {message} because {str(error)}" ) # End of ingest message (value_error) if self.apm_client: self.apm_client.end_transaction('ingest_input', 'value_error') continue self.ingest(task) # End of ingest message (success) if self.apm_client: self.apm_client.end_transaction('ingest_input', 'success') def handle_submit(self): time_mark, cpu_mark = time.time(), time.process_time() while self.running: # noinspection PyBroadException try: self.counter.increment_execution_time( 'cpu_seconds', time.process_time() - cpu_mark) self.counter.increment_execution_time('busy_seconds', time.time() - time_mark) # Check if there is room for more submissions length = self.scanning.length() if length >= self.config.core.ingester.max_inflight: self.sleep(0.1) time_mark, cpu_mark = time.time(), time.process_time() continue raw = self.unique_queue.blocking_pop(timeout=3) time_mark, cpu_mark = time.time(), time.process_time() if not raw: continue # Start of ingest message if self.apm_client: self.apm_client.begin_transaction('ingest_msg') task = IngestTask(raw) # Check if we need to drop a file for capacity reasons, but only if the # number of files in flight is alreay over 80% if length >= self.config.core.ingester.max_inflight * 0.8 and self.drop( task): # End of 
ingest message (dropped) if self.apm_client: self.apm_client.end_transaction( 'ingest_submit', 'dropped') continue if self.is_whitelisted(task): # End of ingest message (whitelisted) if self.apm_client: self.apm_client.end_transaction( 'ingest_submit', 'whitelisted') continue # Check if this file has been previously processed. pprevious, previous, score, scan_key = None, None, None, None if not task.submission.params.ignore_cache: pprevious, previous, score, scan_key = self.check(task) else: scan_key = self.stamp_filescore_key(task) # If it HAS been previously processed, we are dealing with a resubmission # finalize will decide what to do, and put the task back in the queue # rewritten properly if we are going to run it again if previous: if not task.submission.params.services.resubmit and not pprevious: self.log.warning( f"No psid for what looks like a resubmission of " f"{task.submission.files[0].sha256}: {scan_key}") self.finalize(pprevious, previous, score, task) # End of ingest message (finalized) if self.apm_client: self.apm_client.end_transaction( 'ingest_submit', 'finalized') continue # We have decided this file is worth processing # Add the task to the scanning table, this is atomic across all submit # workers, so if it fails, someone beat us to the punch, record the file # as a duplicate then. if not self.scanning.add(scan_key, task.as_primitives()): self.log.debug('Duplicate %s', task.submission.files[0].sha256) self.counter.increment('duplicates') self.duplicate_queue.push(_dup_prefix + scan_key, task.as_primitives()) # End of ingest message (duplicate) if self.apm_client: self.apm_client.end_transaction( 'ingest_submit', 'duplicate') continue # We have managed to add the task to the scan table, so now we go # ahead with the submission process try: self.submit(task) # End of ingest message (submitted) if self.apm_client: self.apm_client.end_transaction( 'ingest_submit', 'submitted') continue except Exception as _ex: # For some reason (contained in `ex`) we have failed the submission # The rest of this function is error handling/recovery ex = _ex # traceback = _ex.__traceback__ self.counter.increment('error') should_retry = True if isinstance(ex, CorruptedFileStoreException): self.log.error( "Submission for file '%s' failed due to corrupted " "filestore: %s" % (task.sha256, str(ex))) should_retry = False elif isinstance(ex, DataStoreException): trace = exceptions.get_stacktrace_info(ex) self.log.error("Submission for file '%s' failed due to " "data store error:\n%s" % (task.sha256, trace)) elif not isinstance(ex, FileStoreException): trace = exceptions.get_stacktrace_info(ex) self.log.error("Submission for file '%s' failed: %s" % (task.sha256, trace)) task = IngestTask(self.scanning.pop(scan_key)) if not task: self.log.error('No scanning entry for for %s', task.sha256) # End of ingest message (no_scan_entry) if self.apm_client: self.apm_client.end_transaction( 'ingest_submit', 'no_scan_entry') continue if not should_retry: # End of ingest message (cannot_retry) if self.apm_client: self.apm_client.end_transaction( 'ingest_submit', 'cannot_retry') continue self.retry(task, scan_key, ex) # End of ingest message (retry) if self.apm_client: self.apm_client.end_transaction('ingest_submit', 'retried') except Exception: self.log.exception("Unexpected error") # End of ingest message (exception) if self.apm_client: self.apm_client.end_transaction('ingest_submit', 'exception') def handle_complete(self): while self.running: result = self.complete_queue.pop(timeout=3) if not result: continue 
cpu_mark = time.process_time() time_mark = time.time() # Start of ingest message if self.apm_client: self.apm_client.begin_transaction('ingest_msg') sub = DatabaseSubmission(result) self.completed(sub) # End of ingest message (success) if self.apm_client: elasticapm.label(sid=sub.sid) self.apm_client.end_transaction('ingest_complete', 'success') self.counter.increment_execution_time( 'cpu_seconds', time.process_time() - cpu_mark) self.counter.increment_execution_time('busy_seconds', time.time() - time_mark) def handle_retries(self): tasks = [] while self.sleep(0 if tasks else 3): cpu_mark = time.process_time() time_mark = time.time() # Start of ingest message if self.apm_client: self.apm_client.begin_transaction('ingest_retries') tasks = self.retry_queue.dequeue_range(upper_limit=isotime.now(), num=100) for task in tasks: self.ingest_queue.push(task) # End of ingest message (success) if self.apm_client: elasticapm.label(retries=len(tasks)) self.apm_client.end_transaction('ingest_retries', 'success') self.counter.increment_execution_time( 'cpu_seconds', time.process_time() - cpu_mark) self.counter.increment_execution_time('busy_seconds', time.time() - time_mark) def handle_timeouts(self): timeouts = [] while self.sleep(0 if timeouts else 3): cpu_mark = time.process_time() time_mark = time.time() # Start of ingest message if self.apm_client: self.apm_client.begin_transaction('ingest_timeouts') timeouts = self.timeout_queue.dequeue_range( upper_limit=isotime.now(), num=100) for scan_key in timeouts: # noinspection PyBroadException try: actual_timeout = False # Remove the entry from the hash of submissions in progress. entry = self.scanning.pop(scan_key) if entry: actual_timeout = True self.log.error("Submission timed out for %s: %s", scan_key, str(entry)) dup = self.duplicate_queue.pop(_dup_prefix + scan_key, blocking=False) if dup: actual_timeout = True while dup: self.log.error("Submission timed out for %s: %s", scan_key, str(dup)) dup = self.duplicate_queue.pop(_dup_prefix + scan_key, blocking=False) if actual_timeout: self.counter.increment('timed_out') except Exception: self.log.exception("Problem timing out %s:", scan_key) # End of ingest message (success) if self.apm_client: elasticapm.label(timeouts=len(timeouts)) self.apm_client.end_transaction('ingest_timeouts', 'success') self.counter.increment_execution_time( 'cpu_seconds', time.process_time() - cpu_mark) self.counter.increment_execution_time('busy_seconds', time.time() - time_mark) def get_groups_from_user(self, username: str) -> List[str]: # Reset the group cache at the top of each hour if time.time() // HOUR_IN_SECONDS > self._user_groups_reset: self._user_groups = {} self._user_groups_reset = time.time() // HOUR_IN_SECONDS # Get the groups for this user if not known if username not in self._user_groups: user_data = self.datastore.user.get(username) if user_data: self._user_groups[username] = user_data.groups else: self._user_groups[username] = [] return self._user_groups[username] def ingest(self, task: IngestTask): self.log.info( f"[{task.ingest_id} :: {task.sha256}] Task received for processing" ) # Load a snapshot of ingest parameters as of right now. 
max_file_size = self.config.submission.max_file_size param = task.params self.counter.increment('bytes_ingested', increment_by=task.file_size) self.counter.increment('submissions_ingested') if any(len(file.sha256) != 64 for file in task.submission.files): self.log.error( f"[{task.ingest_id} :: {task.sha256}] Invalid sha256, skipped") self.send_notification(task, failure="Invalid sha256", logfunc=self.log.warning) return # Clean up metadata strings, since we may delete some, iterate on a copy of the keys for key in list(task.submission.metadata.keys()): value = task.submission.metadata[key] meta_size = len(value) if meta_size > self.config.submission.max_metadata_length: self.log.info( f'[{task.ingest_id} :: {task.sha256}] ' f'Removing {key} from metadata because value is too big') task.submission.metadata.pop(key) if task.file_size > max_file_size and not task.params.ignore_size and not task.params.never_drop: task.failure = f"File too large ({task.file_size} > {max_file_size})" self._notify_drop(task) self.counter.increment('skipped') self.log.error( f"[{task.ingest_id} :: {task.sha256}] {task.failure}") return # Set the groups from the user, if they aren't already set if not task.params.groups: task.params.groups = self.get_groups_from_user( task.params.submitter) # Check if this file is already being processed self.stamp_filescore_key(task) pprevious, previous, score = None, None, None if not param.ignore_cache: pprevious, previous, score, _ = self.check(task, count_miss=False) # Assign priority. low_priority = self.is_low_priority(task) priority = param.priority if priority < 0: priority = self.priority_value['medium'] if score is not None: priority = self.priority_value['low'] for level, threshold in self.threshold_value.items(): if score >= threshold: priority = self.priority_value[level] break elif low_priority: priority = self.priority_value['low'] # Reduce the priority by an order of magnitude for very old files. current_time = now() if priority and self.expired( current_time - task.submission.time.timestamp(), 0): priority = (priority / 10) or 1 param.priority = priority # Do this after priority has been assigned. # (So we don't end up dropping the resubmission). 
        if previous:
            self.counter.increment('duplicates')
            self.finalize(pprevious, previous, score, task)

            # On cache hits of any kind we want to send out a completed message
            self.traffic_queue.publish(SubmissionMessage({
                'msg': task.submission,
                'msg_type': 'SubmissionCompleted',
                'sender': 'ingester',
            }).as_primitives())
            return

        if self.drop(task):
            self.log.info(f"[{task.ingest_id} :: {task.sha256}] Dropped")
            return

        if self.is_whitelisted(task):
            self.log.info(f"[{task.ingest_id} :: {task.sha256}] Whitelisted")
            return

        self.unique_queue.push(priority, task.as_primitives())

    def check(self, task: IngestTask, count_miss=True) -> Tuple[Optional[str], Optional[str], Optional[float], str]:
        key = self.stamp_filescore_key(task)

        with self.cache_lock:
            result = self.cache.get(key, None)

        if result:
            self.counter.increment('cache_hit_local')
            self.log.info(f'[{task.ingest_id} :: {task.sha256}] Local cache hit')
        else:
            result = self.datastore.filescore.get_if_exists(key)
            if result:
                self.counter.increment('cache_hit')
                self.log.info(f'[{task.ingest_id} :: {task.sha256}] Remote cache hit')
            else:
                if count_miss:
                    self.counter.increment('cache_miss')
                return None, None, None, key

            with self.cache_lock:
                self.cache[key] = result

        current_time = now()
        age = current_time - result.time
        errors = result.errors

        if self.expired(age, errors):
            self.log.info(f"[{task.ingest_id} :: {task.sha256}] Cache hit dropped, cache has expired")
            self.counter.increment('cache_expired')
            self.cache.pop(key, None)
            self.datastore.filescore.delete(key)
            return None, None, None, key
        elif self.stale(age, errors):
            self.log.info(f"[{task.ingest_id} :: {task.sha256}] Cache hit dropped, cache is stale")
            self.counter.increment('cache_stale')
            return None, None, result.score, key

        return result.psid, result.sid, result.score, key

    def stop(self):
        super().stop()
        if self.apm_client:
            elasticapm.uninstrument()
        self.submit_client.stop()

    def stale(self, delta: float, errors: int):
        if errors:
            return delta >= self.config.core.ingester.incomplete_stale_after_seconds
        else:
            return delta >= self.config.core.ingester.stale_after_seconds

    @staticmethod
    def stamp_filescore_key(task: IngestTask, sha256: str = None) -> str:
        if not sha256:
            sha256 = task.submission.files[0].sha256

        key = task.submission.scan_key

        if not key:
            key = task.params.create_filescore_key(sha256)
            task.submission.scan_key = key

        return key

    def completed(self, sub: DatabaseSubmission):
        """Invoked when notified that a submission has completed."""
        # There is only one file in the submissions we have made
        sha256 = sub.files[0].sha256
        scan_key = sub.scan_key
        if not scan_key:
            self.log.warning(f"[{sub.metadata.get('ingest_id', 'unknown')} :: {sha256}] "
                             f"Submission missing scan key")
            scan_key = sub.params.create_filescore_key(sha256)
        raw = self.scanning.pop(scan_key)

        psid = sub.params.psid
        score = sub.max_score
        sid = sub.sid

        if not raw:
            # Some other worker has already popped the scanning queue?
            self.log.warning(f"[{sub.metadata.get('ingest_id', 'unknown')} :: {sha256}] "
                             f"Submission completed twice")
            return scan_key

        task = IngestTask(raw)
        task.submission.sid = sid

        errors = sub.error_count
        file_count = sub.file_count
        self.counter.increment('submissions_completed')
        self.counter.increment('files_completed', increment_by=file_count)
        self.counter.increment('bytes_completed', increment_by=task.file_size)

        with self.cache_lock:
            fs = self.cache[scan_key] = FileScore({
                'expiry_ts': now(self.config.core.ingester.cache_dtl * 24 * 60 * 60),
                'errors': errors,
                'psid': psid,
                'score': score,
                'sid': sid,
                'time': now(),
            })
        self.datastore.filescore.save(scan_key, fs)

        self.finalize(psid, sid, score, task)

        def exhaust() -> Iterable[IngestTask]:
            while True:
                res = self.duplicate_queue.pop(_dup_prefix + scan_key, blocking=False)
                if res is None:
                    break
                res = IngestTask(res)
                res.submission.sid = sid
                yield res

        # You may be tempted to remove the assignment to dups and use the
        # value directly in the for loop below. That would be a mistake.
        # The function finalize may push on the duplicate queue which we
        # are pulling off and so condensing those two lines creates a
        # potential infinite loop.
        dups = [dup for dup in exhaust()]
        for dup in dups:
            self.finalize(psid, sid, score, dup)

        return scan_key

    def send_notification(self, task: IngestTask, failure=None, logfunc=None):
        if logfunc is None:
            logfunc = self.log.info

        if failure:
            task.failure = failure

        failure = task.failure
        if failure:
            logfunc("%s: %s", failure, str(task.json()))

        if not task.submission.notification.queue:
            return

        note_queue = _notification_queue_prefix + task.submission.notification.queue
        threshold = task.submission.notification.threshold

        if threshold is not None and task.score is not None and task.score < threshold:
            return

        q = self.notification_queues.get(note_queue, None)
        if not q:
            self.notification_queues[note_queue] = q = NamedQueue(note_queue, self.persistent_redis)
        q.push(task.as_primitives())

    def expired(self, delta: float, errors) -> bool:
        if errors:
            return delta >= self.config.core.ingester.incomplete_expire_after_seconds
        else:
            return delta >= self.config.core.ingester.expire_after

    def drop(self, task: IngestTask) -> bool:
        priority = task.params.priority
        sample_threshold = self.config.core.ingester.sampling_at

        dropped = False
        if priority <= _min_priority:
            dropped = True
        else:
            for level, rng in self.priority_range.items():
                if rng[0] <= priority <= rng[1] and level in sample_threshold:
                    dropped = must_drop(self.unique_queue.count(*rng), sample_threshold[level])
                    break

            if not dropped:
                if task.file_size > self.config.submission.max_file_size or task.file_size == 0:
                    dropped = True

        if task.params.never_drop or not dropped:
            return False

        task.failure = 'Skipped'
        self._notify_drop(task)
        self.counter.increment('skipped')
        return True

    def _notify_drop(self, task: IngestTask):
        self.send_notification(task)

        c12n = task.params.classification
        expiry = now_as_iso(86400)
        sha256 = task.submission.files[0].sha256

        self.datastore.save_or_freshen_file(sha256, {'sha256': sha256}, expiry, c12n, redis=self.redis)

    def is_whitelisted(self, task: IngestTask):
        reason, hit = self.get_whitelist_verdict(self.whitelist, task)
        hit = {x: dotdump(safe_str(y)) for x, y in hit.items()}
        sha256 = task.submission.files[0].sha256

        if not reason:
            with self.whitelisted_lock:
                reason = self.whitelisted.get(sha256, None)
                if reason:
                    hit = 'cached'

        if reason:
            if hit != 'cached':
                with self.whitelisted_lock:
                    self.whitelisted[sha256] = reason

            task.failure = "Whitelisting due to reason %s (%s)" % (dotdump(safe_str(reason)), hit)
            self._notify_drop(task)

            self.counter.increment('whitelisted')

        return reason

    def submit(self, task: IngestTask):
        self.submit_client.submit(
            submission_obj=task.submission,
            completed_queue=COMPLETE_QUEUE_NAME,
        )

        self.timeout_queue.push(int(now(_max_time)), task.submission.scan_key)
        self.log.info(f"[{task.ingest_id} :: {task.sha256}] Submitted to dispatcher for analysis")

    def retry(self, task: IngestTask, scan_key: str, ex):
        current_time = now()

        retries = task.retries + 1

        if retries > _max_retries:
            trace = ''
            if ex:
                trace = ': ' + get_stacktrace_info(ex)
            self.log.error(f'[{task.ingest_id} :: {task.sha256}] Max retries exceeded {trace}')
            self.duplicate_queue.delete(_dup_prefix + scan_key)
        elif self.expired(current_time - task.ingest_time.timestamp(), 0):
            self.log.info(f'[{task.ingest_id} :: {task.sha256}] No point retrying expired submission')
            self.duplicate_queue.delete(_dup_prefix + scan_key)
        else:
            self.log.info(f'[{task.ingest_id} :: {task.sha256}] Requeuing ({ex or "unknown"})')
            task.retries = retries
            self.retry_queue.push(int(now(_retry_delay)), task.as_primitives())

    def finalize(self, psid: str, sid: str, score: float, task: IngestTask):
        self.log.info(f"[{task.ingest_id} :: {task.sha256}] Completed")
        if psid:
            task.params.psid = psid
        task.score = score
        task.submission.sid = sid

        selected = task.params.services.selected
        resubmit_to = task.params.services.resubmit

        resubmit_selected = determine_resubmit_selected(selected, resubmit_to)
        will_resubmit = resubmit_selected and should_resubmit(score)
        if will_resubmit:
            task.extended_scan = 'submitted'
            task.params.psid = None

        if self.is_alert(task, score):
            self.log.info(f"[{task.ingest_id} :: {task.sha256}] Notifying alerter "
                          f"to {'update' if task.params.psid else 'create'} an alert")
            self.alert_queue.push(task.as_primitives())

        self.send_notification(task)

        if will_resubmit:
            self.log.info(f"[{task.ingest_id} :: {task.sha256}] Resubmitted for extended analysis")
            task.params.psid = sid
            task.submission.sid = None
            task.submission.scan_key = None
            task.params.services.resubmit = []
            task.params.services.selected = resubmit_selected

            self.unique_queue.push(task.params.priority, task.as_primitives())

    def is_alert(self, task: IngestTask, score: float) -> bool:
        if not task.params.generate_alert:
            return False

        if score < self.config.core.alerter.threshold:
            return False

        return True
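
# --- Illustrative usage sketch (assumption, not part of the original source) ---
# Shows how the (psid, sid, score, scan_key) tuple returned by Ingester.check()
# is typically interpreted before a task is queued: a result with a sid is a
# usable cache hit, a score-only result is a stale hit, and an all-None result
# is a miss (or an expired entry). The `ingester` and `task` arguments are
# assumed to be an Ingester instance and an IngestTask; the helper name is
# hypothetical.
def _example_interpret_cache_check(ingester, task):
    pprevious, previous, score, scan_key = ingester.check(task)
    if previous:
        # Full cache hit: a previous submission exists for this scan key,
        # so the task can be finalized without rescanning the file.
        ingester.finalize(pprevious, previous, score, task)
        return 'hit'
    if score is not None:
        # Stale hit: the cached score is still reported, but the entry is too
        # old to trust, so the file should go back through scanning.
        return 'stale'
    # Cache miss or expired entry: nothing usable, the task must be scanned.
    return 'miss'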
def import_bundle(path, working_dir=WORK_DIR, min_classification=Classification.UNRESTRICTED,
                  allow_incomplete=False, rescan_services=None, exist_ok=False, cleanup=True, identify=None):
    with forge.get_datastore(archive_access=True) as datastore:
        current_working_dir = os.path.join(working_dir, get_random_id())
        res_file = os.path.join(current_working_dir, "results.json")
        try:
            os.makedirs(current_working_dir)
        except Exception:
            pass

        with open(path, 'rb') as original_file:
            if is_cart(original_file.read(256)):
                original_file.seek(0)

                extracted_fd, extracted_path = tempfile.mkstemp()
                extracted_file = os.fdopen(extracted_fd, 'wb')

                try:
                    hdr, _ = unpack_stream(original_file, extracted_file)
                    if hdr.get('al', {}).get('type', 'unknown') != BUNDLE_TYPE:
                        raise BundlingException(f"Not a valid CaRTed bundle, should be of type: {BUNDLE_TYPE}")
                finally:
                    extracted_file.close()
            else:
                extracted_path = path

        # Extract the bundle
        try:
            subprocess.check_call(["tar", "-zxf", extracted_path, "-C", current_working_dir])
        except subprocess.CalledProcessError:
            raise BundlingException("Bundle decompression failed. Not a valid bundle...")

        with open(res_file, 'rb') as fh:
            data = json.load(fh)

        alert = data.get('alert', None)
        submission = data.get('submission', None)

        try:
            if submission:
                sid = submission['sid']

                # Load results, files and errors
                results = data.get('results', None)
                files = data.get('files', None)
                errors = data.get('errors', None)

                # Check if we have all the service results
                for res_key in submission['results']:
                    if results is None or (res_key not in results['results'].keys() and not allow_incomplete):
                        raise IncompleteBundle("Incomplete results in bundle. Skipping %s..." % sid)

                # Check if we have all the files
                for sha256 in list(set([x[:64] for x in submission['results']])):
                    if files is None or (sha256 not in files['infos'].keys() and not allow_incomplete):
                        raise IncompleteBundle("Incomplete files in bundle. Skipping %s..." % sid)

                # Check if we have all the errors
                for err_key in submission['errors']:
                    if errors is None or (err_key not in errors['errors'].keys() and not allow_incomplete):
                        raise IncompleteBundle("Incomplete errors in bundle. Skipping %s..." % sid)

                # Check if the submission does not already exist
                if not datastore.submission.exists(sid):
                    # Make sure bundle's submission meets minimum classification and save the submission
                    submission['classification'] = Classification.max_classification(
                        submission['classification'], min_classification)
                    submission.setdefault('metadata', {})
                    submission['metadata']['bundle.loaded'] = now_as_iso()
                    submission['metadata'].pop('replay', None)
                    submission.update(Classification.get_access_control_parts(submission['classification']))

                    if not rescan_services:
                        # Save the submission in the system
                        datastore.submission.save(sid, submission)

                    # Make sure files meet minimum classification and save the files
                    with forge.get_filestore() as filestore:
                        for f, f_data in files['infos'].items():
                            f_classification = Classification.max_classification(
                                f_data['classification'], min_classification)
                            datastore.save_or_freshen_file(f, f_data, f_data['expiry_ts'], f_classification,
                                                           cl_engine=Classification)
                            try:
                                filestore.upload(os.path.join(current_working_dir, f), f)
                            except IOError:
                                pass

                        # Make sure results meet minimum classification and save the results
                        for key, res in results['results'].items():
                            if key.endswith(".e"):
                                datastore.emptyresult.save(key, {"expiry_ts": res['expiry_ts']})
                            else:
                                res['classification'] = Classification.max_classification(
                                    res['classification'], min_classification)
                                datastore.result.save(key, res)

                        # Make sure errors meet minimum classification and save the errors
                        for ekey, err in errors['errors'].items():
                            datastore.error.save(ekey, err)

                        # Start the rescan
                        if rescan_services and SubmissionClient:
                            extracted_file_infos = {
                                k: {vk: v[vk] for vk in ['magic', 'md5', 'mime', 'sha1', 'sha256', 'size', 'type']}
                                for k, v in files['infos'].items() if k in files['list']}
                            with SubmissionClient(datastore=datastore, filestore=filestore,
                                                  config=config, identify=identify) as sc:
                                sc.rescan(submission, results['results'], extracted_file_infos,
                                          files['tree'], list(errors['errors'].keys()), rescan_services)
                elif not exist_ok:
                    raise SubmissionAlreadyExist("Submission %s already exists." % sid)

            # Save alert if present and does not exist
            if alert and not datastore.alert.exists(alert['alert_id']):
                alert['classification'] = Classification.max_classification(
                    alert['classification'], min_classification)
                alert.setdefault('metadata', {})
                alert['metadata']['bundle.loaded'] = now_as_iso()
                alert['metadata'].pop('replay', None)
                alert['workflows_completed'] = False

                datastore.alert.save(alert['alert_id'], alert)

            return submission
        finally:
            if extracted_path != path and os.path.exists(extracted_path):
                os.remove(extracted_path)

            if cleanup and os.path.exists(path):
                os.remove(path)

            if os.path.exists(current_working_dir):
                shutil.rmtree(current_working_dir, ignore_errors=True)
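
# --- Illustrative usage sketch (assumption, not part of the original source) ---
# A minimal example of loading a previously exported bundle back into the
# system. The bundle path and wrapper name are hypothetical; the keyword
# values simply exercise the options import_bundle() exposes above.
def _example_import_bundle(bundle_path="/tmp/bundle_example.cart"):
    return import_bundle(
        bundle_path,
        min_classification=Classification.UNRESTRICTED,  # keep the bundle's own classification
        allow_incomplete=False,  # raise IncompleteBundle if results/files/errors are missing
        exist_ok=True,           # do not raise SubmissionAlreadyExist on re-import
        cleanup=False,           # keep the original bundle file on disk afterwards
    )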