def test_dispatch_extracted(clean_redis, clean_datastore):
    redis = clean_redis
    ds = clean_datastore

    # def service_queue(name): return get_service_queue(name, redis)

    # Setup the fake datastore
    file_hash = get_random_hash(64)
    second_file_hash = get_random_hash(64)

    for fh in [file_hash, second_file_hash]:
        obj = random_model_obj(models.file.File)
        obj.sha256 = fh
        ds.file.save(fh, obj)

    # Inject the fake submission
    submission = random_model_obj(models.submission.Submission)
    submission.files = [dict(name='./file', sha256=file_hash)]
    sid = submission.sid = 'first-submission'

    disp = Dispatcher(ds, redis, redis)
    disp.running = ToggleTrue()
    client = DispatchClient(ds, redis, redis)
    client.dispatcher_data_age = time.time()
    client.dispatcher_data.append(disp.instance_id)

    # Launch the submission
    client.dispatch_submission(submission)
    disp.pull_submissions()
    disp.service_worker(disp.process_queue_index(sid))

    # Finish one service extracting a file
    job = client.request_work('0', 'extract', '0')
    assert job.fileinfo.sha256 == file_hash
    assert job.filename == './file'
    new_result: Result = random_minimal_obj(Result)
    new_result.sha256 = file_hash
    new_result.response.service_name = 'extract'
    new_result.response.extracted = [dict(sha256=second_file_hash, name='second-*',
                                          description='abc', classification='U')]
    client.service_finished(sid, 'extracted-done', new_result)

    # process the result
    disp.pull_service_results()
    disp.service_worker(disp.process_queue_index(sid))
    disp.service_worker(disp.process_queue_index(sid))

    # The extracted file should now be dispatched to the extract service
    job = client.request_work('0', 'extract', '0')
    assert job.fileinfo.sha256 == second_file_hash
    assert job.filename == 'second-*'
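# The test above drives the dispatcher loops manually through a `disp.running`
# stand-in. A minimal sketch of such a helper is below; it is an assumption
# about what the real test suite's ToggleTrue does, namely alternate truthiness
# so that a `while self.running:` loop body executes once per worker call.
class ToggleTrue:
    """Truthy on every other check, so polling loops run one iteration per call."""

    def __init__(self):
        self._state = False

    def __bool__(self):
        self._state = not self._state
        return self._state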
class SubmissionClient:
    """A helper class to simplify submitting files from internal or external sources.

    This tool helps take care of interactions between the filestore,
    datastore, dispatcher, and any sources of files to be processed.
    """

    def __init__(self, datastore: AssemblylineDatastore = None, filestore: FileStore = None,
                 config=None, redis=None, identify=None):
        self.log = logging.getLogger('assemblyline.submission_client')
        self.config = config or forge.CachedObject(forge.get_config)
        self.datastore = datastore or forge.get_datastore(self.config)
        self.filestore = filestore or forge.get_filestore(self.config)
        self.redis = redis

        # Only stop the identify engine on exit if we created it ourselves
        if identify:
            self.cleanup = False
        else:
            self.cleanup = True
        self.identify = identify or forge.get_identify(config=self.config, datastore=self.datastore, use_cache=True)

        # A client for interacting with the dispatcher
        self.dispatcher = DispatchClient(datastore, redis)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.stop()

    def stop(self):
        if self.cleanup:
            self.identify.stop()

    @elasticapm.capture_span(span_type='submission_client')
    def rescan(self, submission: Submission, results: Dict[str, Result], file_infos: Dict[str, FileInfo],
               file_tree, errors: List[str], rescan_services: List[str]):
        """Rescan a submission started on another system."""
        # Reset submission processing data
        submission['times'].pop('completed')
        submission['state'] = 'submitted'

        # Set the list of services to rescan
        submission['params']['services']['rescan'] = rescan_services

        # Create the submission object
        submission_obj = Submission(submission)

        if len(submission_obj.files) == 0:
            raise SubmissionException("No files found to submit.")

        for f in submission_obj.files:
            if not self.datastore.file.exists(f.sha256):
                raise SubmissionException(f"File {f.sha256} does not exist, cannot continue submission.")

        # Set the new expiry
        if submission_obj.params.ttl:
            submission_obj.expiry_ts = epoch_to_iso(now() + submission_obj.params.ttl * 24 * 60 * 60)

        # Clear runtime_excluded on initial submit or resubmit
        submission_obj.params.services.runtime_excluded = []

        # Save the submission
        self.datastore.submission.save(submission_obj.sid, submission_obj)

        # Dispatch the submission
        self.log.debug("Submission complete. Dispatching: %s", submission_obj.sid)
        self.dispatcher.dispatch_bundle(submission_obj, results, file_infos, file_tree, errors)

        return submission

    @elasticapm.capture_span(span_type='submission_client')
    def submit(self, submission_obj: SubmissionObject, local_files: List = None, completed_queue=None):
        """Submit several files in a single submission.

        After this method runs, there should be no local copies of the files left.
        """
        if local_files is None:
            local_files = []

        if len(submission_obj.files) == 0 and len(local_files) == 0:
            raise SubmissionException("No files found to submit...")

        if submission_obj.params.ttl:
            expiry = epoch_to_iso(submission_obj.time.timestamp() + submission_obj.params.ttl * 24 * 60 * 60)
        else:
            expiry = None

        max_size = self.config.submission.max_file_size

        for local_file in local_files:
            # Upload/download, extract, and analyze the file
            original_classification = str(submission_obj.params.classification)
            file_hash, size, new_metadata = self._ready_file(local_file, expiry, original_classification)
            new_name = new_metadata.pop('name', safe_str(os.path.basename(local_file)))
            meta_classification = new_metadata.pop('classification', original_classification)
            if meta_classification != original_classification:
                try:
                    submission_obj.params.classification = Classification.max_classification(
                        meta_classification, original_classification)
                except InvalidClassification as ic:
                    raise SubmissionException("The classification found inside the cart file cannot be merged with "
                                              f"the classification the file was submitted as: {str(ic)}")

            submission_obj.metadata.update(**flatten(new_metadata))

            # Check that, after we have resolved exactly what to pass on,
            # it remains a valid target for scanning
            if size > max_size and not submission_obj.params.ignore_size:
                msg = "File too large (%d > %d). Submission failed" % (size, max_size)
                raise SubmissionException(msg)
            elif size == 0:
                msg = "File empty. Submission failed"
                raise SubmissionException(msg)

            submission_obj.files.append(File({
                'name': new_name,
                'size': size,
                'sha256': file_hash,
            }))

        # Clear runtime_excluded on initial submit or resubmit
        submission_obj.params.services.runtime_excluded = []

        # We should now have all the information we need to construct a submission object
        sub = Submission(dict(
            archive_ts=now_as_iso(self.config.datastore.ilm.days_until_archive * 24 * 60 * 60),
            classification=submission_obj.params.classification,
            error_count=0,
            errors=[],
            expiry_ts=expiry,
            file_count=len(submission_obj.files),
            files=submission_obj.files,
            max_score=0,
            metadata=submission_obj.metadata,
            params=submission_obj.params,
            results=[],
            sid=submission_obj.sid,
            state='submitted',
            scan_key=submission_obj.scan_key,
        ))

        if self.config.ui.allow_malicious_hinting and submission_obj.params.malicious:
            sub.verdict = {"malicious": [submission_obj.params.submitter]}

        self.datastore.submission.save(sub.sid, sub)

        self.log.debug("Submission complete. Dispatching: %s", sub.sid)
        self.dispatcher.dispatch_submission(sub, completed_queue=completed_queue)

        return sub

    def _ready_file(self, local_path: str, expiry, classification) -> Tuple[str, int, dict]:
        """Take a file from local storage and prepare it for submission.

        After this method finishes, the file will ONLY exist on the filestore, not locally.
        """
        extracted_path = None
        try:
            # Analyze the file and make sure the file table is up to date
            fileinfo = self.identify.fileinfo(local_path)

            if fileinfo['size'] == 0:
                raise SubmissionException("File empty. Submission failed")

            # Check if there is an integrated decode process for this file,
            # e.g. files that are packaged, where the contained file (not the
            # package that local_path points to) should be passed into the system.
            extracted_path, fileinfo, al_meta = decode_file(local_path, fileinfo, self.identify)
            al_meta['classification'] = al_meta.get('classification', classification)

            if not Classification.is_valid(al_meta['classification']):
                raise SubmissionException(f"{al_meta['classification']} is not a valid classification for this "
                                          "system, submission is cancelled...")

            if extracted_path:
                local_path = extracted_path

            self.datastore.save_or_freshen_file(fileinfo['sha256'], fileinfo, expiry,
                                                al_meta['classification'], redis=self.redis)
            self.filestore.upload(local_path, fileinfo['sha256'])

            return fileinfo['sha256'], fileinfo['size'], al_meta

        finally:
            # If we extracted anything, delete it
            if extracted_path and os.path.exists(extracted_path):
                os.unlink(extracted_path)
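# A minimal usage sketch for the client above. `submission_params` is assumed
# to be a populated SubmissionObject built by the caller (its construction is
# outside this class); only calls shown in the class itself are used here.
def submit_local_file(submission_params, path):
    """Illustrative only: submit one local file and return the stored sid."""
    with SubmissionClient() as client:  # defaults resolved through forge
        sub = client.submit(submission_params, local_files=[path])
        return sub.sid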
def test_simple(clean_redis, clean_datastore):
    ds = clean_datastore
    redis = clean_redis

    def service_queue(name):
        return get_service_queue(name, redis)

    file = random_model_obj(File)
    file_hash = file.sha256
    file.type = 'unknown'
    ds.file.save(file_hash, file)

    sub: Submission = random_model_obj(models.submission.Submission)
    sub.sid = sid = 'first-submission'
    sub.params.ignore_cache = False
    sub.params.max_extracted = 5
    sub.params.classification = get_classification().UNRESTRICTED
    sub.params.initial_data = json.dumps({'cats': 'big'})
    sub.files = [dict(sha256=file_hash, name='file')]

    disp = Dispatcher(ds, redis, redis)
    disp.running = ToggleTrue()
    client = DispatchClient(ds, redis, redis)
    client.dispatcher_data_age = time.time()
    client.dispatcher_data.append(disp.instance_id)

    # Submit a file, and check that it gets added to the dispatch hash
    # and the right service queues
    logger.info('==== first dispatch')
    # task = SubmissionTask(sub.as_primitives(), 'some-completion-queue')
    client.dispatch_submission(sub)
    disp.pull_submissions()
    disp.service_worker(disp.process_queue_index(sid))
    task = disp.tasks.get(sid)

    assert task.queue_keys[(file_hash, 'extract')] is not None
    assert task.queue_keys[(file_hash, 'wrench')] is not None
    assert service_queue('extract').length() == 1
    assert service_queue('wrench').length() == 1

    # Making the same dispatch call again should not add duplicate queue entries
    logger.info('==== second dispatch')
    disp.dispatch_file(task, file_hash)

    assert task.queue_keys[(file_hash, 'extract')] is not None
    assert task.queue_keys[(file_hash, 'wrench')] is not None
    assert service_queue('extract').length() == 1  # the queue doesn't pile up
    assert service_queue('wrench').length() == 1

    logger.info('==== third dispatch')
    job = client.request_work('0', 'extract', '0')
    assert job.temporary_submission_data == [{'name': 'cats', 'value': 'big'}]
    client.service_failed(sid, 'abc123', make_error(file_hash, 'extract'))
    # Deliberately do this in the wrong order to make sure that works
    disp.pull_service_results()
    disp.service_worker(disp.process_queue_index(sid))

    assert task.queue_keys[(file_hash, 'extract')] is not None
    assert task.queue_keys[(file_hash, 'wrench')] is not None
    assert service_queue('extract').length() == 1

    # Mark extract as finished, wrench as failed
    logger.info('==== fourth dispatch')
    client.request_work('0', 'extract', '0')
    client.request_work('0', 'wrench', '0')
    client.service_finished(sid, 'extract-result', make_result(file_hash, 'extract'))
    client.service_failed(sid, 'wrench-error', make_error(file_hash, 'wrench', False))
    for _ in range(2):
        disp.pull_service_results()
        disp.service_worker(disp.process_queue_index(sid))

    assert wait_error(task, file_hash, 'wrench')
    assert wait_result(task, file_hash, 'extract')
    assert service_queue('av-a').length() == 1
    assert service_queue('av-b').length() == 1
    assert service_queue('frankenstrings').length() == 1

    # Have the AVs fail, frankenstrings finishes
    logger.info('==== fifth dispatch')
    client.request_work('0', 'av-a', '0')
    client.request_work('0', 'av-b', '0')
    client.request_work('0', 'frankenstrings', '0')
    client.service_failed(sid, 'av-a-error', make_error(file_hash, 'av-a', False))
    client.service_failed(sid, 'av-b-error', make_error(file_hash, 'av-b', False))
    client.service_finished(sid, 'f-result', make_result(file_hash, 'frankenstrings'))
    for _ in range(3):
        disp.pull_service_results()
        disp.service_worker(disp.process_queue_index(sid))

    assert wait_result(task, file_hash, 'frankenstrings')
    assert wait_error(task, file_hash, 'av-a')
    assert wait_error(task, file_hash, 'av-b')
    assert service_queue('xerox').length() == 1

    # Finish the xerox service and check that the submission completes
    logger.info('==== sixth dispatch')
    client.request_work('0', 'xerox', '0')
    client.service_finished(sid, 'xerox-result-key', make_result(file_hash, 'xerox'))
    disp.pull_service_results()
    disp.service_worker(disp.process_queue_index(sid))
    disp.save_submission()

    assert wait_result(task, file_hash, 'xerox')
    assert disp.tasks.get(sid) is None
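# The test above leans on make_result/make_error factories that are not shown.
# Below is a sketch of what they plausibly look like, mirroring the
# random_minimal_obj pattern used in test_dispatch_extracted; the Error model,
# its response.status values, and the `recoverable` flag semantics are
# assumptions here, not confirmed from the real suite.
def make_result(file_hash, service):
    new_result: Result = random_minimal_obj(Result)
    new_result.sha256 = file_hash
    new_result.response.service_name = service
    return new_result


def make_error(file_hash, service, recoverable=True):
    new_error: Error = random_minimal_obj(Error)
    new_error.sha256 = file_hash
    new_error.response.service_name = service
    # Assumed status values controlling whether the dispatcher retries the task
    new_error.response.status = 'FAIL_RECOVERABLE' if recoverable else 'FAIL_NONRECOVERABLE'
    return new_error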
class SubmissionClient:
    """A helper class to simplify submitting files from internal or external sources.

    This tool helps take care of interactions between the filestore,
    datastore, dispatcher, and any sources of files to be processed.
    """

    def __init__(self, datastore: AssemblylineDatastore = None, filestore: FileStore = None,
                 config=None, redis=None):
        self.log = logging.getLogger('assemblyline.submission_client')
        self.config = config or forge.CachedObject(forge.get_config)
        self.datastore = datastore or forge.get_datastore(self.config)
        self.filestore = filestore or forge.get_filestore(self.config)
        self.redis = redis

        # A client for interacting with the dispatcher
        self.dispatcher = DispatchClient(datastore, redis)

    def submit(self, submission_obj: SubmissionObject, local_files: List = None, cleanup=True, completed_queue=None):
        """Submit several files in a single submission.

        After this method runs, there should be no local copies of the files left.
        """
        if local_files is None:
            local_files = []

        try:
            expiry = now_as_iso(submission_obj.params.ttl * 24 * 60 * 60) if submission_obj.params.ttl else None
            max_size = self.config.submission.max_file_size

            if len(submission_obj.files) == 0:
                if len(local_files) == 0:
                    raise SubmissionException("No files found to submit...")

                for local_file in local_files:
                    # Upload/download, extract, and analyze the file
                    file_hash, size, new_metadata = self._ready_file(local_file, expiry,
                                                                     str(submission_obj.params.classification),
                                                                     cleanup, upload=True)
                    new_name = new_metadata.pop('name', safe_str(os.path.basename(local_file)))
                    submission_obj.params.classification = new_metadata.pop('classification',
                                                                            submission_obj.params.classification)
                    submission_obj.metadata.update(**flatten(new_metadata))

                    # Check that, after we have resolved exactly what to pass on,
                    # it remains a valid target for scanning
                    if size > max_size and not submission_obj.params.ignore_size:
                        msg = "File too large (%d > %d). Submission failed" % (size, max_size)
                        raise SubmissionException(msg)
                    elif size == 0:
                        msg = "File empty. Submission failed"
                        raise SubmissionException(msg)

                    submission_obj.files.append(File({
                        'name': new_name,
                        'size': size,
                        'sha256': file_hash,
                    }))
            else:
                for f in submission_obj.files:
                    temporary_path = None
                    try:
                        fd, temporary_path = tempfile.mkstemp(prefix="submission.submit")
                        os.close(fd)  # We don't need the file descriptor open
                        self.filestore.download(f.sha256, temporary_path)
                        file_hash, size, new_metadata = self._ready_file(temporary_path, expiry,
                                                                         str(submission_obj.params.classification),
                                                                         cleanup, sha256=f.sha256)

                        new_name = new_metadata.pop('name', f.name)
                        submission_obj.params.classification = new_metadata.pop('classification',
                                                                                submission_obj.params.classification)
                        submission_obj.metadata.update(**flatten(new_metadata))

                        # Check that, after we have resolved exactly what to pass on,
                        # it remains a valid target for scanning
                        if size > max_size and not submission_obj.params.ignore_size:
                            msg = "File too large (%d > %d). Submission failed" % (size, max_size)
                            raise SubmissionException(msg)
                        elif size == 0:
                            msg = "File empty. Submission failed"
                            raise SubmissionException(msg)

                        if f.size is None:
                            f.size = size
                        f.name = new_name
                        f.sha256 = file_hash
                    finally:
                        if temporary_path and os.path.exists(temporary_path):
                            os.unlink(temporary_path)

            # Initialize the temporary data from the submission parameter
            if submission_obj.params.initial_data:
                try:
                    temp_hash_name = get_temporary_submission_data_name(submission_obj.sid,
                                                                        submission_obj.files[0].sha256)
                    temporary_submission_data = ExpiringHash(temp_hash_name, host=self.redis)
                    temporary_submission_data.multi_set(json.loads(submission_obj.params.initial_data))
                except ValueError as err:
                    self.log.warning(f"[{submission_obj.sid}] could not process initialization data: {err}")

            # Clear runtime_excluded on initial submit or resubmit
            submission_obj.params.services.runtime_excluded = []

            # We should now have all the information we need to construct a submission object
            sub = Submission(dict(
                archive_ts=now_as_iso(self.config.datastore.ilm.days_until_archive * 24 * 60 * 60),
                classification=submission_obj.params.classification,
                error_count=0,
                errors=[],
                expiry_ts=expiry,
                file_count=len(submission_obj.files),
                files=submission_obj.files,
                max_score=0,
                metadata=submission_obj.metadata,
                params=submission_obj.params,
                results=[],
                sid=submission_obj.sid,
                state='submitted'
            ))
            self.datastore.submission.save(sub.sid, sub)

            self.log.debug("Submission complete. Dispatching: %s", sub.sid)
            self.dispatcher.dispatch_submission(sub, completed_queue=completed_queue)

            return sub

        finally:
            # Just in case this method fails, clean up local files
            if cleanup:
                for path in local_files:
                    if path and os.path.exists(path):
                        # noinspection PyBroadException
                        try:
                            os.unlink(path)
                        except Exception:
                            self.log.error("Couldn't delete dangling file %s", path)

    def _ready_file(self, local_path: str, expiry, classification, cleanup,
                    sha256=None, upload=False) -> Tuple[str, int, dict]:
        """Take a file from local storage and prepare it for submission.

        After this method finishes, the file will ONLY exist on the filestore, not locally.
        """
        extracted_path = None
        try:
            # Analyze the file and make sure the file table is up to date
            fileinfo = identify.fileinfo(local_path)

            if fileinfo['size'] == 0:
                raise SubmissionException("File empty. Submission failed")

            if sha256 is not None and fileinfo['sha256'] != sha256:
                raise CorruptedFileStoreException(f"SHA256 mismatch between received and calculated "
                                                  f"sha256. {sha256} != {fileinfo['sha256']}")

            # Check if there is an integrated decode process for this file,
            # e.g. files that are packaged, where the contained file (not the
            # package that local_path points to) should be passed into the system.
            extracted_path, fileinfo, al_meta = decode_file(local_path, fileinfo)
            al_meta['classification'] = al_meta.get('classification', classification)

            if extracted_path:
                local_path = extracted_path
                self.filestore.upload(local_path, fileinfo['sha256'])
            elif upload:
                self.filestore.upload(local_path, fileinfo['sha256'])

            self.datastore.save_or_freshen_file(fileinfo['sha256'], fileinfo, expiry,
                                                al_meta['classification'], redis=self.redis)

            return fileinfo['sha256'], fileinfo['size'], al_meta

        finally:
            # If we extracted anything, delete it
            if extracted_path and os.path.exists(extracted_path):
                os.unlink(extracted_path)

            # Also remove the local working copy when cleanup is requested
            if local_path and cleanup and os.path.exists(local_path):
                os.unlink(local_path)
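# Sketch of the initial_data flow from submit() above, factored out for
# clarity. It reuses get_temporary_submission_data_name and ExpiringHash
# exactly as the method does; only the factoring into a helper is new.
def seed_initial_data(sid, sha256, initial_data, redis):
    """Store a submission's initial_data JSON blob as temporary submission data.

    Services later see these entries as temporary_submission_data; e.g. the
    json.dumps({'cats': 'big'}) set in test_simple surfaces to the extract
    service as [{'name': 'cats', 'value': 'big'}].
    """
    temp_hash_name = get_temporary_submission_data_name(sid, sha256)
    ExpiringHash(temp_hash_name, host=redis).multi_set(json.loads(initial_data))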