# Module header reconstructed for this excerpt. The import paths below follow the
# assemblyline codebase but should be treated as assumptions; local helpers used
# throughout (ready_body, ready_extract, make_message, RESPONSE_TIMEOUT, the
# core/harness fixtures, etc.) are defined elsewhere in the test module.
import threading
import time
from unittest import mock

import pytest

import assemblyline_core
from assemblyline.odm.messages.submission import Submission as SubmissionInput
from assemblyline.odm.models.filescore import FileScore
from assemblyline.odm.models.submission import Submission, SubmissionParams
from assemblyline.odm.models.user import User
from assemblyline.odm.randomizer import random_minimal_obj
from assemblyline.remote.datatypes.queues.named import NamedQueue
from assemblyline_core.ingester.ingester import IngestTask, _dup_prefix
from assemblyline_core.watcher.server import WatcherServer


def test_depth_limit(core):
    # Make a nested set of files that goes deeper than the max depth by one
    sha, size = ready_body(core)
    for _ in range(core.config.submission.max_extraction_depth + 1):
        sha, size = ready_extract(core, sha)

    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
            # Make sure we can extract enough files that we will definitely hit the depth limit first
            max_extracted=core.config.submission.max_extraction_depth + 10
        ),
        notification=dict(queue='test-depth-limit', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-test-depth-limit', core.redis)
    start = time.time()
    task = notification_queue.pop(timeout=10)
    print("notification time waited", time.time() - start)
    assert task is not None
    task = IngestTask(task)
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1

    # We should only get results for each file up to the max depth
    assert len(sub.results) == 4 * core.config.submission.max_extraction_depth
    assert len(sub.errors) == 1
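
# Worked example of the depth-limit arithmetic above (hedged: assumes the core
# fixture runs 4 services per file, as the other assertions in this file imply).
# With max_extraction_depth = N, the loop builds a chain N+2 files deep (the
# original body plus N+1 wrappers). Files at depths 0..N-1 are fully processed
# (N files x 4 services = 4*N results), and the attempt to extract one level
# past the limit is refused, producing the single error.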
def test_submit_simple(submit_harness):
    datastore, submitter = submit_harness

    # Push a normal ingest task
    submitter.unique_queue.push(0, IngestTask({
        'submission': {
            'params': SubmissionParams({
                'classification': 'U',
                'description': 'file abc',
                'services': {
                    'selected': [],
                    'excluded': [],
                    'resubmit': [],
                },
                'submitter': 'user',
            }),
            'files': [{
                'sha256': '0' * 64,
                'size': 100,
                'name': 'abc',
            }],
            'metadata': {}
        },
        'ingest_id': '123abc'
    }).as_primitives())
    submitter.handle_submit()

    # The task has been passed to the submit tool and there are no other submissions
    submitter.submit_client.submit.assert_called()
    assert submitter.unique_queue.pop() is None
def test_max_extracted_in_one(core):
    # Make a set of files that is bigger than max_extracted (3 in this case)
    children = [ready_body(core)[0] for _ in range(5)]
    sha, size = ready_extract(core, children)

    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
            max_extracted=3
        ),
        notification=dict(queue='test-extracted-in-one', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-test-extracted-in-one', core.redis)
    start = time.time()
    task = notification_queue.pop(timeout=10)
    print("notification time waited", time.time() - start)
    assert task is not None
    task = IngestTask(task)
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1

    # We should only get results for the original file and the children within the max_extracted limit
    assert len(sub.results) == 4 * (1 + 3)
    assert len(sub.errors) == 2  # The number of children that errored out
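
# Arithmetic check for the assertions above (hedged: assumes 4 services per file):
# 1 parent + 5 children exist, but max_extracted=3 caps extraction, so
# 1 + 3 = 4 files are processed -> 4 files x 4 services = 16 results, and the
# 2 children past the cap are recorded as extraction errors.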
def test_ingest_retry(core: CoreSession, metrics):
    # -------------------------------------------------------------------------------
    sha, size = ready_body(core)
    original_retry_delay = assemblyline_core.ingester.ingester._retry_delay
    assemblyline_core.ingester.ingester._retry_delay = 1

    attempts = []
    failures = []
    original_submit = core.ingest.submit

    def fail_once(task):
        attempts.append(task)
        if len(attempts) > 1:
            original_submit(task)
        else:
            failures.append(task)
            raise ValueError()

    core.ingest.submit = fail_once

    try:
        core.ingest_queue.push(SubmissionInput(dict(
            metadata={},
            params=dict(
                description="file abc123",
                services=dict(selected=''),
                submitter='user',
                groups=['user'],
            ),
            notification=dict(queue='output-queue-one', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')]
        )).as_primitives())

        notification_queue = NamedQueue('nq-output-queue-one', core.redis)
        first_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)

        # The submission should be processed fully after the single injected failure
        assert first_task is not None
        first_task = IngestTask(first_task)
        first_submission: Submission = core.ds.submission.get(first_task.submission.sid)
        assert len(attempts) == 2
        assert len(failures) == 1
        assert first_submission.state == 'completed'
        assert len(first_submission.files) == 1
        assert len(first_submission.errors) == 0
        assert len(first_submission.results) == 4

        metrics.expect('ingester', 'submissions_ingested', 1)
        metrics.expect('ingester', 'submissions_completed', 1)
        metrics.expect('ingester', 'files_completed', 1)
        metrics.expect('ingester', 'duplicates', 0)
        metrics.expect('dispatcher', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'files_completed', 1)
    finally:
        core.ingest.submit = original_submit
        assemblyline_core.ingester.ingester._retry_delay = original_retry_delay
def test_max_extracted_in_several(core):
    # Make a non-trivial tree of files that adds up to more than 3 (max_extracted) files
    children = [
        ready_extract(core, [ready_body(core)[0], ready_body(core)[0]])[0],
        ready_extract(core, [ready_body(core)[0], ready_body(core)[0]])[0]
    ]
    sha, size = ready_extract(core, children)

    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
            max_extracted=3
        ),
        notification=dict(queue='test-extracted-in-several', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-test-extracted-in-several', core.redis)
    task = IngestTask(notification_queue.pop(timeout=10))
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1

    # We should only get results for the files within the max_extracted limit
    assert len(sub.results) == 4 * (1 + 3)  # 4 services, 1 original file, 3 extracted files
    assert len(sub.errors) == 3  # The number of children that errored out
def test_ingest_stale_score_exists(ingest_harness):
    datastore, ingester, in_queue = ingest_harness
    get_if_exists = datastore.filescore.get_if_exists
    try:
        # Add a stale file score to the database for every file always
        from assemblyline.odm.models.filescore import FileScore
        datastore.filescore.get_if_exists = mock.MagicMock(return_value=FileScore(
            dict(psid='000', expiry_ts=0, errors=0, score=10, sid='000', time=0)))

        # Process a message that hits the stale score
        in_queue.push(make_message())
        ingester.handle_ingest()

        # The stale filescore was retrieved
        datastore.filescore.get_if_exists.assert_called_once()

        # but the message was ingested as a cache miss
        task = ingester.unique_queue.pop()
        assert task
        task = IngestTask(task)
        assert task.submission.files[0].sha256 == '0' * 64

        assert ingester.unique_queue.length() == 0
        assert ingester.ingest_queue.length() == 0
    finally:
        datastore.filescore.get_if_exists = get_if_exists
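
# Why the stale score counts as a cache miss (a hedged reading of ingester.check):
# the mocked FileScore has time=0, so its age exceeds any configured filescore
# expiry window; the ingester therefore discards the cached verdict and queues
# the file for a fresh scan instead of short-circuiting with the stored score of 10.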
def test_extracted_file(core, metrics):
    sha, size = ready_extract(core, ready_body(core)[0])

    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
            max_extracted=10000
        ),
        notification=dict(queue='text-extracted-file', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-text-extracted-file', core.redis)
    task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
    assert task
    task = IngestTask(task)
    sub = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 8
    assert len(sub.errors) == 0

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 2)
# Helper excerpted from a larger test: relies on `core`, `counter`, `sha` and
# `size` from the enclosing scope.
def run_once():
    counter.reset_mock()

    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
        ),
        notification=dict(queue='1', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-1', core.redis)
    first_task = notification_queue.pop(timeout=5)

    # The submission should get processed fully
    assert first_task is not None
    first_task = IngestTask(first_task)
    first_submission: Submission = core.ds.submission.get(first_task.submission.sid)
    assert first_submission.state == 'completed'
    assert len(first_submission.files) == 1
    assert len(first_submission.errors) == 0
    assert len(first_submission.results) == 4
    return first_submission.sid
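
# Hedged usage sketch: in the enclosing test this helper appears designed to be
# called more than once so the returned sids can be compared to verify result
# caching, along the lines of:
#     sid_one = run_once()
#     sid_two = run_once()
#     assert sid_one == sid_two  # the second ingest should be served from cache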
def test_dropping_early(core, metrics):
    # -------------------------------------------------------------------------------
    # This time have a file get marked for dropping by a service
    sha, size = ready_body(core, {'pre': {'result': {'drop_file': True}}})

    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
            max_extracted=10000
        ),
        notification=dict(queue='drop', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-drop', core.redis)
    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
    dropped_task = IngestTask(dropped_task)
    sub = core.ds.submission.get(dropped_task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 1

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
def test_service_retry_limit(core, metrics):
    # This time have the service 'crash'
    sha, size = ready_body(core, {'pre': {'drop': 3}})

    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
            max_extracted=10000
        ),
        notification=dict(queue='watcher-recover', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-watcher-recover', core.redis)
    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)

    assert dropped_task
    dropped_task = IngestTask(dropped_task)
    sub = core.ds.submission.get(dropped_task.submission.sid)
    assert len(sub.errors) == 1
    assert len(sub.results) == 3
    assert core.pre_service.drops[sha] == 3
    assert core.pre_service.hits[sha] == 3

    # Wait until we get feedback from the metrics channel
    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'service_timeouts', 3)
    metrics.expect('service', 'fail_recoverable', 3)
    metrics.expect('service', 'fail_nonrecoverable', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
# Standalone-watcher variant of the service retry-limit test: it runs its own
# WatcherServer rather than relying on the shared metrics fixture, and is named
# distinctly so it does not shadow the version above.
def test_service_retry_limit_with_watcher(core):
    watch = WatcherServer(redis=core.redis, redis_persist=core.redis)
    watch.start()
    try:
        # This time have the service 'crash'
        sha, size = ready_body(core, {'pre': {'drop': 3}})

        core.ingest_queue.push(SubmissionInput(dict(
            metadata={},
            params=dict(
                description="file abc123",
                services=dict(selected=''),
                submitter='user',
                groups=['user'],
                max_extracted=10000
            ),
            notification=dict(queue='watcher-recover', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')]
        )).as_primitives())

        notification_queue = NamedQueue('nq-watcher-recover', core.redis)
        dropped_task = notification_queue.pop(timeout=16)

        assert dropped_task
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.errors) == 1
        assert len(sub.results) == 3
        assert core.pre_service.drops[sha] == 3
        assert core.pre_service.hits[sha] == 3
    finally:
        watch.stop()
        watch.join()
def test_plumber_clearing(core, metrics):
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)
    start = time.time()

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'hold': 60}})

        core.ingest_queue.push(SubmissionInput(dict(
            metadata={},
            params=dict(
                description="file abc123",
                services=dict(selected=''),
                submitter='user',
                groups=['user'],
                max_extracted=10000
            ),
            notification=dict(queue='test_plumber_clearing', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')]
        )).as_primitives())

        metrics.expect('ingester', 'submissions_ingested', 1)
        service_queue = get_service_queue('pre', core.redis)

        start = time.time()
        while service_queue.length() < 1:
            if time.time() - start > RESPONSE_TIMEOUT:
                pytest.fail(f'Found {service_queue.length()}')
            time.sleep(0.1)

        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = False
        core.ds.service_delta.save('pre', service_delta)

        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1
        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message

        metrics.expect('ingester', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'files_completed', 1)
        metrics.expect('service', 'fail_recoverable', 1)
    finally:
        _global_semaphore.release()
        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = True
        core.ds.service_delta.save('pre', service_delta)
def test_submit_duplicate(submit_harness):
    datastore, submitter = submit_harness

    # a normal ingest task
    task = IngestTask({
        'submission': {
            'params': SubmissionParams({
                'classification': 'U',
                'description': 'file abc',
                'services': {
                    'selected': [],
                    'excluded': [],
                    'resubmit': [],
                },
                'submitter': 'user',
            }),
            'files': [{
                'sha256': '0' * 64,
                'size': 100,
                'name': 'abc',
            }],
            'metadata': {}
        },
        'ingest_id': 'abc123'
    })
    # Make sure the scan key is correct, this is normally done on ingest
    task.submission.scan_key = task.params.create_filescore_key(task.submission.files[0].sha256, [])

    # Add this file to the scanning table so it looks like it has already been submitted, then ingest again
    submitter.scanning.add(task.submission.scan_key, task.as_primitives())
    submitter.unique_queue.push(0, task.as_primitives())
    submitter.handle_submit()

    # No tasks should be left in the queue
    assert submitter.unique_queue.pop() is None

    # The task should have been pushed to the duplicates queue
    assert submitter.duplicate_queue.length(_dup_prefix + task.submission.scan_key) == 1
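
# Hedged note on the mechanism exercised above: scan_key doubles as the filescore
# cache key (create_filescore_key hashes the sha256 together with the scan-relevant
# params, here an empty service list), and scanning.add() is the atomic claim on
# that key. Because the key is pre-claimed, handle_submit() lands the second copy
# on the _dup_prefix-keyed duplicate queue instead of submitting it again.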
def test_ingest_simple(ingest_harness):
    datastore, ingester, in_queue = ingest_harness

    user = random_minimal_obj(User)
    user.name = 'user'
    custom_user_groups = ['users', 'the_user']
    user.groups = list(custom_user_groups)
    datastore.user.save('user', user)

    # Let the ingest loop run an extra time because we send two messages
    ingester.running.counter += 1

    # Send a message with a garbled sha, this should be dropped
    in_queue.push(make_message(files={'sha256': '1' * 10}))

    with pytest.raises(ValueError):
        # Process the garbled message
        ingester.try_run(volatile=True)

    # Send a message that is fine, but has an illegal metadata field
    in_queue.push(make_message(
        dict(metadata={
            'tobig': 'a' * (ingester.ingester.config.submission.max_metadata_length + 2),
            'small': '100'
        }),
        params={'submitter': 'user', 'groups': []}
    ))

    # Process the ok message
    ingester.try_run(volatile=True)

    mm = ingester.ingester
    # The only task that makes it through fits these parameters
    task = mm.unique_queue.pop()
    assert task
    task = IngestTask(task)
    assert task.submission.files[0].sha256 == '0' * 64  # Only the valid sha passed through
    assert 'tobig' not in task.submission.metadata  # The bad metadata was stripped
    assert task.submission.metadata['small'] == '100'  # The valid metadata is unchanged
    assert task.submission.params.submitter == 'user'
    assert task.submission.params.groups == custom_user_groups

    # None of the other tasks should reach the end
    assert mm.unique_queue.length() == 0
    assert mm.ingest_queue.length() == 0
def test_existing_score(submit_harness):
    datastore, submitter = submit_harness

    get_if_exists = datastore.filescore.get_if_exists
    try:
        # Set everything to have an existing filescore
        datastore.filescore.get_if_exists = mock.MagicMock(return_value=FileScore(
            dict(psid='000', expiry_ts=0, errors=0, score=10, sid='000', time=time.time())))

        # add task to internal queue
        submitter.unique_queue.push(0, IngestTask({
            'submission': {
                'params': SubmissionParams({
                    'classification': 'U',
                    'description': 'file abc',
                    'services': {
                        'selected': [],
                        'excluded': [],
                        'resubmit': [],
                    },
                    'submitter': 'user',
                }),
                'files': [{
                    'sha256': '0' * 64,
                    'size': 100,
                    'name': 'abc',
                }],
                'metadata': {},
                'notification': {
                    'queue': 'our_queue'
                }
            },
            'ingest_id': 'abc123'
        }).as_primitives())

        submitter.handle_submit()

        # No tasks should be left in the queue
        assert submitter.unique_queue.pop() is None

        # We should have received a notification about our task, since it was already 'done'
        assert submitter.notification_queues['nq-our_queue'].length() == 1
    finally:
        datastore.filescore.get_if_exists = get_if_exists
# Standalone-watcher variant of the plumber clearing test: it runs its own
# WatcherServer rather than relying on the shared metrics fixture, and is named
# distinctly so it does not shadow the version above.
def test_plumber_clearing_with_watcher(core):
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)
    start = time.time()

    watch = WatcherServer(redis=core.redis, redis_persist=core.redis)
    watch.start()

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'semaphore': 60}})

        core.ingest_queue.push(SubmissionInput(dict(
            metadata={},
            params=dict(
                description="file abc123",
                services=dict(selected=''),
                submitter='user',
                groups=['user'],
                max_extracted=10000
            ),
            notification=dict(queue='test_plumber_clearing', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')]
        )).as_primitives())

        service_queue = get_service_queue('pre', core.redis)
        time.sleep(0.5)
        while service_queue.length() == 0 and time.time() - start < 20:
            time.sleep(0.1)

        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = False
        core.ds.service_delta.save('pre', service_delta)

        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = notification_queue.pop(timeout=5)
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1
        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message
    finally:
        _global_semaphore.release()
        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = True
        core.ds.service_delta.save('pre', service_delta)
        watch.stop()
        watch.join()
def test_service_error(core, metrics):
    # -------------------------------------------------------------------------------
    # Have a service produce a nonrecoverable error for the file
    # -------------------------------------------------------------------------------
    sha, size = ready_body(core, {
        'core-a': {
            'error': {
                'archive_ts': time.time() + 250,
                'sha256': 'a' * 64,
                'response': {
                    'message': 'words',
                    'status': 'FAIL_NONRECOVERABLE',
                    'service_name': 'core-a',
                    'service_tool_version': 0,
                    'service_version': '0'
                },
                'expiry_ts': time.time() + 500
            },
            'failure': True,
        }
    })

    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
            max_extracted=10000
        ),
        notification=dict(queue='error', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-error', core.redis)
    task = IngestTask(notification_queue.pop(timeout=RESPONSE_TIMEOUT))
    sub = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 3
    assert len(sub.errors) == 1

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
def test_ingest_groups_custom(ingest_harness):
    datastore, ingester, in_queue = ingest_harness

    user = random_minimal_obj(User)
    user.name = 'user'
    custom_user_groups = ['users', 'the_user']
    user.groups = list(custom_user_groups)
    datastore.user.save('user', user)

    in_queue.push(make_message(params={'submitter': 'user', 'groups': ['group_b']}))
    ingester.handle_ingest()

    task = ingester.unique_queue.pop()
    assert task
    task = IngestTask(task)
    assert task.submission.params.submitter == 'user'
    assert task.submission.params.groups == ['group_b']
def try_run(self, volatile=False):
    ingester = self.ingester
    logger = self.log

    time_mark, cpu_mark = time.time(), time.process_time()

    while self.running:
        # noinspection PyBroadException
        try:
            self.heartbeat()

            ingester.counter.increment_execution_time('cpu_seconds', time.process_time() - cpu_mark)
            ingester.counter.increment_execution_time('busy_seconds', time.time() - time_mark)

            # Check if there is room for more submissions
            length = ingester.scanning.length()
            if length >= ingester.config.core.ingester.max_inflight:
                time.sleep(0.1)
                time_mark, cpu_mark = time.time(), time.process_time()
                continue

            raw = ingester.unique_queue.pop()
            if not raw:
                time.sleep(0.1)
                time_mark, cpu_mark = time.time(), time.process_time()
                continue

            # Start timing 'busy' time, we reset this above after the sleeps so that the sleeps
            # don't get counted as busy
            time_mark, cpu_mark = time.time(), time.process_time()

            # Start of ingest message
            if self.apm_client:
                self.apm_client.begin_transaction('ingest_msg')

            task = IngestTask(raw)

            if any(len(file.sha256) != 64 for file in task.submission.files):
                logger.error("Malformed entry on submission queue: %s", task.ingest_id)
                # End of ingest message (invalid_hash)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'invalid_hash')
                continue

            # If between the initial ingestion and now the drop/whitelist status
            # of this submission has changed, then drop it now
            if ingester.drop(task):
                # End of ingest message (dropped)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'dropped')
                continue

            if ingester.is_whitelisted(task):
                # End of ingest message (whitelisted)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'whitelisted')
                continue

            # Check if this file has been previously processed.
            pprevious, previous, score, scan_key = None, False, None, None
            if not task.submission.params.ignore_cache:
                pprevious, previous, score, scan_key = ingester.check(task)
            else:
                scan_key = ingester.stamp_filescore_key(task)

            # If it HAS been previously processed, we are dealing with a resubmission;
            # finalize will decide what to do, and put the task back in the queue
            # rewritten properly if we are going to run it again
            if previous:
                if not task.submission.params.services.resubmit and not pprevious:
                    logger.warning(f"No psid for what looks like a resubmission of "
                                   f"{task.submission.files[0].sha256}: {scan_key}")
                ingester.finalize(pprevious, previous, score, task)
                # End of ingest message (finalized)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'finalized')
                continue

            # We have decided this file is worth processing

            # Add the task to the scanning table, this is atomic across all submit
            # workers, so if it fails, someone beat us to the punch, record the file
            # as a duplicate then.
            if not ingester.scanning.add(scan_key, task.as_primitives()):
                logger.debug('Duplicate %s', task.submission.files[0].sha256)
                ingester.counter.increment('duplicates')
                ingester.duplicate_queue.push(_dup_prefix + scan_key, task.as_primitives())
                # End of ingest message (duplicate)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'duplicate')
                continue

            # We have managed to add the task to the scan table, so now we go
            # ahead with the submission process
            try:
                ingester.submit(task)
                # End of ingest message (submitted)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'submitted')
                continue
            except Exception as _ex:
                # For some reason (contained in `ex`) we have failed the submission
                # The rest of this function is error handling/recovery
                ex = _ex
                traceback = _ex.__traceback__

            ingester.counter.increment('error')
            sha256 = task.submission.files[0].sha256  # used for error reporting below

            should_retry = True
            if isinstance(ex, CorruptedFileStoreException):
                logger.error("Submission for file '%s' failed due to corrupted filestore: %s"
                             % (sha256, str(ex)))
                should_retry = False
            elif isinstance(ex, DataStoreException):
                trace = exceptions.get_stacktrace_info(ex)
                logger.error("Submission for file '%s' failed due to data store error:\n%s"
                             % (sha256, trace))
            elif not isinstance(ex, FileStoreException):
                trace = exceptions.get_stacktrace_info(ex)
                logger.error("Submission for file '%s' failed: %s" % (sha256, trace))

            # Reclaim the task from the scanning table before deciding whether to retry
            raw_task = ingester.scanning.pop(scan_key)
            if not raw_task:
                logger.error('No scanning entry for %s', sha256)
                # End of ingest message (no_scan_entry)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'no_scan_entry')
                continue
            task = IngestTask(raw_task)

            if not should_retry:
                # End of ingest message (cannot_retry)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'cannot_retry')
                continue

            ingester.retry(task, scan_key, ex)
            # End of ingest message (retry)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_submit', 'retried')

            if volatile:
                raise ex.with_traceback(traceback)

        except Exception:
            logger.exception("Unexpected error")
            # End of ingest message (exception)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_submit', 'exception')

            if volatile:
                raise
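
# Summary of the recovery policy implemented above (as read from the code):
#   CorruptedFileStoreException -> logged, not retried (should_retry=False)
#   DataStoreException          -> logged with stack trace, retried
#   FileStoreException          -> retried without an error log
#   any other exception         -> logged with stack trace, retried
# In every failure path the task is first reclaimed from the scanning table so
# another submit worker does not treat the failed submission as still in flight.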
def try_run(self, volatile=False):
    ingester = self.ingester
    cpu_mark = time.process_time()
    time_mark = time.time()

    # Move from ingest to unique and waiting queues.
    # While there are entries in the ingest queue we consume chunk_size
    # entries at a time and move unique entries to uniqueq / queued and
    # duplicates to their own queues / waiting.
    while self.running:
        self.heartbeat()

        while True:
            result = ingester.complete_queue.pop(blocking=False)
            if not result:
                break

            # Start of ingest message
            if self.apm_client:
                self.apm_client.begin_transaction('ingest_msg')

            sub = Submission(result)
            ingester.completed(sub)

            # End of ingest message (success)
            if self.apm_client:
                elasticapm.tag(sid=sub.sid)
                self.apm_client.end_transaction('ingest_complete', 'success')

        ingester.counter.increment_execution_time('cpu_seconds', time.process_time() - cpu_mark)
        ingester.counter.increment_execution_time('busy_seconds', time.time() - time_mark)

        message = ingester.ingest_queue.pop(timeout=1)
        cpu_mark = time.process_time()
        time_mark = time.time()

        if not message:
            continue

        # Start of ingest message
        if self.apm_client:
            self.apm_client.begin_transaction('ingest_msg')

        try:
            sub = SubmissionInput(message)

            # Write all input to the traffic queue
            ingester.traffic_queue.publish(SubmissionMessage({
                'msg': sub,
                'msg_type': 'SubmissionIngested',
                'sender': 'ingester',
            }).as_primitives())

            task = IngestTask(dict(
                submission=sub,
                ingest_id=sub.sid,
            ))
            task.submission.sid = None  # Reset to new random uuid
        except (ValueError, TypeError) as error:
            self.log.exception(f"Dropped ingest submission {message} because {str(error)}")

            # End of ingest message (value_error)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_input', 'value_error')

            if volatile:
                raise
            continue

        if any(len(file.sha256) != 64 for file in task.submission.files):
            self.log.error(f"Invalid sha256: {[file.sha256 for file in task.submission.files]}")

            # End of ingest message (invalid_hash)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_input', 'invalid_hash')
            continue

        for file in task.submission.files:
            file.sha256 = file.sha256.lower()

        ingester.ingest(task)

        # End of ingest message (success)
        if self.apm_client:
            self.apm_client.end_transaction('ingest_input', 'success')
def test_deduplication(core):
    # -------------------------------------------------------------------------------
    # Submit two identical jobs, check that they get deduped by ingester
    sha, size = ready_body(core)

    for _ in range(2):
        core.ingest_queue.push(SubmissionInput(dict(
            metadata={},
            params=dict(
                description="file abc123",
                services=dict(selected=''),
                submitter='user',
                groups=['user'],
            ),
            notification=dict(queue='output-queue-one', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')]
        )).as_primitives())

    notification_queue = NamedQueue('nq-output-queue-one', core.redis)
    first_task = notification_queue.pop(timeout=5)
    second_task = notification_queue.pop(timeout=5)

    # One of the submissions will get processed fully
    assert first_task is not None
    first_task = IngestTask(first_task)
    first_submission: Submission = core.ds.submission.get(first_task.submission.sid)
    assert first_submission.state == 'completed'
    assert len(first_submission.files) == 1
    assert len(first_submission.errors) == 0
    assert len(first_submission.results) == 4

    # The other will get processed as a duplicate
    # (Which one is the 'real' one and which is the duplicate isn't important for our purposes)
    second_task = IngestTask(second_task)
    assert second_task.submission.sid == first_task.submission.sid

    # -------------------------------------------------------------------------------
    # Submit the same body, but change a parameter so the cache key misses
    core.ingest_queue.push(SubmissionInput(dict(
        metadata={},
        params=dict(
            description="file abc123",
            services=dict(selected=''),
            submitter='user',
            groups=['user'],
            max_extracted=10000
        ),
        notification=dict(queue='2', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]
    )).as_primitives())

    notification_queue = NamedQueue('nq-2', core.redis)
    third_task = notification_queue.pop(timeout=5)
    assert third_task

    # The third task should not be deduplicated by ingester, so it will have a different submission
    third_task = IngestTask(third_task)
    third_submission: Submission = core.ds.submission.get(third_task.submission.sid)
    assert third_submission.state == 'completed'
    assert first_submission.sid != third_submission.sid
    assert len(third_submission.files) == 1
    assert len(third_submission.results) == 4