def test_submit_duplicate(submit_harness):
    """A task whose scan_key already sits in the scanning table must land on the duplicate queue."""
    datastore, submitter = submit_harness

    # An ordinary ingest task for a single file
    params = SubmissionParams({
        'classification': 'U',
        'description': 'file abc',
        'services': {
            'selected': [],
            'excluded': [],
            'resubmit': [],
        },
        'submitter': 'user',
    })
    task = IngestTask({
        'submission': {
            'params': params,
            'files': [{
                'sha256': '0' * 64,
                'size': 100,
                'name': 'abc',
            }],
            'metadata': {},
        },
        'ingest_id': 'abc123',
    })

    # The scan key is normally assigned during ingest; set it by hand here
    task.submission.scan_key = task.params.create_filescore_key(task.submission.files[0].sha256, [])

    # Pretend the file is already being scanned, then queue it a second time
    submitter.scanning.add(task.submission.scan_key, task.as_primitives())
    submitter.unique_queue.push(0, task.as_primitives())

    submitter.handle_submit()

    # The unique queue should be drained...
    assert submitter.unique_queue.pop() is None

    # ...and the second copy should have been recorded as a duplicate
    assert submitter.duplicate_queue.length(_dup_prefix + task.submission.scan_key) == 1
def try_run(self, volatile=False):
    """Main submit-worker loop: pop ingest tasks and push them into scanning.

    Each iteration pops one task from the unique queue, then either drops it,
    treats it as whitelisted, finalizes it as a resubmission, records it as a
    duplicate, or submits it for scanning. Submission failures are logged,
    counted and retried when possible.

    :param volatile: When true, re-raise errors instead of only logging them
        (lets tests surface failures immediately).
    """
    ingester = self.ingester
    logger = self.log

    time_mark, cpu_mark = time.time(), time.process_time()

    while self.running:
        # noinspection PyBroadException
        try:
            self.heartbeat()
            ingester.counter.increment_execution_time('cpu_seconds', time.process_time() - cpu_mark)
            ingester.counter.increment_execution_time('busy_seconds', time.time() - time_mark)

            # Check if there is room for more submissions
            length = ingester.scanning.length()
            if length >= ingester.config.core.ingester.max_inflight:
                time.sleep(0.1)
                time_mark, cpu_mark = time.time(), time.process_time()
                continue

            raw = ingester.unique_queue.pop()
            if not raw:
                time.sleep(0.1)
                time_mark, cpu_mark = time.time(), time.process_time()
                continue

            # Start timing 'busy' time, we reset this above after the sleeps so that the sleeps
            # don't get counted as busy
            time_mark, cpu_mark = time.time(), time.process_time()

            # Start of ingest message
            if self.apm_client:
                self.apm_client.begin_transaction('ingest_msg')

            task = IngestTask(raw)

            # Reject entries that don't carry a well-formed sha256 for every file
            # noinspection PyBroadException
            if any(len(file.sha256) != 64 for file in task.submission.files):
                logger.error("Malformed entry on submission queue: %s", task.ingest_id)
                # End of ingest message (invalid_hash)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'invalid_hash')
                continue

            # If between the initial ingestion and now the drop/whitelist status
            # of this submission has changed, then drop it now
            if ingester.drop(task):
                # End of ingest message (dropped)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'dropped')
                continue

            if ingester.is_whitelisted(task):
                # End of ingest message (whitelisted)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'whitelisted')
                continue

            # Check if this file has been previously processed.
            pprevious, previous, score, scan_key = None, False, None, None
            if not task.submission.params.ignore_cache:
                pprevious, previous, score, scan_key = ingester.check(task)
            else:
                scan_key = ingester.stamp_filescore_key(task)

            # If it HAS been previously processed, we are dealing with a resubmission
            # finalize will decide what to do, and put the task back in the queue
            # rewritten properly if we are going to run it again
            if previous:
                if not task.submission.params.services.resubmit and not pprevious:
                    logger.warning(f"No psid for what looks like a resubmission of "
                                   f"{task.submission.files[0].sha256}: {scan_key}")
                ingester.finalize(pprevious, previous, score, task)
                # End of ingest message (finalized)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'finalized')
                continue

            # We have decided this file is worth processing

            # Add the task to the scanning table, this is atomic across all submit
            # workers, so if it fails, someone beat us to the punch, record the file
            # as a duplicate then.
            if not ingester.scanning.add(scan_key, task.as_primitives()):
                logger.debug('Duplicate %s', task.submission.files[0].sha256)
                ingester.counter.increment('duplicates')
                ingester.duplicate_queue.push(_dup_prefix + scan_key, task.as_primitives())
                # End of ingest message (duplicate)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'duplicate')
                continue

            # We have managed to add the task to the scan table, so now we go
            # ahead with the submission process
            try:
                ingester.submit(task)
                # End of ingest message (submitted)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'submitted')
                continue
            except Exception as _ex:
                # For some reason (contained in `ex`) we have failed the submission
                # The rest of this function is error handling/recovery
                ex = _ex
                traceback = _ex.__traceback__

                ingester.counter.increment('error')

                should_retry = True
                # NOTE(review): these messages reference `task.sha256`; confirm
                # IngestTask exposes that attribute (elsewhere the code reads
                # task.submission.files[0].sha256).
                if isinstance(ex, CorruptedFileStoreException):
                    logger.error("Submission for file '%s' failed due to corrupted filestore: %s"
                                 % (task.sha256, str(ex)))
                    should_retry = False
                elif isinstance(ex, DataStoreException):
                    trace = exceptions.get_stacktrace_info(ex)
                    logger.error("Submission for file '%s' failed due to data store error:\n%s"
                                 % (task.sha256, trace))
                elif not isinstance(ex, FileStoreException):
                    trace = exceptions.get_stacktrace_info(ex)
                    logger.error("Submission for file '%s' failed: %s" % (task.sha256, trace))

                # BUGFIX: pop the scanning entry into its own variable BEFORE
                # rebuilding the task. The original did
                #   task = IngestTask(ingester.scanning.pop(scan_key))
                #   if not task: ...
                # so the check tested the freshly built (always truthy) model
                # and the missing-entry branch could never trigger. Also fixed
                # the doubled word in the log message ('for for').
                entry = ingester.scanning.pop(scan_key)
                if not entry:
                    logger.error('No scanning entry for %s', task.sha256)
                    # End of ingest message (no_scan_entry)
                    if self.apm_client:
                        self.apm_client.end_transaction('ingest_submit', 'no_scan_entry')
                    continue
                task = IngestTask(entry)

                if not should_retry:
                    # End of ingest message (cannot_retry)
                    if self.apm_client:
                        self.apm_client.end_transaction('ingest_submit', 'cannot_retry')
                    continue

                # Requeue the task so it can be attempted again later
                ingester.retry(task, scan_key, ex)
                # End of ingest message (retry)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'retried')

                if volatile:
                    raise ex.with_traceback(traceback)

        except Exception:
            logger.exception("Unexpected error")
            # End of ingest message (exception)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_submit', 'exception')

            if volatile:
                raise