def _get_deep_dispatches(
    self,
    payload: Payload,
    add_deep_dispatches: List[str],
    request_meta: RequestMeta,
) -> Tuple[Set[str], DefaultDict[str, List[str]]]:
    errors: DefaultDict[str, List[str]] = defaultdict(list)
    deep_dispatches = set(add_deep_dispatches)
    for (
        deep_dispatcher_name,
        deep_dispatcher,
    ) in self._loaded_deep_dispatcher_plugins.items():
        try:
            deep_dispatcher_result = deep_dispatcher.get_deep_dispatches(
                payload, request_meta
            )
            deep_dispatches.update(deep_dispatcher_result.plugin_names)
            if deep_dispatcher_result.meta is not None:
                payload.deep_dispatch_meta[
                    deep_dispatcher_name
                ] = deep_dispatcher_result.meta
        except Exception as e:
            msg = 'deep dispatcher:failed to deep dispatch'
            self.log.exception(msg)
            errors[deep_dispatcher_name].append(helpers.format_exc(e, msg=msg))
    return (deep_dispatches, errors)
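# A minimal sketch of a deep dispatcher plugin that the loop above would call.
# The response attributes (`plugin_names`, `meta`) mirror the fields read above;
# the `DeepDispatcherPlugin`/`DeepDispatcherResponse` names, import paths, the
# trigger condition, and the 'example_worker' plugin are assumptions for
# illustration only, not stoQ's reference implementation.
from stoq import DeepDispatcherResponse, Payload, RequestMeta
from stoq.plugins import DeepDispatcherPlugin

class ExampleDeepDispatcher(DeepDispatcherPlugin):
    def get_deep_dispatches(
        self, payload: Payload, request_meta: RequestMeta
    ) -> DeepDispatcherResponse:
        plugin_names = []
        # Hypothetical trigger: only deep dispatch payloads that look like PE files
        if payload.content[:2] == b'MZ':
            plugin_names.append('example_worker')
        return DeepDispatcherResponse(
            plugin_names=plugin_names, meta={'matched': bool(plugin_names)}
        )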
async def scan(
    self,
    content: bytes,
    payload_meta: Optional[PayloadMeta] = None,
    request_meta: Optional[RequestMeta] = None,
    add_start_dispatch: Optional[List[str]] = None,
    ratelimit: Optional[str] = None,
) -> StoqResponse:
    """
    Wrapper for `scan_request` that creates a `Payload` object from bytes

    :param content: Raw bytes to be scanned
    :param payload_meta: Metadata pertaining to originating source
    :param request_meta: Metadata pertaining to the originating request
    :param add_start_dispatch: Force first round of scanning to use specified plugins
    :param ratelimit: Rate limit calls to scan

    """
    self.log.debug(
        f'Content received ({len(content)} bytes): '
        f'PayloadMeta: {helpers.dumps(payload_meta, indent=0)}, '
        f'RequestMeta: {helpers.dumps(request_meta, indent=0)}'
    )
    payload_meta = payload_meta or PayloadMeta()
    payload = Payload(content, payload_meta)
    request_meta = request_meta or RequestMeta()
    request = Request(payloads=[payload], request_meta=request_meta)
    return await self.scan_request(request, add_start_dispatch)
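# A minimal usage sketch for the async `scan` wrapper above, assuming a
# configured `Stoq` instance; the 'example_worker' plugin name is purely
# illustrative, and `add_start_dispatch` matches the parameter documented above.
import asyncio

from stoq import Stoq

async def main() -> None:
    s = Stoq()  # plugin directories/config are assumed to be set up elsewhere
    response = await s.scan(b'hello world', add_start_dispatch=['example_worker'])
    print(response)  # print the complete scan results

asyncio.run(main())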
def scan(
    self,
    content: bytes,
    payload_meta: Optional[PayloadMeta] = None,
    request_meta: Optional[RequestMeta] = None,
    add_start_dispatch: Optional[List[str]] = None,
    add_start_deep_dispatch: Optional[List[str]] = None,
    ratelimit: Optional[str] = None,
) -> StoqResponse:
    """
    Wrapper for `scan_payload` that creates a `Payload` object from bytes

    :param content: Raw bytes to be scanned
    :param payload_meta: Metadata pertaining to originating source
    :param request_meta: Metadata pertaining to the originating request
    :param add_start_dispatch: Force first round of scanning to use specified plugins
    :param add_start_deep_dispatch: Force second round of scanning to use specified plugins
    :param ratelimit: Rate limit calls to scan

    :return: Complete scan results
    :rtype: StoqResponse

    """
    payload_meta = PayloadMeta() if payload_meta is None else payload_meta
    payload = Payload(content, payload_meta)
    return self.scan_payload(
        payload, request_meta, add_start_dispatch, add_start_deep_dispatch
    )
async def ingest(self, queue: Queue) -> None:
    consumer = AIOKafkaConsumer(
        self.topic,
        group_id=self.group,
        auto_offset_reset='earliest',
        bootstrap_servers=self.servers,
        heartbeat_interval_ms=self.heartbeat_interval_ms,
        session_timeout_ms=self.session_timeout_ms,
        loop=get_event_loop(),
    )
    await consumer.start()
    self.log.info(f'Monitoring {self.topic} topic for messages...')
    async for message in consumer:
        msg = json.loads(message.value)
        if msg.get('_is_payload'):
            # This message is a payload that was placed on the queue
            # from the kafka-queue archiver plugin
            extra_data = msg['_payload_meta']
            extra_data['request_meta'] = msg['_request_meta']
            meta = PayloadMeta(extra_data=extra_data)
            payload = Payload(content=b64decode(msg['_content']), payload_meta=meta)
            await queue.put(payload)
        else:
            await queue.put(msg)
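# A minimal sketch of the message shape the consumer above expects when a full
# payload (rather than a plain task message) is published. The key names mirror
# the fields read above; the filename/source values and the commented producer
# call are illustrative assumptions, not the kafka-queue archiver's own code.
import json
from base64 import b64encode

example_message = {
    '_is_payload': True,
    '_content': b64encode(b'raw payload bytes').decode(),  # base64-encoded content
    '_payload_meta': {'filename': 'sample.bin'},           # hypothetical payload metadata
    '_request_meta': {'source': 'example'},                # hypothetical request metadata
}
# e.g. await producer.send_and_wait(topic, json.dumps(example_message).encode())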
def get(self, task: ArchiverResponse) -> Optional[Payload]:
    """
    Retrieve archived payload from MongoDB

    """
    self._connect_gridfs()
    result = self.gridfs_db.get(task.results['_id'])
    if result:
        payload = result.read()
        return Payload(payload, PayloadMeta(extra_data=task.results))
    return None
async def _apply_worker(
    self, payload: Payload, plugin: WorkerPlugin, request: Request
) -> Tuple[Set[Tuple[Payload, str]], List[Payload]]:
    self.log.debug(
        f'Scanning Payload {payload.results.payload_id} with WorkerPlugin {plugin.plugin_name}'
    )
    try:
        worker_response: Optional[WorkerResponse] = await plugin.scan(
            payload, request
        )
    except Exception as e:
        worker_response = None
        msg = 'worker:failed to scan'
        self.log.exception(msg)
        request.errors.append(
            Error(
                payload_id=payload.results.payload_id,
                plugin_name=plugin.plugin_name,
                error=helpers.format_exc(e, msg=msg),
            )
        )
    payload.results.plugins_run['workers'].append(plugin.plugin_name)
    if not worker_response:
        return set(), []
    if worker_response.results is not None:
        payload.results.workers[plugin.plugin_name] = worker_response.results
    request.errors.extend(worker_response.errors)
    additional_dispatches: Set[Tuple[Payload, str]] = {
        (payload, plugin_name) for plugin_name in worker_response.dispatch_to
    }
    extracted_payloads: List[Payload] = [
        Payload(
            content=extracted_payload.content,
            payload_meta=extracted_payload.payload_meta,
            extracted_by=plugin.plugin_name,
            extracted_from=payload.results.payload_id,
        )
        for extracted_payload in worker_response.extracted
    ]
    self.log.debug(
        f'Completed scan of {payload.results.payload_id} with '
        f'{len(worker_response.results) if worker_response.results else 0} result keys, '  # type: ignore
        f'{len(additional_dispatches)} additional dispatches, and '
        f'{len(extracted_payloads)} extracted payloads'
    )
    return additional_dispatches, extracted_payloads
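# A minimal sketch of a worker plugin whose response `_apply_worker` above can
# consume: `results`, `errors`, `dispatch_to`, and `extracted` mirror the
# attributes read above. The `WorkerPlugin`/`WorkerResponse`/`ExtractedPayload`
# constructor usage, import paths, and plugin names are simplifying assumptions.
from typing import Optional

from stoq import ExtractedPayload, Payload, Request, WorkerResponse
from stoq.plugins import WorkerPlugin

class ExampleWorker(WorkerPlugin):
    async def scan(self, payload: Payload, request: Request) -> Optional[WorkerResponse]:
        # Hypothetical result: record the payload size and carve a child payload
        child = ExtractedPayload(payload.content[:4])
        return WorkerResponse(
            results={'size': len(payload.content)},
            extracted=[child],
            dispatch_to=['example_followup_worker'],  # hypothetical plugin name
        )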
async def get(self, task: ArchiverResponse) -> Payload:
    """
    Retrieve archived payload from Azure Blob Storage

    """
    blob_client: BlobClient = BlobClient.from_connection_string(
        conn_str=self.conn_str,
        container_name=task.results['container_name'],
        blob_name=task.results['blob_name'],
    )
    stream = await blob_client.download_blob()
    content = await stream.readall()
    await blob_client.close()
    meta = PayloadMeta(extra_data=task.results)
    return Payload(content, meta)
def get(self, task: ArchiverResponse) -> Payload:
    """
    Retrieve archived payload from S3

    """
    if not self.client:
        self._get_client()
    meta = PayloadMeta(
        extra_data={'bucket': task.results['bucket'], 'path': task.results['path']}
    )
    content = self.client.get_object(
        Bucket=task.results['bucket'], Key=task.results['path']
    )['Body']
    return Payload(content.read(), meta)
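# A minimal sketch of retrieving an archived payload through this archiver.
# `get` only needs an ArchiverResponse whose results carry the 'bucket' and
# 'path' keys read above; the 's3' plugin name, import paths, and bucket/path
# values are illustrative assumptions.
from stoq import ArchiverResponse, Stoq

s = Stoq()  # plugin directories/config assumed to be set up elsewhere
archiver = s.load_plugin('s3')
task = ArchiverResponse(results={'bucket': 'my-archive', 'path': 'ab/cd/abcd1234'})
payload = archiver.get(task)
print(len(payload.content))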
def ingest(self, queue: Queue) -> None:
    consumer = KafkaConsumer(
        self.topic,
        group_id=self.group,
        auto_offset_reset='earliest',
        bootstrap_servers=self.servers,
    )
    print(f'Monitoring {self.topic} topic for messages...')
    for message in consumer:
        msg = json.loads(message.value)
        if msg.get('_is_payload'):
            meta = PayloadMeta(extra_data=msg['_request_meta'])
            payload = Payload(content=msg['_content'], payload_meta=meta)
            queue.put(payload)
        else:
            queue.put(msg)
def get(self, task: ArchiverResponse) -> Payload:
    """
    Retrieve archived payload from gcs

    """
    meta = PayloadMeta(
        extra_data={
            'bucket': task.results['archive_bucket'],
            'path': task.results['path'],
            'project_id': task.results['project_id'],
        }
    )
    client = Client(project=task.results['project_id'])
    bucket = client.get_bucket(task.results['archive_bucket'])
    blob = Blob(task.results['path'], bucket)
    content = BytesIO()
    blob.download_to_file(content)
    content.seek(0)
    return Payload(content.read(), meta)
async def ingest(self, queue: Queue) -> None:
    self.log.info(f'Monitoring redis queue {self.redis_queue}')
    while True:
        msg = self.conn.blpop(self.redis_queue, timeout=0)
        if not msg:
            await asyncio.sleep(0.1)
            continue
        data = msg[1].decode()
        payload = self.conn.get(f'{data}_buf')
        meta = self.conn.get(f'{data}_meta')
        if meta and payload:
            meta = json.loads(meta.decode())
            await queue.put(
                Payload(payload, payload_meta=PayloadMeta(extra_data=meta))
            )
            self.conn.delete(f'{data}_buf')
            self.conn.delete(f'{data}_meta')
        else:
            await queue.put(json.loads(data))
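# A minimal sketch of the producer side this consumer expects: a payload id is
# pushed onto the list while the raw bytes and JSON metadata live under
# '<id>_buf' / '<id>_meta' keys, mirroring the reads above. The key layout is
# inferred from the consumer; the connection settings, queue name, and metadata
# values are illustrative assumptions and the publishing plugin may differ.
import json

import redis

conn = redis.Redis(host='localhost', port=6379)  # hypothetical connection settings
payload_id = 'example-id'
conn.set(f'{payload_id}_buf', b'raw payload bytes')
conn.set(f'{payload_id}_meta', json.dumps({'filename': 'sample.bin'}))
conn.rpush('stoq', payload_id)  # 'stoq' stands in for self.redis_queue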
def get(self, task: ArchiverResponse) -> Payload:
    """
    Retrieve archived payload from gcs

    """
    meta = PayloadMeta(
        extra_data={
            'bucketId': task.results['bucketId'],
            'objectId': task.results['objectId'],
            'projectId': task.results['projectId'],
        }
    )
    count = 0
    client = Client(project=task.results['projectId'])
    while count < self.max_retries:
        try:
            bucket = client.get_bucket(task.results['bucketId'])
            blob = Blob(task.results['objectId'], bucket)
            content = BytesIO()
            blob.download_to_file(content)
            break
        except (
            InvalidResponse,
            GoogleAPICallError,
            InternalServerError,
            SSLError,
        ) as e:
            count += 1
            if count >= self.max_retries:
                raise StoqPluginException(
                    f'Failed to download {task.results["bucketId"]}/'
                    f'{task.results["objectId"]} from GCS: {str(e)}'
                )
            sleep(randrange(0, 4))
    content.seek(0)
    data = content.read()
    if self.use_encryption:
        data = self._decrypt(data)
    return Payload(data, meta)
async def reconstruct_all_subresponses(
    self, stoq_response: StoqResponse
) -> AsyncGenerator[StoqResponse, None]:
    """
    Generate a new `StoqResponse` object for each `Payload` within
    the `Request`

    """
    for i, new_root_payload_result in enumerate(stoq_response.results):
        parent_payload_ids = {stoq_response.results[i].payload_id}
        # Construct a new root Payload object since StoqResponse only has the
        # PayloadResults object
        new_root_payload = Payload(b'')
        new_root_payload.results = new_root_payload_result
        relevant_payloads: List[Payload] = [new_root_payload]
        for payload_result in stoq_response.results[i:]:
            for extracted_from in payload_result.extracted_from:
                if extracted_from in parent_payload_ids:
                    parent_payload_ids.add(payload_result.payload_id)
                    new_payload = Payload(b'')
                    new_payload.results = payload_result
                    relevant_payloads.append(new_payload)
        new_request = Request(
            payloads=relevant_payloads, request_meta=stoq_response.request_meta
        )
        new_response = StoqResponse(
            request=new_request,
            time=stoq_response.time,
            scan_id=stoq_response.scan_id,
        )
        decorator_tasks = []
        for plugin_name, decorator in self._loaded_decorator_plugins.items():
            decorator_tasks.append(self._apply_decorator(decorator, new_response))
        await asyncio.gather(*decorator_tasks)
        yield new_response
async def test_reconstruct_all_subresponses(self):
    # Construct a fake stoq_response as if it were generated from a file
    # A.zip that contains two files, B.txt and C.zip, where C.zip contains D.txt
    results = [
        Payload(content=b'', payload_id='A.zip', payload_meta=PayloadMeta()),
        Payload(
            content=b'',
            payload_id='B.txt',
            payload_meta=PayloadMeta(),
            extracted_from='A.zip',
            extracted_by='fake',
        ),
        Payload(
            content=b'',
            payload_id='C.zip',
            payload_meta=PayloadMeta(),
            extracted_from='A.zip',
            extracted_by='fake',
        ),
        Payload(
            content=b'',
            payload_id='D.txt',
            payload_meta=PayloadMeta(),
            extracted_from='C.zip',
            extracted_by='fake',
        ),
    ]
    request = Request(request_meta=RequestMeta(extra_data={'check': 'me'}))
    payload_count = 1
    for result in results:
        result.results.workers['fake'] = f'result-{payload_count}'
        result.results.plugins_run['workers'].append('fake')
        request.payloads.append(result)
        payload_count += 1
    initial_response = StoqResponse(request)
    s = Stoq(base_dir=utils.get_data_dir(), decorators=['simple_decorator'])
    all_subresponses = [
        r async for r in s.reconstruct_all_subresponses(initial_response)
    ]
    # We expect there to be four "artificial" responses generated, one for
    # each payload as the root.
    self.assertEqual(len(all_subresponses), 4)
    # We expect the first response to have all 4 payloads, the second response
    # to have just the second payload, the third response to have the third
    # and fourth payload, and the fourth response to have just the fourth payload
    self.assertEqual(
        [len(stoq_response.results) for stoq_response in all_subresponses],
        [4, 1, 2, 1],
    )
    self.assertEqual(
        [
            stoq_response.results[0].workers['fake']
            for stoq_response in all_subresponses
        ],
        ['result-1', 'result-2', 'result-3', 'result-4'],
    )
    self.assertTrue(
        all(
            'simple_decorator' in stoq_response.decorators
            for stoq_response in all_subresponses
        )
    )
    # Assert that they all have the same scan ID
    self.assertEqual(
        len({stoq_response.scan_id for stoq_response in all_subresponses}), 1
    )
def _single_scan(
    self,
    payload: Payload,
    add_dispatch: List[str],
    add_deep_dispatch: List[str],
    request_meta: RequestMeta,
) -> Tuple[PayloadResults, List[Payload], DefaultDict[str, List[str]]]:
    extracted = []
    errors: DefaultDict[str, List[str]] = defaultdict(list)
    dispatch_pass = 0

    dispatches, dispatch_errors = self._get_dispatches(
        payload, add_dispatch, request_meta
    )
    if dispatch_errors:
        errors = helpers.merge_dicts(errors, dispatch_errors)
    for plugin_name in dispatches:
        try:
            plugin = self.load_plugin(plugin_name)
        except Exception as e:
            msg = 'worker:failed to load'
            self.log.exception(msg)
            errors[plugin_name].append(helpers.format_exc(e, msg=msg))
            continue
        # Normal dispatches are the "1st round" of scanning
        payload.plugins_run['workers'][0].append(plugin_name)
        try:
            worker_response = plugin.scan(payload, request_meta)  # pyre-ignore[16]
        except Exception as e:
            msg = 'worker:failed to scan'
            self.log.exception(msg)
            errors[plugin_name].append(helpers.format_exc(e, msg=msg))
            continue
        if worker_response is None:
            continue
        if worker_response.results is not None:
            # Normal dispatches are the "1st round" of scanning
            payload.worker_results[0][plugin_name] = worker_response.results
        extracted.extend(
            [
                Payload(ex.content, ex.payload_meta, plugin_name, payload.payload_id)
                for ex in worker_response.extracted
            ]
        )
        if worker_response.errors:
            errors[plugin_name].extend(worker_response.errors)

    while dispatch_pass < self.max_dispatch_passes:
        dispatch_pass += 1
        deep_dispatches, deep_dispatch_errors = self._get_deep_dispatches(
            payload, add_deep_dispatch, request_meta
        )
        if deep_dispatch_errors:
            errors = helpers.merge_dicts(errors, deep_dispatch_errors)
        if deep_dispatches:
            # Add another entry for this round
            payload.plugins_run['workers'].append([])
            payload.worker_results.append({})
        else:
            break
        for plugin_name in deep_dispatches:
            try:
                plugin = self.load_plugin(plugin_name)
            except Exception as e:
                msg = f'deep dispatch:failed to load (pass {dispatch_pass}/{self.max_dispatch_passes})'
                self.log.exception(msg)
                errors[plugin_name].append(helpers.format_exc(e, msg=msg))
                continue
            payload.plugins_run['workers'][dispatch_pass].append(plugin_name)
            try:
                worker_response = plugin.scan(payload, request_meta)  # pyre-ignore[16]
            except Exception as e:
                msg = f'deep dispatch:failed to scan (pass {dispatch_pass}/{self.max_dispatch_passes})'
                self.log.exception(msg)
                errors[plugin_name].append(helpers.format_exc(e, msg=msg))
                continue
            if worker_response is None:
                continue
            if worker_response.results is not None:
                payload.worker_results[dispatch_pass][plugin_name] = worker_response.results
            extracted.extend(
                [
                    Payload(ex.content, ex.payload_meta, plugin_name, payload.payload_id)
                    for ex in worker_response.extracted
                ]
            )
            if worker_response.errors:
                errors[plugin_name].extend(worker_response.errors)

    payload_results = PayloadResults.from_payload(payload)
    if request_meta.archive_payloads and payload.payload_meta.should_archive:
        for plugin_name, archiver in self._loaded_dest_archiver_plugins.items():
            payload.plugins_run['archivers'].append(plugin_name)
            try:
                archiver_response = archiver.archive(payload, request_meta)
            except Exception as e:
                msg = 'archiver:failed to archive'
                self.log.exception(msg)
                errors[plugin_name].append(helpers.format_exc(e, msg=msg))
                continue
            if archiver_response is None:
                continue
            if archiver_response.results is not None:
                payload_results.archivers[plugin_name] = archiver_response.results
            if archiver_response.errors:
                errors[plugin_name].extend(archiver_response.errors)
    return (payload_results, extracted, errors)
def get(self, task: ArchiverResponse) -> Optional[Payload]:
    if self.RAISE_EXCEPTION:
        raise Exception('Test exception please ignore')
    return Payload(self.PAYLOAD, PayloadMeta(extra_data=task.results))
def test_payloadresults_to_str(self):
    payload = Payload(self.generic_content)
    response_str = str(payload.results)
    response_dict = json.loads(response_str)
    self.assertIsInstance(response_str, str)
    self.assertIsInstance(response_dict, dict)