def scan(self, data, file, options, expire_at):
    """Collects metadata and extracts attachments from TNEF files.

    Records the descriptive name of every TNEF object, captures the
    subject / message ID / message class values, and re-submits each
    attachment and the HTML body as extracted files.

    Fix: the bare ``except:`` around the NUL-strip is narrowed to the
    exceptions a non-bytes ``data`` value can actually raise.
    """
    self.event['total'] = {'attachments': 0, 'extracted': 0}
    self.event.setdefault('object_names', [])

    tnef = tnefparse.TNEF(data)
    tnef_objects = getattr(tnef, 'objects', [])
    for tnef_object in tnef_objects:
        descriptive_name = tnefparse.TNEF.codes.get(tnef_object.name)
        if descriptive_name not in self.event['object_names']:
            self.event['object_names'].append(descriptive_name)

        try:
            # Trailing NUL padding carries no meaning; an all-NUL value
            # is treated as absent.
            object_data = tnef_object.data.strip(b'\0') or None
        except (AttributeError, TypeError):
            # Data is not a bytes-like value; keep it unmodified.
            object_data = tnef_object.data

        if object_data is not None:
            if descriptive_name == 'Subject':
                self.event['subject'] = object_data
            elif descriptive_name == 'Message ID':
                self.event['message_id'] = object_data
            elif descriptive_name == 'Message Class':
                self.event['message_class'] = object_data

    tnef_attachments = getattr(tnef, 'attachments', [])
    self.event['total']['attachments'] = len(tnef_attachments)
    for attachment in tnef_attachments:
        extract_file = strelka.File(
            name=attachment.name.decode(),
            source=self.name,
        )
        for c in strelka.chunk_string(attachment.data):
            self.upload_to_coordinator(
                extract_file.pointer,
                c,
                expire_at,
            )
        self.files.append(extract_file)
        self.event['total']['extracted'] += 1

    tnef_html = getattr(tnef, 'htmlbody', None)
    if tnef_html is not None:
        extract_file = strelka.File(
            name='htmlbody',
            source=self.name,
        )
        for c in strelka.chunk_string(tnef_html):
            self.upload_to_coordinator(
                extract_file.pointer,
                c,
                expire_at,
            )
        self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Extracts embedded objects from RTF files, up to an optional limit.

    Fixes: the original computed each object's index with
    ``rtf.server.index(object)`` — ``server`` is not an RtfObjParser
    attribute (AttributeError at runtime), and ``.index()`` inside the
    loop is O(n^2). The index now comes from enumerate(). The loop
    variable no longer shadows the ``object`` builtin.
    """
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'objects': 0, 'extracted': 0}

    rtf = rtfobj.RtfObjParser(data)
    rtf.parse()
    self.event['total']['objects'] = len(rtf.objects)

    for index, rtf_object in enumerate(rtf.objects):
        if self.event['total']['extracted'] >= file_limit:
            break

        if rtf_object.is_package:
            # OLE Package: carries its own filename and payload.
            extract_file = strelka.File(
                name=rtf_object.filename,
                source=self.name,
            )
            extract_data = rtf_object.olepkgdata
        elif rtf_object.is_ole:
            extract_file = strelka.File(
                name=f'object_{index}',
                source=self.name,
            )
            extract_data = rtf_object.oledata
        else:
            # Not parseable as OLE; fall back to the raw object bytes.
            extract_file = strelka.File(
                name=f'object_{index}',
                source=self.name,
            )
            extract_data = rtf_object.rawdata

        for c in strelka.chunk_string(extract_data):
            self.upload_to_coordinator(
                extract_file.pointer,
                c,
                expire_at,
            )
        self.files.append(extract_file)
        self.event['total']['extracted'] += 1
def scan(self, data, file, options, expire_at):
    """Extracts every stream from an OLE compound file.

    Ole10Native streams are unwrapped so the embedded payload (and its
    original filename, when present) is extracted instead of the raw
    wrapper.

    Fixes: ``ole`` was referenced in the ``finally`` block before
    assignment whenever ``OleFileIO()`` itself raised, and the stream
    handle no longer shadows the ``file`` parameter.
    """
    self.event['total'] = {'streams': 0, 'extracted': 0}

    ole = None
    try:
        ole = olefile.OleFileIO(data)
        ole_streams = ole.listdir(streams=True)
        self.event['total']['streams'] = len(ole_streams)
        for stream in ole_streams:
            stream_file = ole.openstream(stream)
            extract_data = stream_file.read()
            extract_name = f'{"_".join(stream)}'
            # Strip ASCII control characters from the joined stream name.
            extract_name = re.sub(r'[\x00-\x1F]', '', extract_name)

            if extract_name.endswith('Ole10Native'):
                # Unwrap the native stream to reach the embedded payload.
                native_stream = oletools.oleobj.OleNativeStream(
                    bindata=extract_data,
                )
                if native_stream.filename:
                    extract_name = extract_name + f'_{str(native_stream.filename)}'
                else:
                    extract_name = extract_name + '_native_data'

                extract_file = strelka.File(
                    name=extract_name,
                    source=self.name,
                )
                for c in strelka.chunk_string(native_stream.data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
            else:
                extract_file = strelka.File(
                    name=extract_name,
                    source=self.name,
                )
                for c in strelka.chunk_string(extract_data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
            self.files.append(extract_file)
            self.event['total']['extracted'] += 1
    except OSError:
        self.flags.append('os_error')
    finally:
        if ole is not None:
            ole.close()
def scan(self, data, file, options, expire_at):
    """Extracts signer certificates from a PKCS#7 blob via M2Crypto.

    Writes the payload to a temporary file so SMIME can load it, then
    re-submits each signer certificate (DER-encoded) as a new file.
    """
    tmp_directory = options.get('tmp_directory', '/tmp/')
    self.event['total'] = {'certificates': 0, 'extracted': 0}

    with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
        tmp_data.write(data)
        tmp_data.flush()

        # A leading 0x30 (ASCII '0') marks an ASN.1 SEQUENCE, i.e. DER;
        # anything else is loaded as PEM.
        if data[:1] == b'0':
            pkcs7 = SMIME.load_pkcs7_der(tmp_data.name)
        else:
            pkcs7 = SMIME.load_pkcs7(tmp_data.name)

        certs = pkcs7.get0_signers(X509.X509_Stack())
        if not certs:
            return

        self.event['total']['certificates'] = len(certs)
        for cert in certs:
            extract_file = strelka.File(
                name=f'sn_{cert.get_serial_number()}',
                source=self.name,
            )
            for chunk in strelka.chunk_string(cert.as_der()):
                self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
            self.files.append(extract_file)
            self.event['total']['extracted'] += 1
def scan(self, data, file, options, expire_at):
    """Decompresses LZMA/XZ data and re-submits the result as a file."""
    try:
        with io.BytesIO(data) as lzma_io, lzma.LZMAFile(filename=lzma_io) as lzma_obj:
            try:
                decompressed = lzma_obj.read()
                self.event['decompressed_size'] = len(decompressed)

                extract_file = strelka.File(source=self.name)
                for chunk in strelka.chunk_string(decompressed):
                    self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
                self.files.append(extract_file)
            except EOFError:
                # Truncated stream: record the condition, extract nothing.
                self.flags.append('eof_error')
    except lzma.LZMAError:
        self.flags.append('lzma_error')
def scan(self, data, file, options, expire_at):
    """Extracts files from RAR archives, up to an optional limit.

    Password-protected members are flagged rather than extracted.
    """
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'files': 0, 'extracted': 0}

    with io.BytesIO(data) as rar_io, rarfile.RarFile(rar_io) as rar_obj:
        info_list = rar_obj.infolist()
        self.event['total']['files'] = len(info_list)

        for member in info_list:
            # Directory entries carry no payload.
            if member.isdir():
                continue
            if self.event['total']['extracted'] >= file_limit:
                break

            file_info = rar_obj.getinfo(member)
            if file_info.needs_password():
                self.flags.append('password_protected')
                continue

            # Map the numeric host-OS code to a readable name.
            self.event['host_os'] = HOST_OS_MAPPING[file_info.host_os]
            extract_file = strelka.File(
                name=f'{file_info.filename}',
                source=self.name,
            )
            for chunk in strelka.chunk_string(rar_obj.read(member)):
                self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
            self.files.append(extract_file)
            self.event['total']['extracted'] += 1
def scan(self, data, file, options, expire_at):
    """Attempts to UPX-unpack the sample and re-submits the result.

    Runs ``upx -d`` against a temporary copy of the data; when
    unpacking succeeds and the output is larger than the input, the
    unpacked file is extracted. A nonzero return code is flagged.
    """
    tmp_directory = options.get('tmp_directory', '/tmp/')

    with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
        tmp_data.write(data)
        tmp_data.flush()

        upx_output = f'{tmp_data.name}_upx'
        upx_return = subprocess.call(
            ['upx', '-d', tmp_data.name, '-o', upx_output],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if upx_return != 0:
            self.flags.append(f'return_code_{upx_return}')
            return

        with open(upx_output, 'rb') as upx_fin:
            upx_file = upx_fin.read()

        # A successful unpack should yield more bytes than the packed input.
        if len(upx_file) > len(data):
            extract_file = strelka.File(source=self.name)
            for chunk in strelka.chunk_string(upx_file):
                self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
            self.files.append(extract_file)
        os.remove(upx_output)
def scan(self, data, file, options, expire_at):
    """Cracks a password-protected ZIP with John the Ripper, then extracts.

    Attempts password recovery via crack_zip(); on success every
    non-directory member is decrypted and re-submitted, up to the
    configured file limit.
    """
    jtr_path = options.get('jtr_path', '/jtr/')
    tmp_directory = options.get('tmp_file_directory', '/tmp/')
    file_limit = options.get('limit', 1000)
    password_file = options.get('password_file', '/etc/strelka/passwords.dat')
    log_extracted_pws = options.get('log_pws', False)
    scanner_timeout = options.get('scanner_timeout', 150)
    brute = options.get('brute_force', False)
    max_length = options.get('max_length', 5)

    self.event['total'] = {'files': 0, 'extracted': 0}

    with io.BytesIO(data) as zip_io:
        try:
            with zipfile.ZipFile(zip_io) as zip_obj:
                name_list = zip_obj.namelist()
                self.event['total']['files'] = len(name_list)

                extracted_pw = crack_zip(
                    self,
                    data,
                    jtr_path,
                    tmp_directory,
                    brute=brute,
                    scanner_timeout=scanner_timeout,
                    max_length=max_length,
                    password_file=password_file,
                )
                if not extracted_pw:
                    self.flags.append('Could not extract password')
                    return
                if log_extracted_pws:
                    self.event['cracked_password'] = extracted_pw

                for name in name_list:
                    # Directory entries carry no payload.
                    if name.endswith('/'):
                        continue
                    if self.event['total']['extracted'] >= file_limit:
                        break
                    try:
                        extract_data = zip_obj.read(name, extracted_pw)
                        if extract_data:
                            extract_file = strelka.File(name=name, source=self.name)
                            for chunk in strelka.chunk_string(extract_data):
                                self.upload_to_coordinator(
                                    extract_file.pointer, chunk, expire_at)
                            self.files.append(extract_file)
                            self.event['total']['extracted'] += 1
                    except NotImplementedError:
                        self.flags.append('unsupported_compression')
                    except RuntimeError:
                        self.flags.append('runtime_error')
                    except ValueError:
                        self.flags.append('value_error')
                    except zlib.error:
                        self.flags.append('zlib_error')
        except zipfile.BadZipFile:
            self.flags.append('bad_zip')
def scan(self, data, file, options, expire_at):
    """Converts a Word document to text with antiword and extracts it."""
    tmp_directory = options.get('tmp_directory', '/tmp/')

    with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
        tmp_data.write(data)
        tmp_data.flush()

        (stdout, stderr) = subprocess.Popen(
            ['antiword', tmp_data.name],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        ).communicate()

        if not stdout:
            return

        extract_file = strelka.File(name='text', source=self.name)
        for chunk in strelka.chunk_string(stdout):
            self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
        self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Decodes base64 payloads and re-submits the decoded bytes."""
    with io.BytesIO(data) as encoded_file:
        extract_data = b''
        try:
            extract_data = base64.b64decode(encoded_file.read())
            # Keep a short preview of the decoded content in the event.
            self.event['decoded_header'] = extract_data[:50]
        except binascii.Error:
            self.flags.append('not_decodable_from_base64')

    if extract_data:
        extract_file = strelka.File(source=self.name)
        for chunk in strelka.chunk_string(extract_data):
            self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
        self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Decompresses SWF files.

    CWS (zlib) and ZWS (LZMA) bodies are decompressed and re-submitted
    as an uncompressed FWS file; plain FWS input is only typed.

    Consistency fix: uploads go through upload_to_coordinator like the
    other scanners in this file (the original still called the old
    upload_to_cache name).
    """
    with io.BytesIO(data) as swf_io:
        # Bytes 4-7 hold the declared (uncompressed) file length.
        swf_io.seek(4)
        swf_size = struct.unpack('<i', swf_io.read(4))[0]
        swf_io.seek(0)
        magic = swf_io.read(3)
        # Rebuild an FWS header: signature plus the 5 header bytes that
        # follow it (version + length).
        extract_data = b'FWS' + swf_io.read(5)

        if magic == b'CWS':
            self.event['type'] = 'CWS'
            try:
                extract_data += zlib.decompress(swf_io.read())[:swf_size - 8]
                extract_file = strelka.File(source=self.name)
                for c in strelka.chunk_string(extract_data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
            except zlib.error:
                self.flags.append('zlib_error')
        elif magic == b'ZWS':
            self.event['type'] = 'ZWS'
            # LZMA payload begins at offset 12 (8-byte header plus the
            # 4-byte compressed-length field).
            swf_io.seek(12)
            extract_data += pylzma.decompress(swf_io.read())[:swf_size - 8]
            extract_file = strelka.File(source=self.name)
            for c in strelka.chunk_string(extract_data):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    c,
                    expire_at,
                )
            self.files.append(extract_file)
        elif magic == b'FWS':
            self.event['type'] = 'FWS'
def scan(self, data, file, options, expire_at):
    """Extracts files from ZIP archives, up to an optional limit.

    Encrypted members are flagged and skipped rather than read.

    Bug fix: the original only appended the 'encrypted' flag when the
    *first* member (index 0) was encrypted; archives whose later
    members were encrypted went unflagged. The flag is now recorded
    once for any encrypted member.
    """
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'files': 0, 'extracted': 0}

    with io.BytesIO(data) as zip_io:
        try:
            with zipfile.ZipFile(zip_io) as zip_obj:
                name_list = zip_obj.namelist()
                self.event['total']['files'] = len(name_list)

                for name in name_list:
                    # Directory entries carry no payload.
                    if name.endswith('/'):
                        continue
                    if self.event['total']['extracted'] >= file_limit:
                        break
                    try:
                        extract_data = b''
                        zinfo = zip_obj.getinfo(name)
                        # Bit 0 of the general-purpose flags marks
                        # encryption.
                        if zinfo.flag_bits & 0x1:
                            if 'encrypted' not in self.flags:
                                self.flags.append('encrypted')
                        else:
                            extract_data = zip_obj.read(name)

                        if extract_data:
                            extract_file = strelka.File(
                                name=name,
                                source=self.name,
                            )
                            for c in strelka.chunk_string(extract_data):
                                self.upload_to_coordinator(
                                    extract_file.pointer,
                                    c,
                                    expire_at,
                                )
                            self.files.append(extract_file)
                            self.event['total']['extracted'] += 1
                    except NotImplementedError:
                        self.flags.append('unsupported_compression')
                    except RuntimeError:
                        self.flags.append('runtime_error')
                    except ValueError:
                        self.flags.append('value_error')
                    except zlib.error:
                        self.flags.append('zlib_error')
        except zipfile.BadZipFile:
            self.flags.append('bad_zip')
def scan(self, data, file, options, expire_at):
    """Extracts and optionally analyzes VBA macros with olevba3.

    Each macro stream is re-submitted as a file; when analyze_macros is
    set, keyword analysis results are grouped by category in the event.

    Bug fix (was noted by a TODO): ``vba`` is now initialized before the
    try block, so ``vba.close()`` in ``finally`` no longer raises
    UnboundLocalError when VBA_Parser() construction itself fails.
    """
    analyze_macros = options.get('analyze_macros', True)
    self.event['total'] = {'files': 0, 'extracted': 0}

    vba = None
    try:
        vba = olevba3.VBA_Parser(filename=file.name, data=data)
        if vba.detect_vba_macros():
            extract_macros = list(vba.extract_macros())
            self.event['total']['files'] = len(extract_macros)
            for (filename, stream_path, vba_filename, vba_code) in extract_macros:
                extract_file = strelka.File(
                    name=f'{vba_filename}',
                    source=self.name,
                )
                for c in strelka.chunk_string(vba_code):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.event['total']['extracted'] += 1

            if analyze_macros:
                self.event.setdefault('auto_exec', [])
                self.event.setdefault('base64', [])
                self.event.setdefault('dridex', [])
                self.event.setdefault('hex', [])
                self.event.setdefault('ioc', [])
                self.event.setdefault('suspicious', [])
                # olevba analysis category -> event list it feeds.
                category_map = {
                    'AutoExec': 'auto_exec',
                    'Base64 String': 'base64',
                    'Dridex String': 'dridex',
                    'Hex String': 'hex',
                    'IOC': 'ioc',
                    'Suspicious': 'suspicious',
                }
                for (macro_type, keyword, description) in vba.analyze_macros():
                    event_key = category_map.get(macro_type)
                    if event_key is not None:
                        self.event[event_key].append(keyword)
    except olevba3.FileOpenError:
        self.flags.append('file_open_error')
    finally:
        if vba is not None:
            vba.close()
def scan(self, data, file, options, expire_at):
    """Decrypts password-protected Office documents via a password list.

    Loads candidate passwords from password_file once (cached on the
    scanner instance), tries each against the document, and re-submits
    the decrypted bytes when one works.
    """
    password_file = options.get('password_file', '/etc/strelka/passwords.dat')

    # Populate the shared password list only on first use.
    if not self.passwords and os.path.isfile(password_file):
        with open(password_file, 'rb') as fin:
            self.passwords.extend(line.strip() for line in fin)

    with io.BytesIO(data) as doc_io:
        msoff_doc = msoffcrypto.OfficeFile(doc_io)
        output_doc = io.BytesIO()
        password = ''
        extract_data = b''

        if msoff_doc.is_encrypted():
            self.flags.append('password_protected')
            for candidate in self.passwords:
                if password:
                    break
                try:
                    msoff_doc.load_key(password=candidate.decode('utf-8'))
                    output_doc.seek(0)
                    msoff_doc.decrypt(output_doc)
                    output_doc.seek(0)
                    if output_doc.readable():
                        extract_data = output_doc.read()
                        password = candidate.decode('utf-8')
                        break
                except Exception:
                    # Wrong password (or undecryptable form); try the next.
                    pass

            if password:
                self.event['password'] = password
                extract_file = strelka.File(source=self.name)
                for chunk in strelka.chunk_string(extract_data):
                    self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
                self.files.append(extract_file)
            else:
                self.flags.append('no_password_match_found')
def scan(self, data, file, options, expire_at):
    """Decodes a base64 payload and re-submits the decoded bytes.

    Robustness fix: invalid base64 no longer raises out of the scanner;
    it is flagged instead, matching the 'not_decodable_from_base64'
    flag used by the other base64 scanner in this file.
    """
    try:
        # binascii.Error is a ValueError subclass, so ValueError covers
        # every decode failure without needing a new import.
        decoded = base64.b64decode(data)
    except ValueError:
        self.flags.append('not_decodable_from_base64')
        return

    extract_file = strelka.File(source=self.name)
    for c in strelka.chunk_string(decoded):
        self.upload_to_coordinator(
            extract_file.pointer,
            c,
            expire_at,
        )
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Parses email messages.

    Records headers (optionally filtered by an allow-list) and
    re-submits each MIME part's decoded payload as a file.
    """
    headers = options.get('headers', [])
    self.event['total'] = {'parts': 0, 'extracted': 0}

    try:
        message = email.message_from_string(data.decode('UTF-8', 'replace'))

        self.event['headers'] = []
        for (header, value) in message.items():
            # When a header allow-list is configured, record only those.
            if headers and header not in headers:
                continue
            self.event['headers'].append({'header': header, 'value': value})

        self.event['parts'] = []
        for (index, part) in enumerate(message.walk()):
            self.event['total']['parts'] += 1
            payload = part.get_payload(decode=True)
            if payload is None:
                continue

            part_filename = part.get_filename()
            if part_filename is not None:
                extract_name = f'{part_filename}'
                self.event['parts'].append(part_filename)
            else:
                extract_name = f'part_{index}'

            extract_file = strelka.File(name=extract_name, source=self.name)
            # Route the part to downstream scanners by its MIME type.
            extract_file.add_flavors({'external': [part.get_content_type()]})
            for chunk in strelka.chunk_string(payload):
                self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
            self.files.append(extract_file)
            self.event['total']['extracted'] += 1
    except AssertionError:
        self.flags.append('assertion_error')
def _recurse_node(self, node, xml_args):
    """Recursively parses an XML node tree.

    Collects namespaces, tag names, and tag totals into the event;
    records text for tags listed in xml_args['metadata_tags'] and
    extracts text for tags listed in xml_args['extract_tags'].

    Args:
        node: node to be recursively parsed.
        xml_args: options set by the scanner that affect XML parsing.

    Bug fixes: the recursive call passed ``self`` twice to a bound
    method (TypeError on the first child), and ``getchildren()`` was
    removed from ElementTree in Python 3.9 — iterate the node directly.
    """
    if node is not None:
        if hasattr(node.tag, '__getitem__'):
            # Tags may carry a '{namespace}tag' prefix; split it off.
            if node.tag.startswith('{'):
                namespace, separator, tag = node.tag[1:].partition('}')
            else:
                namespace = None
                tag = node.tag

            self.event['total']['tags'] += 1
            if namespace not in self.event['namespaces']:
                self.event['namespaces'].append(namespace)
            if tag not in self.event['tags']:
                self.event['tags'].append(tag)

            # Prefer a 'name' attribute; fall back to the node text.
            text = node.attrib.get('name', node.text)
            if text is not None:
                if tag in xml_args['metadata_tags']:
                    tag_data = {'tag': tag, 'text': text.strip()}
                    if tag_data not in self.event['tag_data']:
                        self.event['tag_data'].append(tag_data)
                elif tag in xml_args['extract_tags']:
                    extract_file = strelka.File(
                        name=tag,
                        source=self.name,
                    )
                    for c in strelka.chunk_string(text):
                        self.upload_to_coordinator(
                            extract_file.pointer,
                            c,
                            self.expire_at,
                        )
                    self.files.append(extract_file)
                    self.event['total']['extracted'] += 1

        for child in list(node):
            self._recurse_node(child, xml_args)
    return
def scan(self, data, file, options, expire_at):
    """Decompresses gzip data and re-submits the result as a file."""
    with io.BytesIO(data) as gzip_io, gzip.GzipFile(fileobj=gzip_io) as gzip_obj:
        decompressed = gzip_obj.read()

    self.event['size'] = len(decompressed)

    extract_file = strelka.File(source=self.name)
    for chunk in strelka.chunk_string(decompressed):
        self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Inflates zlib-compressed data and re-submits the result."""
    inflated = zlib.decompress(data)
    self.event["size"] = len(inflated)

    extract_file = strelka.File(source=self.name)
    for chunk in strelka.chunk_string(inflated):
        self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Recovers the password of an encrypted Word document and decrypts it.

    Uses crack_word() (John the Ripper) to recover a password; on
    success the decrypted document is re-submitted.

    Fixes: the original ended in a dangling ``else:`` with no body (a
    syntax error) — it now records that no password was recovered,
    mirroring the ZIP cracker's flag — and the bare ``except:`` is
    narrowed to ``Exception``.
    """
    jtr_path = options.get('jtr_path', '/jtr/')
    tmp_directory = options.get('tmp_file_directory', '/tmp/')
    password_file = options.get('password_file', '/etc/strelka/passwords.dat')
    log_extracted_pws = options.get('log_pws', False)
    scanner_timeout = options.get('scanner_timeout', 150)
    brute = options.get('brute_force', False)
    max_length = options.get('max_length', 5)

    with io.BytesIO(data) as doc_io:
        msoff_doc = msoffcrypto.OfficeFile(doc_io)
        output_doc = io.BytesIO()

        if extracted_pw := crack_word(
            self,
            data,
            jtr_path,
            tmp_directory,
            brute=brute,
            scanner_timeout=scanner_timeout,
            max_length=max_length,
            password_file=password_file,
        ):
            if log_extracted_pws:
                self.event['cracked_password'] = extracted_pw
            try:
                msoff_doc.load_key(password=extracted_pw.decode('utf-8'))
                msoff_doc.decrypt(output_doc)
                output_doc.seek(0)
                extract_data = output_doc.read()
                output_doc.seek(0)

                extract_file = strelka.File(source=self.name)
                for c in strelka.chunk_string(extract_data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
            except Exception:
                self.flags.append(
                    'Could not decrypt document with recovered password')
        else:
            self.flags.append('Could not extract password')
def scan(self, data, file, options, expire_at):
    """Parses email messages.

    Records deduplicated, whitespace-normalized headers and re-submits
    each MIME part's decoded payload as a file.
    """
    self.event['total'] = {'parts': 0, 'extracted': 0}

    try:
        message = email.message_from_string(data.decode('UTF-8', 'replace'))

        self.event.setdefault('headers', [])
        for (key, value) in message.items():
            entry = {
                'header': key,
                'value': strelka.normalize_whitespace(value.strip()),
            }
            if entry not in self.event['headers']:
                self.event['headers'].append(entry)

        self.event.setdefault('parts', [])
        for (index, part) in enumerate(message.walk()):
            self.event['total']['parts'] += 1
            payload = part.get_payload(decode=True)
            if payload is None:
                continue

            part_filename = part.get_filename()
            if part_filename is not None:
                extract_name = f'{part_filename}'
                self.event['parts'].append(part_filename)
            else:
                extract_name = f'part_{index}'

            extract_file = strelka.File(name=extract_name, source=self.name)
            # Route the part to downstream scanners by its MIME type.
            extract_file.add_flavors({'external': [part.get_content_type()]})
            for chunk in strelka.chunk_string(payload):
                self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
            self.files.append(extract_file)
            self.event['total']['extracted'] += 1
    except AssertionError:
        self.flags.append('assertion_error')
def scan(self, data, file, options, expire_at):
    """Checks BMP files for data beyond the declared file size.

    The BMP header stores the total file size at bytes 2-5
    (little-endian); any bytes past that size are extracted as a
    trailer.
    """
    declared_size = int.from_bytes(data[2:6], "little")
    actual_size = len(data)

    if declared_size == actual_size:
        self.flags.append('no_trailer')
        return

    self.event['trailer_index'] = declared_size
    trailer = data[declared_size:]

    extract_file = strelka.File(source=self.name)
    for chunk in strelka.chunk_string(trailer):
        self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
    self.event['BMP_EOF'] = data[declared_size:]
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Checks JPEG files for data after the end-of-image marker.

    Any bytes past the final 0xFFD9 (EOI) marker are extracted as a
    trailer; files with no marker at all are flagged.
    """
    if data.endswith(b'\xff\xd9'):
        return

    trailer_index = data.rfind(b'\xff\xd9')
    if trailer_index == -1:
        self.flags.append('no_trailer')
        return

    trailer_data = data[trailer_index + 2:]
    if not trailer_data:
        return

    self.event['trailer_index'] = trailer_index
    extract_file = strelka.File(source=self.name)
    for chunk in strelka.chunk_string(trailer_data):
        self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Decompresses bzip2 data and re-submits the result as a file."""
    with io.BytesIO(data) as bzip2_io, bz2.BZ2File(filename=bzip2_io) as bzip2_obj:
        try:
            decompressed = bzip2_obj.read()
            self.event['size'] = len(decompressed)

            extract_file = strelka.File(source=self.name)
            for chunk in strelka.chunk_string(decompressed):
                self.upload_to_coordinator(extract_file.pointer, chunk, expire_at)
            self.files.append(extract_file)
        except EOFError:
            self.flags.append('eof_error')
        except OSError:
            self.flags.append('os_error')
def scan(self, data, file, options, expire_at):
    """Extracts files from tar archives, up to an optional limit.

    Fixes: ``tar_member.isfile`` was referenced without calling it — a
    bound method is always truthy, so non-file members were never
    filtered — and uploads now go through upload_to_coordinator like
    the other scanners in this file.
    """
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'files': 0, 'extracted': 0}

    with io.BytesIO(data) as tar_io:
        try:
            with tarfile.open(fileobj=tar_io) as tar_obj:
                tar_members = tar_obj.getmembers()
                self.event['total']['files'] = len(tar_members)
                for tar_member in tar_members:
                    if not tar_member.isfile():
                        continue
                    if self.event['total']['extracted'] >= file_limit:
                        break
                    try:
                        # extractfile() returns None for unreadable members.
                        tar_file = tar_obj.extractfile(tar_member)
                        if tar_file is not None:
                            extract_file = strelka.File(
                                name=tar_member.name,
                                source=self.name,
                            )
                            for c in strelka.chunk_string(tar_file.read()):
                                self.upload_to_coordinator(
                                    extract_file.pointer,
                                    c,
                                    expire_at,
                                )
                            self.files.append(extract_file)
                            self.event['total']['extracted'] += 1
                    except KeyError:
                        self.flags.append('key_error')
        except tarfile.ReadError:
            self.flags.append('tarfile_read_error')
def scan(self, data, file, options, expire_at):
    """Extracts certificates from PKCS#7 data via pyOpenSSL.

    DER vs PEM is detected from the first byte: 0x30 (ASCII '0') marks
    an ASN.1 SEQUENCE, i.e. DER.

    Consistency fix: uploads go through upload_to_coordinator like the
    other scanners in this file (the original still called the old
    upload_to_cache name).
    """
    self.event['total'] = {'certificates': 0, 'extracted': 0}

    if data[:1] == b'0':
        crypto_file_type = crypto.FILETYPE_ASN1
        self.event['cryptoType'] = 'der'
    else:
        crypto_file_type = crypto.FILETYPE_PEM
        self.event['cryptoType'] = 'pem'

    try:
        pkcs7 = crypto.load_pkcs7_data(crypto_file_type, data)
        pkcs7_certificates = pkcs7.get_certificates()
        if pkcs7_certificates is not None:
            self.event['total']['certificates'] = len(pkcs7_certificates)
            for certificate in pkcs7_certificates:
                extract_file = strelka.File(
                    name=f'sn_{certificate.get_serial_number()}',
                    source=self.name,
                )
                # Re-serialize in the same encoding the input used.
                extract_data = crypto.dump_certificate(
                    crypto_file_type,
                    certificate,
                )
                for c in strelka.chunk_string(extract_data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.event['total']['extracted'] += 1
    except crypto.Error:
        self.flags.append('load_pkcs7_error')
def scan(self, data, file, options, expire_at):
    """Runs Tesseract OCR on the sample and records/extracts the text.

    Fixes: the failure flag was a plain string containing literal
    '{tess_return}' (missing f-prefix), and the temporary .txt output
    is removed only if it exists — tesseract may not create it when it
    fails, which made the unconditional os.remove raise.
    """
    extract_text = options.get('extract_text', False)
    tmp_directory = options.get('tmp_directory', '/tmp/')

    with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
        tmp_data.write(data)
        tmp_data.flush()

        with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_tess:
            tess_return = subprocess.call(
                ['tesseract', tmp_data.name, tmp_tess.name],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            # Tesseract appends '.txt' to the output base name.
            tess_txt_name = f'{tmp_tess.name}.txt'

            if tess_return == 0:
                with open(tess_txt_name, 'rb') as tess_txt:
                    ocr_file = tess_txt.read()
                if ocr_file:
                    self.event['text'] = ocr_file.split()
                    if extract_text:
                        extract_file = strelka.File(
                            name='text',
                            source=self.name,
                        )
                        for c in strelka.chunk_string(ocr_file):
                            self.upload_to_coordinator(
                                extract_file.pointer,
                                c,
                                expire_at,
                            )
                        self.files.append(extract_file)
            else:
                self.flags.append(f'return_code_{tess_return}')

            if os.path.exists(tess_txt_name):
                os.remove(tess_txt_name)
def scan(self, data, file, options, expire_at):
    """Parses HTML documents.

    Collects the title, hyperlinks, forms, frames, inputs, scripts, and
    spans into the event (deduplicated), and re-submits inline script
    bodies as extracted files.

    Fixes: the input loop variable shadowed the ``input`` builtin, and
    uploads go through upload_to_coordinator like the other scanners in
    this file (the original still called the old upload_to_cache name).
    """
    parser = options.get('parser', 'html.parser')
    self.event['total'] = {
        'scripts': 0,
        'forms': 0,
        'inputs': 0,
        'frames': 0,
        'extracted': 0,
    }

    try:
        soup = bs4.BeautifulSoup(data, parser)

        if soup.title:
            self.event['title'] = strelka.normalize_whitespace(
                soup.title.text)

        hyperlinks = []
        hyperlinks.extend(soup.find_all('a', href=True))
        hyperlinks.extend(soup.find_all('img', src=True))
        self.event.setdefault('hyperlinks', [])
        for hyperlink in hyperlinks:
            link = hyperlink.get('href') or hyperlink.get('src')
            if link not in self.event['hyperlinks']:
                self.event['hyperlinks'].append(link)

        forms = soup.find_all('form')
        self.event['total']['forms'] = len(forms)
        self.event.setdefault('forms', [])
        for form in forms:
            form_entry = {
                'action': form.get('action'),
                'method': form.get('method'),
            }
            if form_entry not in self.event['forms']:
                self.event['forms'].append(form_entry)

        frames = []
        frames.extend(soup.find_all('frame'))
        frames.extend(soup.find_all('iframe'))
        self.event['total']['frames'] = len(frames)
        self.event.setdefault('frames', [])
        for frame in frames:
            frame_entry = {
                'src': frame.get('src'),
                'name': frame.get('name'),
                'height': frame.get('height'),
                'width': frame.get('width'),
                'border': frame.get('border'),
                'id': frame.get('id'),
                'style': frame.get('style'),
            }
            if frame_entry not in self.event['frames']:
                self.event['frames'].append(frame_entry)

        inputs = soup.find_all('input')
        self.event['total']['inputs'] = len(inputs)
        self.event.setdefault('inputs', [])
        for input_tag in inputs:
            input_entry = {
                'type': input_tag.get('type'),
                'name': input_tag.get('name'),
                'value': input_tag.get('value'),
            }
            if input_entry not in self.event['inputs']:
                self.event['inputs'].append(input_entry)

        scripts = soup.find_all('script')
        self.event['total']['scripts'] = len(scripts)
        self.event.setdefault('scripts', [])
        for (index, script) in enumerate(scripts):
            # Flavors route the extracted script to downstream scanners.
            script_flavors = [
                script.get('language', '').lower(),
                script.get('type', '').lower(),
            ]
            script_entry = {
                'src': script.get('src'),
                'language': script.get('language'),
                'type': script.get('type'),
            }
            if script_entry not in self.event['scripts']:
                self.event['scripts'].append(script_entry)

            if script.text:
                extract_file = strelka.File(
                    name=f'script_{index}',
                    source=self.name,
                )
                extract_file.add_flavors({'external': script_flavors})
                for c in strelka.chunk_string(script.text):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.event['total']['extracted'] += 1

        spans = soup.find_all('span')
        self.event['total']['spans'] = len(spans)
        self.event.setdefault('spans', [])
        for span in spans:
            span_entry = {
                'class': span.get('class'),
                'style': span.get('style'),
            }
            if span_entry not in self.event['spans']:
                self.event['spans'].append(span_entry)
    except TypeError:
        self.flags.append('type_error')
def scan(self, data, file, options, expire_at):
    """Collects rpm header metadata and extracts the embedded payload.

    Header keys are mapped to event fields; 'name' also supplies the
    extracted file's name, and 'description' is flattened to one line.
    """
    tmp_directory = options.get('tmp_directory', '/tmp/')

    # rpm header key -> event field for the simple pass-through cases.
    header_map = {
        'arch': 'architecture',
        'archive_compression': 'archive_compression',
        'archive_format': 'archive_format',
        'authors': 'authors',
        'buildhost': 'build_host',
        'buildtime': 'build_time',
        'copyright': 'copyright',
        'filenames': 'filenames',
        'group': 'group',
        'os': 'os',
        'packager': 'packager',
        'provides': 'provides',
        'release': 'release',
        'requirename': 'require_name',
        'rpmversion': 'rpm_version',
        'serial': 'serial',
        'sourcerpm': 'source_rpm',
        'summary': 'summary',
        'vendor': 'vendor',
        'version': 'version',
        'url': 'url',
    }

    with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
        tmp_data.write(data)
        tmp_data.flush()

        try:
            with rpmfile.open(tmp_data.name) as rpm_obj:
                extract_name = ''
                for (key, value) in rpm_obj.headers.items():
                    if key == 'name':
                        self.event['name'] = value
                        extract_name = f'{value.decode()}'
                    elif key == 'description':
                        # Collapse the multi-line description to one line.
                        if value is not None:
                            self.event['description'] = value.replace(b'\n', b' ')
                    elif key in header_map:
                        self.event[header_map[key]] = value

                extract_file = strelka.File(
                    name=extract_name,
                    source=self.name,
                )
                # The archive payload starts at data_offset in the raw data.
                for c in strelka.chunk_string(data[rpm_obj.data_offset:]):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
        except ValueError:
            self.flags.append('value_error')
def scan(self, data, file, options, expire_at):
    """Parses PDFs with PyMuPDF.

    Flags auto-actions and embedded JavaScript found in xref objects,
    re-submits stream objects into the analysis pipeline, and — when
    extract_text is set — collects annotated URIs and extracts page
    text.

    Fixes: xref objects were inspected with a separate 0-based counter
    while iterating xrefs from 1 (off-by-one; xref 0 is the reserved
    free-list head, not an object) — the loop's own ``xref`` is used
    now; the 'limit' option was read but never applied — stream
    resubmission now respects it; the bare ``except:`` around page
    parsing is narrowed to ``Exception``.
    """
    extract_text = options.get("extract_text", False)
    file_limit = options.get("limit", 2000)
    self.event["total"] = {"objects": 0, "extracted": 0}
    extracted_objects = set()

    try:
        with io.BytesIO(data) as pdf_io:
            # Open the data with PyMuPDF as an in-memory PDF.
            reader = fitz.open(stream=pdf_io, filetype="pdf")

            # Iterate through xrefs and inspect each object.
            xreflen = reader.xref_length()
            for xref in range(1, xreflen):
                xref_object = reader.xref_object(xref, compressed=False)

                # PDF annotation flags.
                if any(obj in xref_object for obj in ["/AA", "/OpenAction"]):
                    self.flags.append("auto_action")
                if any(obj in xref_object for obj in ["/JS", "/JavaScript"]):
                    self.flags.append("javascript_embedded")

                # PDF object resubmission: if the xref is a stream, add
                # that object back into the analysis pipeline.
                if reader.is_stream(xref):
                    try:
                        if (xref not in extracted_objects
                                and self.event["total"]["extracted"] < file_limit):
                            extract_file = strelka.File(
                                name=f"object_{xref}",
                                source=self.name,
                            )
                            for c in strelka.chunk_string(
                                    reader.xref_stream(xref)):
                                self.upload_to_coordinator(
                                    extract_file.pointer,
                                    c,
                                    expire_at,
                                )
                            self.files.append(extract_file)
                            self.event["total"]["extracted"] += 1
                            extracted_objects.add(xref)
                    except Exception:
                        self.flags.append("stream_read_exception")

            # Iterate through pages to collect links and text.
            # NOTE(review): link collection only runs when extract_text is
            # set, mirroring the original control flow — confirm intent.
            if extract_text:
                extracted_text = ""
                try:
                    for page in reader:
                        self.event.setdefault("annotated_uris", [])
                        links = page.get_links()
                        if links:
                            for link in links:
                                if "uri" in link:
                                    self.event["annotated_uris"].append(
                                        link["uri"])
                        extracted_text += page.getText()

                    # Caution: extracting text increases scan time and
                    # object storage size.
                    extract_file = strelka.File(
                        name="text",
                        source=self.name,
                    )
                    for c in strelka.chunk_string(extracted_text):
                        self.upload_to_coordinator(
                            extract_file.pointer,
                            c,
                            expire_at,
                        )
                    self.files.append(extract_file)
                    self.flags.append("extracted_text")
                except Exception:
                    self.flags.append("page_parsing_failure")
    except Exception:
        self.flags.append("pdf_load_error")