def recurse_add_res(self, file_res, res_list, new_files, parent=None):
    for res_dic in res_list:
        # Check if condition is OK
        if self.pass_condition(res_dic.get("condition", None)):
            res = ResultSection(res_dic['title_text'],
                                classification=res_dic.get('classification', Classification.UNRESTRICTED),
                                parent=parent,
                                body_format=res_dic.get('body_format', BODY_FORMAT.TEXT))
            heur_id = self.heuristic_alteration(res_dic.get('score_condition', None), res_dic['heur_id'])
            res.set_heuristic(heur_id)

            # Add Tags
            tags = res_dic.get('tags', [])
            for res_tag in tags:
                res.add_tag(res_tag[0], res_tag[1])

            # Add body
            body = res_dic.get('body', None)
            if body:
                res.set_body(body)

            # File for resubmit
            files = res_dic.get('files', [])
            for res_file in files:
                if isinstance(res_file, tuple):
                    res_file = res_file[1]
                new_files.append(res_file)

            # Add to file res if root result
            if parent is None:
                file_res.add_section(res)
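# For reference, a minimal sketch of the res_list shape recurse_add_res walks.
# The field names come from the lookups above; the concrete values below are
# illustrative assumptions only, not the output of any real parser.
EXAMPLE_RES_LIST = [
    {
        "title_text": "Suspicious XML entities",       # required
        "heur_id": 4,                                   # required
        "condition": None,                              # checked via self.pass_condition
        "score_condition": None,                        # checked via self.heuristic_alteration
        "tags": [("file.string.extracted", "ENTITY")],  # (tag_type, value) pairs
        "body": "Found 3 suspicious entities.",
        "files": [("path", "/tmp/carved_0.bin")],       # tuples resolve to element [1]
    }
]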
def test_init(mocker):
    from assemblyline_v4_service.common.result import ResultSection
    mocker.patch("assemblyline_v4_service.common.api.ServiceAPIError")
    from metadefender import AvErrorSection

    av_name = "blah"
    engine = {}
    actual_res_sec = AvErrorSection(av_name, engine)
    correct_result_section = ResultSection(f"{av_name} failed to scan the file")
    correct_result_section.set_body("")
    assert check_section_equality(actual_res_sec, correct_result_section)

    engine = {"version": "blah", "def_time": "blah"}
    actual_res_sec = AvErrorSection(av_name, engine)
    correct_result_section = ResultSection(f"{av_name} failed to scan the file")
    correct_result_section.set_body(f"Engine: {engine['version']} :: Definition: {engine['def_time']}")
    assert check_section_equality(actual_res_sec, correct_result_section)
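# The tests in this file rely on a check_section_equality helper that is not
# shown in this section. A minimal sketch of such a comparator follows; the
# attribute names follow the assemblyline_v4_service ResultSection/Heuristic
# API, but treat this as an assumption, not the project's actual helper.
def check_section_equality(this, that) -> bool:
    # Compare only the fields the assertions above exercise.
    same_heuristic = (this.heuristic is None and that.heuristic is None) or (
        this.heuristic is not None
        and that.heuristic is not None
        and this.heuristic.heur_id == that.heuristic.heur_id
        and this.heuristic.signatures == that.heuristic.signatures
    )
    return (
        this.title_text == that.title_text
        and this.body == that.body
        and same_heuristic
        and this.tags == that.tags
    )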
def subsection_builder(parent_section: ResultSection = None, fields: dict = None):
    # Use None instead of a mutable {} default
    fields = fields or {}
    for mwcp_field, mwcp_field_data in fields.items():
        if mwcp_field in FIELD_TAG_MAP:
            tag = FIELD_TAG_MAP[mwcp_field]
            table_body = []
            table_section = ResultSection(f"Extracted {mwcp_field.capitalize()}")
            if tag:
                for x in mwcp_field_data:
                    table_section.add_tag(tag, x)  # Tag everything that we can
            # Add data to section body
            for line in mwcp_field_data:
                if isinstance(line, str):
                    table_body.append({mwcp_field: line})
                elif isinstance(line, list):
                    for item in line:
                        table_body.append({mwcp_field: item})
            table_section.set_body(body_format=BODY_FORMAT.TABLE, body=json.dumps(table_body))
            parent_section.add_subsection(table_section)
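# A sketch of how subsection_builder might be driven. FIELD_TAG_MAP is assumed
# to map MWCP field names to Assemblyline tag types (e.g. "address" ->
# "network.dynamic.ip"); the field name and values below are hypothetical,
# for illustration only.
def _demo_subsection_builder():
    parent = ResultSection("MWCP: ExampleParser")
    # Mixed str/list values are supported: nested lists are flattened into rows.
    subsection_builder(parent, {"address": ["10.0.0.1", ["10.0.0.2", "10.0.0.3"]]})
    # -> one "Extracted Address" subsection, each value tagged, with a TABLE
    #    body of {"address": value} rows.
    return parent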
def execute(self, request):
    parser = eml_parser.eml_parser.EmlParser(include_raw_body=True, include_attachment_data=True)
    content_str = request.file_contents

    # Attempt conversion of potential Outlook file -> eml
    if request.file_type == "document/office/email":
        try:
            content_str = msg2eml(request.file_path).as_bytes()
        except Exception:
            # Try using mailparser to convert
            converted_path, _ = msgconvert(request.file_path)
            with open(converted_path, "rb") as f:
                content_str = f.read()

    header_agg = {"From": set(), "To": set(), "Cc": set(), "Sent": set(), "Reply-To": set(), "Date": set()}

    # Assume this is an email saved in HTML format
    if request.file_type == "code/html":
        parsed_html = BeautifulSoup(content_str, "lxml")
        valid_headers = ["To:", "Cc:", "Sent:", "From:", "Subject:", "Reply-To:"]

        if not parsed_html.body or not any(header in parsed_html.body.text for header in valid_headers):
            # We can assume this is just an HTML doc (or lacking body), one of which we can't process
            request.result = Result()
            return

        # Can't trust 'Date' to determine the difference between HTML docs vs HTML emails
        valid_headers.append("Date:")

        html_email = email.message_from_bytes(content_str)
        generator_metadata_content = ""
        for meta in parsed_html.find_all("meta"):
            if meta.attrs.get("name", None) == "Generator":
                generator_metadata_content = meta.attrs.get("content", "")
                break

        # Process HTML emails generated from Outlook
        if generator_metadata_content == "Microsoft Word 15":
            paragraphs = parsed_html.body.find_all("p")
            # Likely an email that was exported with original email headers
            if any(header in paragraphs[0] for header in valid_headers):
                for p in paragraphs:
                    if any(valid_header in p.text for valid_header in valid_headers):
                        h_key, h_value = p.text.replace("\xa0", "").replace("\r\n", " ").split(":", 1)
                        html_email[h_key] = h_value
                        # Subject line indicates the end of the email header, beginning of body
                        if "Subject" in p.text:
                            break
        # Process HTML emails from MS Exchange Server or missing top-level headers (aggregate headers)
        elif generator_metadata_content in ("Microsoft Word 15 (filtered medium)",
                                            "Microsoft Exchange Server", ""):
            subject = None
            for div in parsed_html.find_all("div"):
                # Header information within divs
                if any(header in div.text for header in valid_headers) \
                        and "WordSection1" not in div.attrs.get("class", []):
                    # Usually expect headers to be \n separated in text output but check first
                    if "\n" in div.text:
                        for h in div.text.split("\n"):
                            if any(header in h for header in valid_headers):
                                h_key, h_value = h.split(":", 1)
                                # Implying some malformed message got mixed with the headers of another message
                                if h_key not in valid_headers:
                                    for header in valid_headers:
                                        if header in h:
                                            h_key = header[:-1]
                                # Use the latest message's subject (this maintains FW, RE, etc.)
                                if h_key == "Subject" and not subject:
                                    subject = h_value
                                elif h_key != "Subject":
                                    header_agg[h_key].add(h_value)
                    # Document was probably not well formatted, so we'll use the headers as delimiters
                    else:
                        header_offset_map = {}
                        # Determine the position of each header
                        for header in list(header_agg.keys()) + ["Subject"]:
                            if header in div.text:
                                header_offset_map[div.text.index(header)] = header
                        # Use the positions and length of header name to determine an offset
                        for i in range(len(header_offset_map)):
                            sorted_keys = sorted(header_offset_map.keys())
                            header_name = header_offset_map[sorted_keys[i]]
                            offset = len(f"{header_name}: ") + sorted_keys[i]
                            value = (div.text[offset:sorted_keys[i + 1]]
                                     if i < len(header_offset_map) - 1 else div.text[offset:])
                            if header_name == "Subject":
                                subject = value
                            else:
                                header_agg[header_name].add(value)

            # Assign aggregated info to email object
            html_email["Subject"] = subject
            for key, value in header_agg.items():
                html_email[key] = "; ".join(value)
        content_str = html_email.as_bytes()

    parsed_eml = parser.decode_email_bytes(content_str)
    result = Result()
    header = parsed_eml["header"]

    if "from" in header or "to" in header:
        all_uri = set()
        body_words = set(extract_passwords(header["subject"]))
        for body_counter, body in enumerate(parsed_eml["body"]):
            body_text = BeautifulSoup(body["content"], "lxml").text
            body_words.update(extract_passwords(body_text))
            if request.get_param("extract_body_text"):
                fd, path = mkstemp()
                with open(path, "w") as f:
                    f.write(body["content"])
                os.close(fd)
                request.add_extracted(path, "body_" + str(body_counter), "Body text")
            if "uri" in body:
                for uri in body["uri"]:
                    all_uri.add(uri)
        # Words in the email body, used by extract to guess passwords
        request.temp_submission_data["email_body"] = list(body_words)

        kv_section = ResultSection("Email Headers", body_format=BODY_FORMAT.KEY_VALUE, parent=result)

        # Basic tags
        from_addr = header["from"].strip() if header.get("from", None) else None
        if from_addr and re.match(EMAIL_REGEX, from_addr):
            kv_section.add_tag("network.email.address", from_addr)
        for to in header["to"]:
            if re.match(EMAIL_REGEX, to.strip()):
                kv_section.add_tag("network.email.address", to.strip())

        kv_section.add_tag("network.email.date", str(header["date"]).strip())

        subject = header["subject"].strip() if header.get("subject", None) else None
        if subject:
            kv_section.add_tag("network.email.subject", subject)

        # Add CCs to body and tags
        if "cc" in header:
            for cc in header["cc"]:
                if re.match(EMAIL_REGEX, cc.strip()):
                    kv_section.add_tag("network.email.address", cc.strip())

        # Add Message ID to body and tags
        if "message-id" in header["header"]:
            kv_section.add_tag("network.email.msg_id", header["header"]["message-id"][0].strip())

        # Add Tags for received IPs
        if "received_ip" in header:
            for ip in header["received_ip"]:
                ip = ip.strip()
                try:
                    if isinstance(ip_address(ip), IPv4Address):
                        kv_section.add_tag("network.static.ip", ip)
                except ValueError:
                    pass

        # Add Tags for received Domains
        if "received_domain" in header:
            for dom in header["received_domain"]:
                kv_section.add_tag("network.static.domain", dom.strip())

        # If we've found URIs, add them to a section
        if len(all_uri) > 0:
            uri_section = ResultSection("URIs Found:", parent=result)
            for uri in all_uri:
                uri_section.add_line(uri)
                uri_section.add_tag("network.static.uri", uri.strip())
                parsed_url = urlparse(uri)
                if parsed_url.hostname and re.match(IP_ONLY_REGEX, parsed_url.hostname):
                    uri_section.add_tag("network.static.ip", parsed_url.hostname)
                else:
                    uri_section.add_tag("network.static.domain", parsed_url.hostname)

        # Bring all headers together...
        extra_header = header.pop("header", {})
        header.pop("received", None)
        header.update(extra_header)

        # Convert to common format
        header["date"] = [self.json_serial(header["date"])]

        # Replace with aggregated date(s) if any available
        if header_agg["Date"]:
            # Replace
            if any(default_date in header["date"]
                   for default_date in ["1970-01-01T00:00:00", "Thu, 01 Jan 1970 00:00:00 +0000"]):
                header["date"] = list(header_agg["Date"])
            # Append
            else:
                header["date"] += list(header_agg["Date"])
            for date in header_agg["Date"]:
                kv_section.add_tag("network.email.date", str(date).strip())

        # Filter out useless headers from results
        self.log.debug(header.keys())
        for h in self.header_filter:
            if h in header.keys():
                header.pop(h)
        kv_section.set_body(json.dumps(header, default=self.json_serial))

        attachments_added = []
        if "attachment" in parsed_eml:
            attachments = parsed_eml["attachment"]
            for attachment in attachments:
                fd, path = mkstemp()
                with open(path, "wb") as f:
                    f.write(base64.b64decode(attachment["raw"]))
                os.close(fd)
                try:
                    if request.add_extracted(path, attachment["filename"], "Attachment",
                                             safelist_interface=self.api_interface):
                        attachments_added.append(attachment["filename"])
                except MaxExtractedExceeded:
                    self.log.warning(f"Extract limit reached on attachments: "
                                     f"{len(attachments) - len(attachments_added)} not added")
                    break
            ResultSection("Extracted Attachments:", body="\n".join(attachments_added), parent=result)

        if request.get_param("save_emlparser_output"):
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            attachments = parsed_eml.get("attachment", [])
            # Remove raw attachments, all attachments up to MaxExtractedExceeded already extracted
            for attachment in attachments:
                _ = attachment.pop("raw", None)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(parsed_eml, default=self.json_serial))
            request.add_supplementary(temp_path, "parsing.json",
                                      "These are the raw results of running GOVCERT-LU's eml_parser")
    else:
        self.log.warning("emlParser could not parse EML; no useful information in result's headers")

    request.result = result
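# execute() above leans on self.json_serial as the json.dumps default for
# non-serializable header values (dates in particular). A minimal sketch of
# the usual pattern follows; this is an assumption about the helper, not
# necessarily the service's exact implementation (it would live as a method
# on the service class).
from datetime import datetime

def json_serial(obj):
    # Render datetimes as ISO-8601 strings and fall back to str() so that
    # json.dumps(header, default=self.json_serial) never raises.
    if isinstance(obj, datetime):
        return obj.isoformat()
    return str(obj)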
def peepdf_analysis(self, temp_filename, file_content, request):
    file_res = Result()
    try:
        res_list = []
        # js_stream = []
        f_list = []
        js_dump = []

        pdf_parser = PDFParser()
        ret, pdf_file = pdf_parser.parse(temp_filename, True, False, file_content)
        if ret == 0:
            stats_dict = pdf_file.getStats()

            if ", ".join(stats_dict['Errors']) == "Bad PDF header, %%EOF not found, PDF sections not found, No " \
                                                  "indirect objects found in the body":
                # Not a PDF
                return

            json_body = dict(
                version=stats_dict['Version'],
                binary=stats_dict['Binary'],
                linearized=stats_dict['Linearized'],
                encrypted=stats_dict['Encrypted'],
            )

            if stats_dict['Encryption Algorithms']:
                temp = []
                for algorithmInfo in stats_dict['Encryption Algorithms']:
                    temp.append(f"{algorithmInfo[0]} {str(algorithmInfo[1])} bits")
                json_body["encryption_algorithms"] = temp

            json_body.update(dict(
                updates=stats_dict['Updates'],
                objects=stats_dict['Objects'],
                streams=stats_dict['Streams'],
                comments=stats_dict['Comments'],
                errors={True: ", ".join(stats_dict['Errors']),
                        False: "None"}[len(stats_dict['Errors']) != 0]
            ))
            res = ResultSection("PDF File Information", body_format=BODY_FORMAT.KEY_VALUE,
                                body=json.dumps(json_body))

            for version in range(len(stats_dict['Versions'])):
                stats_version = stats_dict['Versions'][version]
                v_json_body = dict(
                    catalog=stats_version['Catalog'] or "no",
                    info=stats_version['Info'] or "no",
                    objects=self.list_first_x(stats_version['Objects'][1]),
                )

                if stats_version['Compressed Objects'] is not None:
                    v_json_body['compressed_objects'] = self.list_first_x(stats_version['Compressed Objects'][1])

                if stats_version['Errors'] is not None:
                    v_json_body['errors'] = self.list_first_x(stats_version['Errors'][1])

                v_json_body['streams'] = self.list_first_x(stats_version['Streams'][1])

                if stats_version['Xref Streams'] is not None:
                    v_json_body['xref_streams'] = self.list_first_x(stats_version['Xref Streams'][1])

                if stats_version['Object Streams'] is not None:
                    v_json_body['object_streams'] = self.list_first_x(stats_version['Object Streams'][1])

                if int(stats_version['Streams'][0]) > 0:
                    v_json_body['encoded'] = self.list_first_x(stats_version['Encoded'][1])
                    if stats_version['Decoding Errors'] is not None:
                        v_json_body['decoding_errors'] = self.list_first_x(stats_version['Decoding Errors'][1])

                if stats_version['Objects with JS code'] is not None:
                    v_json_body['objects_with_js_code'] = \
                        self.list_first_x(stats_version['Objects with JS code'][1])
                    # js_stream.extend(stats_version['Objects with JS code'][1])

                res_version = ResultSection(f"Version {str(version)}", parent=res,
                                            body_format=BODY_FORMAT.KEY_VALUE,
                                            body=json.dumps(v_json_body))

                actions = stats_version['Actions']
                events = stats_version['Events']
                vulns = stats_version['Vulns']
                elements = stats_version['Elements']
                is_suspicious = False
                if events is not None or actions is not None or vulns is not None or elements is not None:
                    res_suspicious = ResultSection('Suspicious elements', parent=res_version)
                    if events is not None:
                        for event in events:
                            res_suspicious.add_line(f"{event}: {self.list_first_x(events[event])}")
                        is_suspicious = True
                    if actions is not None:
                        for action in actions:
                            res_suspicious.add_line(f"{action}: {self.list_first_x(actions[action])}")
                        is_suspicious = True
                    if vulns is not None:
                        for vuln in vulns:
                            if vuln in vulnsDict:
                                temp = [vuln, ' (']
                                for vulnCVE in vulnsDict[vuln]:
                                    if len(temp) != 2:
                                        temp.append(',')
                                    vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                    temp.append(vulnCVE)
                                    cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                    if cve_found:
                                        res_suspicious.add_tag('attribution.exploit',
                                                               vulnCVE[cve_found.start():cve_found.end()])
                                        res_suspicious.add_tag('file.behavior',
                                                               vulnCVE[cve_found.start():cve_found.end()])
                                temp.append('): ')
                                temp.append(str(vulns[vuln]))
                                res_suspicious.add_line(temp)
                            else:
                                res_suspicious.add_line(f"{vuln}: {str(vulns[vuln])}")
                            is_suspicious = True
                    if elements is not None:
                        for element in elements:
                            if element in vulnsDict:
                                temp = [element, ' (']
                                for vulnCVE in vulnsDict[element]:
                                    if len(temp) != 2:
                                        temp.append(',')
                                    vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                    temp.append(vulnCVE)
                                    cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                    if cve_found:
                                        res_suspicious.add_tag('attribution.exploit',
                                                               vulnCVE[cve_found.start():cve_found.end()])
                                        res_suspicious.add_tag('file.behavior',
                                                               vulnCVE[cve_found.start():cve_found.end()])
                                temp.append('): ')
                                temp.append(str(elements[element]))
                                res_suspicious.add_line(temp)
                                is_suspicious = True
                            else:
                                res_suspicious.add_line(f"\t\t{element}: {str(elements[element])}")
                                is_suspicious = True
                    if is_suspicious:
                        res_suspicious.set_heuristic(8)

                urls = stats_version['URLs']
                if urls is not None:
                    res.add_line("")
                    res_url = ResultSection('Found URLs', parent=res)
                    for url in urls:
                        res_url.add_line(f"\t\t{url}")
                    res_url.set_heuristic(9)

                for obj in stats_version['Objects'][1]:
                    cur_obj = pdf_file.getObject(obj, version)

                    if cur_obj.containsJScode:
                        cur_res = ResultSection(f"Object [{obj} {version}] contains "
                                                f"{len(cur_obj.JSCode)} block(s) of JavaScript")
                        score_modifier = 0

                        js_idx = 0
                        for js in cur_obj.JSCode:
                            sub_res = ResultSection('Block of JavaScript', parent=cur_res)
                            js_idx += 1
                            js_score = 0
                            js_code, unescaped_bytes, _, _, _ = analyseJS(js)

                            js_dump += [x for x in js_code]

                            # Malicious characteristics: 500 points per oversized buffer found
                            big_buffs = self.get_big_buffs("".join(js_code))
                            if len(big_buffs) > 0:
                                js_score += 500 * len(big_buffs)
                            has_eval, has_unescape = self.check_dangerous_func("".join(js_code))
                            if has_unescape:
                                js_score += 100
                            if has_eval:
                                js_score += 100

                            js_cmt = ""
                            if has_eval or has_unescape or len(big_buffs) > 0:
                                score_modifier += js_score
                                js_cmt = "Suspiciously malicious "
                                cur_res.add_tag('file.behavior', "Suspicious JavaScript in PDF")
                                sub_res.set_heuristic(7)
                            js_res = ResultSection(f"{js_cmt}JavaScript Code (block: {js_idx})", parent=sub_res)

                            if js_score > 0:
                                temp_js_outname = f"object{obj}-{version}_{js_idx}.js"
                                temp_js_path = os.path.join(self.working_directory, temp_js_outname)
                                temp_js_bin = "".join(js_code).encode("utf-8")
                                with open(temp_js_path, "wb") as f:
                                    f.write(temp_js_bin)
                                f_list.append(temp_js_path)

                                js_res.add_line(f"The JavaScript block was saved as {temp_js_outname}")

                                if has_eval or has_unescape:
                                    analysis_res = ResultSection("[Suspicious Functions]", parent=js_res)
                                    if has_eval:
                                        analysis_res.add_line("eval: This JavaScript block uses the eval() "
                                                              "function, which is often used to launch "
                                                              "deobfuscated JavaScript code.")
                                        analysis_res.set_heuristic(3)
                                    if has_unescape:
                                        analysis_res.add_line("unescape: This JavaScript block uses the "
                                                              "unescape() function. It may be legitimate, but "
                                                              "it is definitely suspicious since malware often "
                                                              "uses this to deobfuscate code blocks.")
                                        analysis_res.set_heuristic(3)

                                buff_idx = 0
                                for buff in big_buffs:
                                    buff_idx += 1
                                    error, new_buff = unescape(buff)
                                    if error == 0:
                                        buff = new_buff

                                    if buff not in unescaped_bytes:
                                        temp_path_name = None
                                        if ";base64," in buff[:100] and "data:" in buff[:100]:
                                            temp_path_name = f"obj{obj}_unb64_{buff_idx}.buff"
                                            try:
                                                buff = b64decode(buff.split(";base64,")[1].strip())
                                                temp_path = os.path.join(self.working_directory, temp_path_name)
                                                with open(temp_path, "wb") as f:
                                                    f.write(buff)
                                                f_list.append(temp_path)
                                            except Exception:
                                                self.log.error("Found 'data:;base64, ' buffer "
                                                               "but failed to base64 decode.")
                                                temp_path_name = None

                                        if temp_path_name is not None:
                                            buff_cond = f" and was resubmitted as {temp_path_name}"
                                        else:
                                            buff_cond = ""
                                        buff_res = ResultSection(
                                            f"A {len(buff)} bytes buffer was found in the JavaScript "
                                            f"block{buff_cond}. Here are the first 256 bytes.",
                                            parent=js_res,
                                            body=hexdump(buff[:256] if isinstance(buff, bytes)
                                                         else buff[:256].encode("utf-8")),
                                            body_format=BODY_FORMAT.MEMORY_DUMP)
                                        buff_res.set_heuristic(2)

                            processed_sc = []
                            sc_idx = 0
                            for sc in unescaped_bytes:
                                if sc not in processed_sc:
                                    sc_idx += 1
                                    processed_sc.append(sc)

                                    try:
                                        # Legacy Python 2 codec; the try/except makes this a no-op on Python 3
                                        sc = sc.decode("hex")
                                    except Exception:
                                        pass

                                    shell_score = 500
                                    temp_path_name = f"obj{obj}_unescaped_{sc_idx}.buff"

                                    shell_res = ResultSection(f"Unknown unescaped {len(sc)} bytes JavaScript "
                                                              f"buffer (id: {sc_idx}) was resubmitted as "
                                                              f"{temp_path_name}. Here are the first 256 bytes.",
                                                              parent=js_res)
                                    shell_res.set_body(hexdump(sc[:256]), body_format=BODY_FORMAT.MEMORY_DUMP)

                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                    with open(temp_path, "wb") as f:
                                        f.write(sc)
                                    f_list.append(temp_path)

                                    cur_res.add_tag('file.behavior', "Unescaped JavaScript Buffer")
                                    shell_res.set_heuristic(6)
                                    score_modifier += shell_score

                        if score_modifier > 0:
                            res_list.append(cur_res)

                    elif cur_obj.type == "stream":
                        if cur_obj.isEncodedStream and cur_obj.filter is not None:
                            data = cur_obj.decodedStream
                            encoding = cur_obj.filter.value.replace("[", "").replace("]", "").replace("/", "").strip()
                        else:
                            data = cur_obj.rawStream
                            encoding = None
                        val = cur_obj.rawValue
                        otype = cur_obj.elements.get("/Type", None)
                        sub_type = cur_obj.elements.get("/Subtype", None)
                        length = cur_obj.elements.get("/Length", None)

                        if otype:
                            otype = otype.value.replace("/", "").lower()
                        if sub_type:
                            sub_type = sub_type.value.replace("/", "").lower()
                        if length:
                            length = length.value

                        if otype == "embeddedfile":
                            if len(data) > 4096:
                                if encoding is not None:
                                    temp_encoding_str = f"_{encoding}"
                                else:
                                    temp_encoding_str = ""

                                cur_res = ResultSection(
                                    f'Embedded file found ({length} bytes) [obj: {obj} {version}] '
                                    f'and dumped for analysis {f"(Type: {otype}) " if otype is not None else ""}'
                                    f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                    f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                )
                                temp_path_name = f"EmbeddedFile_{obj}{temp_encoding_str}.obj"
                                temp_path = os.path.join(self.working_directory, temp_path_name)
                                with open(temp_path, "wb") as f:
                                    f.write(data)
                                f_list.append(temp_path)
                                cur_res.add_line(f"The EmbeddedFile object was saved as {temp_path_name}")
                                res_list.append(cur_res)

                        elif otype not in BANNED_TYPES:
                            cur_res = ResultSection(
                                f'Unknown stream found [obj: {obj} {version}] '
                                f'{f"(Type: {otype}) " if otype is not None else ""}'
                                f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                            )
                            for line in val.splitlines():
                                cur_res.add_line(line)

                            emb_res = ResultSection('First 256 bytes', parent=cur_res)
                            first_256 = data[:256]
                            if isinstance(first_256, str):
                                first_256 = first_256.encode()
                            emb_res.set_body(hexdump(first_256), BODY_FORMAT.MEMORY_DUMP)
                            res_list.append(cur_res)
                    else:
                        pass

            file_res.add_section(res)

            for results in res_list:
                file_res.add_section(results)

            if js_dump:
                js_dump_res = ResultSection('Full JavaScript dump')

                temp_js_dump = "javascript_dump.js"
                temp_js_dump_path = os.path.join(self.working_directory, temp_js_dump)
                # errors="replace" keeps lone surrogates from raising UnicodeEncodeError
                temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8", errors="replace")
                temp_js_dump_sha1 = hashlib.sha1(temp_js_dump_bin).hexdigest()
                with open(temp_js_dump_path, "wb") as f:
                    f.write(temp_js_dump_bin)
                f_list.append(temp_js_dump_path)

                js_dump_res.add_line(f"The JavaScript dump was saved as {temp_js_dump}")
                js_dump_res.add_line(f"The SHA-1 for the JavaScript dump is {temp_js_dump_sha1}")

                js_dump_res.add_tag('file.pdf.javascript.sha1', temp_js_dump_sha1)
                file_res.add_section(js_dump_res)

            for filename in f_list:
                request.add_extracted(filename, os.path.basename(filename),
                                      f"Dumped from {os.path.basename(temp_filename)}")

        else:
            res = ResultSection("ERROR: Could not parse file with PeePDF.")
            file_res.add_section(res)
    finally:
        request.result = file_res
        try:
            del pdf_file
        except Exception:
            pass

        try:
            del pdf_parser
        except Exception:
            pass

        gc.collect()
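# peepdf_analysis() calls two helpers that are not shown in this section:
# get_big_buffs and check_dangerous_func. A plausible minimal sketch of the
# latter follows -- an assumption for illustration, not the service's actual
# code (in practice it would be a method on the service class).
import re

def check_dangerous_func(js_code: str):
    # Flag the two constructs the scoring above cares about: eval() and
    # unescape() calls anywhere in the JavaScript source.
    has_eval = re.search(r"\beval\s*\(", js_code) is not None
    has_unescape = re.search(r"\bunescape\s*\(", js_code) is not None
    return has_eval, has_unescape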
def test_init(mocker):
    from json import dumps
    from assemblyline_v4_service.common.result import BODY_FORMAT, ResultSection
    mocker.patch("assemblyline_v4_service.common.api.ServiceAPIError")
    from metadefender import AvHitSection

    av_name = "blah"
    virus_name = "blah"
    engine = {}
    heur_id = 1
    sig_score_rev_map = {}
    kw_score_rev_map = {}
    safelist_match = []
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map, safelist_match)
    correct_result_section = ResultSection(f"{av_name} identified the file as {virus_name}")
    correct_result_section.set_heuristic(1)
    correct_result_section.heuristic.add_signature_id(f"{av_name}.{virus_name}")
    correct_result_section.add_tag("av.virus_name", virus_name)
    correct_result_section.set_body(
        dumps({
            "av_name": av_name,
            "virus_name": virus_name,
            "scan_result": "infected",
            "engine_version": "unknown",
            "engine_definition_time": "unknown"
        }), BODY_FORMAT.KEY_VALUE)
    assert check_section_equality(actual_res_sec, correct_result_section)

    engine = {"version": "blah", "def_time": 1}
    heur_id = 2
    safelist_match = ["blah"]
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map, safelist_match)
    correct_result_section = ResultSection(f"{av_name} identified the file as {virus_name}")
    correct_result_section.add_tag("av.virus_name", virus_name)
    correct_result_section.set_heuristic(2)
    correct_result_section.heuristic.add_signature_id(f"{av_name}.{virus_name}", 0)
    correct_result_section.set_body(
        dumps({
            "av_name": av_name,
            "virus_name": virus_name,
            "scan_result": "suspicious",
            "engine_version": "blah",
            "engine_definition_time": 1
        }), BODY_FORMAT.KEY_VALUE)
    assert check_section_equality(actual_res_sec, correct_result_section)

    kw_score_rev_map = {"bla": 1}
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map, safelist_match)
    correct_result_section = ResultSection(f"{av_name} identified the file as {virus_name}")
    correct_result_section.add_tag("av.virus_name", virus_name)
    correct_result_section.set_heuristic(2)
    correct_result_section.heuristic.add_signature_id(f"{av_name}.{virus_name}", 1)
    correct_result_section.set_body(
        dumps({
            "av_name": av_name,
            "virus_name": virus_name,
            "scan_result": "suspicious",
            "engine_version": "blah",
            "engine_definition_time": 1
        }), BODY_FORMAT.KEY_VALUE)
    assert check_section_equality(actual_res_sec, correct_result_section)

    kw_score_rev_map = {"bla": 1, "h": 2}
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map, safelist_match)
    correct_result_section = ResultSection(f"{av_name} identified the file as {virus_name}")
    correct_result_section.add_tag("av.virus_name", virus_name)
    correct_result_section.set_heuristic(2)
    correct_result_section.heuristic.add_signature_id(f"{av_name}.{virus_name}", 2)
    correct_result_section.set_body(
        dumps({
            "av_name": av_name,
            "virus_name": virus_name,
            "scan_result": "suspicious",
            "engine_version": "blah",
            "engine_definition_time": 1
        }), BODY_FORMAT.KEY_VALUE)
    assert check_section_equality(actual_res_sec, correct_result_section)

    sig_score_rev_map = {f"{av_name}.{virus_name}": 10}
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map, safelist_match)
    correct_result_section = ResultSection(f"{av_name} identified the file as {virus_name}")
    correct_result_section.add_tag("av.virus_name", virus_name)
    correct_result_section.set_heuristic(2)
    correct_result_section.heuristic.add_signature_id(f"{av_name}.{virus_name}", 10)
    correct_result_section.set_body(
        dumps({
            "av_name": av_name,
            "virus_name": virus_name,
            "scan_result": "suspicious",
            "engine_version": "blah",
            "engine_definition_time": 1
        }), BODY_FORMAT.KEY_VALUE)
    assert check_section_equality(actual_res_sec, correct_result_section)
def analyze_pdf(self, request, res_txt, path, working_dir, heur, additional_keywords, get_malform=True):
    """Extract metadata, keyword objects and content of interest from a PDF sample using PDFId, PDFId plugins,
    and PDF Parser.

    Args:
        request: AL request object.
        res_txt: Header string for AL result section title.
        path: Original PDF sample path.
        working_dir: AL working directory.
        heur: List of plugins to run on PDFId results (provided in service configuration).
        additional_keywords: List of additional keywords to be searched (provided in service configuration).
        get_malform: Extract malformed objects from PDF.

    Returns:
        AL result object, list of object streams (objstms), and an errors set.
    """
    triage_keywords = set()
    all_errors = set()
    embed_present = False
    objstms = False
    res = ResultSection(title_text=res_txt)
    carved_extracted_shas = set()

    if request.deep_scan:
        run_pdfparse = True
    else:
        run_pdfparse = False

    # Run PDFId
    try:
        pdfid_result, errors = self.get_pdfid([path], additional_keywords, heur, request.deep_scan)
    except Exception as e:
        raise NonRecoverableError(e)

    # Parse PDFId results
    pdfidres = ResultSection(title_text="PDFID Results", parent=res)
    if len(pdfid_result) == 0:
        pdfidres.add_line("No results generated for file. Please see errors.")
    else:
        # Do not run for objstms, which are being analyzed when get_malform == False
        if get_malform:
            version = pdfid_result.get("PDFID", None)
            if version:
                pdfidres.add_line(version)
            properties = pdfid_result.get("Properties", None)
            if properties:
                pres = ResultSection(title_text="PDF Properties", parent=pdfidres)
                for k, v in properties.items():
                    pres.add_line(f"{k}: {v}")
                    if k == "/ModDate":
                        pres.add_tag('file.pdf.date.modified', v)
                    elif k == "/CreationDate":
                        pres.add_tag('file.date.creation', v)
                    elif k == "/LastModified":
                        pres.add_tag('file.date.last_modified', v)
                    elif k == "/SourceModified":
                        pres.add_tag('file.pdf.date.source_modified', v)
                    elif k == "/pdfx":
                        pres.add_tag('file.pdf.date.pdfx', v)
            entropy = pdfid_result.get("Entropy", None)
            if entropy:
                enres = ResultSection(title_text="Entropy", parent=pdfidres)
                for enlist in entropy:
                    enres.add_line(f"{enlist[0]}: {enlist[1]}, ({enlist[2]})")

        flags = pdfid_result.get("Flags", None)
        if isinstance(flags, dict):
            fres = ResultSection(title_text="PDF Keyword Flags (Count)", parent=pdfidres)
            for k, v in flags.items():
                if k == "/ObjStm":
                    objstms = True
                # Filter out seemingly meaningless keywords
                if ((not isinstance(v, dict) and int(v) > 1) or isinstance(v, dict)) and len(k) > 2:
                    fres.add_line(f"{k}: {v}")
                    fres.add_tag('file.string.extracted', k.replace("/", "", 1))
                    if k in additional_keywords:
                        triage_keywords.add(k.replace("/", "", 1))

        plugin = pdfid_result.get("Plugin", [])

        # If any plugin results, or flagged keywords found, run PDF Parser
        if plugin or len(triage_keywords) > 0:
            run_pdfparse = True

        for pllist in plugin:
            pl_name, pl_heur, pl_text = pllist
            pl_heur = int(pl_heur)
            pl_text = pl_text[14:]
            if not pl_text or pl_text == "None":
                continue

            if pl_name in ['EmbeddedFile', 'Name Obfuscation']:
                modres = ResultSection(title_text=pl_text, parent=pdfidres)
                if pl_heur > 0:
                    modres.set_heuristic(pl_heur)
                if pl_name == 'EmbeddedFile':
                    embed_present = True

            elif pl_name in ['Triage', 'Suspicious Properties']:
                javascript_found = False
                for line in pl_text.splitlines():
                    lineres = ResultSection(title_text=line)
                    # Triage results
                    if '/JavaScript' in line:
                        triage_keywords.add('JavaScript')
                        if not javascript_found:
                            lineres.set_heuristic(19)
                            javascript_found = True
                    elif '/JS' in line:
                        triage_keywords.add('JS')
                        if not javascript_found:
                            lineres.set_heuristic(19)
                            javascript_found = True
                    elif '/JBIG2Decode' in line:
                        triage_keywords.add('JBIG2Decode')
                        lineres.set_heuristic(3)
                    elif '/Colors > 2^24' in line:
                        triage_keywords.add('Colors > 2^24')
                        lineres.set_heuristic(20)
                    elif '/AA' in line:
                        triage_keywords.add('AA')
                        lineres.set_heuristic(1)
                    elif '/Launch' in line:
                        triage_keywords.add('Launch')
                        lineres.set_heuristic(1)
                    elif '/OpenAction' in line:
                        triage_keywords.add('OpenAction')
                        lineres.set_heuristic(1)
                    elif '/GoToE' in line:
                        triage_keywords.add('GoToE')
                        lineres.set_heuristic(21)
                    elif '/GoToR' in line:
                        triage_keywords.add('GoToR')
                        lineres.set_heuristic(22)
                    elif '/Encrypt' in line:
                        triage_keywords.add('Encrypt')
                        lineres.set_heuristic(11)
                    elif '/AcroForm' in line:
                        triage_keywords.add('AcroForm')
                        lineres.set_heuristic(4)
                    elif '/RichMedia' in line:
                        triage_keywords.add('RichMedia')
                        lineres.set_heuristic(5)
                    elif '/XFA' in line:
                        triage_keywords.add('XFA')
                        lineres.set_heuristic(23)
                    elif '/Annot' in line:
                        triage_keywords.add('Annot')
                        lineres.set_heuristic(25)
                    elif '/ObjStm' in line:
                        triage_keywords.add('ObjStm')
                        lineres.set_heuristic(7)
                    elif '/URI' in line:
                        triage_keywords.add('URI')
                        lineres.set_heuristic(24)
                    # Suspicious properties results
                    elif "eof2" in line:
                        lineres.set_heuristic(2)
                    elif "eof5" in line:
                        lineres.set_heuristic(17)
                    elif "page" in line:
                        lineres.set_heuristic(26)
                    elif "entropy" in line:
                        lineres.set_heuristic(12)
                    elif "obj/endobj" in line:
                        lineres.set_heuristic(13)
                    elif "stream/endstream" in line:
                        lineres.set_heuristic(14)

                    if lineres.heuristic is not None:
                        pdfidres.add_subsection(lineres)

    for e in errors:
        all_errors.add(e)
        if e.startswith('Error running plugin'):
            self.log.warning(e)

    if run_pdfparse:
        # Call PDF Parser and extract further information
        pdf_parserres = ResultSection(title_text="PDF Parser Results")

        # STATISTICS
        # Do not run for objstms, which are being analyzed when get_malform == False
        if get_malform:
            options = {"stats": True}
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            if pdf_parser_result:
                if len(pdf_parser_result) == 0:
                    pdf_parserres.add_line("No statistical results generated for file. Please see errors.")
                else:
                    version = pdf_parser_result.get("version", None)
                    if version and version[0] != '0':
                        pdf_parserres.add_line(version[0])
                    stats = pdf_parser_result.get("stats", None)
                    if stats:
                        sres = ResultSection(title_text="PDF Statistics", parent=pdf_parserres,
                                             body_format=BODY_FORMAT.MEMORY_DUMP)
                        for p in stats:
                            sres.add_line(p)
                for e in errors:
                    all_errors.add(e)

        # Triage plugin -- search sample for keywords and carve content or extract object (if it contains a stream)
        carved_content = {}  # Format: {"objnum": [{keyword: content list}]}
        obj_extract_triage = set()
        jbig_objs = set()

        for keyword in triage_keywords:
            # ObjStms handled differently
            if keyword == 'ObjStm':
                continue

            options = {"search": keyword}
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            if pdf_parser_result:
                for p in pdf_parser_result['parts']:
                    content = ""
                    references = []
                    # Trailer will be extracted anyway; still try to grab all references -- will be messy
                    if p.startswith("trailer:"):
                        # Grab the content after the keyword
                        # Check that the keyword is actually in the content
                        if f"/{keyword}" in p:
                            try:
                                content = p.split(keyword, 1)[1].replace('>>++>>', '').split("/", 1)[0].strip()
                                references = re.findall("[0-9]* [0-9]* R", content)
                            except Exception:
                                continue
                    # If not trailer, should be object
                    elif 'Referencing:' in p:
                        # Grab the content after the keyword
                        if '>>++>>' in p:
                            try:
                                content = p.split(keyword, 1)[1].replace('>>++>>', '').strip()
                            except Exception:
                                try:
                                    content = p.split("\n", 3)[3]
                                except Exception:
                                    content = p
                        else:
                            try:
                                content = p.split("\n", 3)[3]
                            except Exception:
                                content = p
                        # Sometimes the content is the same keyword with references (i.e. "/URI /URI 10 0 R")
                        if content.startswith(f"/{keyword}"):
                            try:
                                content = re.sub(f"/{keyword}[ ]*", "", content, 1)
                            except Exception:
                                pass
                        try:
                            references = p.split("\n", 3)[2].replace('Referencing:', '').strip().split(", ")
                        except Exception:
                            pass
                    # Only extract JBIG2Decode objects with deep scan, but always report on their presence
                    if keyword == "JBIG2Decode" and "/Filter" in p and "Contains stream" in p:
                        try:
                            objnum = p.split("\n", 1)[0].split(" ")[1]
                            if request.deep_scan:
                                obj_extract_triage.add(objnum)
                            jbig_objs.add(objnum)
                            continue
                        except Exception as e:
                            self.log.debug(e)
                            continue
                    # If no content, then keyword likely points to reference objects, so grab those
                    if content == '':
                        if len(references) > 0:
                            content = references
                        else:
                            # Something is wrong, drop it.
                            continue
                    else:
                        while True:
                            # Multiple references might be in a list, i.e. /Annot # # R vs. /Annots [# # R # # R]
                            islist = re.match(r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]", content)
                            if islist:
                                content = re.sub(r"[\[\]]", "",
                                                 islist.group(0).replace("s ", '').replace("R ", "R,")).split(",")
                                break
                            # References might come with instructions, i.e. [# # R /FitH null]
                            withinst = re.match(r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}"
                                                r"[/a-zA-Z0-9 ]*[ ]?\]", content)
                            if withinst:
                                content = [withinst.group(1)]
                                break
                            content = [content]
                            break
                    for c in content:
                        # If keyword = JavaScript and content starts with '/JS', disregard as 'JS' will be extracted
                        if "JS" in triage_keywords and keyword == "JavaScript" and "/JS" in c[0:5]:
                            continue
                        if c in references or re.match("[0-9]* [0-9]* R", c):
                            try:
                                ref_obj = c.split(" ", 1)[0]
                                options = {"object": ref_obj, "get_object_detail": True}
                                pdf_parser_subresult, err = self.get_pdf_parser(path, working_dir, options)
                                if pdf_parser_subresult:
                                    for sub_p in pdf_parser_subresult['parts']:
                                        sub_references = sub_p.split("\n", 3)[2].replace('Referencing:', '') \
                                            .strip().split(", ")
                                        ptyp = sub_p.split("\n", 2)[1].replace('Type:', '').strip().replace("/", "")
                                        # If the object contains a stream, extract the object.
                                        if "Contains stream" in sub_p:
                                            try:
                                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                obj_extract_triage.add(objnum)
                                            except Exception:
                                                pass
                                        # Or if the object Type is the keyword, grab all referenced objects.
                                        elif sub_references[0] != '' and len(sub_references) >= 1 \
                                                and ptyp == keyword:
                                            for sr in sub_references:
                                                try:
                                                    objnum = sr.split(" ", 1)[0]
                                                    obj_extract_triage.add(objnum)
                                                except Exception:
                                                    pass
                                        # If not, extract object detail into carved output
                                        elif pdf_parser_subresult['obj_details'] != "":
                                            try:
                                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                if objnum in carved_content:
                                                    carved_content[objnum] \
                                                        .append({keyword: pdf_parser_subresult['obj_details']})
                                                else:
                                                    carved_content[objnum] = \
                                                        [{keyword: pdf_parser_subresult['obj_details']}]
                                            except Exception:
                                                continue
                                    for e in err:
                                        errors.add(e)
                            except Exception:
                                # If none of that works, just extract the original object for examination.
                                try:
                                    objnum = p.split("\n", 1)[0].split(" ")[1]
                                    obj_extract_triage.add(objnum)
                                except Exception:
                                    pass
                        # If content does not look like a reference:
                        else:
                            if p.startswith("trailer:"):
                                continue
                            objnum = p.split("\n", 1)[0].split(" ")[1]
                            # If the object contains a stream, extract the object
                            if p.split("\n", 4)[3] == "Contains stream":
                                obj_extract_triage.add(objnum)
                            else:
                                # Or just carve the content
                                if objnum in carved_content:
                                    carved_content[objnum].append({keyword: c})
                                else:
                                    carved_content[objnum] = [{keyword: c}]

            for e in errors:
                all_errors.add(e)

        # Add carved content to result output
        show_content_of_interest = False
        if len(carved_content) > 0 or len(jbig_objs) > 0:
            carres = ResultSection(title_text="Content of Interest")
        else:
            carres = None

        if len(jbig_objs) > 0:
            jbigres = ResultSection(title_text="The following Object IDs are JBIG2DECODE streams:",
                                    body_format=BODY_FORMAT.MEMORY_DUMP, parent=carres)
            jbigres.add_line(', '.join(map(str, jbig_objs)))
            show_content_of_interest = True

        if len(carved_content) > 0:
            carved_obj_size_limit = request.get_param('carved_obj_size_limit')
            for k, l in sorted(carved_content.items()):
                for d in l:
                    for keyw, con in d.items():
                        subres = ResultSection(title_text=f"Object {k}: Hits for Keyword '{keyw}':")
                        subres.set_heuristic(8)

                        con_bytes = con.encode()
                        if len(con) < carved_obj_size_limit:
                            subres.set_body(con, BODY_FORMAT.MEMORY_DUMP)

                            # Check for IOC content
                            patterns = PatternMatch()
                            st_value = patterns.ioc_match(con_bytes, bogon_ip=True)
                            if len(st_value) > 0:
                                carres.add_subsection(subres)
                                show_content_of_interest = True
                                for ty, val in st_value.items():
                                    for v in val:
                                        subres.add_tag(ty, v)
                        else:
                            crv_sha = hashlib.sha256(con_bytes).hexdigest()
                            is_supplementary = keyw in ['URI']
                            extraction_purpose = "as supplementary file" if is_supplementary else "for analysis"
                            if crv_sha not in carved_extracted_shas:
                                f_name = f"carved_content_obj_{k}_{crv_sha[0:7]}"
                                subres.add_lines([
                                    f"Content over {carved_obj_size_limit} bytes; "
                                    f"it will be extracted {extraction_purpose}",
                                    f"Name: {f_name} - SHA256: {crv_sha}"
                                ])
                                carres.add_subsection(subres)
                                show_content_of_interest = True
                                crvf = os.path.join(self.working_directory, f_name)
                                with open(crvf, 'wb') as f:
                                    f.write(con_bytes)
                                try:
                                    if is_supplementary:
                                        # Add as supplementary
                                        request.add_supplementary(crvf, os.path.basename(crvf),
                                                                  f"Supplementary content from object {k}")
                                    else:
                                        request.add_extracted(crvf, os.path.basename(crvf),
                                                              f"Extracted content from object {k}",
                                                              safelist_interface=self.api_interface)
                                except MaxExtractedExceeded:
                                    pass
                                carved_extracted_shas.add(crv_sha)

        if show_content_of_interest:
            pdf_parserres.add_subsection(carres)

        # ELEMENTS
        # Do not show for objstms
        if get_malform:
            if request.deep_scan:
                options = {"verbose": True, "nocanonicalizedoutput": True, "get_malform": get_malform}
            elif embed_present:
                options = {"verbose": True, "elements": "ctsi", "type": "/EmbeddedFile",
                           "get_malform": get_malform}
            else:
                options = {"verbose": True, "elements": "cst", "get_malform": get_malform}
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            embed_extracted = set()
            if pdf_parser_result:
                if len(pdf_parser_result) == 0:
                    pdf_parserres.add_line("No structure information generated for file. Please see errors.")
                else:
                    # PDF Parser will write any malformed content over 100 bytes to a file
                    files = pdf_parser_result.get("files", None)
                    if files:
                        for f, l in files.items():
                            if f == 'malformed':
                                if len(l) > 0:
                                    pdf_parserres.set_heuristic(6)
                                for i in l:
                                    try:
                                        request.add_extracted(i, os.path.basename(i),
                                                              "Extracted malformed content in PDF Parser Analysis.",
                                                              safelist_interface=self.api_interface)
                                    except MaxExtractedExceeded:
                                        break

                    parts = pdf_parser_result.get("parts", None)
                    # Extract service will extract the sample's embedded files.
                    # However we want to make note of them so that they are not extracted again below
                    if parts:
                        for p in sorted(parts):
                            if "Type: /EmbeddedFile" in p:
                                getobj = p.split("\n", 1)[0].split(" ")[1]
                                embed_extracted.add(getobj)

            # Extract objects collected from above analysis
            obj_to_extract = obj_extract_triage - embed_extracted - jbig_objs

            if len(obj_to_extract) > 0:
                options = {"filter": True, "object": obj_to_extract, "dump": "extracted_obj_"}
                pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

                if pdf_parser_result:
                    files = pdf_parser_result.get("files", None)
                    extracted_files = []
                    if files:
                        for f, l in files.items():
                            if f == 'embedded':
                                for i in l:
                                    f_name = os.path.basename(i)
                                    obj_id = f_name.replace("extracted_obj_", "")
                                    try:
                                        if request.add_extracted(i, f_name,
                                                                 f"Object {obj_id} extracted in "
                                                                 f"PDF Parser Analysis.",
                                                                 safelist_interface=self.api_interface):
                                            extracted_files.append(f"Extracted object {obj_id} as {f_name}")
                                    except MaxExtractedExceeded:
                                        break
                    for e in errors:
                        all_errors.add(e)

                    if extracted_files:
                        extract_res = ResultSection(title_text="Extracted embedded objects",
                                                    parent=pdf_parserres)
                        extract_res.set_heuristic(9)
                        extract_res.add_lines(extracted_files)

            # Extract jbig2decode objects in deep scan mode
            if request.deep_scan and len(jbig_objs) > 0:
                options = {"object": jbig_objs, "dump": "extracted_jb_obj_"}
                pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

                if pdf_parser_result:
                    extracted_jb = []
                    files = pdf_parser_result.get("files", None)
                    if files:
                        for f, l in files.items():
                            if f == 'embedded':
                                for i in l:
                                    f_name = os.path.basename(i)
                                    obj_id = f_name.replace("extracted_jb_obj_", "")
                                    try:
                                        if request.add_extracted(i, f_name,
                                                                 f"JBIG2DECODE object {obj_id} extracted in "
                                                                 f"PDF Parser Analysis.",
                                                                 safelist_interface=self.api_interface):
                                            extracted_jb.append(
                                                f"JBIG2DECODE object {obj_id} extracted as {f_name}")
                                    except MaxExtractedExceeded:
                                        break
                    for e in errors:
                        all_errors.add(e)

                    if extracted_jb:
                        jbig_extract_res = ResultSection(title_text="Extracted JBIG2Decode objects",
                                                         parent=pdf_parserres)
                        jbig_extract_res.set_heuristic(9)
                        jbig_extract_res.add_lines(extracted_jb)

        if len(pdf_parserres.subsections) > 0:
            res.add_subsection(pdf_parserres)

    return res, objstms, all_errors
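# analyze_pdf() returns (res, objstms, all_errors). A plausible caller pattern
# implied by the docstring -- run once on the sample, then re-run on any carved
# object streams with get_malform=False so the objstm pass skips malform
# extraction. The config keys below are hypothetical, for illustration only.
def _demo_analyze_pdf(self, request):
    res, objstms, all_errors = self.analyze_pdf(
        request, "PDF Analysis", request.file_path, self.working_directory,
        heur=self.config.get("plugins", []),                  # hypothetical key
        additional_keywords=self.config.get("keywords", []),  # hypothetical key
    )
    request.result.add_section(res)
    return objstms, all_errors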
def _add_resultinfo_for_match(self, result: Result, match):
    """
    Parse a Yara signature match and add its information to the overall AL service result. This module
    determines the result score and identifies any AL tags that should be added (i.e. IMPLANT_NAME,
    THREAT_ACTOR, etc.).

    Args:
        result: AL Result object.
        match: Yara rules Match object item.

    Returns:
        None.
    """
    almeta = YaraMetadata(match)
    self._normalize_metadata(almeta)

    section = ResultSection('', classification=almeta.classification)
    if self.deep_scan or almeta.al_status != "NOISY":
        section.set_heuristic(self.YARA_HEURISTICS_MAP.get(almeta.category, 1),
                              signature=f'{match.namespace}.{match.rule}',
                              attack_id=almeta.mitre_att)
    section.add_tag(f'file.rule.{self.name.lower()}', f'{match.namespace}.{match.rule}')

    title_elements = [f"[{match.namespace}] {match.rule}"]

    if almeta.actor_type:
        section.add_tag('attribution.actor', almeta.actor_type)

    for tag in almeta.tags:
        section.add_tag(tag['type'], tag['value'])

    # Malware Tags
    implant_title_elements = []
    for (implant_name, implant_family) in almeta.malwares:
        if implant_name:
            implant_title_elements.append(implant_name)
            section.add_tag('attribution.implant', implant_name)
        if implant_family:
            implant_title_elements.append(implant_family)
            section.add_tag('attribution.family', implant_family)
    if implant_title_elements:
        title_elements.append(f"- Implant(s): {', '.join(implant_title_elements)}")

    # Threat Actor metadata
    for actor in almeta.actors:
        title_elements.append(actor)
        section.add_tag('attribution.actor', actor)

    # Exploit / CVE metadata
    if almeta.exploits:
        title_elements.append(f"- Exploit(s): {', '.join(almeta.exploits)}")
    for exploit in almeta.exploits:
        section.add_tag('attribution.exploit', exploit)

    # Include technique descriptions in the section behavior
    for (category, name) in almeta.techniques:
        descriptor = self.TECHNIQUE_DESCRIPTORS.get(category, None)
        if descriptor:
            technique_type, technique_description = descriptor
            section.add_tag(technique_type, name)
            almeta.behavior.add(technique_description)

    for (category, name) in almeta.infos:
        descriptor = self.INFO_DESCRIPTORS.get(category, None)
        if descriptor:
            info_type, info_description = descriptor
            section.add_tag(info_type, name)
            almeta.behavior.add(info_description)

    # Summaries
    if almeta.behavior:
        title_elements.append(f"- Behavior: {', '.join(almeta.behavior)}")
    for element in almeta.behavior:
        section.add_tag('file.behavior', element)

    title = " ".join(title_elements)
    section.title_text = title

    json_body = dict(name=match.rule)

    for item in ['id', 'version', 'author', 'description', 'source', 'malware', 'info',
                 'technique', 'tool', 'exploit', 'actor', 'category', 'mitre_att']:
        val = almeta.__dict__.get(item, None)
        if val:
            json_body[item] = val

    string_match_data = self._add_string_match_data(match)
    if string_match_data:
        json_body['string_hits'] = string_match_data

    section.set_body(json.dumps(json_body), body_format=BODY_FORMAT.KEY_VALUE)

    result.add_section(section)
def section_builder(self, parser, field_dict, result, parsertype="MWCP"):
    json_body = {}
    malware_name = ''
    malware_types = []
    mitre_group = ''
    mitre_att = ''
    category = 'malware'
    # Get malware names from parser objects
    if parsertype == "RATDecoder":
        malware_name = parser
    if parsertype == "MWCP":
        for name, obj in self.file_parsers.items():
            if parser in obj.parser_list:
                malware_name = obj.malware
                malware_types = obj.malware_types
                mitre_att = obj.mitre_att
                mitre_group = obj.mitre_group
                category = obj.category
                for item in ['classification', 'mitre_group', 'mitre_att',
                             'malware', 'malware_types', 'category']:
                    val = getattr(obj, item, None)
                    if val:
                        json_body[item] = val
                break
    parser_section = ResultSection(f"{parsertype} : {parser}")

    parser_section = classification_checker(parser_section, parser, self.file_parsers)
    if len(field_dict) > 0:  # If any decoder output exists, raise a heuristic
        parser_section.set_body(json.dumps(json_body), body_format=BODY_FORMAT.KEY_VALUE)
        parser_section.set_heuristic(HEURISTICS_MAP.get(category, 1), attack_id=mitre_att)
        parser_section.add_tag("source", parsertype)

        if malware_name:
            parser_section.add_tag('attribution.implant', malware_name.upper())
        if mitre_group:
            parser_section.add_tag('attribution.actor', mitre_group.upper())
        for malware_type in malware_types:
            parser_section.add_tag('attribution.family', malware_type.upper())

    # Create subsections and attach them to the main parser_section
    subsection_builder(parser_section, field_dict)

    other_key = "other"
    if other_key in field_dict:
        other_content = field_dict[other_key]
        other_section = ResultSection("Other metadata found", body_format=BODY_FORMAT.KEY_VALUE,
                                      body=json.dumps(other_content))
        parser_section.add_subsection(other_section)

    for field in field_dict:
        if field != other_key and field not in FIELD_TAG_MAP:
            self.log.debug(f"{field} does not exist in FIELD_TAG_MAP")

    result.add_section(parser_section)
def execute(self, request):
    file_path = request.file_path
    result = Result()

    # Report the version of suricata as the service context
    request.set_service_context(f"Suricata version: {self.get_suricata_version()}")

    # Restart Suricata if we need to
    self.start_suricata_if_necessary()

    # Strip frame headers from the PCAP, since Suricata sometimes has trouble parsing strange PCAPs
    stripped_filepath = self.strip_frame_headers(file_path)

    # Check to make sure the size of the stripped file isn't 0 - this happens on pcapng files
    # TODO: there's probably a better way to do this - don't even strip it if it's pcapng
    if os.stat(stripped_filepath).st_size == 0:
        stripped_filepath = file_path

    # Switch stdout and stderr so we don't get our logs polluted
    mystdout = StringIO()
    old_stdout = sys.stdout
    sys.stdout = mystdout

    mystderr = StringIO()
    old_stderr = sys.stderr
    sys.stderr = mystderr

    # Pass the pcap file to Suricata via the socket
    ret = self.suricata_sc.send_command("pcap-file", {
        "filename": stripped_filepath,
        "output-dir": self.working_directory
    })

    if not ret or ret["return"] != "OK":
        message = ret.get("message") if ret else "no response"
        self.log.exception(f"Failed to submit PCAP for processing: {message}")

    # Wait for the socket to finish processing our PCAP
    while True:
        time.sleep(1)
        try:
            ret = self.suricata_sc.send_command("pcap-current")
            if ret and ret["message"] == "None":
                break
        except ConnectionResetError as e:
            raise RecoverableError(e)

    # Bring back stdout and stderr
    sys.stdout = old_stdout
    sys.stderr = old_stderr

    # NOTE: for now we will ignore content of mystdout and mystderr but we have them just in case...

    alerts, signatures, domains, ips, urls, email_addresses, tls_dict, extracted_files, reverse_lookup = \
        self.parse_suricata_output().values()

    file_extracted_section = ResultSection("File(s) extracted by Suricata")
    # Parse the json results of the service
    if request.get_param("extract_files"):
        for file in extracted_files:
            sha256, filename, extracted_file_path = file.values()
            self.log.info(f"extracted file {filename}")
            try:
                if request.add_extracted(extracted_file_path, filename, "Extracted by Suricata",
                                         safelist_interface=self.api_interface):
                    file_extracted_section.add_line(filename)
                    if filename != sha256:
                        file_extracted_section.add_tag('file.name.extracted', filename)
            except FileNotFoundError as e:
                # An intermittent issue, just try again
                raise RecoverableError(e)
            except MaxExtractedExceeded:
                # We've hit our limit
                pass

    # Report a null score to indicate that files were extracted. If no sigs hit, it's not clear
    # where the extracted files came from
    if file_extracted_section.body:
        result.add_section(file_extracted_section)

    # Add tags for the domains, urls, and IPs we've discovered
    root_section = ResultSection("Discovered IOCs", parent=result)
    if domains:
        domain_section = ResultSection("Domains", parent=root_section)
        for domain in domains:
            domain_section.add_line(domain)
            domain_section.add_tag('network.dynamic.domain', domain)
    if ips:
        ip_section = ResultSection("IP Addresses", parent=root_section)
        for ip in ips:
            # Make sure it's not a local IP
            if not (ip.startswith("127.")
                    or ip.startswith("192.168.")
                    or ip.startswith("10.")
                    or (ip.startswith("172.") and 16 <= int(ip.split(".")[1]) <= 31)):
                ip_section.add_line(ip)
                ip_section.add_tag('network.dynamic.ip', ip)

    if urls:
        url_section = ResultSection("URLs", parent=root_section)
        for url in urls:
            url_section.add_line(url)
            url_section.add_tag('network.dynamic.uri', url)
    if email_addresses:
        email_section = ResultSection("Email Addresses", parent=root_section)
        for eml in email_addresses:
            email_section.add_line(eml)
            email_section.add_tag('network.email.address', eml)

    # Map between suricata key names and AL tag types
    tls_mappings = {
        "subject": 'cert.subject',
        "issuerdn": 'cert.issuer',
        "version": 'cert.version',
        "notbefore": 'cert.valid.start',
        "notafter": 'cert.valid.end',
        "fingerprint": 'cert.thumbprint',
        "sni": 'network.tls.sni',
    }

    if tls_dict:
        tls_section = ResultSection("TLS Information", parent=root_section, body_format=BODY_FORMAT.JSON)
        kv_body = {}
        for tls_type, tls_values in tls_dict.items():
            if tls_type == "fingerprint":
                # Make sure the cert fingerprint/thumbprint matches other values, like from PEFile
                tls_values = [v.replace(":", "").lower() for v in tls_values]

            if tls_type in tls_mappings:
                kv_body[tls_type] = tls_values
                tag_type = tls_mappings[tls_type]
                if tag_type is not None:
                    for tls_value in tls_values:
                        tls_section.add_tag(tag_type, tls_value)

            elif tls_type == "ja3":
                kv_body.setdefault('ja3_hash', [])
                kv_body.setdefault('ja3_string', [])
                for ja3_entry in tls_values:
                    ja3_hash = ja3_entry.get("hash")
                    ja3_string = ja3_entry.get("string")
                    if ja3_hash:
                        kv_body['ja3_hash'].append(ja3_hash)
                        tls_section.add_tag('network.tls.ja3_hash', ja3_hash)
                    if ja3_string:
                        kv_body['ja3_string'].append(ja3_string)
                        tls_section.add_tag('network.tls.ja3_string', ja3_string)

            else:
                kv_body[tls_type] = tls_values
                # Stick a message in the logs about a new TLS type found in suricata logs
                self.log.info(f"Found new TLS type {tls_type} with values {tls_values}")
        tls_section.set_body(json.dumps(kv_body))

    # Create the result sections if there are any hits
    if len(alerts) > 0:
        for signature_id, signature_details in signatures.items():
            signature = signature_details['signature']
            attributes = signature_details['attributes']
            section = ResultSection(f'{signature_id}: {signature}')
            heur_id = 3
            if any(x in signature for x in self.config.get("sure_score")):
                heur_id = 1
            elif any(x in signature for x in self.config.get("vhigh_score")):
                heur_id = 2
            section.set_heuristic(heur_id)
            if signature_details['al_signature']:
                section.add_tag("file.rule.suricata", signature_details['al_signature'])
            for timestamp, src_ip, src_port, dest_ip, dest_port in alerts[signature_id][:10]:
                section.add_line(f"{timestamp} {src_ip}:{src_port} -> {dest_ip}:{dest_port}")
            if len(alerts[signature_id]) > 10:
                section.add_line(f'And {len(alerts[signature_id]) - 10} more flows')

            # Tag IPs/Domains/URIs associated to signature
            for flow in alerts[signature_id]:
                dest_ip = flow[3]
                section.add_tag('network.dynamic.ip', dest_ip)
                if dest_ip in reverse_lookup.keys():
                    section.add_tag('network.dynamic.domain', reverse_lookup[dest_ip])
                for uri in urls:
                    if dest_ip in uri or (reverse_lookup.get(dest_ip)
                                          and reverse_lookup[dest_ip] in uri):
                        section.add_tag('network.dynamic.uri', uri)

            # Add a tag for the signature id and the message
            section.add_tag('network.signature.signature_id', str(signature_id))
            section.add_tag('network.signature.message', signature)
            for attr in attributes:
                if attr.get('uri'):
                    section.add_tag('network.static.uri', attr['uri'])

            # Tag malware_family
            for malware_family in signature_details['malware_family']:
                section.add_tag('attribution.family', malware_family)

            result.add_section(section)
            self.ontology.add_result_part(Signature, data=dict(
                name=signature_details['al_signature'],
                type="SURICATA",
                malware_families=signature_details['malware_family'] or None,
                attributes=attributes))

        # Add the original Suricata output as a supplementary file in the result
        request.add_supplementary(os.path.join(self.working_directory, 'eve.json'),
                                  'SuricataEventLog.json', 'json')

    # Add the stats.log to the result, which can be used to determine service success
    if os.path.exists(os.path.join(self.working_directory, 'stats.log')):
        request.add_supplementary(os.path.join(self.working_directory, 'stats.log'), 'stats.log', 'log')

    request.result = result
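# The string-prefix test for local IPs in execute() above works, but the
# standard library expresses the same intent more robustly. A possible
# replacement (a sketch, not the service's code):
from ipaddress import ip_address

def is_local_ip(ip: str) -> bool:
    # Covers 127.0.0.0/8, 10.0.0.0/8, 172.16.0.0/12 and 192.168.0.0/16,
    # matching the prefix checks above, plus link-local ranges.
    try:
        addr = ip_address(ip)
    except ValueError:
        return False
    return addr.is_private or addr.is_loopback or addr.is_link_local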