def _unpack(self, buf):
    """Extract the IRC messages of a TCP stream into a list.
    @buf: tcp stream data
    """
    try:
        f = cStringIO.StringIO(buf)
        lines = f.readlines()
    except Exception:
        log.error("Failed reading tcp stream buffer")
        return False

    logirc = False
    for element in lines:
        if not re.match("^:", element) is None:
            command = "([a-zA-Z]+|[0-9]{3})"
            params = "(\x20.+)"
            irc_server_msg = re.findall("(^:[\w+.{}!@|()]+\x20)" + command + params, element)
            if irc_server_msg:
                self._sc["prefix"] = convert_to_printable(irc_server_msg[0][0].strip())
                self._sc["command"] = convert_to_printable(irc_server_msg[0][1].strip())
                self._sc["params"] = convert_to_printable(irc_server_msg[0][2].strip())
                self._sc["type"] = "server"
                if logirc:
                    self._messages.append(dict(self._sc))
        else:
            irc_client_msg = re.findall("([a-zA-Z]+\x20)(.+[\x0a\x0d])", element)
            if irc_client_msg and irc_client_msg[0][0].strip() in self.__methods_client:
                self._cc["command"] = convert_to_printable(irc_client_msg[0][0].strip())
                if self._cc["command"] in ["NICK", "USER"]:
                    logirc = True
                self._cc["params"] = convert_to_printable(irc_client_msg[0][1].strip())
                self._cc["type"] = "client"
                if logirc:
                    self._messages.append(dict(self._cc))
def extract_strings(path, nulltermonly, minchars):
    strings = []
    try:
        data = open(path, "rb").read()
    except (IOError, OSError) as e:
        raise CuckooProcessingError(f"Error opening file {e}")
    endlimit = b""
    if not HAVE_RE2:
        endlimit = b"8192"
    if nulltermonly:
        apat = b"([\x20-\x7e]{" + str(minchars).encode() + b"," + endlimit + b"})\x00"
        upat = b"((?:[\x20-\x7e][\x00]){" + str(minchars).encode() + b"," + endlimit + b"})\x00\x00"
    else:
        apat = b"[\x20-\x7e]{" + str(minchars).encode() + b"," + endlimit + b"}"
        upat = b"(?:[\x20-\x7e][\x00]){" + str(minchars).encode() + b"," + endlimit + b"}"
    strings = [bytes2str(string) for string in re.findall(apat, data)]
    for ws in re.findall(upat, data):
        strings.append(str(ws.decode("utf-16le")))
    return strings
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])
    self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
    self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
    self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"),
                     [(":", ""), (":", ":"), (":", "::")])
def do_strings(self):
    if not self.voptions.basic.dostrings:
        return None
    try:
        with open(self.memfile, "rb") as f:
            data = f.read()
    except (IOError, OSError, MemoryError) as e:
        raise CuckooProcessingError(f"Error opening file {e}") from e

    nulltermonly = self.voptions.basic.get("strings_nullterminated_only", True)
    minchars = str(self.voptions.basic.get("strings_minchars", 5)).encode()

    if nulltermonly:
        apat = b"([\x20-\x7e]{" + minchars + b",})\x00"
        upat = b"((?:[\x20-\x7e][\x00]){" + minchars + b",})\x00\x00"
    else:
        apat = b"[\x20-\x7e]{" + minchars + b",}"
        upat = b"(?:[\x20-\x7e][\x00]){" + minchars + b",}"

    strings = re.findall(apat, data) + [
        ws.decode("utf-16le").encode() for ws in re.findall(upat, data)
    ]
    with open(f"{self.memfile}.strings", "wb") as f:
        f.write(b"\n".join(strings))
    return f"{self.memfile}.strings"
def run(self):
    """Run extract of printable strings.
    @return: list of printable strings.
    """
    self.key = "strings"
    strings = []

    if self.task["category"] == "file":
        if not os.path.exists(self.file_path):
            raise CuckooProcessingError("Sample file doesn't exist: \"%s\"" % self.file_path)

        try:
            data = open(self.file_path, "rb").read()
        except (IOError, OSError) as e:
            raise CuckooProcessingError("Error opening file %s" % e)

        nulltermonly = self.options.get("nullterminated_only", True)
        minchars = self.options.get("minchars", 5)

        if nulltermonly:
            apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00"
            upat = "((?:[\x20-\x7e][\x00]){" + str(minchars) + ",})\x00\x00"
        else:
            apat = "[\x20-\x7e]{" + str(minchars) + ",}"
            upat = "(?:[\x20-\x7e][\x00]){" + str(minchars) + ",}"

        strings = re.findall(apat, data)
        for ws in re.findall(upat, data):
            strings.append(str(ws.decode("utf-16le")))

    return strings
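The strings extractors above all build the same two patterns, one for ASCII runs and one for UTF-16LE runs. The following minimal sketch (demo_extract_strings, the 4-character minimum, and the sample buffer are my own, not taken from any of the snippets) shows what the null-terminated variants pull out of a small byte buffer.

import re

def demo_extract_strings(data, minchars=4):
    # Null-terminated ASCII runs of at least `minchars` printable characters.
    apat = rb"([\x20-\x7e]{%d,})\x00" % minchars
    # Null-terminated UTF-16LE runs (printable char followed by a zero byte).
    upat = rb"((?:[\x20-\x7e]\x00){%d,})\x00\x00" % minchars
    found = [m.decode("ascii") for m in re.findall(apat, data)]
    found += [m.decode("utf-16le") for m in re.findall(upat, data)]
    return found

buf = b"\x01\x02hello\x00\x07\x07" + "wide".encode("utf-16le") + b"\x00\x00"
print(demo_extract_strings(buf))  # ['hello', 'wide']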
def handleEvent(self, event): eventName = event.eventType srcModuleName = event.module eventData = event.data if eventData in self.results: return None else: self.results[eventData] = True self.sf.debug("Received event, " + eventName + ", from " + srcModuleName) # Retrieve profile try: network = eventData.split(": ")[0] url = eventData.split(": ")[1] except BaseException as e: self.sf.error( "Unable to parse SOCIAL_MEDIA: " + eventData + " (" + str(e) + ")", False) return None if not network == "Twitter": self.sf.debug("Skipping social network profile, " + url + ", as not a Twitter profile") return None res = self.sf.fetchUrl(url, timeout=self.opts['_fetchtimeout'], useragent="SpiderFoot") if res['content'] is None: return None if not res['code'] == "200": self.sf.debug(url + " is not a valid Twitter profile") return None # Retrieve name human_name = re.findall(r'<div class="fullname">([^<]+)\s*</div>', res['content'], re.MULTILINE) if human_name: e = SpiderFootEvent("RAW_RIR_DATA", "Possible full name: " + human_name[0], self.__name__, event) self.notifyListeners(e) # Retrieve location location = re.findall(r'<div class="location">([^<]+)</div>', res['content']) if location: if len(location[0]) < 3 or len(location[0]) > 100: self.sf.debug("Skipping likely invalid location.") else: e = SpiderFootEvent("GEOINFO", location[0], self.__name__, event) self.notifyListeners(e)
def __findTagAttributes(tag):
    att_double = re.findall('<\w*[ ]| *(.*?)[ ]*=[ ]*"(.*?)"[ +|>]', tag)
    att_single = re.findall('<\w*[ ]| *(.*?)[ ]*=[ ]*\'(.*?)\'[ +|>]', tag)
    att_none = re.findall('<\w*[ ]| *(.*?)[ ]*=[ ]*["|\']?(.*?)["|\']?[ +|>]', tag)
    att_none.extend(att_single)
    att_none.extend(att_double)
    return att_none
def portScanUDP(self, ip): res = self.sf.fetchUrl( "https://hackertarget.com/udp-port-scan/", timeout=self.opts['_fetchtimeout'], useragent=self.opts['_useragent'], postData="theinput=" + ip + "&thetest=udpscan&name_of_nonce_field=&_wp_http_referer=%2Fudp-port-scan%2F" ) if res['content'] is None: return None html_data = re.findall(r'<pre id="formResponse">(.*?)</pre>', res['content'], re.MULTILINE | re.DOTALL) if not html_data: self.sf.debug("Found no open UDP ports on " + ip) return None open_ports = re.findall(r'(\d+)/udp\s+open\s+', html_data[0]) if not open_ports: self.sf.debug("Found no open UDP ports on " + ip) return None self.sf.debug("Found " + str(len(open_ports)) + " open UDP ports on " + ip) return open_ports
def do_strings(self): strings_path = None if self.voptions.basic.dostrings: try: data = open(self.memfile, "rb").read() except (IOError, OSError) as e: raise CuckooProcessingError("Error opening file %s" % e) nulltermonly = self.voptions.basic.get( "strings_nullterminated_only", True) minchars = self.voptions.basic.get("strings_minchars", 5) if nulltermonly: apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00" upat = "((?:[\x20-\x7e][\x00]){" + str( minchars) + ",})\x00\x00" else: apat = "[\x20-\x7e]{" + str(minchars) + ",}" upat = "(?:[\x20-\x7e][\x00]){" + str(minchars) + ",}" strings = re.findall(apat, data) for ws in re.findall(upat, data): strings.append(str(ws.decode("utf-16le"))) data = None f = open(self.memfile + ".strings", "w") f.write("\n".join(strings)) f.close()
def do_strings(self): strings_path = None if self.voptions.basic.dostrings: try: data = open(self.memfile, "rb").read() except (IOError, OSError) as e: raise CuckooProcessingError("Error opening file %s" % e) nulltermonly = self.voptions.basic.get("strings_nullterminated_only", True) minchars = self.voptions.basic.get("strings_minchars", 5) if nulltermonly: apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00" upat = "((?:[\x20-\x7e][\x00]){" + str(minchars) + ",})\x00\x00" else: apat = "[\x20-\x7e]{" + str(minchars) + ",}" upat = "(?:[\x20-\x7e][\x00]){" + str(minchars) + ",}" strings = re.findall(apat, data) for ws in re.findall(upat, data): strings.append(str(ws.decode("utf-16le"))) data = None f=open(self.memfile + ".strings", "w") f.write("\n".join(strings)) f.close()
def _unpack(self, buf):
    """Extract the IRC messages of a TCP stream into a list.
    @buf: tcp stream data
    """
    try:
        f = BytesIO(buf)
        lines = f.readlines()
    except Exception:
        log.error("Failed reading tcp stream buffer")
        return False

    logirc = False
    for element in lines:
        if not re.match(b"^:", element) is None:
            # Patterns must be bytes, since readlines() on a BytesIO yields bytes.
            command = b"([a-zA-Z]+|[0-9]{3})"
            params = b"(\x20.+)"
            irc_server_msg = re.findall(b"(^:[\w+.{}!@|()]+\x20)" + command + params, element)
            if irc_server_msg:
                self._sc["prefix"] = convert_to_printable(irc_server_msg[0][0].strip())
                self._sc["command"] = convert_to_printable(irc_server_msg[0][1].strip())
                self._sc["params"] = convert_to_printable(irc_server_msg[0][2].strip())
                self._sc["type"] = "server"
                if logirc:
                    self._messages.append(dict(self._sc))
        else:
            irc_client_msg = re.findall(b"([a-zA-Z]+\x20)(.+[\x0a\x0d])", element)
            if irc_client_msg and irc_client_msg[0][0].strip() in self.__methods_client:
                self._cc["command"] = convert_to_printable(irc_client_msg[0][0].strip())
                if self._cc["command"] in ["NICK", "USER"]:
                    logirc = True
                self._cc["params"] = convert_to_printable(irc_client_msg[0][1].strip())
                self._cc["type"] = "client"
                if logirc:
                    self._messages.append(dict(self._cc))
def handleEvent(self, event): # We are only interested in the raw data from the spidering module # because the spidering module will always provide events with the # event.sourceEvent.data set to the URL of the source. if "sfp_spider" not in event.module: self.sf.debug("Ignoring web content from " + event.module) return None eventName = event.eventType srcModuleName = event.module eventData = event.data eventSource = event.actualSource self.sf.debug("Received event, " + eventName + ", from " + srcModuleName) # We aren't interested in describing pages that are not hosted on # our base domain. if not self.getTarget().matches(self.sf.urlFQDN(eventSource)): self.sf.debug("Not gathering page info for external site " + eventSource) return None if eventSource not in self.results: self.results[eventSource] = list() else: self.sf.debug("Already checked this page for a page type, skipping.") return None # Check the configured regexps to determine the page type for regexpGrp in regexps: if regexpGrp in self.results[eventSource]: continue for regex in regexps[regexpGrp]: rx = re.compile(regex, re.IGNORECASE) matches = re.findall(rx, eventData) if len(matches) > 0 and regexpGrp not in self.results[eventSource]: self.sf.info("Matched " + regexpGrp + " in content from " + eventSource) self.results[eventSource] = self.results[eventSource] + [regexpGrp] evt = SpiderFootEvent(regexpGrp, eventSource, self.__name__, event) self.notifyListeners(evt) # If no regexps were matched, consider this a static page if len(self.results[eventSource]) == 0: self.sf.info("Treating " + eventSource + " as URL_STATIC") evt = SpiderFootEvent("URL_STATIC", eventSource, self.__name__, event) self.notifyListeners(evt) # Check for externally referenced Javascript pages pat = re.compile("<script.*src=[\'\"]?([^\'\">]*)", re.IGNORECASE) matches = re.findall(pat, eventData) if len(matches) > 0: for match in matches: if '://' in match and not self.getTarget().matches(self.sf.urlFQDN(match)): self.sf.debug("Externally hosted Javascript found at: " + match) evt = SpiderFootEvent("PROVIDER_JAVASCRIPT", match, self.__name__, event) self.notifyListeners(evt) return None
def __findTagAttributes(tag): att_double = re.findall('<\w*[ ]| *(.*?)[ ]*=[ ]*"(.*?)"[ +|>]', tag) att_single = re.findall('<\w*[ ]| *(.*?)[ ]*=[ ]*\'(.*?)\'[ +|>]', tag) att_none = re.findall( '<\w*[ ]| *(.*?)[ ]*=[ ]*["|\']?(.*?)["|\']?[ +|>]', tag) att_none.extend(att_single) att_none.extend(att_double) return att_none
def handle_data(self, data):
    if self.inscript:
        allowed_ext = [".php", ".asp", ".xml", ".js", ".json", ".jsp"]
        self.liens.extend(lamejs.lamejs(data).getLinks())
        candidates = re.findall(r'"([A-Za-z0-9_=#&%\.\+\?/-]*)"', data)
        candidates += re.findall(r"'([A-Za-z0-9_=#&%\.\+\?/-]*)'", data)

        for jstr in candidates:
            if jstr not in self.common_js_strings:
                for ext in allowed_ext:
                    if ext in jstr:
                        self.liens.append(jstr)
def do_strings(self):
    strings_path = None
    if self.voptions.basic.dostrings:
        try:
            data = open(self.memfile, "r").read()
        except (IOError, OSError) as e:
            raise CuckooProcessingError("Error opening file %s" % e)
        nulltermonly = self.voptions.basic.get("strings_nullterminated_only", True)
        minchars = self.voptions.basic.get("strings_minchars", 5)

        if nulltermonly:
            apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00"
            strings = re.findall(apat, data)
            upat = "((?:[\x20-\x7e][\x00]){" + str(minchars) + ",})\x00\x00"
            strings += [str(ws.decode("utf-16le")) for ws in re.findall(upat, data)]
            data = None
            f = open(self.memfile + ".strings", "w")
            f.write("\n".join(strings))
            f.close()
            strings_path = self.memfile + ".strings"
        else:
            apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00"
            strings = re.findall(apat, data)
            upat = "(?:[\x20-\x7e][\x00]){" + str(minchars) + ",}"
            strings += [str(ws.decode("utf-16le")) for ws in re.findall(upat, data)]
            data = None
            f = open(self.memfile + ".strings", "w")
            f.write("\n".join(strings))
            f.close()
            strings_path = self.memfile + ".strings"

        if self.voptions.basic.zipstrings:
            try:
                f = zipfile.ZipFile("%s.zip" % (strings_path), "w", allowZip64=True)
                f.write(strings_path, os.path.basename(strings_path), zipfile.ZIP_DEFLATED)
                f.close()
                os.remove(strings_path)
                strings_path = "%s.zip" % (strings_path)
            except Exception as e:
                raise CuckooProcessingError("Error creating Process Memory Strings Zip File %s" % e)
def extract_urls(msg, html=False): if html: msg = msg.replace("=3D", '=') for x in REPLACE: msg = msg.replace(x, '') urls = re.findall(RE_URL_HTML, msg) else: urls = re.findall(RE_URL_PLAIN, msg) pprint(urls) links = set() for u in urls: u = str(u.decode()).rstrip("/") links.add(u) return links
def get_schedule_line_groups(classified_event): text = classified_event.processed_text.get_tokenized_text() # (?!20[01][05]) time = r'\b[012]?\d[:.,h]?(?:[0-5][05])?(?:am|pm)?\b' time_with_minutes = r'\b[012]?\d[:.,h]?(?:[0-5][05])(?:am|pm)?\b' time_to_time = r'%s ?(?:to|do|до|til|till|alle|a|-|–|[^\w,.]) ?%s' % (time, time) # We try to grab all lines in schedule up until schedule ends, # so we need a "non-schedule line at the end", aka [''] lines = text.split('\n') + [''] idx = 0 schedule_lines = [] while idx < len(lines): first_idx = idx while idx < len(lines): line = lines[idx] # if it has # grab time one and time two, store diff # store delimiters # maybe store description as well? # compare delimiters, times, time diffs, styles, etc times = re.findall(time_to_time, line) if not times or len(line) > 80: if idx - first_idx >= 1: schedule_lines.append(lines[first_idx:idx]) break idx += 1 first_idx = idx while idx < len(lines): line = lines[idx] times = re.findall(time, line) # TODO(lambert): Somehow track "1)" that might show up here? :( times = [x for x in times if x not in ['1.', '2.']] if not times or len(line) > 80: if idx - first_idx >= 3: schedule_lines.append(lines[first_idx:idx]) break idx += 1 idx += 1 schedule_groups = [] for sub_lines in schedule_lines: if not [x for x in sub_lines if re.search(time_with_minutes, x)]: continue schedule_groups.append(sub_lines) return schedule_groups
def handleEvent(self, event): eventName = event.eventType srcModuleName = event.module eventData = event.data self.sf.debug("Received event, " + eventName + ", from " + srcModuleName) pat = re.compile("([A-Za-z0-9+\/]+\=\=|[A-Za-z0-9+\/]+\=)") m = re.findall(pat, eventData) for match in m: if self.checkForStop(): return None minlen = int(self.opts['minlength']) if len(match) >= minlen: caps = sum(1 for c in match if c.isupper()) # Base64-encoded strings don't look like normal strings if caps < (minlen / 4): return None self.sf.info("Found Base64 string: " + match) if type(match) == str: string = unicode(match, 'utf-8', errors='replace') else: string = match try: string += " (" + base64.b64decode(match) + ")" evt = SpiderFootEvent("BASE64_DATA", string, self.__name__, event) self.notifyListeners(evt) except BaseException as e: self.sf.debug("Unable to base64-decode a string.") return None
def find_competitor_list(search_text): processed_text = grammar_matcher.StringProcessor(search_text) results_match = re.search(r'\n0*1[^\d].+\n^0*2[^\d].+\n(?:^\d+.+\n){2,}', processed_text.text, re.MULTILINE) if results_match: numbered_list = results_match.group(0) num_lines = numbered_list.count('\n') if len(re.findall(r'\d ?[.:h] ?\d\d|\bam\b|\bpm\b', numbered_list)) > num_lines / 4: return None # good list of times! workshops, etc! performance/shows/club-set times! processed_numbered_list = grammar_matcher.StringProcessor(numbered_list, processed_text.match_on_word_boundaries) event_keywords = processed_numbered_list.get_tokens(rules.EVENT) if len(event_keywords) > num_lines / 8: return None if processed_text.has_token(keywords.WRONG_NUMBERED_LIST): return None if num_lines > 10: return numbered_list else: lines = numbered_list.split('\n') qualified_lines = len([x for x in lines if re.search(r'[^\d\W].*[-(]', x)]) if qualified_lines > num_lines / 2: return numbered_list for type in ['crew', 'pop|boog', 'lock', 'b\W?(?:boy|girl)']: qualified_lines = len([x for x in lines if re.search(type, x)]) if qualified_lines > num_lines / 8: return numbered_list if processed_text.match_on_word_boundaries == regex_keywords.WORD_BOUNDARIES: # maybe separate on kana vs kanji? avg_words = 1.0 * sum([len([y for y in x.split(' ')]) for x in lines]) / num_lines if avg_words < 3: return numbered_list return None
def on_call(self, call, process):
    if process["process_name"].lower() not in self.whitelistprocs:
        buff = call["arguments"]["buffer"].lower()
        if len(buff) >= 128 and (call["arguments"]["filepath"].endswith(".txt") or
                                 call["arguments"]["filepath"].endswith(".htm") or
                                 call["arguments"]["filepath"].endswith(".html")):
            patterns = "|".join(indicators)
            if len(re.findall(patterns, buff)) > 1:
                self.mark_call()
def handleEvent(self, event): eventName = event.eventType srcModuleName = event.module eventData = event.data # We only want web content from the target if srcModuleName != "sfp_spider": return None eventSource = event.sourceEvent.data self.sf.debug("Received event, " + eventName + ", from " + srcModuleName) if eventSource not in self.results.keys(): self.results[eventSource] = list() # We only want web content for pages on the target site if not self.getTarget().matches(self.sf.urlFQDN(eventSource)): self.sf.debug("Not collecting web content information for external sites.") return None for regexpGrp in regexps.keys(): if regexpGrp in self.results[eventSource]: continue for regex in regexps[regexpGrp]: pat = re.compile(regex, re.IGNORECASE) matches = re.findall(pat, eventData) if len(matches) > 0 and regexpGrp not in self.results[eventSource]: self.sf.info("Matched " + regexpGrp + " in content from " + eventSource) self.results[eventSource].append(regexpGrp) evt = SpiderFootEvent("ERROR_MESSAGE", regexpGrp, self.__name__, event.sourceEvent) self.notifyListeners(evt) return None
def on_call(self, call, process):
    if process["process_name"].lower() not in self.whitelistprocs:
        buff = call["arguments"]["buffer"].lower()
        if len(buff) >= 128:
            patterns = "|".join(self.indicators)
            if len(re.findall(patterns, buff)) > 1:
                self.mark_call()
def get_vt_consensus(namelist: list): finaltoks = defaultdict(int) for name in namelist: toks = re.findall(r"[A-Za-z0-9]+", name) for tok in toks: finaltoks[tok.title()] += 1 for tok in list(finaltoks): lowertok = tok.lower() accepted = True numlist = [x for x in tok if x.isdigit()] if len(numlist) > 2 or len(tok) < 4: accepted = False if accepted: for black in banlist: if black == lowertok: accepted = False break if not accepted: del finaltoks[tok] sorted_finaltoks = sorted(list(finaltoks.items()), key=operator.itemgetter(1), reverse=True) if len(sorted_finaltoks) == 1 and sorted_finaltoks[0][1] >= 2: return sorted_finaltoks[0][0] elif len(sorted_finaltoks) > 1 and (sorted_finaltoks[0][1] >= sorted_finaltoks[1][1] * 2 or sorted_finaltoks[0][1] > 8): return sorted_finaltoks[0][0] elif len(sorted_finaltoks) > 1 and sorted_finaltoks[0][1] == sorted_finaltoks[1][1] and sorted_finaltoks[0][1] > 2: return sorted_finaltoks[0][0] return ""
def query(self, qry):
    # NOTE: the middle of this statement was redacted in the source ("******");
    # the user value and the days_limit check below are reconstructed from context.
    url = "https://en.wikipedia.org/w/api.php?action=feedcontributions&user=" + qry
    if self.opts['days_limit'] != "0":
        dt = datetime.datetime.now() - datetime.timedelta(days=int(self.opts['days_limit']))
        y = dt.strftime("%Y")
        m = dt.strftime("%m")
        url += "&year=" + y + "&month=" + m

    res = self.sf.fetchUrl(url, timeout=self.opts['_fetchtimeout'], useragent="SpiderFoot")

    if res['code'] in ["404", "403", "500"]:
        return None

    links = list()

    try:
        parser = HTMLParser()
        for line in res['content'].split("\n"):
            matches = re.findall("<link>(.*?)</link>", line, re.IGNORECASE)
            for m in matches:
                if "Special:Contributions" in m:
                    continue
                d = parser.unescape(m)
                links.append(d)
        return links
    except Exception as e:
        self.sf.error("Error processing response from Wikipedia: " + str(e), False)
        return None
def get_suricata_family(signature): """ Args: signature: suricata alert string Return family: family name or False """ # ToDo Trojan-Proxy family = False words = re.findall(r"[A-Za-z0-9/\-]+", signature) famcheck = words[2] if "/" in famcheck: famcheck_list = famcheck.split("/") # [-1] for fam_name in famcheck_list: if not any( [block in fam_name.lower() for block in suricata_blocklist]): famcheck = fam_name break famchecklower = famcheck.lower() if famchecklower.startswith("win.") and famchecklower.count(".") == 1: famchecklower = famchecklower.split(".")[-1] famcheck = famcheck.split(".")[-1] if famchecklower in ("win32", "w32", "ransomware"): famcheck = words[3] famchecklower = famcheck.lower() if famchecklower == "ptsecurity": famcheck = words[3] famchecklower = famcheck.lower() isbad = any([block in famchecklower for block in suricata_blocklist]) if not isbad and len(famcheck) >= 4: family = famcheck.title() isgood = any([allow in famchecklower for allow in suricata_passlist]) if isgood and len(famcheck) >= 4: family = famcheck.title() return family
def on_complete(self): matches = [ r'(https?:\/\/)?([\da-z\.-]+)\.([0-9a-z\.]{2,6})(:\d{1,5})?([\/\w\.-]*)\/?', ] dedup = list() extracted_config = False for potential_ioc in self.iocs: for entry in matches: all_matches = re.findall(entry, potential_ioc) if all_matches: extracted_config = True for buf in all_matches: ioc = "" idx = 0 for tmp in buf: idx += 1 if tmp == '': pass # Account for match groups and the second # (or third depending on match) period as a # delimiter. We need to add it in manually. if idx == 2: ioc += tmp + "." else: ioc += tmp if ioc not in dedup: dedup.append(ioc) if dedup: for ioc in dedup: self.data.append({"ioc": ioc}) return extracted_config
def get_clamav_consensus(namelist: list):
    for detection in namelist:
        if detection.startswith("Win.Trojan."):
            words = re.findall(r"[A-Za-z0-9]+", detection)
            family = words[2]
            if family:
                return family
def getLatestIndexes(self): url = "https://commoncrawl.s3.amazonaws.com/cc-index/collections/index.html" res = self.sf.fetchUrl(url , timeout=60, useragent="SpiderFoot") if res['code'] in [ "400", "401", "402", "403", "404" ]: self.sf.error("CommonCrawl index collection doesn't seem to be available.", False) self.errorState = True return list() if not res['content']: self.sf.error("CommonCrawl index collection doesn't seem to be available.", False) self.errorState = True return list() indexes = re.findall(".*(CC-MAIN-\d+-\d+).*", res['content']) highest = 0 indexlist = dict() for m in indexes: ms = m.replace("CC-MAIN-", "").replace("-", "") indexlist[ms] = True topindexes = sorted(indexlist.keys(), reverse=True)[0:self.opts['indexes']] if len(topindexes) < self.opts['indexes']: self.sf.error("Not able to find latest CommonCrawl indexes.", False) self.errorState = True return list() retindex = list() for i in topindexes: retindex.append("CC-MAIN-" + str(i)[0:4] + "-" + str(i)[4:6]) self.sf.debug("CommonCrawl indexes: " + str(retindex)) return retindex
def run(self) -> List[str]: ret = [] with open(self.filepath, "r") as f: source = f.read() # Get rid of superfluous comments. source = re.sub("/\\*.*?\\*/", "", source, flags=re.S) for script in re.findall(self.script_re, source, re.I | re.S): try: x = bs4.BeautifulSoup(script, "html.parser") language = x.script.attrs.get("language", "").lower() except Exception: language = None # We can't rely on bs4 or any other HTML/XML parser to provide us # with the raw content of the xml tag as they decode html entities # and all that, leaving us with a corrupted string. source = re.match("<.*>(.*)</.*>$", script, re.S).group(0) # Decode JScript.Encode encoding. if language in {"jscript.encode", "vbscript.encode"}: source = EncodedScriptFile(self.filepath).decode(source.encode()) if len(source) > 65536: source = f"{source[:65536]}\r\n<truncated>" ret.append(source) return ret
def search(self, regex, flags=0, all=False): if all: result = dict() result["detail"] = [] matches = [] for map in self.address_space: for chunk in map["chunks"]: self.dumpfile.seek(chunk["offset"]) match = re.findall( regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags) if match: matches.extend(match) result["detail"].append({ "match": match, "chunk": chunk }) result["matches"] = matches return result else: for map in self.address_space: for chunk in map["chunks"]: self.dumpfile.seek(chunk["offset"]) match = re.search( regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags) if match: result = dict() result["match"] = match result["chunk"] = chunk return result
def get_suricata_family(signature): """ Args: signature: suricata alert string Return family: family name or False """ family = False #alert["signature"].startswith(("ET JA3 HASH")): words = re.findall(r"[A-Za-z0-9/\-]+", signature) famcheck = words[2] if "/" in famcheck: famcheck = famcheck.split("/")[-1] famchecklower = famcheck.lower() #ET MALWARE Sharik/Smoke CnC Beacon 11 #ETPRO TROJAN MSIL/Revenge-RAT CnC Checkin #ETPRO TROJAN Win32/Predator The Thief Initial CnC Checkin if famchecklower in ("win32", "w32", "ransomware"): famcheck = words[3] famchecklower = famcheck.lower() isbad = any(True for black in suricata_blacklist if black in famchecklower) if not isbad and len(famcheck) >= 4: family = famcheck.title() return family
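As a small illustration, the tokenization step of get_suricata_family can be traced by hand on one of the signatures quoted in the comments above; this condensed sketch is mine and skips the blocklist/passlist filtering.

import re

sig = "ETPRO TROJAN Win32/Predator The Thief Initial CnC Checkin"
words = re.findall(r"[A-Za-z0-9/\-]+", sig)
famcheck = words[2]                      # 'Win32/Predator'
if "/" in famcheck:
    famcheck = famcheck.split("/")[-1]   # 'Predator'
print(famcheck.title())                  # Predator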
def handleEvent(self, event): eventName = event.eventType srcModuleName = event.module eventData = event.data sourceData = self.sf.hashstring(eventData) if sourceData in self.results: return None else: self.results.append(sourceData) self.sf.debug("Received event, " + eventName + ", from " + srcModuleName) # thanks to https://stackoverflow.com/questions/21683680/regex-to-match-bitcoin-addresses matches = re.findall("[\s:=\>]([13][a-km-zA-HJ-NP-Z1-9]{25,34})", eventData) for m in matches: self.sf.debug("Bitcoin potential match: " + m) if self.check_bc(m): evt = SpiderFootEvent("BITCOIN_ADDRESS", m, self.__name__, event) self.notifyListeners(evt) return None
def on_complete(self): ret = False networkret = False campaign = set() mutexs = [ "^(Global|Local)\\\\pen3j3832h$", "^(Global|Local)\\\\u1nyj3rt20", ] for mutex in mutexs: if self.check_mutex(pattern=mutex, regex=True): self.syncapis = True break # Check if there are any winners if self.cryptoapis or self.syncapis or networkret: ret = True if (self.cryptoapis or self.syncapis) and networkret: self.confidence = 100 self.description = "Exhibits behaviorial and network characteristics of Upatre+Dyre/Mini-Dyre malware" #for camp in campaign: # self.data.append({"Campaign": camp}) elif networkret: self.description = "Exhibits network behavior characteristic of Upatre+Dyre/Mini-Dyre malware" #for camp in campaign: # self.data.append({"Campaign": camp}) if self.extract_c2s: dump_pid = 0 for proc in self.results["behavior"]["processtree"]: for child in proc["children"]: # Look for lowest PID svchost.exe if not dump_pid or child["pid"] < dump_pid: if child["name"] == "svchost.exe": dump_pid = child["pid"] if dump_pid: dump_path = "" if len(self.results["procmemory"]): for memdump in self.results["procmemory"]: if dump_pid == memdump["pid"]: dump_path = memdump["file"] if dump_path: whitelist = [ "1.2.3.4", "0.0.0.0", ] with open(dump_path, "rb") as dump_file: dump_data = dump_file.read() ippat = "\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}" ips = re.findall(ippat, dump_data) for ip in set(ips): addit = True for item in whitelist: if ip.startswith(item): addit = False #if addit: # self.data.append({"C2": ip}) return ret
def lookupItem(self, target, content):
    grps = re.findall("<title><\!\[CDATA\[(.[^\]]*)\]\]></title>\s+<link><\!\[CDATA\[(.[^\]]*)\]\]></link>", content)
    for m in grps:
        if target in m[0]:
            self.sf.info("Found zoneh site: " + m[0])
            return m[0] + "\n<SFURL>" + m[1] + "</SFURL>"

    return False
def charReplace(inputString, MODFLAG):
    # OLD: [char]101
    # NEW: e
    for value in re.findall("\[[Cc][Hh][Aa][Rr]\][0-9]{1,3}", inputString):
        inputString = inputString.replace(value, '"%s"' % chr(int(value.split("]")[1])))
        if MODFLAG == 0:
            MODFLAG = 1
    return inputString, MODFLAG
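A quick usage check of charReplace, assuming the definition above is in scope; the PowerShell-style fragment is a made-up example, not taken from a real sample.

import re

# Hypothetical obfuscated input: [Char]72 and [Char]105 spell "Hi".
obfuscated = "$s = [Char]72 + [Char]105"
cleaned, modflag = charReplace(obfuscated, 0)
print(cleaned)   # $s = "H" + "i"
print(modflag)   # 1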
def parsenamedacts(pattern, intext):
    namedacts = re.findall(pattern, intext)
    namedacts = list(set(namedacts))
    outtext = intext
    for namedact in namedacts:
        #outtext = outtext.replace(namedact+r'@/', encode_act(namedact)+r'@/')
        outtext = outtext.replace(r'ref-namedact-'+namedact, r'ref-namedact-'+encode_act(namedact))
    return outtext
def on_call(self, call, process): if self.checkEvent and self.lastapi == "CryptHashData": if call["api"] == "NtOpenEvent": event = self.get_argument(call, "EventName") event = event.split("\\") if len(event) == 2: if event[1] in self.hashes and event[0] in ["Global", "Local"]: self.found = True if call["api"] == "GetVolumeNameForVolumeMountPointW": if call["status"]: name = self.get_argument(call, "VolumeName") if name and len(name) > 10: name = name[10:-1] if name not in self.volumes: self.volumes.add(name) md5 = hashlib.md5(name).hexdigest()[:16].upper() self.hashes.add(md5) elif call["api"] == "CryptHashData": if self.hashes: buf = self.get_argument(call, "Buffer") if buf and all(word in buf for word in self.keywords): # Try/Except handles when this behavior changes in the future try: args = parse_qs(urlparse("/?" + buf).query, keep_blank_values=True) except: self.sigchanged = True self.severity = 1 self.description = "Potential Locky ransomware behavioral characteristics observed. (See Note)" self.data.append({"Note": "Unexpected behavior observed for Locky. Please " \ "report this sample to https://github.com/spende" \ "rsandbox/community-modified/issues"}) if args and "id" in args.keys(): if args["id"][0] in self.hashes: self.found = process["process_id"] if "affid" in args: tmp = {"Affid": args["affid"][0]} if tmp not in self.data: self.data.append(tmp) elif buf in self.volumes and self.lastapi == "GetVolumeNameForVolumeMountPointW": checkEvent = True else: check = re.findall(r"\s((?:https?://)?\w+(?:\.onion|\.tor2web)[/.](?:\w+\/)?)", buf, re.I) if check: for payment in check: self.payment.add(payment) elif call["api"] == "InternetCrackUrlA": if self.found and process["process_id"] == self.found: url = self.get_argument(call, "Url") if url and url.endswith(".php"): self.c2s.add(url)
def on_complete(self):
    for screenshot in self.get_results("screenshots", []):
        if "ocr" in screenshot:
            ocr = screenshot["ocr"].lower()
            patterns = "|".join(indicators)
            if len(re.findall(patterns, ocr)) > 1:
                self.mark_ioc("message", ocr)

    return self.has_marks()
def on_call(self, call, process): if call["api"] == "NtWriteFile": filescore = 0 buff = self.get_raw_argument(call, "Buffer").lower() filepath = self.get_raw_argument(call, "HandleName") patterns = "|".join(self.indicators) if (filepath.lower() == "\\??\\physicaldrive0" or filepath.lower().startswith("\\device\\harddisk")) and len(buff) >= 128: if len(re.findall(patterns, buff)) > 1: if filepath not in self.ransomfile: self.ransomfile.append(filepath)
def do_strings(self):
    strings_path = None
    if self.voptions.basic.dostrings:
        try:
            data = open(self.memfile, "r").read()
        except (IOError, OSError) as e:
            raise CuckooProcessingError("Error opening file %s" % e)
        nulltermonly = self.voptions.basic.get("strings_nullterminated_only", True)
        minchars = self.voptions.basic.get("strings_minchars", 5)
        if nulltermonly:
            apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00"
            strings = re.findall(apat, data)
            upat = "((?:[\x20-\x7e][\x00]){" + str(minchars) + ",})\x00\x00"
            strings += [str(ws.decode("utf-16le")) for ws in re.findall(upat, data)]
            data = None
            f = open(self.memfile + ".strings", "w")
            f.write("\n".join(strings))
            f.close()
            strings_path = self.memfile + ".strings"
        else:
            apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00"
            strings = re.findall(apat, data)
            upat = "(?:[\x20-\x7e][\x00]){" + str(minchars) + ",}"
            strings += [str(ws.decode("utf-16le")) for ws in re.findall(upat, data)]
            data = None
            f = open(self.memfile + ".strings", "w")
            f.write("\n".join(strings))
            f.close()
            strings_path = self.memfile + ".strings"

        if self.voptions.basic.zipstrings:
            try:
                f = zipfile.ZipFile("%s.zip" % (strings_path), "w", allowZip64=True)
                f.write(strings_path, os.path.basename(strings_path), zipfile.ZIP_DEFLATED)
                f.close()
                os.remove(strings_path)
                strings_path = "%s.zip" % (strings_path)
            except Exception as e:
                raise CuckooProcessingError("Error creating Process Memory Strings Zip File %s" % e)
def is_workshop(classified_event): trimmed_title = classified_event.processed_title.delete_with_rule(rules.WRONG_CLASS) if classified_event.processed_text.get_tokens(dance_keywords.ROMANCE): has_class_title = trimmed_title.get_tokens(rules.ROMANCE_EXTENDED_CLASS_ONLY) else: has_class_title = trimmed_title.get_tokens(dance_keywords.CLASS_ONLY) has_good_dance_class_title = trimmed_title.has_token(rules.GOOD_DANCE_CLASS) has_non_dance_event_title = classified_event.processed_title.has_token(keywords.BAD_COMPETITION_TITLE_ONLY) has_good_dance_title = trimmed_title.has_token(rules.GOOD_DANCE) has_extended_good_crew_title = trimmed_title.has_token(rules.MANUAL_DANCER[grammar.STRONG_WEAK]) has_wrong_style_title = classified_event.processed_title.has_token(all_styles.DANCE_WRONG_STYLE_TITLE) final_title = classified_event.processed_title.get_tokenized_text() lee_lee_hiphop = 'lee lee' in final_title and re.findall('hip\W?hop', final_title) trimmed_text = classified_event.processed_text.delete_with_rule(rules.WRONG_CLASS) has_good_dance_class = trimmed_text.has_token(rules.GOOD_DANCE_CLASS) has_good_dance = classified_event.processed_text.has_token(rules.GOOD_DANCE) has_wrong_style = classified_event.processed_text.has_token(all_styles.DANCE_WRONG_STYLE_TITLE) has_good_crew = classified_event.processed_text.has_token(rules.MANUAL_DANCER[grammar.STRONG]) # print has_class_title # print has_good_dance_title # print has_extended_good_crew_title # print has_wrong_style_title # print classified_event.processed_text.get_tokenized_text() # print '' # print has_class_title # print has_wrong_style # print has_good_dance # print has_good_crew if has_class_title and (has_good_dance_title or has_extended_good_crew_title) and not has_wrong_style_title: return ( True, 'has class with strong class-title: %s %s' % (has_class_title, (has_good_dance_title or has_extended_good_crew_title)) ) elif classified_event.is_dance_event( ) and has_good_dance_title and has_extended_good_crew_title and not has_wrong_style_title and not has_non_dance_event_title: return (True, 'has class with strong style-title: %s %s' % (has_good_dance_title, has_extended_good_crew_title)) elif classified_event.is_dance_event() and lee_lee_hiphop and not has_wrong_style_title and not has_non_dance_event_title: return (True, 'has class with strong style-title: %s %s' % (has_good_dance_title, has_extended_good_crew_title)) elif has_class_title and not has_wrong_style and (has_good_dance or has_good_crew): return (True, 'has class title: %s, that contains strong description %s, %s' % (has_class_title, has_good_dance, has_good_crew)) elif has_good_dance_class_title: return (True, 'has good dance class title: %s' % has_good_dance_class_title) elif has_good_dance_class and not has_wrong_style_title: return (True, 'has good dance class: %s' % has_good_dance_class) return (False, 'nothing')
def handleEvent(self, event): eventName = event.eventType srcModuleName = event.module eventData = event.data self.sf.debug("Received event, " + eventName + ", from " + srcModuleName) pat = re.compile("([\%a-zA-Z\.0-9_\-\+]+@[a-zA-Z\.0-9\-]+\.[a-zA-Z\.0-9\-]+)") matches = re.findall(pat, eventData) myres = list() for match in matches: evttype = "EMAILADDR" if len(match) < 4: self.sf.debug("Likely invalid address: " + match) continue # Handle messed up encodings if "%" in match: self.sf.debug("Skipped address: " + match) continue # Get the domain and strip potential ending . mailDom = match.lower().split('@')[1].strip('.') if not self.getTarget().matches(mailDom) and not self.getTarget().matches(match): self.sf.debug("External domain, so possible affiliate e-mail") evttype = "AFFILIATE_EMAILADDR" if eventName.startswith("AFFILIATE_"): evttype = "AFFILIATE_EMAILADDR" self.sf.info("Found e-mail address: " + match) if type(match) == str: mail = unicode(match.strip('.'), 'utf-8', errors='replace') else: mail = match.strip('.') if mail in myres: self.sf.debug("Already found from this source.") continue else: myres.append(mail) evt = SpiderFootEvent(evttype, mail, self.__name__, event) if event.moduleDataSource: evt.moduleDataSource = event.moduleDataSource else: evt.moduleDataSource = "Unknown" self.notifyListeners(evt) return None
def get_tags(src, tags='page,title,revision,text'):
    # find namespace (eg: http://www.mediawiki.org/xml/export-0.3/)
    try:
        root = src.readline() + src.readline()
        ns = unicode(re.findall(r'xmlns="([^"]*)', root)[0])
        tag_prefix = u'{%s}' % (ns,)
        tag = {}
        for t in tags.split(','):
            tag[t] = tag_prefix + unicode(t)
    finally:
        src.seek(0)
    return tag
def on_complete(self): matches = [r"(https?:\/\/)?([\da-z\.-]+)\.([0-9a-z\.]{2,6})(:\d{1,5})?([\/\w\.-]*)\/?"] whitelist = [ "http://crl.microsoft.com", "http://www.microsoft.com", "asm.v1", "asm.v3", "verisign.com", "symantec.com", "thawte.com", ] dedup = list() extracted_data = False for potential_ioc in self.iocs: for entry in matches: all_matches = re.findall(entry, potential_ioc) if all_matches: for buf in all_matches: ioc = "" idx = 0 for tmp in buf: idx += 1 if tmp == "": pass # Account for match groups and the second # (or third depending on match) period as a # delimiter. We need to add it in manually. if idx == 2: ioc += tmp + "." else: ioc += tmp addit = True for item in whitelist: if item in ioc: addit = False if addit and ioc not in dedup: dedup.append(ioc) if dedup: extracted_data = True for ioc in dedup: self.data.append({"ioc": ioc}) return extracted_data
def on_complete(self): if "dropped" in self.results: for dropped in self.results["dropped"]: mimetype = dropped["type"] if "ASCII text" in mimetype: filename = dropped["name"] data = dropped["data"] patterns = "|".join(self.indicators) if len(data) >= 128: if len(re.findall(patterns, data)) > 1: if filename not in self.ransomfile: self.ransomfile.append(filename) if len(self.ransomfile) > 0: for filename in self.ransomfile: self.data.append({"ransom_file" : "%s" % (filename)}) return True return False
def get_namespaces(src):
    try:
        counter = 0
        namespaces = []
        while 1:
            line = src.readline()
            if not line:
                break
            keys = re.findall(r'<namespace key="(-?\d+)"[^>]*>([^<]*)</namespace>', line)
            for key, ns in keys:
                namespaces.append((key, ns))
            counter += 1
            if counter > 40:
                break
    finally:
        src.seek(0)
    return namespaces
def search(self, regex, flags=0, all=False): if all: result = dict() result["detail"] = [] matches = [] for map in self.address_space: for chunk in map["chunks"]: self.dumpfile.seek(chunk["offset"]) match = re.findall(regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags) if match: matches.extend(match) result["detail"].append({"match": match, "chunk": chunk}) result["matches"] = matches return result else: for map in self.address_space: for chunk in map["chunks"]: self.dumpfile.seek(chunk["offset"]) match = re.search(regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags) if match: result = dict() result["match"] = match result["chunk"] = chunk return result
def run(self): """Run analysis. @return: structured results. """ self.key = "procmemory" results = [] zipdump = self.options.get("zipdump", False) zipstrings = self.options.get("zipstrings", False) do_strings = self.options.get("strings", False) nulltermonly = self.options.get("nullterminated_only", True) minchars = self.options.get("minchars", 5) if os.path.exists(self.pmemory_path): for dmp in os.listdir(self.pmemory_path): # if we're re-processing this task, this means if zips are enabled, we won't do any reprocessing on the # process dumps (only matters for now for Yara) if not dmp.endswith(".dmp"): continue dmp_path = os.path.join(self.pmemory_path, dmp) dmp_file = File(dmp_path) process_name = "" process_path = "" process_id = int(os.path.splitext(os.path.basename(dmp_path))[0]) if "behavior" in self.results and "processes" in self.results["behavior"]: for process in self.results["behavior"]["processes"]: if process_id == process["process_id"]: process_name = process["process_name"] process_path = process["module_path"] proc = dict( file=dmp_path, pid=process_id, name=process_name, path=process_path, yara=dmp_file.get_yara(os.path.join(CUCKOO_ROOT, "data", "yara", "index_memory.yar")), address_space=self.parse_dump(dmp_path), zipdump=zipdump, zipstrings=zipstrings, ) if do_strings: try: data = open(dmp_path, "r").read() except (IOError, OSError) as e: raise CuckooProcessingError("Error opening file %s" % e) if nulltermonly: apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00" strings = re.findall(apat, data) upat = "((?:[\x20-\x7e][\x00]){" + str(minchars) + ",})\x00\x00" strings += [str(ws.decode("utf-16le")) for ws in re.findall(upat, data)] f = open(dmp_path + ".strings", "w") f.write("\n".join(strings)) f.close() proc["strings_path"] = dmp_path + ".strings" else: apat = "([\x20-\x7e]{" + str(minchars) + ",})\x00" strings = re.findall(apat, data) upat = "(?:[\x20-\x7e][\x00]){" + str(minchars) + ",}" strings += [str(ws.decode("utf-16le")) for ws in re.findall(upat, data)] f = open(dmp_path + ".strings", "w") f.write("\n".join(strings)) f.close() proc["strings_path"] = dmp_path + ".strings" zipstrings = self.options.get("zipstrings", False) if zipstrings: try: f = zipfile.ZipFile("%s.zip" % (proc["strings_path"]), "w") f.write(proc["strings_path"], os.path.basename(proc["strings_path"]), zipfile.ZIP_DEFLATED) f.close() os.remove(proc["strings_path"]) proc["strings_path"] = "%s.zip" % (proc["strings_path"]) except: raise CuckooProcessingError("Error creating Process Memory Strings Zip File %s" % e) # Deduplicate configs if proc["yara"]: for match in proc["yara"]: # Dyre if match["name"] == "DyreCfgInjectsList": output = list() buf = "" recline = False for ystring in match["strings"]: for line in ystring.splitlines(): if line.startswith("<litem>"): buf = "" recline = True if recline: buf += line.strip() + "\n" if line.startswith("</litem>"): recline = False if buf not in output: output.append(buf) match["strings"] = ["".join(output)] match["meta"]["description"] += " (Observed %d unique inject elements)" % len(output) elif match["name"] == "DyreCfgRedirectList": output = list() buf = "" recline = False for ystring in match["strings"]: for line in ystring.splitlines(): if line.startswith("<rpcgroup>"): buf = "" recline = True if recline: buf += line.strip() + "\n" if line.startswith("</rpcgroup>"): recline = False if buf not in output: output.append(buf) match["strings"] = ["".join(output)] match["meta"]["description"] += " (Observed %d unique redirect elements)" % len(output) # 
DarkComet elif match["name"] == "DarkCometConfig": output = list() recline = False for ystring in match["strings"]: for line in ystring.splitlines(): if line.startswith("#BEGIN DARKCOMET"): buf = "" recline = True if recline: buf += line.strip() + "\n" if line.startswith("#EOF DARKCOMET"): recline = False if buf not in output: output.append(buf) match["strings"] = ["".join(output)] if zipdump: try: f = zipfile.ZipFile("%s.zip" % (dmp_path), "w") f.write(dmp_path, os.path.basename(dmp_path), zipfile.ZIP_DEFLATED) f.close() os.remove(dmp_path) proc["file"] = "%s.zip" % (dmp_path) except: raise CuckooProcessingError("Error creating Process Memory Zip File %s" % e) results.append(proc) return results
def run(self): """Run evented signatures.""" # This will contain all the matched signatures. matched = [] stats = { } complete_list = list_plugins(group="signatures") evented_list = [sig(self.results) for sig in complete_list if sig.enabled and sig.evented and self._check_signature_version(sig) and (not sig.filter_analysistypes or self.results["target"]["category"] in sig.filter_analysistypes)] overlay = self._load_overlay() log.debug("Applying signature overlays for signatures: %s", ", ".join(overlay.keys())) for signature in complete_list + evented_list: self._apply_overlay(signature, overlay) if evented_list and "behavior" in self.results: log.debug("Running %u evented signatures", len(evented_list)) for sig in evented_list: stats[sig.name] = timedelta() if sig == evented_list[-1]: log.debug("\t `-- %s", sig.name) else: log.debug("\t |-- %s", sig.name) # Iterate calls and tell interested signatures about them. for proc in self.results["behavior"]["processes"]: for call in proc["calls"]: # Loop through active evented signatures. for sig in evented_list: # Skip current call if it doesn't match the filters (if any). if sig.filter_processnames and not proc["process_name"] in sig.filter_processnames: continue if sig.filter_apinames and not call["api"] in sig.filter_apinames: continue if sig.filter_categories and not call["category"] in sig.filter_categories: continue result = None try: pretime = datetime.now() result = sig.on_call(call, proc) posttime = datetime.now() timediff = posttime - pretime stats[sig.name] += timediff except NotImplementedError: result = False except: log.exception("Failed to run signature \"%s\":", sig.name) result = False # If the signature returns None we can carry on, the # condition was not matched. if result is None: continue # On True, the signature is matched. if result is True: log.debug("Analysis matched signature \"%s\"", sig.name) matched.append(sig.as_result()) if sig in complete_list: complete_list.remove(sig) # Either True or False, we don't need to check this sig anymore. evented_list.remove(sig) del sig # Call the stop method on all remaining instances. for sig in evented_list: try: pretime = datetime.now() result = sig.on_complete() posttime = datetime.now() timediff = posttime - pretime stats[sig.name] += timediff except NotImplementedError: continue except: log.exception("Failed run on_complete() method for signature \"%s\":", sig.name) continue else: if result is True: log.debug("Analysis matched signature \"%s\"", sig.name) matched.append(sig.as_result()) if sig in complete_list: complete_list.remove(sig) # Link this into the results already at this point, so non-evented signatures can use it self.results["signatures"] = matched # Add in statistics for evented signatures that took at least some time for key, value in stats.iteritems(): if value: self.results["statistics"]["signatures"].append({ "name": key, "time": float("%d.%03d" % (value.seconds, value.microseconds / 1000)), }) # Compat loop for old-style (non evented) signatures. if complete_list: complete_list.sort(key=lambda sig: sig.order) log.debug("Running non-evented signatures") for signature in complete_list: if not signature.filter_analysistypes or self.results["target"]["category"] in signature.filter_analysistypes: match = self.process(signature) # If the signature is matched, add it to the list. 
if match: matched.append(match) # Reset the ParseProcessLog instances after each signature if "behavior" in self.results: for process in self.results["behavior"]["processes"]: process["calls"].reset() # Sort the matched signatures by their severity level. matched.sort(key=lambda key: key["severity"]) # Tweak later as needed malscore = 0.0 for match in matched: if match["severity"] == 1: malscore += match["weight"] * 0.5 * (match["confidence"] / 100.0) else: malscore += match["weight"] * (match["severity"] - 1) * (match["confidence"] / 100.0) if malscore > 10.0: malscore = 10.0 if malscore < 0.0: malscore = 0.0 self.results["malscore"] = malscore family = "" # Make a best effort detection of malware family name (can be updated later by re-processing the analysis) for match in matched: if "families" in match and match["families"]: family = match["families"][0].title() break if not family and self.results["info"]["category"] == "file" and "virustotal" in self.results and "results" in self.results["virustotal"] and self.results["virustotal"]["results"]: detectnames = [] for res in self.results["virustotal"]["results"]: if res["sig"]: # weight Microsoft's detection, they seem to be more accurate than the rest if res["vendor"] == "Microsoft": detectnames.append(res["sig"]) detectnames.append(res["sig"]) family = get_vt_consensus(detectnames) # add detection based on suricata here if not family and "suricata" in self.results and "alerts" in self.results["suricata"] and self.results["suricata"]["alerts"]: for alert in self.results["suricata"]["alerts"]: if "signature" in alert and alert["signature"]: if alert["signature"].startswith("ET TROJAN") or alert["signature"].startswith("ETPRO TROJAN"): words = re.findall(r"[A-Za-z0-9]+", alert["signature"]) famcheck = words[2] famchecklower = famcheck.lower() if famchecklower == "win32" or famchecklower == "w32": famcheck = words[3] famchecklower = famcheck.lower() blacklist = [ "upx", "executable", "potential", "likely", "rogue", "supicious", "generic", "possible", "known", "common", "troj", "trojan", "team", "probably", "w2km", "http", "abuse.ch", "win32", "unknown", "single", "exe", "filename", "js", ] isgood = True for black in blacklist: if black == famchecklower: isgood = False break if isgood: famcheck = famcheck.split(".")[0] family = famcheck.title() # fall back to ClamAV detection if not family and self.results["info"]["category"] == "file" and "clamav" in self.results["target"]["file"] and self.results["target"]["file"]["clamav"] and self.results["target"]["file"]["clamav"].startswith("Win.Trojan."): words = re.findall(r"[A-Za-z0-9]+", self.results["target"]["file"]["clamav"]) family = words[2] self.results["malfamily"] = family
def feed(self, html_source): html_source = html_source.replace("\n", "") html_source = html_source.replace("\r", "") html_source = html_source.replace("\t", "") links = re.findall('<a.*?>', html_source) link_attributes = [] for link in links: link_attributes.append(self.__findTagAttributes(link)) #Finding all the forms: getting the text from "<form..." to "...</form>" #the array forms will contain all the forms of the page forms = re.findall('<form.*?>.*?</form>', html_source) forms_attributes = [] for form in forms: forms_attributes.append(self.__findTagAttributes(form)) #Processing the forms, obtaining the method and all the inputs #Also finding the method of the forms inputs_in_forms = [] text_areas_in_forms = [] selects_in_forms = [] for form in forms: inputs_in_forms.append(re.findall('<input.*?>', form)) text_areas_in_forms.append(re.findall('<textarea.*?>', form)) selects_in_forms.append(re.findall('<select.*?>', form)) #Extracting the attributes of the <input> tag as XML parser inputs_attributes = [] for i in xrange(len(inputs_in_forms)): inputs_attributes.append([]) for inputt in inputs_in_forms[i]: inputs_attributes[i].append(self.__findTagAttributes(inputt)) selects_attributes = [] for i in xrange(len(selects_in_forms)): selects_attributes.append([]) for select in selects_in_forms[i]: selects_attributes[i].append(self.__findTagAttributes(select)) textareas_attributes = [] for i in xrange(len(text_areas_in_forms)): textareas_attributes.append([]) for textArea in text_areas_in_forms[i]: textareas_attributes[i].append(self.__findTagAttributes(textArea)) if self.verbose == 3: print('') print('') print(_("Forms")) print("=====") for i in xrange(len(forms)): print(_("Form {0}").format(str(i))) tmpdict = {} for k, v in dict(forms_attributes[i]).items(): tmpdict[k.lower()] = v print(_(" * Method: {0}").format(self.__decode_htmlentities(tmpdict['action']))) print(_(" * Intputs:")) for j in xrange(len(inputs_in_forms[i])): print(u" + " + inputs_in_forms[i][j]) for att in inputs_attributes[i][j]: print(u" - " + str(att)) print(_(" * Selects:")) for j in xrange(len(selects_in_forms[i])): print(u" + " + selects_in_forms[i][j]) for att in selects_attributes[i][j]: print(u" - " + str(att)) print(_(" * TextAreas:")) for j in xrange(len(text_areas_in_forms[i])): print(u" + " + text_areas_in_forms[i][j]) for att in textareas_attributes[i][j]: print(u" - " + str(att)) print('') print(_("URLS")) print("====") for i in xrange(len(links)): tmpdict = {} for k, v in dict(link_attributes[i]).items(): tmpdict[k.lower()] = v if "href" in tmpdict: self.liens.append(self.__decode_htmlentities(tmpdict['href'])) if self.verbose == 3: print(self.__decode_htmlentities(tmpdict['href'])) for i in xrange(len(forms)): tmpdict = {} for k, v in dict(forms_attributes[i]).items(): tmpdict[k.lower()] = v self.form_values = [] if "action" in tmpdict: self.liens.append(self.__decode_htmlentities(tmpdict['action'])) self.current_form_url = self.__decode_htmlentities(tmpdict['action']) # Forms use GET method by default self.current_form_method = "get" if "method" in tmpdict: if tmpdict["method"].lower() == "post": self.current_form_method = "post" for j in xrange(len(inputs_attributes[i])): tmpdict = {} for k, v in dict(inputs_attributes[i][j]).items(): tmpdict[k.lower()] = v if "type" not in tmpdict: tmpdict["type"] = "text" if "name" in tmpdict: if tmpdict['type'].lower() in \ ['text', 'password', 'radio', 'checkbox', 'hidden', 'submit', 'search']: # use default value if present or set it to 'on' if "value" in tmpdict: if 
tmpdict["value"] != "": val = tmpdict["value"] else: val = u"on" else: val = u"on" self.form_values.append([tmpdict['name'], val]) if tmpdict['type'].lower() == "file": self.uploads.append(self.current_form_url) for j in xrange(len(textareas_attributes[i])): tmpdict = {} for k, v in dict(textareas_attributes[i][j]).items(): tmpdict[k.lower()] = v if "name" in tmpdict: self.form_values.append([tmpdict['name'], u'on']) for j in xrange(len(selects_attributes[i])): tmpdict = {} for k, v in dict(selects_attributes[i][j]).items(): tmpdict[k.lower()] = v if "name" in tmpdict: self.form_values.append([tmpdict['name'], u'on']) if self.current_form_method == "post": self.forms.append((self.current_form_url, self.form_values)) else: l = ["=".join([k, v]) for k, v in self.form_values] l.sort() self.liens.append(self.current_form_url.split("?")[0] + "?" + "&".join(l))
def getWrittenUrls(data):
    urls = re.findall("(?P<url>https?://[^\|]+)\|", data)
    if urls:
        return urls
    return []
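A short sanity check of getWrittenUrls, assuming the definition above is in scope; the pipe-delimited buffer is a made-up example of the written-memory contents the signature looks for.

import re

buf = "config|http://example.one/gate.php|http://example.two/panel|end"
print(getWrittenUrls(buf))
# ['http://example.one/gate.php', 'http://example.two/panel']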
def on_call(self, call, process): if call["api"] == "CreateProcessInternalW": flags = int(self.get_argument(call, "CreationFlags"), 16) if flags & 0x4: handle = self.get_argument(call, "ProcessHandle") self.suspended[handle] = self.get_argument(call, "ProcessId") elif call["api"] == "WriteProcessMemory": buf = self.get_argument(call, "Buffer") if any(string in buf for string in self.bufContents): handle = self.get_argument(call, "ProcessHandle") if handle in self.suspended: for pHandle in self.suspended: if pHandle == handle: self.badPid = self.suspended[pHandle] break check = getWrittenUrls(buf) if len(check) >= 2: self.c2s = check elif call["api"] == "NtClose": if call["status"]: handle = self.get_argument(call, "Handle") if handle in self.suspended: del self.suspended[handle] elif call["api"] == "RtlDecompressBuffer": buf = self.get_argument(call, "UncompressedBuffer") if "Cookie:disclaimer_accepted=true" in buf: self.badPid = str(process["process_id"]) check = getWrittenUrls(buf) if len(check) >= 2: self.c2s = check elif call["api"] == "InternetCrackUrlA": if process["process_id"] == self.badPid and self.netSequence == 0: if call["status"]: self.currentUrl = self.get_argument(call, "Url") self.netSequence += 1 elif call["api"] == "HttpOpenRequestA": if process["process_id"] == self.badPid and self.netSequence == 1: if call["status"]: method = self.get_argument(call, "Verb") if method and method == "POST": self.netSequence += 1 elif call["api"] == "HttpSendRequestA": if process["process_id"] == self.badPid and self.netSequence == 2: pData = self.get_argument(call, "PostData") if pData and all(word in pData for word in self.keywords): self.found = True c2 = {"C2": self.currentUrl} if c2 not in self.data: self.data.append(c2) self.netSequence = 0 elif call["api"] == "InternetReadFile": if call["status"] and str(process["process_id"]) == self.badPid: buf = self.get_argument(call, "Buffer") if buf and buf.startswith("{") and buf.strip().endswith("}"): check = re.findall(":(?P<url>https?://[^\}]+)\}", buf) if check: self.c2s += check return None