def analyze_url(self, info):
    m_parsed_url = info.parsed_url
    m_results = []

    Logger.log_more_verbose("Processing URL: %s" % m_parsed_url)

    #----------------------------------------------------------------------
    # Find suspicious URLs by matching against known substrings.

    # Load the wordlists.
    m_wordlist_middle = WordListLoader.get_wordlist_as_raw(
        Config.plugin_config['middle'])
    m_wordlist_extensions = WordListLoader.get_wordlist_as_raw(
        Config.plugin_config['extensions'])

    # Add keywords that match any directory, the file name
    # or the extension of the URL.
    m_results.extend([
        SuspiciousURLPath(info, x)
        for x in m_wordlist_middle
        if x in m_parsed_url.directory.split("/") or
           x == m_parsed_url.filebase or
           x == m_parsed_url.extension
    ])

    # Add keywords that match the file extension of the URL.
    m_results.extend([
        SuspiciousURLPath(info, x)
        for x in m_wordlist_extensions
        if m_parsed_url.extension == x
    ])

    #----------------------------------------------------------------------
    # Find suspicious URLs by calculating the Shannon entropy of the hostname.
    # Idea from: https://github.com/stricaud/urlweirdos/blob/master/src/urlw/plugins/shannon/__init__.py
    # TODO: test with Unicode-enabled hostnames!

    # Check the Shannon entropy of the whole hostname.
    hostname = m_parsed_url.hostname
    entropy = calculate_shannon_entropy(hostname)
    if entropy > 4.0:
        m_results.append(SuspiciousURLPath(info, hostname))

    # Check the Shannon entropy of each subdomain label.
    for subdomain in hostname.split('.'):
        if len(subdomain) > 3:
            entropy = calculate_shannon_entropy(subdomain)
            if entropy > 4.0:
                m_results.append(SuspiciousURLPath(info, subdomain))

    return m_results
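# The entropy checks above rely on calculate_shannon_entropy(), which is
# defined elsewhere in the codebase. As a reference, here is a minimal
# sketch of the standard character-level Shannon entropy formula; this is
# an illustrative implementation, not necessarily the one the plugin uses.

import math
from collections import Counter

def calculate_shannon_entropy(data):
    """
    Character-level Shannon entropy of a string, in bits.
    Empty strings score 0.0 by convention.
    """
    if not data:
        return 0.0
    length = float(len(data))
    # H = -sum(p * log2(p)) over the frequency p of each character.
    return -sum(
        (count / length) * math.log(count / length, 2)
        for count in Counter(data).values()
    )

# Since the maximum entropy of an n-character string is log2(n), only
# hostnames of at least 17 characters can cross the 4.0 threshold:
# "www.example.com" scores about 3.2 bits, while a 32-character random
# base32 label (as produced by many DGAs) scores close to 5 bits.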
def analyze_html(self, info):

    #----------------------------------------------------------------------
    # Get malware suspicious links.

    Logger.log_more_verbose("Processing HTML: %s" % info.identity)

    # Load the malware wordlist.
    wordlist_filename = Config.plugin_config["malware_sites"]
    try:
        wordlist = WordListLoader.get_wordlist_as_list(wordlist_filename)
    except WordlistNotFound:
        Logger.log_error("Wordlist '%s' not found." % wordlist_filename)
        return
    except TypeError:
        Logger.log_error("Wordlist '%s' is not a file." % wordlist_filename)
        return
    if not wordlist:
        Logger.log_error("Wordlist '%s' is empty." % wordlist_filename)
        return

    # Get the links.
    base_urls = set()
    for url in info.find_linked_data(Data.TYPE_RESOURCE,
                                     Resource.RESOURCE_URL):
        m_url = url.url
        base_urls.add(m_url)
        if info.information_type == Information.INFORMATION_HTML:
            m_links = extract_from_html(info.raw_data, m_url)
            m_links.update(extract_from_text(info.raw_data, m_url))
        elif info.information_type == Information.INFORMATION_PLAIN_TEXT:
            m_links = extract_from_text(info.raw_data, m_url)
        else:
            raise Exception("Internal error!")
    m_links.difference_update(base_urls)

    # If we have no links, abort now.
    if not m_links:
        Logger.log_verbose("No output links found.")
        return

    # Do not follow URLs that contain certain keywords.
    m_forbidden = WordListLoader.get_wordlist_as_raw(
        Config.plugin_config["wordlist_no_spider"])
    m_urls_allowed = {
        url for url in m_links
        if url and not any(x in url for x in m_forbidden)
    }

    # Keep only the links that point outside the audit scope.
    m_output_links = []
    for url in m_urls_allowed:
        try:
            if url not in Config.audit_scope:
                m_output_links.append(url)
        except Exception:
            Logger.log_error_more_verbose(format_exc())
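# The excerpt ends right after the out-of-scope links are collected, before
# the malware-site wordlist loaded above is actually applied to them. Purely
# as a hypothetical sketch of that final step (match_malware_sites is an
# invented name, not part of GoLismero's API), the comparison could look
# like this:

def match_malware_sites(output_links, malware_sites):
    # Return (url, site) pairs where a known malware site from the
    # wordlist appears as a substring of an output link.
    matches = []
    for url in output_links:
        for site in malware_sites:
            site = site.strip()
            if site and site in url:
                matches.append((url, site))
                break  # one hit per URL is enough
    return matches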