def recv_info(self, info):
    m_parsed_url = info.parsed_url
    m_results = []

    #------------------------------------------------------------------
    # Find suspicious URLs by matching against known substrings.

    # Load wordlists
    m_wordlist_middle     = WordListLoader.get_wordlist(Config.plugin_config['middle'])
    m_wordlist_extensions = WordListLoader.get_wordlist(Config.plugin_config['extensions'])

    # Add keywords matching any path component of the URL.
    m_results.extend([SuspiciousURLPath(info, x)
                      for x in m_wordlist_middle
                      if x in m_parsed_url.directory.split("/") or
                         x == m_parsed_url.filebase or
                         x == m_parsed_url.extension])

    # Add keywords matching the URL file extension.
    m_results.extend([SuspiciousURLPath(info, x)
                      for x in m_wordlist_extensions
                      if m_parsed_url.extension == x])

    #------------------------------------------------------------------
    # Find suspicious URLs by calculating the Shannon entropy of the hostname.
    # Idea from: https://github.com/stricaud/urlweirdos/blob/master/src/urlw/plugins/shannon/__init__.py
    # TODO: test with unicode enabled hostnames!

    # Check the Shannon entropy for the hostname.
    hostname = info.parsed_url.hostname
    entropy = calculate_shannon_entropy(hostname)
    if entropy > 4.0:
        m_results.append(SuspiciousURLPath(info, hostname))

    # Check the Shannon entropy for the subdomains.
    for subdomain in info.parsed_url.hostname.split('.'):
        if len(subdomain) > 3:
            entropy = calculate_shannon_entropy(subdomain)
            if entropy > 4.0:
                m_results.append(SuspiciousURLPath(info, subdomain))

    #------------------------------------------------------------------
    #
    # Get malware suspicious links
    #
    #------------------------------------------------------------------
    p = None
    m_url = info.url

    Logger.log_more_verbose("Looking for output links to malware sites")

    try:
        allow_redirects = Config.audit_config.follow_redirects or \
                          (info.depth == 0 and Config.audit_config.follow_first_redirect)
        p = download(m_url, self.check_download, allow_redirects=allow_redirects)
    except NetworkException, e:
        Logger.log_more_verbose("Error while processing %r: %s" % (m_url, str(e)))
def recv_info(self, info):
    m_parsed_url = info.parsed_url
    m_results = []

    #------------------------------------------------------------------
    # Find suspicious URLs by matching against known substrings.

    # Load wordlists
    m_wordlist_middle = WordListLoader.get_wordlist(
        Config.plugin_config['middle'])
    m_wordlist_extensions = WordListLoader.get_wordlist(
        Config.plugin_config['extensions'])

    # Add keywords matching any path component of the URL.
    m_results.extend([
        SuspiciousURL(info, x)
        for x in m_wordlist_middle
        if x in m_parsed_url.directory.split("/") or
           x == m_parsed_url.filebase or
           x == m_parsed_url.extension
    ])

    # Add keywords matching the URL file extension.
    m_results.extend([
        SuspiciousURL(info, x)
        for x in m_wordlist_extensions
        if m_parsed_url.extension == x
    ])

    #------------------------------------------------------------------
    # Find suspicious URLs by calculating the Shannon entropy of the hostname.
    # Idea from: https://github.com/stricaud/urlweirdos/blob/master/src/urlw/plugins/shannon/__init__.py
    # TODO: test with unicode enabled hostnames!

    # Check the Shannon entropy for the hostname.
    hostname = info.parsed_url.hostname
    entropy = calculate_shannon_entropy(hostname)
    if entropy > 4.0:
        m_results.append(SuspiciousURL(info, hostname))

    # Check the Shannon entropy for the subdomains.
    for subdomain in info.parsed_url.hostname.split('.'):
        if len(subdomain) > 3:
            entropy = calculate_shannon_entropy(subdomain)
            if entropy > 4.0:
                m_results.append(SuspiciousURL(info, subdomain))

    #------------------------------------------------------------------
    return m_results
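# ---------------------------------------------------------------------------
# NOTE: calculate_shannon_entropy() is provided by the framework and is not
# shown in these snippets. The sketch below is only an illustration of what
# such a helper typically computes (character-level Shannon entropy, in bits
# per character); the real implementation may differ in details such as
# unicode handling. With this definition, a short dictionary word like
# "google" scores well below the 4.0-bit threshold used above, while a long
# random-looking label (e.g. a DGA-style subdomain) scores close to the
# maximum possible for its alphabet.

from collections import Counter
from math import log

def calculate_shannon_entropy(text):
    """Return the Shannon entropy of a string, in bits per character."""
    if not text:
        return 0.0
    length = float(len(text))
    # H = -sum(p * log2(p)) over the observed character frequencies.
    return -sum((count / length) * log(count / length, 2)
                for count in Counter(text).values())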
def load_wordlists(wordlists):
    """
    Load the wordlists whose names are passed as parameter.

    This function receives a list of wordlist names, as defined in the
    plugin configuration file, and returns a dict with wordlist instances.

    :param wordlists: list with wordlist names
    :type wordlists: list

    :returns: A dict with wordlists
    :rtype: dict
    """
    m_tmp_wordlist = {}

    # Get the wordlists to load
    for l_w in wordlists:
        for wordlist_family, l_wordlists in Config.plugin_extra_config.iteritems():
            if wordlist_family.lower() in l_w.lower():
                m_tmp_wordlist[l_w] = l_wordlists

    # Load the wordlists
    m_return = {}
    for k, w_paths in m_tmp_wordlist.iteritems():
        m_return[k] = [WordListLoader.get_wordlist(w) for w in w_paths]

    return m_return
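# ---------------------------------------------------------------------------
# Hypothetical usage sketch for load_wordlists(). The configuration keys and
# file names below are invented for illustration only; real plugin
# configurations will differ.
#
#   Config.plugin_extra_config == {
#       "middle":     ["wordlists/middle_keywords.txt"],
#       "extensions": ["wordlists/suspicious_extensions.txt"],
#   }
#
#   loaded = load_wordlists(["middle", "extensions"])
#
# 'loaded' would then map each requested name to a list of wordlist objects
# returned by WordListLoader.get_wordlist(), e.g.
#   {"middle": [<WordList>], "extensions": [<WordList>]}
# Note that the match is a case-insensitive substring test, so a request such
# as "wordlist_extensions" would also pick up the "extensions" family.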
def analyze_url(self, info):
    m_parsed_url = info.parsed_url
    m_results = []

    Logger.log_more_verbose("Processing URL: %s" % m_parsed_url)

    #----------------------------------------------------------------------
    # Find suspicious URLs by matching against known substrings.

    # Load wordlists
    m_wordlist_middle     = WordListLoader.get_wordlist(Config.plugin_config['middle'])
    m_wordlist_extensions = WordListLoader.get_wordlist(Config.plugin_config['extensions'])

    # Add keywords matching any path component of the URL.
    m_results.extend([SuspiciousURLPath(info, x)
                      for x in m_wordlist_middle
                      if x in m_parsed_url.directory.split("/") or
                         x == m_parsed_url.filebase or
                         x == m_parsed_url.extension])

    # Add keywords matching the URL file extension.
    m_results.extend([SuspiciousURLPath(info, x)
                      for x in m_wordlist_extensions
                      if m_parsed_url.extension == x])

    #----------------------------------------------------------------------
    # Find suspicious URLs by calculating the Shannon entropy of the hostname.
    # Idea from: https://github.com/stricaud/urlweirdos/blob/master/src/urlw/plugins/shannon/__init__.py
    # TODO: test with unicode enabled hostnames!

    # Check the Shannon entropy for the hostname.
    hostname = info.parsed_url.hostname
    entropy = calculate_shannon_entropy(hostname)
    if entropy > 4.0:
        m_results.append(SuspiciousURLPath(info, hostname))

    # Check the Shannon entropy for the subdomains.
    for subdomain in info.parsed_url.hostname.split('.'):
        if len(subdomain) > 3:
            entropy = calculate_shannon_entropy(subdomain)
            if entropy > 4.0:
                m_results.append(SuspiciousURLPath(info, subdomain))

    return m_results
def __detect_wordpress_installation(self, url, wordpress_urls):
    """
    Try to detect a WordPress installation in the current path.

    :param url: URL where to look for the WordPress installation.
    :type url: str

    :param wordpress_urls: name of the wordlist with known WordPress URLs.
    :type wordpress_urls: str

    :return: True if a WordPress installation was found, False otherwise.
    :rtype: bool
    """
    Logger.log_more_verbose(
        "Detecting WordPress installation in URI: '%s'." % url)
    total_urls = 0
    urls_found = 0

    error_page = get_error_page(url).raw_data

    for u in WordListLoader.get_wordlist(wordpress_urls):
        total_urls += 1
        tmp_url = urljoin(url, u)

        r = HTTP.get_url(tmp_url, use_cache=False)
        if r.status == "200":

            # Try to detect non-default error pages
            ratio = get_diff_ratio(r.raw_response, error_page)
            if ratio < 0.35:
                urls_found += 1

        discard_data(r)

    # If fewer than 85% of the known URLs responded, make one last test.
    if (urls_found / float(total_urls)) < 0.85:

        # Check whether "wp-admin/" redirects to the WordPress login page.
        url_wp_admin = urljoin(url, "wp-admin/")
        try:
            p = HTTP.get_url(url_wp_admin, use_cache=False, allow_redirects=False)
            if p:
                discard_data(p)
        except Exception, e:
            return False

        if p.status == "302" and "wp-login.php?redirect_to=" in p.headers.get("Location", ""):
            return True
        else:
            return False

    # Enough known WordPress URLs answered: assume WordPress is installed.
    return True
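# ---------------------------------------------------------------------------
# NOTE: get_diff_ratio() is a framework helper not shown in these snippets.
# A minimal sketch of the usual approach (a difflib similarity ratio between
# the two response bodies) is given below as an assumption of how it behaves;
# the real helper may normalize, truncate or weight the inputs differently.
# Under this reading, "ratio < 0.35" in the code above means the candidate
# page looks substantially different from the site's generic error page, so
# it is counted as a real WordPress file rather than a custom 404.

from difflib import SequenceMatcher

def get_diff_ratio(text_a, text_b):
    """Return a similarity score in [0.0, 1.0]; 1.0 means identical texts."""
    return SequenceMatcher(None, text_a, text_b).quick_ratio()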
def analyze_html(self, info):

    #----------------------------------------------------------------------
    # Get malware suspicious links.

    Logger.log_more_verbose("Processing HTML: %s" % info.identity)

    # Load the malware wordlist.
    wordlist_filename = Config.plugin_config["malware_sites"]
    try:
        wordlist = WordListLoader.get_advanced_wordlist_as_list(
            wordlist_filename)
    except WordlistNotFound:
        Logger.log_error("Wordlist '%s' not found." % wordlist_filename)
        return
    except TypeError:
        Logger.log_error("Wordlist '%s' is not a file." % wordlist_filename)
        return
    if not wordlist:
        Logger.log_error("Wordlist '%s' is empty." % wordlist_filename)

    Logger.log("1")

    # Get links
    base_urls = set()
    for url in info.find_linked_data(Data.TYPE_RESOURCE, Resource.RESOURCE_URL):
        m_url = url.url
        base_urls.add(m_url)
        if info.information_type == Information.INFORMATION_HTML:
            m_links = extract_from_html(info.raw_data, m_url)
            m_links.update(extract_from_text(info.raw_data, m_url))
        elif info.information_type == Information.INFORMATION_PLAIN_TEXT:
            m_links = extract_from_text(info.raw_data, m_url)
        else:
            raise Exception("Internal error!")
    m_links.difference_update(base_urls)

    Logger.log("2")

    # If we have no links, abort now
    if not m_links:
        Logger.log_verbose("No output links found.")
        return

    # Do not follow URLs that contain certain keywords
    m_forbidden = WordListLoader.get_wordlist(
        Config.plugin_config["wordlist_no_spider"])
    m_urls_allowed = {
        url for url in m_links
        if url and not any(x in url for x in m_forbidden)
    }

    Logger.log("3")

    # Get only output links
    m_output_links = []
    for url in m_urls_allowed:
        try:
            if url not in Config.audit_scope:
                m_output_links.append(url)
        except Exception, e:
            Logger.log_error_more_verbose(format_exc())
def check_download(self, url, name, content_length, content_type):

    # Only accept content when the content type header is present.
    if not content_type:
        Logger.log_more_verbose(
            "Skipping URL, missing content type: %s" % url)
        return False

    # Is the content length present?
    if content_length is not None:

        # Check the file doesn't have 0 bytes.
        if content_length <= 0:
            Logger.log_more_verbose(
                "Skipping URL, empty content: %s" % url)
            return False

        # Check the file is not too big.
        if content_type.strip().lower().startswith("text/"):
            if content_length > 100000:
                Logger.log_more_verbose(
                    "Skipping URL, content too large (%d bytes): %s"
                    % (content_length, url))
                return False
        else:
            if content_length > 5000000:
                Logger.log_more_verbose(
                    "Skipping URL, content too large (%d bytes): %s"
                    % (content_length, url))
                return False

        # Approved!
        return True

    # Content length absent but likely points to a directory index.
    parsed_url = parse_url(url)
    if not parsed_url.filename:
        # Approved!
        return True

    # Extension absent.
    if not parsed_url.extension:
        # Approved!
        return True

    # Match against a known list of valid HTML extensions.
    # See: http://en.wikipedia.org/wiki/List_of_file_formats#Webpage
    if parsed_url.extension in (
            ".xml", ".html", ".htm", ".xhtml", ".xht",
            ".mht", ".mhtml", ".maff", ".asp", ".aspx", ".bml",
            ".cfm", ".cgi", ".ihtml", ".jsp", ".las", ".lasso",
            ".lassoapp", ".pl", ".php", ".php3", ".phtml",
            ".rna", ".r", ".rnx", ".shtml", ".stm", ".atom",
            ".eml", ".jsonld", ".metalink", ".met", ".rss",
            ".markdown"):
        # Approved!
        return True

    # Is the URL path in the blacklist?
    m_forbidden = [x for x in WordListLoader.get_wordlist(
        Config.plugin_config["wordlist_no_spider"])]
    if any(x in url for x in m_forbidden):
        return False

    # Success!
    return True
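# ---------------------------------------------------------------------------
# Hedged spot checks for check_download(), assuming 'spider' is an instance
# of the plugin and that parse_url() reports an empty filename for URLs that
# end in a slash (all values invented for illustration):
#
#   spider.check_download("http://example.com/a.html",   "a.html",   8192,   "text/html")  # True:  small text file
#   spider.check_download("http://example.com/big.html", "big.html", 200000, "text/html")  # False: text body over 100000 bytes
#   spider.check_download("http://example.com/a.html",   "a.html",   8192,   None)         # False: no Content-Type header
#   spider.check_download("http://example.com/docs/",    "",         None,   "text/html")  # True:  directory index, length unknown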
class Spider(TestingPlugin):
    """
    This plugin is a web spider.
    """

    #----------------------------------------------------------------------
    def get_accepted_info(self):
        return [Url]

    #----------------------------------------------------------------------
    def recv_info(self, info):
        m_return = []

        m_url = info.url
        m_depth = info.depth

        # Check depth
        if Config.audit_config.depth is not None and m_depth > Config.audit_config.depth:
            Logger.log_more_verbose("Spider depth level exceeded for URL: %s" % m_url)
            return m_return

        Logger.log_verbose("Spidering URL: %r" % m_url)

        # Check whether we need to follow the first redirect.
        p = None
        try:
            allow_redirects = Config.audit_config.follow_redirects or \
                              (m_depth == 0 and Config.audit_config.follow_first_redirect)
            p = download(m_url, self.check_download, allow_redirects=allow_redirects)
        except NetworkException, e:
            Logger.log_more_verbose("Error while processing %r: %s" % (m_url, str(e)))

        if not p:
            return m_return

        # Send back the data
        m_return.append(p)

        # TODO: If it's a 301 response, get the Location header

        # Get links
        if p.information_type == Information.INFORMATION_HTML:
            m_links = extract_from_html(p.raw_data, m_url)
        else:
            m_links = extract_from_text(p.raw_data, m_url)
        try:
            m_links.remove(m_url)
        except Exception:
            pass

        # Do not follow URLs that contain certain keywords
        m_forbidden = WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"])
        m_urls_allowed = [
            url for url in m_links
            if not any(x in url for x in m_forbidden)
        ]
        m_urls_not_allowed = m_links.difference(m_urls_allowed)
        if m_urls_not_allowed:
            Logger.log_more_verbose("Skipped forbidden URLs:\n %s"
                                    % "\n ".join(sorted(m_urls_not_allowed)))

        # Do not follow URLs out of scope
        m_out_of_scope_count = len(m_urls_allowed)
        m_urls_allowed = [
            url for url in m_urls_allowed
            if url in Config.audit_scope
        ]
        m_out_of_scope_count -= len(m_urls_allowed)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d links out of scope." % m_out_of_scope_count)

        if m_urls_allowed:
            Logger.log_verbose("Found %d links in URL: %s" % (len(m_urls_allowed), m_url))
        else:
            Logger.log_verbose("No links found in URL: %s" % m_url)

        # Convert to Url data type
        for u in m_urls_allowed:
            m_resource = Url(url=u, depth=m_depth + 1, referer=m_url)
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Send the results
        return m_return
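# ---------------------------------------------------------------------------
# NOTE: extract_from_html() / extract_from_text() are framework helpers not
# shown in these snippets. As a rough, illustrative stand-in (not the
# framework's implementation), the HTML variant can be thought of as
# collecting href/src attributes and resolving them against the base URL,
# roughly like this Python 2 sketch; the real helper covers many more cases
# (forms, CSS, meta refresh, malformed markup, etc.).

from HTMLParser import HTMLParser   # html.parser in Python 3
from urlparse import urljoin        # urllib.parse in Python 3

class _LinkExtractor(HTMLParser):
    def __init__(self, base_url):
        HTMLParser.__init__(self)
        self.base_url = base_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # Collect every href/src attribute and make it absolute.
        for name, value in attrs:
            if name in ("href", "src") and value:
                self.links.add(urljoin(self.base_url, value))

def extract_from_html(raw_html, base_url):
    parser = _LinkExtractor(base_url)
    parser.feed(raw_html)
    return parser.links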
class Spider(TestingPlugin):
    """
    This plugin is a web spider.
    """

    #--------------------------------------------------------------------------
    def get_accepted_types(self):
        return [URL]

    #--------------------------------------------------------------------------
    def run(self, info):

        m_return = []

        m_url = info.url

        Logger.log_verbose("Spidering URL: %s" % m_url)

        # Check whether we need to follow the first redirect, then follow the link.
        p = None
        try:
            allow_redirects = Config.audit_config.follow_redirects or \
                              (info.depth == 0 and Config.audit_config.follow_first_redirect)
            p = download(m_url, self.check_download, allow_redirects=allow_redirects)
        except NetworkException, e:
            Logger.log_error_verbose("Error while processing %r: %s" % (m_url, str(e)))

        if not p:
            return m_return

        # Send back the data
        m_return.append(p)

        # TODO: If it's a 301 response, get the Location header

        # Get links
        m_forms = None
        if p.information_type == HTML.data_subtype:
            m_links = extract_from_html(p.raw_data, m_url)
            m_forms = extract_forms_from_html(p.raw_data, m_url)
            #m_links.update( extract_from_text(p.raw_data, m_url) )
        elif p.information_type == Text.data_subtype:
            m_links = extract_from_text(p.raw_data, m_url)
        else:
            return m_return
        try:
            m_links.remove(m_url)
        except Exception:
            pass

        # Do not follow URLs that contain certain keywords
        m_forbidden = [
            x for x in WordListLoader.get_wordlist(
                Config.plugin_config["wordlist_no_spider"])
        ]
        m_urls_allowed = [
            url for url in m_links
            if not any(x in url for x in m_forbidden)
        ]
        m_urls_not_allowed = m_links.difference(m_urls_allowed)
        if m_urls_not_allowed:
            Logger.log_more_verbose("Skipped forbidden URLs:\n %s"
                                    % "\n ".join(sorted(m_urls_not_allowed)))

        # Do not follow URLs out of scope
        m_urls_in_scope = []
        m_broken = []
        for url in m_urls_allowed:
            try:
                if url in Config.audit_scope:
                    m_urls_in_scope.append(url)
            except Exception:
                m_broken.append(url)
        if m_broken:
            if len(m_broken) == 1:
                Logger.log_more_verbose("Skipped uncrawlable URL: %s" % m_broken[0])
            else:
                Logger.log_more_verbose("Skipped uncrawlable URLs:\n %s"
                                        % "\n ".join(sorted(m_broken)))
        m_out_of_scope_count = len(m_urls_allowed) - len(m_urls_in_scope) - len(m_broken)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d links out of scope." % m_out_of_scope_count)

        if m_urls_in_scope:
            Logger.log_verbose("Found %d links in URL: %s" % (len(m_urls_in_scope), m_url))
        else:
            Logger.log_more_verbose("No links found in URL: %s" % m_url)

        # Convert to URL data type
        for u in m_urls_in_scope:
            try:
                p = parse_url(u)
                if p.scheme == "mailto":
                    m_resource = Email(p.netloc)
                elif p.scheme in ("http", "https"):
                    m_resource = URL(url=u, referer=m_url)
                else:
                    continue  # skip unsupported schemes
            except Exception:
                warn(format_exc(), RuntimeWarning)
                continue
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Get forms info
        if m_forms:
            m_forms_allowed = [
                url for url in m_forms
                if not any(x in url[0] for x in m_forbidden)
            ]
            m_forms_not_allowed = {x[0] for x in m_forms}.difference(x[0] for x in m_forms_allowed)
        else:
            m_forms_allowed = []
            m_forms_not_allowed = set()

        if m_forms_not_allowed:
            Logger.log_more_verbose("Skipped forbidden forms:\n %s"
                                    % "\n ".join(sorted(m_forms_not_allowed)))

        # Do not follow forms out of scope
        m_forms_in_scope = []
        m_broken = []
        for url in m_forms_allowed:
            try:
                if url[0] in Config.audit_scope:
                    m_forms_in_scope.append(url)
            except Exception:
                m_broken.append(url[0])
        if m_broken:
            if len(m_broken) == 1:
                Logger.log_more_verbose("Skipped uncrawlable form: %s" % m_broken[0])
            else:
                Logger.log_more_verbose("Skipped uncrawlable forms:\n %s"
                                        % "\n ".join(sorted(m_broken)))
        m_out_of_scope_count = len(m_forms_allowed) - len(m_forms_in_scope) - len(m_broken)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d forms out of scope." % m_out_of_scope_count)

        if m_forms_in_scope:
            Logger.log_verbose("Found %d forms in URL: %s" % (len(m_forms_in_scope), m_url))
        else:
            Logger.log_more_verbose("No forms found in URL: %s" % m_url)

        # Convert to URL data type
        for u in m_forms_in_scope:
            try:
                url = u[0]
                method = u[1]
                params = {x["name"]: x["value"] for x in u[2]}
                m_resource = URL(url=url, referer=m_url, method=method, post_params=params)
            except Exception:
                warn(format_exc(), RuntimeWarning)
                continue
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Send the results
        return m_return