def test_simplest(self):
    """MultiRE finds plain literal patterns and reports which one hit."""
    patterns = ['1234', '4567', '7890']
    mre = MultiRE(patterns)

    # Each queried literal must produce exactly one match, and the
    # reported pattern (index 1 of the match tuple) must be the literal.
    for query_string in ('4567', '7890'):
        matches = to_list(mre.query(query_string))
        self.assertEqual(1, len(matches))
        self.assertEqual(query_string, matches[0][1])
def test_unicode_re(self):
    """Unicode patterns only match when the complete pattern appears."""
    mre = MultiRE([u'ñandú', u'ýandex'])

    # A prefix of the pattern's bytes is not enough to match
    miss = to_list(mre.query('abcn'))
    self.assertEqual(0, len(miss))
    self.assertEqual([], miss)

    # The full unicode word embedded in a larger string does match
    hit = to_list(mre.query('123 ñandú 345'))
    self.assertEqual(1, len(hit))
    self.assertEqual('ñandú', hit[0][1])
def test_unicode_query(self):
    """Non-ASCII / NUL bytes in the query string do not break matching."""
    patterns = [u'abc321', u'def123']
    mre = MultiRE(patterns)

    # Trailing non-ASCII byte after the pattern
    single = to_list(mre.query('abc321ñ'))
    self.assertEqual(1, len(single))
    self.assertEqual('abc321', single[0][1])

    # A NUL byte separating two patterns: both must be reported
    both = to_list(mre.query('abc321\x00def123'))
    self.assertEqual(2, len(both))
    self.assertEqual(set(patterns), set(item[1] for item in both))
def test_re(self):
    """Patterns containing wildcards behave as real regular expressions."""
    mre = MultiRE(['1234.*56', 'ab.*cdef'])

    # Neither pattern can match this short string
    miss = to_list(mre.query('456'))
    self.assertEqual(0, len(miss))
    self.assertEqual([], miss)

    # Each query matches exactly one of the two wildcard patterns
    for query_string, expected_re in (('1234a56', '1234.*56'),
                                      ('abAAAcdef', 'ab.*cdef')):
        hit = to_list(mre.query(query_string))
        self.assertEqual(1, len(hit))
        self.assertEqual(expected_re, hit[0][1])
def test_re_with_obj(self):
    """Objects attached to a pattern tuple travel with the match result."""
    mre = MultiRE([('1234.*56', None, None), ('ab.*cdef', 1, 2)])

    # (query, expected pattern, expected attached objects at [3] and [4])
    cases = [('1234A56', '1234.*56', None, None),
             ('abAAAcdef', 'ab.*cdef', 1, 2)]

    for query_string, expected_re, first_obj, second_obj in cases:
        hit = to_list(mre.query(query_string))
        self.assertEqual(1, len(hit))
        self.assertEqual(expected_re, hit[0][1])
        self.assertEqual(first_obj, hit[0][3])
        self.assertEqual(second_obj, hit[0][4])
def test_re_flags(self):
    """Flags passed to MultiRE (here IGNORECASE) apply to every pattern."""
    mre = MultiRE(['12.*3456', 'ab.*cdef'], re.IGNORECASE)

    # Mixed-case input only matches because of re.IGNORECASE
    hit = to_list(mre.query('AB3Cdef'))
    self.assertEqual(1, len(hit))
    self.assertEqual('ab.*cdef', hit[0][1])
def test_special_char(self):
    """Non-printable bytes inside a pattern are matched literally."""
    mre = MultiRE([u'\x00\x01\x02\x03'])

    hit = to_list(mre.query('abc\x00\x01\x02\x03def'))
    self.assertEqual(1, len(hit))
    self.assertEqual('\x00\x01\x02\x03', hit[0][1])
class find_backdoors(CrawlPlugin): """ Find web backdoors and web shells. :author: Andres Riancho ([email protected]) """ WEBSHELL_DB = os.path.join(CRAWL_PATH, 'find_backdoors', 'web_shells.txt') SIGNATURE_DB = os.path.join(CRAWL_PATH, 'find_backdoors', 'signatures.txt') def __init__(self): CrawlPlugin.__init__(self) # Internal variables self._analyzed_dirs = ScalableBloomFilter() self._signature_re = None def setup(self): with self._plugin_lock: if self._signature_re is not None: return signatures = self._read_signatures() self._signature_re = MultiRE(signatures, hint_len=2) def _read_signatures(self): for line in file(self.SIGNATURE_DB): line = line.strip() if not line: continue if line.startswith('#'): continue yield (line, 'Backdoor signature') def crawl(self, fuzzable_request): """ For every directory, fetch a list of shell files and analyze the response. :param fuzzable_request: A fuzzable_request instance that contains (among other things) the URL to test. """ domain_path = fuzzable_request.get_url().get_domain_path() if domain_path not in self._analyzed_dirs: self._analyzed_dirs.add(domain_path) self.setup() # Read the web shell database web_shells = self._iter_web_shells() # Send the requests using threads: args_iter = (domain_path.url_join(fname) for fname in web_shells) self.worker_pool.map(self._check_if_exists, args_iter) def _iter_web_shells(self): """ :yield: lines from the web shell DB """ for line in file(self.WEBSHELL_DB): line = line.strip() if line.startswith('#'): continue if not line: continue yield line def _check_if_exists(self, web_shell_url): """ Check if the file exists. 
:param web_shell_url: The URL to check """ try: response = self._uri_opener.GET(web_shell_url, cache=True) except BaseFrameworkException: om.out.debug('Failed to GET webshell:' + web_shell_url) else: signature = self._match_signature(response) if signature is None: return desc = (u'An HTTP response matching the web backdoor signature' u' "%s" was found at: "%s"; this could indicate that the' u' server has been compromised.') desc %= (signature, response.get_url()) # It's probability is higher if we found a long signature _severity = severity.HIGH if len(signature) > 8 else severity.MEDIUM v = Vuln(u'Potential web backdoor', desc, _severity, response.id, self.get_name()) v.set_url(response.get_url()) kb.kb.append(self, 'backdoors', v) om.out.vulnerability(v.get_desc(), severity=v.get_severity()) fr = FuzzableRequest.from_http_response(response) self.output_queue.put(fr) def _match_signature(self, response): """ Heuristic to infer if the content of <response> has the pattern of a backdoor response. :param response: HTTPResponse object :return: A bool value """ body_text = response.get_body() for match, _, _, _ in self._signature_re.query(body_text): match_string = match.group(0) return match_string return None def get_long_desc(self): """ :return: A DETAILED description of the plugin functions and features. """ return """
def test_short(self):
    """Patterns shorter than the default hint length still match."""
    mre = MultiRE(['12.?34'])
    self.assertEqual(1, len(to_list(mre.query('12X34'))))
def test_dup(self):
    """A pattern occurring twice in the query is reported only once."""
    mre = MultiRE(['1234', '4567'])
    self.assertEqual(1, len(to_list(mre.query('4567 4567'))))
class path_disclosure(GrepPlugin):
    """
    Grep every page for traces of path disclosure vulnerabilities.

    :author: Andres Riancho ([email protected])
    """
    def __init__(self):
        GrepPlugin.__init__(self)

        # Internal variables
        # _reported: disk-backed list of (url, match) pairs already reported,
        # used both for de-duplication and for longest-match filtering
        self._reported = DiskList(table_prefix='path_disclosure')
        self._signature_re = None

    def setup(self):
        """
        Lazily build the MultiRE of path disclosure signatures.

        :return: None, the result is saved in self._path_disc_regex_list
        """
        if self._signature_re is not None:
            return

        all_signatures = []

        for path_disclosure_string in get_common_directories():
            # Capture the common directory plus anything after it, up to a
            # character that can not be part of a filesystem path
            regex_string = '(%s.*?)[^A-Za-z0-9\._\-\\/\+~]'
            regex_string = regex_string % path_disclosure_string
            all_signatures.append(regex_string)

        self._signature_re = MultiRE(all_signatures, hint_len=1)

    def grep(self, request, response):
        """
        Identify the path disclosure vulnerabilities.

        :param request: The HTTP request object.
        :param response: The HTTP response object
        :return: None, the result is saved in the kb.
        """
        if not response.is_text_or_html():
            return

        self.setup()

        # Only recompute the webroot/file list when something new was found
        if self.find_path_disclosure(request, response):
            self._update_kb_path_list()

    def find_path_disclosure(self, request, response):
        """
        Actually find the path disclosure vulnerabilities

        :return: The Vuln instance when a disclosure was reported,
                 None otherwise (falsy, checked by grep()).
        """
        match_list = []
        body_text = response.get_body()
        real_url = response.get_url().url_decode()

        for match, _, _ in self._signature_re.query(body_text):
            match_list.append(match.group(1))

        # Sort by the longest match, this is needed for filtering out
        # some false positives please read the note below.
        match_list.sort(longest_cmp)

        for match in match_list:
            # Avoid duplicated reports
            if (real_url, match) in self._reported:
                continue

            # Remove false positives
            if self._is_false_positive(match, request, response):
                continue

            # Found!
            self._reported.append((real_url, match))

            desc = ('The URL: "%s" has a path disclosure vulnerability which'
                    ' discloses "%s".')
            desc %= (response.get_url(), match)

            v = Vuln('Path disclosure vulnerability', desc, severity.LOW,
                     response.id, self.get_name())
            v.add_to_highlight(match)
            v.set_url(real_url)
            v['path'] = match

            self.kb_append(self, 'path_disclosure', v)
            # Report only the first (longest, see sort above) valid match
            return v

    def _is_false_positive(self, match, request, response):
        """
        :return: True if the match is a false positive
        """
        # This if is to avoid false positives
        if request.sent(match):
            return True

        # https://github.com/andresriancho/w3af/issues/6640
        url_list = kb.kb.get_all_known_urls()
        for url in url_list:
            path_and_file = url.get_path()

            if match == path_and_file:
                return True

        # There is a rare bug also, which is triggered in cases like this one:
        #
        #   >>> import re
        #
        #   >>> re.findall('/var/www/.*','/var/www/foobar/htdocs/article.php')
        #   ['/var/www/foobar/htdocs/article.php']
        #
        #   >>> re.findall('/htdocs/.*','/var/www/foobar/htdocs/article.php')
        #   ['/htdocs/article.php']
        #
        # What I need to do here, is to keep the longest match.
        for real_url_reported, match_reported in self._reported:
            if match_reported.endswith(match):
                return True

        # Check if the match we got is part of a tag attribute value
        #
        # This part of the function is the one that consumes the most CPU usage
        # thus we run it last, hoping that at least one of the methods we
        # implemented above tags this match as a false positive and we don't
        # have to run the expensive method
        if self._is_attr_value(match, response):
            return True

        return False

    def _is_attr_value(self, path_disclosure_string, response):
        """
        This method was created to remove some false positives.

        This method consumes 99% of the CPU usage of the plugin, but there are
        only a few improvements that come to mind:

            * Run the code that checks if the value is in the attributes
              in the subprocess. The performance of this plugin will be
              slightly improved.

            * Before calling the document parser check at least it looks like
              the path_disclosure_string is part of an attribute value using
              a regular expression such as:

                </?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[\^'">\s]+))?)+\s*|\s*)/?>

              (I just need to add the path_disclosure_string somewhere there)

              At some point I was using a similar approach [0] but it seems
              that it was slow? (I doubt that it will be slower than parsing
              the response with lxml).

              Something that could be done, and given that we know that this
              is an HTML string is:

                - Find all places in the response where
                  path_disclosure_string appears

                - Create 'HTTP response snippets' with the locations of
                  path_disclosure_string +/- 500 strings.

                - Apply the regular expression over those strings only,
                  avoiding the cost of applying the regex to the whole HTML
                  response

        [0] https://github.com/andresriancho/w3af/commit/f1029328fcaf7e790cc317701b63954c55a3f4c8
        [1] https://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/

        :return: True if path_disclosure_string is the value of an attribute
                 inside a tag.

                 Examples:
                     path_disclosure_string = '/home/image.png'
                     response_body = '....<img src="/home/image.png">...'
                     return: True

                     path_disclosure_string = '/home/image.png'
                     response_body = '...<b>Error while checking /home/image.png</b>...'
                     return: False
        """
        for tag in mp_doc_parser.get_tags_by_filter(response, None):
            for value in tag.attrib.itervalues():
                if path_disclosure_string in value:
                    return True

        return False

    def _update_kb_path_list(self):
        """
        If a path disclosure was found, I can create a list of full paths to
        all URLs ever visited. This method updates that list.
        """
        path_disc_vulns = kb.kb.get('path_disclosure', 'path_disclosure')
        url_list = kb.kb.get_all_known_urls()

        # Now I find the longest match between one of the URLs that w3af has
        # discovered, and one of the path disclosure strings that this plugin
        # has found. I use the longest match because with small match_list I
        # have more probability of making a mistake.
        longest_match = ''
        longest_path_disc_vuln = None

        for path_disc_vuln in path_disc_vulns:
            for url in url_list:
                path_and_file = url.get_path()

                if path_disc_vuln['path'].endswith(path_and_file):
                    if len(longest_match) < len(path_and_file):
                        longest_match = path_and_file
                        longest_path_disc_vuln = path_disc_vuln

        # Now I recalculate the place where all the resources are in disk, all
        # this is done taking the longest_match as a reference, so... if we
        # don't have a longest_match, then nothing is actually done
        if not longest_match:
            return

        # Get the webroot
        webroot = longest_path_disc_vuln['path'].replace(longest_match, '')

        #
        # This if fixes a strange case reported by Olle
        #       if webroot[0] == '/':
        #       IndexError: string index out of range
        # That seems to be because the webroot == ''
        #
        if not webroot:
            return

        # Check what path separator we should use (linux / windows)
        path_sep = '/' if webroot.startswith('/') else '\\'

        # Create the remote locations
        remote_locations = []
        for url in url_list:
            remote_path = url.get_path().replace('/', path_sep)
            remote_locations.append(webroot + remote_path)
        remote_locations = list(set(remote_locations))

        kb.kb.raw_write(self, 'list_files', remote_locations)
        kb.kb.raw_write(self, 'webroot', webroot)

    def end(self):
        # Release the disk-backed storage used for de-duplication
        self._reported.cleanup()

    def get_long_desc(self):
        """
        :return: A DETAILED description of the plugin functions and features.
        """
        return """
class find_backdoors(CrawlPlugin):
    """
    Find web backdoors and web shells.

    :author: Andres Riancho ([email protected])
    """
    # Databases shipped with the plugin: known web shell filenames and
    # response-body signatures that identify a backdoor
    WEBSHELL_DB = os.path.join(CRAWL_PATH, 'find_backdoors', 'web_shells.txt')
    SIGNATURE_DB = os.path.join(CRAWL_PATH, 'find_backdoors', 'signatures.txt')

    def __init__(self):
        CrawlPlugin.__init__(self)

        # Internal variables
        self._analyzed_dirs = ScalableBloomFilter()
        self._signature_re = None

    def setup(self):
        """
        Lazily compile the signature database into a MultiRE; the plugin
        lock prevents two worker threads from compiling it concurrently.
        """
        with self._plugin_lock:
            if self._signature_re is not None:
                return

            signatures = self._read_signatures()
            self._signature_re = MultiRE(signatures, hint_len=2)

    def _read_signatures(self):
        """
        :yield: (signature, description) tuples read from SIGNATURE_DB,
                skipping blank lines and '#' comments.
        """
        for line in file(self.SIGNATURE_DB):
            line = line.strip()

            if not line:
                continue

            if line.startswith('#'):
                continue

            yield (line, 'Backdoor signature')

    def crawl(self, fuzzable_request, debugging_id):
        """
        For every directory, fetch a list of shell files and analyze the
        response.

        :param debugging_id: A unique identifier for this call to discover()
        :param fuzzable_request: A fuzzable_request instance that contains
                                 (among other things) the URL to test.
        """
        domain_path = fuzzable_request.get_url().get_domain_path()

        # Only analyze each directory once
        if domain_path in self._analyzed_dirs:
            return

        self._analyzed_dirs.add(domain_path)

        self.setup()

        # Read the web shell database
        web_shells = self._iter_web_shells()

        # Send the requests using threads:
        args_iter = (domain_path.url_join(fname) for fname in web_shells)
        self.worker_pool.map(self._check_if_exists, args_iter)

    def _iter_web_shells(self):
        """
        :yield: lines from the web shell DB
        """
        for line in file(self.WEBSHELL_DB):
            line = line.strip()

            if line.startswith('#'):
                continue

            if not line:
                continue

            yield line

    def _check_if_exists(self, web_shell_url):
        """
        Check if the file exists.

        :param web_shell_url: The URL to check
        """
        try:
            response = self._uri_opener.GET(web_shell_url, cache=True)
        except BaseFrameworkException:
            om.out.debug('Failed to GET webshell: %s' % web_shell_url)
            return

        signature = self._match_signature(response)
        if signature is None:
            return

        desc = (u'An HTTP response matching the web backdoor signature'
                u' "%s" was found at: "%s"; this could indicate that the'
                u' server has been compromised.')
        desc %= (signature, response.get_url())

        # It's probability is higher if we found a long signature
        _severity = severity.HIGH if len(signature) > 8 else severity.MEDIUM

        v = Vuln(u'Potential web backdoor', desc, _severity,
                 response.id, self.get_name())
        v.set_url(response.get_url())

        kb.kb.append(self, 'backdoors', v)
        om.out.vulnerability(v.get_desc(), severity=v.get_severity())

        fr = FuzzableRequest.from_http_response(response)
        self.output_queue.put(fr)

    def _match_signature(self, response):
        """
        Heuristic to infer if the content of <response> has the pattern of a
        backdoor response.

        :param response: HTTPResponse object
        :return: The first matching signature string, or None when no
                 signature matched (not a bool, despite the original note).
        """
        body_text = response.get_body()

        # Stop at the first hit; one signature is enough evidence
        for match, _, _, _ in self._signature_re.query(body_text):
            match_string = match.group(0)
            return match_string

        return None

    def get_long_desc(self):
        """
        :return: A DETAILED description of the plugin functions and features.
        """
        return """
class path_disclosure(GrepPlugin):
    """
    Grep every page for traces of path disclosure vulnerabilities.

    :author: Andres Riancho ([email protected])
    """
    def __init__(self):
        GrepPlugin.__init__(self)

        # Internal variables
        # _reported: disk-backed list of (url, match) pairs already reported,
        # used both for de-duplication and for longest-match filtering
        self._reported = DiskList(table_prefix='path_disclosure')
        self._signature_re = None

    def setup(self):
        """
        Lazily build the MultiRE of path disclosure signatures.

        :return: None, the result is saved in self._path_disc_regex_list
        """
        if self._signature_re is not None:
            return

        all_signatures = []

        for path_disclosure_string in get_common_directories():
            # Capture the common directory plus anything after it, up to a
            # character that can not be part of a filesystem path
            regex_string = '(%s.*?)[^A-Za-z0-9\._\-\\/\+~]'
            regex_string = regex_string % path_disclosure_string
            all_signatures.append(regex_string)

        self._signature_re = MultiRE(all_signatures, hint_len=1)

    def grep(self, request, response):
        """
        Identify the path disclosure vulnerabilities.

        :param request: The HTTP request object.
        :param response: The HTTP response object
        :return: None, the result is saved in the kb.
        """
        if not response.is_text_or_html():
            return

        self.setup()

        # Only recompute the webroot/file list when something new was found
        if self.find_path_disclosure(request, response):
            self._update_kb_path_list()

    def find_path_disclosure(self, request, response):
        """
        Actually find the path disclosure vulnerabilities

        :return: The Vuln instance when a disclosure was reported,
                 None otherwise (falsy, checked by grep()).
        """
        body_text = response.get_body()

        match_list = []
        for match, _, _ in self._signature_re.query(body_text):
            match_list.append(match.group(1))

        # Sort by the longest match, this is needed for filtering out
        # some false positives please read the note below.
        match_list.sort(longest_cmp)

        real_url = response.get_url().url_decode()

        for match in match_list:
            # Avoid duplicated reports
            if (real_url, match) in self._reported:
                continue

            # Remove false positives
            if self._is_false_positive(match, request, response):
                continue

            # Found!
            self._reported.append((real_url, match))

            desc = ('The URL: "%s" has a path disclosure vulnerability which'
                    ' discloses "%s".')
            desc %= (response.get_url(), match)

            v = Vuln('Path disclosure vulnerability', desc, severity.LOW,
                     response.id, self.get_name())
            v.add_to_highlight(match)
            v.set_url(real_url)
            v['path'] = match

            self.kb_append(self, 'path_disclosure', v)
            # Report only the first (longest, see sort above) valid match
            return v

    def _is_false_positive(self, match, request, response):
        """
        :return: True if the match is a false positive
        """
        # This if is to avoid false positives
        if request.sent(match):
            return True

        if self._is_attr_value(match, response):
            return True

        # https://github.com/andresriancho/w3af/issues/6640
        url_list = kb.kb.get_all_known_urls()
        for url in url_list:
            path_and_file = url.get_path()
            if match == path_and_file:
                return True

        # There is a rare bug also, which is triggered in cases like this one:
        #
        # >>> import re
        # >>> re.findall('/var/www/.*','/var/www/foobar/htdocs/article.php')
        # ['/var/www/foobar/htdocs/article.php']
        # >>> re.findall('/htdocs/.*','/var/www/foobar/htdocs/article.php')
        # ['/htdocs/article.php']
        # >>>
        #
        # What I need to do here, is to keep the longest match.
        for real_url_reported, match_reported in self._reported:
            if match_reported.endswith(match):
                break
        else:
            # Note to self: I get here when "break" is NOT executed.
            # It's a new one, report!
            return False

        return True

    def _is_attr_value(self, path_disclosure_string, response):
        """
        This method was created to remove some false positives.

        :return: True if path_disclosure_string is the value of an attribute
                 inside a tag.

                 Examples:
                     path_disclosure_string = '/home/image.png'
                     response_body = '....<img src="/home/image.png">...'
                     return: True

                     path_disclosure_string = '/home/image.png'
                     response_body = '...<b>Error while checking /home/image.png</b>...'
                     return: False
        """
        for tag in mp_doc_parser.get_tags_by_filter(response, None):
            for value in tag.attrib.itervalues():
                if path_disclosure_string in value:
                    return True

        return False

    def _update_kb_path_list(self):
        """
        If a path disclosure was found, I can create a list of full paths to
        all URLs ever visited. This method updates that list.
        """
        path_disc_vulns = kb.kb.get('path_disclosure', 'path_disclosure')
        url_list = kb.kb.get_all_known_urls()

        # Now I find the longest match between one of the URLs that w3af has
        # discovered, and one of the path disclosure strings that this plugin
        # has found. I use the longest match because with small match_list I
        # have more probability of making a mistake.
        longest_match = ''
        longest_path_disc_vuln = None

        for path_disc_vuln in path_disc_vulns:
            for url in url_list:
                path_and_file = url.get_path()

                if path_disc_vuln['path'].endswith(path_and_file):
                    if len(longest_match) < len(path_and_file):
                        longest_match = path_and_file
                        longest_path_disc_vuln = path_disc_vuln

        # Now I recalculate the place where all the resources are in disk, all
        # this is done taking the longest_match as a reference, so... if we
        # don't have a longest_match, then nothing is actually done
        if not longest_match:
            return

        # Get the webroot
        webroot = longest_path_disc_vuln['path'].replace(longest_match, '')

        #
        # This if fixes a strange case reported by Olle
        #       if webroot[0] == '/':
        #       IndexError: string index out of range
        # That seems to be because the webroot == ''
        #
        if not webroot:
            return

        # Check what path separator we should use (linux / windows)
        path_sep = '/' if webroot.startswith('/') else '\\'

        # Create the remote locations
        remote_locations = []
        for url in url_list:
            remote_path = url.get_path().replace('/', path_sep)
            remote_locations.append(webroot + remote_path)
        remote_locations = list(set(remote_locations))

        kb.kb.raw_write(self, 'list_files', remote_locations)
        kb.kb.raw_write(self, 'webroot', webroot)

    def end(self):
        # Release the disk-backed storage used for de-duplication
        self._reported.cleanup()

    def get_long_desc(self):
        """
        :return: A DETAILED description of the plugin functions and features.
        """
        return """