def _extract_links_and_verify(self, resp, fuzzable_req):
    '''
    Extract links from the response, queue each of them for asynchronous
    verification and wait for the workers to finish.
    '''
    #
    # Note: I WANT to follow links that are in the 404 page.
    #
    # Modified when I added the pdfParser
    # I had to add this x OR y stuff, just because I don't want
    # the SGML parser to analyze an image file, it's useless and
    # consumes CPU power.
    if resp.is_text_or_html() or resp.is_pdf() or resp.is_swf():
        originalURL = resp.getRedirURI()

        try:
            doc_parser = dpCache.dpc.getDocumentParserFor(resp)
        except w3afException as w3:
            om.out.debug('Failed to find a suitable document parser. '
                         'Exception "%s"' % w3)
        else:
            # Note:
            # - With parsed_refs I'm 100% sure that it's really
            #   something in the HTML that the developer intended to add.
            #
            # - The re_refs are the result of regular expressions,
            #   which in some cases are just false positives.
            parsed_refs, re_refs = doc_parser.getReferences()

            # I also want to analyze all directories, if the URL I just
            # fetched is:
            # http://localhost/a/b/c/f00.php I want to GET:
            # http://localhost/a/b/c/
            # http://localhost/a/b/
            # http://localhost/a/
            # http://localhost/
            # And analyze the responses...
            dirs = resp.getURL().getDirectories()
            only_re_refs = set(re_refs) - set(dirs + parsed_refs)

            all_refs = itertools.chain(dirs, parsed_refs, re_refs)

            for ref in unique_justseen(sorted(all_refs)):

                # I don't want w3af sending requests to 3rd parties!
                if ref.getDomain() != self._target_domain:
                    continue

                # Filter the URLs according to the configured regexes
                urlstr = ref.url_string
                if not self._compiled_follow_re.match(urlstr) or \
                        self._compiled_ignore_re.match(urlstr):
                    continue

                # Work with the parsed references and report broken
                # links. Then work with the regex references and DO NOT
                # report broken links
                if self._need_more_variants(ref):
                    self._known_variants.append(ref)
                    possibly_broken = ref in only_re_refs
                    args = (ref, fuzzable_req, originalURL,
                            possibly_broken)
                    self._run_async(meth=self._verify_reference,
                                    args=args)

    self._join()
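# The de-duplication above depends on unique_justseen(). A minimal sketch,
# assuming it follows the standard itertools recipe of the same name (the
# helper w3af actually imports may live elsewhere and differ in details).
# It only remembers the element just seen, which is enough here because
# the references are sorted before being passed in.
from itertools import groupby, imap
from operator import itemgetter

def unique_justseen(iterable, key=None):
    # unique_justseen('AAAABBBCCDAABBB') --> A B C D A B
    return imap(next, imap(itemgetter(1), groupby(iterable, key)))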
def end(self): ''' Called when the process ends, prints out the list of broken links. ''' if len(self._broken_links): om.out.information('The following is a list of broken links that ' 'were found by the web_spider plugin:') for broken, where in unique_justseen( self._broken_links.ordered_iter()): om.out.information('- %s [ referenced from: %s ]' % (broken, where)) self._broken_links.cleanup()
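# end() assumes a broken-links container that exposes ordered_iter() and
# cleanup(). A minimal in-memory stand-in written under those assumptions;
# the container w3af actually uses is a framework-specific structure, so
# this class is purely illustrative:
class BrokenLinksStandIn(object):

    def __init__(self):
        self._links = []

    def append(self, broken_url, referer):
        # Store (broken, where) pairs, matching the unpacking in end()
        self._links.append((broken_url, referer))

    def __len__(self):
        return len(self._links)

    def ordered_iter(self):
        # Sorted so that unique_justseen() can collapse duplicate
        # entries that end up next to each other
        return iter(sorted(self._links))

    def cleanup(self):
        del self._links[:]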
def _urls_to_verify_generator(self, resp, fuzzable_req):
    '''
    :param resp: HTTP response object
    :param fuzzable_req: The HTTP request that generated the response
    '''
    #
    # Note: I WANT to follow links that are in the 404 page.
    #
    # Modified when I added the PDFParser
    # I had to add this x OR y stuff, just because I don't want
    # the SGML parser to analyze an image file, it's useless and
    # consumes CPU power.
    if resp.is_text_or_html() or resp.is_pdf() or resp.is_swf():
        original_url = resp.get_redir_uri()

        try:
            doc_parser = parser_cache.dpc.get_document_parser_for(resp)
        except w3afException as w3:
            om.out.debug('Failed to find a suitable document parser. '
                         'Exception "%s"' % w3)
        else:
            # Note:
            # - With parsed_refs I'm 100% sure that it's really
            #   something in the HTML that the developer intended to add.
            #
            # - The re_refs are the result of regular expressions,
            #   which in some cases are just false positives.
            parsed_refs, re_refs = doc_parser.get_references()

            # I also want to analyze all directories, if the URL I just
            # fetched is:
            # http://localhost/a/b/c/f00.php I want to GET:
            # http://localhost/a/b/c/
            # http://localhost/a/b/
            # http://localhost/a/
            # http://localhost/
            # And analyze the responses...
            dirs = resp.get_url().get_directories()
            only_re_refs = set(re_refs) - set(dirs + parsed_refs)

            all_refs = itertools.chain(dirs, parsed_refs, re_refs)

            for ref in unique_justseen(sorted(all_refs)):

                # I don't want w3af sending requests to 3rd parties!
                if ref.get_domain() != self._target_domain:
                    continue

                # Filter the URLs according to the configured regexes
                urlstr = ref.url_string
                if not self._compiled_follow_re.match(urlstr) or \
                        self._compiled_ignore_re.match(urlstr):
                    continue

                if self._only_forward:
                    if not self._is_forward(ref):
                        continue

                # Work with the parsed references and report broken
                # links. Then work with the regex references and DO NOT
                # report broken links
                if self._need_more_variants(ref):
                    self._known_variants.append(ref)
                    possibly_broken = ref in only_re_refs
                    yield ref, fuzzable_req, original_url, possibly_broken
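# The directory expansion described in the comments above comes from
# resp.get_url().get_directories(). A rough standalone sketch of that
# behavior, assuming plain string URLs; the real w3af URL objects carry
# extra state and comparison logic:
from urlparse import urlparse, urlunparse

def get_directories(url):
    # For http://localhost/a/b/c/f00.php this yields:
    # http://localhost/a/b/c/, http://localhost/a/b/,
    # http://localhost/a/ and http://localhost/
    parsed = urlparse(url)
    # Strip the file name, keep the path segments
    segments = [s for s in parsed.path.rsplit('/', 1)[0].split('/') if s]
    while True:
        dir_path = '/' + '/'.join(segments)
        if segments:
            dir_path += '/'
        yield urlunparse((parsed.scheme, parsed.netloc, dir_path,
                          '', '', ''))
        if not segments:
            break
        segments.pop()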