Exemplo n.º 1
0
class web_spider(CrawlPlugin):
    '''
    Crawl the web application.

    crawl() sends the request it receives, queues the fuzzable requests
    created from the response and hands the response over to
    _extract_links_and_verify(). _urls_to_verify_generator() yields the
    references found in a response that pass all the configured filters.

    :author: Andres Riancho ([email protected])
    '''
    NOT_404 = set([http_constants.UNAUTHORIZED, http_constants.FORBIDDEN])

    def __init__(self):
        CrawlPlugin.__init__(self)

        # Internal variables
        self._compiled_ignore_re = None
        self._compiled_follow_re = None
        self._broken_links = DiskSet()
        self._first_run = True
        self._known_variants = VariantDB()
        self._already_filled_form = ScalableBloomFilter()

        # Set by crawl() once a target is available. They are defined here
        # so that the attributes always exist: with _target_domain set to
        # None the domain filter in _urls_to_verify_generator() simply
        # rejects every reference instead of raising AttributeError.
        self._target_urls = []
        self._target_domain = None

        # User configured variables
        self._ignore_regex = ''
        self._follow_regex = '.*'
        self._only_forward = False
        self._compile_re()

    def crawl(self, fuzzable_req):
        '''
        Searches for links on the html.

        :param fuzzable_req: A fuzzable_req instance that contains
                             (among other things) the URL to test.
        :return: None. The fuzzable requests created from the response are
                 put in self.output_queue.
        '''
        if self._first_run:
            # I have to set some variables, in order to be able to code
            # the "only_forward" feature
            #
            # The targets are read only ONCE and checked before being used:
            # reading cf.cf.get('targets')[0] directly triggered lots of
            # bugs when the "stop" button was pressed and the core did
            # this: "cf.cf.save('targets', [])"
            targets = cf.cf.get('targets')
            if not targets:
                # Nothing to crawl. _first_run is left untouched so that
                # the setup is retried on the next call instead of running
                # the rest of the plugin without a target domain.
                return

            self._target_urls = [i.get_domain_path() for i in targets]
            self._target_domain = targets[0].get_domain()

            # Only mark the setup as done once it really succeeded
            self._first_run = False

        #
        # If it is a form, then smart_fill the parameters to send something that
        # makes sense and will allow us to cover more code.
        #
        if isinstance(fuzzable_req, HTTPPostDataRequest):

            # Forms whose URL is already in the filter are not sent again
            if fuzzable_req.get_url() in self._already_filled_form:
                return

            fuzzable_req = self._fill_form(fuzzable_req)

        # Send the HTTP request,
        resp = self._uri_opener.send_mutant(fuzzable_req, follow_redir=False)

        # Nothing to do here... (401 is the HTTP "Unauthorized" status code)
        if resp.get_code() == 401:
            return

        fuzz_req_list = self._create_fuzzable_requests(resp,
                                                       request=fuzzable_req,
                                                       add_self=False)

        for fr in fuzz_req_list:
            self.output_queue.put(fr)

        self._extract_links_and_verify(resp, fuzzable_req)

    def _urls_to_verify_generator(self, resp, fuzzable_req):
        '''
        Generate the references found in resp that are worth requesting.

        Nothing is yielded when the response is not text/html, PDF or SWF,
        or when no document parser can be found for it.

        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
        :return: Yields (ref, fuzzable_req, original_url, possibly_broken)
                 tuples, where original_url is resp.get_redir_uri() and
                 possibly_broken is True when the reference was found only
                 by the regular expressions (not by the parser and not as a
                 directory of the response URL).
        '''
        #
        # Note: I WANT to follow links that are in the 404 page.
        #

        # Modified when I added the PDFParser
        # I had to add this x OR y stuff, just because I don't want
        # the SGML parser to analyze a image file, its useless and
        # consumes CPU power.
        if resp.is_text_or_html() or resp.is_pdf() or resp.is_swf():
            original_url = resp.get_redir_uri()
            try:
                doc_parser = parser_cache.dpc.get_document_parser_for(resp)
            except w3afException as w3:
                om.out.debug('Failed to find a suitable document parser. '
                             'Exception "%s"' % w3)
            else:
                # Note:
                # - With parsed_refs I'm 100% that it's really
                # something in the HTML that the developer intended to add.
                #
                # - The re_refs are the result of regular expressions,
                # which in some cases are just false positives.

                parsed_refs, re_refs = doc_parser.get_references()

                # I also want to analyze all directories, if the URL I just
                # fetched is:
                # http://localhost/a/b/c/f00.php I want to GET:
                # http://localhost/a/b/c/
                # http://localhost/a/b/
                # http://localhost/a/
                # http://localhost/
                # And analyze the responses...
                dirs = resp.get_url().get_directories()
                only_re_refs = set(re_refs) - set(dirs + parsed_refs)

                all_refs = itertools.chain(dirs, parsed_refs, re_refs)

                # Sorting puts equal references next to each other, which
                # is presumably what unique_justseen needs in order to drop
                # the repeated ones (assuming it behaves like the itertools
                # recipe with the same name).
                for ref in unique_justseen(sorted(all_refs)):

                    # I don't want w3af sending requests to 3rd parties!
                    if ref.get_domain() != self._target_domain:
                        continue

                    # Filter the URL's according to the configured regexs
                    urlstr = ref.url_string
                    if not self._compiled_follow_re.match(urlstr) or \
                            self._compiled_ignore_re.match(urlstr):
                        continue

                    if self._only_forward:
                        if not self._is_forward(ref):
                            continue

                    # Work with the parsed references and report broken
                    # links. Then work with the regex references and DO NOT
                    # report broken links
                    if self._need_more_variants(ref):
                        self._known_variants.append(ref)
                        possibly_broken = ref in only_re_refs
                        yield ref, fuzzable_req, original_url, possibly_broken
Exemplo n.º 2
0
class TestVariantDB(unittest.TestCase):
    '''
    Tests for VariantDB.need_more_variants() and VariantDB._clean_reference().
    '''

    def setUp(self):
        # Every test works on its own VariantDB instance; the temp dir is
        # created before the instance is built.
        create_temp_dir()
        self.vdb = VariantDB()

    def _store_all(self, urls):
        # Helper, not a test: each URL has to be still wanted by the DB
        # right before it is stored. `urls` is consumed lazily, so the
        # check/append calls stay interleaved with the URL creation.
        for candidate in urls:
            self.assertTrue(self.vdb.need_more_variants(candidate))
            self.vdb.append(candidate)

    def _assert_cleaned(self, raw_url, expected):
        # Helper, not a test: compare the cleaned form of raw_url.
        cleaned = self.vdb._clean_reference(URL(raw_url))
        self.assertEqual(cleaned, expected)

    def test_db_int(self):
        # One numeric parameter: after `limit` stored variants no more are
        # needed.
        template = 'http://w3af.org/foo.htm?id=%s'
        limit = 5

        self._store_all(URL(template % idx) for idx in xrange(limit))

        extra = URL(template % (limit + 1, ))
        self.assertFalse(self.vdb.need_more_variants(extra))

    def test_db_int_int(self):
        # Two numeric parameters, only the first one changes.
        template = 'http://w3af.org/foo.htm?id=%s&bar=1'
        limit = 5

        self._store_all(URL(template % idx) for idx in xrange(limit))

        extra = URL(template % (limit + 1, ))
        self.assertFalse(self.vdb.need_more_variants(extra))

    def test_db_int_int_var(self):
        # Two numeric parameters, both change on every variant.
        template = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        limit = 5

        self._store_all(URL(template % (idx, idx)) for idx in xrange(limit))

        extra = URL(template % (limit + 1, limit + 1))
        self.assertFalse(self.vdb.need_more_variants(extra))

    def test_db_int_str(self):
        # A numeric parameter and a string parameter.
        template = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        limit = 5

        self._store_all(URL(template % (idx, 'abc' * idx))
                        for idx in xrange(limit))

        extra = URL(template % (limit + 1, 'abc' * (limit + 1)))
        self.assertFalse(self.vdb.need_more_variants(extra))

    def test_db_int_str_then_int_int(self):
        template = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        limit = 5

        # First store the (int, str) variants
        self._store_all(URL(template % (idx, 'abc' * idx))
                        for idx in xrange(limit))

        # Now an (int, int) URL is requested while only (int, str) ones
        # were stored, so it has to be reported as still needed
        int_int_extra = URL(template % (limit + 1, limit + 1))
        self.assertTrue(self.vdb.need_more_variants(int_int_extra))

        # Then store the (int, int) variants
        self._store_all(URL(template % (idx, idx)) for idx in xrange(limit))

        int_int_extra = URL(template % (limit + 1, limit + 1))
        self.assertFalse(self.vdb.need_more_variants(int_int_extra))

    def test_clean_reference_simple(self):
        self._assert_cleaned('http://w3af.org/',
                             u'http://w3af.org/')

    def test_clean_reference_file(self):
        self._assert_cleaned('http://w3af.org/index.php',
                             u'http://w3af.org/index.php')

    def test_clean_reference_directory_file(self):
        self._assert_cleaned('http://w3af.org/foo/index.php',
                             u'http://w3af.org/foo/index.php')

    def test_clean_reference_directory_file_int(self):
        self._assert_cleaned('http://w3af.org/foo/index.php?id=2',
                             u'http://w3af.org/foo/index.php?id=number')

    def test_clean_reference_int(self):
        self._assert_cleaned('http://w3af.org/index.php?id=2',
                             u'http://w3af.org/index.php?id=number')

    def test_clean_reference_int_str(self):
        self._assert_cleaned(
            'http://w3af.org/index.php?id=2&foo=bar',
            u'http://w3af.org/index.php?id=number&foo=string')

    def test_clean_reference_int_str_empty(self):
        self._assert_cleaned(
            'http://w3af.org/index.php?id=2&foo=bar&spam=',
            u'http://w3af.org/index.php?id=number&foo=string&spam=string')
Exemplo n.º 3
0
class web_spider(CrawlPlugin):
    """
    Crawl the web application.

    crawl() sends the request it receives, queues the fuzzable requests
    created from the response and hands the response over to
    _extract_links_and_verify(). _urls_to_verify_generator() yields the
    references found in a response that pass all the configured filters.

    :author: Andres Riancho ([email protected])
    """

    NOT_404 = set([http_constants.UNAUTHORIZED, http_constants.FORBIDDEN])

    def __init__(self):
        CrawlPlugin.__init__(self)

        # Internal variables
        self._compiled_ignore_re = None
        self._compiled_follow_re = None
        self._broken_links = DiskSet()
        self._first_run = True
        self._known_variants = VariantDB()
        self._already_filled_form = ScalableBloomFilter()

        # Set by crawl() once a target is available. They are defined here
        # so that the attributes always exist: with _target_domain set to
        # None the domain filter in _urls_to_verify_generator() simply
        # rejects every reference instead of raising AttributeError.
        self._target_urls = []
        self._target_domain = None

        # User configured variables
        self._ignore_regex = ""
        self._follow_regex = ".*"
        self._only_forward = False
        self._compile_re()

    def crawl(self, fuzzable_req):
        """
        Searches for links on the html.

        :param fuzzable_req: A fuzzable_req instance that contains
                             (among other things) the URL to test.
        :return: None. The fuzzable requests created from the response are
                 put in self.output_queue.
        """
        if self._first_run:
            # I have to set some variables, in order to be able to code
            # the "only_forward" feature
            #
            # The targets are read only ONCE and checked before being used:
            # reading cf.cf.get('targets')[0] directly triggered lots of
            # bugs when the "stop" button was pressed and the core did
            # this: "cf.cf.save('targets', [])"
            targets = cf.cf.get("targets")
            if not targets:
                # Nothing to crawl. _first_run is left untouched so that
                # the setup is retried on the next call instead of running
                # the rest of the plugin without a target domain.
                return

            self._target_urls = [i.get_domain_path() for i in targets]
            self._target_domain = targets[0].get_domain()

            # Only mark the setup as done once it really succeeded
            self._first_run = False

        #
        # If it is a form, then smart_fill the parameters to send something that
        # makes sense and will allow us to cover more code.
        #
        if isinstance(fuzzable_req, HTTPPostDataRequest):

            # Forms whose URL is already in the filter are not sent again
            if fuzzable_req.get_url() in self._already_filled_form:
                return

            fuzzable_req = self._fill_form(fuzzable_req)

        # Send the HTTP request,
        resp = self._uri_opener.send_mutant(fuzzable_req)

        # Nothing to do here... (401 is the HTTP "Unauthorized" status code)
        if resp.get_code() == 401:
            return

        fuzz_req_list = self._create_fuzzable_requests(resp, request=fuzzable_req, add_self=False)

        for fr in fuzz_req_list:
            self.output_queue.put(fr)

        self._extract_links_and_verify(resp, fuzzable_req)

    def _urls_to_verify_generator(self, resp, fuzzable_req):
        """
        Generate the references found in resp that are worth requesting.

        Nothing is yielded when the response is not text/html, PDF or SWF,
        or when no document parser can be found for it.

        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
        :return: Yields (ref, fuzzable_req, original_url, possibly_broken)
                 tuples, where original_url is resp.get_redir_uri() and
                 possibly_broken is True when the reference was found only
                 by the regular expressions (not by the parser and not as a
                 directory of the response URL).
        """
        #
        # Note: I WANT to follow links that are in the 404 page.
        #

        # Modified when I added the PDFParser
        # I had to add this x OR y stuff, just because I don't want
        # the SGML parser to analyze a image file, its useless and
        # consumes CPU power.
        if resp.is_text_or_html() or resp.is_pdf() or resp.is_swf():
            original_url = resp.get_redir_uri()
            try:
                doc_parser = parser_cache.dpc.get_document_parser_for(resp)
            except w3afException as w3:
                om.out.debug("Failed to find a suitable document parser. " 'Exception "%s"' % w3)
            else:
                # Note:
                # - With parsed_refs I'm 100% that it's really
                # something in the HTML that the developer intended to add.
                #
                # - The re_refs are the result of regular expressions,
                # which in some cases are just false positives.

                parsed_refs, re_refs = doc_parser.get_references()

                # I also want to analyze all directories, if the URL I just
                # fetched is:
                # http://localhost/a/b/c/f00.php I want to GET:
                # http://localhost/a/b/c/
                # http://localhost/a/b/
                # http://localhost/a/
                # http://localhost/
                # And analyze the responses...
                dirs = resp.get_url().get_directories()
                only_re_refs = set(re_refs) - set(dirs + parsed_refs)

                all_refs = itertools.chain(dirs, parsed_refs, re_refs)

                # Sorting puts equal references next to each other, which
                # is presumably what unique_justseen needs in order to drop
                # the repeated ones (assuming it behaves like the itertools
                # recipe with the same name).
                for ref in unique_justseen(sorted(all_refs)):

                    # I don't want w3af sending requests to 3rd parties!
                    if ref.get_domain() != self._target_domain:
                        continue

                    # Filter the URL's according to the configured regexs
                    urlstr = ref.url_string
                    if not self._compiled_follow_re.match(urlstr) or self._compiled_ignore_re.match(urlstr):
                        continue

                    if self._only_forward:
                        if not self._is_forward(ref):
                            continue

                    # Work with the parsed references and report broken
                    # links. Then work with the regex references and DO NOT
                    # report broken links
                    if self._need_more_variants(ref):
                        self._known_variants.append(ref)
                        possibly_broken = ref in only_re_refs
                        yield ref, fuzzable_req, original_url, possibly_broken
Exemplo n.º 4
0
class TestVariantDB(unittest.TestCase):
    '''
    Tests for VariantDB.need_more_variants() and VariantDB._clean_reference().
    '''

    def setUp(self):
        # The temp dir is created first, then a new VariantDB for the test.
        create_temp_dir()
        self.vdb = VariantDB()

    def _add_variants(self, url_fmt, values):
        # Helper, not a test: build one URL per item of `values` (each item
        # is applied to url_fmt with the % operator), verify that the DB
        # still asks for it and only then store it.
        for value in values:
            variant = URL(url_fmt % value)
            self.assertTrue(self.vdb.need_more_variants(variant))
            self.vdb.append(variant)

    def test_db_int(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s'
        count = 5

        self._add_variants(url_fmt, xrange(count))

        # `count` variants are stored already, one more is not needed
        beyond = URL(url_fmt % (count + 1,))
        self.assertFalse(self.vdb.need_more_variants(beyond))

    def test_db_int_int(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s&bar=1'
        count = 5

        self._add_variants(url_fmt, xrange(count))

        beyond = URL(url_fmt % (count + 1,))
        self.assertFalse(self.vdb.need_more_variants(beyond))

    def test_db_int_int_var(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        count = 5

        self._add_variants(url_fmt, ((n, n) for n in xrange(count)))

        beyond = URL(url_fmt % (count + 1, count + 1))
        self.assertFalse(self.vdb.need_more_variants(beyond))

    def test_db_int_str(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        count = 5

        self._add_variants(url_fmt, ((n, 'abc' * n) for n in xrange(count)))

        beyond = URL(url_fmt % (count + 1, 'abc' * (count + 1)))
        self.assertFalse(self.vdb.need_more_variants(beyond))

    def test_db_int_str_then_int_int(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        count = 5

        # Store the (int, str) variants
        self._add_variants(url_fmt, ((n, 'abc' * n) for n in xrange(count)))

        # Only (int, str) URLs were stored so far, which means that an
        # (int, int) URL must still be needed
        beyond = URL(url_fmt % (count + 1, count + 1))
        self.assertTrue(self.vdb.need_more_variants(beyond))

        # Store the (int, int) variants
        self._add_variants(url_fmt, ((n, n) for n in xrange(count)))

        beyond = URL(url_fmt % (count + 1, count + 1))
        self.assertFalse(self.vdb.need_more_variants(beyond))

    def test_clean_reference_simple(self):
        cleaned = self.vdb._clean_reference(URL('http://w3af.org/'))
        self.assertEqual(cleaned, u'http://w3af.org/')

    def test_clean_reference_file(self):
        cleaned = self.vdb._clean_reference(URL('http://w3af.org/index.php'))
        self.assertEqual(cleaned, u'http://w3af.org/index.php')

    def test_clean_reference_directory_file(self):
        source_url = URL('http://w3af.org/foo/index.php')
        cleaned = self.vdb._clean_reference(source_url)
        self.assertEqual(cleaned, u'http://w3af.org/foo/index.php')

    def test_clean_reference_directory_file_int(self):
        source_url = URL('http://w3af.org/foo/index.php?id=2')
        cleaned = self.vdb._clean_reference(source_url)
        self.assertEqual(cleaned, u'http://w3af.org/foo/index.php?id=number')

    def test_clean_reference_int(self):
        source_url = URL('http://w3af.org/index.php?id=2')
        cleaned = self.vdb._clean_reference(source_url)
        self.assertEqual(cleaned, u'http://w3af.org/index.php?id=number')

    def test_clean_reference_int_str(self):
        source_url = URL('http://w3af.org/index.php?id=2&foo=bar')
        cleaned = self.vdb._clean_reference(source_url)
        self.assertEqual(cleaned,
                         u'http://w3af.org/index.php?id=number&foo=string')

    def test_clean_reference_int_str_empty(self):
        source_url = URL('http://w3af.org/index.php?id=2&foo=bar&spam=')
        cleaned = self.vdb._clean_reference(source_url)
        self.assertEqual(
            cleaned,
            u'http://w3af.org/index.php?id=number&foo=string&spam=string')