Example No. 1
    def setUp(self):
        self.base = urlparse.urlparse('localhost')
        # Shared queues that connect the scraper and worker threads.
        self.url_queue = Queue.Queue()
        self.html_queue = Queue.Queue()
        self.sqli_queue = Queue.Queue()
        self.visited_queue = Queue.Queue()
        self.forms_queue = Queue.Queue()
        self.worker = WorkThread(self.html_queue, self.url_queue, self.base,
                                 self.sqli_queue, self.forms_queue)
        # Sample page under test: one form, eight <a> hrefs, and surrounding
        # markup that must not be picked up as links.
        self.html = "<form action='moo'><input type='text' name='input_box'></form><p>This is not a link</p><a href='vacaloca'>Moo</a><a href='/vacaloca?fail=1'>Moo</a><div id='href'>www.thisisalinknotinalink.com</div><a href='/vacaloca?fail=1&cat=2'>Moo</a><a href='/vacaloca?fail=1'>Moo</a><a href='http://localhost/vacaloca'>Moo</a><a href='/vacaloca'>Moo</a><h2>And also not this</h2><span>seriously, move on</span><a href='javascript:sillyjs()'>Click Me</a><a href='#datdiv'>DatDiv</a>"
Example No. 2
    def spawn_threads(self):

        # One worker thread parses fetched HTML and feeds newly found URLs back in.
        worker = WorkThread(self.html_queue, self.url_queue, self.base,
                            self.sqli_queue, self.forms_queue)
        worker.setDaemon(True)
        worker.start()

        # Five scraper threads pull URLs off url_queue, fetch the pages
        # (optionally through the configured proxy), and hand the HTML to the worker.
        scrapers = []
        for i in range(5):
            t = ScrapeThread(self.url_queue, self.html_queue,
                             self.visited_queue, self.proxy, self.proxy_port,
                             worker)
            t.setDaemon(True)
            t.start()
            scrapers.append(t)

        # Poll the worker, refreshing the status line until it runs out of work.
        while worker.isAlive():
            self.update_status()
            sleep(0.1)

        sys.stdout.write("\rKillin Scrapers..........")
        sys.stdout.flush()
        for thread in scrapers:
            thread.join()
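This snippet uses the Python 2 threading spellings setDaemon() and isAlive(); the latter was removed in Python 3.9. A minimal sketch of the same spawn loop with Python 3 naming, assuming WorkThread and ScrapeThread keep the constructor signatures shown above:

    def spawn_threads(self):
        worker = WorkThread(self.html_queue, self.url_queue, self.base,
                            self.sqli_queue, self.forms_queue)
        worker.daemon = True              # replaces setDaemon(True)
        worker.start()

        scrapers = []
        for _ in range(5):
            t = ScrapeThread(self.url_queue, self.html_queue,
                             self.visited_queue, self.proxy, self.proxy_port,
                             worker)
            t.daemon = True
            t.start()
            scrapers.append(t)

        while worker.is_alive():          # replaces isAlive()
            self.update_status()
            sleep(0.1)

        sys.stdout.write("\rKillin Scrapers..........")
        sys.stdout.flush()
        for thread in scrapers:
            thread.join()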
Example No. 3
class TestWorker(unittest.TestCase):
    def setUp(self):
        self.base = urlparse.urlparse('localhost')
        self.url_queue = Queue.Queue()
        self.html_queue = Queue.Queue()
        self.sqli_queue = Queue.Queue()
        self.visited_queue = Queue.Queue()
        self.forms_queue = Queue.Queue()
        self.worker = WorkThread(self.html_queue, self.url_queue, self.base,
                                 self.sqli_queue, self.forms_queue)
        self.html = "<form action='moo'><input type='text' name='input_box'></form><p>This is not a link</p><a href='vacaloca'>Moo</a><a href='/vacaloca?fail=1'>Moo</a><div id='href'>www.thisisalinknotinalink.com</div><a href='/vacaloca?fail=1&cat=2'>Moo</a><a href='/vacaloca?fail=1'>Moo</a><a href='http://localhost/vacaloca'>Moo</a><a href='/vacaloca'>Moo</a><h2>And also not this</h2><span>seriously, move on</span><a href='javascript:sillyjs()'>Click Me</a><a href='#datdiv'>DatDiv</a>"

    def test_extract_links(self):
        links = self.worker.extract_links(self.html)
        self.assertTrue(
            len(links) == 8, "must extract all <a> and skip non <a> "
        )  #extract only and all links
        self.assertTrue(links[0] == 'vacaloca',
                        "must return href string")  #return only href part

    def test_extract_forms(self):
        forms = self.worker.extract_forms(self.html)
        self.assertTrue(len(forms) == 1, "must extract all forms")

    def test_crunch_links(self):
        self.assertTrue(
            len(self.worker.crunch_links(["mailto:[email protected]"])) == 0,
            "must skip email links")
        self.assertTrue(
            len(self.worker.crunch_links(["javascript:moo()"])) == 0,
            "must skip javascript links")
        self.assertTrue(
            len(self.worker.crunch_links(["#moo_div"])) == 0,
            "must skip id links")
        self.assertTrue(
            len(self.worker.crunch_links(["http://remote.com"])) == 0,
            "must skip remote links")
        self.assertTrue(
            len(self.worker.crunch_links(
                ["http://localhost/happy.php?fail=1"])) == 1,
            "must not skip local links")

    def test_is_a_new_url(self):
        url = urlparse.urlparse("http://www.bacon.com/")
        self.worker.seen = [url]
        self.assertFalse(self.worker.is_a_new_url(url),
                         "must return False for already seen urls")
        self.assertTrue(
            self.worker.is_a_new_url(
                urlparse.urlparse("http://www.pancakes.com")),
            "must return True for new urls")

    def test_match_url(self):
        url_1 = urlparse.urlparse("http://www.bacon.com/")
        url_2 = urlparse.urlparse("http://www.pancakes.com/")
        self.assertFalse(self.worker.match_url(url_1, url_2),
                         "must return False for different urls")
        self.assertTrue(self.worker.match_url(url_1, url_1),
                        "must return True for matching urls")

    def test_match_params(self):
        url_1 = urlparse.urlparse("http://www.bacon.com/?id=1")
        url_2 = urlparse.urlparse("http://www.bacon.com/?id=1&moo=2")
        self.assertTrue(self.worker.match_params(url_1, url_1),
                        "must return True for matching params")
        self.assertFalse(self.worker.match_params(url_1, url_2),
                         "must return False for non matching params")

    def test_match_base(self):
        self.worker.base = urlparse.urlparse("localhost")
        url_1 = urlparse.urlparse("http://localhost/?id=1")
        url_2 = urlparse.urlparse("http://www.bacon.com/?id=1&moo=2")
        self.assertTrue(self.worker.match_base(url_1),
                        "must return True for urls of same base")
        self.assertFalse(self.worker.match_base(url_2),
                         "must return False for urls of different base")

    def test_detect_sqli(self):
        self.assertTrue(
            self.worker.detect_sqli(
                urlparse.urlparse("http://somesite.com/posts.php?id=1337")),
            "must detect simple parameter injection")
        self.assertFalse(
            self.worker.detect_sqli(urlparse.urlparse("http://somesite.com")),
            "must not show false positives")

    def test_detect_juicy_files(self):
        self.assertTrue(
            self.worker.detect_juicy_files(
                urlparse.urlparse("http://somesite.com/passwords.pdf")),
            "must detect pdf files")
        self.assertTrue(
            self.worker.detect_juicy_files(
                urlparse.urlparse("http://somesite.com/passwords.xls")),
            "must detect xls files")
        self.assertTrue(
            self.worker.detect_juicy_files(
                urlparse.urlparse("http://somesite.com/passwords.doc")),
            "must detect doc files")
        self.assertTrue(
            self.worker.detect_juicy_files(
                urlparse.urlparse("http://somesite.com/passwords.txt")),
            "must detect txt files")
        self.assertFalse(
            self.worker.detect_juicy_files(
                urlparse.urlparse("http://somesite.com")),
            "must not show false positives")

    def test_eat_urls(self):
        self.worker.seen = []  # clear seen urls
        crunched_links = self.worker.crunch_links(
            self.worker.extract_links("<a href='/onelink'>one love</a>"))
        self.worker.eat_urls(crunched_links)
        self.assertTrue(self.url_queue.qsize() == 1,
                        "must add new links to url_queue")

    def test_eat_forms(self):
        self.worker.forms = []
        self.worker.eat_forms(
            self.worker.extract_forms(
                "<form action='moo.php'><input type='text' name='username' /></form>"
            ))
        self.assertTrue(self.forms_queue.qsize() == 1, "must eat forms")

    def test_work(self):
        self.worker.seen = []  # clear seen urls
        self.worker.work(self.html)
        self.assertTrue(self.url_queue.qsize() > 0,
                        "must add new links to url_queue")