Exemplo n.º 1
0
    def test_basic_html(self):
        """An HTML skeleton with an empty head and body yields no URLs."""
        document = """<!DOCTYPE html>
            <html>
            <head>
                <title></title>
            </head>
            <body>
            </body>
            </html>"""

        found_urls = UrlFinder(document).get_urls()

        self.assertEqual(found_urls, [])
Exemplo n.º 2
0
    def test_with_relative_link_and_relative_base_href(self):
        """A relative link is prefixed with the document's relative <base href>.

        Two scenarios are checked: a base href with a leading slash and one
        without; in both the expected URL is the base href followed by the
        link's href.
        """
        # Scenario 1: base href starts with a slash.
        rooted_base_doc = """<!DOCTYPE html>
            <html>
            <head>
                <base href="/someWeirdPath/" target="_self">
            </head>
            <body>
            <a href="test.html">test</a>
            </body>
            </html>"""
        rooted_urls = UrlFinder(rooted_base_doc).get_urls()
        self.assertEqual(rooted_urls, ["/someWeirdPath/test.html"])

        # Scenario 2: base href has no leading slash.
        unrooted_base_doc = """<!DOCTYPE html>
            <html>
            <head>
                <base href="someWeirdPath/" target="_self">
            </head>
            <body>
            <a href="test.html">test</a>
            </body>
            </html>"""
        unrooted_urls = UrlFinder(unrooted_base_doc).get_urls()
        self.assertEqual(unrooted_urls, ["someWeirdPath/test.html"])
Exemplo n.º 3
0
    def get_requests(self):  # Shared.options['process_timeout']
        """Fetch ``self.request.url`` and return the requests it leads to.

        The page is downloaded with the opener built by
        ``self.urllib2_opener``; the cookies of ``self.request`` are sent
        along, and cookies collected in the response jar are attached to
        every returned request.

        Returns:
            A list of ``Request`` objects: one ``REQTYPE_LINK`` request per
            URL reported by ``UrlFinder`` (none when the body is empty), or
            a single ``REQTYPE_REDIRECT`` request when a
            ``RedirectException`` is caught (its string form is used as the
            target URL).

        Raises:
            Exception: if the request method is POST (not supported).
            NotHtmlException: if the response declares a Content-Type other
                than ``text/html``.
            Exception: the last error is re-raised once ``self.retries`` is
                exhausted.
        """
        if self.request.method == "POST":
            raise Exception("POST method with urllib is not supported yet")

        self.retries_interval = 0.5

        jar_response = cookielib.LWPCookieJar()
        jar_request = cookielib.LWPCookieJar()

        requests = []

        while True:
            # Per-attempt state: reset here so a retry neither duplicates
            # cookies nor reuses results from a failed attempt.
            set_cookie = []
            urls = []
            opener = None
            try:
                for cookie in self.request.cookies:
                    jar_request.set_cookie(cookie.get_cookielib_cookie())

                opener = self.urllib2_opener(self.request, jar_response)
                req = urllib2.Request(url=self.request.url)
                jar_request.add_cookie_header(req)

                res = opener.open(req, None, self.timeout)

                set_cookie = [Cookie(cookie.__dict__, self.request.url)
                              for cookie in jar_response]

                # .get() yields None for a missing header instead of raising,
                # so a response without Content-Type is treated as HTML.
                ctype = res.info().get('Content-Type')
                if ctype is not None:
                    # Ignore parameters (e.g. charset) and surrounding spaces.
                    if ctype.lower().split(";")[0].strip() != "text/html":
                        raise NotHtmlException(ERROR_CONTENTTYPE)

                html = res.read()

                # An empty body simply produces no links.
                if html:
                    html = decode_bytes(html)
                    urls = UrlFinder(html).get_urls()

                for url in urls:
                    # @TODO handle FORMS
                    requests.append(
                        Request(REQTYPE_LINK,
                                "GET",
                                url,
                                parent=self.request,
                                set_cookie=set_cookie,
                                parent_db_id=self.request.db_id))

                break

            except RedirectException as e:
                set_cookie = [Cookie(cookie.__dict__, self.request.url)
                              for cookie in jar_response]

                requests.append(
                    Request(REQTYPE_REDIRECT,
                            "GET",
                            str(e),
                            parent=self.request,
                            set_cookie=set_cookie,
                            parent_db_id=self.request.db_id))
                break
            except NotHtmlException:
                # Not a transient failure: never retried.
                raise
            except Exception:
                self.retries -= 1
                # "<=" so a non-positive starting budget cannot loop forever.
                if self.retries <= 0:
                    raise
                time.sleep(self.retries_interval)
            finally:
                # Release the opener on every exit path of the attempt.
                if opener is not None:
                    opener.close()

        return requests
Exemplo n.º 4
0
    def test_with_http_absolute_link(self):
        """An absolute http:// link is returned unchanged."""
        markup = '<a href="http://test.lan">test</a>'

        found_urls = UrlFinder(markup).get_urls()

        self.assertEqual(found_urls, ["http://test.lan"])
Exemplo n.º 5
0
    def test_with_anchor_link(self):
        """A fragment-only link (``#test``) is not reported as a URL."""
        markup = '<a href="#test">test</a>'

        found_urls = UrlFinder(markup).get_urls()

        self.assertEqual(found_urls, [])
Exemplo n.º 6
0
    def test_empty_html(self):
        """An empty document yields an empty URL list."""
        markup = ""

        found_urls = UrlFinder(markup).get_urls()

        self.assertEqual(found_urls, [])
Exemplo n.º 7
0
    def test_with_relative_link(self):
        """A relative link with no <base> element is returned as written."""
        markup = '<a href="test.html">test</a>'

        found_urls = UrlFinder(markup).get_urls()

        self.assertEqual(found_urls, ["test.html"])
Exemplo n.º 8
0
	def get_requests(self): # Shared.options['process_timeout']
		"""Fetch ``self.request.url`` and return the requests it leads to.

		The page is downloaded with the opener built by
		``self.urllib2_opener``; the cookies of ``self.request`` are sent
		along, and cookies collected in the response jar are attached to
		every returned request.

		Returns:
			A list of ``Request`` objects: one ``REQTYPE_LINK`` request per
			URL reported by ``UrlFinder`` (none when the body is empty), or
			a single ``REQTYPE_REDIRECT`` request when a
			``RedirectException`` is caught (its string form is used as the
			target URL).

		Raises:
			Exception: if the request method is POST (not supported).
			NotHtmlException: if the response declares a Content-Type other
				than ``text/html``.
			Exception: the last error is re-raised once ``self.retries`` is
				exhausted.
		"""
		if self.request.method == "POST":
			raise Exception("POST method with urllib is not supported yet")

		self.retries_interval = 0.5

		jar_response = cookielib.LWPCookieJar()
		jar_request = cookielib.LWPCookieJar()

		requests = []

		while True:
			# Per-attempt state: reset here so a retry neither duplicates
			# cookies nor reuses results from a failed attempt.
			set_cookie = []
			urls = []
			opener = None
			try:
				for cookie in self.request.cookies:
					jar_request.set_cookie(cookie.get_cookielib_cookie())

				opener = self.urllib2_opener(self.request, jar_response)
				req = urllib2.Request(url=self.request.url)
				jar_request.add_cookie_header(req)

				res = opener.open(req, None, self.timeout)

				set_cookie = [Cookie(cookie.__dict__, self.request.url) for cookie in jar_response]

				# .get() yields None for a missing header instead of raising,
				# so a response without Content-Type is treated as HTML.
				ctype = res.info().get('Content-Type')
				if ctype is not None:
					# Ignore parameters (e.g. charset) and surrounding spaces.
					if ctype.lower().split(";")[0].strip() != "text/html":
						raise NotHtmlException(ERROR_CONTENTTYPE)

				html = res.read()

				# An empty body simply produces no links.
				if html:
					urls = UrlFinder(html).get_urls()

				for url in urls:
					# @TODO handle FORMS
					requests.append(Request(REQTYPE_LINK, "GET", url, parent=self.request, set_cookie=set_cookie, parent_db_id=self.request.db_id))

				break

			except RedirectException as e:
				set_cookie = [Cookie(cookie.__dict__, self.request.url) for cookie in jar_response]

				r = Request(REQTYPE_REDIRECT, "GET", str(e), parent=self.request, set_cookie=set_cookie, parent_db_id=self.request.db_id)
				requests.append(r)
				break
			except NotHtmlException:
				# Not a transient failure: never retried.
				raise
			except Exception:
				self.retries -= 1
				# "<=" so a non-positive starting budget cannot loop forever.
				if self.retries <= 0:
					raise
				time.sleep(self.retries_interval)
			finally:
				# Release the opener on every exit path of the attempt.
				if opener is not None:
					opener.close()

		return requests