def testSetResponseData(self):
    """set_data() replaces the wrapped body; reads/seeks then see the new data."""
    from mechanize import response_seek_wrapper

    raw = TestUnSeekableResponse(self.text, {'blah': 'yawn'})
    wrapper = response_seek_wrapper(raw)
    wrapper.set_data(b"""\
A Seeming somwhat more than View;
  That doth instruct the Mind
  In Things that ly behind,
""")
    # Sequential reads consume the replaced data in order.
    self.assertEqual(wrapper.read(9), b"A Seeming")
    self.assertEqual(wrapper.read(13), b" somwhat more")
    # Seeking back to the start rewinds over the replaced data too.
    wrapper.seek(0)
    self.assertEqual(wrapper.read(9), b"A Seeming")
    self.assertEqual(wrapper.readline(), b" somwhat more than View;\n")
    wrapper.seek(0)
    self.assertEqual(wrapper.readline(),
                     b"A Seeming somwhat more than View;\n")
    # Relative seek (whence=1) steps back over the newline just read.
    wrapper.seek(-1, 1)
    self.assertEqual(wrapper.read(7), b"\n  That")

    # set_data() also works before any read has happened.
    raw = TestUnSeekableResponse(self.text, {'blah': 'yawn'})
    wrapper = response_seek_wrapper(raw)
    wrapper.set_data(self.text)
    self._test2(wrapper)
    wrapper.seek(0)
    self._test4(wrapper)
def testSetResponseData(self):
    """set_data() replaces the wrapped body; reads/seeks then see the new data."""
    from mechanize import response_seek_wrapper

    raw = TestUnSeekableResponse(self.text, {'blah': 'yawn'})
    wrapper = response_seek_wrapper(raw)
    wrapper.set_data("""\
A Seeming somwhat more than View;
  That doth instruct the Mind
  In Things that ly behind,
""")
    # Sequential reads consume the replaced data in order.
    self.assertEqual(wrapper.read(9), "A Seeming")
    self.assertEqual(wrapper.read(13), " somwhat more")
    # Seeking back to the start rewinds over the replaced data too.
    wrapper.seek(0)
    self.assertEqual(wrapper.read(9), "A Seeming")
    self.assertEqual(wrapper.readline(), " somwhat more than View;\n")
    wrapper.seek(0)
    self.assertEqual(wrapper.readline(),
                     "A Seeming somwhat more than View;\n")
    # Relative seek (whence=1) steps back over the newline just read.
    wrapper.seek(-1, 1)
    self.assertEqual(wrapper.read(7), "\n  That")

    # set_data() also works before any read has happened.
    raw = TestUnSeekableResponse(self.text, {'blah': 'yawn'})
    wrapper = response_seek_wrapper(raw)
    wrapper.set_data(self.text)
    self._test2(wrapper)
    wrapper.seek(0)
    self._test4(wrapper)
def test_set_response(self):
    """Browser.set_response() swaps in a new response and reparses links."""
    import copy
    from mechanize import response_seek_wrapper

    br = TestBrowser()
    url = "http://example.com/"
    html = """<html><body><a href="spam">click me</a></body></html>"""
    headers = {"content-type": "text/html"}
    resp = response_seek_wrapper(MockResponse(url, html, headers))
    br.add_handler(make_mock_handler()([("http_open", resp)]))

    resp = br.open(url)
    self.assertEqual(resp.read(), html)
    resp.seek(0)
    # Copies of the response read the same data independently.
    self.assertEqual(copy.copy(resp).read(), html)
    self.assertEqual(list(br.links())[0].url, "spam")

    newhtml = """<html><body><a href="eggs">click me</a></body></html>"""
    resp.set_data(newhtml)
    self.assertEqual(resp.read(), newhtml)
    # Mutating our handle does not affect the browser's own response...
    self.assertEqual(br.response().read(), html)
    # ...and br.response() hands back copies, so set_data on one is lost.
    br.response().set_data(newhtml)
    self.assertEqual(br.response().read(), html)
    self.assertEqual(list(br.links())[0].url, "spam")

    # Only an explicit set_response() changes what the browser sees.
    resp.seek(0)
    br.set_response(resp)
    self.assertEqual(br.response().read(), newhtml)
    self.assertEqual(list(br.links())[0].url, "eggs")
def test_set_response(self):
    """Browser.set_response() swaps in a new response and reparses links."""
    import copy
    from mechanize import response_seek_wrapper

    br = TestBrowser()
    url = "http://example.com/"
    html = b"""<html><body><a href="spam">click me</a></body></html>"""
    headers = {"content-type": "text/html"}
    resp = response_seek_wrapper(MockResponse(url, html, headers))
    br.add_handler(make_mock_handler()([("http_open", resp)]))

    resp = br.open(url)
    self.assertEqual(resp.read(), html)
    resp.seek(0)
    # Copies of the response read the same data independently.
    self.assertEqual(copy.copy(resp).read(), html)
    self.assertEqual(list(br.links())[0].url, "spam")

    newhtml = b"""<html><body><a href="eggs">click me</a></body></html>"""
    resp.set_data(newhtml)
    self.assertEqual(resp.read(), newhtml)
    # Mutating our handle does not affect the browser's own response...
    self.assertEqual(br.response().read(), html)
    # ...and br.response() hands back copies, so set_data on one is lost.
    br.response().set_data(newhtml)
    self.assertEqual(br.response().read(), html)
    self.assertEqual(list(br.links())[0].url, "spam")

    # Only an explicit set_response() changes what the browser sees.
    resp.seek(0)
    br.set_response(resp)
    self.assertEqual(br.response().read(), newhtml)
    self.assertEqual(list(br.links())[0].url, "eggs")
def http_response(self, request, response):
    """mechanize response hook: pipe any HTML body through HTML Tidy.

    Non-HTML responses pass through untouched.  Returns the (possibly
    wrapped) response so later handlers see the cleaned data.
    """
    # Ensure the response is seekable so get_data()/set_data() work.
    if not hasattr(response, "seek"):
        response = mechanize.response_seek_wrapper(response)
    headers = response.info().dict
    # Only run Tidy over HTML payloads.  `in` replaces the deprecated
    # (and Python-3-removed) dict.has_key().
    if 'content-type' in headers and 'html' in headers['content-type']:
        # -q: suppress non-document output; -i: indent the result.
        p = Popen([self.tidybin, "-q", "-i"],
                  stdout=PIPE, stdin=PIPE, stderr=PIPE)
        # communicate() feeds the body to tidy and collects its stdout
        # without deadlocking on full pipes.
        html = p.communicate(input=response.get_data())[0]
        response.set_data(html)
    return response
def http_response(self, request, response):
    """mechanize response hook: prettify HTML bodies with BeautifulSoup.

    Non-HTML responses pass through untouched.  Returns the (possibly
    wrapped) response so later handlers see the cleaned markup.
    """
    # Ensure the response is seekable so get_data()/set_data() work.
    if not hasattr(response, "seek"):
        response = mechanize.response_seek_wrapper(response)
    headers = response.info().dict
    # Only use BeautifulSoup if the response is HTML.  `in` replaces
    # the deprecated (and Python-3-removed) dict.has_key().
    if 'content-type' in headers and 'html' in headers['content-type']:
        soup = BeautifulSoup(response.get_data())
        response.set_data(soup.prettify())
    return response
def http_response(self, request, response):
    """Sanitize HTML responses by round-tripping them through BeautifulSoup."""
    if not hasattr(response, "seek"):
        # Wrap so the body can be read and then replaced.
        response = mechanize.response_seek_wrapper(response)
    content_type = response.info().get('content-type', '')
    if 'html' in content_type:
        soup = get_soup(response.get_data())
        pretty = soup.prettify(encoding=soup.original_encoding)
        response.set_data(pretty)
    return response
def http_response(self, request, response):
    """mechanize response hook: prettify HTML bodies with MinimalSoup.

    Non-HTML responses pass through untouched.  Returns the (possibly
    wrapped) response so later handlers see the cleaned markup.
    """
    # Ensure the response is seekable so get_data()/set_data() work.
    if not hasattr(response, "seek"):
        response = mechanize.response_seek_wrapper(response)
    headers = response.info().dict
    # Only use MinimalSoup if the response is HTML.  `in` replaces the
    # deprecated (and Python-3-removed) dict.has_key().
    if 'content-type' in headers and 'html' in headers['content-type']:
        soup = MinimalSoup(response.get_data())
        response.set_data(soup.prettify())
    return response
def http_response(self, request, httpResponse):
    """Route HTML bodies through BeautifulSoup, a robust/forgiving parser."""
    if not hasattr(httpResponse, "seek"):
        # Make the response seekable so its body can be read and replaced.
        httpResponse = mechanize.response_seek_wrapper(httpResponse)
    headers = httpResponse.info().dict
    if 'content-type' in headers and 'html' in headers['content-type']:
        parsed = BeautifulSoup(httpResponse.get_data())
        httpResponse.set_data(parsed.prettify())
    return httpResponse
def http_response(self, request, response):
    """Sanitize HTML payloads via BeautifulSoup before later handlers run."""
    if not hasattr(response, "seek"):
        # Seekable wrapper enables get_data()/set_data().
        response = mechanize.response_seek_wrapper(response)
    if 'html' in response.info().get('content-type', ''):
        soup = get_soup(response.get_data())
        response.set_data(soup.prettify(encoding=soup.original_encoding))
    return response
def testGetResponseData(self):
    """get_data() returns the full body without disturbing later reads."""
    from mechanize import response_seek_wrapper

    raw = TestUnSeekableResponse(self.text, {'blah': 'yawn'})
    wrapper = response_seek_wrapper(raw)
    # get_data() exposes the whole body up front...
    self.assertEqual(wrapper.get_data(), self.text)
    # ...and normal read/seek behaviour still holds afterwards.
    self._test2(wrapper)
    wrapper.seek(0)
    self._test4(wrapper)
def testResponseSeekWrapper(self):
    """Copying a seek wrapper yields an independent object with equal headers."""
    from mechanize import response_seek_wrapper

    hdrs = {"Content-type": "text/html"}
    r = TestUnSeekableResponse(self.text, hdrs)
    rsw = response_seek_wrapper(r)
    rsw2 = self._testCopy(rsw)
    # assertTrue replaces the deprecated assert_ alias (removed in
    # Python 3.12); matches the other variant of this test.
    self.assertTrue(rsw is not rsw2)
    self.assertEqual(rsw.info(), rsw2.info())
    self.assertTrue(rsw.info() is not rsw2.info())
    # should be able to close already-closed object
    rsw2.close()
    rsw2.close()
def testResponseSeekWrapper(self):
    """Copying a seek wrapper yields an independent object with equal headers."""
    from mechanize import response_seek_wrapper

    headers = {"Content-type": "text/html"}
    wrapped = response_seek_wrapper(
        TestUnSeekableResponse(self.text, headers))
    duplicate = self._testCopy(wrapped)
    # The copy is a distinct object whose headers compare equal but are
    # not shared with the original.
    self.assertTrue(wrapped is not duplicate)
    self.assertEqual(wrapped.info(), duplicate.info())
    self.assertTrue(wrapped.info() is not duplicate.info())
    # Closing an already-closed object must be harmless.
    duplicate.close()
    duplicate.close()
def processLink(self, link):
    """Process a link.

    Marks *link* as visited, then fetches it with the browser.  On an
    HTTP error the failure is counted and reported, and the error page
    itself is installed as the browser's current response so callers
    can still inspect it.
    """
    url = link.absoluteURL
    # Whatever will happen, we have looked at the URL.
    self.visited.append(url)
    # Retrieve the content.
    try:
        self.browser.open(link.callableURL)
    # `except ... as` replaces the Python-2-only comma form; it is
    # valid on Python 2.6+ and required on Python 3.
    except urllib2.HTTPError as error:
        # Something went wrong with retrieving the page.
        self.linkErrors += 1
        self.sendMessage(
            '%s (%i): %s' % (error.msg, error.code, link.callableURL), 2)
        self.sendMessage('+-> Reference: ' + link.referenceURL, 2)
        # Now set the error page as the response; HTTPError doubles as
        # a file-like response object.
        from mechanize import response_seek_wrapper
        self.browser._response = response_seek_wrapper(error)
def http_response(self, request, response):
    """mechanize response hook: normalize HTML bodies via lxml.

    Parses the body with lxml.html, falling back to the more forgiving
    BeautifulSoup-based soupparser on encoding/syntax errors, then
    replaces the response data with a pretty-printed serialization.
    Non-HTML responses pass through untouched.
    """
    # Ensure the response is seekable so get_data()/set_data() work.
    if not hasattr(response, "seek"):
        response = mechanize.response_seek_wrapper(response)
    headers = response.info().dict
    # Only rewrite HTML payloads.  `in` replaces the deprecated
    # (and Python-3-removed) dict.has_key().
    if 'content-type' in headers and 'html' in headers['content-type']:
        tag_soup = response.get_data()
        try:
            self.element = lxml.html.fromstring(tag_soup)
            # Serializing to unicode verifies that entity/encoding
            # conversion succeeded; the result itself is discarded.
            lxml.etree.tostring(self.element, encoding=unicode)
        except (UnicodeDecodeError, lxml.etree.XMLSyntaxError):
            # Fall back to the BeautifulSoup-backed parser on error.
            self.element = lxml.html.soupparser.fromstring(tag_soup)
        response.set_data(
            lxml.etree.tostring(self.element, pretty_print=True,
                                method="html"))
    return response
def http_response(self, request, response):
    """mechanize response hook: pipe any HTML body through HTML Tidy.

    Non-HTML responses pass through untouched.  Returns the (possibly
    wrapped) response so later handlers see the cleaned data.
    """
    # Ensure the response is seekable so get_data()/set_data() work.
    if not hasattr(response, "seek"):
        response = mechanize.response_seek_wrapper(response)
    headers = response.info().dict
    # Only run Tidy over HTML payloads.  `in` replaces the deprecated
    # (and Python-3-removed) dict.has_key().
    if 'content-type' in headers and 'html' in headers['content-type']:
        # -q: suppress non-document output; -i: indent the result.
        p = Popen([self.tidybin, "-q", "-i"],
                  stdout=PIPE, stdin=PIPE, stderr=PIPE)
        # communicate() feeds the body to tidy and collects its stdout
        # without deadlocking on full pipes.
        html = p.communicate(input=response.get_data())[0]
        response.set_data(html)
    return response