def test_init(self):
    """Response construction: both url and request are mandatory and validated."""
    # Missing url and/or request must fail.
    self.assertRaises(Exception, Response)
    self.assertRaises(Exception, Response, url='http://www.example.com/')
    self.assertRaises(Exception, Response,
                      request=Request('http://www.example.com/'))
    # Malformed url / status / request values are rejected with precise types.
    self.assertRaises(ValueError, Response, url='foo',
                      request=Request('http://www.example.com/'))
    self.assertRaises(ValueError, Response, 'http://www.example.com/',
                      status='foo',
                      request=Request('http://www.example.com/'))
    self.assertRaises(TypeError, Response, 'http://www.example.com/',
                      request='foo')
    # A well-formed call yields a response with a url and an empty body.
    response = Response('http://www.example.com/',
                        Request('http://www.example.com/'))
    assert response.url
    assert not response.body
    # Headers may be supplied at construction time.
    response = Response('http://www.example.com/',
                        Request('http://www.example.com/'),
                        headers={'Content-Type': 'text/html',
                                 'Content-Length': 1234})
def test_dynamic_request_browser_actions(self):
    """Drive a real browser login via browser_actions, then reuse the cookiejar.

    First request logs in through Chrome; the second request on the same
    cookiejar should be recognised as an authenticated session.
    """
    cm = CookiesMiddleware(self.spider, self.spider.settings)
    self.driver = webdriver.Chrome()
    dh = DownloadHandler(self.spider, self.driver, self.driver_sem)

    def _actions(driver):
        # Fill the login form, submit, and give the page time to settle.
        driver.find_element_by_name('account').send_keys("username")
        driver.find_element_by_name('password').send_keys("pwd")
        driver.find_element_by_xpath(
            '/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button'
        ).click()
        gevent.sleep(5)

    request = Request(
        'https://www.zhihu.com/#signin',
        dynamic=True,
        meta={'cookiejar': 'test'},
        browser_actions=[_actions],
    )
    cm.process_request(request)
    response = dh.fetch(request)
    cm.process_response(request, response)

    # Second request reuses the cookies captured during login.
    request = Request('https://www.zhihu.com', dynamic=True,
                      meta={'cookiejar': 'test'})
    cm.process_request(request)
    response = dh.fetch(request)
    cm.process_response(request, response)
    # Was the Python-2-only statement `print response.body`; the function-call
    # form behaves identically on Python 2 and also runs on Python 3.
    print(response.body)
    self.driver.close()
def test_body(self):
    """Response.body is always bytes; unicode bodies honour `encoding`."""
    # No body given: defaults to empty bytes.
    r1 = Response(url="http://www.example.com/",
                  request=Request('http://www.example.com/'))
    assert r1.body == b''
    # Explicit empty bytes body; encoding defaults to utf-8.
    r2 = Response(url="http://www.example.com/", body=b"",
                  request=Request('http://www.example.com/'))
    assert isinstance(r2.body, bytes)
    self.assertEqual(r2.encoding, 'utf-8')  # default encoding
    # Unicode body is encoded with the requested codec.
    r3 = Response(url="http://www.example.com/", body=u"Price: \xa3100",
                  encoding='utf-8',
                  request=Request('http://www.example.com/'))
    assert isinstance(r3.body, bytes)
    self.assertEqual(r3.body, b"Price: \xc2\xa3100")
    r4 = Response(url="http://www.example.com/",
                  request=Request('http://www.example.com/'),
                  body=u"Price: \xa3100", encoding='latin1')
    assert isinstance(r4.body, bytes)
    self.assertEqual(r4.body, b"Price: \xa3100")
def test_url(self):
    """Request.url is a native str, safely quoted; non-strings are rejected."""
    req = Request('http://www.example.com/')
    self.assertIsInstance(req.url, str)
    self.assertEqual(req.url, 'http://www.example.com/')
    # Non-ASCII URLs are normalised the same way safe_url_string does it.
    req = Request(u'http://www.example.com?content=测试')
    self.assertEqual(req.url,
                     safe_url_string('http://www.example.com?content=测试'))
    self.assertRaises(TypeError, Request, 123)
def test_copy(self):
    """Request.copy() yields an equal but distinct object."""
    original = Request('http://www.example.com/',
                       headers={'Content-Type': 'text/html',
                                'Content-Length': 1234},
                       method='get')
    clone = original.copy()
    # Same state, same headers, equal — but not the same object.
    assert original.__dict__ == clone.__dict__
    self.assertEqual(original.headers, clone.headers)
    self.assertEqual(original, clone)
    self.assertIsNot(original, clone)
def test_process_request_interval(self):
    """With a single proxy configured, consecutive fetches are rate-limited."""
    self.spider.settings.set("PROXY_LIST", ['218.76.106.78:3128'])
    middleware = ProxyMiddleware(self.spider.settings, self.spider.logger)
    handler = DownloadHandler(self.spider, None, BoundedSemaphore(1))
    req = Request('http://httpbin.org/get')
    middleware.process_request(req)
    started = time.time()
    handler.fetch(req)
    req = Request('http://httpbin.org/get')
    middleware.process_request(req)
    # Reusing the sole proxy must enforce a gap of more than 3 seconds.
    self.assertGreater(time.time() - started, 3)
def test_init(self):
    """Request requires a well-formed URL; HTTP method is upper-cased."""
    self.assertRaises(Exception, Request)
    self.assertRaises(ValueError, Request, 'foo')
    req = Request('http://www.example.com/')
    assert req.url
    assert not req.body
    # Lower-case method names are normalised to upper case.
    req = Request('http://www.example.com/',
                  headers={'Content-Type': 'text/html',
                           'Content-Length': 1234},
                  method='get')
    self.assertEqual(req.method, 'GET')
def test_timeout_dynamic(self):
    """Dynamic (browser-driven) fetches honour the TIMEOUT setting."""
    self.driver = webdriver.PhantomJS()
    self.spider.settings.set('TIMEOUT', 5)
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    # A 10-second server delay must trip the 5-second timeout.
    self.assertRaises(TimeoutException, handler.fetch,
                      Request(HTTPBIN_URL + '/delay/10', dynamic=True))
    self.driver.close()
def test_process_response(self):
    """RetryMiddleware re-issues 500 responses until the retry limit."""
    request = Request('http://httpbin.org/')
    response = Response('http://httpbin.org/', request, status=500)
    middleware = RetryMiddleware(self.spider.settings, self.spider.logger)
    # dont_retry short-circuits: the response passes through untouched.
    request.meta["dont_retry"] = True
    self.assertEqual(middleware.process_response(request, response), response)
    request.meta["dont_retry"] = False
    # First retry yields a new Request carrying retry_count == 1 ...
    request = middleware.process_response(request, response)
    self.assertIsInstance(request, Request)
    self.assertEqual(request.meta.get("retry_count"), 1)
    # ... as do the second and third retries ...
    request = middleware.process_response(request, response)
    self.assertIsInstance(request, Request)
    request = middleware.process_response(request, response)
    self.assertIsInstance(request, Request)
    # ... after which the Response is handed back instead of retrying again.
    self.assertIsInstance(middleware.process_response(request, response),
                          Response)
def test_process_request(self):
    """A request routed through the proxy middleware still fetches a body."""
    self.spider.settings.set("PROXY_LIST", ['124.88.67.54:80'])
    middleware = ProxyMiddleware(self.spider.settings, self.spider.logger)
    handler = DownloadHandler(self.spider, None, BoundedSemaphore(1))
    req = Request('http://httpbin.org/get')
    middleware.process_request(req)
    resp = handler.fetch(req)
    assert resp.body
def after_login(self, response):
    """Yield a follow-up Request for every question link on the landing page."""
    tree = etree.HTML(response.body)
    for anchor in tree.xpath('//a[@class="question_link"]'):
        # Links are site-relative; prefix the host and keep the session cookies.
        yield Request('https://www.zhihu.com' + anchor.attrib["href"],
                      meta={"cookiejar": "zhihu"},
                      callback=self.get_item)
def test_dynamic_request_cookie_between_static_and_dynamic(self):
    """Cookies captured by a dynamic fetch are replayed on static fetches."""
    cm = CookiesMiddleware(self.spider, self.spider.settings)
    self.driver = webdriver.PhantomJS()
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    # Set two cookies through the browser-driven (dynamic) path.
    req = Request(HTTPBIN_URL + '/cookies/set?key1=val1&key2=val2',
                  dynamic=True, meta={'cookiejar': 'test'})
    resp = handler.fetch(req)
    cm.process_response(req, resp)
    # The plain (static) path on the same cookiejar must send them back.
    req = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test'})
    cm.process_request(req)
    resp = handler.fetch(req)
    self.assertEqual(json.loads(resp.body)['cookies'],
                     {u'key1': u'val1', u'key2': u'val2'})
    self.driver.close()
def test_post_data_content_static(self):
    """POST bodies: dicts are sent as form data, text bodies verbatim."""
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    # Dict body -> form-encoded payload, echoed in httpbin's 'form' field.
    resp = handler.fetch(Request(HTTPBIN_URL + '/post', method='POST',
                                 body={'text': 'pycreeper'}))
    self.assertIsInstance(resp, Response)
    self.assertEqual(json.loads(resp.body)['form'], {'text': 'pycreeper'})
    # Unicode body -> raw data, echoed in httpbin's 'data' field.
    resp = handler.fetch(Request(HTTPBIN_URL + '/post', method='POST',
                                 body=u'Unicode测试'))
    self.assertEqual(json.loads(resp.body)['data'], 'Unicode测试')
    # Byte-string body is handled the same way.
    resp = handler.fetch(Request(HTTPBIN_URL + '/post', method='POST',
                                 body='中文测试'))
    self.assertEqual(json.loads(resp.body)['data'], '中文测试')
    self.assertEqual(resp.status, 200)
def test_process_request(self):
    """UserAgentMiddleware injects a User-Agent that the server echoes back."""
    req = Request('http://httpbin.org/user-agent')
    # No User-Agent before the middleware runs.
    self.assertIs(req.headers.get("User-Agent"), None)
    middleware = UserAgentMiddleware(self.spider.settings, self.spider.logger)
    handler = DownloadHandler(self.spider, None, BoundedSemaphore(1))
    middleware.process_request(req)
    resp = handler.fetch(req)
    # httpbin reports the UA it received; it must match what we sent.
    self.assertEqual(json.loads(resp.body)['user-agent'],
                     req.headers['User-Agent'])
def test_concurrency_with_delayed_url(self):
    """Static fetches run concurrently: n one-second delays finish in < n s."""
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    workers = 5
    pool = Pool(workers)
    requests = [Request(HTTPBIN_URL + '/delay/1') for _ in range(workers)]
    started = time.time()
    pool.map(handler.fetch, requests)
    elapsed = time.time() - started
    self.assertLess(elapsed, workers)
def start_requests(self):
    """Entry point: search JD for laptops through a live browser session."""
    def _search(driver):
        # Type the query, submit with Enter, and dismiss any overlay guides.
        driver.find_element_by_id('key').send_keys(u"联想笔记本", Keys.ENTER)
        gevent.sleep(3)
        self._jump_guide(driver)
        gevent.sleep(3)

    yield Request(url='https://www.jd.com/',
                  meta={"cookiejar": "jd"},
                  callback=self.parse_list,
                  dynamic=True,
                  browser_actions=[_search])
def test_dynamic_request_concurrency(self):
    """Dynamic fetches share one browser, so n delayed fetches take > n s."""
    self.driver = webdriver.PhantomJS()
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    workers = 5
    pool = Pool(workers)
    requests = [Request(HTTPBIN_URL + '/delay/1', dynamic=True, wait=5)
                for _ in range(workers)]
    started = time.time()
    pool.map(handler.fetch, requests)
    self.assertGreater(time.time() - started, workers)
    self.driver.close()
def start_requests(self):
    """Entry point: log in to zhihu through the browser, then crawl."""
    def _login(driver):
        # Fill the login form, submit, and wait for the redirect to settle.
        driver.find_element_by_name('account').send_keys("username")
        driver.find_element_by_name('password').send_keys("password")
        driver.find_element_by_xpath(
            '/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button'
        ).click()
        gevent.sleep(5)

    yield Request(url='https://www.zhihu.com/#signin',
                  meta={"cookiejar": "zhihu"},
                  callback=self.after_login,
                  dynamic=True,
                  browser_actions=[_login])
def test_copy(self):
    """Response.copy() duplicates headers and request, preserving equality."""
    original = Response('http://www.example.com/',
                        headers={'Content-Type': 'text/html',
                                 'Content-Length': 1234},
                        request=Request('http://www.example.com/'))
    clone = original.copy()
    # Equal state throughout ...
    assert original.__dict__ == clone.__dict__
    self.assertEqual(original.headers, clone.headers)
    self.assertEqual(original.request, clone.request)
    self.assertEqual(original, clone)
    # ... but headers, request, and the response itself are fresh objects.
    self.assertIsNot(original.headers, clone.headers)
    self.assertIsNot(original.request, clone.request)
    self.assertIsNot(original, clone)
def parse_list(self, response):
    """Extract product entries from a JD listing page, then page onward.

    Yields one dict per product (path / title / img / price), followed by a
    dynamic Request that clicks through to the next result page in the
    same browser session.
    """
    tree = etree.HTML(response.body)
    links = tree.xpath('//div[@class="p-img"]/a')
    titles = tree.xpath('//div[@class="p-name p-name-type-2"]/a/em')
    imgs = tree.xpath('//div[@class="p-img"]/a/img')
    prices = tree.xpath('//div[@class="p-price"]/strong/i')
    # Index loop (not zip): the four node lists may differ in length, and a
    # short list must only skip the affected entry, not truncate the rest.
    for i in range(len(links)):
        try:
            href = links[i].attrib["href"]
            src = imgs[i].attrib["src"]
            yield {
                # Scheme-relative URLs ("//...") need an explicit scheme.
                'path': href if 'http' in href else 'http:' + href,
                'title': parser.unescape(
                    etree.tostring(titles[i], pretty_print=True)),
                'img': src if 'http' in src else 'http:' + src,
                'price': prices[i].text,
            }
        except Exception:
            # Best-effort scrape: entries with incomplete markup (missing
            # attribute, mismatched element counts) are skipped on purpose.
            # (Previously bound the exception to an unused variable `e`.)
            continue
    url = response.url

    def _next_page(driver):
        # Dismiss overlays, click the "next page" arrow, dismiss again.
        self._jump_guide(driver)
        driver.find_element_by_xpath(
            '//*[@id="J_bottomPage"]/span[1]/a[9]').click()
        self._jump_guide(driver)

    yield Request(url=url, meta={"cookiejar": "jd"},
                  callback=self.parse_list, dynamic=True,
                  browser_actions=[_next_page])
from pycreeper.http.request import Request
from pycreeper.spider import Spider
from Queue import Empty

__doctests__ = ['pycreeper.utils.scheduler']

# URL fixtures exercising the request filter: near-duplicates, query
# strings, and a non-HTTP scheme.
URLS = [
    'http://www.example.com/index.html#print',
    'http://www.example.com/index.html',
    'http://www.xxx.com/index.html?id=77&nameid=2905210001&page=1',
    'http://www.xxxxx.com/index.html?id=77&nameid=2905210001&page=1',
    'http://www.xxxxx.com/index.html?test123123',
    'http://www.xxxxx.com/index.html',
    'ftp://www.xxxxx.com/index.html',
]

REQUEST = [Request(url) for url in URLS]


class RequestTest(unittest.TestCase):

    def test_basic(self):
        """request_seen reports False on first sighting, True afterwards."""
        request_filter = RequestFilter()
        request_filter.request_seen(REQUEST[0])
        self.assertEqual(request_filter.request_seen(REQUEST[0]), True)
        self.assertEqual(request_filter.request_seen(REQUEST[1]), False)
        self.assertEqual(request_filter.request_seen(REQUEST[1]), True)
        # None is not a Request and must raise, not be recorded.
        self.assertRaises(AttributeError, request_filter.request_seen, None)


class SchedulerTest(unittest.TestCase):

    def setUp(self):
        self.spider = Spider()
def test_dynamic_request_wait(self):
    """A dynamic request with `wait` completes and yields a Response.

    The original test asserted nothing — it only checked that fetch() did
    not raise. At minimum the fetch must return a Response object.
    """
    self.driver = webdriver.PhantomJS()
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    request = Request(HTTPBIN_URL + '/get', dynamic=True, wait=3)
    response = handler.fetch(request)
    self.assertIsInstance(response, Response)
    self.driver.close()
def test_timeout_static(self):
    """Static fetches honour the TIMEOUT setting (5 s limit vs 10 s delay)."""
    self.spider.settings.set('TIMEOUT', 5)
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    self.assertRaises(TimeoutException, handler.fetch,
                      Request(HTTPBIN_URL + '/delay/10'))
def test_post_data_static(self):
    """A bodyless POST succeeds and returns HTTP 200."""
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    resp = handler.fetch(Request(HTTPBIN_URL + '/post', method='POST'))
    self.assertIsInstance(resp, Response)
    self.assertEqual(resp.status, 200)
def test_get_data(self):
    """A plain GET succeeds and returns HTTP 200."""
    handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
    resp = handler.fetch(Request(HTTPBIN_URL + '/get'))
    self.assertIsInstance(resp, Response)
    self.assertEqual(resp.status, 200)
def start_requests(self):
    """Yield one Request per configured start URL."""
    for start_url in self.start_urls:
        yield Request(start_url)
def test_request(self):
    """Response exposes its originating Request and compares by value."""
    resp = Response('http://www.example.com/',
                    request=Request('http://www.example.com/'))
    self.assertIsInstance(resp.request, Request)
    self.assertEqual(resp.request, Request('http://www.example.com/'))