def test_xhtml(self):
    xhtml = b"""<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><title>XHTML document title</title></head>
<body>
    <div class='links'>
        <p><a href="/about.html">About us</a></p>
    </div>
    <div>
        <p><a href="/follow.html">Follow this link</a></p>
    </div>
    <div>
        <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
    </div>
    <div>
        <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
    </div>
    <div>
        <p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
    </div>
</body>
</html>
"""
    expected = [
        Link(url="http://example.com/about.html", text="About us",
             fragment="", nofollow=False),
        Link(url="http://example.com/follow.html", text="Follow this link",
             fragment="", nofollow=False),
        Link(url="http://example.com/nofollow.html", text="Dont follow this one",
             fragment="", nofollow=True),
        Link(url="http://example.com/nofollow2.html", text="Choose to follow or not",
             fragment="", nofollow=False),
        Link(url="http://google.com/something", text="External link not to follow",
             nofollow=True),
    ]

    # The extractor should find the same links whether the body is parsed as HTML...
    response = HtmlResponse("http://example.com/index.xhtml", body=xhtml)
    lx = self.extractor_cls()
    self.assertEqual(lx.extract_links(response), expected)

    # ...or as XML.
    response = XmlResponse("http://example.com/index.xhtml", body=xhtml)
    lx = self.extractor_cls()
    self.assertEqual(lx.extract_links(response), expected)
<meta charset="UTF-8">
<title></title>
</head>
<body>
<ul>
    <li class="item-"><a id='i1' href="link.html" class='ding'>first item</a></li>
    <li class="item-0"><a id='i2' href="llink.html" class='ding'>first item</a></li>
    <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
<div><a href="llink2.html">second item</a></div>
<div><a href="llink2.html">10</a></div>
</body>
</html>
"""

# Build the Response object
response = HtmlResponse(url='', body=html, encoding='utf-8')
selector = Selector(response=response)

# Select every <a> tag
temp = selector.xpath('//a')

# Take the first <body> node, then look for <ul> beneath it;
# './/ul' is evaluated relative to that node
temp = selector.xpath('body')[0].xpath('.//ul')
print(temp)

# <ul> elements that are direct children of <body>
temp = selector.xpath('body/ul')

# <li> descendants of <body>
temp = selector.xpath('body//li')

# Empty ([]): <li> is not a direct child of <body>
temp = selector.xpath('body/li')

# Parent of <body>
temp = selector.xpath('body')[0].xpath('..')
def test_generic_form_requests_with_spider_args(self):
    name = "ebay3"
    args = {'search_string': 'Cars'}
    spider = self.smanager.create(name, **args)
    generic_form_request = list(spider.start_requests())[0]
    response = HtmlResponse(
        url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
    response.request = generic_form_request
    request_list = [request_to_dict(req, spider)
                    for req in generic_form_request.callback(response)]
    # One search request per _in_kw value, then the final 'parse' request.
    base = (u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901'
            u'&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl='
            u'&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi='
            u'&_in_kw={}&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=')
    expected = [
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': base.format(in_kw), 'dont_filter': True, 'priority': 0,
         'callback': 'after_form_page', 'method': 'GET', 'errback': None}
        for in_kw in (1, 2, 3, 4)
    ]
    expected.append(
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'dont_filter': True,
         'priority': 0, 'callback': 'parse', 'method': 'GET', 'errback': None})
    self.assertEqual(request_list, expected)
def process_request(self, request, spider):
    driver = webdriver.PhantomJS()
    driver.get(request.url)
    time.sleep(2)
    body = driver.page_source
    url = driver.current_url
    driver.quit()  # avoid leaking one browser process per request
    return HtmlResponse(url, body=body.replace(u'\xa9', u''),
                        encoding='utf-8', request=request)
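# For a downloader middleware like this to run, it has to be registered in the
# project settings. A minimal sketch, assuming the class is named
# PhantomJSMiddleware and lives in myproject/middlewares.py (both hypothetical):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.PhantomJSMiddleware': 543,
}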
def test_meta_refresh(self):
    req = Request(url='http://example.org')
    rsp = HtmlResponse(req.url, body=self._body())
    req2 = self.mw.process_response(req, rsp, self.spider)
    assert isinstance(req2, Request)
    self.assertEqual(req2.url, 'http://example.org/newpage')
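# self._body() is a helper defined elsewhere in the test class. A minimal
# sketch of what it plausibly returns, assuming the standard meta-refresh
# markup that MetaRefreshMiddleware follows:

def _body(self, interval=5, url='http://example.org/newpage'):
    # A page whose meta-refresh tag redirects to `url` after `interval` seconds.
    html = ('<html><head>'
            '<meta http-equiv="refresh" content="%s;url=%s">'
            '</head></html>' % (interval, url))
    return html.encode('utf-8')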
def process_request(self, request, spider):
    """
    :param request: the request being processed
    :param spider: the spider that issued the request
    :return: an HtmlResponse rendered by a logged-in browser session
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')     # run Chrome headless
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')

    # Dispatch on which spider is running
    if spider.name == 'scjrm_zszq':
        # Log in first
        # if request.url == "http://www.scjrm.com/site/login.html":
        print("<<<<<<<" + request.url)
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
        spider.driver.get("http://www.scjrm.com/site/login.html")
        # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
        time.sleep(2)
        # Type in the account name and password
        username = spider.driver.find_element_by_id('phonenumber')
        password = spider.driver.find_element_by_id('password')
        username.send_keys('18030535053')
        password.send_keys('123456')
        # Click the "log in" button
        spider.driver.find_element_by_id('sub_bt').click()
        time.sleep(1)
        spider.driver.get(request.url)
        time.sleep(3)
        spider.cookies = spider.driver.get_cookies()
        time.sleep(1)
        return HtmlResponse(url=spider.driver.current_url,   # URL after logging in
                            body=spider.driver.page_source,  # rendered HTML source
                            encoding='utf-8')
        # Not the login step:
        # else:
        #     req = requests.session()  # session carrying the login cookies
        #     for cookie in spider.cookies:
        #         req.cookies.set(cookie['name'], cookie["value"])
        #     req.headers.clear()  # drop the default headers
        #     newpage = req.get(request.url)
        #     time.sleep(5)
        #     return HtmlResponse(url=request.url,    # current URL
        #                         body=newpage.text,  # page source
        #                         encoding="utf-8", request=request)

    if spider.name == 'scjuchuang_yxzq':
        # Log in first
        # if request.url.find('login') != -1:
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
        spider.driver.get('https://www.scjuchuang.com/login')
        # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
        time.sleep(2)
        # Type in the account name and password
        username = spider.driver.find_element_by_class_name('loginName')
        password = spider.driver.find_element_by_class_name('loginPassword')
        username.send_keys('yczs123')
        password.send_keys('123456')
        # Click the "log in" button
        spider.driver.find_element_by_class_name('loginBtn').click()
        time.sleep(1)
        spider.driver.get('https://www.scjuchuang.com/goods?attr=1&page=1')
        # spider.driver.find_element_by_link_text('院线专区').click()
        spider.cookies = spider.driver.get_cookies()
        return HtmlResponse(url=spider.driver.current_url,   # URL after logging in
                            body=spider.driver.page_source,  # rendered HTML source
                            encoding='utf-8')

    elif spider.name == 'rjyiyao_xpsj':
        # Log in first
        # if request.url.find('login') != -1:
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe",
            chrome_options=chrome_options)
        spider.driver.get('http://new.rjyiyao.com/web/login')
        # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
        time.sleep(2)
        # Type in the account name and password
        username = spider.driver.find_element_by_id('username')
        password = spider.driver.find_element_by_id('password')
        username.send_keys('18030535053')
        password.send_keys('123456')
        # Click the "log in" button
        spider.driver.find_element_by_id('btnLogin').click()
        time.sleep(1)
        # spider.driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[3]/div[2]/a[2]/img').click()  # "new arrivals"
        # windows = spider.driver.window_handles
        # spider.driver.switch_to.window(windows[1])  # switch to the second window
        spider.driver.get('http://new.rjyiyao.com/web/product/group/5?page=1')
        time.sleep(5)
        spider.cookies = spider.driver.get_cookies()
        return HtmlResponse(url=spider.driver.current_url,   # URL after logging in
                            body=spider.driver.page_source,  # rendered HTML source
                            encoding='utf-8')

    elif spider.name == 'rjyiyao_zkzq':
        # Log in first
        # if request.url.find('login') != -1:
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe",
            chrome_options=chrome_options)
        spider.driver.get('http://new.rjyiyao.com/web/login')
        # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
        time.sleep(1)
        # Type in the account name and password
        username = spider.driver.find_element_by_id('username')
        password = spider.driver.find_element_by_id('password')
        username.send_keys('18030535053')
        password.send_keys('123456')
        # Click the "log in" button
        spider.driver.find_element_by_id('btnLogin').click()
        time.sleep(2)
        spider.driver.get('http://new.rjyiyao.com/web/product/sale/3?page=1')
        # spider.driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[3]/div[2]/a[2]/img').click()  # "new arrivals"
        # windows = spider.driver.window_handles
        # spider.driver.switch_to.window(windows[1])  # switch to the second window
        time.sleep(5)
        spider.cookies = spider.driver.get_cookies()
        return HtmlResponse(url=spider.driver.current_url,   # URL after logging in
                            body=spider.driver.page_source,  # rendered HTML source
                            encoding='utf-8')

    elif spider.name == 'sckxyy_ypzq':
        # Log in first
        # if request.url.find('login') != -1:
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
        spider.driver.get('http://www.sckxyy.com/Login.html')
        time.sleep(2)
        # Type in the account name and password
        username = spider.driver.find_element_by_id('usernameLogin')
        password = spider.driver.find_element_by_id('passwordLogin')
        username.send_keys('bianyuantianshi')
        password.send_keys('123456')
        # Click the "log in" button
        spider.driver.find_element_by_id('userLogin').click()
        time.sleep(1)
        spider.cookies = spider.driver.get_cookies()
        spider.driver.get('http://www.sckxyy.com/Drug_zone.html#Monday-bg-two')
        # spider.driver.find_element_by_link_text('普药专区').click()  # "general drugs" section
        # time.sleep(5)
        # windows = spider.driver.window_handles
        # spider.driver.switch_to.window(windows[1])  # switch to the second window
        return HtmlResponse(url=spider.driver.current_url,   # URL after logging in
                            body=spider.driver.page_source,  # rendered HTML source
                            encoding='utf-8')
def setUp(self):
    body = get_testdata('link_extractor', 'sgml_linkextractor.html')
    self.response = HtmlResponse(url='http://example.com/index', body=body)
def setUp(self) -> None:
    with RESPONSE_FAILED.open("rb") as file:
        self.s_response_failed = HtmlResponse(url="", body=file.read())
    with RESPONSE_SUCCEED.open("rb") as file:
        self.s_response_succeed = HtmlResponse(url="", body=file.read())
import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

response = HtmlResponse(url='http://ca.indeed.com')
print(response.selector.xpath('//span/text()').extract())
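# Note that constructing an HtmlResponse does not download anything, so the
# snippet above has an empty body and matches nothing. A sketch of the same
# idea with a real body, assuming `requests` is available to fetch the page
# outside a crawl:

import requests
from scrapy.http import HtmlResponse

url = 'http://ca.indeed.com'
body = requests.get(url).content  # raw bytes, so Scrapy handles decoding itself
response = HtmlResponse(url=url, body=body)
print(response.selector.xpath('//span/text()').extract())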
def parse(self, response):
    print('start url:', response.url)
    self.driver.get(response.url)
    self.driver.maximize_window()

    # Scroll to the bottom until the page height stops growing
    start_time = time.time()
    counter = 0
    while True:
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        try:
            print('scroll')
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        except Exception:
            break

    # Hide the footer bar
    element = self.driver.find_element_by_id("jump_paging")
    self.driver.execute_script("arguments[0].style.visibility='hidden'", element)

    # New Pantip login overlay (error first seen 31/07/19)
    login_message = self.driver.find_element_by_xpath("/html/body/div[4]/div/div/div[4]")
    self.driver.execute_script("arguments[0].style.visibility='hidden'", login_message)

    # Load sub-comments
    try:
        more_buttons = self.driver.find_elements_by_class_name('see-more')
        for x in range(len(more_buttons)):
            if more_buttons[x].is_displayed():
                more_buttons[x].click()
    except Exception:
        pass

    # Wrap the rendered page as a Scrapy response
    response = HtmlResponse(self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=response.request)
    response.selector.remove_namespaces()

    post_id = response.url.split('https://pantip.com/topic/')[1]
    title = response.xpath('//*[@id="topic-' + post_id + '"]/div/div[2]/h2/node()').extract()
    post_story = response.xpath(
        '/html/body/div[4]/div/div/div[3]/div/div[4]/div[1]/div//text()').extract()
    post_date = response.xpath(
        '//*[@id="topic-' + post_id + '"]/div/div[4]/div[2]/div[3]/div[3]/div/span/abbr/@data-utime').extract()
    post_tags = response.xpath(
        '/html/body/div[4]/div/div/div[3]/div/div[3]/div/div[2]/a/text()').extract()
    post_comments_time = response.xpath(
        './/div[@class="display-post-avatar-inner"]/span/abbr/@data-utime').extract()
    post_comments_time.pop(0)
    post_comments_userID = response.xpath(
        './/div[@class="display-post-avatar-inner"]/a//text()').extract()
    comments = self.driver.find_elements_by_class_name('display-post-story')

    # emotion_list = []
    # emotions = response.xpath(
    #     '/html/body/div[4]/div/div/div[3]/div/div[4]/div[2]/div[4]/div[1]/a/span//text()').extract()
    # emotions_count = response.xpath(
    #     '/html/body/div[4]/div/div/div[3]/div/div[4]/div[2]/div[4]/div[1]/span//text()').extract()
    # # remove first label of number of emotions
    # emotions_count.pop(0)
    # for i in range(len(emotions)):
    #     emotion_list.append({emotions[i]: emotions_count[i]})
    # print(emotion_list)

    # comments_emotions_list = []
    # try:
    #     emotions = self.driver.find_element_by_class_name(
    #         '/html/body/div[4]/div/div/div[6]/div/div/div[2]/div[3]/div[4]/div[1]/a/span')
    #     emotions_count = self.driver.find_elements_by_xpath(
    #         '/html/body/div[4]/div/div/div[6]/div/div/div[2]/div[3]/div[4]/div[1]/span')
    #     emotions_count.pop(0)
    #     for i in range(len(emotions)):
    #         comments_emotions_list.append({emotions[i].text: emotions_count[i].text})
    # except:
    #     pass
    # print(comments_emotions_list)

    # data_topic = {
    #     post_id: {
    #         'id': post_id,
    #         'user_id': post_comments_userID[0],
    #         'post_date': post_date[0],
    #         'post_tags': ','.join(str(c).strip() for c in post_tags),
    #         'post_title': title[0],
    #         'post_story': comments[0].text,
    #         'total_comment': int(len(post_comments_time)),
    #         # 'emotions': ''.join(str(c).strip() for c in emotion_list)
    #     }
    # }
    # self.firebase.uploadDatabase('data/scraped/post', data_topic)
    # comments.pop(0)
    # comments.pop(-1)
    # comments.pop(-1)
    # print(comments[0].text)

    post_comments_userID.pop(0)
    # if len(post_comments_userID) < len(comments):
    #     comments.pop(0)
    for i in range(len(post_comments_userID)):
        id = post_id + '_' + (post_comments_time[i].replace('/', '_')).replace(' ', '_')
        data_comment = {
            id: {
                "id": id,
                "user_id": post_comments_userID[i],
                "time": post_comments_time[i],
                "comment": comments[i].text,
            }
        }
        print(data_comment)
        self.firebase.uploadDatabase('data/model/comment', data_comment)
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

body = '<html><body><span>good</span></body></html>'
# A str body needs an explicit encoding for HtmlResponse
response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')
a = response.selector.xpath('//span/text()').extract()
print(a)
def parse_scene(self, response):
    jsondata = response.json()
    htmlcode = jsondata['solution']['response']
    response = HtmlResponse(url=response.url, body=htmlcode, encoding='utf-8')
    response_url = jsondata['solution']['url']
    cookies = jsondata['solution']['cookies']
    scenedate = None   # guard: either cookie may be absent
    performer = None
    for cookie in cookies:
        if cookie['name'] == 'mydate':
            scenedate = cookie['value']
        if cookie['name'] == 'performer':
            performer = cookie['value']

    item = SceneItem()
    if scenedate:
        item['date'] = self.parse_date(scenedate).isoformat()
    else:
        item['date'] = self.parse_date('today').isoformat()
    if performer:
        item['performers'] = [performer]
    else:
        item['performers'] = []

    item['title'] = self.get_title(response)
    item['description'] = self.get_description(response)
    item['image'] = self.get_image(response)
    item['image_blob'] = self.get_image_blob(response)
    item['tags'] = self.get_tags(response)
    if "" in item['tags']:
        item['tags'].remove("")
    item['id'] = re.search(r'/movie/(.*?)/', jsondata['solution']['url']).group(1)
    item['trailer'] = self.get_trailer(response)
    item['url'] = jsondata['solution']['url']
    item['network'] = "ATK Girlfriends"

    if "atkarchives" in response_url:
        item['parent'] = "ATK Archives"
        item['site'] = "ATK Archives"
    if "atkexotics" in response_url:
        item['parent'] = "ATK Exotics"
        item['site'] = "ATK Exotics"
    if "atkpremium" in response_url:
        item['parent'] = "ATK Premium"
        item['site'] = "ATK Premium"
    if "atkpetites" in response_url:
        item['parent'] = "ATK Petites"
        item['site'] = "ATK Petites"
    if "atkhairy" in response_url:
        item['parent'] = "ATK Hairy"
        item['site'] = "ATK Hairy"
    if "amkingdom" in response_url:
        item['parent'] = "ATK Galleria"
        item['site'] = "ATK Galleria"

    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')

    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    else:
        if filterdate:
            if item['date'] > filterdate:
                yield item
        else:
            yield item
def _buildresponse(body, **kwargs):
    kwargs.setdefault('body', body)
    kwargs.setdefault('url', 'http://example.com')
    kwargs.setdefault('encoding', 'utf-8')
    return HtmlResponse(**kwargs)
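# A quick usage sketch of this helper in a test; the markup and assertions
# here are hypothetical:

response = _buildresponse('<html><body><a href="/next">next</a></body></html>')
assert response.url == 'http://example.com'            # default from the helper
assert response.css('a::attr(href)').get() == '/next'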
def process_request(self, request, spider):
    url = request.url
    spider.chrome.get(url)
    time.sleep(3)
    html = spider.chrome.page_source
    return HtmlResponse(url=url, body=html, request=request, encoding='utf-8')
def process_request(self, request, spider):
    if spider.USE_SELENIUM:
        url = request.url  # public accessor; request._get_url() is private API
        self.driver.get(url)
        return HtmlResponse(url, body=self.driver.page_source, encoding='utf-8')
    # Returning None lets Scrapy's default downloader handle the request.
print("############### START ###################") with open('source.json', 'r') as f: source_data = json.load(f) output_data = [] errors = [] sending_pattern = DEFUALT_SENDING_PATTERN.copy() for book in tqdm(source_data): txt_to_search = book['name'] link_to_book = book['link'] description = book['pargraph'] sending_pattern['FreeText_1'] = txt_to_search try: r = requests.post(URL_TO_SEARCH_API, params=sending_pattern) response = HtmlResponse(url=URL_TO_SEARCH_API, body=r.text, encoding='utf-8') founded_books = response.selector.css('td[width="222"]') if len(founded_books) > 0: founded_books_title = [] for book_html in founded_books: book_title = ''.join( book_html.css('*::text').extract()).replace('\n', '') founded_books_title.append(book_title) founded_books_title_str = '|'.join(founded_books_title) output_data.append({ **book, 'founded_books_title': founded_books_title_str }) except Exception as e: print(f'ERROR: {e}')
def load_html():
    # Use a context manager so the file handle is closed after reading
    with codecs.open("test/resources/covid_stub.html", 'r') as file:
        return HtmlResponse(url="my HTML string", body=file.read(), encoding='utf-8')
def fake_response(url: str) -> Response:
    body = bytes(requests.get(url).text, 'UTF-8')
    return HtmlResponse(url, body=body)
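# When a test should not touch the network at all, the same shape can be fed
# from a fixture on disk. A minimal sketch, assuming a saved page at a
# hypothetical local path:

from pathlib import Path

from scrapy.http import HtmlResponse, Response

def fake_response_from_file(path: str, url: str = 'http://example.com') -> Response:
    # Build a response from a saved page instead of a live request.
    body = Path(path).read_bytes()  # e.g. 'fixtures/page.html' (hypothetical)
    return HtmlResponse(url, body=body)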
def test_restrict_xpaths_with_html_entities(self):
    html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=html, encoding='iso8859-15')
    links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
    self.assertEqual(links,
                     [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC',
                           text=u'text')])
def process_request(self, request, spider): """ 用PhantomJS抓取页面 """ # self.logger.debug('PhantomJS is Starting') page = request.meta.get('page', 1) print("我到这里了", ) # m不重要, m的作用是判断是否刷新页面 m = random.randint(2, 202) try: if page == 1: self.driver.get(request.url) time.sleep(random.uniform(1, 3)) self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item'))) self.driver.execute_script('window.scrollBy(0, 1200)') time.sleep(random.uniform(0.5, 1.5)) self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)') return HtmlResponse(url=request.url, body=self.driver.page_source, request=request, encoding='utf-8', status=200) if page <= m: input = self.wait.until( EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input'))) submit = self.wait.until( EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit'))) input.clear() input.send_keys(page) submit.click() time.sleep(random.uniform(1, 3)) self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item'))) self.driver.execute_script('window.scrollBy(0, 1200)') time.sleep(random.uniform(0.5, 1.5)) self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)') return HtmlResponse(url=request.url, body=self.driver.page_source, request=request, encoding='utf-8', status=200) if page > m: self.driver.get(request.url) self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item'))) self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)') input = self.wait.until( EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input'))) submit = self.wait.until( EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit'))) input.clear() input.send_keys(page) submit.click() time.sleep(random.uniform(1, 3)) self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item'))) self.driver.execute_script('window.scrollBy(0, 1200)') time.sleep(random.uniform(0.5, 1.5)) self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)') return HtmlResponse(url=request.url, body=self.driver.page_source, request=request, encoding='utf-8', status=200) except TimeoutException: self.driver.get(request.url) return HtmlResponse(url=request.url, status=500, request=request)
def getScrapyResponse(self, url):
    body = self.downloadUsingSelenium(url)
    return HtmlResponse(url=url, body=body, encoding='utf-8')
''' """ Scrapy选择器是Selector通过传递文本或TextResponse 对象构造的类的实例。 它根据输入类型自动选择最佳解析规则(XML vs HTML) """ from scrapy.selector import Selector from scrapy.http import HtmlResponse # 从文本构建 body = '<html><body><span>good</span></body></html>' print(Selector(text=body).xpath('//span/text()').extract()) # 从response(响应)中构建 response = HtmlResponse(url='https://sebastianraschka.com/blog/index.html', body=body, encoding='utf-8') print(Selector(response=response).xpath('//*/h1[@class="post-title"]/text()').extract()) # 上面那句等价于下面这句 print(response.selector.xpath('//*/h1[@class="post-title"]/text()').extract()) response = r""" <html> <head> <base href='http://example.com/' /> <title>Example website</title> </head> <body> <div id='images'> <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
def test_priority_adjust(self):
    req = Request('http://a.com')
    rsp = HtmlResponse(req.url, body=self._body())
    req2 = self.mw.process_response(req, rsp, self.spider)
    assert req2.priority > req.priority
def body_html(response_body: bytes) -> HtmlResponse:
    return HtmlResponse(url="",
                        body=to_json(response_body)["domops"][0][3]["__html"],
                        encoding="utf-8")
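# to_json is defined elsewhere in this project. A minimal sketch of one
# plausible implementation, assuming the payload is JSON behind an
# anti-hijacking guard such as 'for (;;);' -- an assumption, not confirmed
# by the snippet above:

import json

def to_json(raw: bytes) -> dict:
    # Hypothetical helper: strip a guard prefix, then parse the JSON payload.
    text = raw.decode('utf-8')
    prefix = 'for (;;);'
    if text.startswith(prefix):
        text = text[len(prefix):]
    return json.loads(text)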
def crawl_product_id():
    product_id_list = []
    i = 1
    while i < 3:
        driver = webdriver.Chrome("C:/bin/chromedriver.exe", chrome_options=options)
        driver.get(laptop_page_url.format(i))
        if "https://shopee.vn/Laptop-cat.13030.13065" in laptop_page_url.format(i):
            # Scroll down in steps so lazy-loaded items render
            y = 2300
            x = 1
            while y <= 4800:
                driver.execute_script("window.scrollTo(0, " + str(y) + ")")
                y += 1000
                # print("aaaaaaaaaaa")
                # try:
                #     print("bbbbbbb", WebDriverWait(driver, 1).until(EC.presence_of_element_located(
                #         (By.XPATH, '//*[@class="row shopee-search-item-result__items"]/div[{}]/div/a/div/div[2]/div[1]/div'.format({x})))))
                #     print("Page is ready!")
                # except TimeoutException:
                #     print("cccccccc")
                #     print("Loading took too much time!")
                x += 10
        body = driver.page_source
        abc = driver.current_url
        response = HtmlResponse(abc, body=body, encoding='utf8')
        print(body)
        if response is None:
            break
        for product in response.css("div.col-xs-2-4.shopee-search-item-result__item"):
            try:
                url = product.css("div a::attr(href)").get()
                print("link ok: ", url)
                # The product key is the shop/item id pair after "-i." in the URL
                product_key = url.rsplit("-i.", 1)[1]
                # product_id_dict = {"shop_id": product_key[0], "item_id": product_key[1]}
                # (earlier BeautifulSoup-based parsing attempts removed from the
                # active path; see the commented block below)
                product_id_list.append(product_key)
            except Exception:
                print("no!")
        driver.close()
        print("Crawl page: ", i)
        print(product_id_list)
        # response = requests.get(laptop_page_url.format(i), params=params, headers=headers)
        # parser = BeautifulSoup(response.text, 'html.parser')
        # product_box = parser.findAll('a', class_="col-xs-2-4 shopee-search-item-result__item")
        # if len(product_box) == 0:
        #     break
        # for product in product_box:
        #     href = product.get("href")
        #     print(href)
        i += 1
    return product_id_list, i
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

body = '<html><body><span>good</span></body></html>'
p = Selector(text=body).xpath('//span/text()').extract()
print(p)

response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')
print(Selector(response=response).xpath('//span/text()').extract())
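# The Selector(response=...) step can also be skipped, since the response
# proxies its selector directly:

# response.xpath(...) is shorthand for response.selector.xpath(...)
print(response.xpath('//span/text()').extract())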
def test_generic_form_requests_with_file_field(self):
    name = "ebay2"
    spider = self.smanager.create(name)
    generic_form_request = list(spider.start_requests())[0]
    self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt')

    response = HtmlResponse(url='file://tmp/test_params.txt',
                            body=open(join(_PATH, "data", "test_params.txt")).read())
    response.request = generic_form_request
    requests = list(generic_form_request.callback(response))
    request_list = [request_to_dict(req, spider) for req in requests]
    expected = [
        {'body': '', '_encoding': 'utf-8', 'cookies': {},
         'meta': {u'xpath': u"//form[@name='adv_search_from']", u'form_url': u'http://*****:*****@name='_nkw']", 'file_values': ['Cars', 'Boats'], u'type': u'inurl', u'value': u'file://tmp/test_params.txt'}, {u'type': u'inurl', u'name': u'_nkw2', u'value': u'file://tmp/test_params.txt'}, {u'xpath': u".//*[@name='_in_kw']", u'type': u'iterate'}]},
         'headers': {}, 'url': u'file://tmp/test_params.txt',
         'dont_filter': True, 'priority': 0, 'callback': 'parse_field_url_page',
         'method': 'GET', 'errback': None}]
    self.assertEqual(request_list, expected)

    generic_form_request = requests[0]
    self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt')
    response = HtmlResponse(url='file://tmp/test_params.txt',
                            body=open(join(_PATH, "data", "test_params.txt")).read())
    response.request = generic_form_request
    requests = list(generic_form_request.callback(response))
    request_list = [request_to_dict(req, spider) for req in requests]
    expected = [
        {'body': '', '_encoding': 'utf-8', 'cookies': {},
         'meta': {u'xpath': u"//form[@name='adv_search_from']",
                  u'fields': [{u'xpath': u".//*[@name='_nkw']",
                               'file_values': ['Cars', 'Boats'],
                               u'type': u'inurl',
                               u'value': u'file://tmp/test_params.txt'},
                              {'file_values': ['Cars', 'Boats'],
                               u'type': u'inurl',
                               u'name': u'_nkw2',
                               u'value': u'file://tmp/test_params.txt'},
                              {u'xpath': u".//*[@name='_in_kw']",
                               u'type': u'iterate'}],
                  u'type': u'form',
                  'field_index': 1},
         'headers': {},
         'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
         'dont_filter': True, 'priority': 0, 'callback': 'parse_form_page',
         'method': 'GET', 'errback': None}]
    self.assertEqual(request_list, expected)

    generic_form_request = requests[0]
    self.assertEqual(generic_form_request.url,
                     'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc')
    response = HtmlResponse(
        url="http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
    response.request = generic_form_request
    request_list = [request_to_dict(req, spider)
                    for req in generic_form_request.callback(response)]
    # One request per (_nkw, _nkw2, _in_kw) combination, in source order,
    # followed by the final 'parse' request.
    base = (u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901'
            u'&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl='
            u'&_udlo=&_okw=&_nkw2={nkw2}&_fsradio=%26LH_SpecificSeller%3D1'
            u'&_udhi=&_in_kw={in_kw}&_nkw={nkw}&_sacat=0&_oexkw=&_dmd=1'
            u'&_saslop=1&_samilow=')
    expected = [
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': base.format(nkw2=nkw2, in_kw=in_kw, nkw=nkw),
         'dont_filter': True, 'priority': 0, 'callback': 'after_form_page',
         'method': 'GET', 'errback': None}
        for nkw in (u'Cars', u'Boats')
        for nkw2 in (u'Cars', u'Boats')
        for in_kw in (1, 2, 3, 4)
    ]
    expected.append(
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
         'dont_filter': True, 'priority': 0, 'callback': 'parse',
         'method': 'GET', 'errback': None})
    self.assertEqual(request_list, expected)
def process_request(self, request, spider):
    driver = webdriver.PhantomJS()
    driver.get(request.url)
    return HtmlResponse(request.url, encoding='utf-8',
                        body=driver.page_source.encode('utf-8'))
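# Selenium has since removed PhantomJS support, so a drop-in variant on
# headless Chrome may be more practical today. A sketch, assuming chromedriver
# is on PATH:

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def process_request(self, request, spider):
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)  # assumes chromedriver on PATH
    try:
        driver.get(request.url)
        return HtmlResponse(request.url, encoding='utf-8',
                            body=driver.page_source.encode('utf-8'))
    finally:
        driver.quit()  # always release the browser process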
class SelectortemLoaderTest(unittest.TestCase):
    response = HtmlResponse(url="", body="""
    <html>
    <body>
    <div id="id">marta</div>
    <p>paragraph</p>
    <a href="http://www.scrapy.org">homepage</a>
    <img src="/images/logo.png" width="244" height="65" alt="Scrapy">
    </body>
    </html>
    """)

    def test_constructor(self):
        l = TestItemLoader()
        self.assertEqual(l.selector, None)

    def test_constructor_errors(self):
        l = TestItemLoader()
        self.assertRaises(RuntimeError, l.add_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.replace_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.get_xpath, '//a/@href')
        self.assertRaises(RuntimeError, l.add_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.replace_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.get_css, '#name::text')

    def test_constructor_with_selector(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assert_(l.selector is sel)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_selector_css(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assert_(l.selector is sel)
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response_css(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.add_css('url', 'a::attr(href)')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        # combining/accumulating CSS selectors and XPath expressions
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta', u'Marta'])
        l.add_xpath('url', '//img/@src')
        self.assertEqual(l.get_output_value('url'),
                         [u'http://www.scrapy.org', u'/images/logo.png'])

    def test_add_xpath_re(self):
        l = TestItemLoader(response=self.response)
        l.add_xpath('name', '//div/text()', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

    def test_replace_xpath(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath('name', '//p/text()')
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
        l.replace_xpath('name', ['//p/text()', '//div/text()'])
        self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])

    def test_get_xpath(self):
        l = TestItemLoader(response=self.response)
        self.assertEqual(l.get_xpath('//p/text()'), [u'paragraph'])
        self.assertEqual(l.get_xpath('//p/text()', TakeFirst()), u'paragraph')
        self.assertEqual(l.get_xpath('//p/text()', TakeFirst(), re='pa'), u'pa')
        self.assertEqual(l.get_xpath(['//p/text()', '//div/text()']),
                         [u'paragraph', 'marta'])

    def test_replace_xpath_multi_fields(self):
        l = TestItemLoader(response=self.response)
        l.add_xpath(None, '//div/text()', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath(None, '//p/text()', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

    def test_replace_xpath_re(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath('name', '//div/text()', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

    def test_add_css_re(self):
        l = TestItemLoader(response=self.response)
        l.add_css('name', 'div::text', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])
        l.add_css('url', 'a::attr(href)', re='http://(.+)')
        self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])

    def test_replace_css(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_css('name', 'p::text')
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
        l.replace_css('name', ['p::text', 'div::text'])
        self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])
        l.add_css('url', 'a::attr(href)', re='http://(.+)')
        self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])
        l.replace_css('url', 'img::attr(src)')
        self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])

    def test_get_css(self):
        l = TestItemLoader(response=self.response)
        self.assertEqual(l.get_css('p::text'), [u'paragraph'])
        self.assertEqual(l.get_css('p::text', TakeFirst()), u'paragraph')
        self.assertEqual(l.get_css('p::text', TakeFirst(), re='pa'), u'pa')
        self.assertEqual(l.get_css(['p::text', 'div::text']), [u'paragraph', 'marta'])
        self.assertEqual(l.get_css(['a::attr(href)', 'img::attr(src)']),
                         [u'http://www.scrapy.org', u'/images/logo.png'])

    def test_replace_css_multi_fields(self):
        l = TestItemLoader(response=self.response)
        l.add_css(None, 'div::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_css(None, 'p::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
        l.add_css(None, 'a::attr(href)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        l.replace_css(None, 'img::attr(src)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])

    def test_replace_css_re(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('url', 'a::attr(href)')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        l.replace_css('url', 'a::attr(href)', re='http://www\.(.+)')
        self.assertEqual(l.get_output_value('url'), [u'scrapy.org'])
def setUp(self): body = get_testdata("link_extractor", "linkextractor.html") self.response = HtmlResponse(url="http://example.com/index", body=body)