def parse(self, response):
    print "test point"
    response = HtmlResponse(url=response.url, status=response.status,
                            headers=response.headers, body=response.body)
    url = response.url
    active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div')
    file_obj = open('collection_now.log', 'w')
    for active_block in active_page_list:
        question = active_block.xpath('.//h2/a/text()').extract()[0]
        answer_link = active_block.xpath('.//div/div[1]/div[4]/div/a/@href').extract()
        if len(answer_link) > 0:
            # relative links need the site prefix; absolute ones are kept as-is
            if 'http' not in answer_link[0]:
                answer_link_str = "http://www.zhihu.com" + answer_link[0]
            else:
                answer_link_str = answer_link[0]
            file_obj.write(question.encode('utf-8') + '\t' +
                           answer_link_str.encode('utf-8') + '\n')
    file_obj.close()
def parse_kb(self, response):
    # initial HTML tokenization to find regions segmented by e.g. "======" or "------"
    filtered = response.xpath(
        "//div[@class='sfdc_richtext']").extract()[0].split("=-")
    for entry in [x and x.strip() for x in filtered]:
        resp = HtmlResponse(url=response.url, body=entry,
                            encoding=response.encoding)
        for link in resp.xpath("//a"):
            href = link.xpath("@href").extract()[0]
            if "cache-www" in href:
                text = resp.xpath("//text()").extract()
                text_next = link.xpath("following::text()").extract()
                item = FirmwareLoader(item=FirmwareImage(), response=response,
                                      date_fmt=["%b %d, %Y", "%B %d, %Y", "%m/%d/%Y"])
                version = FirmwareLoader.find_version_period(text_next)
                if not version:
                    version = FirmwareLoader.find_version_period(text)
                item.add_value("version", version)
                item.add_value("date", item.find_date(text))
                item.add_value("url", href)
                item.add_value("product", response.meta["product"])
                item.add_value("vendor", self.name)
                yield item.load_item()
def parse_json(self, response):
    data = response.body[1:-1]
    js = json.loads(data)
    response = HtmlResponse(url=response.url, body=js['data'].encode('utf8'))
    for href in response.css(settings["el_nacional"]['links']):
        full_url = response.urljoin(href.extract())
        yield scrapy.Request(full_url, callback=self.parse_links)
def parse_shop(self, response):
    print '\r\n\t======== Page Crawl Start - Company -----------'
    hxs = HtmlXPathSelector(response)
    item = init_item('shop')  # initialize the shop item
    try:
        if conf['show_messages']:
            print '----Company Fetch Start----'
        # -- parsing starts here --
        item['url'] = response.url
        item['logo_src'] = 'http://baidu.com/abc/ddd.jpg'
        item['photo_src'] = '/image/abcd.jpg'
        newurl = 'http://cn.china.cn'  # build the company-profile page URL
        try:
            # try to load the new page through a proxy IP
            proxy_handle = urllib2.ProxyHandler({'http': get_proxy()})
            opener = urllib2.build_opener(proxy_handle)
            temp = opener.open(newurl, timeout=30)
        except:
            # retry once; if it still fails, give up
            proxy_handle = urllib2.ProxyHandler({'http': get_proxy()})
            opener = urllib2.build_opener(proxy_handle)
            temp = opener.open(newurl, timeout=30)
        temp = temp.read()
        newresponse = HtmlResponse(newurl)
        newresponse._set_body(temp)
        hxs = HtmlXPathSelector(newresponse)  # build a new XPath selector
        # -- parsing ends here --
        if conf['show_messages']:
            print '---- Fetch Success ----'
    except EOFError, e:
        if conf['show_messages']:
            print '----Company Fetch Error Start----'
        print e
        if conf['show_messages']:
            print '----Company Fetch Error End----'
def test_login_requests(self):
    name = "pinterest.com"
    spider = self.smanager.create(name)
    login_request = list(spider.start_requests())[0]
    response = HtmlResponse(url="https://pinterest.com/login/",
                            body=open(join(_PATH, "data", "pinterest.html")).read())
    response.request = login_request
    form_request = login_request.callback(response)
    expected = {
        '_encoding': 'utf-8',
        'body': 'email=test&password=testpass&csrfmiddlewaretoken=nLZy3NMzhTswZvweHJ4KVmq9UjzaZGn3&_ch=ecnwmar2',
        'callback': 'after_login',
        'cookies': {},
        'dont_filter': True,
        'errback': None,
        'headers': {'Content-Type': ['application/x-www-form-urlencoded']},
        'meta': {},
        'method': 'POST',
        'priority': 0,
        'url': u'https://pinterest.com/login/?next=%2F',
    }
    self.assertEqual(request_to_dict(form_request, spider), expected)

    # simulate a simple response to the login post, from which a link is extracted
    response = HtmlResponse(url="http://pinterest.com/",
                            body="<html><body><a href='http://pinterest.com/categories'></body></html>")
    result = list(spider.after_login(response))
    self.assertEqual([r.url for r in result],
                     ['http://pinterest.com/categories', 'http://pinterest.com/popular/'])
def parse(self, response):
    print "test point"
    response = HtmlResponse(url=response.url, status=response.status,
                            headers=response.headers, body=response.body)
    url = response.url
    active_page_list = response.xpath('//*[@id="zh-profile-activity-page-list"]/div')
    file_obj = open('active_now.log', 'w')
    for active_block in active_page_list:
        active = active_block.xpath('.//div[1]/text()').extract()[1].strip()
        question = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/text()').extract()
        answer_link_list = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/@href').extract()
        answer_link = ""
        if len(answer_link_list) > 0:
            answer_link = answer_link_list[0]
        question_txt = ""
        if len(question) > 0:
            question_txt = question[0]
        if 'http' not in answer_link:
            answer_link = "http://www.zhihu.com" + answer_link
        file_obj.write(active.encode('utf-8') + '\t' +
                       question_txt.encode('utf-8') + '\t' +
                       answer_link.encode('utf-8') + '\n')
        print answer_link
    file_obj.close()
def parse(self, response):
    marker_txt = re.findall(re.compile("markerData.*\}", re.MULTILINE), response.body_as_unicode())
    if not len(marker_txt):
        return
    markers_json = "{\"" + marker_txt[0]
    markers = list(json.loads(markers_json).values())[0]
    if not len(markers):
        return
    for marker in markers:
        marker_response = HtmlResponse(url="", body=marker["info"].encode("utf-8"))
        hours = re.findall(r"\{\"label.*\}", marker["info"])
        hours = hours[0]
        parsed_hours = json.loads(hours)
        addr_parts = marker_response.css(".address span:not(.phone)::text").extract()
        url = marker_response.css("header a").xpath("@href").extract_first()
        city, state = addr_parts[-1].split(",")
        yield GeojsonPointItem(
            lat=marker.get("lat"),
            lon=marker.get("lng"),
            name=marker_response.css("header a::text").extract_first(default=None),
            addr_full=", ".join(addr_parts),
            city=city.strip(),
            state=state.strip(),
            country="United States",
            phone=marker_response.css(".phone::text").extract_first(),
            website=url,
            opening_hours=get_hours(parsed_hours["days"]),
            ref=url.split("/")[-1].split(".")[0])
def crawlListPage(self):
    print 'start crawling list pages'
    self.openPage("http://hotel.elong.com/nanjing/")
    # loop counter for the current page (initially 0)
    loop_num = 0
    # whether the current page has been scraped: False = not yet
    if_handle = False
    # total number of pages
    page_num = 0
    hotel_num = int(self.driver.find_element_by_xpath("//span[@class='t24 mr5']").text)
    if hotel_num % 20 == 0:
        page_num = hotel_num / 20
    else:
        page_num = hotel_num / 20 + 1
    # for testing: crawl 5 pages
    # page_num = 5
    while page_num >= 1:
        loop_num += 1
        self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
        if u"返后价" in self.driver.page_source:  # "cashback price" marker on a fully rendered page
            if if_handle == False:
                self.__parseUrls(self.driver.page_source)
                print u"number of hotels fetched: %d" % len(self.listPageInfo)
                if_handle = True
            try:
                # if the page is still loading, wait 0.1s at a time
                response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
                _loading = response.xpath("//div[@id='_loading_']/@style").extract()
                while 1:
                    if _loading == []:
                        break
                    if u'none' in _loading[0]:
                        break
                    else:
                        time.sleep(0.1)
                        response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
                        _loading = response.xpath("//div[@id='_loading_']/@style").extract()
                if u"下一页" in self.driver.page_source:  # "next page"
                    self.driver.find_element_by_xpath("//div[@class='paging1']/a[@class='page_next']").click()
                    page_num -= 1
                    if_handle = False
                    loop_num = 0
                time.sleep(random.uniform(1, 3))
            except Exception, e:
                print "error happen at clicking next-page"
                print e
        if loop_num != 0:
            if loop_num < 15:
                time.sleep(1)
                continue
            else:
                break
def parse_kb(self, response):
    mib = None
    # need to perform some nasty segmentation because different firmware
    # versions are not clearly separated; reverse order to get the MIB
    # before the firmware items
    for entry in reversed(response.xpath(
            "//div[@id='support-article-downloads']/div/p")):
        for segment in reversed(entry.extract().split("<br><br>")):
            resp = HtmlResponse(url=response.url, body=segment,
                                encoding=response.encoding)
            for href in resp.xpath("//a/@href").extract():
                text = resp.xpath("//text()").extract()
                if "MIBs" in href:
                    mib = href
                elif "firmware" in href:
                    text = resp.xpath("//text()").extract()
                    item = FirmwareLoader(item=FirmwareImage(), response=resp,
                                          date_fmt=["%m/%d/%Y"])
                    item.add_value("date", item.find_date(text))
                    item.add_xpath("url", "//a/@href")
                    item.add_value("mib", mib)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    item.add_value("version", FirmwareLoader.find_version_period(text))
                    yield item.load_item()
def parse_lobbyist(self, html, fn):
    response = HtmlResponse('http://localhost/test.html', body='<book>%s</book>' % html)
    rows = response.css('row')
    item = fn(response, rows)
    actual = dict(item)
    return actual
def parse_row(self, html, fn):
    response = HtmlResponse('http://localhost/test.html', body='<table>%s</table>' % html)
    row = response.css('tr')[0]
    item = fn(response, row)
    actual = dict(item)
    return actual
def __crawlHotelComment(self, driver, hotel_id, pagenum):
    pagenum = int(pagenum)
    # iterate over all pages
    while pagenum >= 1:
        response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
        loading = response.xpath("//div[@id='commentLoading']/@style").extract()[0]
        # only scrape once the loading indicator is hidden
        while loading != u'display: none;':
            print 'loading......'
            time.sleep(0.1)
            response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
            loading = response.xpath("//div[@id='commentLoading']/@style").extract()[0]
        itemlist = response.xpath("//ul[@class='dcomt_list']/li")
        for item in itemlist:
            username = item.xpath(".//div[@class='dcomt_head left']/div[2]/span/text()").extract()[0]
            remarkText = item.xpath(".//p[@class='dcomt_con_txt']/text()").extract()[0]
            # TODO: filtering of non-Chinese characters still needs rework
            remarkText = remarkText.encode("gbk", 'ignore')
            remarkText = remarkText.decode("gbk")
            remark = ''
            for string in remarkText:
                remark = remark + re.sub("\s+", "", string)
            user_type = item.xpath(".//div[@class='dcomt_head_pic']/p/text()").extract()[0]
            comm_time = item.xpath(".//span[@class='dcomt_con_time']/text()").extract()[0]
            goodorbad = item.xpath(".//p[@class='mb5']/i/@class").extract()[0]
            comm_type = ''
            if u'good' in goodorbad:
                comm_type = "值得推荐"    # "recommended"
            if u'bad' in goodorbad:
                comm_type = "有待改善"    # "needs improvement"
            senti_value = self.hotelNLP.sentiment(remark.encode("utf-8"))
            viewpoint = json.dumps(self.hotelNLP.viewpoint(remark.encode("utf-8"), decoding="utf-8"))
            comm = {
                "guid": uuid.uuid1(),
                "username": username,
                "remark": remark,
                "comm_time": comm_time,
                "user_type": user_type,
                "comm_type": comm_type,
                "senti_value": senti_value,
                "viewpoint": viewpoint,
                "baseinfo_id": hotel_id,
            }
            if self.__is_exist_in_comment_list(comm) is False:
                self.commList.append(comm)
        if pagenum == 1:
            break
        # click "next page"
        self.scroll_and_click_by_xpath("//div[@id='comment_paging']/a[@class='page_next']")
        pagenum -= 1
        time.sleep(random.uniform(1, 4))
        print pagenum
    return True
def parse_field(self, html, fn):
    response = HtmlResponse('http://localhost/test.html', body='<table><tr>%s</tr></table>' % html)
    row = response.css('tr')[0]
    node = response.css('td')[0]
    lobbyist = Loader(self.spider, response, Lobbyist(), row)
    lobbyist.add_value(None, fn(node))
    item = lobbyist.load_item()
    actual = dict(item)
    return actual
def test_caching(self):
    r1 = HtmlResponse('http://www.example.com', body=b'<html><head></head><body></body></html>')
    r2 = r1.copy()
    doc1 = LxmlDocument(r1)
    doc2 = LxmlDocument(r1)
    doc3 = LxmlDocument(r2)
    # make sure it's cached
    assert doc1 is doc2
    assert doc1 is not doc3
def test_generic_form_requests_with_spider_args(self):
    name = "ebay3"
    args = {'search_string': 'Cars'}
    spider = self.smanager.create(name, **args)
    generic_form_request = list(spider.start_requests())[0]
    response = HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
                            body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
    response.request = generic_form_request
    request_list = [request_to_dict(req, spider)
                    for req in generic_form_request.callback(response)]
    search_url = (u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901'
                  u'&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo='
                  u'&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw={}&_nkw=Cars'
                  u'&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=')
    expected = [
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': search_url.format(in_kw), 'dont_filter': True, 'priority': 0,
         'callback': 'after_form_page', 'method': 'GET', 'errback': None}
        for in_kw in (1, 2, 3, 4)
    ]
    expected.append(
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'dont_filter': True,
         'priority': 0, 'callback': 'parse', 'method': 'GET', 'errback': None})
    self.assertEqual(request_list, expected)
def get_response(url, meta=None):
    # mutable default arguments are shared between calls, so use None instead
    url = canonicalize_url(url)
    r = requests.get(url)
    res = r.text
    final_url = r.url
    to_encoding = 'utf-8'
    response = HtmlResponse(url=final_url, body=res, encoding=to_encoding)
    response.request = Request(url, meta=meta or {})
    return response
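# Usage sketch for get_response above; the URL and meta values are illustrative
# placeholders, not part of the original helper. The bridged response supports
# Scrapy's selector API and keeps the attached request:
page = get_response('http://example.com/item?id=1', meta={'retry': 0})
title = page.css('title::text').extract_first()  # Scrapy selector shortcut works
assert page.request.meta['retry'] == 0           # meta survives the round-trip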
def process_response(self, request, response, spider):
    #log.msg('%s is type %s' % (response.url, type(response)), level=log.DEBUG)
    if type(response) is Response and not _file_pattern.match(response.url):
        response = HtmlResponse(response.url, body=response.body)
    if hasattr(response, 'body_as_unicode'):
        hdoc = html.fromstring(response.body_as_unicode())
        links = hdoc.xpath('//a')
        for link in links:
            href = link.get('href')
            link.set('href', urlparse.urljoin(get_base_url(response), href))
        return response.replace(body=html.tostring(hdoc, encoding='unicode'))
    return response
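# A downloader middleware like the one above only runs once it is registered in
# the project's settings.py. A minimal sketch, assuming the class lives at the
# hypothetical path myproject.middlewares.AbsoluteLinksMiddleware:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.AbsoluteLinksMiddleware': 543,  # hypothetical module path
}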
def test_caching(self):
    r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
    r2 = r1.copy()
    doc1 = LxmlDocument(r1)
    doc2 = LxmlDocument(r1)
    doc3 = LxmlDocument(r2)
    # make sure it's cached
    assert doc1 is doc2
    assert doc1 is not doc3
    # don't leave documents in memory to avoid wrong libxml2 leaks reports
    del doc1, doc2, doc3
def parse(self, response):
    cadena_temp_1 = response.body.split("<TABLE CELLSPACING=1>")
    cadena_temp_1 = cadena_temp_1[1].split("</TABLE>")
    cadena_temp_1[0] = ('<HTML><BODY><TABLE CELLSPACING=1>' + cadena_temp_1[0] +
                        '</TABLE></BODY></HTML>').lower()
    response_2 = HtmlResponse(url="http://nuforc.org/webreports/ndxevent.html",
                              body=cadena_temp_1[0])
    for registro in response_2.xpath('.//body/table/tbody/tr'):
        item_tabla = CrawlerUfoItem()
        item_tabla['report_href'] = registro.xpath('td[1]/font/a/@href').extract()[0]
        item_tabla['report_text'] = registro.xpath('td[1]/font/a/text()').extract()[0]
        item_tabla['count'] = registro.xpath('td[2]/font/text()').extract()[0]
        url_nuevo = 'http://nuforc.org/webreports/' + item_tabla['report_href']
        yield scrapy.Request(url_nuevo, body="", method='GET',
                             headers={"content-type": "application/x-www-form-urlencoded"},
                             callback=self.parse_2, dont_filter=True)
def __parseUrls(self, page_source):
    response = HtmlResponse(url="my HTML string", body=page_source, encoding="utf-8")
    # extract each page's hotel URLs and store them in urlList
    urlList = response.xpath("//a[@class='name']/@href").extract()
    commnumList = response.xpath("//div[@class='comment']/a/span/text()").extract()
    name_list = response.xpath("//a[@class='name']/text()").extract()
    if len(urlList) == len(commnumList) == len(name_list):
        for i in range(0, len(urlList)):
            self.listPageInfo.append({
                "guid": uuid.uuid1(),
                "url": urlList[i],
                "hotel_name": name_list[i],
                "OTA": "途牛",
                "comm_num": int(commnumList[i]),
            })
def _get_url(url, request_kwargs={}):
    '''Returns a scrapy.html.HtmlResponse with the contents of the received url.

    Note that the session is kept intact among multiple calls to this
    method (i.e. cookies are passed over).
    '''
    response = betamax_session.get(url)
    scrapy_response = HtmlResponse(
        url=str(response.url),
        body=response.content,
    )
    scrapy_response.request = Request(url, **request_kwargs)
    return scrapy_response
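# Usage sketch, assuming the betamax_session above is already configured with a
# cassette; the target URL and meta values are placeholders. The returned object
# behaves like a normal crawl response, so spider callbacks can be tested offline:
response = _get_url('http://example.com/login', request_kwargs={'meta': {'step': 1}})
assert response.request.meta['step'] == 1
links = response.xpath('//a/@href').extract()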
def parse(self, response):
    # Wiener Linien returns HTML with an XML content type, which creates an
    # XmlResponse; rebuild it as an HtmlResponse so HTML selectors work.
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css(".block-news-item"):
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            ignoretz=True,
            base_url="https://www.{}".format(self.name),
        )
        link = response.urljoin(item.css("a::attr(href)").extract_first())
        il.add_value("link", link)
        il.add_value("title", item.css("h3::text").extract_first())
        il.add_value("updated", item.css(".date::text").extract_first())
        yield scrapy.Request(link, self.parse_item, meta={"il": il})
def test_response_libxml2_caching(self):
    r1 = HtmlResponse("http://www.example.com", body="<html><head></head><body></body></html>")
    r2 = r1.copy()
    doc1 = Libxml2Document(r1)
    doc2 = Libxml2Document(r1)
    doc3 = Libxml2Document(r2)
    # make sure it's cached
    assert doc1 is doc2
    assert doc1.xmlDoc is doc2.xmlDoc
    assert doc1 is not doc3
    assert doc1.xmlDoc is not doc3.xmlDoc
    # don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
    del doc1, doc2, doc3
def parse_3(self, response, item):
    cadena_temp_2 = response.body.lower().replace("<p>", "")
    response = HtmlResponse(url=response.url, body=cadena_temp_2)
    pprint.pprint(item)
    for registro in response.xpath('.//body/table/tbody'):
        if not registro.xpath('tr[1]/td/font').extract():
            item["detalle1"] = ""
        else:
            item["detalle1"] = registro.xpath('tr[1]/td/font').extract()[0]
        if not registro.xpath('tr[2]/td/font').extract():
            item["detalle2"] = ""
        else:
            item["detalle2"] = registro.xpath('tr[2]/td/font').extract()[0]
        yield item
def test_spider_crawls_links(spider, scrape_request, html_headers, mock_html_twolinks):
    """Ensure spider always picks up relevant links to HTML pages"""
    # Use only 1 user agent for easier counting
    ua = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [ua]

    # Generate a mock response based on html containing two links
    mock_response = HtmlResponse(url='http://test:12345',
                                 body=mock_html_twolinks, encoding='utf-8')
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.meta['user_agent'] = ua
    mock_response.meta['sitescan'] = factories.SiteScanFactory()
    mock_response.status = 200
    mock_response.flags = []

    # Call spider on the mock response
    pipeline_generator = spider.parse(mock_response)

    # We should have two new requests and one MarkupItem
    sites_expected = set([
        mock_response.url + '/link1.html',
        mock_response.url + '/link2.html',
    ])
    sites_collected = []
    for elem in pipeline_generator:
        if isinstance(elem, Request):
            sites_collected.append(elem.url)
        else:
            assert isinstance(elem, MarkupItem)
    assert sites_expected == set(sites_collected)
def parse_2(self, response):
    cadena_temp_1 = response.body.split("<TABLE CELLSPACING=1>")
    cadena_temp_1 = cadena_temp_1[1].split("</TABLE>")
    cadena_temp_1[0] = ('<HTML><BODY><TABLE CELLSPACING=1>' + cadena_temp_1[0] +
                        '</TABLE></BODY></HTML>').lower()
    response = HtmlResponse(url=response.url, body=cadena_temp_1[0])
    # each item field with its cell XPath; empty string if the cell is missing
    fields = [
        ("date_text", 'td[1]/font/a/text()'),
        ("date_href", 'td[1]/font/a/@href'),
        ("city", 'td[2]/font/text()'),
        ("state", 'td[3]/font/text()'),
        ("shape", 'td[4]/font/text()'),
        ("duration", 'td[5]/font/text()'),
        ("summary", 'td[6]/font/text()'),
        ("posted", 'td[7]/font/text()'),
    ]
    for registro in response.xpath('.//body/table/tbody/tr'):
        item = Crawler_2Item()
        for field, xpath in fields:
            values = registro.xpath(xpath).extract()
            item[field] = values[0] if values else ""
        url_nuevo = 'http://nuforc.org/webreports/' + item["date_href"]
        item["detalle1"] = ""
        item["detalle2"] = ""
        # bind item as a default argument; a bare closure would leak the last
        # item of the loop into every callback
        yield scrapy.Request(url_nuevo, body="", method='GET',
                             headers={"content-type": "application/x-www-form-urlencoded"},
                             dont_filter=True,
                             callback=lambda r, item=item: self.parse_3(r, item))
def parse_links(self, response):
    # some responses arrive without selector support; rebuild them as HTML
    try:
        response.css
    except AttributeError:
        response = HtmlResponse(url=response.url, body=response.body)
    fecha = limpiar_autor_tc(response.css(settings[self.name]['fecha']).extract()[0].split('|')[1])
    current_date = True
    if len(fecha) > 10:
        current_date = obtener_fecha_tipo6(fecha.split(" ")[0])
    if current_date:
        titulo = limpiar_autor_tc(response.css(settings[self.name]['titulo']).extract()[0])
        body = limpiar_ult_n(response.css(settings[self.name]['body']).extract())
        yield {
            'titulo': titulo,
            'autor': response.css(settings[self.name]['autor']).extract()[0],
            'fecha': fecha,
            'body': [body],
            'link': response.url,
        }
def parse_page(self, response):
    response = HtmlResponse(url=response.url, status=response.status,
                            headers=response.headers, body=response.body)
    url = response.url
    name = response.xpath('//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
    context_list = response.xpath('//div[@class="zm-editable-content"]/text()').extract()
    print name[0]
    for context in context_list:
        print context
    answer_num = response.xpath('//h3/@data-num').extract()
    if len(answer_num) == 0:
        print 1
    else:
        print answer_num[0]
    author_list = response.xpath('//*[@class="author-link"]/text()').extract()
    for author in author_list:
        print author
def test_parse_drug_details_or_overview_generates_new_request_if_redirected_to_search_page(self):
    url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Search_Drug_Name'
    meta = {
        'original_url': 'http://www.accessdata.fda.gov/somewhere.cfm',
        'original_cookies': {'foo': 'bar'},
    }
    mock_response = HtmlResponse(url=url)
    mock_response.request = Request(url, meta=meta)

    with mock.patch('random.random', return_value='random_cookiejar'):
        spider = Spider()
        request = spider.parse_drug_details_or_overview(mock_response)

    assert request.url == meta['original_url']
    assert request.cookies == meta['original_cookies']
    assert request.dont_filter
    assert request.callback == spider.parse_drug_details_or_overview
    assert request.meta['cookiejar'] == 'random_cookiejar'
def test_start_url_matcher(self):
    url = 'http://example.org'
    spider = self.spider_factory(start_urls=[url])
    response = HtmlResponse(url)
    rule = spider._rulesman.get_rule_from_response(response)
    self.failUnless(isinstance(rule.matcher, UrlListMatcher))

    response = HtmlResponse(url + '/item.html')
    rule = spider._rulesman.get_rule_from_response(response)
    self.failUnless(rule is None)

    # TODO: remove this block
    # in previous version get_rule returns rule from response.request
    response.request = Request(url)
    rule = spider._rulesman.get_rule_from_response(response.request)
    self.failUnless(isinstance(rule.matcher, UrlListMatcher))
    self.failUnlessEqual(rule.follow, True)
def load_html():
    # use a context manager so the file handle is closed, and avoid shadowing
    # the built-in name `file`
    with codecs.open("test/resources/covid_stub.html", 'r') as f:
        return HtmlResponse(url="my HTML string", body=f.read(), encoding='utf-8')
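# A hedged usage sketch for the fixture above: the canned response can be fed
# straight into a spider callback. CovidSpider and its parse() signature are
# assumptions for illustration, not part of the original fixture.
def test_parse_extracts_items():
    response = load_html()
    items = list(CovidSpider().parse(response))  # hypothetical spider under test
    assert items  # the stub page should yield at least one item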
def parse1(self, response):
    rs = response.xpath('//div[@class="left"]//div[@class="sons"]')
    title = rs.xpath('//div[@class="cont"]//h1/text()').extract_first()
    print(title)
    quanwen = rs.xpath('//div[@class="cont"]//div[@class="contson"]')
    quanwen1 = quanwen[0].xpath('./text()').extract()
    print(quanwen1)
    qw = ""
    for item in quanwen1:
        if len(item) < 5:
            continue
        qw += item.strip()
        qw += '\r\n'
    # fall back to stripping the raw HTML when the text nodes were too sparse
    if len(qw) < 12:
        quanwen = response.xpath('//div[@class="main3"]/div[@class="left"]/div[@class="sons"]/div[@class="cont"]/div[@class="contson"]').extract()[0]
        temp = re.sub(r'<p>', '', quanwen)
        temp = re.sub(r'</p>', '', temp)
        temp = re.sub(r'</div>', '', temp)
        temp = re.sub(r'(<div\s+class=\s*\".*?\">)', '', temp)
        qw1 = re.split('\<br>|\n', temp)
        for item in qw1:
            if len(item) < 5:
                continue
            qw += item.strip()
            qw += '\r\n'
    print(qw)
    cd_zz = rs.xpath('//div[@class="cont"]//p//a/text()').extract()
    cd = str(cd_zz[0])
    zz = str(cd_zz[1])
    rs_all = rs.xpath('//div[@class="contyishang"]')
    rs_h = rs_all.css('a::attr(href)')
    yw = ""
    yuny = ""
    zy = ""
    yiny = ""
    zs = ""
    byw = True
    for href in rs_h:
        hreftxt = href.extract()
        if ("javascript:PlayFanyi" in hreftxt) and byw:
            print(hreftxt)
            byw = False
            yw_num = re.sub("\D", "", hreftxt)
            print(str(yw_num))
            yw_url = "https://so.gushiwen.org/fanyi_" + str(yw_num) + ".aspx"
            html_requests = requests.get(yw_url).text.encode('utf-8')
            html_response = HtmlResponse(url=yw_url, body=html_requests,
                                         headers={'Connection': 'close'})
            rs = html_response.xpath('//div[@class="main3"]//div[@class="left"]//div[@class="contyishang"]/p[not(@style or contains(text(),"参考资料:"))]').extract()
            for temp1 in rs:
                temp = re.sub(r'<p>', '', temp1)
                temp = re.sub(r'</p>', '', temp)
                temp = re.sub(r'<strong>', '', temp)
                temp = re.sub(r'</strong>', '', temp)
                temp = re.sub(r'<a>', '', temp)
                temp = re.sub(r'</a>', '', temp)
                temp = re.sub(r'\u3000', '', temp)
                temp = re.sub(r'(<a\s+href=\s*\".*?\">)', '', temp)
                yw1 = re.split('\<br>', temp)
                if yw1[0] == "译文":    # "translation"
                    del yw1[0]
                    yw += '\r\n'.join(yw1)
                if yw1[0] == "韵译":    # "rhymed translation"
                    del yw1[0]
                    yuny += '\r\n'.join(yw1)
                if yw1[0] == "直译":    # "literal translation"
                    del yw1[0]
                    zy += '\r\n'.join(yw1)
                if yw1[0] == "音译":    # "transliteration"
                    del yw1[0]
                    yiny += '\r\n'.join(yw1)
                if yw1[0] == "注释":    # "annotations"
                    del yw1[0]
                    zs += '\r\n'.join(yw1)
    c1 = self.conn.cursor()
    c1.execute("INSERT INTO guwen VALUES (?,?,?,?,?,?,?,?,?)",
               (title, zz, cd, qw, yw, yuny, zy, yiny, zs))
    self.conn.commit()
    # end of famous-lines section
def parse_product_reviews(self, response):
    for line in response.body.split('\n'):
        if line.startswith('var materials='):
            body = line.lstrip('var materials=').rstrip(',')
            break
    try:
        body = eval(body)
    except:
        logging.error('Failed to parse: ' + repr(response.body))
        body = ''
    # Emulate "normal" HTML response
    if body:
        body = ('<html><body>' + '%s' + '</body></html>') % (
            body['BVRRSourceID'].replace('\\/', '/'))
        response2 = HtmlResponse(url=response.url, body=body)
        response2.request = response.request
    hxs = HtmlXPathSelector(response2) if body else None
    base_url = self.get_base_url(response)
    product = response.meta['product']
    product['metadata'].setdefault('reviews', [])
    box_spec = self.PRODUCT_REVIEW_BOX or {}
    review_hxs = xpath_select(hxs, box_spec.get('xpath')) \
        if 'xpath' in box_spec and box_spec.get('xpath') != "." else hxs
    for review_box in review_hxs:
        loader = ReviewLoader(item=Review(), selector=hxs,
                              date_format=self.PRODUCT_REVIEW_DATE_FORMAT)
        loader.add_value('url', urlparse.urljoin(base_url, response.url))
        # review full text
        full_text_specs = box_spec.get('full_text', []) if hasattr(
            box_spec.get('full_text', []), 'append') else [box_spec['full_text']]
        full_text_parts = []
        for xpath in full_text_specs:
            items = xpath_select(review_box, xpath).extract() \
                if not callable(xpath) else [xpath(hxs)]
            if any(items):
                item_text = self.REVIEW_TEXT_JOIN.join([
                    e.replace(u'\xa0', u' ').strip(self.REVIEW_TEXT_STRIP)
                    for e in items
                ])
                full_text_parts.append(item_text)
        review_text = self.REVIEW_PARAGRAPH_JOIN.join(full_text_parts)
        loader.add_value('full_text', review_text)
        if box_spec.get('date'):
            date = review_box.select(box_spec.get('date')).extract() \
                if not callable(box_spec.get('date')) \
                else [box_spec['date'](review_box)]
            loader.add_value('date', date[0] if date else None)
        if box_spec.get('rating'):
            rating_text = review_box.select(box_spec.get('rating')).extract() \
                if not callable(box_spec.get('rating')) \
                else [box_spec['rating'](review_box)]
            loader.add_value('rating', rating_text[0] if rating_text else None)
        review = loader.load_item()
        if review.get('full_text') or review.get('date'):
            product['metadata']['reviews'].append(review)
    next_page = xpath_select(hxs, box_spec.get('next_url')).extract() \
        if (box_spec.get('next_url') and not callable(box_spec['next_url'])) \
        else [box_spec['next_url'](response, hxs)] \
        if callable(box_spec.get('next_url')) else None
    # guard against next_page being None before testing its contents
    next_page_url = urlparse.urljoin(base_url, next_page[0]) \
        if next_page and any(next_page) else None
    if not next_page_url or next_page_url in self.visited_urls or not review_hxs:
        yield self.clean_product(product)
    else:
        self.visited_urls.add(next_page_url)
        yield Request(url=next_page_url, meta=dict(**response.meta),
                      callback=self.parse_product_reviews)
def parse(self, response: HtmlResponse):
    next_page = response.xpath('//a[@class="catalog-pagination__item _text js-pagination-catalog-item"]/@href').extract_first()
    links = response.xpath('//a[@class="book__image-link js-item-element ddl_product_link"]/@href').extract()
    for link in links:
        yield response.follow(link, callback=self.handle_book_data)
    # on the last page extract_first() returns None; following it would raise
    if next_page:
        yield response.follow(next_page, callback=self.parse)
def reparse_by_id(session, id):
    url = session.query(Article.url).filter(Article.id == id).first()[0]
    response = requests.get(url)
    response = HtmlResponse(url=url, body=response.content)
    WikiResponseProcessor.DBResponseProcessor().process(response, id_to_update=id)
def handle_response(self, response):
    lru = url_to_lru_clean(response.url)
    if self.phantom:
        self.phantom.get(response.url)

        # Collect whole DOM of the webpage including embedded iframes
        with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
            get_bod_w_iframes = js.read()
        bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
        response._set_body(bod_w_iframes.encode('utf-8'))

        # Try to scroll and unfold page
        self.log("Start PhantomJS scrolling and unfolding", log.INFO)
        with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
            try:
                signal.signal(signal.SIGALRM, timeout_alarm)
                signal.alarm(self.ph_timeout + 30)
                timedout = self.phantom.execute_async_script(
                    js.read(), self.ph_timeout,
                    self.ph_idle_timeout, self.ph_ajax_timeout)
                signal.alarm(0)
                if timedout:
                    raise SeleniumTimeout
                self.log("Scrolling/Unfolding finished", log.INFO)
            except SeleniumTimeout:
                self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING)
                self.errors += 1
            except WebDriverException as e:
                err = json.loads(e.msg)['errorMessage']
                self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                self.errors += 1
            except Exception as e:
                self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR)
                self.errors += 1
                return self._make_raw_page(response, lru)
        bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
        response._set_body(bod_w_iframes.encode('utf-8'))

    # Cleanup pages with embedded base64 images that make scrapy consider them
    # to be non-HTML responses
    if response.status == 200 and not isinstance(response, HtmlResponse):
        try:
            flags = response.flags
            if "partial" in flags:
                flags.remove("partial")
            flags.append("cleaned")
            response = HtmlResponse(response.url, headers=response.headers,
                                    body=cleanupbase64images(response.body),
                                    flags=flags, request=response.request)
            self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
        except:
            pass

    if 300 < response.status < 400 or isinstance(response, HtmlResponse):
        return self.parse_html(response, lru)
    else:
        return self._make_raw_page(response, lru)
def process_request(self, request, spider):
    dr = webdriver.PhantomJS()
    dr.get(request.url)
    time.sleep(2)
    body = dr.page_source
    return HtmlResponse(dr.current_url, body=body.replace(u'\xa9', u''),
                        encoding='utf-8', request=request)
        <a href='image00.html'>Name: Image 00 <br/><img src='image00.jpg' /></a>
        <a href='image01.html'>Name: Image 01 <br/><img src='image01.jpg' /></a>
        <a href='image02.html'>Name: Image 02 <br/><img src='image02.jpg' /></a>
        <a href='image03.html'>Name: Image 03 <br/><img src='image03.jpg' /></a>
        <a href='image04.html'>Name: Image 04 <br/><img src='image04.jpg' /></a>
    </div>
</body>
</html>
'''
response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
print(response.xpath('/html'))
print(response.xpath('/html/head'))
print(response.xpath('/html/body/div/a'))

print('\n//E: select every E in the document')
for a in response.xpath('//a'):
    print(a)

print('E1//E2: select every E2 among the descendants of E1, wherever it appears')
for a in response.xpath('/html/body//img'):
    print(a)

print("E/text(): select E's text child nodes")
def vacancy_parce(self, response: HtmlResponse):
    name1 = response.css("div.vacancy-title h1::text").extract_first()
    salary1 = response.xpath("//span[@class='bloko-header-2 bloko-header-2_lite']/text()").extract()
    link1 = response.url
    src = "hh.ru"
    yield JobparserItem(name=name1, salary=salary1, link=link1, src=src)
"""
A Scrapy Selector is an instance of the Selector class, constructed by passing
in text or a TextResponse object. It automatically picks the best parsing
rules (XML vs HTML) based on the input type.
"""
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# construct from text
body = '<html><body><span>good</span></body></html>'
print(Selector(text=body).xpath('//span/text()').extract())

# construct from a response object
response = HtmlResponse(url='https://sebastianraschka.com/blog/index.html',
                        body=body, encoding='utf-8')
print(Selector(response=response).xpath('//*/h1[@class="post-title"]/text()').extract())
# the line above is equivalent to:
print(response.selector.xpath('//*/h1[@class="post-title"]/text()').extract())

response = r"""
<html>
<head>
    <base href='http://example.com/' />
    <title>Example website</title>
</head>
<body>
    <div id='images'>
        <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
        <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
        <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
def user_parse(self, response: HtmlResponse):
    j_body = json.loads(response.text)
    if j_body['authenticated']:
        yield response.follow(f'/{self.parse_user}',
                              callback=self.user_data_parse,
                              cb_kwargs={'username': self.parse_user})
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

body = '<html><body><span>good</span></body></html>'
p = Selector(text=body).xpath('//span/text()').extract()
print(p)

response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')
print(Selector(response=response).xpath('//span/text()').extract())
def process_request(self, request, spider):
    driver = webdriver.PhantomJS()
    driver.get(request.url)
    return HtmlResponse(request.url, encoding='utf-8',
                        body=driver.page_source.encode('utf-8'))
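# The middleware above starts a fresh PhantomJS process per request and never
# quits it, which leaks browser processes on long crawls. A minimal variant
# (a sketch, not the original author's code) reuses one driver and cleans up:
from scrapy.http import HtmlResponse
from selenium import webdriver

class PhantomJSDownloaderMiddleware(object):
    def __init__(self):
        # one shared browser instance for the whole crawl
        self.driver = webdriver.PhantomJS()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        return HtmlResponse(self.driver.current_url, encoding='utf-8',
                            body=self.driver.page_source.encode('utf-8'),
                            request=request)

    def __del__(self):
        # best-effort cleanup when the middleware is garbage-collected
        self.driver.quit()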
def parse_multiple_via_pages(self, response):
    response = HtmlResponse(url=self.shops_root_url, body=response.body)
    le = LinkExtractor(allow=[r"%s" % regex for regex in self.via_page_url_regex])
    for link in le.extract_links(response):
        yield scrapy.Request(url=link.url, callback=self.parse_single_page)
def setUp(self):
    body = get_testdata('link_extractor', 'sgml_linkextractor.html')
    self.response = HtmlResponse(url='http://example.com/index', body=body)
def test_priority_adjust(self):
    req = Request('http://a.com')
    rsp = HtmlResponse(req.url, body=self._body())
    req2 = self.mw.process_response(req, rsp, self.spider)
    assert req2.priority > req.priority
def test_meta_refresh(self):
    req = Request(url='http://example.org')
    rsp = HtmlResponse(req.url, body=self._body())
    req2 = self.mw.process_response(req, rsp, self.spider)
    assert isinstance(req2, Request)
    self.assertEqual(req2.url, 'http://example.org/newpage')
def crawl_product_id():
    product_id_list = []
    i = 1
    while i < 3:
        driver = webdriver.Chrome("C:/bin/chromedriver.exe", chrome_options=options)
        driver.get(laptop_page_url.format(i))
        if "https://shopee.vn/Laptop-cat.13030.13065" in laptop_page_url.format(i):
            # scroll down in steps so the lazily loaded items render
            y = 2300
            while y <= 4800:
                driver.execute_script("window.scrollTo(0, " + str(y) + ")")
                y += 1000
        body = driver.page_source
        current_url = driver.current_url
        response = HtmlResponse(current_url, body=body, encoding='utf8')
        if response is None:
            break
        for product in response.css("div.col-xs-2-4.shopee-search-item-result__item"):
            try:
                url = product.css("div a::attr(href)").get()
                print("link ok: ", url)
                # the shop/item key is the part after "-i." in the product URL
                product_key = url.rsplit("-i.", 1)[1]
                product_id_list.append(product_key)
            except:
                print("no!")
        driver.close()
        print("Crawl page: ", i)
        print(product_id_list)
        i += 1
    return product_id_list, i
def crawllianjie(doc=driver.page_source):
    response1 = HtmlResponse(url="my HTML string", body=doc, encoding="utf-8")
    A = response1.xpath("//dl[@class='list-noimg job-j-list clearfix job-new-list']")
    for B in A:
        ID = B.xpath('@pt').extract()[0]
        href = 'http://gz.ganji.com' + B.xpath('a/@href').extract()[0]
        name = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[@class="name"]/text()').extract()[0]
        sex = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[2]/text()').extract()[0]
        age = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[3]/text()').extract()[0]
        xueli = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[4]/text()').extract()
        xueli = xueli[0] if xueli else 'none'
        jingli = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[5]/text()').extract()
        jingli = jingli[0] if jingli else 'none'
        address = B.xpath('a/div[@class="fl district-salary"]/p[@class="district"]/text()').extract()[0].strip()
        salary = B.xpath('a/div[@class="fl district-salary"]/p[@class="salary"]/text()').extract()[0].strip()
        time1 = B.xpath('a/div[@class="order fr"]/text()').extract()[0].strip()
        time.sleep(7)
        res = requests.get(href, headers=head, cookies=Cookie)
        html = res.text
        doc = etree.HTML(str(html))
        try:
            A1 = doc.xpath('//div[@class="tend-line clearfix"]/b/a')
            qiuzhi = []
            for B1 in A1:
                i = B1.xpath('text()')
                qiuzhi.append(i[0] if i else 'none')
            qiuzhi = ','.join(qiuzhi)
        except:
            qiuzhi = 'none'
        try:
            work_exp1 = doc.xpath('//div[@class="experience-block"]/p/text()')
            exp = []
            if work_exp1:
                exp.append(work_exp1[0])
                didian = doc.xpath('//div[@class="experience-block"]/b')
                content = doc.xpath('//div[@class="experience-block"]/ul')
                for m, n in zip(didian, content):
                    company = m.xpath('text()')[0]
                    time2 = n.xpath('li[1]/p/text()')[0]
                    time3 = n.xpath('li[1]/p/i/text()')[0]
                    zhiwei = n.xpath('li[2]/p/text()')[0]
                    work_content = n.xpath('li[3]/p/text()')
                    work_content = work_content[0] if work_content else 'none'
                    exp.append(','.join([company, time2, time3, zhiwei, work_content]))
                exp = ','.join(exp)
            else:
                exp = 'none'
        except:
            exp = 'none'
        try:
            edu = doc.xpath('//div[@class="education-block"]/table/tbody/tr')
            if edu:
                edu1 = []
                for a in edu:
                    i1 = a.xpath('td[1]/text()')[0]
                    i2 = a.xpath('td[2]/text()')
                    i2 = i2[0] if i2 else 'none'
                    i3 = a.xpath('td[3]/text()')
                    i3 = i3[0] if i3 else 'none'
                    edu1.append(','.join([i1, i2, i3]))
                edu1 = ','.join(edu1)
            else:
                edu1 = 'none'
        except:
            edu1 = 'none'
        try:
            zhengshu = doc.xpath('//div[@class="project-block"]/table/tbody/tr')
            if zhengshu:
                zhengshu1 = []
                for a in zhengshu:
                    i1 = a.xpath('td[1]/text()')[0]
                    i2 = a.xpath('td[2]/text()')
                    i2 = i2[0] if i2 else 'none'
                    i3 = a.xpath('td[3]/text()')
                    i3 = i3[0] if i3 else 'none'
                    zhengshu1.append(','.join([i1, i2, i3]))
                zhengshu1 = ','.join(zhengshu1)
            else:
                zhengshu1 = 'none'
        except:
            zhengshu1 = 'none'
        self1 = doc.xpath('//div[@class="self-block"]/div/text()')
        self1 = self1[0] if self1 else 'none'
        print(name, time1, qiuzhi)
        cursor = connect.cursor()
        sql = ("INSERT IGNORE INTO ganji(ID,name,sex,age,xueli,jingli,address,salary,"
               "time1,href,exp,edu1,zhengshu1,self1,qiuzhi) VALUES ("
               "'%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')")
        data = (ID, name, sex, age, xueli, jingli, address, salary, time1, href,
                exp, edu1, zhengshu1, self1, qiuzhi)
        try:
            cursor.execute(sql % data)
        except:
            print(ID)
        connect.commit()
<meta charset="UTF-8">
<title></title>
</head>
<body>
    <ul>
        <li class="item-"><a id='i1' href="link.html" class='ding'>first item</a></li>
        <li class="item-0"><a id='i2' href="llink.html" class='ding'>first item</a></li>
        <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
    </ul>
    <div><a href="llink2.html">second item</a></div>
    <div><a href="llink2.html">10</a></div>
</body>
</html>
"""
# build a response object
response = HtmlResponse(url='', body=html, encoding='utf-8')
selector = Selector(response=response)
# select all <a> tags
temp = selector.xpath('//a')
# take the first <body> tag and search for <ul> from there; ./ul means a child of the current tag
temp = selector.xpath('body')[0].xpath('.//ul')
print(temp)
exit()
# <ul> children of <body>
temp = selector.xpath('body/ul')
# <li> descendants of <body>
temp = selector.xpath('body//li')
# [] empty: <li> is not a direct child of <body>
temp = selector.xpath('body/li')
# parent of <body>
temp = selector.xpath('body')[0].xpath('..')
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# construct from text
body = '<html><body><span>good</span></body></html>'
print(Selector(text=body).xpath('//span/text()').extract())

# construct from response (a body must be supplied; HtmlResponse does not fetch the URL)
response = HtmlResponse(url='http://doc.scrapy.org/en/latest/_static/selectors-sample1.html',
                        body=body, encoding='utf-8')
print(Selector(response=response).xpath('//span/text()').extract())
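# Since Scrapy 1.0 a TextResponse exposes its selector directly, so wrapping it
# in Selector(...) as above is optional; given the `response` built above, the
# following two lines run the same query:
print(response.selector.xpath('//span/text()').extract())
print(response.xpath('//span/text()').extract())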
def test_generic_form_requests_with_file_field(self):
    name = "ebay2"
    spider = self.smanager.create(name)
    generic_form_request = list(spider.start_requests())[0]
    self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt')

    response = HtmlResponse(url='file://tmp/test_params.txt',
                            body=open(join(_PATH, "data", "test_params.txt")).read())
    response.request = generic_form_request
    requests = list(generic_form_request.callback(response))
    request_list = [request_to_dict(req, spider) for req in requests]
    expected = [{
        'body': '', '_encoding': 'utf-8', 'cookies': {},
        'meta': {
            u'xpath': u"//form[@name='adv_search_from']",
            u'form_url': u'http://*****:*****',  # credentials redacted in the original
            u'fields': [
                {u'xpath': u".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'],
                 u'type': u'inurl', u'value': u'file://tmp/test_params.txt'},
                {u'type': u'inurl', u'name': u'_nkw2',
                 u'value': u'file://tmp/test_params.txt'},
                {u'xpath': u".//*[@name='_in_kw']", u'type': u'iterate'},
            ],
        },
        'headers': {}, 'url': u'file://tmp/test_params.txt', 'dont_filter': True,
        'priority': 0, 'callback': 'parse_field_url_page', 'method': 'GET',
        'errback': None,
    }]
    self.assertEqual(request_list, expected)

    generic_form_request = requests[0]
    self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt')
    response = HtmlResponse(url='file://tmp/test_params.txt',
                            body=open(join(_PATH, "data", "test_params.txt")).read())
    response.request = generic_form_request
    requests = list(generic_form_request.callback(response))
    request_list = [request_to_dict(req, spider) for req in requests]
    expected = [{
        'body': '', '_encoding': 'utf-8', 'cookies': {},
        'meta': {
            u'xpath': u"//form[@name='adv_search_from']",
            u'fields': [
                {u'xpath': u".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'],
                 u'type': u'inurl', u'value': u'file://tmp/test_params.txt'},
                {'file_values': ['Cars', 'Boats'], u'type': u'inurl',
                 u'name': u'_nkw2', u'value': u'file://tmp/test_params.txt'},
                {u'xpath': u".//*[@name='_in_kw']", u'type': u'iterate'},
            ],
            u'type': u'form',
            'field_index': 1,
        },
        'headers': {},
        'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
        'dont_filter': True, 'priority': 0, 'callback': 'parse_form_page',
        'method': 'GET', 'errback': None,
    }]
    self.assertEqual(request_list, expected)

    generic_form_request = requests[0]
    self.assertEqual(generic_form_request.url,
                     'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc')
    response = HtmlResponse(url="http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
                            body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
    response.request = generic_form_request
    request_list = [request_to_dict(req, spider)
                    for req in generic_form_request.callback(response)]
    search_url = (u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901'
                  u'&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo='
                  u'&_okw=&_nkw2={nkw2}&_fsradio=%26LH_SpecificSeller%3D1&_udhi='
                  u'&_in_kw={in_kw}&_nkw={nkw}&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=')
    expected = [
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': search_url.format(nkw2=nkw2, in_kw=in_kw, nkw=nkw),
         'dont_filter': True, 'priority': 0, 'callback': 'after_form_page',
         'method': 'GET', 'errback': None}
        for nkw in ('Cars', 'Boats')
        for nkw2 in ('Cars', 'Boats')
        for in_kw in (1, 2, 3, 4)
    ]
    expected.append(
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
         'dont_filter': True, 'priority': 0, 'callback': 'parse', 'method': 'GET',
         'errback': None})
    self.assertEqual(request_list, expected)
def process_request(self, request, spider):
    # 0_imo_org: dynamically loaded pages
    if spider.name == "0_imo_org" and str(request.url).count("osssearchresults"):
        print("PhantomJS is starting...")
        driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])  # choose the browser to use
        # driver = webdriver.Firefox()
        driver.get(request.url)
        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)

    # 0_worldcargonews: dynamically loaded paging
    if spider.name == "0_worldcargonews" and str(request.url).count("search"):
        print("PhantomJS is starting...")
        driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])  # choose the browser to use
        # driver = webdriver.Firefox()
        driver.get(request.url)
        time.sleep(5)
        # print(driver.page_source)
        # paging (the spider runs every day, so paging is not actually needed;
        # range(1, 1) is empty, which keeps this click loop disabled)
        worldcargonews_look_more = '//div[@class="aoci aos-searchc"]/button'
        for i in range(1, 1):
            try:
                # the data is loaded by JS after the click
                driver.find_element_by_xpath(worldcargonews_look_more).click()
                time.sleep(2)
                print("more page")
            except Exception:
                print("get news data failed")
        body = driver.page_source
        url = driver.current_url
        print("final page")
        # print(driver.page_source)
        driver.close()
        return HtmlResponse(url, body=body, encoding='utf-8', request=request)

    # 2_Brunei_jpm: dynamically loaded pages
    if spider.name == "2_Brunei_jpm":
        print("PhantomJS is starting...")
        driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])  # choose the browser to use
        # driver = webdriver.Firefox()
        driver.get(request.url)
        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)

    # 6_Malaysia_miti: dynamically loaded paging
    if spider.name == "6_Malaysia_miti" and str(request.url).count("search"):
        print("PhantomJS is starting...")
        driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])  # choose the browser to use
        # driver = webdriver.Firefox()
        driver.get(request.url)
        time.sleep(1)
        # paging; range(1, 1) is empty, so this click loop stays disabled
        Malaysia_miti_look_more = '//*[@id="more"]'
        if str(request.url).count("search"):
            for i in range(1, 1):
                try:
                    # the data is loaded by JS after the click
                    driver.find_element_by_xpath(Malaysia_miti_look_more).click()
                    time.sleep(2)
                    # true_page = driver.page_source
                    print("more page")
                except Exception:
                    print("get news data failed")
        body = driver.page_source
        url = driver.current_url
        print("final page")
        # print(driver.page_source)
        driver.close()
        return HtmlResponse(url, body=body, encoding='utf-8', request=request)

    # 10_Thailand_thaigov: switch the page language
    if spider.name == "10_Thailand_thaigov":
        driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])  # choose the browser to use
        # driver = webdriver.Firefox()
        driver.get(request.url)
        time.sleep(5)
        # switch the language
        change_lang = driver.find_element_by_xpath(
            '//*[@id="destop"]/div[@class="col-sm-8 col-md-8 remove-xs"]'
            '/div[@class="col-xs-6 col-md-2 remove-xs"]/a[2]')  # .click()
        driver.execute_script("arguments[0].click();", change_lang)
        # time.sleep(20)
        WebDriverWait(driver, 30).until(
            lambda x: x.find_element_by_xpath("//*[contains(text(),'Change')]"))
        if str(request.body).count("Thursday 01 January 1970"):
            driver.close()
            return
        body = driver.page_source
        url = driver.current_url
        driver.close()
        return HtmlResponse(url, body=body, encoding='utf-8', request=request)

    # the default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape;
    # more user agent strings can be found at http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    # pages that do not need dynamic loading: rotate the User-Agent
    ua = random.choice(user_agent_list)
    if ua:
        request.headers.setdefault('User-Agent', ua)
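A note on wiring: a downloader middleware like the one above only takes effect once it is registered in the project's DOWNLOADER_MIDDLEWARES setting, and this one also expects a JS_BIN entry pointing at the PhantomJS binary. A minimal sketch; the module path and binary location below are assumptions, not part of the original code:

# settings.py -- sketch; module path and PhantomJS location are hypothetical
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}
JS_BIN = '/usr/local/bin/phantomjs'  # assumed path to the PhantomJS executable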
def test_generic_form_requests_with_spider_args(self):
    name = "ebay3"
    args = {'search_string': 'Cars'}
    spider = self.smanager.create(name, **args)
    generic_form_request = list(spider.start_requests())[0]

    response = HtmlResponse(
        url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
    response.request = generic_form_request
    request_list = [request_to_dict(req, spider)
                    for req in generic_form_request.callback(response)]
    expected = [{
        'body': '',
        '_encoding': 'utf-8',
        'cookies': {},
        'meta': {},
        'headers': {},
        'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
        'dont_filter': True,
        'priority': 0,
        'callback': 'after_form_page',
        'method': 'GET',
        'errback': None
    }, {
        'body': '',
        '_encoding': 'utf-8',
        'cookies': {},
        'meta': {},
        'headers': {},
        'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
        'dont_filter': True,
        'priority': 0,
        'callback': 'after_form_page',
        'method': 'GET',
        'errback': None
    }, {
        'body': '',
        '_encoding': 'utf-8',
        'cookies': {},
        'meta': {},
        'headers': {},
        'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
        'dont_filter': True,
        'priority': 0,
        'callback': 'after_form_page',
        'method': 'GET',
        'errback': None
    }, {
        'body': '',
        '_encoding': 'utf-8',
        'cookies': {},
        'meta': {},
        'headers': {},
        'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
        'dont_filter': True,
        'priority': 0,
        'callback': 'after_form_page',
        'method': 'GET',
        'errback': None
    }, {
        'body': '',
        '_encoding': 'utf-8',
        'cookies': {},
        'meta': {},
        'headers': {},
        'url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
        'dont_filter': True,
        'priority': 0,
        'callback': 'parse',
        'method': 'GET',
        'errback': None
    }]
    self.assertEqual(request_list, expected)
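For readers unfamiliar with the helper used in these tests: request_to_dict (from scrapy.utils.reqser in Scrapy versions of this era) flattens a Request into a plain dict, storing the callback by method name, which is what makes the literal expected lists above directly comparable. A small illustration, assuming a spider object with a parse method:

from scrapy import Request
from scrapy.utils.reqser import request_to_dict

d = request_to_dict(Request('http://example.com', callback=spider.parse), spider)
# d['callback'] == 'parse'; url, method, meta etc. become plain serializable values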
def parse_doctor(self, response):
    response_url = response.url
    doctor_id = re.search('doctor/([^\.]*)\.htm', response_url).group(1)
    hxs = Selector(response)

    # parse doctor name
    name_list = hxs.xpath("//input[@name='doctor_name']/@value")
    doctor_name = ''
    if len(name_list) != 0:
        doctor_name = name_list[0].extract()

    # hospital and department
    hospital_department_selectors = hxs.xpath("//meta[@name='keywords']/@content")
    hospital = ''
    department = ''
    if len(hospital_department_selectors) != 0:
        hospital_re = r',(?P<hospital>.*?)' + doctor_name
        hospital_match = re.search(hospital_re, hospital_department_selectors[0].extract())
        if hospital_match is not None:
            hospital = hospital_match.group('hospital')
            department_re = hospital + r'(?P<department>.*?)' + doctor_name + ','
            department_match = re.search(department_re, hospital_department_selectors[0].extract())
            if department_match is not None:
                department = department_match.group('department')

    # title
    title = ''
    title_selectors = hxs.xpath('//meta[@name="description"]/@content')
    if len(title_selectors) != 0:
        title_re_str = doctor_name + r'(?P<doctor_title>.*?)' + u'简介'  # u'简介' = "profile"
        title = re.search(title_re_str, title_selectors[0].extract()).group(1)

    doctor_about_dict = None
    tag_doctor_about_selectors = hxs.xpath('//div[@id="bp_doctor_about"]/div[@class="doctor_about"]')
    if len(tag_doctor_about_selectors) != 0:
        doctor_about_dict = self.parse_doctor_about(tag_doctor_about_selectors)
    else:
        doctor_about_match_list = hxs.xpath(
            '//script[@type="text/javascript"]/text()').re(
            'BigPipe.onPageletArrive\((?P<doctor_about>\{"id":"bp_doctor_about".*\})\);')
        if doctor_about_match_list:
            da_dict = json.loads(doctor_about_match_list[0])
            if 'content' in da_dict:
                doctor_about_hxs = Selector(HtmlResponse(url=response.url,
                                                         body=da_dict['content'].encode('utf-8')))
                doctor_about_dict = self.parse_doctor_about(doctor_about_hxs)

    # schedule
    doctor_schedule = []
    trs = hxs.xpath("//table[@class='doctortimefrom1']/tr")
    day_part = 0
    for itr in trs:
        if 0 != day_part:
            doctor_schedule.extend(self.weekday_operation(itr, day_part))  # morning rows first
        day_part += 1

    # # disease
    # disease_list = list()
    # disease_ht_selector = hxs.xpath('//div[@class="ltdiv"]//table[@class="jbsm"]//td')
    # if len(disease_ht_selector) == 1:
    #     disease_list = self.parse_disease_from_td_selector(disease_ht_selector, doctor_id=doctor_id)
    # else:
    #     disease_match_list = hxs.xpath(
    #         '//script[@type="text/javascript"]/text()').re(
    #         'BigPipe.onPageletArrive\((?P<dict_content>\{"id":"bp_doctor_getvote".*\})\);')
    #     if disease_match_list:
    #         disease_match = disease_match_list[0]
    #         d_dict = json.loads(disease_match)
    #         if 'content' in d_dict:
    #             disease_hxs = Selector(HtmlResponse(url=response.url, body=d_dict['content'].encode('utf-8')))
    #             disease_selector = disease_hxs.xpath('//div[@class="ltdiv"]//table[@class="jbsm"]//td')
    #             if len(disease_selector) == 1:
    #                 disease_list = self.parse_disease_from_td_selector(disease_selector, doctor_id=doctor_id)

    zanwu_re = re.compile(u'暂无')  # u'暂无' = "not available" placeholder text
    empty_sub_re = re.compile(r'(<!--.*?-->|\n|\t|\r|[ ])')
    item = XPathItemLoader(DoctorDetailItem(), hxs)
    item.add_value('doctor_id', doctor_id)
    if doctor_name:
        item.add_value('_name', doctor_name)
    if response.meta['city']:
        item.add_value('city', response.meta['city'])
    if hospital:
        item.add_value('hospital', hospital)
    if department:
        item.add_value('department', department)
    if title:
        item.add_value('title', title)
    if doctor_schedule:
        item.add_value('schedule', doctor_schedule)
    else:
        if len(hxs.xpath('//table[@class="doctortimefrom1"]')) == 0:
            for content in hxs.xpath('//script[@type="text/javascript"]/text()').extract():
                if content.find('doctortimefrom1') != -1:
                    item.add_value('schedule', '')  # shouldn't exist in js
                    break
    if doctor_about_dict:
        if 'image_url' in doctor_about_dict:
            item.add_value('image', doctor_about_dict['image_url'])
        if 'bio' in doctor_about_dict:
            bio = doctor_about_dict['bio']
            if zanwu_re.search(bio) is not None:
                bio = ''
            if bio:
                item.add_value('bio', empty_sub_re.sub('', bio))
        if 'feature' in doctor_about_dict:
            feature = doctor_about_dict['feature']
            if zanwu_re.search(feature) is not None:
                feature = ''
            if feature:
                item.add_value('feature', empty_sub_re.sub('', feature))
    yield item.load_item()

    url = u'http://www.haodf.com/doctor/' + doctor_id + u'/jingyan/1.htm'
    l = LetterItem()
    l['doctor_id'] = doctor_id
    letter = []
    disease_item = DoctorDiseaseItem()
    disease_item['doctor_id'] = doctor_id
    req = Request(url, callback=self.parse_letter)
    req.meta['item'] = l
    req.meta['letter'] = letter
    req.meta['disease'] = disease_item
    yield req
class SelectortemLoaderTest(unittest.TestCase):
    response = HtmlResponse(url="", body="""
    <html>
    <body>
    <div id="id">marta</div>
    <p>paragraph</p>
    <a href="http://www.scrapy.org">homepage</a>
    <img src="/images/logo.png" width="244" height="65" alt="Scrapy">
    </body>
    </html>
    """)

    def test_constructor(self):
        l = TestItemLoader()
        self.assertEqual(l.selector, None)

    def test_constructor_errors(self):
        l = TestItemLoader()
        self.assertRaises(RuntimeError, l.add_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.replace_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.get_xpath, '//a/@href')
        self.assertRaises(RuntimeError, l.add_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.replace_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.get_css, '#name::text')

    def test_constructor_with_selector(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assert_(l.selector is sel)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_selector_css(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assert_(l.selector is sel)
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response_css(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.add_css('url', 'a::attr(href)')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        # combining/accumulating CSS selectors and XPath expressions
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta', u'Marta'])
        l.add_xpath('url', '//img/@src')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org', u'/images/logo.png'])

    def test_add_xpath_re(self):
        l = TestItemLoader(response=self.response)
        l.add_xpath('name', '//div/text()', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

    def test_replace_xpath(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath('name', '//p/text()')
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
        l.replace_xpath('name', ['//p/text()', '//div/text()'])
        self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])

    def test_get_xpath(self):
        l = TestItemLoader(response=self.response)
        self.assertEqual(l.get_xpath('//p/text()'), [u'paragraph'])
        self.assertEqual(l.get_xpath('//p/text()', TakeFirst()), u'paragraph')
        self.assertEqual(l.get_xpath('//p/text()', TakeFirst(), re='pa'), u'pa')
        self.assertEqual(l.get_xpath(['//p/text()', '//div/text()']), [u'paragraph', 'marta'])

    def test_replace_xpath_multi_fields(self):
        l = TestItemLoader(response=self.response)
        l.add_xpath(None, '//div/text()', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath(None, '//p/text()', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

    def test_replace_xpath_re(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath('name', '//div/text()', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

    def test_add_css_re(self):
        l = TestItemLoader(response=self.response)
        l.add_css('name', 'div::text', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])
        l.add_css('url', 'a::attr(href)', re='http://(.+)')
        self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])

    def test_replace_css(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_css('name', 'p::text')
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
        l.replace_css('name', ['p::text', 'div::text'])
        self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])
        l.add_css('url', 'a::attr(href)', re='http://(.+)')
        self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])
        l.replace_css('url', 'img::attr(src)')
        self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])

    def test_get_css(self):
        l = TestItemLoader(response=self.response)
        self.assertEqual(l.get_css('p::text'), [u'paragraph'])
        self.assertEqual(l.get_css('p::text', TakeFirst()), u'paragraph')
        self.assertEqual(l.get_css('p::text', TakeFirst(), re='pa'), u'pa')
        self.assertEqual(l.get_css(['p::text', 'div::text']), [u'paragraph', 'marta'])
        self.assertEqual(l.get_css(['a::attr(href)', 'img::attr(src)']),
                         [u'http://www.scrapy.org', u'/images/logo.png'])

    def test_replace_css_multi_fields(self):
        l = TestItemLoader(response=self.response)
        l.add_css(None, 'div::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_css(None, 'p::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
        l.add_css(None, 'a::attr(href)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        l.replace_css(None, 'img::attr(src)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])

    def test_replace_css_re(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('url', 'a::attr(href)')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        l.replace_css('url', 'a::attr(href)', re='http://www\.(.+)')
        self.assertEqual(l.get_output_value('url'), [u'scrapy.org'])
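These tests never show TestItemLoader itself; judging by the expectations (u'marta' in, u'Marta' out, URLs untouched), it is an ItemLoader whose name field has a title-casing input processor. A plausible reconstruction, assuming modern import paths (older Scrapy used scrapy.contrib.loader):

from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose

class TestItem(Item):
    name = Field()
    url = Field()

class TestItemLoader(ItemLoader):
    default_item_class = TestItem
    # title-case each extracted value, so u'marta' becomes u'Marta';
    # 'url' has no input processor, which is why hrefs pass through unchanged
    name_in = MapCompose(lambda v: v.title())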
def getScrapyResponse(self, url):
    # render the page with Selenium, then wrap the HTML in a Scrapy response
    response = self.downloadUsingSelenium(url)
    response = HtmlResponse(url=url, body=response, encoding='utf-8')
    return response
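downloadUsingSelenium is not shown in this snippet; one plausible shape for it, assuming the object holds a Selenium WebDriver on self.driver, is simply:

def downloadUsingSelenium(self, url):
    # hypothetical helper: render the page in the browser and return its HTML
    self.driver.get(url)
    return self.driver.page_source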
def process_request(self, request, spider):
    # opt-in per spider: only render with Selenium when the spider asks for it
    if spider.USE_SELENIUM:
        url = request.url
        self.driver.get(url)
        # returning a response from a downloader middleware skips the normal download
        return HtmlResponse(url, body=self.driver.page_source, encoding='utf-8')
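Because returning a Response from process_request short-circuits Scrapy's downloader, a spider opts in by carrying the flag the middleware checks. A minimal sketch; the spider name and class are invented:

import scrapy

class RenderedSpider(scrapy.Spider):
    name = 'rendered'  # hypothetical spider
    USE_SELENIUM = True  # read by the middleware above

    def parse(self, response):
        # response.body here is the Selenium-rendered page source
        yield {'title': response.xpath('//title/text()').get()}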
def process_request(self, request, spider):
    '''
    :param request: the request
    :param spider: the spider
    :return:
    '''
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless Chrome mode
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    # decide which spider this request belongs to
    if spider.name == 'scjrm_zszq':
        # check whether this is the login request
        # if request.url == "http://www.scjrm.com/site/login.html":
        print("<<<<<<<" + request.url)
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
        spider.driver.get("http://www.scjrm.com/site/login.html")
        # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
        time.sleep(2)
        # type the account name and password
        username = spider.driver.find_element_by_id('phonenumber')
        password = spider.driver.find_element_by_id('password')
        username.send_keys('18030535053')
        password.send_keys('123456')
        # click the "Login" button
        spider.driver.find_element_by_id('sub_bt').click()
        time.sleep(1)
        spider.driver.get(request.url)
        time.sleep(3)
        spider.cookies = spider.driver.get_cookies()
        time.sleep(1)
        return HtmlResponse(url=spider.driver.current_url,   # URL after login
                            body=spider.driver.page_source,  # HTML source
                            encoding='utf-8')
        # not the login request
        # else:
        #     req = requests.session()  # session
        #     for cookie in spider.cookies:
        #         req.cookies.set(cookie['name'], cookie["value"])
        #     req.headers.clear()  # clear the headers
        #     newpage = req.get(request.url)
        #     time.sleep(5)
        #     return HtmlResponse(url=request.url,   # current URL
        #                         body=newpage.text,  # page source
        #                         encoding="utf-8", request=request)  # return the page
    if spider.name == 'scjuchuang_yxzq':
        # check whether this is the login request
        # if request.url.find('login') != -1:
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
        spider.driver.get('https://www.scjuchuang.com/login')
        # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
        time.sleep(2)
        # type the account name and password
        username = spider.driver.find_element_by_class_name('loginName')
        password = spider.driver.find_element_by_class_name('loginPassword')
        username.send_keys('yczs123')
        password.send_keys('123456')
        # click the "Login" button
        spider.driver.find_element_by_class_name('loginBtn').click()
        time.sleep(1)
        spider.driver.get('https://www.scjuchuang.com/goods?attr=1&page=1')
        # spider.driver.find_element_by_link_text('院线专区').click()
        spider.cookies = spider.driver.get_cookies()
        return HtmlResponse(url=spider.driver.current_url,   # URL after login
                            body=spider.driver.page_source,  # HTML source
                            encoding='utf-8')
    elif spider.name == 'rjyiyao_xpsj':
        # check whether this is the login request
        # if request.url.find('login') != -1:
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe",
            chrome_options=chrome_options)
        spider.driver.get('http://new.rjyiyao.com/web/login')
        # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
        time.sleep(2)
        # type the account name and password
        username = spider.driver.find_element_by_id('username')
        password = spider.driver.find_element_by_id('password')
        username.send_keys('18030535053')
        password.send_keys('123456')
        # click the "Login" button
        spider.driver.find_element_by_id('btnLogin').click()
        time.sleep(1)
        # spider.driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[3]/div[2]/a[2]/img').click()  # "new arrivals"
        # windows = spider.driver.window_handles
        # spider.driver.switch_to.window(windows[1])  # switch to the second window
        spider.driver.get('http://new.rjyiyao.com/web/product/group/5?page=1')
        time.sleep(5)
        spider.cookies = spider.driver.get_cookies()
        return HtmlResponse(url=spider.driver.current_url,   # URL after login
                            body=spider.driver.page_source,  # HTML source
                            encoding='utf-8')
    elif spider.name == 'rjyiyao_zkzq':
        # check whether this is the login request
        # if request.url.find('login') != -1:
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe",
            chrome_options=chrome_options)
        spider.driver.get('http://new.rjyiyao.com/web/login')
        # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
        time.sleep(1)
        # type the account name and password
        username = spider.driver.find_element_by_id('username')
        password = spider.driver.find_element_by_id('password')
        username.send_keys('18030535053')
        password.send_keys('123456')
        # click the "Login" button
        spider.driver.find_element_by_id('btnLogin').click()
        time.sleep(2)
        spider.driver.get('http://new.rjyiyao.com/web/product/sale/3?page=1')
        # spider.driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[3]/div[2]/a[2]/img').click()  # "new arrivals"
        # windows = spider.driver.window_handles
        # spider.driver.switch_to.window(windows[1])  # switch to the second window
        time.sleep(5)
        spider.cookies = spider.driver.get_cookies()
        return HtmlResponse(url=spider.driver.current_url,   # URL after login
                            body=spider.driver.page_source,  # HTML source
                            encoding='utf-8')
    elif spider.name == 'sckxyy_ypzq':
        # check whether this is the login request
        # if request.url.find('login') != -1:
        spider.driver = webdriver.Chrome(
            executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
        spider.driver.get('http://www.sckxyy.com/Login.html')
        time.sleep(2)
        # type the account name and password
        username = spider.driver.find_element_by_id('usernameLogin')
        password = spider.driver.find_element_by_id('passwordLogin')
        username.send_keys('bianyuantianshi')
        password.send_keys('123456')
        # click the "Login" button
        spider.driver.find_element_by_id('userLogin').click()
        time.sleep(1)
        spider.cookies = spider.driver.get_cookies()
        spider.driver.get('http://www.sckxyy.com/Drug_zone.html#Monday-bg-two')
        # spider.driver.find_element_by_link_text('普药专区').click()  # "general drugs" section
        # time.sleep(5)
        # windows = spider.driver.window_handles
        # spider.driver.switch_to.window(windows[1])  # switch to the second window
        return HtmlResponse(url=spider.driver.current_url,   # URL after login
                            body=spider.driver.page_source,  # HTML source
                            encoding='utf-8')
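All of the login branches above hardcode credentials and the chromedriver path; a common cleanup is to pull these from the project settings so they stay out of the middleware. A sketch with invented setting names:

# settings.py (setting names are made up)
CHROMEDRIVER_PATH = 'C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe'
SITE_CREDENTIALS = {
    'scjrm_zszq': ('18030535053', '123456'),
    'scjuchuang_yxzq': ('yczs123', '123456'),
}

# inside the middleware, the per-spider lookup would then be roughly:
#     user, pwd = spider.settings.get('SITE_CREDENTIALS')[spider.name]
#     driver = webdriver.Chrome(executable_path=spider.settings.get('CHROMEDRIVER_PATH'))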
def test_restrict_xpaths_with_html_entities(self):
    html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=html, encoding='iso8859-15')
    links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
    self.assertEqual(links,
                     [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])
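SgmlLinkExtractor was deprecated in Scrapy 1.0 in favour of the lxml-based extractor, and the same assertion should hold with the default LinkExtractor. A sketch assuming a reasonably recent Scrapy:

from scrapy.http import HtmlResponse
from scrapy.link import Link
from scrapy.linkextractors import LinkExtractor

html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
response = HtmlResponse("http://example.org/somepage/index.html",
                        body=html, encoding='iso8859-15')
links = LinkExtractor(restrict_xpaths='//p').extract_links(response)
# entities are decoded, then percent-encoded in the resulting absolute URL
assert links == [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text='text')]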