def parse_list_page(self, response): multi_xpath = '//li[@class="g"]' html5_response = response_html5parse(response) hxs = HtmlXPathSelector(html5_response) multi_hxs = hxs.select(multi_xpath) for hxs in multi_hxs: #//li[@class="g"][1]//h3/a/@href url = ''.join(hxs.select('.//h3/a/@href').extract()) url = urllib.unquote(url).strip() doc = { 'data_source': 'google专利搜索', 'url': url, } detail_url = fix_possible_missing_scheme(url) list_url = response.url query = response.meta.get('query') if not detail_url: next_request = None else: # detail_url = detail_url.replace('_free', '') next_request = Request(detail_url, callback=self.parse_detail_page) item = PatentItem(doc=doc, next_request=next_request, list_url=list_url, query=query, attachments=[], attachment_urls=[]) yield self.item_or_request(item)
def parse_list_page(self, response): multi_xpath = '//*[@id="r"]/table' html5_response = response_html5parse(response) hxs = HtmlXPathSelector(html5_response) multi_hxs = hxs.select(multi_xpath) for hxs in multi_hxs: site_name, pub_time = ''.join(hxs.select('.//nobr//text()').extract()).split(' ', 1) title = ''.join(hxs.select('.//span/b//text()').extract()) overview = ''.join(hxs.select('.//font[@size="-1"]//text()').extract()) url = ''.join(hxs.select('.//span/../@href').extract()) url = urllib.unquote(url).strip() doc = { 'data_source': '百度新闻搜索', 'site_name': site_name, 'pub_time': pub_time, 'title': title, 'overview': overview, 'url': url, } detail_url = fix_possible_missing_scheme(url) list_url = response.url query = response.meta.get('query') if not detail_url: next_request = None else: next_request = Request(detail_url, callback=self.parse_detail_page) item = NewsItem(doc=doc, next_request=next_request, list_url=list_url, query=query) yield self.item_or_request(item)
def parse_list_page(self, response): multi_xpath = '//ul[@class="list_ul"]' html5_response = response_html5parse(response) hxs = HtmlXPathSelector(html5_response) multi_hxs = hxs.select(multi_xpath) for hxs in multi_hxs: url = "".join(hxs.select('.//a[.//text()="查看详细信息"]/@href').extract()) # url = ''.join(hxs.select('./li[1]/a[3]/@href').extract()) patent_name = "".join(hxs.select("./li[1]/a[3]//text()").extract()) url = urllib.unquote(url).strip() doc = {"patent_name": patent_name, "data_source": "万方专利搜索", "url": url} detail_url = fix_possible_missing_scheme(url) list_url = response.url query = response.meta.get("query") if not detail_url: next_request = None else: # detail_url = detail_url.replace('_free', '') next_request = Request(detail_url, callback=self.parse_detail_page) item = PatentItem( doc=doc, next_request=next_request, list_url=list_url, query=query, attachments=[], attachment_urls=[] ) yield self.item_or_request(item)
def parse_list_page(self, response): multi_xpath = '//ul[@id="vlist1"]/li' html5_response = response_html5parse(response) hxs = HtmlXPathSelector(html5_response) multi_hxs = hxs.select(multi_xpath) for hxs in multi_hxs: #//li[@class="g"][1]//h3/a/@href url = ''.join(hxs.select('.//div/a/@href').extract()) title = ''.join(hxs.select('.//h3//text()').extract()) title = title.strip() thumb = ''.join(hxs.select('.//div[@class="imgbox"]//img//@src').extract()) pub_time = ''.join(hxs.select('.//p//text()').extract()) pub_time = pub_time.lstrip('发布时间:') total_time = ''.join(hxs.select('.//span[@class="updatetxt_time"]//text()').extract()) from_site = ''.join(hxs.select('.//div[@class="updatetxt"]/text()').extract()) url = urllib.unquote(url).strip() url = fix_possible_missing_scheme(url) doc = { 'data_source': 'sogou视频搜索', 'url': url, 'title': title, 'thumb': thumb, 'pub_time': pub_time, 'total_time': total_time, 'from_site': from_site, } list_url = response.url query = response.meta.get('query') next_request = None attachment_urls = [] if thumb: attachment_urls.append(thumb) item = VideoItem(doc=doc, next_request=next_request, list_url=list_url, query=query, attachments=[], attachment_urls=attachment_urls) yield self.item_or_request(item)