Example no. 1
0
    def parse_list_page(self, response):
        """Parse a Google patent-search result page, yielding one PatentItem per hit.

        Each result is an ``<li class="g">``; the title anchor (``h3/a``)
        carries the URL of the patent detail page, which is queued as a
        follow-up request when it can be normalized to a full URL.
        """
        html5_response = response_html5parse(response)
        selector = HtmlXPathSelector(html5_response)
        for result in selector.select('//li[@class="g"]'):
            # e.g. //li[@class="g"][1]//h3/a/@href
            raw_url = ''.join(result.select('.//h3/a/@href').extract())
            url = urllib.unquote(raw_url).strip()
            doc = {
                'data_source': 'google专利搜索',
                'url': url,
            }
            detail_url = fix_possible_missing_scheme(url)
            if detail_url:
                next_request = Request(detail_url, callback=self.parse_detail_page)
            else:
                next_request = None
            item = PatentItem(
                doc=doc,
                next_request=next_request,
                list_url=response.url,
                query=response.meta.get('query'),
                attachments=[],
                attachment_urls=[],
            )
            yield self.item_or_request(item)
Example no. 2
0
 def parse_list_page(self, response):
     """Parse a Baidu news-search result page, yielding one NewsItem per row.

     Each hit is a ``<table>`` under the ``#r`` container; the ``<nobr>``
     element packs "site_name pub_time" into one string, split at the first
     space.
     """
     html5_response = response_html5parse(response)
     selector = HtmlXPathSelector(html5_response)
     for row in selector.select('//*[@id="r"]/table'):
         meta_text = ''.join(row.select('.//nobr//text()').extract())
         site_name, pub_time = meta_text.split(' ', 1)
         title = ''.join(row.select('.//span/b//text()').extract())
         overview = ''.join(row.select('.//font[@size="-1"]//text()').extract())
         raw_url = ''.join(row.select('.//span/../@href').extract())
         url = urllib.unquote(raw_url).strip()
         doc = {
             'data_source': '百度新闻搜索',
             'site_name': site_name,
             'pub_time': pub_time,
             'title': title,
             'overview': overview,
             'url': url,
         }
         detail_url = fix_possible_missing_scheme(url)
         if detail_url:
             next_request = Request(detail_url, callback=self.parse_detail_page)
         else:
             next_request = None
         item = NewsItem(
             doc=doc,
             next_request=next_request,
             list_url=response.url,
             query=response.meta.get('query'),
         )
         yield self.item_or_request(item)
Example no. 3
0
    def parse_list_page(self, response):
        """Parse a Wanfang patent-search result page, yielding one PatentItem per entry.

        Each result is a ``<ul class="list_ul">``; the detail link is the
        anchor whose text reads "查看详细信息", and the patent name comes from
        the third anchor of the first ``<li>``.
        """
        html5_response = response_html5parse(response)
        selector = HtmlXPathSelector(html5_response)
        for entry in selector.select('//ul[@class="list_ul"]'):
            raw_url = "".join(entry.select('.//a[.//text()="查看详细信息"]/@href').extract())
            patent_name = "".join(entry.select("./li[1]/a[3]//text()").extract())
            url = urllib.unquote(raw_url).strip()
            doc = {"patent_name": patent_name, "data_source": "万方专利搜索", "url": url}
            detail_url = fix_possible_missing_scheme(url)
            if detail_url:
                next_request = Request(detail_url, callback=self.parse_detail_page)
            else:
                next_request = None
            item = PatentItem(
                doc=doc,
                next_request=next_request,
                list_url=response.url,
                query=response.meta.get("query"),
                attachments=[],
                attachment_urls=[],
            )
            yield self.item_or_request(item)
Example no. 4
0
 def parse_list_page(self, response):
     """Parse a Sogou video-search result page, yielding one VideoItem per entry.

     Each hit is an ``<li>`` under ``#vlist1``. The thumbnail URL (if any) is
     queued as an attachment; no detail-page request is made for videos.

     Fix: the original used ``pub_time.lstrip('发布时间:')``, but ``lstrip``
     treats its argument as a *character set*, not a literal prefix, so it
     could also eat leading characters of the date itself. The label is now
     removed with an explicit prefix check.
     """
     multi_xpath = '//ul[@id="vlist1"]/li'
     html5_response = response_html5parse(response)
     hxs = HtmlXPathSelector(html5_response)
     multi_hxs = hxs.select(multi_xpath)
     for hxs in multi_hxs:
         #//li[@class="g"][1]//h3/a/@href
         url = ''.join(hxs.select('.//div/a/@href').extract())
         title = ''.join(hxs.select('.//h3//text()').extract())
         title = title.strip()
         thumb = ''.join(hxs.select('.//div[@class="imgbox"]//img//@src').extract())
         pub_time = ''.join(hxs.select('.//p//text()').extract())
         # Strip the literal "发布时间:" label exactly once (lstrip would
         # strip any of those characters repeatedly).
         label = '发布时间:'
         if pub_time.startswith(label):
             pub_time = pub_time[len(label):]
         total_time = ''.join(hxs.select('.//span[@class="updatetxt_time"]//text()').extract())
         from_site = ''.join(hxs.select('.//div[@class="updatetxt"]/text()').extract())
         url = urllib.unquote(url).strip()
         url = fix_possible_missing_scheme(url)
         doc = {
             'data_source': 'sogou视频搜索',
             'url': url,
             'title': title,
             'thumb': thumb,
             'pub_time': pub_time,
             'total_time': total_time,
             'from_site': from_site,
         }
         list_url = response.url
         query = response.meta.get('query')
         next_request = None
         attachment_urls = []
         if thumb:
             attachment_urls.append(thumb)
         item = VideoItem(doc=doc,
                          next_request=next_request, list_url=list_url, query=query,
                          attachments=[], attachment_urls=attachment_urls)
         yield self.item_or_request(item)