Пример #1
0
    def parse(self, response):
        """Build one ``XiamiItem`` per table row of a listening-history page.

        The user name is taken from the page title (its last 7 characters
        are dropped). The total record count and the current page number are
        both cut out of the third-from-last ``<span>`` on the page.

        Returns:
            A list of ``XiamiItem`` objects when the current page number is
            at most ``total // 50``; ``None`` otherwise (the page is skipped).

        Every ``[...]`` lookup below is unguarded, so a page that lacks one
        of the expected elements makes the callback raise.
        """
        hxs = HtmlXPathSelector(response)
        rows = hxs.x('//table/tbody')
        items = []

        # Which user this page belongs to.
        user = hxs.x('//head/title/text()').extract()[0][:-7].encode('utf-8')

        # The same span holds both the total number of listening records and
        # the current page number, so select and encode it only once.
        pager_text = hxs.x('//span').extract()[-3].encode('utf-8')
        total_records = pager_text.split('共')[1].split('条')[0]
        current_page = pager_text.split('第')[1].split('页')[0]

        # For an integer page number, comparing against the floored quotient
        # gives the same answer as comparing against the exact quotient.
        if int(current_page) > int(total_records) // 50:
            return None

        # NOTE(review): range(1, 50) visits tr[1]..tr[49] only. The guard
        # above divides by 50, so a 50th row, if the page has one, is never
        # read -- confirm whether that is intended.
        for row_number in range(1, 50):
            row_prefix = 'tr[' + str(row_number) + ']'
            item = XiamiItem()
            item['user'] = user
            # The song is the 4th piece of the anchor markup split on '"'.
            anchor_markup = rows.x(row_prefix + '/td[2]/a').extract()[0]
            item['song'] = anchor_markup.split('\"')[3].encode('utf-8')
            # Parenthesised single-argument print gives identical output on
            # Python 2 and is also valid Python 3 syntax.
            print('_______________' + item['song'])
            item['artist'] = rows.x(
                row_prefix + '/td[2]/a/text()').extract()[1].encode('utf-8')
            print('+++++++++++++++' + item['artist'])
            items.append(item)
        return items
        """ 
Пример #2
0
def parse(self, response):
    """Collect the follow-up requests found on a listing page.

    Returns a list holding one request per ``//h1/a`` link (with
    ``parse_post`` as callback), followed by a default request for every
    untitled ``wp-pagenavi`` anchor whose text is ``u'\\xbb'``.
    """
    selector = HtmlXPathSelector(response)
    requests = []

    # One request per post link, handled by parse_post.
    for post_url in selector.x('//h1/a/@href').extract():
        post_request = self.make_requests_from_url(post_url)
        requests.append(post_request.replace(callback=self.parse_post))

    # Among the pager anchors without a title attribute, follow only those
    # whose text is the right-pointing guillemet.
    pager_anchors = selector.x('//div[@class="wp-pagenavi"]/a[not(@title)]')
    for anchor in pager_anchors:
        label = anchor.x('text()').extract()[0]
        if label != u'\xbb':
            continue
        target = anchor.x('@href').extract()[0]
        requests.append(self.make_requests_from_url(target))

    return requests
Пример #3
0
 def parse(self, response):
     """Yield a request per article on a listing page, then the next page.

     Logs the response URL, yields one request (callback ``parse_content``)
     for every ``publicLeftCon mt10`` block, and finally yields a request
     for the next listing page (callback ``parse``). The generator simply
     ends when the next-page link cannot be read.
     """
     self.log("OK,%s" % response.url)
     selector = HtmlXPathSelector(response)

     # Pass every article link on for further processing.
     for block in selector.x('//div[@class="publicLeftCon mt10"]'):
         article_request = self.make_requests_from_url(
             block.x('h5/a/@href').extract()[0])
         yield article_request.replace(callback=self.parse_content)

     # Pass the next-page link on for further processing.
     next_link_xpath = '//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一页"]/@href'
     try:
         next_path = selector.x(next_link_xpath).extract()[0]
     except Exception:
         # Any failure while reading the link ends the crawl of this listing.
         return
     else:
         # The extracted href is appended to the site root.
         next_page = self.make_requests_from_url(
             'http://article.yeeyan.org' + next_path)
         yield next_page.replace(callback=self.parse)
Пример #4
0
 def parse(self, response):
     """Generate article requests and the next-page request for a listing.

     After logging the response URL, one request per
     ``publicLeftCon mt10`` block is yielded with ``parse_content`` as
     callback. A request for the next listing page (callback ``parse``) is
     yielded last; if reading its link fails, nothing more is yielded.
     """
     self.log("OK,%s" % response.url)
     page = HtmlXPathSelector(response)

     # Keep processing the article links.
     article_blocks = page.x('//div[@class="publicLeftCon mt10"]')
     for article_block in article_blocks:
         article_url = article_block.x('h5/a/@href').extract()[0]
         request = self.make_requests_from_url(article_url)
         yield request.replace(callback=self.parse_content)

     # Keep processing the next-page link.
     try:
         next_links = page.x('//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一页"]/@href')
         relative_next = next_links.extract()[0]
     except Exception:
         # Best effort: stop quietly when the link cannot be read.
         return

     absolute_next = 'http://article.yeeyan.org' + relative_next
     follow_up = self.make_requests_from_url(absolute_next)
     yield follow_up.replace(callback=self.parse)
Пример #5
0
 def parse_jx(self, item, response):
     """Fill ``item`` from a page that uses the ``jxa_*`` / ``jxar_*`` markup.

     Sets ``url``, ``title``, ``author``, ``release_time``, ``excerpt``,
     ``category`` and ``content_html`` on ``item`` and returns it.
     ``excerpt`` becomes ``None`` when it cannot be read; every other field
     is looked up without a guard, so a missing element raises.
     """
     selector = HtmlXPathSelector(response)

     def first_match(xpath):
         # First extracted string for a page-level XPath.
         return selector.x(xpath).extract()[0]

     item['url'] = response.url
     # The title is the second '|'-separated part of <title>.
     page_title = first_match('//title/text()')
     item['title'] = page_title.split('|')[1].strip()
     author_box = selector.x('//div[@class="jxar_author"]')
     item['author'] = author_box.x('.//a/text()').extract()[0]
     item['release_time'] = first_match('//p[@class="jxa_info"]/span[1]/text()')
     try:
         item['excerpt'] = first_match('//p[@class="jxa_intro"]/text()')
     except Exception:
         # The excerpt is optional.
         item['excerpt'] = None
     # Category: second whitespace-separated word of the second text node.
     map_texts = selector.x('//p[@class="jxa_map"]/text()').extract()
     item['category'] = map_texts[1].split()[1]
     item['content_html'] = first_match('//div[@class="jxa_content"]')
     return item
Пример #6
0
 def parse_content(self, response):
     """Turn an article page into a ``YeeyanItem``.

     Pages that contain a ``jx_logo`` anchor with text are delegated to
     ``parse_jx``. All other pages are read with the ``user_info`` /
     ``conBox`` / ``crumb`` XPaths below. A missing excerpt is stored as an
     empty string; the remaining lookups are unguarded.
     """
     selector = HtmlXPathSelector(response)
     item = YeeyanItem()

     # Pages carrying the jx logo have their own extraction routine.
     if selector.x('//a[@class="jx_logo"]/text()'):
         return self.parse_jx(item, response)

     item['url'] = response.url
     # The title is the second '|'-separated part of <title>.
     raw_title = selector.x('//title/text()').extract()[0]
     item['title'] = raw_title.split('|')[1].strip()
     user_box = selector.x('//div[@class="user_info"]')
     item['author'] = user_box.x('.//h2/a/text()').extract()[0]
     excerpts = selector.x('//p[@class="excerpt"]/text()').extract()
     item['excerpt'] = excerpts[0] if excerpts else ''
     item['content_html'] = selector.x('//div[@id="conBox"]').extract()[0]
     # Stripped text minus its first character and its last seven.
     time_text = user_box.x('.//p/text()').extract()[0]
     item['release_time'] = time_text.strip()[1:-7]
     # The category is the second breadcrumb link.
     crumbs = selector.x('//div[@class="crumb"]/a/text()').extract()
     item['category'] = crumbs[1]
     return item
Пример #7
0
 def parse_jx(self, item, response):
     """Populate ``item`` from a page built with ``jxa_*`` / ``jxar_*`` markup.

     Writes ``url``, ``title``, ``author``, ``release_time``, ``excerpt``,
     ``category`` and ``content_html`` into ``item`` and returns the same
     object. Only ``excerpt`` is optional (stored as ``None`` on failure);
     the other lookups are unguarded.
     """
     page = HtmlXPathSelector(response)
     item['url'] = response.url

     # Title: second '|'-separated part of the <title> text.
     title_parts = page.x('//title/text()').extract()[0].split('|')
     item['title'] = title_parts[1].strip()

     author_links = page.x('//div[@class="jxar_author"]').x('.//a/text()')
     item['author'] = author_links.extract()[0]

     info_spans = page.x('//p[@class="jxa_info"]/span[1]/text()')
     item['release_time'] = info_spans.extract()[0]

     try:
         item['excerpt'] = page.x('//p[@class="jxa_intro"]/text()').extract()[0]
     except Exception:
         # No readable intro paragraph: record the absence explicitly.
         item['excerpt'] = None

     # Category: second word of the second text node under jxa_map.
     map_text = page.x('//p[@class="jxa_map"]/text()').extract()[1]
     item['category'] = map_text.split()[1]

     content_blocks = page.x('//div[@class="jxa_content"]').extract()
     item['content_html'] = content_blocks[0]
     return item
Пример #8
0
 def parse_content(self, response):
     """Build a ``YeeyanItem`` from an article page.

     When the page has a ``jx_logo`` anchor with text, extraction is handed
     to ``parse_jx``. Otherwise the fields come from the ``user_info``,
     ``excerpt``, ``conBox`` and ``crumb`` elements; an absent excerpt is
     stored as ``''`` and all other lookups are unguarded.
     """
     page = HtmlXPathSelector(response)
     result = YeeyanItem()

     has_jx_logo = page.x('//a[@class="jx_logo"]/text()')
     if not has_jx_logo:
         result['url'] = response.url
         # Title: second '|'-separated part of the <title> text.
         title_parts = page.x('//title/text()').extract()[0].split('|')
         result['title'] = title_parts[1].strip()

         info_box = page.x('//div[@class="user_info"]')
         result['author'] = info_box.x('.//h2/a/text()').extract()[0]

         found_excerpts = page.x('//p[@class="excerpt"]/text()').extract()
         if found_excerpts:
             result['excerpt'] = found_excerpts[0]
         else:
             result['excerpt'] = ''

         result['content_html'] = page.x('//div[@id="conBox"]').extract()[0]
         # Stripped text with its first character and last seven removed.
         stamp = info_box.x('.//p/text()').extract()[0].strip()
         result['release_time'] = stamp[1:-7]
         # The category is the second breadcrumb link.
         result['category'] = page.x(
             '//div[@class="crumb"]/a/text()').extract()[1]
     else:
         # Pages carrying the jx logo are parsed by the dedicated routine.
         result = self.parse_jx(result, response)
     return result