Пример #1
0
 def parse_page(self, response):
     """Yield one JokeItem per sufficiently long paragraph text on the page.

     The page ``<title>`` text is passed through ``stripcats`` once and the
     result is attached to every item as ``categories``.  Each ``//p`` text
     node is passed through ``stripjokes``; results whose length is 15 or
     less are skipped.
     """
     selector = HtmlXPathSelector(response)
     title_parts = selector.select('//title/text()').extract()
     page_categories = stripcats(title_parts)

     for raw_text in selector.select('//p/text()').extract():
         cleaned = stripjokes(raw_text)
         if len(cleaned) <= 15:
             # Too short to keep.
             continue
         yield JokeItem(joke=cleaned, categories=page_categories)
Пример #2
0
 def parse_page(self, response):
     """Extract jokes from the paragraph text of a single page.

     Categories come from ``stripcats`` applied to the extracted
     ``<title>`` text and are shared by every item from this response.
     A ``JokeItem`` is yielded for each ``//p`` text node whose
     ``stripjokes`` result has a length greater than 15.
     """
     page = HtmlXPathSelector(response)
     shared_categories = stripcats(page.select('//title/text()').extract())
     paragraph_texts = page.select('//p/text()').extract()

     for text in paragraph_texts:
         candidate = stripjokes(text)
         if not len(candidate) > 15:
             continue
         yield JokeItem(joke=candidate, categories=shared_categories)
Пример #3
0
    def parse(self, response):
        """Scrape jokes from a listing page and yield the follow-up request.

        When the URL matches ``ymNN-<page>.html`` the yielded request targets
        the same URL with the page number incremented by one; for any other
        URL the request targets ``response.url`` itself.  After the request,
        one ``JokeItem`` is yielded for every ``stripjokes`` result whose
        length is greater than 15, each carrying the categories obtained by
        passing the page ``<title>`` text through ``stripcats``.
        """
        hxs = HtmlXPathSelector(response)
        current_url = str(response.url)
        m = re.search(r'ym\d{2}-(\d+)\.html', current_url)
        if m:
            next_page = int(m.group(1)) + 1
            # Cut the URL at the matched page number rather than at the first
            # '-' found anywhere in it: a hyphen in the host name or in an
            # earlier path segment would otherwise truncate the URL too early.
            next_url = current_url[:m.start(1)] + '%s.html' % next_page
            url = urlparse.urljoin(response.url, next_url)
        else:
            # NOTE(review): this re-requests the page that was just parsed.
            # Kept as in the original; confirm whether it is intentional.
            url = response.url

        self.log(url)
        yield Request(url, callback=self.parse)

        categories = stripcats(hxs.select('//title/text()').extract())

        for joke_box in hxs.select('//div[@id="Joke_box"]/text()').extract():
            # NOTE(review): text() nodes normally hold no element markup, so
            # this split presumably only matters if the page contains a
            # literal '<br><br>' in its text -- confirm against the site.
            for joke in joke_box.split('<br><br>'):
                joke = stripjokes(joke)
                if len(joke) > 15:
                    yield JokeItem(joke=joke, categories=categories)