def parse_item(self, response):
    """Yield one ``NuggetItem`` per content element found on the page.

    Elements are selected through the node(s) whose ``id`` is
    ``mw-content-text``: headings (``h2``-``h4``), paragraphs, ordered and
    unordered lists and ``pre`` blocks below it, plus every ``h1`` in the
    document.

    Args:
        response: The response to scrape; it must expose an ``xpath()``
            method returning selectors (as Scrapy responses do).

    Yields:
        NuggetItem: ``content`` holds the serialized markup of one matched
        element; ``title`` holds the list of text nodes of the page's
        ``<title>`` element(s).
    """
    # The page title does not depend on the element being visited, so it is
    # queried once here instead of once per yielded item.
    page_title = response.xpath('//title/text()').extract()

    scope = response.xpath('//*[@id="mw-content-text"]')
    # NOTE(review): '//h1' is an absolute path, so it matches <h1> elements
    # anywhere in the document, not only inside ``scope``; the remaining
    # './/' branches are relative to ``scope``. This looks deliberate (the
    # main heading presumably sits outside the content node) -- confirm.
    for sel in scope.xpath('//h1 | .//h2 | .//h3 | .//h4 | .//p | .//ol | .//ul | .//pre'):
        item = NuggetItem()
        item['content'] = sel.extract()
        # Give every item its own list so that mutating one item's title
        # downstream cannot affect the others.
        item['title'] = list(page_title)
        yield item
def parse_start_url(self, response):
    """Yield one ``NuggetItem`` per content element found on the page.

    Elements are selected through the node(s) whose ``id`` is
    ``mw-content-text``: headings (``h2``-``h4``), paragraphs, ordered and
    unordered lists and ``pre`` blocks below it, plus every ``h1`` in the
    document.

    Args:
        response: The response to scrape; it must expose an ``xpath()``
            method returning selectors (as Scrapy responses do).

    Yields:
        NuggetItem: ``content`` holds the serialized markup of one matched
        element; ``title`` holds the list of text nodes of the page's
        ``<title>`` element(s).
    """
    # The page title does not depend on the element being visited, so it is
    # queried once here instead of once per yielded item.
    page_title = response.xpath('//title/text()').extract()

    scope = response.xpath('//*[@id="mw-content-text"]')
    # NOTE(review): '//h1' is an absolute path, so it matches <h1> elements
    # anywhere in the document, not only inside ``scope``; the remaining
    # './/' branches are relative to ``scope``. This looks deliberate (the
    # main heading presumably sits outside the content node) -- confirm.
    #
    # Alternative selector, if only the intro paragraphs are wanted:
    # '//h2[1]/preceding-sibling::p'
    for sel in scope.xpath('//h1 | .//h2 | .//h3 | .//h4 | .//p | .//ol | .//ul | .//pre'):
        item = NuggetItem()
        item['content'] = sel.extract()
        # Give every item its own list so that mutating one item's title
        # downstream cannot affect the others.
        item['title'] = list(page_title)
        yield item
def parse_start_url(self, response):
    """Yield one ``NuggetItem`` per heading, paragraph, list or ``pre`` block.

    Unlike a scoped search, every XPath branch here is absolute, so
    ``h1``, ``h2``, ``p``, ``pre``, ``ul`` and ``ol`` elements are matched
    anywhere in the document.

    Args:
        response: The response to scrape; it must expose an ``xpath()``
            method returning selectors (as Scrapy responses do).

    Yields:
        NuggetItem: ``content`` holds the serialized markup of one matched
        element; ``title`` holds the list of text nodes of the page's
        ``<title>`` element(s).
    """
    # The page title does not depend on the element being visited, so it is
    # queried once here instead of once per yielded item.
    page_title = response.xpath('//title/text()').extract()

    for sel in response.xpath('//h1 | //h2 | //p | //pre | //ul | //ol'):
        item = NuggetItem()
        item['content'] = sel.extract()
        # Give every item its own list so that mutating one item's title
        # downstream cannot affect the others.
        item['title'] = list(page_title)
        yield item