def parse_item(self, response):
    """Yield a PageItem holding the page URL and all of its text nodes.

    The ``text`` field is the raw list returned by the ``//text()``
    XPath query (one string per text node, not joined).
    """
    item = PageItem()
    item['text'] = response.xpath('//text()').extract()
    item['url'] = response.url
    yield item
def parse_page(self, response):
    """Yield a PageItem with the page URL and its space-joined text.

    ``text`` is every text node in the document (``//text()``) joined
    with single spaces; note this includes <script>/<style> contents.
    """
    # Removed leftover ad-hoc debug lines ('xue---jun------'); keep one
    # lazily-formatted debug record of the URL being parsed instead.
    self.logger.debug('parsing %s', response.url)
    item = PageItem()
    item['url'] = response.url
    item['text'] = ' '.join(response.xpath('//text()').extract())
    yield item
def parse_page(self, response):
    """Yield a PageItem whose ``text`` is the page HTML with tags stripped.

    Tags are removed with a regex and runs of whitespace are collapsed
    to single spaces.  NOTE(review): regex tag-stripping keeps the
    contents of <script>/<style> blocks — confirm that is acceptable.
    """
    item = PageItem()
    item['url'] = response.url
    # Raw strings for the patterns: '\s' in a plain literal is an
    # invalid escape (DeprecationWarning since 3.6, an error in newer
    # Python); r'...' is the correct regex form.
    no_tags = re.sub(r'<[^>]*>', ' ', response.text)
    item['text'] = re.sub(r'\s+', ' ', no_tags)
    yield item
def parse_page(self, response):
    """Yield a PageItem with the page URL and space-joined text nodes."""
    # Replaced leftover print() debugging with a lazy logger call
    # (Scrapy spiders expose self.logger).
    self.logger.debug('parse_page: %s', response.url)
    item = PageItem()
    item['url'] = response.url
    item['text'] = ' '.join(response.xpath('//text()').extract())
    yield item
def parse_page(self, response):
    """Yield a PageItem with the URL and the first ``div.body`` HTML.

    ``extract_first`` returns None when no ``div[@class="body"]``
    exists; default to '' so downstream pipelines never receive a None
    text field.
    """
    item = PageItem()
    item['url'] = str(response.url)
    item['text'] = response.xpath('//div[@class="body"]').extract_first(default='')
    yield item
def parse_page(self, response):
    """Yield a PageItem: page URL plus the raw list of all text nodes."""
    page = PageItem()
    page['url'] = response.url
    page['text'] = response.xpath('//text()').extract()
    yield page
def parse_page(self, response):
    """Yield a PageItem with the URL and tag-stripped main-content text.

    The first ``div[@role="main"]`` element's HTML is taken and the
    tags are removed with a regex.
    """
    # extract_first() is None when the page has no div[@role="main"];
    # re.sub(..., None) would raise TypeError, so fall back to ''.
    main_html = response.xpath('//div[@role="main"]').extract_first() or ''
    yield PageItem({
        'url': response.url,
        'text': re.sub(r'<.*?>', '', main_html),
    })
def parse_item(self, response):
    """Yield a PageItem with the page URL and its space-joined text.

    url:  URL of the current page
    text: all text nodes of the page, joined with single spaces
    """
    item = PageItem()
    item['url'] = response.url
    # Bug fix: the original joined extract_first() — a single string —
    # which inserts a space between EVERY character.  Join the full
    # extract() list of text nodes instead (matches the sibling
    # implementations in this file).
    item['text'] = ' '.join(response.xpath('//text()').extract())
    yield item
def parse_page(self, response):
    """Yield a PageItem: page URL and all text nodes joined by spaces."""
    page = PageItem()
    page['url'] = response.url
    page['text'] = ' '.join(response.xpath('//text()').extract())
    yield page
def parse_page(self, response):
    """Yield a PageItem: page URL plus the list of text nodes under div.body."""
    result = PageItem()
    result['url'] = response.url
    result['text'] = response.css('div.body *::text').extract()
    yield result
def parse_page(self, response):
    """Yield a PageItem whose ``text`` is every text node concatenated
    (no separator between nodes)."""
    full_text = ''.join(response.xpath('//text()').extract())
    yield PageItem({
        'url': response.url,
        'text': full_text,
    })
def parse_page(self, response):
    """Yield a PageItem: page URL plus newline-joined text of every node."""
    page = PageItem()
    page['url'] = response.url
    page['text'] = '\n'.join(response.css('::text').extract())
    yield page
def parse_item(self, response):
    """Yield a PageItem with the URL and the first 250 chars of page text.

    Also counts pages parsed so far in ``self.index``.
    NOTE(review): this callback stores the snippet under ``content``
    while sibling spiders use ``text`` — confirm the PageItem field
    name before unifying.
    """
    self.index += 1
    # Leftover banner print() replaced by a lazy debug log.
    self.logger.debug('page #%d: %s', self.index, response.url)
    item = PageItem()
    item['url'] = str(response.url)
    item['content'] = ' '.join(response.xpath('//text()').extract())[:250]
    yield item
def parse_page(self, response):
    """Yield a PageItem: page URL and all text nodes joined by spaces."""
    page = PageItem()
    page['url'] = response.url
    page['text'] = ' '.join(response.xpath('//text()').extract())
    yield page
def page_parse(self, response):
    """Yield a PageItem: page URL plus space-joined text of every node."""
    result = PageItem()
    result['url'] = response.url
    result['text'] = ' '.join(response.xpath('//text()').extract())
    yield result
def parse_page(self, response):
    """Yield a PageItem holding the URL and the full decoded page body
    (``response.text``, HTML markup included)."""
    page = PageItem()
    page['url'] = response.url
    page['text'] = response.text
    yield page
def parse_page(self, response):
    """Yield a PageItem: URL plus the first ``div.body`` element's HTML.

    NOTE(review): extract_first() returns None when nothing matches
    ``div.body`` — downstream code must tolerate a None text field.
    """
    # Removed leftover `print(type(item))` debugging.
    item = PageItem()
    item['url'] = response.url
    item['text'] = response.css('div.body').extract_first()
    yield item
def parse_page(self, response):
    """Yield a PageItem: the URL plus the serialized ``html`` element(s)
    as a list of HTML strings."""
    page = PageItem()
    page['url'] = response.url
    page['text'] = response.css('html').extract()
    yield page
def parse_page(self, response):
    """Yield a PageItem: the URL plus the raw HTML of the first child
    ``div`` under ``div.body`` (None when absent)."""
    page = PageItem()
    page['url'] = str(response.url)
    page['text'] = response.xpath("//div[@class='body']/div").extract_first()
    yield page
def parse_page(self, response):
    """Yield a PageItem with the page URL and space-joined document text."""
    page = PageItem()
    page['url'] = response.url
    # '//' searches the whole document, so '//text()' matches every
    # text node on the page.
    page['text'] = ' '.join(response.xpath('//text()').extract())
    yield page
def parse_page(self, response): item = PageItem() item['url'] = response.url #item['text1']=response.xpath('//*/text()').re('(\S+)') item['text'] = response.xpath('//text()').extract() return item