Пример #1
0
 def parse_item(self, response):
     """Yield a PageItem holding the page's text nodes and its URL.

     ``text`` is stored as the list of strings selected by ``//text()``.
     """
     text_nodes = response.xpath('//text()').extract()
     item = PageItem()
     item['text'] = text_nodes
     item['url'] = response.url
     yield item
Пример #2
0
 def parse_page(self, response):
     """Log the visited URL and yield a PageItem with the joined page text."""
     page_url = response.url
     # Marker line followed by the URL, both at INFO level.
     self.logger.info('xue---jun------')
     self.logger.info(page_url)
     text_nodes = response.xpath('//text()').extract()
     item = PageItem()
     item['url'] = page_url
     item['text'] = ' '.join(text_nodes)
     yield item
Пример #3
0
 def parse_page(self, response):
     """Yield a PageItem with the page URL and its markup-stripped text.

     Every tag in the response body is replaced by a space with a regular
     expression, then each run of whitespace is collapsed to one space.
     """
     item = PageItem()
     item['url'] = response.url
     # Raw strings keep the backslash in ``\s`` from being parsed as an
     # (invalid) string escape, which newer Python versions warn about.
     without_tags = re.sub(r'<[^>]*>', ' ', response.text)
     item['text'] = re.sub(r'\s+', ' ', without_tags)
     yield item
Пример #4
0
 def parse_page(self, response):
     """Print debug markers and yield a PageItem with URL and joined text."""
     print('-------------------------')
     item = PageItem()
     item['url'] = response.url
     # Echo the value actually stored on the item.
     print('-----------', item['url'])
     text_nodes = response.xpath('//text()').extract()
     item['text'] = ' '.join(text_nodes)
     yield item
Пример #5
0
    def parse_page(self, response):
        """Print the page URL and yield a PageItem for the body ``div``.

        ``text`` holds the first ``//div[@class="body"]`` match as returned
        by ``extract_first()`` (``None`` when nothing matches).
        """
        page_url = response.url
        print(page_url)

        body_div = response.xpath('//div[@class="body"]').extract_first()
        item = PageItem()
        item['url'] = str(page_url)
        item['text'] = body_div
        # NOTE: a regex-based strip of tags and whitespace was drafted here
        # earlier and left disabled; the raw extracted value is stored.
        yield item
Пример #6
0
 def parse_page(self, response):
     """Yield a PageItem with the page URL and the list of all text nodes."""
     text_nodes = response.xpath('//text()').extract()
     item = PageItem()
     # A narrower extraction scoped to div#user-s-guide was drafted here
     # earlier and left disabled; the whole page is used instead.
     item['url'] = response.url
     item['text'] = text_nodes
     yield item
Пример #7
0
 def parse_page(self, response):
     """Yield a PageItem with the page URL and the tag-stripped main content.

     The first ``div[@role="main"]`` element is extracted as markup and its
     tags are removed with a regular expression. Pages without such an
     element yield an empty ``text`` instead of raising ``TypeError``.
     """
     # ``extract_first()`` returns None when nothing matches and ``re.sub``
     # rejects None, so ask for an empty-string default instead.
     main_html = response.xpath('//div[@role="main"]').extract_first(
         default='')
     yield PageItem({
         'url': response.url,
         'text': re.sub(r'<.*?>', '', main_html),
     })
Пример #8
0
 def parse_item(self, response):
     """Yield a PageItem describing the current page.

     Fields:
         url: URL of the current page.
         text: text of the current page, all text nodes joined by spaces.
     """
     item = PageItem()
     item['url'] = response.url
     # Join every text node. ``extract_first()`` returns only the first node
     # as one string, so joining it would put a space between each of its
     # characters (and raise TypeError when the page has no text nodes).
     item['text'] = ' '.join(response.xpath('//text()').extract())
     yield item
Пример #9
0
 def parse_page(self, response):
     """Yield a PageItem with the page URL and the space-joined page text."""
     text_nodes = response.xpath('//text()').extract()
     joined_text = ' '.join(text_nodes)
     item = PageItem()
     item['url'] = response.url
     item['text'] = joined_text
     yield item
Пример #10
0
    def parse_page(self, response):
        """Yield a PageItem with the page URL and the text under ``div.body``.

        ``text`` is the list of strings selected by ``div.body *::text``.
        """
        body_text_nodes = response.css('div.body *::text').extract()
        item = PageItem()
        item['url'] = response.url
        item['text'] = body_text_nodes
        yield item
Пример #11
0
    def parse_page(self, response):
        """Yield a PageItem with the page URL and its concatenated text nodes.

        The text nodes are joined with no separator between them.
        """
        text_nodes = response.xpath('//text()').extract()
        fields = {
            'url': response.url,
            'text': ''.join(text_nodes),
        }
        yield PageItem(fields)
Пример #12
0
 def parse_page(self, response):
     """Yield a PageItem with the page URL and newline-joined page text."""
     text_nodes = response.css('::text').extract()
     item = PageItem()
     item['url'] = response.url
     item['text'] = '\n'.join(text_nodes)
     # A disabled draft iterated over div.body and appended the stripped
     # first text node of each match, one per line.
     yield item
Пример #13
0
    def parse_item(self, response):
        """Count the page, print the running counter and yield a PageItem.

        ``content`` holds at most the first 250 characters of the page's
        space-joined text nodes.
        """
        self.index += 1
        banner = "********************* {}  ************************"
        print(banner.format(self.index))

        page_text = ' '.join(response.xpath('//text()').extract())
        item = PageItem()
        item['url'] = str(response.url)
        item['content'] = page_text[0:250]

        yield item
Пример #14
0
    def parse_page(self, response):
        """Yield a PageItem with the page URL and space-joined page text."""
        text_nodes = response.xpath('//text()').extract()
        item = PageItem()
        item['url'] = response.url
        item['text'] = ' '.join(text_nodes)
        # A disabled alternative collected only the h1/h2 headings and the
        # paragraphs of the nested div.section blocks inside div.body.
        yield item
Пример #15
0
    def page_parse(self, response):
        """Yield a PageItem with the page URL and space-joined page text."""
        page_text = ' '.join(response.xpath('//text()').extract())
        item = PageItem()
        item['url'] = response.url
        item['text'] = page_text
        yield item
Пример #16
0
 def parse_page(self, response):
     """Yield a PageItem carrying the page URL and ``response.text``."""
     page_url, page_source = response.url, response.text
     item = PageItem()
     item['url'] = page_url
     item['text'] = page_source
     yield item
Пример #17
0
 def parse_page(self, response):
     """Print the item's type and yield a PageItem for ``div.body``.

     ``text`` holds the first ``div.body`` match as returned by
     ``extract_first()`` (``None`` when nothing matches).
     """
     item = PageItem()
     print(type(item))
     body_div = response.css('div.body').extract_first()
     item['url'] = response.url
     item['text'] = body_div
     yield item
Пример #18
0
 def parse_page(self, response):
     """Yield a PageItem with the page URL and the extracted ``html`` nodes.

     ``text`` is the list returned by extracting the ``html`` CSS selection.
     """
     html_nodes = response.css('html').extract()
     item = PageItem()
     item['url'] = response.url
     item['text'] = html_nodes
     yield item
Пример #19
0
 def parse_page(self, response):
     """Yield a PageItem with the page URL and the first body child ``div``.

     ``text`` holds the first ``//div[@class='body']/div`` match as returned
     by ``extract_first()`` (``None`` when nothing matches).
     """
     first_section = response.xpath("//div[@class='body']/div").extract_first()
     item = PageItem()
     item['url'] = str(response.url)
     item['text'] = first_section
     yield item
Пример #20
0
 def parse_page(self, response):
     """Yield a PageItem with the page URL and space-joined page text."""
     # ``//`` searches the whole document, so ``//text()`` selects every
     # text node on the page.
     text_nodes = response.xpath('//text()').extract()
     item = PageItem()
     item['url'] = response.url
     item['text'] = ' '.join(text_nodes)
     yield item
Пример #21
0
 def parse_page(self, response):
     """Return a PageItem with the page URL and the list of all text nodes."""
     text_nodes = response.xpath('//text()').extract()
     item = PageItem()
     item['url'] = response.url
     item['text'] = text_nodes
     return item