예제 #1
0
 def parse_text(self, response):
     """Extract the title, URL and cleaned body text of a page.

     The body text is gathered from the text nodes of every descendant
     element, other than ``<style>`` elements, of the form named
     ``form124904a``.

     Yields:
         A single ``JimeiItem`` with ``url``, ``title`` and ``content``
         set. ``title`` is an empty string when the page has no
         ``<title>`` text (previously this raised ``IndexError``).
     """
     selector = scrapy.Selector(response)
     # A page without <title> text gives an empty list; fall back to an
     # empty title instead of indexing into it and losing the whole page.
     titles = selector.xpath("//title/text()").extract()
     title = titles[0] if titles else u''
     fragments = selector.css(
         "form[name='form124904a'] *:not(style)::text").extract()
     content = "".join(fragments)
     # Remove CRLF pairs, non-breaking spaces, plain spaces and both quote
     # characters, in this order. Lone CR or LF characters are kept.
     for unwanted in (u'\r\n', u'\xa0', u' ', u'\'', u'"'):
         content = content.replace(unwanted, u'')
     item = JimeiItem()
     item['url'] = response.url
     item['title'] = title
     item['content'] = content
     yield item
예제 #2
0
 def parse_text(self, response):
     """Extract the title, URL and cleaned body text of a page.

     The body text is gathered from the text nodes of every descendant
     element, other than ``<style>`` elements, of the ``div`` whose id is
     ``vsb_content``.

     Yields:
         A single ``JimeiItem`` with ``url``, ``title`` and ``content``
         set. ``title`` is an empty string when the page has no
         ``<title>`` text (previously this raised ``IndexError``).
     """
     selector = scrapy.Selector(response)
     # A page without <title> text gives an empty list; fall back to an
     # empty title instead of indexing into it and losing the whole page.
     titles = selector.xpath("//title/text()").extract()
     title = titles[0] if titles else u''
     fragments = selector.css(
         "div[id='vsb_content'] *:not(style)::text").extract()
     content = "".join(fragments)
     # Remove CRLF pairs, non-breaking spaces, plain spaces and both quote
     # characters, in this order. Lone CR or LF characters are kept.
     for unwanted in (u'\r\n', u'\xa0', u' ', u'\'', u'"'):
         content = content.replace(unwanted, u'')
     item = JimeiItem()
     item['url'] = response.url
     item['title'] = title
     item['content'] = content
     yield item