def parse_text(self, response):
    """Extract the title, URL and cleaned body text of a page.

    The body text is gathered from the text nodes of every non-``<style>``
    element beneath ``form[name='form124904a']`` and concatenated.

    Args:
        response: The response for the page being scraped.

    Yields:
        JimeiItem: One item with ``url``, ``title`` and ``content`` set.
    """
    selector = scrapy.Selector(response)

    # extract_first() instead of extract()[0]: a page without a <title>
    # produces an empty title rather than raising IndexError and losing
    # the whole item.
    title = selector.xpath("//title/text()").extract_first(default=u'')

    # Page content: every text node under the form, skipping <style> elements.
    fragments = selector.css(
        "form[name='form124904a'] *:not(style)::text").extract()
    content = "".join(fragments)
    # Remove CRLF line breaks, non-breaking spaces, plain spaces and quotes,
    # in this order.
    for unwanted in (u'\r\n', u'\xa0', ' ', '\'', '\"'):
        content = content.replace(unwanted, '')

    item = JimeiItem()
    item['url'] = response.url  # page link
    item['title'] = title
    item['content'] = content
    yield item
def parse_text(self, response):
    """Extract the title, URL and cleaned body text of a page.

    The body text is gathered from the text nodes of every non-``<style>``
    element beneath ``div[id='vsb_content']`` and concatenated.

    Args:
        response: The response for the page being scraped.

    Yields:
        JimeiItem: One item with ``url``, ``title`` and ``content`` set.
    """
    selector = scrapy.Selector(response)

    # extract_first() instead of extract()[0]: a page without a <title>
    # produces an empty title rather than raising IndexError and losing
    # the whole item.
    title = selector.xpath("//title/text()").extract_first(default=u'')

    # Page content: every text node under the div, skipping <style> elements.
    fragments = selector.css(
        "div[id='vsb_content'] *:not(style)::text").extract()
    content = "".join(fragments)
    # Remove CRLF line breaks, non-breaking spaces, plain spaces and quotes,
    # in this order.
    for unwanted in (u'\r\n', u'\xa0', ' ', '\'', '\"'):
        content = content.replace(unwanted, '')

    item = JimeiItem()
    item['url'] = response.url  # page link
    item['title'] = title
    item['content'] = content
    yield item