        r = re.compile(r'''\n+''', re.M | re.S)
        content = r.sub('\n', content)
        content = Utils.transform_coding(content.strip('\n'))
        return content

    def parse_response(self, response, item):
        html = response.content
        tree = lxml.etree.HTML(html)
        try:
            item.title = self.extract_title(tree)
            item.publishedtime = self.extract_publishedtime(tree)
            item.content = self.extract_content(tree)
        except Exception as e:
            item.urlhash = None


if __name__ == '__main__':
    item = Item()
    url = 'http://finance.huanqiu.com/roll/2015-05/6422166.html'
    response = requests.get(url)
    e = HuanqiuExtract()
    e.parse_response(response, item)
    print item.title
    print item.publishedtime
    print item.content
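# --- Illustrative sketch (not part of the original module) ---
# The fragment above relies on an Item container and a Utils.transform_coding helper
# that are defined elsewhere in the project and imported at the top of the real module.
# The definitions below are an assumed minimal stand-in, shown only to make the
# fragment readable in isolation; the real classes may carry extra fields and use
# different decoding logic.
class Item(object):
    """Assumed result holder: one attribute per extracted field."""
    def __init__(self):
        self.url = None
        self.urlhash = None
        self.title = None
        self.publishedtime = None
        self.content = None


class Utils(object):
    @staticmethod
    def transform_coding(text):
        # Assumption: normalise scraped bytes to unicode, ignoring bad sequences.
        if isinstance(text, str):
            return text.decode('utf-8', 'ignore')
        return text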
elif re.match("http://weibo.com/p/.+mod=recommand_article.*", url): # 长微博 se = SinaWeiboArticleExtract() se.parse_response(response, item) elif re.match("http://weibo.com/p/.+from=huati_thread.*", url): # 话题 item.urlhash = None else: item.urlhash = None if __name__ == '__main__': item = Item() url = "http://weibo.com/1389537561/BgPtMatcA?mod=weibotime&type=comment" # url = "http://weibo.com/p/1001593739605047367339?from=singleweibo&mod=recommand_article" # url = "http://weibo.com/p/100808d93f8a84e207ec12b9514f1f97a051cb?k=%E4%BA%91%E5%8D%97%E9%B2%81%E7%94%B8%E5%8E%BF6.5%E7%BA%A7%E5%9C%B0%E9%9C%87&from=huati_thread" response = requests.get(url) response.url = url item.url = url s = SinaWeiboExtract() s.parse_response(response, item) print item.url print item.publishedtime print item.title