예제 #1
0
        r = re.compile(r'''\n+''', re.M | re.S)
        content = r.sub('\n', content)

        content = Utils.transform_coding(content.strip('\n'))
        return content


    def parse_response(self, response, item):
        """Populate ``item`` with the fields extracted from a fetched page.

        ``response.content`` is parsed into an lxml HTML tree, which is then
        handed to ``extract_title``, ``extract_publishedtime`` and
        ``extract_content``; their results are stored on ``item.title``,
        ``item.publishedtime`` and ``item.content`` in that order.

        If any extractor raises, the error is logged and ``item.urlhash`` is
        set to ``None`` instead of propagating the exception.  Attributes
        assigned before the failing extractor keep their new values.

        :param response: object whose ``content`` attribute holds the raw HTML.
        :param item: object that receives the extracted attributes.
        :return: ``None``; ``item`` is modified in place.
        """
        # Function-scope import so this method does not depend on the file's
        # import block already providing ``logging``.
        import logging

        html = response.content
        tree = lxml.etree.HTML(html)

        try:
            item.title = self.extract_title(tree)
            item.publishedtime = self.extract_publishedtime(tree)
            item.content = self.extract_content(tree)
        except Exception:
            # Best-effort extraction: keep the original "do not raise" contract,
            # but record the traceback so failures can be diagnosed.
            logging.getLogger(__name__).warning(
                'field extraction failed for %r',
                getattr(response, 'url', None),
                exc_info=True)
            # NOTE(review): clearing urlhash presumably tells the caller to
            # discard this item -- confirm against the code that consumes it.
            item.urlhash = None



if __name__ == '__main__':
    # Manual smoke test: download one article page, run it through
    # HuanqiuExtract.parse_response and print the extracted fields.
    item = Item()
    url = 'http://finance.huanqiu.com/roll/2015-05/6422166.html'
    response = requests.get(url)

    e = HuanqiuExtract()
    e.parse_response(response, item)

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and is valid as the Python 3 print function.
    print(item.title)
    print(item.publishedtime)
    print(item.content)
예제 #2
0
        elif re.match("http://weibo.com/p/.+mod=recommand_article.*", url):  # 长微博
            se = SinaWeiboArticleExtract()
            se.parse_response(response, item)

        elif re.match("http://weibo.com/p/.+from=huati_thread.*", url):  # 话题
            item.urlhash = None

        else:
            item.urlhash = None




if __name__ == '__main__':
    # Manual smoke test: download one Weibo page, run it through
    # SinaWeiboExtract.parse_response and print the extracted fields.
    item = Item()

    url = "http://weibo.com/1389537561/BgPtMatcA?mod=weibotime&type=comment"
    # Alternative sample URLs; swap one in to try a different kind of page.
    # url = "http://weibo.com/p/1001593739605047367339?from=singleweibo&mod=recommand_article"
    # url = "http://weibo.com/p/100808d93f8a84e207ec12b9514f1f97a051cb?k=%E4%BA%91%E5%8D%97%E9%B2%81%E7%94%B8%E5%8E%BF6.5%E7%BA%A7%E5%9C%B0%E9%9C%87&from=huati_thread"

    response = requests.get(url)
    # Overwrite response.url with the URL that was requested, so the parser
    # sees exactly this string rather than whatever requests recorded.
    response.url = url
    item.url = url

    s = SinaWeiboExtract()
    s.parse_response(response, item)

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and is valid as the Python 3 print function.
    print(item.url)
    print(item.publishedtime)
    print(item.title)