def test_fetch_page(self):
    """Fetch one article page and print its extracted body.

    Live-network smoke test: downloads the page at ``url``, cuts out the
    text between the ``<!content_tag txt>`` and ``<!/content_tag txt>``
    markers, decodes it with the charset reported by
    ``fetchpage.get_charset``, passes it through ``remove_scriptag`` and
    ``replace_url``, and prints the result.

    Fails with an AssertionError if either marker is absent from the page,
    instead of silently slicing at a ``-1`` index.
    """
    url = 'http://cn.wsj.com/gb/20100710/bus094831.asp?source=rss'
    # Everything up to and including the last '/' of the URL; when the URL
    # itself ends with '/', that trailing slash is skipped first so the
    # parent path is taken instead.
    accessed_url = url[:url.rfind('/') + 1] if url[-1] != '/' else url[:url[:-1].rfind('/') + 1]
    print(accessed_url)

    cut_content_from = '<!content_tag txt>'
    cut_content_to = '<!/content_tag txt>'

    # Close the HTTP response explicitly; urllib2 responses are not
    # context managers, hence try/finally rather than ``with``.
    resp = urllib2.urlopen(url)
    try:
        html_content = resp.read()
    finally:
        resp.close()

    charset = fetchpage.get_charset(html_content)

    # str.find returns -1 when the marker is missing, which would otherwise
    # yield a meaningless slice; fail loudly instead.
    content_tag_start = html_content.find(cut_content_from)
    assert content_tag_start != -1, \
        'start marker %r not found in %s' % (cut_content_from, url)
    content_tag_end = html_content.find(cut_content_to, content_tag_start)
    assert content_tag_end != -1, \
        'end marker %r not found in %s' % (cut_content_to, url)

    # Skip past the start marker itself so only the enclosed text is kept.
    body_start = content_tag_start + len(cut_content_from)
    content = html_content[body_start:content_tag_end].decode(charset)

    # NOTE(review): these two helpers are called unqualified, unlike
    # fetchpage.get_charset above -- confirm they are imported into this
    # module's namespace.
    content = remove_scriptag(content, 'script')
    content = replace_url(content, accessed_url)
    print(content)
def test_get_page(self):
    """Fetch one article page and print its extracted body with progress marks.

    Live-network smoke test: downloads the page at ``url``, cuts out the
    text between the ``<!content_tag txt>`` and ``<!/content_tag txt>``
    markers, decodes it (ignoring undecodable bytes) with the charset
    reported by ``fetchpage.get_charset``, passes it through
    ``fetchpage.remove_scriptag`` and ``fetchpage.replace_url``, and prints
    the result between 'finished' and 'end' lines.

    Fails with an AssertionError if either marker is absent from the page,
    instead of silently slicing at a ``-1`` index.
    """
    print('start')
    url = 'http://cn.wsj.com/gb/20100721/rth080855.asp?source=rss'

    # Close the HTTP response explicitly; urllib2 responses are not
    # context managers, hence try/finally rather than ``with``.
    resp = urllib2.urlopen(url)
    try:
        html_content = resp.read()
    finally:
        resp.close()

    cut_content_from = '<!content_tag txt>'
    cut_content_to = '<!/content_tag txt>'
    charset = fetchpage.get_charset(html_content)

    # str.find returns -1 when the marker is missing, which would otherwise
    # yield a meaningless slice; fail loudly instead.
    content_tag_start = html_content.find(cut_content_from)
    assert content_tag_start != -1, \
        'start marker %r not found in %s' % (cut_content_from, url)
    content_tag_end = html_content.find(cut_content_to, content_tag_start)
    assert content_tag_end != -1, \
        'end marker %r not found in %s' % (cut_content_to, url)

    # Skip past the start marker itself so only the enclosed text is kept.
    body_start = content_tag_start + len(cut_content_from)
    content = html_content[body_start:content_tag_end].decode(charset, 'ignore')
    content = fetchpage.remove_scriptag(content)

    # Everything up to and including the last '/' of the URL; when the URL
    # itself ends with '/', that trailing slash is skipped first so the
    # parent path is taken instead.
    accessed_url = url[:url.rfind('/') + 1] if url[-1] != '/' else url[:url[:-1].rfind('/') + 1]
    content = fetchpage.replace_url(content, accessed_url)

    print('finished')
    print(content)
    print('end')