import readee

def getArticleHtml(name, link, index_loc):
    soup = readee.export(link)
    # Narrow down to the main article container when one is present.
    funcs = [
        lambda x: x.find('div', {'property': 'articleBody'}),
        lambda x: x.find('article'),
        lambda x: x.find('div', {'id': 'story-body'}),
    ]
    for f in funcs:
        new_soup = f(soup)
        if new_soup:
            soup = new_soup
    # Demote h2 headings to h4; fact() is a project-local helper that
    # returns a BeautifulSoup object used as a tag factory for new_tag().
    for item in soup.find_all('h2'):
        new_item = fact().new_tag('h4')
        new_item.string = item.text
        item.replace_with(new_item)
    if len(soup.text) < 100:  # too short to be a real article
        return
    # '返回目录' = back to the table of contents, '原文' = original article.
    return '''
<html>
<body>
<title>%s</title>
<h1>%s</h1>
<div><a href="%s">返回目录</a></div>
%s
<div><br/><a href="%s">原文</a></div>
<div><br/><a href="%s">返回目录</a></div>
</body>
</html>
''' % (name, name, index_loc, str(soup), link, index_loc)
import readee
import cached_url

def getArticleHtml(name, link, index_loc):
    content = None
    if 'bbc' in link:
        # BBC pages are fetched through the cache, pausing between requests.
        content = cached_url.get(link, force_cache=True, sleep=5)
    args = {}
    if 'twreporter.org/' in link:
        args['toSimplified'] = True  # convert Traditional to Simplified Chinese
    soup = readee.export(link, content=content, **args)
    funcs = [
        lambda x: x.find('div', {'property': 'articleBody'}),
        lambda x: x.find('article'),
        lambda x: x.find('div', {'id': 'story-body'}),
    ]
    for f in funcs:
        new_soup = f(soup)
        if new_soup:
            soup = new_soup
    for item in soup.find_all('h2'):
        new_item = fact().new_tag('h4')
        new_item.string = item.text
        item.replace_with(new_item)
    if len(soup.text) < 100:
        return
    return '''
<html>
<body>
<title>%s</title>
<h1>%s</h1>
<div><a href="%s">返回目录</a></div>
%s
<div><br/><a href="%s">原文</a></div>
<div><br/><a href="%s">返回目录</a></div>
</body>
</html>
''' % (name, name, index_loc, str(soup), link, index_loc)
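# A minimal driver sketch for getArticleHtml above, not part of the original
# snippet: `article_links`, the output filenames, and 'index.html' as the
# index location are hypothetical, chosen only for illustration.
def buildSite(name, article_links):
    for seq, link in enumerate(article_links):
        html = getArticleHtml('%s_%d' % (name, seq), link, 'index.html')
        if not html:  # getArticleHtml returns None for near-empty articles
            continue
        with open('%s_%d.html' % (name, seq), 'w') as f:
            f.write(html)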
import os
import readee

def test():
    for url in urls:  # `urls` is defined at module level in the source project
        print('原文:', url)  # '原文' = original URL
        name = getFileName(url)
        with open(name, 'w') as f:
            f.write(str(readee.export(url, toSimplified=False)))
        print('导出:', name)  # '导出' = exported file
        os.system('open ' + name + ' -g')  # macOS: open in the background
def get(path):
    content = cached_url.get(path)
    b = readee.export(path, content=content)
    result = Result()
    result.imgs = getImgs(b)
    result.cap = getCap(b)
    result.video = getVideo(b)
    return result
import readee
import cached_url

def check(link):
    try:
        content = cached_url.get(link, force_cache=True)
    except Exception:  # fetch failed; treat the link as unusable
        return False
    soup = readee.export(link, content=content)
    # Keep only mid-length articles: between 200 and 2500 Chinese words.
    return 200 < cnWordCount(soup.text) < 2500
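# Hedged usage sketch for check() above; `candidate_links` is a hypothetical
# list of URLs, and the 200-2500 word window comes from check() itself.
def filterMidLength(candidate_links):
    return [link for link in candidate_links if check(link)]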
import readee

def crawl(name):
    url = WIKI_PREFIX + name
    content = readee.export(url)
    # Drop thumbnail captions unless they belong to a real <figcaption>.
    for item in content.find_all('div', class_='thumbcaption'):
        if item.parent.name != 'figcaption':
            item.decompose()
        else:
            item.replace_with(item.text)  # replaceWith is the deprecated alias
    with open('result/' + name + '.html', 'w') as f:
        f.write(str(content))
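# Hypothetical driver for crawl() above; WIKI_PREFIX and the page names are
# assumptions for illustration. Each page lands in result/<name>.html.
import os

os.makedirs('result', exist_ok=True)  # crawl() writes into result/
for page in ['Python', 'HTML']:
    crawl(page)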
import readee

def getArticleHtml(name, link_list, index_loc):
    result = ''
    for link in link_list:
        soup = readee.export(link)
        if len(soup.text) < 100:  # skip near-empty extractions
            continue
        result += str(soup)
    if not result:
        return
    # Note: `link` here is whatever the loop last assigned, so the
    # '原文' (original article) anchor points at the last link in link_list.
    return getHtml(name, '''
%s
<div><br/><a href="%s">原文</a></div>
''' % (result, link), index_loc)
import readee
from bs4 import BeautifulSoup
from readability import Document

def _getArticle(url, toSimplified=False, force_cache=False, noAutoConvert=False):
    content = getContent(url, force_cache=force_cache)
    soup = BeautifulSoup(_trimWebpage(content), 'html.parser')
    article_url = _findUrl(url, soup)
    doc = Document(content)
    title = _findTitle(soup, doc)
    to_simplify_calculated = calculateToSimplified(toSimplified, noAutoConvert, title)
    article = _Article(
        title,
        _findAuthor(soup),
        readee.export(url, content=content, list_replace=True,
                      toSimplified=to_simplify_calculated),
        article_url)
    if to_simplify_calculated:
        # cc is the project's Chinese text converter (Traditional -> Simplified).
        article.title = cc.convert(article.title)
        article.author = cc.convert(article.author)
    return article
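# Sketch of calling _getArticle above; the URL is a placeholder and the
# helpers it relies on (getContent, _findTitle, cc, ...) live elsewhere
# in the source module.
article = _getArticle('https://example.com/post', toSimplified=True)
print(article.title, article.author)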