def auto_excerpt(self):
    """
    Extract the readable text of this page as a list of paragraph strings.

    The page's HTML is parsed into a tree, and the extraction strategy is
    picked from flags on ``self.seed``:

    * ``rss_full_entry`` -- convert the tree straight to a paragraph list.
    * ``strip_noise`` -- first run ``strip_template`` against the tree of
      the companion page (presumably to drop markup the two pages share;
      confirm against ``ebdata.templatemaker.clean``).
    * ``guess_article_text`` -- use ``article_text`` instead of the plain
      paragraph-list conversion.
    """
    # Imports are kept local and lazy so that only the modules needed for
    # the chosen strategy are loaded.
    from ebdata.textmining.treeutils import make_tree

    page_tree = make_tree(self.html)

    # Full RSS entries skip noise stripping and article guessing entirely.
    if self.seed.rss_full_entry:
        from ebdata.templatemaker.textlist import html_to_paragraph_list
        return html_to_paragraph_list(page_tree)

    if self.seed.strip_noise:
        from ebdata.templatemaker.clean import strip_template
        try:
            companion_html = self.companion_page().html
        except IndexError:
            # No companion page could be found; leave the tree untouched.
            pass
        else:
            strip_template(page_tree, make_tree(companion_html))

    if self.seed.guess_article_text:
        from ebdata.templatemaker.articletext import article_text
        return article_text(page_tree)

    from ebdata.templatemaker.textlist import html_to_paragraph_list
    return html_to_paragraph_list(page_tree)
def auto_excerpt(self):
    """
    Return this page's text as a list of strings, one per paragraph.

    Behaviour is driven by three flags on ``self.seed``: when
    ``rss_full_entry`` is set the parsed HTML is converted directly;
    otherwise ``strip_noise`` optionally applies ``strip_template`` using
    the companion page, and ``guess_article_text`` selects
    ``article_text`` over the plain paragraph-list conversion.
    """
    from ebdata.textmining.treeutils import make_tree

    doc = make_tree(self.html)

    if not self.seed.rss_full_entry:
        if self.seed.strip_noise:
            from ebdata.templatemaker.clean import strip_template
            try:
                other_html = self.companion_page().html
            except IndexError:
                # companion_page() raised IndexError (presumably nothing to
                # compare against -- verify); continue with the raw tree.
                pass
            else:
                other_doc = make_tree(other_html)
                strip_template(doc, other_doc)

        if self.seed.guess_article_text:
            from ebdata.templatemaker.articletext import article_text
            return article_text(doc)

    # Reached for full RSS entries and for pages where article guessing is
    # disabled: both use the plain paragraph-list conversion.
    from ebdata.templatemaker.textlist import html_to_paragraph_list
    return html_to_paragraph_list(doc)
def assertConverts(self, html, expected):
    """
    Assert that ``html`` converts to the paragraph list ``expected``.

    The markup is parsed with ``make_tree`` and the resulting tree is fed
    to ``html_to_paragraph_list``; the output must equal ``expected``.
    """
    parsed = make_tree(html)
    paragraphs = html_to_paragraph_list(parsed)
    self.assertEqual(paragraphs, expected)
def assertConverts(self, html, expected):
    """
    Check that running ``html`` through ``make_tree`` and then
    ``html_to_paragraph_list`` yields exactly ``expected``.
    """
    actual = html_to_paragraph_list(make_tree(html))
    self.assertEqual(first=actual, second=expected)