Exemplo n.º 1
0
 def auto_excerpt(self):
     """
     Attempts to detect the text of this page (ignoring all navigation and
     other clutter), returning a list of strings. Each string represents a
     paragraph.
     """
     from ebdata.textmining.treeutils import make_tree
     tree = make_tree(self.html)
     if self.seed.rss_full_entry:
         from ebdata.templatemaker.textlist import html_to_paragraph_list
         paras = html_to_paragraph_list(tree)
     else:
         if self.seed.strip_noise:
             from ebdata.templatemaker.clean import strip_template
             try:
                 html2 = self.companion_page().html
             except IndexError:
                 pass
             else:
                 tree2 = make_tree(html2)
                 strip_template(tree, tree2)
         if self.seed.guess_article_text:
             from ebdata.templatemaker.articletext import article_text
             paras = article_text(tree)
         else:
             from ebdata.templatemaker.textlist import html_to_paragraph_list
             paras = html_to_paragraph_list(tree)
     return paras
Exemplo n.º 2
0
 def auto_excerpt(self):
     """
     Attempts to detect the text of this page (ignoring all navigation and
     other clutter), returning a list of strings. Each string represents a
     paragraph.
     """
     from ebdata.textmining.treeutils import make_tree
     tree = make_tree(self.html)
     if self.seed.rss_full_entry:
         from ebdata.templatemaker.textlist import html_to_paragraph_list
         paras = html_to_paragraph_list(tree)
     else:
         if self.seed.strip_noise:
             from ebdata.templatemaker.clean import strip_template
             try:
                 html2 = self.companion_page().html
             except IndexError:
                 pass
             else:
                 tree2 = make_tree(html2)
                 strip_template(tree, tree2)
         if self.seed.guess_article_text:
             from ebdata.templatemaker.articletext import article_text
             paras = article_text(tree)
         else:
             from ebdata.templatemaker.textlist import html_to_paragraph_list
             paras = html_to_paragraph_list(tree)
     return paras
Exemplo n.º 3
0
 def assertConverts(self, html, expected):
     self.assertEqual(html_to_paragraph_list(make_tree(html)), expected)
Exemplo n.º 4
0
 def assertConverts(self, html, expected):
     self.assertEqual(html_to_paragraph_list(make_tree(html)), expected)