Exemplo n.º 1
0
 def processWebpage(self, webpage, dump_xhtml=False):
     """Run ``webpage`` through the processing pipeline and register it.

     Steps, in order: make sure ``webpage.tree`` exists (built with
     ``webpage._get_parse_tree()`` when missing), remap links, annotate
     and clean the tree with a fresh ``TreeProcessor``, serialize a
     shallow copy of the tree into ``webpage.xml``, and add the page to
     ``self.container``.

     :param webpage: page object to process; it is mutated in place.
     :param dump_xhtml: when true, return the serialized article and
         leave ``webpage.tree`` / ``webpage.xml`` attached to the page.
     :returns: ``webpage.xml`` if ``dump_xhtml`` is true, else ``None``.
     """
     from copy import copy

     # Build the parse tree only when the page does not carry one yet.
     if not hasattr(webpage, 'tree'):
         webpage.tree = webpage._get_parse_tree()

     self.remapLinks(webpage)

     # A new processor is created per page and kept on the instance.
     self.tree_processor = TreeProcessor()
     processor = self.tree_processor
     # NOTE: the getMetaInfo(webpage) step is currently disabled.
     processor.annotateNodes(webpage)
     processor.clean(webpage)

     # Serialize from a shallow copy of the tree rather than the tree
     # object itself.
     tree_copy = copy(webpage.tree)
     webpage.xml = self.serializeArticle(tree_copy)
     self.container.addArticle(webpage)

     if not dump_xhtml:
         # Drop the tree and serialized output from the page once the
         # container has received it.
         del webpage.tree, webpage.xml
         return None

     return webpage.xml