Exemplo n.º 1
0
 def convertToText(self, article):
     """Return the top node's formatted text with empty-looking lines removed.

     The text produced by ``Parser.getFormattedText(self.topNode)`` is split
     on newlines. A line is kept only when it contains at least one
     character other than a space or a non-breaking space (U+00A0); kept
     lines are stripped and re-joined with newlines.

     ``Parser.adjustTopNode(article)`` is invoked after the text has been
     assembled and before returning.
     NOTE(review): what adjustTopNode changes is not visible here -- confirm
     that callers rely on it running after the text extraction.
     """
     formatted = Parser.getFormattedText(self.topNode)
     kept = [
         row.strip()
         for row in formatted.split(u'\n')
         if re.search('[^ \xa0]', row)
     ]
     result = u'\n'.join(kept)
     Parser.adjustTopNode(article)
     return result
Exemplo n.º 2
0
    def convertToText(self,article):
        """Build the article's plain text from the children of the top node.

        Each child of ``self.getTopNode()`` is rendered with
        ``Parser.getFormattedText``; non-empty results are HTML-unescaped,
        passed through ``innerTrim`` and joined with newlines. Every U+FFFC
        character is then replaced by a line break.

        When the text has more than four lines, the first of the leading
        four lines that equals ``article.h1`` or ``article.title`` is
        removed. Finally, lines consisting only of spaces, tabs and carriage
        returns are dropped.

        :param article: object exposing ``h1`` and ``title`` attributes.
        :return: the kept lines, each terminated by a newline (an empty
            string when no line is kept).
        """
        txts = []
        # list() snapshots the children before any of them is processed.
        for node in list(self.getTopNode()):
            txt = Parser.getFormattedText(node)
            if txt:
                # NOTE(review): if HTMLParser is the standard-library class,
                # its unescape() method no longer exists on Python 3.9+
                # (html.unescape replaces it) -- confirm before porting.
                txt = HTMLParser().unescape(txt)
                txts.append(innerTrim(txt))
        text = '\n'.join(txts)
        text = re.sub(u'[\ufffc]', '\n', text)
        lines = text.split('\n')
        # cutting title from article text if found in first 4 rows
        if len(lines) > 4:
            for i in range(4):
                if lines[i] == article.h1 or lines[i] == article.title:
                    # Only the first matching line is removed; breaking right
                    # away keeps the index loop safe after the deletion.
                    del lines[i]
                    break
        # Assemble the result in one pass instead of repeated concatenation.
        return ''.join(
            line + '\n' for line in lines if re.search('[^ \t\r]', line)
        )