Exemplo n.º 1
0
 def convertToText(self):
     """Build a plain-text rendering of the top node.

     The top node's own ``text`` is used first, but only when it holds at
     least one character other than space, tab, CR or LF. The text of
     each direct child (as returned by ``Parser.getText``) follows, with
     empty results skipped. Every kept fragment is HTML-unescaped and
     passed through ``innerTrim``.

     Returns the fragments joined by blank lines ('\\n\\n'); an empty
     string when nothing was kept.
     """
     fragments = []

     def keep(raw):
         # Unescape HTML entities first, then hand the result to innerTrim.
         fragments.append(innerTrim(HTMLParser().unescape(raw)))

     lead = self.getTopNode().text
     if lead and re.search('[^ \t\r\n]', lead):
         keep(lead)

     # Iterate over a snapshot of the children, as the original did.
     for child in list(self.getTopNode()):
         child_text = Parser.getText(child)
         if child_text:
             keep(child_text)

     return '\n\n'.join(fragments)
Exemplo n.º 2
0
 def getTextAndWriteToFile(self, node):
     """Return the trimmed text of ``node`` and dump its fragments to ``log.txt``.

     Every string yielded by ``node.itertext()`` is written, with no
     separator, to the relative path ``log.txt`` (opened in 'wb' mode
     with UTF-8 encoding, so previous content is overwritten).

     Returns the fragments joined by single spaces, stripped, and passed
     through ``innerTrim``.
     """
     txts = list(node.itertext())
     # The context manager closes the file even when a write raises; the
     # previous open()/close() pair leaked the handle on that path.
     with codecs.open("log.txt", 'wb', 'utf-8') as f:
         for line in txts:
             f.write(line)
     return innerTrim(u' '.join(txts).strip())
Exemplo n.º 3
0
 def convertToText(self):
     """Join the cleaned text of the top node's children with blank lines.

     For each direct child of ``self.getTopNode()``, the text returned by
     ``Parser.getText`` is skipped when falsy; otherwise it is
     HTML-unescaped and passed through ``innerTrim``.

     Returns the kept fragments joined by '\\n\\n'.
     """
     children = list(self.getTopNode())
     # Lazy pipeline: each child is fetched, filtered and cleaned in turn.
     raw_texts = (Parser.getText(child) for child in children)
     return '\n\n'.join(
         innerTrim(HTMLParser().unescape(raw)) for raw in raw_texts if raw
     )
Exemplo n.º 4
0
 def convertToText(self):
     """Return the text of the top node's children, one paragraph per child.

     Children whose ``Parser.getText`` result is falsy contribute
     nothing. Every other result is HTML-unescaped, passed through
     ``innerTrim`` and collected; the collected strings are joined with
     '\\n\\n'.
     """
     cleaned = []
     for child in list(self.getTopNode()):
         raw = Parser.getText(child)
         if not raw:
             continue  # nothing to contribute for this child
         cleaned.append(innerTrim(HTMLParser().unescape(raw)))
     return '\n\n'.join(cleaned)
Exemplo n.º 5
0
 def convert_to_text(self):
     """Return the text of the top node's children as blank-line-separated pieces.

     Each child's text comes from ``self.parser.getText``; falsy results
     are skipped. The rest is HTML-unescaped, passed through
     ``innerTrim`` and then split into pieces. All pieces are joined
     with '\\n\\n'.
     """
     pieces = []
     for child in list(self.get_top_node()):
         raw = self.parser.getText(child)
         if not raw:
             continue
         trimmed = innerTrim(HTMLParser().unescape(raw))
         # Raw-string separator: this splits on a literal backslash
         # followed by "n", not on a newline character.
         # NOTE(review): presumably a marker inserted upstream - confirm.
         pieces += trimmed.split(r'\n')
     return '\n\n'.join(pieces)
Exemplo n.º 6
0
 def convert_to_text(self):
     """Return the text of the top node's children joined by ``<br/>``.

     Each child's text comes from ``self.parser.getText``; falsy results
     are skipped. The rest is HTML-unescaped, passed through
     ``innerTrim`` and split into parts, and every part is joined with
     the literal string '<br/>'.
     """
     raw_texts = (
         self.parser.getText(child) for child in list(self.get_top_node())
     )
     # The raw-string separator is a literal backslash followed by "n",
     # not a newline character.
     # NOTE(review): presumably a marker inserted upstream - confirm.
     parts = [
         part
         for raw in raw_texts
         if raw
         for part in innerTrim(HTMLParser().unescape(raw)).split(r'\n')
     ]
     return '<br/>'.join(parts)
Exemplo n.º 7
0
    def convertToText(self, article):
        """Render the top node's children as newline-terminated text lines.

        Each child's formatted text (``Parser.getFormattedText``) is
        HTML-unescaped, passed through ``innerTrim`` and joined with
        newlines. U+FFFC characters are then replaced by newlines and the
        result is split into lines.

        When there are more than four lines, the first of the leading
        four lines that equals ``article.h1`` or ``article.title`` is
        removed. Lines made up only of spaces, tabs and carriage returns
        (or empty lines) are dropped.

        Returns the remaining lines, each followed by '\\n', as a single
        string; empty when no line remains.
        """
        txts = []
        for node in list(self.getTopNode()):
            txt = Parser.getFormattedText(node)
            if txt:
                txt = HTMLParser().unescape(txt)
                txts.append(innerTrim(txt))
        text = '\n'.join(txts)
        # This line used to be tab-indented inside a space-indented body,
        # which Python 3 rejects with TabError.
        text = re.sub(u'[\ufffc]', '\n', text)
        lines = text.split('\n')
        # Cut the title from the article text if found in the first 4 rows.
        if len(lines) > 4:
            for i in range(4):
                if lines[i] == article.h1 or lines[i] == article.title:
                    # Safe to delete while indexing: we stop right after.
                    del lines[i]
                    break
        # Single join instead of repeated string concatenation in a loop.
        return ''.join(
            line + '\n' for line in lines if re.search('[^ \t\r]', line)
        )
Exemplo n.º 8
0
 def getText(self, node):
     """Return the text of ``node`` as one trimmed string.

     All strings yielded by ``node.itertext()`` are joined with single
     spaces, stripped of surrounding whitespace and passed through
     ``innerTrim``.
     """
     joined = u' '.join(node.itertext())
     return innerTrim(joined.strip())
Exemplo n.º 9
0
 def getText(self, node):
     """Collect every text fragment under ``node`` into one trimmed string.

     Fragments come from ``node.itertext()``; they are space-joined, the
     result is stripped, and ``innerTrim`` is applied to it.
     """
     fragments = list(node.itertext())
     spaced = u' '.join(fragments)
     stripped = spaced.strip()
     return innerTrim(stripped)
Exemplo n.º 10
0
 def clean(self, node):
     """Serialise ``node`` as HTML, clean it and return the trimmed result.

     The node is turned into a string by ``self.parser.nodeToString``
     with ``method='html'``, passed through ``self.clean_html`` and
     finally through ``innerTrim``.
     """
     markup = self.parser.nodeToString(node, method='html')
     return innerTrim(self.clean_html(markup))
Exemplo n.º 11
0
 def getTextAndShowInConsole(self, node):
     """Print the text fragments of ``node`` and return them as one trimmed string.

     The list of strings yielded by ``node.itertext()`` is printed to
     standard output. The return value is those strings joined by single
     spaces, stripped, and passed through ``innerTrim``.
     """
     txts = list(node.itertext())
     # Parenthesised form: parses on Python 3, and with a single argument
     # prints exactly what the old ``print txts`` statement did on Python 2.
     print(txts)
     return innerTrim(u' '.join(txts).strip())
Exemplo n.º 12
0
 def getText(self, node):
     """Return the space-joined text of ``node`` after ``innerTrim``.

     The strings yielded by ``node.itertext()`` are joined with single
     spaces and the joined string is handed to ``innerTrim`` as is (no
     separate strip step here).
     """
     fragments = list(node.itertext())
     spaced = u" ".join(fragments)
     return innerTrim(spaced)