Пример #1
0
    def convert(self, filename, encoding=None, mimetype=None):
        """Read ``filename`` and return its html2text output as UTF-8.

        filename -- path of the file to read.
        encoding -- character set used to decode the file content.  When
                    it is not given, group 1 of the first ``charset_reg``
                    match in the raw content is used; if nothing matches,
                    UTF-8 is assumed.
        mimetype -- accepted but not used by this method.

        Returns a ``(stream, encoding)`` tuple: a ``StringIO.StringIO``
        holding the UTF-8 encoded result, and the string "utf-8".
        """
        # XXX: dont read entire file into memory
        # The with-block closes the handle even when read() raises.
        with open(filename, "r") as fp:
            doc = fp.read()

        # convert to unicode
        if not encoding:
            mo = charset_reg.search(doc)
            # search() returns None when the content declares no charset;
            # calling mo.group() unguarded would raise AttributeError.
            encoding = mo.group(1) if mo else "UTF-8"
        # "replace" substitutes undecodable bytes instead of raising.
        doc = unicode(doc, encoding, "replace")
        doc = convert_entities(doc)
        # NOTE(review): entities are converted a second time on the
        # html2text output -- presumably html2text can leave some behind;
        # confirm against its implementation.
        result = convert_entities(html2text(doc))

        # convert back to utf-8
        return StringIO.StringIO(result.encode("utf-8")), "utf-8"
Пример #2
0
    def convert(self, filename, encoding=None, mimetype=None):
        """Read ``filename`` and return its html2text output as UTF-8.

        filename -- path of the file to read.
        encoding -- character set used to decode the file content.  When
                    it is not given, group 1 of the first ``charset_reg``
                    match in the raw content is used, or UTF-8 if the
                    pattern does not match.
        mimetype -- accepted but not used by this method.

        Returns a ``(stream, encoding)`` tuple: a ``StringIO.StringIO``
        holding the UTF-8 encoded result, and the string 'utf-8'.
        """
        # XXX: dont read entire file into memory
        # Read inside a with-block so the file handle is always closed
        # rather than left for the garbage collector.
        with open(filename, 'r') as fp:
            doc = fp.read()

        # convert to unicode
        if not encoding:
            mo = charset_reg.search(doc)
            if mo:
                encoding = mo.group(1)
            else:
                encoding = 'UTF-8'  # UTF-8 is the new ASCII
        # 'replace' substitutes undecodable bytes instead of raising.
        doc = unicode(doc, encoding, 'replace')
        doc = convert_entities(doc)
        # NOTE(review): convert_entities runs again on the html2text
        # output -- presumably html2text can leave entities behind; confirm.
        result = convert_entities(html2text(doc))

        # convert back to utf-8
        return StringIO.StringIO(result.encode('utf-8')), 'utf-8'
Пример #3
0
    def convert(self, filename, encoding=None, mimetype=None):
        """Decode the file at ``filename``, run it through html2text and
        return the result re-encoded as UTF-8.

        filename -- path of the file to read.
        encoding -- character set for decoding.  If falsy, it is taken
                    from group 1 of a ``charset_reg`` match on the raw
                    content, defaulting to UTF-8 when there is no match.
        mimetype -- accepted but not used by this method.

        Returns a 2-tuple of a ``StringIO.StringIO`` over the UTF-8
        encoded text and the string 'utf-8'.
        """
        # XXX: dont read entire file into memory
        # The handle is closed as soon as the content has been read
        # (or if read() raises), instead of being leaked.
        with open(filename, 'r') as source:
            doc = source.read()

        # convert to unicode
        if not encoding:
            mo = charset_reg.search(doc)
            if mo:
                encoding = mo.group(1)
            else:
                encoding = 'UTF-8'  # UTF-8 is the new ASCII
        # Undecodable bytes are replaced rather than raising an error.
        doc = unicode(doc, encoding, 'replace')
        doc = convert_entities(doc)
        # NOTE(review): entity conversion is applied both before and
        # after html2text; the reason is not visible here -- confirm.
        result = convert_entities(html2text(doc))

        # convert back to utf-8
        return StringIO.StringIO(result.encode('utf-8')), 'utf-8'
Пример #4
0
    def convert(self, filename, encoding, mimetype):
        """Read ``filename``, feed it to a StripTagParser and return the
        outcome as a UTF-8 stream.

        filename -- path of the file to read.
        encoding -- character set used to decode the content.  It is
                    overridden by group 1 of an ``encoding_reg`` match on
                    the raw content, and replaced by ``default_encoding``
                    when it is still falsy after that.
        mimetype -- accepted but not used by this method.

        Returns a ``(stream, encoding)`` tuple: a ``StringIO.StringIO``
        built from the parser, and the string 'utf-8'.
        """

        # XXX: dont read entire file into memory
        # The with-block closes the handle even when read() raises.
        with open(filename, 'r') as fp:
            doc = fp.read()

        # Use encoding from XML preamble if present
        # (a match takes precedence over the caller-supplied encoding)
        mo = encoding_reg.search(doc)
        if mo:
            encoding = mo.group(1)

        if not encoding:
            encoding = default_encoding

        if not isinstance(doc, unicode):
            # 'replace' substitutes undecodable bytes instead of raising.
            doc = unicode(doc, encoding, 'replace')
        doc = convert_entities(doc)
        # The parser is fed UTF-8 encoded bytes, not a unicode object.
        doc = doc.encode('utf-8')
        p = StripTagParser()
        p.feed(doc)
        p.close()
        # NOTE(review): the parser object itself is passed to StringIO,
        # which relies on StringIO coercing it with str() -- confirm that
        # StripTagParser.__str__ returns the collected text.
        return StringIO.StringIO(p), 'utf-8'
Пример #5
0
    def convert(self, filename, encoding, mimetype):
        """Run the content of ``filename`` through a StripTagParser and
        return the result as a UTF-8 stream.

        filename -- path of the file to read.
        encoding -- character set for decoding.  Group 1 of an
                    ``encoding_reg`` match on the raw content replaces
                    it; ``default_encoding`` is used if it is still
                    falsy afterwards.
        mimetype -- accepted but not used by this method.

        Returns a 2-tuple of a ``StringIO.StringIO`` built from the
        parser and the string 'utf-8'.
        """

        # XXX: dont read entire file into memory
        # Read inside a with-block so the file handle is always closed
        # rather than left for the garbage collector.
        with open(filename, 'r') as source:
            doc = source.read()

        # Use encoding from XML preamble if present
        # (this wins over whatever the caller passed in)
        mo = encoding_reg.search(doc)
        if mo:
            encoding = mo.group(1)

        if not encoding:
            encoding = default_encoding

        if not isinstance(doc, unicode):
            # Undecodable bytes are replaced rather than raising an error.
            doc = unicode(doc, encoding, 'replace')
        doc = convert_entities(doc)
        # Hand the parser UTF-8 encoded bytes rather than unicode.
        doc = doc.encode('utf-8')
        p = StripTagParser()
        p.feed(doc)
        p.close()
        # NOTE(review): StringIO receives the parser instance, not a
        # string; this depends on StringIO calling str() on it -- confirm
        # that StripTagParser defines __str__ accordingly.
        return StringIO.StringIO(p), 'utf-8'
Пример #6
0
def extract_text_from_html(text):
    """Return ``text`` after entity conversion and html2text processing.

    A non-unicode ``text`` is first decoded as UTF-8, with undecodable
    bytes replaced.  The value then goes through ``convert_entities``,
    ``html2text`` and ``convert_entities`` again, and the result is
    returned with leading and trailing whitespace stripped.
    """
    markup = text if isinstance(text, unicode) else unicode(text, 'utf-8', 'replace')
    decoded = convert_entities(markup)
    plain = html2text(decoded)
    return convert_entities(plain).strip()
Пример #7
0
def extract_text_from_html(text):
    """Pass ``text`` through html2text and return the stripped result.

    Byte strings are decoded as UTF-8 (undecodable bytes are replaced);
    unicode input is used as given.  ``convert_entities`` is applied both
    before and after ``html2text``, and surrounding whitespace is removed
    from the final value.
    """
    if isinstance(text, unicode):
        source = text
    else:
        source = unicode(text, 'utf-8', 'replace')

    without_markup = html2text(convert_entities(source))
    cleaned = convert_entities(without_markup)
    return cleaned.strip()