Python determineEncodingLenient示例

编程语言: Python

命名空间/包名称: minds.encode_tools

方法/功能: determineEncodingLenient

hotexamples.com的示例: 3

Python determineEncodingLenient - 共找到3个示例。以下是从开源项目中提取的、评价最高的 minds.encode_tools.determineEncodingLenient 实际使用示例（Python）。您可以为示例评分，帮助我们提高示例质量。

示例#1

显示文件

文件： distillML.py 项目： BackupTheBerlios/mindretrieve-svn

def distill(rstream, wstream, meta):
    """ Parse the HTML doc rstream. Determine its character encoding.
        Distill the document into wstream and fill in meta.

        The first 32768 bytes of rstream are read ahead and handed to
        preparse_filter() and to the encoding detection; rstream is then
        rewound to offset 0 before it is parsed.

        @params rstream - the input stream (seek(0) is called on it)
        @params wstream - the distilled output stream (utf8 encoded)
        @params meta - Add the title, description and keywords fields while parsing.
                       Also add encoding (for diagnosis)
                       Other meta data like uri and timestamp is supplied by caller.
        @returns - 0 means accepted. Otherwise a tuple of reason code and an explanation string.

        NOTE(review): an earlier wording of this docstring said "Return buf"
        and "Return False if not" (indexed), which disagrees with the @returns
        line above. Only the beginning of the function is visible in this
        excerpt, so the accepted-path return value could not be confirmed here.
    """

    # Read ahead a block for filtering and encoding detection, then rewind.
    first_block = rstream.read(32768)
    rstream.seek(0)                                           # network stream would not support seek!?

    # A truthy result from preparse_filter() is handed straight back to the
    # caller (presumably a rejection tuple as described in @returns - confirm).
    result = preparse_filter(first_block, meta)
    if result:
        return result

    # Detect the encoding from meta and the read-ahead block; 'source' names
    # where the encoding came from and is also needed to pick the reader.
    encoding, source = encode_tools.determineEncodingLenient(meta, first_block)
    Reader = encode_tools.getreader(encoding, source)
    # Both codecs use the 'replace' error handler instead of the default strict one.
    reader = Reader(rstream, 'replace')
    writer = codecs.getwriter('utf8')(wstream,'replace')

    # Record the detected encoding and its source for diagnosis.
    meta['encoding'] = '%s [%s]' % (encoding, source)

    formatter = Formatter(writer)
    try:
        has_html, has_frameset, has_common_tag = process(reader, formatter, meta)
    except sgmllib.SGMLParseError, e:
        # Python 2 except syntax; a parse failure is reported as a reason tuple.
        return (PARSE_ERROR, 'SGMLParseError: %s' % str(e)) # SGMLParseError

示例#2

显示文件

文件： distillML.py 项目： BackupTheBerlios/mindretrieve-svn

def distillTxt(rstream, wstream, meta):
    """ Counterpart of distill() for the text/plain media type.

        The first 8192 bytes of rstream are read ahead for preparse_filter(),
        then rstream is rewound and copied in full to wstream, preceded by a
        header written by writeHeader().

        @params rstream - the input stream (seek(0) is called on it)
        @params wstream - the output stream (meta data + content of rstream, utf8 encoded)
        @params meta - meta data like uri and timestamp is supplied by caller.
                       No title or description defined for plain text.
                       The 'encoding' entry is filled in here.
        @returns - 0 means accepted. Otherwise the truthy result of preparse_filter()
                   (documented as a tuple of reason code and an explanation string).
    """

    # Peek at the head of the stream for filtering, then rewind for the full copy.
    head = rstream.read(8192)
    rstream.seek(0)

    verdict = preparse_filter(head, meta)
    if verdict:
        return verdict

    # An empty string is passed in place of document content here.
    charset, origin = encode_tools.determineEncodingLenient(meta, '')

    # Decode the input with the detected charset and re-encode it as utf8;
    # both sides use the 'replace' error handler.
    decoded_in = encode_tools.getreader(charset, origin)(rstream, 'replace')
    make_utf8_writer = codecs.getwriter('utf8')
    utf8_out = make_utf8_writer(wstream, 'replace')

    # Keep the detected charset and where it came from for diagnosis.
    meta['encoding'] = '%s [%s]' % (charset, origin)

    # Header first, then the whole body of the input.
    writeHeader(utf8_out, meta)
    shutil.copyfileobj(decoded_in, utf8_out)

    return 0

示例#3

显示文件

文件： test_encode_tools.py 项目： BackupTheBerlios/mindretrieve-svn

    def test_determine_lenient(self):
        """ determineEncodingLenient() falls back to the default for a bad charset. """

        # controlled test: the strict variant reports the charset from the
        # content-type header as is
        strict_headers = {'content-type': 'text/html; charset=bad'}
        strict_outcome = encode_tools.determineEncoding(strict_headers, '')
        self.assertEqual(('bad', encode_tools.HTTP_CONTENT_TYPE), strict_outcome)

        # the lenient variant returns the default instead of the bad charset
        lenient_headers = {'content-type': 'text/html; charset=bad'}
        lenient_outcome = encode_tools.determineEncodingLenient(lenient_headers, '')
        self.assertEqual(('iso-8859-1', encode_tools.DEFAULT), lenient_outcome)