Python bytes_to_unicode примеры использования

Язык программирования: Python

Пространство имен/Пакет: textacy.compat

Метод/Функция: bytes_to_unicode

Примеров на hotexamples.com: 5

Python bytes_to_unicode - 5 примеров найдено. Это лучшие примеры Python кода для textacy.compat.bytes_to_unicode, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

Файл: similarity.py Проект: dchllngr/textacy

def _force_unicode(s):
    """
    Return ``s`` as a ``unicode_`` value.

    A value that is already ``unicode_`` is returned unchanged; a ``bytes_``
    value goes through ``bytes_to_unicode``; anything else is handed to the
    ``unicode_`` constructor.
    """
    if not isinstance(s, unicode_):
        # pick the conversion that matches the input type, then apply it once
        convert = bytes_to_unicode if isinstance(s, bytes_) else unicode_
        s = convert(s)
    return s

Пример #2

Показать файл

Файл: utils.py Проект: winstonewert/textacy

def coerce_content_type(content, file_mode):
    """
    Make ``content`` writable through a file opened with ``file_mode``.

    Bytes destined for a text-mode file ('t' in ``file_mode``) are converted
    with ``compat.bytes_to_unicode``; unicode destined for a bytes-mode file
    ('b' in ``file_mode``) is converted with ``compat.unicode_to_bytes``.
    Any other combination returns ``content`` untouched.
    """
    converter = None
    if 't' in file_mode and isinstance(content, compat.bytes_):
        converter = compat.bytes_to_unicode
    elif 'b' in file_mode and isinstance(content, compat.unicode_):
        converter = compat.unicode_to_bytes

    # no converter selected means content and mode are already compatible
    return content if converter is None else converter(content)

Пример #3

Показать файл

Файл: utils.py Проект: chartbeat-labs/textacy

def coerce_content_type(content, file_mode):
    """
    Reconcile the type of ``content`` with the ``file_mode`` it will be
    written under.

    Bytes content with a text mode ('t') is converted via
    ``compat.bytes_to_unicode``; unicode content with a bytes mode ('b') is
    converted via ``compat.unicode_to_bytes``. Content that does not hit
    either mismatch is returned as given.
    """
    text_mismatch = 't' in file_mode and isinstance(content, compat.bytes_type)
    if text_mismatch:
        return compat.bytes_to_unicode(content)

    bytes_mismatch = 'b' in file_mode and isinstance(content, compat.unicode_type)
    if bytes_mismatch:
        return compat.unicode_to_bytes(content)

    return content

Пример #4

Показать файл

Файл: wikipedia.py Проект: rsesha/textacy

    def __iter__(self):
        """
        Stream the pages of a Wikipedia articles database dump
        (*articles.xml.bz2) as (page id, page title, page content) 3-tuples.

        Pages whose ``ns`` element is not '0' are yielded with empty content,
        as are pages whose text element is empty.

        Yields:
            Tuple[str, str, str]: page id, title, content with wikimedia markup

        Raises:
            IOError: if ``self.filename`` is falsy
            ValueError: if the namespace taken from the first parsed element
                does not start with the MediaWiki export prefix
        """
        if not self.filename:
            raise IOError('{} file not found'.format(self._filename))

        # Python 2 can't open bzip in text mode :( so it reads bytes instead
        text_mode = compat.is_python2 is False
        parse_events = ('end', ) if text_mode else (b'end', )
        dump = fileio.open_sesame(
            self.filename, mode='rt' if text_mode else 'rb')

        with dump:
            nodes = (node for _, node in iterparse(dump, events=parse_events))

            # the first completed element carries the dump's XML namespace
            # in its tag, formatted as "{namespace}name"
            first = next(nodes)
            ns_match = re.match('^{(.*?)}', first.tag)
            namespace = ''
            if ns_match:
                namespace = ns_match.group(1)
            if not namespace.startswith(
                    'http://www.mediawiki.org/xml/export-'):
                raise ValueError(
                    'namespace "{}" not a valid MediaWiki dump namespace'.
                    format(namespace))

            page_tag = '{%s}page' % namespace
            ns_path = './{%s}ns' % namespace
            page_id_path = './{%s}id' % namespace
            title_path = './{%s}title' % namespace
            text_path = './{%s}revision/{%s}text' % (namespace, namespace)

            for node in nodes:
                if node.tag != page_tag:
                    continue

                page_id = node.find(page_id_path).text
                title = node.find(title_path).text
                ns = node.find(ns_path).text
                content = node.find(text_path).text if ns == '0' else ''

                if content is None:
                    content = ''
                elif not isinstance(content, compat.unicode_):
                    content = compat.bytes_to_unicode(content,
                                                      errors='ignore')
                yield page_id, title, content
                # clear the page element once the consumer has its values
                node.clear()

Пример #5

Показать файл

Файл: wiki_reader.py Проект: chartbeat-labs/textacy

    def __iter__(self):
        """
        Iterate over the pages of a Wikipedia articles database dump (*articles.xml.bz2),
        yielding one (page id, page title, page content) 3-tuple at a time.

        Pages whose ``ns`` element is not '0' are yielded with empty content.
        A page whose text element is empty (``.text`` is ``None``) is also
        yielded with empty content instead of being passed to
        ``bytes_to_unicode``.

        Yields:
            Tuple[str, str, str]: page id, title, content with wikimedia markup

        Raises:
            ValueError: if the namespace taken from the first parsed element
                does not start with the MediaWiki export prefix
        """
        if PY2 is False:
            events = ('end',)
            f = open_sesame(self.path, mode='rt')
        else:  # Python 2 can't open bzip in text mode :(
            events = (b'end',)
            f = open_sesame(self.path, mode='rb')
        with f:

            elems = (elem for _, elem in iterparse(f, events=events))

            # the first completed element carries the dump's XML namespace
            # in its tag, formatted as "{namespace}name"
            elem = next(elems)
            match = re.match('^{(.*?)}', elem.tag)
            namespace = match.group(1) if match else ''
            if not namespace.startswith('http://www.mediawiki.org/xml/export-'):
                raise ValueError(
                    'namespace "{}" not a valid MediaWiki dump namespace'.format(namespace))

            page_tag = '{%s}page' % namespace
            ns_path = './{%s}ns' % namespace
            page_id_path = './{%s}id' % namespace
            title_path = './{%s}title' % namespace
            text_path = './{%s}revision/{%s}text' % (namespace, namespace)

            for elem in elems:
                if elem.tag == page_tag:
                    page_id = elem.find(page_id_path).text
                    title = elem.find(title_path).text
                    ns = elem.find(ns_path).text
                    if ns != '0':
                        content = ''
                    else:
                        content = elem.find(text_path).text
                    # an empty <text/> element has .text == None; normalize
                    # it before the unicode coercion below
                    if content is None:
                        content = ''
                    if not isinstance(content, unicode_type):
                        content = bytes_to_unicode(content, errors='ignore')
                    yield page_id, title, content
                    # clear the page element once the consumer has its values
                    elem.clear()