Пример #1
0
def _force_unicode(s):
    """
    Return ``s`` as a unicode string, converting it first when necessary.

    A value that is already unicode is handed back untouched; bytes go
    through ``bytes_to_unicode``; anything else is passed to the unicode
    constructor, which may raise if the value cannot be converted.
    """
    if not isinstance(s, unicode_):
        # pick the converter that matches the input's type, then apply it
        to_unicode = bytes_to_unicode if isinstance(s, bytes_) else unicode_
        s = to_unicode(s)
    return s
Пример #2
0
def coerce_content_type(content, file_mode):
    """
    Make ``content`` writable to a file that was opened with ``file_mode``.

    Bytes headed for a text-mode file (mode contains ``'t'``) are converted to
    unicode, and unicode headed for a binary-mode file (mode contains ``'b'``)
    is converted to bytes. Any other combination is returned unchanged.
    """
    bytes_into_text_file = (
        't' in file_mode and isinstance(content, compat.bytes_))
    if bytes_into_text_file:
        return compat.bytes_to_unicode(content)

    unicode_into_binary_file = (
        'b' in file_mode and isinstance(content, compat.unicode_))
    if unicode_into_binary_file:
        return compat.unicode_to_bytes(content)

    return content
Пример #3
0
def coerce_content_type(content, file_mode):
    """
    Reconcile the type of ``content`` with the ``file_mode`` it will be
    written under.

    Text mode (``'t'`` in the mode) paired with bytes content yields the
    unicode form of that content; binary mode (``'b'`` in the mode) paired
    with unicode content yields its bytes form. Content that already matches
    the mode, or is of some other type, comes back as given.
    """
    coerced = content
    if 't' in file_mode and isinstance(content, compat.bytes_type):
        coerced = compat.bytes_to_unicode(content)
    elif 'b' in file_mode and isinstance(content, compat.unicode_type):
        coerced = compat.unicode_to_bytes(content)
    return coerced
Пример #4
0
    def __iter__(self):
        """
        Stream the pages of a Wikipedia articles database dump
        (*articles.xml.bz2), one (page id, page title, page content) 3-tuple
        per page.

        Pages whose ``ns`` value is not ``'0'``, and pages whose text element
        is empty, are yielded with an empty string as content.

        Yields:
            Tuple[str, str, str]: page id, title, content with wikimedia markup

        Raises:
            IOError: if ``self.filename`` is falsy
            ValueError: if the XML namespace of the first parsed element is
                not a MediaWiki export namespace
        """
        if not self.filename:
            raise IOError('{} file not found'.format(self._filename))

        # Python 2 can't open bzip in text mode :( so it reads bytes instead
        if compat.is_python2 is False:
            events, mode = ('end', ), 'rt'
        else:
            events, mode = (b'end', ), 'rb'
        dump = fileio.open_sesame(self.filename, mode=mode)

        with dump:
            parsed = iterparse(dump, events=events)
            nodes = (node for _event, node in parsed)

            # the first completed element is only used to learn the namespace
            first = next(nodes)
            found = re.match('^{(.*?)}', first.tag)
            if found:
                namespace = found.group(1)
            else:
                namespace = ''
            if not namespace.startswith(
                    'http://www.mediawiki.org/xml/export-'):
                raise ValueError(
                    'namespace "{}" not a valid MediaWiki dump namespace'.
                    format(namespace))

            # namespace-qualified tag and lookup paths, built once up front
            page_tag = '{%s}page' % namespace
            ns_path = './{%s}ns' % namespace
            page_id_path = './{%s}id' % namespace
            title_path = './{%s}title' % namespace
            text_path = './{%s}revision/{%s}text' % (namespace, namespace)

            for node in nodes:
                if node.tag != page_tag:
                    continue

                page_id = node.find(page_id_path).text
                title = node.find(title_path).text
                ns = node.find(ns_path).text

                # only pages whose ns is '0' have their text read at all
                raw = node.find(text_path).text if ns == '0' else ''

                if raw is None:
                    content = ''
                elif isinstance(raw, compat.unicode_):
                    content = raw
                else:
                    content = compat.bytes_to_unicode(raw, errors='ignore')

                yield page_id, title, content
                # clear the finished page element before parsing the next one
                node.clear()
Пример #5
0
    def __iter__(self):
        """
        Iterate over the pages of a Wikipedia articles database dump (*articles.xml.bz2),
        yielding one (page id, page title, page content) 3-tuple at a time.

        Pages whose ``ns`` value is not ``'0'``, and pages whose text element
        is empty, are yielded with an empty string as content.

        Yields:
            Tuple[str, str, str]: page id, title, content with wikimedia markup

        Raises:
            ValueError: if the XML namespace of the first parsed element is
                not a MediaWiki export namespace
        """
        if PY2 is False:
            events = ('end',)
            f = open_sesame(self.path, mode='rt')
        else:  # Python 2 can't open bzip in text mode :(
            events = (b'end',)
            f = open_sesame(self.path, mode='rb')
        with f:

            elems = (elem for _, elem in iterparse(f, events=events))

            # the first completed element is only used to learn the namespace
            elem = next(elems)
            match = re.match('^{(.*?)}', elem.tag)
            namespace = match.group(1) if match else ''
            if not namespace.startswith('http://www.mediawiki.org/xml/export-'):
                raise ValueError(
                    'namespace "{}" not a valid MediaWiki dump namespace'.format(namespace))

            # namespace-qualified tag and lookup paths, built once up front
            page_tag = '{%s}page' % namespace
            ns_path = './{%s}ns' % namespace
            page_id_path = './{%s}id' % namespace
            title_path = './{%s}title' % namespace
            text_path = './{%s}revision/{%s}text' % (namespace, namespace)

            for elem in elems:
                if elem.tag == page_tag:
                    page_id = elem.find(page_id_path).text
                    title = elem.find(title_path).text
                    ns = elem.find(ns_path).text
                    if ns != '0':
                        content = ''
                    else:
                        content = elem.find(text_path).text
                    # an empty <text/> element has ``.text`` of None, which
                    # must not be handed to ``bytes_to_unicode``
                    if content is None:
                        content = ''
                    if not isinstance(content, unicode_type):
                        content = bytes_to_unicode(content, errors='ignore')
                    yield page_id, title, content
                    # clear the finished page element before parsing the next one
                    elem.clear()