Example #1
def _force_unicode(s):
    """Force ``s`` into unicode, or die trying."""
    if isinstance(s, unicode_):
        return s
    elif isinstance(s, bytes_):
        return bytes_to_unicode(s)
    else:
        return unicode_(s)
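
A minimal usage sketch (an assumption, not part of the original module): on Python 3 the compat names used above would resolve to the built-in types, and bytes_to_unicode would decode UTF-8 bytes.

# Assumed Python 3 values for the compat names used above.
unicode_ = str
bytes_ = bytes

def bytes_to_unicode(b, encoding='utf-8', errors='strict'):
    # Hypothetical helper matching the call above: decode bytes to unicode.
    return b.decode(encoding, errors)

# With those aliases in scope, _force_unicode normalizes any input to unicode:
_force_unicode('abc')            # already unicode -> returned unchanged
_force_unicode(b'caf\xc3\xa9')   # bytes -> decoded to 'café'
_force_unicode(42)               # anything else -> coerced to '42'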
Example #2
    def _parse_content(self, content, parser):
        wikicode = parser.parse(content)
        parsed_page = {'sections': []}

        wikilinks = [unicode_(wc.title) for wc in wikicode.ifilter_wikilinks()]
        parsed_page['categories'] = [
            wc for wc in wikilinks if wc.startswith('Category:')
        ]
        parsed_page['wiki_links'] = [
            wc for wc in wikilinks if not wc.startswith('Category:')
            and not wc.startswith('File:') and not wc.startswith('Image:')
        ]
        parsed_page['ext_links'] = [
            unicode_(wc.url) for wc in wikicode.ifilter_external_links()
        ]

        def _filter_tags(obj):
            return obj.tag == 'ref' or obj.tag == 'table'

        bad_section_titles = {'external links', 'notes', 'references'}
        section_idx = 0

        for section in wikicode.get_sections(flat=True,
                                             include_lead=True,
                                             include_headings=True):
            headings = section.filter_headings()
            sec = {'idx': section_idx}

            if section_idx == 0 or len(headings) == 1:
                try:
                    sec_title = unicode_(headings[0].title)
                    if sec_title.lower() in bad_section_titles:
                        continue
                    sec['title'] = sec_title
                    sec['level'] = int(headings[0].level)
                except IndexError:
                    if section_idx == 0:
                        sec['level'] = 1
                # strip out references, tables, and file/image links
                for obj in section.ifilter_tags(matches=_filter_tags,
                                                recursive=True):
                    try:
                        section.remove(obj)
                    except Exception:
                        continue
                for obj in section.ifilter_wikilinks(recursive=True):
                    try:
                        obj_title = unicode_(obj.title)
                        if obj_title.startswith(('File:', 'Image:')):
                            section.remove(obj)
                    except Exception:
                        pass
                sec['text'] = unicode_(
                    section.strip_code(normalize=True, collapse=True)).strip()
                if sec.get('title'):
                    sec['text'] = re.sub(
                        r'^' + re.escape(sec['title']) + r'\s*', '',
                        sec['text'])
                parsed_page['sections'].append(sec)
                section_idx += 1

            # dammit! the parser has failed us; let's handle it as best we can
            elif len(headings) > 1:
                titles = [unicode_(h.title).strip() for h in headings]
                levels = [int(h.level) for h in headings]
                sub_sections = [
                    unicode_(ss) for ss in re.split(
                        r'\s*' +
                        '|'.join(re.escape(unicode_(h)) for h in headings) +
                        r'\s*', unicode_(section))
                ]
                # re.split leaves an empty string result up front :shrug:
                if sub_sections[0] == '':
                    del sub_sections[0]
                if len(headings) != len(sub_sections):
                    LOGGER.warning('# headings = %s, but # sections = %s',
                                   len(headings), len(sub_sections))
                for i, sub_section in enumerate(sub_sections):
                    try:
                        if titles[i].lower() in bad_section_titles:
                            continue
                        parsed_page['sections'].append({
                            'title': titles[i],
                            'level': levels[i],
                            'idx': section_idx,
                            'text': strip_markup(sub_section),
                        })
                        section_idx += 1
                    except IndexError:
                        continue

        return parsed_page
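
The method above only touches the Wikicode API of the `parser` argument (parse, ifilter_wikilinks, ifilter_external_links, get_sections, filter_headings, ifilter_tags, strip_code), which matches mwparserfromhell; it also assumes module-level `re`, a `LOGGER`, and a `strip_markup` helper from its own module. A small self-contained sketch of those parser calls, assuming mwparserfromhell is the parser passed in:

import mwparserfromhell

wikitext = (
    "Lead text with a [[wiki link]] and [[Category:Examples]].\n"
    "\n"
    "== History ==\n"
    "Some text with an [http://example.com external link].\n"
)

wikicode = mwparserfromhell.parse(wikitext)
print([str(wl.title) for wl in wikicode.ifilter_wikilinks()])
# -> ['wiki link', 'Category:Examples']
print([str(el.url) for el in wikicode.ifilter_external_links()])
# -> ['http://example.com']

for section in wikicode.get_sections(flat=True, include_lead=True,
                                     include_headings=True):
    headings = section.filter_headings()
    title = str(headings[0].title).strip() if headings else '<lead>'
    print(title, repr(section.strip_code(normalize=True, collapse=True).strip()))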
Example #3
    def test_repr(self, corpus):
        repr = compat.unicode_(corpus)
        assert repr.startswith("Corpus")
        assert all("{}".format(n) in repr
                   for n in [corpus.n_docs, corpus.n_tokens])
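
This test assumes `from textacy import compat` plus a pytest `corpus` fixture providing a textacy Corpus. A self-contained stand-in (not textacy's real Corpus class) showing what the assertions check:

import pytest

class FakeCorpus:
    # Stand-in exposing the two attributes the test reads (n_docs, n_tokens)
    # and a repr that starts with "Corpus" and embeds both counts.
    n_docs = 2
    n_tokens = 17

    def __repr__(self):
        return "Corpus({} docs, {} tokens)".format(self.n_docs, self.n_tokens)

@pytest.fixture
def corpus():
    return FakeCorpus()

def test_repr(corpus):
    text = repr(corpus)
    assert text.startswith("Corpus")
    assert all("{}".format(n) in text
               for n in [corpus.n_docs, corpus.n_tokens])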