def _force_unicode(s):
    """Coerce ``s`` into unicode text: pass unicode through, decode bytes,
    and stringify anything else."""
    if isinstance(s, unicode_):
        return s
    # bytes get decoded via the compat helper; everything else is stringified
    return bytes_to_unicode(s) if isinstance(s, bytes_) else unicode_(s)
def _parse_content(self, content, parser):
    """Parse raw wiki markup into a structured page dict.

    Args:
        content: raw wiki markup text for one page.
        parser: a parser object exposing ``.parse()`` — presumably
            ``mwparserfromhell`` (TODO confirm against caller).

    Returns:
        dict with keys ``'categories'``, ``'wiki_links'``, ``'ext_links'``,
        and ``'sections'`` (a list of dicts with ``'idx'``, ``'text'``, and,
        when available, ``'title'`` and ``'level'``).
    """
    wikicode = parser.parse(content)
    parsed_page = {'sections': []}

    # Partition internal wikilinks: "Category:" links are categories;
    # remaining links minus "File:"/"Image:" media links are wiki_links.
    wikilinks = [unicode_(wc.title) for wc in wikicode.ifilter_wikilinks()]
    parsed_page['categories'] = [
        wc for wc in wikilinks if wc.startswith('Category:')]
    parsed_page['wiki_links'] = [
        wc for wc in wikilinks
        if not wc.startswith('Category:')
        and not wc.startswith('File:')
        and not wc.startswith('Image:')]
    parsed_page['ext_links'] = [
        unicode_(wc.url) for wc in wikicode.ifilter_external_links()]

    def _filter_tags(obj):
        # Matches only <ref> and <table> tags, which get stripped below.
        return obj.tag == 'ref' or obj.tag == 'table'

    # Sections with these (lowercased) titles carry no body prose worth keeping.
    bad_section_titles = {'external links', 'notes', 'references'}
    section_idx = 0

    for section in wikicode.get_sections(
            flat=True, include_lead=True, include_headings=True):
        headings = section.filter_headings()
        sec = {'idx': section_idx}

        # Normal case: the lead section (idx 0, possibly heading-less)
        # or a section with exactly one heading.
        if section_idx == 0 or len(headings) == 1:
            try:
                sec_title = unicode_(headings[0].title)
                if sec_title.lower() in bad_section_titles:
                    # skip the whole section without bumping section_idx
                    continue
                sec['title'] = sec_title
                sec['level'] = int(headings[0].level)
            except IndexError:
                # no headings at all — only expected for the lead section
                if section_idx == 0:
                    sec['level'] = 1
            # strip out references, tables, and file/image links
            for obj in section.ifilter_tags(matches=_filter_tags, recursive=True):
                try:
                    section.remove(obj)
                except Exception:
                    # best-effort removal; a failed remove is tolerated
                    continue
            for obj in section.ifilter_wikilinks(recursive=True):
                try:
                    obj_title = unicode_(obj.title)
                    if obj_title.startswith('File:') or obj_title.startswith('Image:'):
                        section.remove(obj)
                except Exception:
                    pass
            sec['text'] = unicode_(
                section.strip_code(normalize=True, collapse=True)).strip()
            if sec.get('title'):
                # strip_code leaves the heading text at the start; drop it
                sec['text'] = re.sub(
                    r'^' + re.escape(sec['title']) + r'\s*', '', sec['text'])
            parsed_page['sections'].append(sec)
            section_idx += 1

        # dammit! the parser has failed us; let's handle it as best we can
        elif len(headings) > 1:
            # Fallback: split the raw section text on its own heading
            # markup to recover one sub-section per heading.
            titles = [unicode_(h.title).strip() for h in headings]
            levels = [int(h.level) for h in headings]
            sub_sections = [
                unicode_(ss) for ss in re.split(
                    r'\s*' + '|'.join(
                        re.escape(unicode_(h)) for h in headings) + r'\s*',
                    unicode_(section))]
            # re.split leaves an empty string result up front :shrug:
            if sub_sections[0] == '':
                del sub_sections[0]
            if len(headings) != len(sub_sections):
                LOGGER.warning(
                    '# headings = %s, but # sections = %s',
                    len(headings), len(sub_sections))
            for i, sub_section in enumerate(sub_sections):
                try:
                    if titles[i].lower() in bad_section_titles:
                        continue
                    parsed_page['sections'].append({
                        'title': titles[i],
                        'level': levels[i],
                        'idx': section_idx,
                        'text': strip_markup(sub_section)})
                    section_idx += 1
                except IndexError:
                    # more sub_sections than headings; ignore the extras
                    continue

    return parsed_page
def test_repr(self, corpus):
    """The corpus repr should start with "Corpus" and mention its doc
    and token counts."""
    # NOTE: renamed local from `repr`, which shadowed the builtin
    corpus_repr = compat.unicode_(corpus)
    assert corpus_repr.startswith("Corpus")
    assert all(
        "{}".format(n) in corpus_repr
        for n in [corpus.n_docs, corpus.n_tokens])