def parse_ja_characters(text, full_source, visited,
                        depth) -> Iterable[CharacterJAName]:
    # Match definition-list entries of the form:
    #   ;キャラクター名（別名）
    #   :声 - 声優名        (or the linked form [[声優|声]])
    pairs = re.findall(
        r";\s*(.+?)\n:\s*(?:声|\[\[\s*声優\s*\|\s*声\s*\]\])\s*-\s*(.+?)\n", text)
    for ch, cv_list in pairs:
        # Alternate names sit in fullwidth parentheses after the main name.
        other_names = re.findall(r"（([^\n]+)）", ch)
        ch = re.sub(r"（.+?）", "", ch)
        ch = clean_text(ch).strip()
        if other_names:
            other_names = [o for ot in other_names for o in ot.split('、')]
        else:
            other_names = []
        cv = clean_text(cv_list)
        yield CharacterJAName(name=ch, cv=cv, other_names=other_names)

    if full_source is None or depth == 0:
        return
    # Recurse into linked "...の登場人物" (list-of-characters) pages.
    links = _parse_links_from_html(full_source)
    site = wikipedia_ja()
    for link in links:
        if 'の登場人物' in link and link not in visited and ":" not in link:
            visited.add(link)
            p = site.pages[link]
            while p.redirect:
                print("Redirect", p.page_title)
                p = p.resolve_redirect()
            dt = p.text()
            yield from parse_ja_characters(dt, _safe_html(site, link), visited,
                                           depth - 1)
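
# Illustrative driver for parse_ja_characters; a minimal sketch assuming
# wikipedia_ja() returns an mwclient Site and _safe_html() fetches the
# rendered HTML (both defined elsewhere in this repo). The seed title is
# only an example.
def _demo_parse_ja_characters(seed='ソードアート・オンラインの登場人物'):
    site = wikipedia_ja()
    text = site.pages[seed].text()
    # depth=1 follows "...の登場人物" links one level deep.
    for c in parse_ja_characters(text, _safe_html(site, seed), set(), depth=1):
        print(c.name, c.cv, c.other_names)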

def get_characters(text, source) -> List[CVCharacter]:
    # Match bullet lines of the form: * 角色甲、角色乙 ———— 《作品名》
    pairs = re.findall(r'\n\s*(?:\*\s*)+(.+?)\s*————+\s*《\s*(.+?)\s*》', text)
    return [
        CVCharacter(name=convert_zh(ch, 'zh-CN'),
                    anime=convert_zh(clean_text(src), 'zh-CN'),
                    source=source)
        for char_l, src in pairs
        for ch in clean_text(char_l).split('、')
    ]
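
# Minimal sketch of the input get_characters expects: a voice actor's role
# list with one bullet per work and characters separated by 、. The sample
# text and source name below are hypothetical.
def _demo_get_characters():
    text = "\n* 角色甲、角色乙 ———— 《某部动画》\n"
    for c in get_characters(text, source='某声优'):
        print(c.name, c.anime, c.source)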

def main():
    index_file = indices_root / 'アニメ.txt'
    visited_file = indices_root / 'アニメ_visited.txt'
    dst_file = anime_root / 'アニメ.txt'

    # Load previously crawled titles so an interrupted crawl can resume.
    visited = set()
    if os.path.exists(visited_file):
        with open(visited_file) as f:
            for line in f:
                visited.add(line.strip())

    with open(index_file) as src_f, open(visited_file, 'a') as visited_f, open(
            dst_file, 'a') as dst_f:
        writer = JSONWriter(dst_f)
        for line in src_f:
            # Skip definition lines and namespace/interwiki prefixes
            # such as "en:" or "Template:".
            if re.match(r"^:|^[a-zA-Z]{,5}:", line):
                continue
            title = re.sub(r'^Redirect |の登場人物$', '', line)
            title = title.strip()
            title = clean_text(title)
            if title in visited or not title:
                continue
            print(title)
            info = _safe_crawl(title)
            writer.write(info)
            visited_f.write(title + "\n")
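
# JSONWriter is defined elsewhere in the repo; a minimal sketch of the
# contract main() assumes (one JSON object per line, appended so reruns
# extend the file). The real JSONWriter may differ.
import dataclasses
import json

class _JSONLinesWriterSketch:
    """Hypothetical stand-in for JSONWriter, for illustration only."""

    def __init__(self, fp):
        self.fp = fp

    def write(self, record):
        if dataclasses.is_dataclass(record):
            record = dataclasses.asdict(record)
        self.fp.write(json.dumps(record, ensure_ascii=False) + "\n")
        self.fp.flush()  # keep the file usable if the crawl dies mid-run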

def test_clean(self):
    text = "[[森川智之|森川 智之]]"
    text = clean_text(text)
    print(text)
    # Assumes clean_text resolves piped wiki links to their display label.
    self.assertEqual(text, "森川 智之")

def parse_zh_characters(text, full_source, visited,
                        depth) -> Iterable[CharacterZHName]:
    pairs = []

    def _resolve_cv(a4, a5):
        # The CV can sit in either the 4th or 5th positional parameter of
        # the {{nihongo}} template, in two different notations.
        if a4 is not None:
            ret = re.search(r"(?:配音員|配音员):(.+)", a4)
            if ret is not None:
                return ret.group(1)
        if a5 is not None:
            ret = re.search(r"(?:聲優|声优)——日:([^/]+)", a5)
            if ret is not None:
                return ret.group(1)

    # Strip {{Visible anchor|...}} wrappers and half-width parentheticals;
    # fullwidth parentheticals are removed separately below.
    rm_visible_anchor = r"\{|\}|Visible anchor\s*\||\(.+?\)"

    # 1. {{nihongo|中文名|日文名|英文名|...}} character entries.
    for d in find_all_templates(text, 'nihongo', ['1', '2', '3', '4', '5']):
        cv = _resolve_cv(d.get('4'), d.get('5'))
        if d['1'] and d['2']:
            zh_name = re.sub(r"（.+?）", "", d['1']).strip()
            ja_name = re.sub(r"（.+?）", "", d['2']).strip()
            en_name = re.sub(r"（.+?）", "", d['3'] or '').strip()
            zh_name = re.sub(rm_visible_anchor, "", zh_name).strip()
            other_names = [ja_name, en_name]
            other_names = [
                re.sub(rm_visible_anchor, "", f).strip()
                for f in other_names if f
            ]
            pairs.append((zh_name, [f for f in other_names if f], cv))

    # 2. Definition-list entries of the form: ;角色名（聲優:某某
    for n, cv in re.findall(r"\n;(.+?)（聲優:(.+?)\n", text):
        n = re.sub(rm_visible_anchor, "", n).strip()
        n = clean_text(n)
        cv = re.sub(rm_visible_anchor, "", cv).strip()
        cv = clean_text(cv)
        n = to_zhs(n)
        pairs.append((n, [], cv))

    # 3. {{Infobox animanga character}} boxes.
    for d in find_all_templates(
            text, 'Infobox animanga character',
            ['name', 'japanese', 'english', 'kana', 'romaji', 'voiced by']):
        name = to_zhs(clean_text(d['name']))
        other_names = [
            clean_text(d[k]) for k in ['japanese', 'english', 'kana', 'romaji']
            if d[k]
        ]
        cv = d['voiced by'] or ''
        cv = to_zhs(clean_text(cv))
        cv = '、'.join(cv.split())
        pairs.append((name, other_names, cv))

    for zh_name, other_names, cv_list in pairs:
        if cv_list:
            for cv in clean_text(cv_list).split('、'):
                cv = re.sub(r"（.+?）", "", cv).strip()
                yield CharacterZHName(name=to_zhs(zh_name), cv=to_zhs(cv),
                                      other_names=other_names)
        else:
            yield CharacterZHName(name=to_zhs(zh_name), cv=None,
                                  other_names=other_names)

    if full_source is None or depth == 0:
        return
    # Recurse into the first linked "...角色列表" (character list) page only.
    links = _parse_links_from_html(full_source)
    site = wikipedia_zh()
    for link in links:
        if '角色列表' in to_zhs(link) and link not in visited and ":" not in link:
            visited.add(link)
            p = site.pages[link]
            while p.redirect:
                p = p.resolve_redirect()
            dt = p.text()
            yield from parse_zh_characters(dt, _safe_html(site, link), visited,
                                           depth - 1)
            break
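
# Illustrative driver for the {{nihongo}} branch of parse_zh_characters; a
# minimal sketch assuming find_all_templates maps positional template
# parameters to the string keys '1'..'5'. The sample entry is hypothetical.
def _demo_parse_zh_characters():
    sample = "{{nihongo|夜神月|夜神月|Light Yagami|配音員:宮野真守}}\n"
    # depth=0 with full_source=None parses the text without recursing.
    for c in parse_zh_characters(sample, full_source=None, visited=set(),
                                 depth=0):
        print(c.name, c.cv, c.other_names)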