def crawl(category):
    """Crawl the subcategories of *category* from moegirl and write their
    names — with the leading ``Category:`` prefix stripped — to
    ``indices_root/<category>.txt``, one name per line.
    """
    target_fp = indices_root / f'{category}.txt'
    site = moegirl()
    prefix = "Category:"
    # Fix: pin the output encoding. Titles are largely CJK and the platform
    # default encoding (e.g. cp1252 on Windows) raises UnicodeEncodeError.
    with open(target_fp, 'w', encoding='utf-8') as f:
        for p in site.categories[category]:
            title = p.name
            # Keep only subcategory entries; plain article pages are skipped.
            if not title.startswith(prefix):
                continue
            f.write(f"{title[len(prefix):]}\n")
def crawl(category):
    """Crawl the article pages of *category* from moegirl and write their
    titles to ``indices_root/<category>.txt``, one per line.

    Subcategory entries (titles starting with ``Category:``) are skipped.
    """
    target_fp = indices_root / f'{category}.txt'
    site = moegirl()
    # Fix: pin the output encoding. Titles are largely CJK and the platform
    # default encoding (e.g. cp1252 on Windows) raises UnicodeEncodeError.
    with open(target_fp, 'w', encoding='utf-8') as f:
        for p in site.categories[category]:
            title = p.name
            # Subcategory entries are not article pages; skip them.
            if title.startswith("Category:"):
                continue
            print(category, title)  # progress logging
            f.write(title + "\n")
def crawl(category):
    """Crawl the article pages of *category* from moegirl and write one
    JSON-serialized ``PageIndex`` record per page (JSON-lines format) to
    ``indices_root/<category>.txt``.

    Subcategory entries (titles starting with ``Category:``) are skipped.
    """
    target_fp = indices_root / f'{category}.txt'
    site = moegirl()
    # Fix: pin the output encoding. Titles are largely CJK and the platform
    # default encoding (e.g. cp1252 on Windows) raises UnicodeEncodeError.
    with open(target_fp, 'w', encoding='utf-8') as f:
        for p in site.categories[category]:
            title = p.name
            # Subcategory entries are not article pages; skip them.
            if title.startswith("Category:"):
                continue
            print(category, title)  # progress logging
            item = PageIndex(title=title, source=site.host)
            json.dump(class_to_json(item), f)
            f.write("\n")  # one JSON object per line
def get_info(title):
    """Fetch the moegirl page *title* and return a summary dict with the
    page title, the template arguments whose names appear in ``tag_keys``,
    and the page's category names (``Category:`` prefix removed).
    """
    wiki = moegirl()
    page = wiki.pages[title]
    parsed = wtp.parse(page.text())
    # Flatten every template argument into (name, value) pairs, then keep
    # only the recognized tag keys; a later occurrence of a key overwrites
    # an earlier one, matching plain assignment order.
    pairs = (
        (arg.name.strip(), arg.value.strip())
        for tpl in parsed.templates
        for arg in tpl.arguments
    )
    tags = {key: value for key, value in pairs if key in tag_keys}
    # c.name[9:] drops the 9-character "Category:" prefix.
    categories = [c.name[9:] for c in page.categories()]
    return {'page_title': title, 'tags': tags, 'categories': categories}
def get_page(title) -> CVInfo:
    """Build a ``CVInfo`` record from the moegirl page *title*.

    The title is first normalized to simplified Chinese ('zh-CN'); gender,
    birth year, and voiced characters are then extracted from the page text.
    """
    normalized = convert_zh(title, 'zh-CN')
    wiki = moegirl()
    body = wiki.pages[normalized].text()
    return CVInfo(
        name=normalized,
        gender=get_gender(normalized, body),
        birth_year=get_birth_year(body),
        characters=get_characters(body, wiki.host),
        source=wiki.host,
    )
def crawl(ftype):
    """Crawl every category listed in ``indices_root/<ftype>_index.txt``
    (whitespace-separated names) and write one JSON-serialized
    ``PageIndexWithTag`` record per article page (JSON-lines format) to
    ``indices_root/<ftype>.txt``.

    Subcategory entries (titles starting with ``Category:``) are skipped.
    """
    index_fp = indices_root / f'{ftype}_index.txt'
    target_fp = indices_root / f'{ftype}.txt'
    site = moegirl()
    # Fix: the original `open(index_fp).read()` leaked the file handle;
    # Path.read_text closes it and pins the encoding (names are CJK).
    categories = index_fp.read_text(encoding='utf-8').split()
    # Explicit UTF-8 on output too — the locale default (e.g. cp1252 on
    # Windows) would raise UnicodeEncodeError on CJK titles.
    with open(target_fp, 'w', encoding='utf-8') as f:
        for cate in categories:
            for p in site.categories[cate]:
                title = p.name
                # Subcategory entries are not article pages; skip them.
                if title.startswith("Category:"):
                    continue
                print(ftype, cate, title)  # progress logging
                item = PageIndexWithTag(title=title, tag=cate, source=site.host)
                json.dump(class_to_json(item), f)
                f.write("\n")  # one JSON object per line
def test_moegirl(self):
    """Run the shared test suite against a live moegirl site handle."""
    site = moegirl()
    self.run_test(site)