def process_page(page_url, languages):
    """Scrape one category page and record every listed entry and its aliases.

    For each ``<li>`` of the ``mw-category`` block, the linked page is fetched
    and the stripped text of the first direct ``<p>`` child of its
    ``mw-parser-output`` div is stored in *languages* under the part of the
    item text that follows the first ``":"``.  The "what links here" page
    (``ALIAS_URL``) of the item is then fetched and each listed alias is
    stored with the same value.

    Args:
        page_url: URL of the category page to process.
        languages: mapping updated in place (key -> scraped text).

    Returns:
        The URL of the next category page, or ``""`` when the last link of
        the ``mw-pages`` block is not labelled ``NEXTPAGE_TEXT``.

    NOTE(review): ``get_soup`` is defined elsewhere; it is assumed to return
    a BeautifulSoup-like tree -- confirm.
    """
    soup = get_soup(page_url)

    # Pagination: only the last link of the "mw-pages" block can be the
    # "next page" link, and only if its label matches NEXTPAGE_TEXT.
    nextpage = ""
    nextpage_div = soup.find(id="mw-pages")
    last_link = nextpage_div.find_all("a")[-1]
    if NEXTPAGE_TEXT == last_link.text:
        nextpage = ROOT_URL + last_link.get("href")

    category = soup.find("div", {"class": "mw-category"})
    for li in category.find_all("li"):
        link = li.find("a")["href"]
        li_url = ROOT_URL + link
        key = li.text.split(":")[1]

        sub_soup = get_soup(li_url)
        paragraph = sub_soup.find("div", {"class": "mw-parser-output"}).find(
            "p", recursive=False
        )
        value = paragraph.text.strip()
        languages[key] = value

        # Every alias found on the "what links here" page maps to the same value.
        a_url = ALIAS_URL.format(li.text)
        soup_alias = get_soup(a_url)
        if ul_alias := soup_alias.find("ul", {"id": "mw-whatlinkshere-list"}):
            for alias_li in ul_alias.find_all("li"):
                alias_text = alias_li.find("a").text
                alias_key = alias_text.split(":")[1]
                languages[alias_key] = value

    # Previously computed but never handed back, so callers could not
    # continue to the following page.
    return nextpage
def process_cs_page(url, results):
    """Scrape one category page of templates and record each template's text.

    For every ``<li>`` inside each ``mw-category-group`` div, the linked
    template page is fetched; the stripped text of the first ``<p>`` found in
    its ``mw-parser-output`` div (minus one trailing ``"."``) is stored in
    *results* under the part of the link text that follows the first ``":"``.
    ``process_alias_page`` is then called with the full link text so aliases
    can be recorded as well.

    Args:
        url: URL of the category page to process.
        results: mapping updated in place (template name -> text).

    Returns:
        The URL of the next category page, or ``""`` when the last link of
        the ``mw-pages`` block is not labelled ``NEXTPAGE_TEXT``.
    """
    soup = get_soup(url)

    # Pagination: only the last link of the "mw-pages" block can be the
    # "next page" link, and only if its label matches NEXTPAGE_TEXT.
    nextpage = ""
    nextpage_div = soup.find(id="mw-pages")
    last_link = nextpage_div.find_all("a")[-1]
    if NEXTPAGE_TEXT == last_link.text:
        nextpage = ROOT_URL + last_link.get("href")

    # Distinct loop-variable name: the original rebound the list it iterated.
    for category_group in soup.find_all("div", {"class": "mw-category-group"}):
        for li in category_group.find_all("li"):
            template_link = li.find("a")
            template_url = ROOT_URL + template_link.get("href")
            template_name = template_link.text.split(":")[1]

            template_soup = get_soup(template_url)
            template_text_div = template_soup.find(
                "div", {"class": "mw-parser-output"}
            )
            template_text = template_text_div.find("p").text.strip()
            # endswith() rather than [-1]: an empty paragraph must not raise
            # IndexError.
            if template_text.endswith("."):
                template_text = template_text[:-1]

            results[template_name] = template_text
            process_alias_page(template_link.text, template_text, results)

    return nextpage
def get_content(url):
    """Return the lines of the highlighted block found on the page at *url*.

    The block is the ``mw-highlight`` div that is a direct child of the
    page's ``mw-parser-output`` div; its text is split on newlines.
    """
    page = get_soup(url)
    parser_output = page.find("div", "mw-parser-output")
    # recursive=False: only a direct child of the parser output is accepted.
    highlighted = parser_output.findChild(
        "div", {"class": "mw-highlight"}, recursive=False
    )
    lines = highlighted.text.split("\n")
    return lines
def get_text(url):
    """Return the cleaned text of the ``form-of-definition`` span at *url*.

    The placeholders ``" term"`` and ``" [Term?]"`` are removed from the
    span's text.  An empty string is returned when the page has no such span.
    """
    definition = get_soup(url).find("span", "form-of-definition")
    if definition:
        without_term = definition.text.replace(" term", "")
        return without_term.replace(" [Term?]", "")
    return ""
def process_alias_page(key, value, results):
    """Map every redirect alias of *key* to *value* in *results*.

    The page at ``ALIAS_URL.format(key)`` is fetched and each ``mw-redirect``
    link of its ``mw-whatlinkshere-list`` is read.  The ``"Modèle:"`` prefix
    is removed from the link text, and the ``"modifier"`` link is ignored.
    Nothing happens when the list is absent from the page.
    """
    page = get_soup(ALIAS_URL.format(key))
    links_list = page.find("ul", {"id": ["mw-whatlinkshere-list"]})
    if links_list:
        for redirect in links_list.find_all("a", {"class": ["mw-redirect"]}):
            name = redirect.text.replace("Modèle:", "")
            if name != "modifier":
                results[name] = value
def process_regions_page(url, results):
    """Scrape one category page and record the region text of each template.

    For every ``<li>`` of the ``mw-category-generated`` div, the linked page
    is fetched and the text of its span with id ``région`` -- stripped of
    surrounding parentheses -- is stored in *results* under the part of the
    item text that follows the first ``":"``.  Pages without that span are
    skipped.

    Returns the URL of the next category page, or ``""`` when the last link
    of the ``mw-pages`` block is not labelled ``NEXTPAGE_TEXT``.
    """
    page = get_soup(url)

    # Only the very last pager link can be the "next page" link.
    final_link = page.find(id="mw-pages").find_all("a")[-1]
    nextpage = (
        ROOT + final_link.get("href") if NEXTPAGE_TEXT == final_link.text else ""
    )

    for item in page.find("div", "mw-category-generated").find_all("li"):
        template_url = ROOT + item.find("a").get("href")
        template_name = item.text.split(":")[1]
        region = get_soup(template_url).find("span", {"id": ["région"]})
        if region:
            results[template_name] = region.text.strip("()")

    return nextpage
def process_alias_page(model, template_text, results):
    """Map every redirect alias of *model* to *template_text* in *results*.

    The page at ``ALIAS_URL.format(model)`` is fetched and each
    ``mw-redirect`` link of its ``mw-whatlinkshere-list`` is read.  The
    ``"Plantilla:"`` prefix is removed from the link text, and the
    ``"editar"`` link is ignored.  Nothing happens when the list is absent.
    """
    page = get_soup(ALIAS_URL.format(model))
    links_list = page.find("ul", {"id": ["mw-whatlinkshere-list"]})
    if links_list:
        for redirect in links_list.find_all("a", {"class": ["mw-redirect"]}):
            name = redirect.text.replace("Plantilla:", "")
            if name != "editar":
                results[name] = template_text
def process_category_page(url, results):
    """Scrape one category page and record how each listed template renders.

    For every ``<li>`` of the ``mw-category-generated`` div, the linked page
    is fetched and the text of its first span with class ``term`` or
    ``texte`` -- stripped of surrounding parentheses -- is stored in
    *results* under the part of the item text that follows the first ``":"``.
    Templates whose page has no such span, or whose name or rendering is
    empty, are skipped.

    Args:
        url: URL of the category page to process.
        results: mapping updated in place (template name -> rendering).

    Returns:
        The URL of the next category page, or ``""`` when the last link of
        the ``mw-pages`` block is not labelled ``NEXTPAGE_TEXT``.
    """
    soup = get_soup(url)

    # Pagination: only the last link of the "mw-pages" block can be the
    # "next page" link, and only if its label matches NEXTPAGE_TEXT.
    nextpage = ""
    nextpage_div = soup.find(id="mw-pages")
    last_link = nextpage_div.find_all("a")[-1]
    if NEXTPAGE_TEXT == last_link.text:
        nextpage = ROOT + last_link.get("href")

    content_div = soup.find("div", "mw-category-generated")
    for li in content_div.find_all("li"):
        template_url = ROOT + li.find("a").get("href")
        template_name = li.text.split(":")[1]
        template_soup = get_soup(template_url)

        rendered_span = template_soup.find("span", {"class": ["term", "texte"]})
        # A page without the expected span used to raise AttributeError on
        # ``.text``; skip it, as the empty-rendering check below intends.
        if rendered_span is None:
            continue

        rendering = rendered_span.text
        if template_name and rendering:
            results[template_name] = rendering.strip("()")

    return nextpage
def process_page(url, repl, stop_line, var_name, print_result=True):
    """Build a ``{label: display}`` mapping from source code shown at *url*.

    The text of the page's ``mw-highlight-lines`` div is rewritten with a
    series of textual substitutions (presumably turning Lua into Python --
    verify against the scraped pages), truncated at the first line starting
    with *stop_line*, and executed in this module's globals.  The executed
    code is expected to define a global ``labels`` mapping, which is then
    resolved into display strings via ``process_display``.

    Args:
        url: page holding the highlighted source.
        repl: names whose ``name =`` assignments are rewritten to
            ``"name":`` dictionary keys.
        stop_line: prefix of the (stripped) line at which reading stops.
        var_name: variable name used in the printed output.
        print_result: when true, print the mapping as a Python dict literal.

    Returns:
        Dict of label -> display, limited to entries whose display differs
        from the label itself.
    """
    text = get_soup(url).find("div", {"class": "mw-highlight-lines"}).text

    # Plain-text substitutions, applied in this exact order.
    substitutions = (
        ("local ", ""),
        ("end", ""),
        ("true", "True"),
        ("false", "False"),
        ("--", "#"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    text = re.sub(r"function\s+(\w+\([\w|\,]+\))", "def \\g<1>:", text)
    text = text.replace("for _,v in ipairs(y) do", "for v in y:")

    for r in repl:
        text = re.sub(rf"[ \t]+{r}[\s]*=", f' "{r}":', text)

    # Keep lines up to (not including) the stop line, dropping "require" lines.
    kept = []
    for raw in text.split("\n"):
        if raw.strip().startswith(stop_line):
            break
        if "require" in raw:
            continue
        kept.append(raw + "\n")
    code = "".join(kept)

    # NOTE(review): this executes text downloaded from the network inside the
    # module namespace; it is only safe while the source page is trusted.
    exec(code, globals())

    results = {}
    for name, entry in labels.items():  # noqa
        target, fallback = entry, name
        if isinstance(entry, str):
            # A string entry may point at another label; follow it once.
            target = labels.get(entry, entry)  # noqa
            if target != entry:
                fallback = entry
        display = target if isinstance(target, str) else target.get("display", fallback)
        display = process_display(display)
        if display != name:
            results[name] = display

    if print_result:
        print(f"{var_name} = {{")
        for key, value in sorted(results.items()):
            print(f' "{key}": "{value}",')
        print(f"}} # {len(results):,}")

    return results
from scripts_utils import get_soup

# Scrape the entries of de.wiktionary's "Kategorie:Wiktionary:Sprachadjektive"
# category, together with their aliases, and print them as a Python dict
# literal named ``lang_adjs``.
root_url = "https://de.wiktionary.org"
start_url = f"{root_url}/wiki/Kategorie:Wiktionary:Sprachadjektive"
alias_url = "https://de.wiktionary.org/w/index.php?title=Spezial:Linkliste/{}&hidetrans=1&hidelinks=1"

languages = {}
category = get_soup(start_url).find("div", {"class": "mw-category"})
for item in category.findAll("li"):
    item_url = root_url + item.find("a")["href"]
    # The key is the part of the item text that follows the first ":".
    key = item.text.split(":")[1]

    # The value is the stripped text of the first paragraph of the entry page.
    entry_page = get_soup(item_url)
    paragraph = entry_page.find("div", {"class": "mw-parser-output"}).find("p")
    value = paragraph.text.strip()
    languages[key] = value

    # Every page listed under "what links here" is an alias of the same value.
    alias_page = get_soup(alias_url.format(item.text))
    alias_list = alias_page.find("ul", {"id": "mw-whatlinkshere-list"})
    if alias_list:
        for alias_item in alias_list.findAll("li"):
            alias_key = alias_item.find("a").text.split(":")[1]
            languages[alias_key] = value

print("lang_adjs = {")
for key, value in sorted(languages.items()):
    print(f' "{key}": "{value}",')
print(f"}} # {len(languages):,}")
import re from scripts_utils import get_soup url = "https://pt.wiktionary.org/w/index.php?title=Predefini%C3%A7%C3%A3o:gram%C3%A1tica/core&action=edit" soup = get_soup(url) textarea = soup.find("textarea") current_abbr = [] count = 0 text = textarea.text text = text.replace("{{#ifeq:{{int:Log}}|{{:MediaWiki:Log}}|género|gênero}}", "género") text = text.replace("{{gramática/core/faltagenero|{{{2|}}}}}", "gênero em falta") text = re.sub("(<!--.*?-->)", "", text, flags=re.DOTALL) print("gramatica_short = {") for p in sorted(text.split("|")): p = p.strip() if p and "<!--" not in p and ("{" not in p) and ("}" not in p): if "=" in p: sArray = p.split("=") name = sArray[1].strip("'") print(f' "{sArray[0]}": "{name}",') count += 1 for abbr in sorted(current_abbr): print(f' "{abbr}": "{name}",') count += 1 current_abbr.clear() else: