def check_need_accent(text):
  # Decide whether TEXT still needs accent marks added.
  # A word is "covered" if it already carries a combining acute (U+0301)
  # or the letter ё (which implies the stress position), or if it is
  # monosyllabic (stress is unambiguous). Return True as soon as any
  # word is found that is not covered; False if every word is covered.
  for raw_word in re.split(" +", text):
    stripped = blib.remove_links(raw_word)
    already_marked = u"\u0301" in stripped or u"ё" in stripped
    if not already_marked and not ru.is_monosyllabic(stripped):
      return True
  return False
def process_page(index, page, save, verbose):
  """Process one wiki page: look up declension arguments for each word of a
  multi-word expression via the nested find_decl_args() helper.

  index -- numeric index of the page (used only in log messages)
  page -- pywikibot Page object
  save -- unused in the visible portion of this function -- TODO confirm
  verbose -- passed through to blib.expand_text for verbose logging

  NOTE(review): this file defines process_page two more times further down;
  at import time the later definitions rebind the name, so this version is
  shadowed unless it is called before the later defs execute.
  """
  pagetitle = unicode(page.title())
  # Strip any namespace prefix ("Namespace:Title" -> "Title").
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  # NOTE(review): origtext/parsed are computed but not used in the visible
  # portion of this function -- presumably used by code below the visible
  # chunk; verify against the full file.
  origtext = page.text
  parsed = blib.parse_text(origtext)

  # Find the declension arguments for LEMMA and inflected form INFL,
  # the WORDINDth word in the expression. Return value is a tuple of
  # four items: a list of (NAME, VALUE) tuples for the arguments, whether
  # the word is an adjective, the value of n= (if given), and the value
  # of a= (if given). Returns None (or bare return) on any condition it
  # cannot handle.
  def find_decl_args(lemma, infl, wordind):
    declpage = pywikibot.Page(site, lemma)
    # Link the inflected form, piping through the lemma page when the
    # accent-stripped inflection differs from the lemma.
    if ru.remove_accents(infl) == lemma:
      wordlink = "[[%s]]" % infl
    else:
      wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
      # No page to look up: assume adjectival declension when the lemma
      # looks like an adjective, otherwise give up on this word.
      if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
        pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return [("1", wordlink), ("2", "+")], True, None, None
      else:
        pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    # Collect declension, headword and z-style declension templates from
    # the lemma's page.
    parsed = blib.parse_text(declpage.text)
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      if tname in ["ru-noun-table", "ru-decl-adj"]:
        pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
        decl_templates.append(t)
      if tname in ["ru-noun", "ru-proper noun"]:
        pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
        headword_templates.append(t)
      if tname in ["ru-decl-noun-z"]:
        pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
        decl_z_templates.append(t)

    if not decl_templates:
      if decl_z_templates:
        # Old-style z-declension templates, e.g.:
        # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
        # {{ru-decl-noun-z|ёж|m-inan|b}}
        # Convert to an equivalent ru-noun-table template.
        if len(decl_z_templates) > 1:
          pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          return None
        else:
          decl_z_template = decl_z_templates[0]
          headword_template = None
          pagemsg("find_decl_args: Using z-decl template: %s" %
              unicode(decl_z_template))
          # Use the (unique) headword template, if any, to aid conversion.
          if len(headword_templates) == 0:
            pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          elif len(headword_templates) > 1:
            pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          else:
            headword_template = headword_templates[0]
            pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" %
                (wordind, lemma, infl, unicode(headword_template), unicode(decl_z_template)))
          decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template,
              subpagetitle, pagemsg, headword_template=headword_template)
          decl_templates = [decl_template]
      elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
          x for x in headword_templates if getparam(x, "3") == "-"]:
        # Indeclinable noun: "$" declension spec.
        return [("1", wordlink), ("2", "$")], False, None, None
      else:
        pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    if len(decl_templates) == 1:
      decl_template = decl_templates[0]
    else:
      # Multiple decl templates: prefer an adjectival one when the lemma
      # looks adjectival; otherwise fall back to per-lemma / per-page
      # manual overrides.
      for t in decl_templates:
        if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma):
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          decl_template = t
          break
      else:
        if lemma in use_given_decl:
          overriding_decl = use_given_decl[lemma]
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
              (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        elif pagetitle in use_given_page_decl:
          overriding_decl = use_given_page_decl[pagetitle].get(lemma, None)
          if not overriding_decl:
            pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" %
                (wordind, lemma, infl))
            # Bare return: equivalent to returning None, like the other
            # failure paths.
            return
          else:
            pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                (wordind, overriding_decl, lemma, infl))
            decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        else:
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          return None

    pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
    if unicode(decl_template.name) == "ru-decl-adj":
      # Adjectival: "+" declension, with ь variant when the old template's
      # second argument contains a standalone ь.
      if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
        return [("1", wordlink), ("2", u"+ь")], True, None, None
      else:
        return [("1", wordlink), ("2", "+")], True, None, None

    # ru-noun-table
    assert unicode(decl_template.name) == "ru-noun-table"

    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).
    highest_numbered_param = 0
    for p in decl_template.params:
      pname = unicode(p.name)
      if re.search("^[0-9]+$", pname):
        highest_numbered_param = max(highest_numbered_param, int(pname))

    # Now gather the numbered arguments into arg sets. Code taken from
    # ru-noun.lua.
    # NOTE(review): `offset` is assigned but never read in the visible code.
    offset = 0
    arg_sets = []
    arg_set = []
    for i in xrange(1, highest_numbered_param + 2):
      end_arg_set = False
      val = getparam(decl_template, str(i))
      if i == highest_numbered_param + 1:
        end_arg_set = True
      elif val == "_" or val == "-" or re.search("^join:", val):
        # Multiword declension markers; this code only handles single words.
        pagemsg("WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
      elif val == "or":
        end_arg_set = True

      if end_arg_set:
        arg_sets.append(arg_set)
        arg_set = []
        offset = i
      else:
        arg_set.append(val)

    # If the inflection differs from the lemma, try to recognize it as a
    # plural via the pl_data table of (sg-ending, pl-ending, gender,
    # special-case-1) tuples.
    canon_infl = ru.remove_accents(infl).lower()
    canon_lemma = lemma.lower()
    ispl = False
    need_sc1 = False
    found_gender = None
    if canon_infl != canon_lemma:
      for sgend, plend, gender, is_sc1 in pl_data:
        if sgend:
          check_sgend = sgend
        else:
          check_sgend = consonant_re
        if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(sgend + "$", plend, canon_lemma):
          ispl = True
          found_gender = gender
          need_sc1 = is_sc1
          break
      else:
        pagemsg("WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    # Substitute the wordlink for any lemmas in the declension.
    # If plural, also add gender and verify special case (1) as necessary.
    # Concatenate all the numbered params, substituting the wordlink into
    # the lemma as necessary.
    numbered_params = []
    for arg_set in arg_sets:
      # Arg sets may begin with a stress-pattern argument; the lemma is
      # the next argument after it.
      lemma_arg = 0
      if len(arg_set) > 0 and runoun.arg1_is_stress(arg_set[0]):
        lemma_arg = 1
      if len(arg_set) <= lemma_arg:
        arg_set.append("")
      arglemma = arg_set[lemma_arg]
      # Split off manual transliteration ("LEMMA//TR"), preserved on output.
      manualtr = ""
      if "//" in arglemma:
        arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups()

      if (not arglemma or arglemma.lower() == infl.lower() or
          ru.is_monosyllabic(infl) and ru.remove_accents(arglemma).lower() ==
          ru.remove_accents(infl).lower() or
          ispl and ru.remove_accents(arglemma).lower() == lemma.lower()
          ):
        arg_set[lemma_arg] = wordlink + manualtr
      else:
        pagemsg("WARNING: Can't sub word link %s into decl lemma %s%s" % (
            wordlink, arg_set[lemma_arg], ispl and ", skipping" or ""))
        if ispl:
          return None

      if ispl:
        # Add the gender
        if len(arg_set) <= lemma_arg + 1:
          arg_set.append("")
        declarg = arg_set[lemma_arg + 1]

        # First, sub in gender
        m = re.search("(3f|[mfn])", declarg)
        if found_gender == "mf":
          # Ambiguous singular -ь / plural -и: gender must already be
          # explicit in the declension arg.
          if not m:
            # NOTE(review): `wordinfl` is not defined anywhere visible
            # (probably meant `wordind`); this branch raises NameError if
            # reached -- TODO confirm and fix upstream.
            pagemsg(u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          decl_gender = m.group(1)
          if decl_gender == "n":
            # NOTE(review): same undefined `wordinfl` as above.
            pagemsg(u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          elif decl_gender in ["m", "3f"]:
            pagemsg(u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                (decl_gender, wordind, lemma, infl))
          else:
            # NOTE(review): `gender` here is the loop variable leaked from
            # the pl_data loop above -- works only by accident of Python
            # scoping; presumably `found_gender` was intended. Verify.
            assert gender == "f"
            pagemsg(u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" %
                (wordind, lemma, infl))
            declarg = re.sub("f", "3f", declarg, 1)
        else:
          if m:
            decl_gender = m.group(1)
            if decl_gender == found_gender:
              pagemsg("Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                  (found_gender, wordind, lemma, infl))
            else:
              pagemsg("WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" %
                  (decl_gender, wordind, found_gender, lemma, infl))
              declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1)
          else:
            pagemsg("No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" %
                (wordind, found_gender, lemma, infl))
            declarg = found_gender + declarg

        # Now check special case 1
        if need_sc1 != ("(1)" in declarg):
          if need_sc1:
            pagemsg("WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % (
                wordind, declarg, lemma, infl))
            return None
          else:
            pagemsg("WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % (
                wordind, declarg, lemma, infl))
            return None

        arg_set[lemma_arg + 1] = declarg

      # Re-join arg sets with "or" separators, as in the original template.
      if numbered_params:
        numbered_params.append("or")
      numbered_params.extend(arg_set)

    # Now gather all params, including named ones.
    params = []
    params.extend((str(i+1), val) for i, val in zip(xrange(len(numbered_params)), numbered_params))
    num = None
    anim = None
    for p in decl_template.params:
      pname = unicode(p.name)
      val = unicode(p.value)
      if pname == "a":
        anim = val
      elif pname == "n":
        num = val
      elif pname == "notes":
        params.append((pname, val))
      elif pname == "title":
        pagemsg("WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" %
            (wordind, lemma, infl, val))
      elif re.search("^[0-9]+$", pname):
        # Numbered params were already handled above.
        pass
      else:
        # Per-case overrides (loc=/par=/voc=) are generally dropped because
        # they may not apply in a multi-word lemma; loc= is kept only for
        # whitelisted pages.
        keepparam = True
        if pname == "loc":
          if pagetitle in keep_locative:
            pagemsg("Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % (
                wordind, val, lemma, infl))
          else:
            pagemsg("WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % (
                wordind, val, lemma, infl))
            keepparam = False
        if pname == "par":
          pagemsg("WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % (
              wordind, val, lemma, infl))
          keepparam = False
        if pname == "voc":
          pagemsg("WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % (
              wordind, val, lemma, infl))
          keepparam = False
        if keepparam:
          if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
            pagemsg(u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" %
                (wordind, val, lemma, infl))
          # Suffix the param name with the word index (e.g. loc1, loc2).
          pname += str(wordind)
          params.append((pname, val))
    # NOTE(review): the visible code ends here without returning the
    # accumulated (params, False, num, anim) tuple promised by the comment
    # above find_decl_args; the function appears truncated in this copy --
    # verify against the authoritative source.
def process_page(page, index, parsed):
  """Process one wiki page: look up declension arguments for each word of a
  multi-word expression via the nested find_decl_args() helper.

  page -- pywikibot Page object
  index -- numeric index of the page (used only in log messages)
  parsed -- shadowed below by blib.parse_text(origtext); the passed-in value
            is unused -- TODO confirm

  NOTE(review): this is a near-duplicate of the earlier process_page in this
  file, differing mainly in signature and in using rulib/runounlib and
  args.verbose instead of ru/runoun and the verbose parameter. Defining it
  again rebinds the module-level name.
  """
  global args
  pagetitle = unicode(page.title())
  # Strip any namespace prefix ("Namespace:Title" -> "Title").
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  origtext = page.text
  parsed = blib.parse_text(origtext)

  # Find the declension arguments for LEMMA and inflected form INFL,
  # the WORDINDth word in the expression. Return value is a tuple of
  # four items: a list of (NAME, VALUE) tuples for the arguments, whether
  # the word is an adjective, the value of n= (if given), and the value
  # of a= (if given). Returns None (or bare return) on any condition it
  # cannot handle.
  def find_decl_args(lemma, infl, wordind):
    declpage = pywikibot.Page(site, lemma)
    # Link the inflected form, piping through the lemma page when the
    # accent-stripped inflection differs from the lemma.
    if rulib.remove_accents(infl) == lemma:
      wordlink = "[[%s]]" % infl
    else:
      wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
      # No page to look up: assume adjectival declension when the lemma
      # looks like an adjective, otherwise give up on this word.
      if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
        pagemsg(
            "WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s"
            % (wordind, lemma, infl))
        return [("1", wordlink), ("2", "+")], True, None, None
      else:
        pagemsg(
            "WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s"
            % (wordind, lemma, infl))
        return None

    # Collect declension, headword and z-style declension templates from
    # the lemma's page.
    parsed = blib.parse_text(declpage.text)
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      if tname in ["ru-noun-table", "ru-decl-adj"]:
        pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
        decl_templates.append(t)
      if tname in ["ru-noun", "ru-proper noun"]:
        pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
        headword_templates.append(t)
      if tname in ["ru-decl-noun-z"]:
        pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
        decl_z_templates.append(t)

    if not decl_templates:
      if decl_z_templates:
        # Old-style z-declension templates, e.g.:
        # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
        # {{ru-decl-noun-z|ёж|m-inan|b}}
        # Convert to an equivalent ru-noun-table template.
        if len(decl_z_templates) > 1:
          pagemsg(
              "WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
              % (wordind, lemma, infl))
          return None
        else:
          decl_z_template = decl_z_templates[0]
          headword_template = None
          pagemsg("find_decl_args: Using z-decl template: %s" %
                  unicode(decl_z_template))
          # Use the (unique) headword template, if any, to aid conversion.
          if len(headword_templates) == 0:
            pagemsg(
                "WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s"
                % (wordind, lemma, infl, unicode(decl_z_template)))
          elif len(headword_templates) > 1:
            pagemsg(
                "WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s"
                % (wordind, lemma, infl, unicode(decl_z_template)))
          else:
            headword_template = headword_templates[0]
            pagemsg(
                "find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s"
                % (wordind, lemma, infl, unicode(headword_template),
                   unicode(decl_z_template)))
          decl_template = runounlib.convert_zdecl_to_ru_noun_table(
              decl_z_template, subpagetitle, pagemsg,
              headword_template=headword_template)
          decl_templates = [decl_template]
      elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
          x for x in headword_templates if getparam(x, "3") == "-"
      ]:
        # Indeclinable noun: "$" declension spec.
        return [("1", wordlink), ("2", "$")], False, None, None
      else:
        pagemsg(
            "WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
            % (wordind, lemma, infl))
        return None

    if len(decl_templates) == 1:
      decl_template = decl_templates[0]
    else:
      # Multiple decl templates: prefer an adjectival one when the lemma
      # looks adjectival; otherwise fall back to per-lemma / per-page
      # manual overrides.
      for t in decl_templates:
        if unicode(t.name) == "ru-decl-adj" and re.search(
            u"(ий|ый|ой)$", lemma):
          pagemsg(
              "WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s"
              % (wordind, lemma, infl))
          decl_template = t
          break
      else:
        if lemma in use_given_decl:
          overriding_decl = use_given_decl[lemma]
          pagemsg(
              "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
              % (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(
              overriding_decl).filter_templates()[0]
        elif pagetitle in use_given_page_decl:
          overriding_decl = use_given_page_decl[pagetitle].get(
              lemma, None)
          if not overriding_decl:
            pagemsg(
                "WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s"
                % (wordind, lemma, infl))
            # Bare return: equivalent to returning None, like the other
            # failure paths.
            return
          else:
            pagemsg(
                "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                % (wordind, overriding_decl, lemma, infl))
            decl_template = blib.parse_text(
                overriding_decl).filter_templates()[0]
        else:
          pagemsg(
              "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s"
              % (wordind, lemma, infl))
          return None

    pagemsg("find_decl_args: Using decl template: %s" %
            unicode(decl_template))
    if unicode(decl_template.name) == "ru-decl-adj":
      # Adjectival: "+" declension, with ь variant when the old template's
      # second argument contains a standalone ь.
      if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
        return [("1", wordlink), ("2", u"+ь")], True, None, None
      else:
        return [("1", wordlink), ("2", "+")], True, None, None

    # ru-noun-table
    assert unicode(decl_template.name) == "ru-noun-table"

    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).
    highest_numbered_param = 0
    for p in decl_template.params:
      pname = unicode(p.name)
      if re.search("^[0-9]+$", pname):
        highest_numbered_param = max(highest_numbered_param, int(pname))

    # Now gather the numbered arguments into arg sets. Code taken from
    # ru-noun.lua.
    # NOTE(review): `offset` is assigned but never read in the visible code.
    offset = 0
    arg_sets = []
    arg_set = []
    for i in xrange(1, highest_numbered_param + 2):
      end_arg_set = False
      val = getparam(decl_template, str(i))
      if i == highest_numbered_param + 1:
        end_arg_set = True
      elif val == "_" or val == "-" or re.search("^join:", val):
        # Multiword declension markers; this code only handles single words.
        pagemsg(
            "WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
            % (wordind, lemma, infl))
        return None
      elif val == "or":
        end_arg_set = True

      if end_arg_set:
        arg_sets.append(arg_set)
        arg_set = []
        offset = i
      else:
        arg_set.append(val)

    # If the inflection differs from the lemma, try to recognize it as a
    # plural via the pl_data table of (sg-ending, pl-ending, gender,
    # special-case-1) tuples.
    canon_infl = rulib.remove_accents(infl).lower()
    canon_lemma = lemma.lower()
    ispl = False
    need_sc1 = False
    found_gender = None
    if canon_infl != canon_lemma:
      for sgend, plend, gender, is_sc1 in pl_data:
        if sgend:
          check_sgend = sgend
        else:
          check_sgend = consonant_re
        if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(
            sgend + "$", plend, canon_lemma):
          ispl = True
          found_gender = gender
          need_sc1 = is_sc1
          break
      else:
        pagemsg(
            "WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s"
            % (wordind, lemma, infl))
        return None

    # Substitute the wordlink for any lemmas in the declension.
    # If plural, also add gender and verify special case (1) as necessary.
    # Concatenate all the numbered params, substituting the wordlink into
    # the lemma as necessary.
    numbered_params = []
    for arg_set in arg_sets:
      # Arg sets may begin with a stress-pattern argument; the lemma is
      # the next argument after it.
      lemma_arg = 0
      if len(arg_set) > 0 and runounlib.arg1_is_stress(arg_set[0]):
        lemma_arg = 1
      if len(arg_set) <= lemma_arg:
        arg_set.append("")
      arglemma = arg_set[lemma_arg]
      # Split off manual transliteration ("LEMMA//TR"), preserved on output.
      manualtr = ""
      if "//" in arglemma:
        arglemma, manualtr = re.search("^(.*?)(//.*?)$",
                                       arglemma).groups()

      if (not arglemma or arglemma.lower() == infl.lower() or
          rulib.is_monosyllabic(infl) and
          rulib.remove_accents(arglemma).lower() ==
          rulib.remove_accents(infl).lower() or
          ispl and
          rulib.remove_accents(arglemma).lower() == lemma.lower()):
        arg_set[lemma_arg] = wordlink + manualtr
      else:
        pagemsg(
            "WARNING: Can't sub word link %s into decl lemma %s%s" %
            (wordlink, arg_set[lemma_arg],
             ispl and ", skipping" or ""))
        if ispl:
          return None

      if ispl:
        # Add the gender
        if len(arg_set) <= lemma_arg + 1:
          arg_set.append("")
        declarg = arg_set[lemma_arg + 1]

        # First, sub in gender
        m = re.search("(3f|[mfn])", declarg)
        if found_gender == "mf":
          # Ambiguous singular -ь / plural -и: gender must already be
          # explicit in the declension arg.
          if not m:
            # NOTE(review): `wordinfl` is not defined anywhere visible
            # (probably meant `wordind`); this branch raises NameError if
            # reached -- TODO confirm and fix upstream.
            pagemsg(
                u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s"
                % (wordinfl, lemma, infl))
            return None
          decl_gender = m.group(1)
          if decl_gender == "n":
            # NOTE(review): same undefined `wordinfl` as above.
            pagemsg(
                u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s"
                % (wordinfl, lemma, infl))
            return None
          elif decl_gender in ["m", "3f"]:
            pagemsg(
                u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                % (decl_gender, wordind, lemma, infl))
          else:
            # NOTE(review): `gender` here is the loop variable leaked from
            # the pl_data loop above -- works only by accident of Python
            # scoping; presumably `found_gender` was intended. Verify.
            assert gender == "f"
            pagemsg(
                u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s"
                % (wordind, lemma, infl))
            declarg = re.sub("f", "3f", declarg, 1)
        else:
          if m:
            decl_gender = m.group(1)
            if decl_gender == found_gender:
              pagemsg(
                  "Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                  % (found_gender, wordind, lemma, infl))
            else:
              pagemsg(
                  "WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s"
                  % (decl_gender, wordind, found_gender, lemma, infl))
              declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1)
          else:
            pagemsg(
                "No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s"
                % (wordind, found_gender, lemma, infl))
            declarg = found_gender + declarg

        # Now check special case 1
        if need_sc1 != ("(1)" in declarg):
          if need_sc1:
            pagemsg(
                "WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s"
                % (wordind, declarg, lemma, infl))
            return None
          else:
            pagemsg(
                "WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s"
                % (wordind, declarg, lemma, infl))
            return None

        arg_set[lemma_arg + 1] = declarg

      # Re-join arg sets with "or" separators, as in the original template.
      if numbered_params:
        numbered_params.append("or")
      numbered_params.extend(arg_set)

    # Now gather all params, including named ones.
    params = []
    params.extend(
        (str(i + 1), val)
        for i, val in zip(xrange(len(numbered_params)), numbered_params))
    num = None
    anim = None
    for p in decl_template.params:
      pname = unicode(p.name)
      val = unicode(p.value)
      if pname == "a":
        anim = val
      elif pname == "n":
        num = val
      elif pname == "notes":
        params.append((pname, val))
      elif pname == "title":
        pagemsg(
            "WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s"
            % (wordind, lemma, infl, val))
      elif re.search("^[0-9]+$", pname):
        # Numbered params were already handled above.
        pass
      else:
        # Per-case overrides (loc=/par=/voc=) are generally dropped because
        # they may not apply in a multi-word lemma; loc= is kept only for
        # whitelisted pages.
        keepparam = True
        if pname == "loc":
          if pagetitle in keep_locative:
            pagemsg(
                "Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s"
                % (wordind, val, lemma, infl))
          else:
            pagemsg(
                "WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s"
                % (wordind, val, lemma, infl))
            keepparam = False
        if pname == "par":
          pagemsg(
              "WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s"
              % (wordind, val, lemma, infl))
          keepparam = False
        if pname == "voc":
          pagemsg(
              "WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s"
              % (wordind, val, lemma, infl))
          keepparam = False
        if keepparam:
          if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
            pagemsg(
                u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s"
                % (wordind, val, lemma, infl))
          # Suffix the param name with the word index (e.g. loc1, loc2).
          pname += str(wordind)
          params.append((pname, val))
    # NOTE(review): the visible code ends here without returning the
    # accumulated (params, False, num, anim) tuple promised by the comment
    # above find_decl_args; the function appears truncated in this copy --
    # verify against the authoritative source.
def find_accented_2(term, termtr, verbose, pagemsg):
  """Look up the accented form of TERM by reading the head templates on its
  Wiktionary page, consulting/updating the module-level accented_cache.

  term -- Russian term, possibly already accented
  termtr -- manual transliteration of TERM (may be empty/None)
  verbose -- unused in the visible body (logging is gated on the module
             global semi_verbose instead) -- TODO confirm
  pagemsg -- logging callback taking a single message string

  Returns (newterm, newtr): the accented head and its translit, or the
  inputs unchanged whenever no unambiguous accented head can be found.
  """
  # Particles that are multisyllabic but deliberately unaccented.
  if term in accentless_multisyllable:
    pagemsg("Not accenting unaccented multisyllabic particle %s" % term)
    return term, termtr
  # This can happen if e.g. we're passed "[[FOO|BAR]] BAZ"; we will reject it,
  # but it will then be word-split and handled correctly ("[[FOO|BAR]]" is
  # special-cased in find_accented_1()).
  if "|" in term:
    #pagemsg("Can't handle links with vertical bars: %s" % term)
    return term, termtr
  # This can happen if e.g. we're passed "[[FOO]] [[BAR]]"; we will reject it,
  # but it will then be word-split and handled correctly ("[[FOO]]" is
  # special-cased in find_accented_1()).
  if "[" in term or "]" in term:
    #pagemsg("Can't handle stray bracket in %s" % term)
    return term, termtr
  if "<" in term or ">" in term:
    pagemsg("Can't handle stray < or >: %s" % term)
    return term, termtr
  # Already accented (combining acute or ё): nothing to look up.
  if u"\u0301" in term or u"ё" in term:
    pagemsg(u"Term has accent or ё, not looking up accents: %s" % term)
    return term, termtr
  if ru.is_monosyllabic(term):
    pagemsg("Term is monosyllabic, not looking up accents: %s" % term)
    return term, termtr
  pagename = ru.remove_accents(term)
  # We can't use expand_text() from find_accented_1() because it has a
  # different value for PAGENAME, and the proper value is important in
  # expanding ru-noun+ and ru-proper noun+.
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagename, pagemsg, semi_verbose)

  # Look up the page
  if semi_verbose:
    pagemsg("find_accented: Finding heads on page %s" % pagename)

  # Cache values: None = page doesn't exist; "redirect" = redirect without
  # heads; otherwise a (heads, saw_head) pair where heads is a set of
  # (term, translit) tuples.
  cached_redirect = False
  global num_cache_lookups
  num_cache_lookups += 1
  if pagename in accented_cache:
    global num_cache_hits
    num_cache_hits += 1
    result = accented_cache[pagename]
    cached = True
    if result is None:
      if semi_verbose:
        pagemsg("find_accented: Page %s doesn't exist (cached)" % pagename)
      return term, termtr
    elif result == "redirect":
      cached_redirect = True
      heads = set()
      saw_head = False
    else:
      heads, saw_head = result
  else:
    cached = False
    page = pywikibot.Page(site, pagename)
    try:
      if not page.exists():
        if semi_verbose:
          pagemsg("find_accented: Page %s doesn't exist" % pagename)
        if not global_disable_cache:
          accented_cache[pagename] = None
        return term, termtr
    except Exception as e:
      # Treat any lookup error like a missing page (and cache that) so a
      # transient failure doesn't abort the whole run.
      pagemsg("WARNING: Error checking page existence: %s" % unicode(e))
      if not global_disable_cache:
        accented_cache[pagename] = None
      return term, termtr

    # Page exists, find the heads
    heads = set()
    def add(val, tr):
      # Record a head with links stripped; ignore empty values.
      val_to_add = blib.remove_links(val)
      if val_to_add:
        heads.add((val_to_add, tr))
    saw_head = False
    for t in blib.parse(page).filter_templates():
      tname = unicode(t.name)
      if tname in ru_head_templates:
        saw_head = True
        if getparam(t, "1"):
          add(getparam(t, "1"), getparam(t, "tr"))
        elif getparam(t, "head"):
          add(getparam(t, "head"), getparam(t, "tr"))
      elif tname == "head" and getparam(t, "1") == "ru":
        saw_head = True
        add(getparam(t, "head"), getparam(t, "tr"))
      elif tname in ["ru-noun+", "ru-proper noun+"]:
        # These templates need expansion to recover the lemma(s); multiple
        # comma-separated lemmas and multiple translits are possible.
        saw_head = True
        lemma = ru.fetch_noun_lemma(t, expand_text)
        lemmas = re.split(",", lemma)
        lemmas = [split_ru_tr(lemma) for lemma in lemmas]
        # Group lemmas by Russian, to group multiple translits
        lemmas = ru.group_translits(lemmas, pagemsg, expand_text)
        for val, tr in lemmas:
          add(val, tr)
      if saw_head:
        # Pick up secondary heads head2=..head9= / tr2=..tr9=.
        # NOTE(review): this runs once saw_head has become True, even for
        # later templates of other kinds -- presumably harmless because
        # such templates lack headN=, but verify.
        for i in xrange(2, 10):
          headn = getparam(t, "head" + str(i))
          if headn:
            add(headn, getparam(t, "tr" + str(i)))
    if not global_disable_cache:
      accented_cache[pagename] = (heads, saw_head)

  # We have the heads
  cached_msg = " (cached)" if cached else ""
  if len(heads) == 0:
    if not saw_head:
      if cached_redirect:
        pagemsg("Redirect without heads (cached)")
      elif not cached and re.match("#redirect", page.text, re.I):
        if not global_disable_cache:
          accented_cache[pagename] = "redirect"
        pagemsg("Redirect without heads")
      else:
        pagemsg("WARNING: Can't find any heads: %s%s" % (pagename, cached_msg))
    return term, termtr
  if len(heads) > 1:
    # Ambiguous: refuse to pick one.
    # NOTE(review): the generator variable `ru` below shadows the `ru`
    # module within this expression -- works, but fragile; rename upstream.
    pagemsg("WARNING: Found multiple heads for %s%s: %s" % (pagename, cached_msg, ",".join("%s%s" % (ru, "//%s" % tr if tr else "") for ru, tr in heads)))
    return term, termtr
  newterm, newtr = list(heads)[0]
  if semi_verbose:
    pagemsg("find_accented: Found head %s%s%s" % (newterm, "//%s" % newtr if newtr else "", cached_msg))
  # Drop trailing !/? from the head if the input lacked it and the terms
  # otherwise match modulo accents.
  if re.search("[!?]$", newterm) and not re.search("[!?]$", term):
    newterm_wo_punc = re.sub("[!?]$", "", newterm)
    if ru.remove_accents(newterm_wo_punc) == ru.remove_accents(term):
      pagemsg("Removing punctuation from %s when matching against %s" % (
        newterm, term))
      newterm = newterm_wo_punc
  # Sanity check: warn (but still return) if the head differs beyond accents.
  if ru.remove_accents(newterm) != ru.remove_accents(term):
    pagemsg("WARNING: Accented term %s differs from %s in more than just accents%s" % (
      newterm, term, cached_msg))
  return newterm, newtr
def process_page(page, index, parsed):
  """Canonicalize final unstressed vowels in {{ru-IPA}} templates and add pos=.

  For a page whose title contains a polysyllabic word ending in -е, scan the
  Russian section's etymology/pronunciation subsections for {{ru-IPA}}
  templates whose corresponding pronunciation word ends in -и, -я, -ы (after
  ц), -а (after an unpaired consonant) or -е̂, rewrite that ending to -е, and
  set pos= on the template based on the part(s) of speech detected from the
  headword/inflection templates in the same subsection (or from a per-page
  override in the global `pages_pos` dict, which is consumed on use).

  Parameters:
    page   -- pywikibot Page object being processed
    index  -- numeric index of the page (used only in log messages)
    parsed -- unused on entry; rebound internally per subsection

  Returns:
    (new_text, notes) where new_text is the (possibly unchanged) page text and
    notes is a list of change summaries (grouped/counted at the end), or None
    (implicit) when the page is skipped early.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    # All diagnostics are prefixed with the page index and title.
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")

  # Per-page POS override; note the entry is deleted (consumed) once seen.
  override_pos = pages_pos.get(pagetitle, None)
  if override_pos:
    del pages_pos[pagetitle]

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  # split_words(..., True) apparently returns words interleaved with
  # separators (even indices = words) -- TODO confirm against split_words.
  titlewords = split_words(pagetitle, True)
  # Fast bail-out: only pages with a polysyllabic title word in final -е can
  # possibly need fixing.
  saw_e = False
  for word in titlewords:
    if word.endswith(u"е") and not rulib.is_monosyllabic(word):
      saw_e = True
      break
  if not saw_e:
    pagemsg(u"No possible final unstressed -е in page title, skipping")
    return
  #if (" " in pagetitle or "-" in pagetitle) and not override_pos:
  #  pagemsg(u"WARNING: Space or hyphen in page title and probable final unstressed -е, not sure how to handle yet")
  #  return

  text = unicode(page.text)
  notes = []

  foundrussian = False
  # Split into language sections on L2 headers; odd indices are the headers.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j - 1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      # Split the Russian section into numbered Etymology/Pronunciation
      # subsections; odd indices are the headers, even indices the bodies.
      subsections = re.split("(^===(?:Etymology|Pronunciation) [0-9]+===\n)",
        sections[j], 0, re.M)
      # If no separate etymology sections, add extra stuff at the beginning
      # to fit the pattern
      if len(subsections) == 1:
        subsections = ["", ""] + subsections

      # Pass 1: locate {{ru-IPA}} templates and decide which subsections
      # contain a pronunciation word that needs its ending canonicalized.
      subsections_with_ru_ipa_to_fix = set()
      subsections_with_ru_ipa = set()
      for k in xrange(0, len(subsections), 2):
        for t in blib.parse_text(subsections[k]).filter_templates():
          if unicode(t.name) == "ru-IPA":
            subsections_with_ru_ipa.add(k)
            if getparam(t, "pos"):
              pagemsg("Already has pos=, skipping template in section %s: %s"
                % (k // 2, unicode(t)))
            else:
              # Pronunciation defaults to phon=, then 1=, then the title.
              phon = (getparam(t, "phon") or getparam(t, "1") or
                pagetitle).lower()
              phonwords = split_words(phon, True)
              if len(phonwords) != len(titlewords):
                # Word counts differ; we can still flag fixable templates
                # based on the pronunciation alone, but not canonicalize
                # against title words.
                pagemsg("WARNING: #Words (%s) in phon=%s not same as #words (%s) in title"
                  % ((len(phonwords) + 1) // 2, phon, (len(titlewords) + 1) // 2))
                for i in xrange(0, len(phonwords), 2):
                  phonword = phonwords[i]
                  wordno = i // 2 + 1
                  if rulib.is_monosyllabic(phonword):
                    pagemsg("Skipping monosyllabic pronun %s (#%s) in section %s: %s"
                      % (phonword, wordno, k // 2, unicode(t)))
                  elif not phonword.endswith(u"е"):
                    pagemsg(u"Skipping pronun word %s (#%s) in section %s because doesn't end in -е"
                      % (phonword, wordno, k // 2))
                  else:
                    pagemsg("Found template that will be modified due to phonword %s (#%s) in section %s: %s"
                      % (phonword, wordno, k // 2, unicode(t)))
                    subsections_with_ru_ipa_to_fix.add(k)
              else:
                # Word counts match: compare each pronunciation word against
                # the corresponding title word.
                for i in xrange(0, len(phonwords), 2):
                  titleword = titlewords[i]
                  phonword = phonwords[i]
                  wordno = i // 2 + 1
                  if rulib.is_monosyllabic(phonword):
                    pagemsg("Skipping monosyllabic pronun %s (#%s) in section %s: %s"
                      % (phonword, wordno, k // 2, unicode(t)))
                  elif not titleword.endswith(u"е"):
                    pagemsg(u"Skipping title word %s (#%s) in section %s because doesn't end in -е"
                      % (titleword, wordno, k // 2))
                  elif re.search(u"([еия]|цы|е̂|[кгхцшжщч]а)" + rulib.DOTABOVE
                      + "?$", phonword):
                    # Fixable ending (possibly followed by a dot-above sign).
                    pagemsg("Found template that will be modified due to phonword %s, titleword %s (#%s) in section %s: %s"
                      % (phonword, titleword, wordno, k // 2, unicode(t)))
                    subsections_with_ru_ipa_to_fix.add(k)
                  elif not re.search(u"[еэѐ][" + rulib.AC + rulib.GR +
                      rulib.CFLEX + rulib.DUBGR + "]?$", phonword):
                    pagemsg(u"WARNING: ru-IPA pronunciation word %s (#%s) doesn't end in [еэия] or е̂ or hard sibilant + [ыа] when corresponding titleword %s ends in -е, something wrong in section %s: %s"
                      % (phonword, wordno, titleword, k // 2, unicode(t)))
                  else:
                    pagemsg(u"Pronun word %s (#%s) with final -э or stressed vowel, ignoring in section %s: %s"
                      % (phonword, wordno, k // 2, unicode(t)))
      if not subsections_with_ru_ipa:
        pagemsg("No ru-IPA on page, skipping page")
        return
      if not subsections_with_ru_ipa_to_fix:
        pagemsg("No fixable ru-IPA on page, skipping page")
        return

      # If saw ru-IPA covering multiple etym sections, make sure we don't
      # also have pronuns inside the etym sections, and then treat as one
      # single section for the purposes of finding POS's
      if 0 in subsections_with_ru_ipa:
        if len(subsections_with_ru_ipa) > 1:
          # NOTE(review): ",".join(k // 2 for ...) joins ints and would raise
          # TypeError if this branch ever fires -- likely needs str(k // 2).
          pagemsg("WARNING: Saw ru-IPA in section 0 (covering multiple etym or pronun sections) and also inside etym/pronun section(s) %s; skipping page"
            % (",".join(k // 2 for k in subsections_with_ru_ipa if k > 0)))
          return
        subsections = ["", "", "".join(subsections)]
        subsections_with_ru_ipa_to_fix = {2}

      # Pass 2: for each fixable subsection, determine the POS and rewrite
      # the ru-IPA templates.
      for k in subsections_with_ru_ipa_to_fix:
        pagemsg("Fixing section %s" % (k // 2))
        parsed = blib.parse_text(subsections[k])

        if override_pos:
          pos = override_pos
        else:
          # Collect candidate POS codes, lemma-ness, and (for non-lemmas)
          # the lemma(s) from the templates in this subsection.
          pos = set()
          is_lemma = set()
          lemma = set()
          saw_acc = False
          saw_noun_form = False
          for t in parsed.filter_templates():
            def getp(param):
              return getparam(t, param)
            tname = unicode(t.name)
            if tname in ["ru-noun", "ru-proper noun"]:
              if getparam(t, "2") == "-":
                pagemsg("Found invariable noun: %s" % unicode(t))
                pos.add("inv")
              else:
                pagemsg("Found declined noun: %s" % unicode(t))
                pos.add("n")
              is_lemma.add(True)
            elif tname in ["ru-noun+", "ru-proper noun+"]:
              # A "+" in any numbered param marks adjectival declension.
              for param in t.params:
                if re.search("^[0-9]+$", unicode(param.name)) and "+" in unicode(param.value):
                  pagemsg("Found declined adjectival noun, treating as adjective: %s"
                    % unicode(t))
                  pos.add("a")
                  break
              else:
                pagemsg("Found declined noun: %s" % unicode(t))
                pos.add("n")
              is_lemma.add(True)
            elif tname == "comparative of" and getp("lang") == "ru":
              pagemsg("Found comparative: %s" % unicode(t))
              pos.add("com")
              is_lemma.add(False)
            elif tname == "ru-adv":
              pagemsg("Found adverb: %s" % unicode(t))
              pos.add("adv")
              is_lemma.add(True)
            elif tname == "ru-adj":
              pagemsg("Found adjective: %s" % unicode(t))
              pos.add("a")
              is_lemma.add(True)
            elif tname == "ru-noun form":
              pagemsg("Found noun form: %s" % unicode(t))
              saw_noun_form = True
              is_lemma.add(False)
            elif tname == "head" and getp("1") == "ru":
              if getp("2") == "verb form":
                pagemsg("Found verb form: %s" % unicode(t))
                pos.add("v")
                is_lemma.add(False)
              elif getp("2") in ["adjective form", "participle form"]:
                pagemsg("Found adjective form: %s" % unicode(t))
                pos.add("a")
                is_lemma.add(False)
              elif getp("2") == "noun form":
                pagemsg("Found noun form: %s" % unicode(t))
                saw_noun_form = True
                is_lemma.add(False)
              elif getp("2") == "pronoun form":
                pagemsg("Found pronoun form: %s" % unicode(t))
                pos.add("pro")
                is_lemma.add(False)
              elif getp("2") == "preposition":
                pagemsg("Found preposition: %s" % unicode(t))
                pos.add("p")
                is_lemma.add(True)
              elif getp("2") == "numeral":
                pagemsg("Found numeral: %s" % unicode(t))
                pos.add("num")
                is_lemma.add(True)
              elif getp("2") == "pronoun":
                pagemsg("Found pronoun: %s" % unicode(t))
                pos.add("pro")
                is_lemma.add(True)
            elif tname == "inflection of" and getp("lang") == "ru":
              is_lemma.add(False)
              lemma.add(rulib.remove_accents(getp("1")))
              if saw_noun_form:
                # Inflection tags come as numbered params, with ";"
                # separating alternative tag groups.
                inflection_groups = []
                inflection_group = []
                for param in t.params:
                  if param.name in ["1", "2"]:
                    continue
                  val = unicode(param.value)
                  if val == ";":
                    if inflection_group:
                      inflection_groups.append(inflection_group)
                      inflection_group = []
                  else:
                    inflection_group.append(val)
                if inflection_group:
                  inflection_groups.append(inflection_group)
                for igroup in inflection_groups:
                  igroup = set(igroup)
                  is_plural = not not ({"p", "plural"} & igroup)
                  if is_plural and ({"nom", "nominative"} & igroup):
                    pagemsg("Found nominative plural case inflection: %s"
                      % unicode(t))
                    pos.add("nnp")
                  elif {"acc", "accusative"} & igroup:
                    # We use "n" for misc cases, but skip accusative for now,
                    # adding "n" later if we haven't seen nnp to avoid problems
                    # below with the check for multiple pos's (nom pl and acc pl
                    # are frequently the same)
                    saw_acc = True
                  elif not is_plural and ({"pre", "prep", "prepositional"} & igroup):
                    pagemsg("Found prepositional singular case inflection: %s"
                      % unicode(t))
                    pos.add("pre")
                  elif not is_plural and ({"dat", "dative"} & igroup):
                    pagemsg("Found dative singular case inflection: %s"
                      % unicode(t))
                    pos.add("dat")
                  elif not is_plural and ({"loc", "locative"} & igroup):
                    # NOTE(review): locative adds "dat", not a separate "loc"
                    # code -- presumably deliberate (treated like dative for
                    # pronunciation purposes); confirm.
                    pagemsg("Found locative singular case inflection: %s"
                      % unicode(t))
                    pos.add("dat")
                  elif not is_plural and ({"voc", "vocative"} & igroup):
                    pagemsg("Found vocative case inflection: %s" % unicode(t))
                    pos.add("voc")
                  else:
                    pos.add("n")
            elif tname == "prepositional singular of" and getp("lang") == "ru":
              pagemsg("Found prepositional singular case inflection: %s"
                % unicode(t))
              pos.add("pre")
              is_lemma.add(False)
              lemma.add(getp("1"))
            elif tname == "dative singular of" and getp("lang") == "ru":
              pagemsg("Found dative singular case inflection: %s" % unicode(t))
              pos.add("dat")
              is_lemma.add(False)
              lemma.add(getp("1"))
            elif tname == "vocative singular of" and getp("lang") == "ru":
              pagemsg("Found vocative case inflection: %s" % unicode(t))
              pos.add("voc")
              is_lemma.add(False)
              lemma.add(getp("1"))

          # Resolve conflicts among collected POS codes.
          if saw_acc and "nnp" not in pos:
            pos.add("n")
          if "dat" in pos and "pre" in pos:
            pagemsg("Removing pos=dat because pos=pre is found")
            pos.remove("dat")
          if "com" in pos:
            if "a" in pos:
              pagemsg("Removing pos=a because pos=com is found")
              pos.remove("a")
            if "adv" in pos:
              pagemsg("Removing pos=adv because pos=com is found")
              pos.remove("adv")
          if "a" in pos and "nnp" in pos:
            pagemsg("Removing pos=nnp because pos=a is found")
            pos.remove("nnp")
          if not pos:
            pagemsg("WARNING: Can't locate any parts of speech, skipping section")
            continue
          if len(pos) > 1:
            pagemsg("WARNING: Found multiple parts of speech, skipping section: %s"
              % ",".join(pos))
            continue
          pos = list(pos)[0]

          # If multiword term or potential adjectival term, can't trust
          # the part of speech coming from the above process
          if (" " in pagetitle or "-" in pagetitle or
              re.search(u"[ыиео]́?е$", pagetitle)):
            if not is_lemma:
              pagemsg("WARNING: Can't determine whether lemma or not, skipping section")
              continue
            if len(is_lemma) > 1:
              pagemsg("WARNING: Found both lemma and non-lemma parts of speech, skipping section")
              continue
            is_lemma = list(is_lemma)[0]
            if (" " in pagetitle or "-" in pagetitle) and is_lemma:
              pagemsg(u"WARNING: Space or hyphen in lemma page title and probable final unstressed -e, not sure how to handle yet, skipping section")
              continue
            # If is_lemma, we are a single-word adjective and will be handled
            # correctly by the above code
            if not is_lemma:
              if not lemma:
                pagemsg("WARNING: Non-lemma form and can't determine lemma, skipping section")
                continue
              if len(lemma) > 1:
                pagemsg("WARNING: Found inflections of multiple lemmas, skipping section: %s"
                  % ",".join(lemma))
                continue
              lemma = list(lemma)[0]
              # Look up per-word declension types on the lemma page so each
              # word of the multiword term gets its own POS code.
              retval = find_noun_word_types(lemma, pagemsg)
              if not retval:
                continue
              word_types, seen_pos_specs = retval
              words = split_words(pagetitle, False)
              assert len(words) == len(word_types)
              modified_word_types = []
              need_to_continue = False
              # FIXME: Should we be using phonetic version of lemma?
              for wordno, (word, ty) in enumerate(zip(words, word_types)):
                if word.endswith(u"е") and not rulib.is_monosyllabic(word):
                  if ty == "inv":
                    # Invariable word: fall back to the pos= spec(s) seen on
                    # the lemma's declension, if unambiguous.
                    if len(seen_pos_specs) > 1:
                      pagemsg(u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma has ambiguous pos= params (%s), not sure what to do, skipping section"
                        % (pagetitle, word, ",".join(seen_pos_specs)))
                      need_to_continue = True
                      break
                    elif not seen_pos_specs:
                      pagemsg(u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma has no pos= params, not sure what to do, skipping section"
                        % (pagetitle, word))
                      need_to_continue = True
                      break
                    else:
                      seen_pos_spec = list(seen_pos_specs)[0]
                      seen_poses = re.split("/", seen_pos_spec)
                      if len(seen_poses) == 1:
                        ty = seen_poses[0]
                      elif len(words) != len(seen_poses):
                        pagemsg(u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma param pos=%s has wrong number of parts of speech, not sure what to do, skipping section"
                          % (pagetitle, word, seen_pos_spec))
                        need_to_continue = True
                        break
                      else:
                        ty = seen_poses[wordno]
                      if not ty:
                        pagemsg("WARNING: Something wrong with retrieved pos= value from lemma, has blank value")
                        need_to_continue = True
                        break
                  if ty == "decln":
                    # Regularly declined word: use the section-level POS.
                    modified_word_types.append(pos)
                  else:
                    modified_word_types.append(ty)
                else:
                  modified_word_types.append("")
              if need_to_continue:
                continue
              non_blank_distinct_mwt = set(x for x in modified_word_types if x)
              if len(non_blank_distinct_mwt) == 0:
                pagemsg("WARNING: Something wrong, pos= would end up blank")
              elif len(non_blank_distinct_mwt) == 1:
                pos = list(non_blank_distinct_mwt)[0]
              else:
                # Per-word POS codes joined with "/" (blank for untouched words).
                pos = "/".join(modified_word_types)

        # Check whether there's a pronunciation with final -е for a given
        # word. There are some entries that have multiple pronunciations,
        # one with final -е and one with something else, e.g. final -и,
        # and we want to leave those alone with a warning.
        saw_final_e = {}
        for t in parsed.filter_templates():
          if unicode(t.name) == "ru-IPA":
            param = "phon"
            phon = getparam(t, param)
            if not phon:
              param = "1"
              phon = getparam(t, "1")
            if not phon:
              param = "pagetitle"
              phon = pagetitle
            if getparam(t, "pos"):
              pass # Already output msg
            else:
              phonwords = split_words(phon, True)
              for i in xrange(0, len(phonwords), 2):
                if re.search(u"е$", phonwords[i]):
                  saw_final_e[i] = True

        # Now modify the templates.
        for t in parsed.filter_templates():
          if unicode(t.name) == "ru-IPA":
            # Track which param supplied the pronunciation so we write the
            # canonicalized value back to the same place.
            param = "phon"
            phon = getparam(t, param)
            if not phon:
              param = "1"
              phon = getparam(t, "1")
            if not phon:
              param = "pagetitle"
              phon = pagetitle
            origt = unicode(t)
            if getparam(t, "pos"):
              pass # Already output msg
            else:
              phonwords = split_words(phon, True)
              mismatched_phon_title = len(phonwords) != len(titlewords)
              for i in xrange(0, len(phonwords), 2):
                titleword = not mismatched_phon_title and titlewords[i]
                phonword = phonwords[i]
                lphonword = phonword.lower()
                wordno = i // 2 + 1
                if rulib.is_monosyllabic(phonword):
                  pass # Already output msg
                elif mismatched_phon_title:
                  pass # Can't canonicalize template
                elif not titleword.endswith(u"е"):
                  pass # Already output msg
                elif re.search(u"([еия]|цы|е̂|[кгхцшжщч]а)" + rulib.DOTABOVE
                    + "?$", lphonword):
                  # Found a template to modify
                  if re.search(u"е" + rulib.DOTABOVE + "?$", lphonword):
                    pass # No need to canonicalize
                  else:
                    if saw_final_e.get(i, False):
                      pagemsg(u"WARNING: Found another pronunciation with final -е, skipping: phon=%s (word #%s)"
                        % (phonword, wordno))
                      continue
                    # Log + record a change note keyed to the exact ending.
                    if re.search(u"и" + rulib.DOTABOVE + "?$", lphonword):
                      pagemsg(u"phon=%s (word #%s) ends in -и, will modify to -е in section %s: %s"
                        % (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"unstressed -и -> -е")
                    elif re.search(u"е̂$", lphonword):
                      # Make this a warning because we're not sure this is correct
                      pagemsg(u"WARNING: phon=%s (word #%s) ends in -е̂, will modify to -е in section %s: %s"
                        % (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"-е̂ -> -е")
                    elif re.search(u"я" + rulib.DOTABOVE + "?$", lphonword):
                      pagemsg(u"phon=%s (word #%s) ends in -я, will modify to -е in section %s: %s"
                        % (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"unstressed -я -> -е")
                    elif re.search(u"цы" + rulib.DOTABOVE + "?$", lphonword):
                      pagemsg(u"phon=%s (word #%s) ends in ц + -ы, will modify to -е in section %s: %s"
                        % (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"unstressed -ы after ц -> -е")
                    elif re.search(u"[кгхцшжщч]а" + rulib.DOTABOVE + "?$",
                        lphonword):
                      pagemsg(u"phon=%s (word #%s) ends in unpaired cons + -а, will modify to -е in section %s: %s"
                        % (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"unstressed -а after unpaired cons -> -е")
                    else:
                      assert False, "Something wrong, strange ending, logic not correct: section %s, phon=%s (word #%s)" % (
                        k // 2, phonword, wordno)
                    # Rewrite the final vowel to -е/-Е, preserving any
                    # trailing dot-above diacritic.
                    newphonword = re.sub(u"(?:[ияыа]|е̂)(" + rulib.DOTABOVE
                      + "?)$", ur"е\1", phonword)
                    newphonword = re.sub(u"(?:[ИЯЫА]|Е̂)(" + rulib.DOTABOVE
                      + "?)$", ur"Е\1", newphonword)
                    pagemsg("Modified phon=%s (word #%s) to %s in section %s: %s"
                      % (phonword, wordno, newphonword, k // 2, unicode(t)))
                    phonwords[i] = newphonword
              newphon = "".join(phonwords)
              if newphon != phon:
                assert param != "pagetitle", u"Something wrong, page title should not have -и or similar that needs modification: section %s, phon=%s, newphon=%s" % (
                  k // 2, phon, newphon)
                if pos in ["voc", "inv", "pro"]:
                  pagemsg(u"WARNING: pos=%s may be unstable or inconsistent in handling final -е, please check change of phon=%s to %s in section %s: %s"
                    % (pos, phon, newphon, k // 2, unicode(t)))
                pagemsg("Modified phon=%s to %s in section %s: %s"
                  % (phon, newphon, k // 2, unicode(t)))
                if pos == "none":
                  pagemsg("WARNING: pos=none, should not occur, not modifying phon=%s to %s in section %s: %s"
                    % (phon, newphon, k // 2, unicode(t)))
                else:
                  t.add(param, newphon)
              if pos == "none":
                pagemsg("WARNING: pos=none, should not occur, not setting pos= in section %s: %s"
                  % (k // 2, unicode(t)))
              else:
                t.add("pos", pos)
                notes.append("added pos=%s%s" % (pos,
                  override_pos and " (override)" or ""))
              pagemsg("Replaced %s with %s in section %s%s" % (origt,
                unicode(t), k // 2,
                override_pos and " (using override)" or ""))

        subsections[k] = unicode(parsed)
      sections[j] = "".join(subsections)
  new_text = "".join(sections)

  def fmt_key_val(key, val):
    # Render a note with its multiplicity, e.g. "added pos=n (2)".
    if val == 1:
      return "%s" % key
    else:
      return "%s (%s)" % (key, val)

  if new_text != text:
    assert notes
    # Group identical notes together and append the number of such identical
    # notes if > 1, putting 'added pos=X' notes before others, so we get e.g.
    # "added pos=n (2); added pos=a; unstressed -и -> -е (2)" from five
    # original notes.
    # 1. Count items in notes[] and return a key-value list in descending order
    notescount = Counter(notes).most_common()
    # 2. Extract 'added pos=X' items; we put them first; note, descending order
    # of # of times each note has been seen is maintained
    added_pos = [(x, y) for x, y in notescount if x.startswith("added pos=")]
    # 3. Extract other items
    not_added_pos = [(x, y) for x, y in notescount
      if not x.startswith("added pos=")]
    # 4. Recreate notes for 'added pos=X', then others
    notes = [fmt_key_val(x, y) for x, y in added_pos]
    notes.extend([fmt_key_val(x, y) for x, y in not_added_pos])

  return new_text, notes