def _preprocess_token(t) -> Token:
    # Examples from the CHILDES LeeWongLeung corpus, child mhz
    # e.g., mor is suk1&DIM=uncle, word is 叔叔
    # e.g., mor is ngo5-PL=I, word is 我
    try:
        jyutping_mor, _, eng = t.mor.partition("=")
    except AttributeError:
        return Token(t.word, t.pos, None, t.mor, t.gra)
    if "-" in jyutping_mor:
        jyutping, _, mor = jyutping_mor.partition("-")
    elif "&" in jyutping_mor:
        jyutping, _, mor = jyutping_mor.partition("&")
    else:
        jyutping = jyutping_mor
        mor = ""
    mor = f"{mor}={eng}" if eng else mor
    try:
        parse_jyutping(jyutping)
    except ValueError:
        jyutping = None
    return Token(t.word, t.pos, jyutping or None, mor or None, t.gra)
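

# Hedged usage sketch (illustration only, not part of the module): a
# SimpleNamespace stands in for a token read from the CHILDES LeeWongLeung
# corpus; the real reader objects carry the same word/pos/mor/gra attributes.
def _example_preprocess_token():
    from types import SimpleNamespace

    fake_childes_token = SimpleNamespace(
        word="叔叔", pos="N", mor="suk1&DIM=uncle", gra=None
    )
    # Expected result: jyutping "suk1" is split off, and "DIM=uncle" is kept
    # as the mor field of the returned Token.
    return _preprocess_token(fake_childes_token)
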
def _get_words_characters_to_jyutping():
    corpus = hkcancor()
    words_to_jyutping_counters = defaultdict(Counter)
    characters_to_jyutping_counters = defaultdict(Counter)

    for word, _, jyutping, _ in corpus.tagged_words():
        if not jyutping or not word:
            continue
        try:
            parsed_jp = parse_jyutping(jyutping)
        except ValueError:
            continue
        if len(word) != len(parsed_jp):
            continue
        words_to_jyutping_counters[word][jyutping] += 1
        for char, jp in zip(word, parsed_jp):
            characters_to_jyutping_counters[char]["".join(jp)] += 1

    words_to_jyutping = {}
    for word, jyutping_counter in words_to_jyutping_counters.items():
        jp = jyutping_counter.most_common(1)[0][0]
        words_to_jyutping[word] = jp

    characters_to_jyutping = {}
    for character, jyutping_counter in characters_to_jyutping_counters.items():
        jp = jyutping_counter.most_common(1)[0][0]
        characters_to_jyutping[character] = jp

    words_to_jyutping = {
        # The ordering of the following dicts matters. The rime-cantonese
        # data may contain what's been re-segmented by this repo, and may
        # contain jyutping pronunciations for particular characters that
        # are only used in those contexts. The data from HKCanCor should come
        # last to act as the default and override such cases.
        **{
            k: v
            for k, v in LETTERED.items()
            if len(split_characters_with_alphanum(k)) > 1
        },
        **{k: v for k, v in CHARS_TO_JYUTPING.items() if len(k) > 1},
        **words_to_jyutping,
    }

    # TODO: Extract characters from CHARS_TO_JYUTPING and LETTERED
    #   and add them to characters_to_jyutping

    characters_to_jyutping = {
        # The ordering of the following dicts matters. The rime-cantonese
        # data may contain what's been re-segmented by this repo, and may
        # contain jyutping pronunciations for particular characters that
        # are only used in those contexts. The data from HKCanCor should come
        # last to act as the default and override such cases.
        **{k: v for k, v in LETTERED.items() if len(k) == 1},
        **{k: v for k, v in CHARS_TO_JYUTPING.items() if len(k) == 1},
        **characters_to_jyutping,
    }

    return words_to_jyutping, characters_to_jyutping
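

# Hedged usage sketch (illustration only): build the two mappings and look
# up a multi-character word and a single character. This assumes the bundled
# HKCanCor corpus is available through hkcancor(); the lookups use .get()
# because any particular entry may or may not be present.
def _example_words_characters_to_jyutping():
    words_to_jyutping, characters_to_jyutping = (
        _get_words_characters_to_jyutping()
    )
    word_jp = words_to_jyutping.get("廣東話")   # e.g., "gwong2dung1waa2"
    char_jp = characters_to_jyutping.get("話")  # single-character lookup
    return word_jp, char_jp
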
def _resegment_chars_jyutping_data(json_filename):
    json_path = os.path.join(_THIS_DIR, json_filename)
    with open(json_path, encoding="utf8") as f:
        chars_to_jyutping = json.load(f)

    with open(os.path.join(_THIS_DIR, _RESEGMENTED_FILENAME), encoding="utf8") as f:
        resegmented = {}
        for line in f:
            line = line.strip()
            if not line or line.startswith("# ") or " " not in line:
                continue
            resegmented[line.replace(" ", "")] = line

    new_chars_to_jyutping = {}
    for chars, jp in chars_to_jyutping.items():
        if chars in resegmented:
            chars_split = split_characters_with_alphanum(chars)
            jp_split = parse_jyutping(jp)
            # Don't bother if we can't match each jyutping syllable
            # with each Cantonese character.
            if len(chars_split) != len(jp_split):
                new_chars_to_jyutping[chars] = jp
            else:
                new_words = resegmented[chars].split()
                i = 0
                for new_word in new_words:
                    # If this new word already exists in the original
                    # mapping, don't re-add it to the new map, or else
                    # we risk altering this word's jyutping representation
                    # (some Cantonese words/characters have multiple
                    # pronunciations, and we've already chosen the more
                    # frequent one according to the rime-cantonese source).
                    if new_word in chars_to_jyutping:
                        i += len(split_characters_with_alphanum(new_word))
                        continue
                    new_jp_for_word = ""
                    for _ in range(len(split_characters_with_alphanum(new_word))):
                        new_jp_for_word += "".join(jp_split[i])
                        i += 1
                    new_chars_to_jyutping[new_word] = new_jp_for_word
        else:
            new_chars_to_jyutping[chars] = jp

    with open(json_path, "w", encoding="utf8") as f:
        json.dump(new_chars_to_jyutping, f, indent=4, ensure_ascii=False)
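

# Hedged illustration of the resegmentation step above, using in-memory
# values instead of the JSON/resegmented files (the entry and its split are
# hypothetical). A multi-character entry whose jyutping parses into one
# syllable per character is divided into per-word entries.
def _example_resegment_logic():
    chars = "體育館"                       # hypothetical rime-cantonese entry
    jp = "tai2juk6gun2"
    resegmented = {chars: "體育 館"}       # hypothetical resegmented line
    jp_split = parse_jyutping(jp)         # one parsed syllable per character
    new_entries = {}
    i = 0
    for new_word in resegmented[chars].split():
        n_chars = len(split_characters_with_alphanum(new_word))
        new_entries[new_word] = "".join(
            "".join(syllable) for syllable in jp_split[i:i + n_chars]
        )
        i += n_chars
    return new_entries  # {"體育": "tai2juk6", "館": "gun2"}
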
def jyutping_to_tipa(jp_str):
    """Convert Jyutping romanization into LaTeX TIPA.

    .. versionadded:: 3.0.0
        This function replaces the deprecated equivalent ``jyutping2tipa``.

    Parameters
    ----------
    jp_str : str
        Jyutping romanization for one or multiple characters

    Returns
    -------
    list[str]

    Raises
    ------
    ValueError
        If the Jyutping romanization is illegal (e.g., with unrecognized
        elements).

    Examples
    --------
    >>> jyutping_to_tipa("gwong2dung1waa2")  # 廣東話, Cantonese  # doctest: +SKIP
    ['k\\super w ON25', 'tUN55', 'wa25']
    """  # noqa: E501
    jp_parsed_list = parse_jyutping(jp_str)
    tipa_list = []

    for jp_parsed in jp_parsed_list:
        onset = jp_parsed[0]
        # TODO: Separate "final" as "nucleus" and "coda" instead?
        final = jp_parsed[1] + jp_parsed[2]
        tone = jp_parsed[3]
        tipa = ONSETS_TIPA[onset] + FINALS_TIPA[final]
        tipa = tipa.strip() + TONES_TIPA[tone]
        tipa_list.append(tipa)

    return tipa_list
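

# Hedged usage sketch (illustration only): the returned strings are TIPA
# markup, and one way to typeset them is inside \textipa{} with the LaTeX
# tipa package, one syllable per command.
def _example_jyutping_to_tipa_latex():
    syllables = jyutping_to_tipa("gwong2dung1waa2")  # 廣東話
    return " ".join("\\textipa{%s}" % s for s in syllables)
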
def jyutping_to_yale(jp_str, as_list=True):
    """Convert Jyutping romanization into Yale romanization.

    .. versionadded:: 3.0.0
        This function replaces the deprecated equivalent ``jyutping2yale``.

    .. versionchanged:: 3.0.0
        ``as_list`` has its default value switched from ``False`` to ``True``,
        so that by default the function returns a list, which is in line with
        the other "jyutping_to_X" functions.

    Parameters
    ----------
    jp_str : str
        Jyutping romanization for one or multiple characters
    as_list : bool, optional
        If False (default is True), the output is a string with a single
        quote ``'`` inserted to disambiguate unclear syllable boundaries
        (e.g., a consonant or the low-tone marker "h" being ambiguous as an
        onset or as part of the previous syllable).

    Returns
    -------
    list[str], or str if as_list is False

    Raises
    ------
    ValueError
        If the Jyutping romanization is illegal (e.g., with unrecognized
        elements).

    Examples
    --------
    >>> jyutping_to_yale("gwong2dung1waa2")  # 廣東話, Cantonese
    ['gwóng', 'dūng', 'wá']
    >>> jyutping_to_yale("gwong2dung1waa2", as_list=False)
    'gwóngdūngwá'
    >>>
    >>> # 'heihauh' would be ambiguous between hei3hau6 and hei6au6.
    >>> jyutping_to_yale("hei3hau6", as_list=False)  # 氣候, climate
    "hei'hauh"
    """
    jp_parsed_list = parse_jyutping(jp_str)
    yale_list = []

    for jp_parsed in jp_parsed_list:
        onset = ONSETS_YALE[jp_parsed.onset]
        nucleus = NUCLEI_YALE[jp_parsed.nucleus]
        coda = CODAS_YALE[jp_parsed.coda]
        tone = jp_parsed.tone  # still the parse_jyutping tone digit

        # The Yale system uses "h" to mark the three low tones.
        if tone in {"4", "5", "6"}:
            low_tone_h = "h"
        else:
            low_tone_h = ""

        # In Yale, the long "aa" vowel with no coda is written as "a".
        if nucleus == "aa" and coda == "":
            nucleus = "a"

        # When the nucleus is "yu"...
        # 1. disallow "yyu" (when onset is "y")
        # 2. change nucleus "yu" into "u" -- this is a hack for adding the
        #    tone diacritic, since we don't want "y" to bear the diacritic
        if nucleus == "yu":
            if onset == "y":
                onset = ""
            nucleus = "u"

        # When the nucleus is "ng", the tone diacritic has to be on "g" but
        # not "n". We pretend that the nucleus is "g" and prepend the "n"
        # back at the end.
        if nucleus == "ng":
            nucleus = "g"

        # Add the Yale tone diacritic to the first nucleus letter:
        #   parse_jyutping tone 1      --> add macron
        #   parse_jyutping tone 2 or 5 --> add acute
        #   parse_jyutping tone 4      --> add grave
        #   parse_jyutping tone 3 or 6 --> (no diacritic)
        # If the accented letter doesn't exist in Unicode, use the combining
        # accent instead.
        letter = nucleus[0]  # first letter of the nucleus
        unicode_letter_name = unicodedata.name(letter)

        if tone == "1":
            try:
                letter_with_diacritic = unicodedata.lookup(
                    unicode_letter_name + " WITH MACRON")
            except KeyError:
                letter_with_diacritic = letter + "\u0304"
        elif tone in {"2", "5"}:
            try:
                letter_with_diacritic = unicodedata.lookup(
                    unicode_letter_name + " WITH ACUTE")
            except KeyError:
                letter_with_diacritic = letter + "\u0301"
        elif tone == "4":
            try:
                letter_with_diacritic = unicodedata.lookup(
                    unicode_letter_name + " WITH GRAVE")
            except KeyError:
                letter_with_diacritic = letter + "\u0300"
        else:
            # either tone 3 or tone 6
            letter_with_diacritic = letter
        nucleus = letter_with_diacritic + nucleus[1:]

        # Add back "y" if the nucleus is "yu"
        # ("y" was taken away for convenience in adding the tone diacritic).
        if jp_parsed.nucleus == "yu":
            nucleus = "y" + nucleus

        # Add back "n" if the nucleus is "ng"
        # ("n" was taken away so that the tone diacritic is on "g", not "n").
        if jp_parsed.nucleus == "ng":
            nucleus = "n" + nucleus

        # The Jyutping final "eu" corresponds to Yale "ew" (not "eu").
        if coda == "u" and nucleus == "e":
            coda = "w"

        # Assemble the resulting Yale syllable.
        if coda in {"i", "u", "w"} and tone in {"4", "5", "6"}:
            yale = onset + nucleus + coda + low_tone_h
        else:
            yale = onset + nucleus + low_tone_h + coda
        yale_list.append(yale)

    if as_list:
        return yale_list

    # Output yale_list as a string.
    # Check if there's potential ambiguity when Yale strings are concatenated.
    # Ambiguity case 1:
    #   the 1st syllable's coda is one of the "ambiguous_consonants",
    #   and the 2nd syllable starts with a vowel *letter*.
    # Ambiguity case 2:
    #   the 1st syllable has no coda, and the 2nd syllable starts with one of
    #   the "ambiguous_consonants".
    #   e.g., hei3hau6 "climate" --> heihauh
    #   (is the middle "h" the tone mark of the 1st syllable, or the onset of
    #   the 2nd syllable?)
    if len(yale_list) == 0:
        return ""
    elif len(yale_list) == 1:
        return yale_list[0]

    ambiguous_consonants = {"h", "p", "t", "k", "m", "n", "ng"}
    vowel_letters = {
        "a", "e", "i", "o", "u",
        "á", "é", "í", "ó", "ú",
        "à", "è", "ì", "ò", "ù",
        "ā", "ē", "ī", "ō", "ū",
    }

    output_str = ""

    for i in range(len(yale_list) - 1):
        yale1 = yale_list[i]
        yale2 = yale_list[i + 1]

        ambiguous = False

        # test case 1:
        if _endswithoneof(yale1, ambiguous_consonants) and _startswithoneof(
                yale2, vowel_letters):
            ambiguous = True

        # test case 2:
        if (not ambiguous
                and not _endswithoneof(yale1, ambiguous_consonants)
                and _startswithoneof(yale2, ambiguous_consonants)):
            ambiguous = True

        output_str += yale1

        if ambiguous:
            output_str += "'"

    output_str += yale_list[-1]

    return output_str
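

# Small sketch of the diacritic fallback used above (illustration only):
# when no precomposed "<LETTER> WITH MACRON" codepoint exists, a combining
# macron is appended instead, which is what the except-KeyError branches do
# for each tone.
def _example_diacritic_fallback(letter="a"):
    import unicodedata

    name = unicodedata.name(letter)
    try:
        return unicodedata.lookup(name + " WITH MACRON")  # e.g., "ā"
    except KeyError:
        return letter + "\u0304"  # fall back to the combining macron
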
def perform_search(
    fn_to_tagged_sents,
    onset=None,
    nucleus=None,
    coda=None,
    tone=None,
    initial=None,
    final=None,
    jyutping=None,
    character=None,
    pos=None,
    word_range=(0, 0),
    sent_range=(0, 0),
    tagged=True,
    sents=False,
):
    """
    Overall strategy: deal with jp (and all the jp-related search elements)
    first, and then the character.

    1. jp

    Hierarchy of jp and the associated search elements:

        jp
        |-- onset/initial
        |-- final
        |   |-- nucleus
        |   `-- coda
        `-- tone

    Lower search elements cannot be used together with the higher elements
    that dominate them.
    """
    # ensure tuple type: word_range and sent_range
    if not (type(word_range) == type(sent_range) == tuple):
        raise ValueError("word_range and sent_range must be tuples")

    words_left, words_right = word_range
    sents_left, sents_right = sent_range

    # ensure int type: words_left, words_right, sents_left, sents_right
    if not (
        type(words_left) == type(words_right)
        == type(sents_left) == type(sents_right) == int
    ):
        raise ValueError("int required for {words, sents}_{left, right}")

    if sents_left > 0 or sents_right > 0:
        sents = True

    # determine what kinds of search we are doing
    character_search = False
    jp_search = False
    pos_search = False

    if character:
        character_search = True
    if onset or nucleus or coda or tone or final or jyutping:
        jp_search = True
    if pos:
        pos_search = True

    if not (character_search or jp_search or pos_search):
        raise ValueError("no search elements")

    # check if the jyutping search is valid
    jp_search_tuple = (None, None, None, None)

    if jp_search:
        # ensure compatible jyutping search elements
        if final and (nucleus or coda):
            raise ValueError(
                "final cannot be used together with "
                "either nucleus or coda (or both)"
            )
        if jyutping and (onset or final or nucleus or coda or tone):
            raise ValueError(
                "jyutping cannot be used together with other "
                "Jyutping elements"
            )
        if (onset != initial) and onset and initial:
            raise ValueError("onset conflicts with initial")

        # onset/initial
        if initial:
            onset = initial

        # determine jp_search_tuple
        if jyutping:
            try:
                jp_search_list = parse_jyutping(jyutping)
            except ValueError:
                raise ValueError("invalid jyutping -- %s" % (repr(jyutping)))
            if len(jp_search_list) > 1:
                raise ValueError("only jyutping for one character is allowed")
            else:
                jp_search_tuple = jp_search_list[0]
        else:
            if final:
                nucleus, coda = parse_final(final)
            jp_search_tuple = (onset, nucleus, coda, tone)

    fn_to_results = {}

    for fn, tagged_sents in fn_to_tagged_sents.items():
        sent_word_index_pairs = []

        for i_sent, tagged_sent in enumerate(tagged_sents):
            for i_word, tagged_word in enumerate(tagged_sent):
                c_characters, c_pos, c_mor, _ = tagged_word  # c = current
                c_jyutping = get_jyutping_from_mor(c_mor)

                # determine character_match and pos_match
                if character_search:
                    character_match = character in c_characters
                else:
                    character_match = True

                if pos_search:
                    pos_match = bool(re.search(pos, c_pos))
                else:
                    pos_match = True

                if not (character_match and pos_match):
                    continue

                # determine if the jyutping search matches c_jyutping
                jyutping_match = False

                if not jp_search:
                    jyutping_match = True
                elif not c_jyutping:
                    pass
                else:
                    try:
                        c_parsed_jyutpings = parse_jyutping(c_jyutping)
                    except ValueError:
                        continue
                    for c_parsed_jyutping in c_parsed_jyutpings:
                        booleans = [
                            _jp_element_match(search_, current_)
                            for search_, current_ in zip(
                                jp_search_tuple, c_parsed_jyutping
                            )
                        ]
                        if all(booleans):
                            jyutping_match = True
                            break

                if jyutping_match:
                    sent_word_index_pairs.append((i_sent, i_word))

        results_list = []

        for i_sent, i_word in sent_word_index_pairs:
            if not sents:
                tagged_sent = tagged_sents[i_sent]
                i_word_start = i_word - words_left
                i_word_end = i_word + words_right + 1

                if i_word_start < 0:
                    i_word_start = 0
                if i_word_end > len(tagged_sent):
                    i_word_end = len(tagged_sent)

                words_wanted = tagged_sent[i_word_start:i_word_end]

                if not tagged:
                    words_wanted = [x[0] for x in words_wanted]

                if len(words_wanted) == 1:
                    words_wanted = words_wanted[0]

                results_list.append(words_wanted)
            else:
                i_sent_start = i_sent - sents_left
                i_sent_end = i_sent + sents_right + 1

                if i_sent_start < 0:
                    i_sent_start = 0
                if i_sent_end > len(tagged_sents):
                    i_sent_end = len(tagged_sents)

                sents_wanted = tagged_sents[i_sent_start:i_sent_end]

                if not tagged:
                    for i, sent in enumerate(sents_wanted[:]):
                        sents_wanted[i] = [x[0] for x in sent]

                if len(sents_wanted) == 1:
                    sents_wanted = sents_wanted[0]

                results_list.append(sents_wanted)

        fn_to_results[fn] = results_list

    return fn_to_results
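

# Hedged usage sketch (illustration only): the caller supplies
# fn_to_tagged_sents, a dict mapping each filename to its tagged sentences,
# where every tagged word is a 4-tuple of (characters, pos, mor, ...) and
# the Jyutping is recovered from the mor field via get_jyutping_from_mor.
def _example_perform_search(fn_to_tagged_sents):
    # All words whose Jyutping onset is "b", returned untagged with one word
    # of context on each side.
    return perform_search(
        fn_to_tagged_sents, onset="b", word_range=(1, 1), tagged=False
    )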