def extract_components(input, orthography='honzi_jcz'):
    """Convert one mixed honzi+latin dictionary entry into output tokens.

    *input* must be a single dictionary entry (asserted below: PyCantonese
    returns exactly one (chars, jyutping) pair for it). For 'jcz_only' the
    full jyutping syllable list is returned; for 'honzi_jcz' each syllable
    that lines up with a non-latin glyph is emitted either as the glyph
    itself or, when the glyph is in the module-level ``mouth_list``, as its
    jyutping syllable.
    """
    assert orthography in {'honzi_jcz', 'jcz_only'}

    def _is_ascii_letter(ch):
        return ch.isascii() and ch.isalpha()

    parses = pc.characters_to_jyutping(input)
    assert len(parses) == 1
    _, jyutping_str = parses[0]
    syllables = pc.parse_jyutping(jyutping_str)

    if orthography == 'jcz_only':
        return [str(syl) for syl in syllables]

    # Per-glyph lookups for the non-latin, non-hyphen glyphs of the entry;
    # each candidate is a (glyph, jyutping) pair.
    candidates = [
        pc.characters_to_jyutping(ch)[0]
        for ch in input
        if not _is_ascii_letter(ch) and ch != "-"
    ]

    result = []
    cand_pos = 0
    for pos, syl in enumerate(syllables):
        if cand_pos >= len(candidates):
            # No glyph candidates left: emit the remaining syllables as-is.
            result.extend(str(s) for s in syllables[pos:])
            break
        honzi, cand_jyutping = candidates[cand_pos]
        # Compare with the tone digit stripped from both sides.
        if str(syl)[:-1] == cand_jyutping[:-1]:
            result.append(str(syl) if honzi in mouth_list else honzi)
            cand_pos += 1
        else:
            result.append(str(syl))
    return result
def search_single_char(self, word_dict, m_char):
    """Find the jyutping syllable of *m_char* via multi-character entries.

    Scans *word_dict* for the first key containing *m_char*, parses that
    entry's jyutping value, and returns the syllable at the character's
    position joined into one string (onset+nucleus+coda+tone).
    Returns None when no key contains the character.
    """
    for entry, entry_jyutping in word_dict.items():
        if m_char not in entry:
            continue
        position = entry.index(m_char)
        syllables = pc.parse_jyutping(entry_jyutping)
        return ''.join(list(syllables[position]))
    return None
def get_word_phone_list(word_dict, word_list):
    """
    get phone list and phone array of word_list

    Parameters:
    word_dict -- mapping from word (or single character) to its jyutping string
    word_list -- list of words in a sentence ["我","是个"]

    Returns:
    phone_list -- flat list of phones for the whole sentence
    tone_list -- one tone digit (str) per syllable
    syl_map -- OrderedDict: phone index -> syllable/character index
    word_map -- OrderedDict: syllable/character index -> word index
    """
    phone_list = []
    tone_list = []
    syl_map = OrderedDict()
    word_map = OrderedDict()
    phone_index = 0   # running phone index across the sentence
    char_index = 0    # running syllable/character index
    word_index = 0    # running word index
    # toneless syllable -> phone sequence lookup table
    lex_dict = mld.get_lexicon_dict('./lexicon.txt')
    for word in word_list:
        word = word.strip()
        try:
            word_phone = word_dict[word]
        except Exception as e:  # NOTE(review): broad; a KeyError is what is expected here
            word_phone_list = []
            # need find one by one
            for char in list(word):
                try:
                    char_phone = word_dict[char]
                except Exception as e:
                    char_phone = mld.search_single_char(word_dict, char)
                    if char_phone is None:
                        # NOTE(review): only logs the character — the
                        # .strip() below will raise AttributeError on None
                        print(char)
                char_phone_list = char_phone.strip().split(" ")
                word_phone_list.extend(char_phone_list)
            word_phone = ''.join(word_phone_list)
        # one (onset, nucleus, coda, tone) tuple per syllable
        jp = pc.parse_jyutping(word_phone)
        for phone_t in jp:
            char_phone = list(phone_t)
            # drop empty slots (e.g. a syllable with no coda)
            char_phone = [e_phone for e_phone in char_phone if e_phone != '']
            assert char_phone[-1].isdigit()  # last slot must be the tone digit
            # map the toneless syllable to its phone sequence
            char_phone_list = lex_dict[''.join(char_phone[:-1])]
            for my_phone in char_phone_list:
                syl_map[phone_index] = char_index
                phone_index = phone_index + 1
            phone_list.extend(char_phone_list)
            tone_list.append(char_phone[-1])
            word_map[char_index] = word_index
            char_index = char_index + 1
            # non_tone_line_phones.append(''.join(char_phone[:-1]))
        word_index = word_index + 1
    # logging.debug("phone_list:" + ' '.join(phone_list))
    return phone_list, tone_list, syl_map, word_map
def pycantonese_converter(input, cur_cmu, orthography='honzi_jcz', sep_eng_words=True):
    """Convert *input* text into a list of syllables/glyphs via PyCantonese.

    input -- original text to transliterate
    cur_cmu -- CMU dictionary forwarded to words_to_jyutping for English words
    orthography -- 'honzi_jcz' (keep honzi; replace mouth-radical glyphs by
        their jyutping) or 'jcz_only' (jyutping for everything parseable)
    sep_eng_words -- when True, non-space whitespace is preserved and word
        spacing is delegated to words_to_jyutping

    Returns (outs, unparsed): converted tokens and the set of words
    words_to_jyutping could not convert.
    """
    assert orthography in {'honzi_jcz', 'jcz_only'}
    frags = re.split(r'(\s+)', input)
    # preserve whitespace except space character
    frags = list(filter(lambda x: x not in {'', ' '}, frags))
    outs, unparsed = [], set()
    prev_was_eng_word = False  # used for adding space between words
    for frag in frags:
        # keep non-space whitespace (tabs/newlines) verbatim
        if frag.isspace() and (frag != ' ' and sep_eng_words):
            outs.append(frag)
            prev_was_eng_word = False
            continue
        for glyphs, jpings in pc.characters_to_jyutping(frag):
            if jpings is not None:
                jping_arr = pc.parse_jyutping(jpings)
                if re.search('[a-zA-Z]', glyphs):
                    # this is a dictionary entry with both honzi and latin characters
                    outs.extend(
                        extract_components(glyphs, orthography=orthography))
                else:
                    if orthography == 'jcz_only':
                        outs.extend([str(x) for x in jping_arr])
                    else:
                        # honzi_jcz: keep each honzi glyph unless it is in
                        # mouth_list, in which case emit its jyutping syllable
                        for i, glyph in enumerate(glyphs):
                            outs.append(
                                str(jping_arr[i]) if glyph in mouth_list else glyph)
                prev_was_eng_word = False
            elif len(glyphs) == 1 and not is_alphanum(
                    glyphs):  #glyphs in puncs or glyphs.isdigit():
                # single punctuation/digit glyph passes through unchanged
                outs.append(glyphs)
                prev_was_eng_word = False
            else:
                # un-annotated latin word: fall back to English-word conversion
                outs, unparsed, prev_was_eng_word = words_to_jyutping(
                    glyphs, cur_cmu, outs, unparsed,
                    prev_was_eng_word and sep_eng_words)
    return outs, unparsed
def test_invalid_onset():
    """An unknown onset ("sh") is rejected with a ValueError."""
    with pytest.raises(ValueError, match="onset error"):
        parse_jyutping("shaa1")
def test_invalid_nucleus():
    """A syllable with no valid nucleus ("sk3") raises a ValueError."""
    with pytest.raises(ValueError, match="nucleus error"):
        parse_jyutping("sk3")
def test_invalid_coda():
    """An invalid coda ("l" in "leil3") raises a ValueError."""
    with pytest.raises(ValueError, match="coda error"):
        parse_jyutping("leil3")
def test_fewer_than_2_characters():
    """A bare tone digit is too short to be a syllable."""
    with pytest.raises(ValueError, match="fewer than 2 characters"):
        parse_jyutping("3")
def test_no_tone():
    """A syllable missing its tone digit raises a ValueError."""
    with pytest.raises(ValueError, match="tone error"):
        parse_jyutping("lei")
def tojyutping_converter(input, cur_cmu, orthography='honzi_jcz', sep_eng_words=True):
    """Convert *input* text into a list of syllables/glyphs via ToJyutping.

    input -- original input from transliterate
    cur_cmu -- CMU dictionary forwarded to words_to_jyutping for English words
    orthography -- 'honzi_jcz' (keep honzi; replace mouth-radical glyphs by
        their jyutping) or 'jcz_only' (jyutping syllables only)
    sep_eng_words -- forwarded to words_to_jyutping to control word spacing

    Returns (outs, unparsed): converted tokens and the set of words
    words_to_jyutping could not convert.
    """
    outs, unparsed, word = [], set(), ""
    # ToJyutping has better pronunciation values compared to PyCantonese
    tj_parse = ToJyutping.get_jyutping_list(input)

    # Re-group consecutive un-annotated latin/digit characters into words,
    # i.e. convert [('做', 'zou6'), ('g', None), ('y', None), ('m', None)]
    # to [('做', 'zou6'), ('gym', None)]
    new_tj_parse, word, ptr = [], "", 0
    while ptr < len(tj_parse):
        glyph, jping = tj_parse[ptr]
        if not is_loweralphanum(
                glyph):  # note sinoglyphs are alphabetic in unicode
            if glyph != " ":  # bare spaces are dropped here
                new_tj_parse.append(tj_parse[ptr])
            ptr += 1
        else:
            word = ""
            while is_loweralphanum(
                    glyph) and jping is None and ptr < len(tj_parse):
                glyph, jping = tj_parse[ptr]
                if is_loweralphanum(glyph) and jping is None:
                    word += glyph
                    ptr += 1
                else:
                    break
            if word != " ":
                new_tj_parse.append((word, None))

    prev_was_eng_word = False  # used for adding space between words
    # Perform the conversion to a honzi-jyutping mix,
    # i.e. convert [('做', 'zou6'), ('gym', None)] to ['做','zim1']
    for glyphs, jpings in new_tj_parse:
        if jpings is not None:
            # still parsed unconditionally so malformed jyutping raises
            # for either orthography, as before
            jping_arr = pc.parse_jyutping(jpings.replace(" ", ""))
            if orthography == 'jcz_only':
                # BUGFIX: extend once per entry. Previously this ran inside
                # the per-glyph loop below, duplicating the syllables
                # len(glyphs) times for multi-glyph entries (the
                # pycantonese_converter counterpart extends only once).
                outs.extend(jpings.split(" "))
            elif orthography == 'honzi_jcz':
                for j, glyph in enumerate(glyphs):
                    # mouth-radical glyphs are emitted as jyutping syllables
                    outs.append(
                        str(jping_arr[j]) if glyph in mouth_list else glyph)
            prev_was_eng_word = False
        elif len(glyphs) == 1 and not is_alphanum(
                glyphs):  #glyphs in puncs or glyphs.isdigit():
            # single punctuation/digit glyph passes through unchanged
            outs.append(glyphs)
            prev_was_eng_word = False
        else:
            # un-annotated latin word: fall back to English-word conversion
            outs, unparsed, prev_was_eng_word = words_to_jyutping(
                glyphs, cur_cmu, outs, unparsed,
                prev_was_eng_word and sep_eng_words)
    return outs, unparsed
# NOTE(review): fragment — the `if` and `i = i + 1` below continue a loop
# whose header is outside this view; the indentation chosen here is a
# best-effort reconstruction. Also note the pop-while-indexing pattern:
# popping and then incrementing `i` skips the element after a removal —
# confirm against the full loop.
if (not (re.match('X.+', word[1]) == None) and not (word[0] == '鴨寮街')):
    allwords[file].pop(i)
i = i + 1
# 揸fit, call機x3, 操fitx2 were also excluded
# Parse every remaining word's jyutping; collect failures separately.
parsed_words = []
unparsed_words = []
parsed_word_files = []
for file in allwords.keys():
    i = 0;
    # strip the Windows path prefix down to the hkcancor-relative filename
    filename = re.sub("[A-Z]:\\\\.*\\\\hkcancor\\\\","",file)
    for word in allwords[file]:
        try:
            print("Word parsed: ", word)
            parsed_words.append(pc.parse_jyutping(word[2]));
            parsed_word_files.append(filename);
        except ValueError:
            unparsed_words.append(word)
            print("Error: The word ", word[2], " cannot be parsed.")
        except IndexError:
            unparsed_words.append(word)
            print("Error: The word ", word[2], " cannot be parsed.")
# Flatten the parsed words into individual syllables.
parsed_syls = []
parsed_syls_files = []
i = 0;
for word in parsed_words:
    for syl in word:
        parsed_syls.append(syl);
def test_wrong_data_type():
    """A non-string argument (int) is rejected with a ValueError."""
    pytest.raises(ValueError, parse_jyutping, 123)
def test_basic_case_gwong2dung1waa2():
    """A three-syllable word parses into (onset, nucleus, coda, tone) tuples."""
    expected = [
        ("gw", "o", "ng", "2"),
        ("d", "u", "ng", "1"),
        ("w", "aa", "", "2"),
    ]
    assert parse_jyutping("gwong2dung1waa2") == expected
def get_word_phone_list(self,word_dict, word_list):
    """
    get phone list and phone array of word_list

    Parameters:
    word_dict -- mapping from word to its jyutping string
    word_list -- list of words in a sentence ["我","是个"]

    Returns:
    phone_list -- flat list of phones for the whole sentence
    tone_list -- one tone digit (str) per syllable
    syl_map -- OrderedDict: phone index -> syllable/character index
    word_map -- OrderedDict: syllable/character index -> word index
    non_tone_line_phones -- one toneless syllable string per syllable
    """
    phone_list = []
    tone_list = []
    syl_map = OrderedDict()
    word_map = OrderedDict()
    phone_index = 0   # running phone index across the sentence
    char_index = 0    # running syllable/character index
    word_index = 0    # running word index
    non_tone_line_phones = []
    for word in word_list:
        word = word.strip()
        try:
            word_phone = word_dict[word]
        except Exception as e:
            # OOV word: fall back to the `jyutping` package per-character
            temp_word_phone = jyutping.get(word)
            temp_word_phone_renew = []
            # if polyphone appear, just pick first one
            for char_phone in temp_word_phone:
                if isinstance(char_phone,list):
                    temp_word_phone_renew.append(char_phone[0])
                else:
                    temp_word_phone_renew.append(char_phone)
            if temp_word_phone[0] == None:
                # NOTE(review): debugger left in the lookup-failure path;
                # this will hang non-interactive runs before exiting
                print(word)
                pdb.set_trace()
                exit(0)
            word_phone = ''.join(temp_word_phone_renew)
        # special-case syllabic nasals that pc.parse_jyutping rejects
        if word_phone == 'hng1':
            jp = [('h','ng','1')]
        elif word_phone == 'ung2':
            jp = [('u','ng','2')]
        else:
            try:
                jp = pc.parse_jyutping(word_phone)
            except Exception as e:
                # NOTE(review): debugger left in; if parsing fails here, `jp`
                # keeps its previous value (or is undefined on first word)
                pdb.set_trace()
        for phone_t in jp:
            char_phone = list(phone_t)
            # drop empty slots (e.g. a syllable with no coda)
            char_phone = [e_phone for e_phone in char_phone if e_phone != '']
            assert char_phone[-1].isdigit()  # last slot must be the tone digit
            try:
                # map the toneless syllable to its phone sequence
                char_phone_list = self.lex_dict[''.join(char_phone[:-1])]
            except Exception as e:
                # NOTE(review): debugger left in the missing-lexicon path
                pdb.set_trace()
            for my_phone in char_phone_list:
                syl_map[phone_index] = char_index
                phone_index = phone_index + 1
            phone_list.extend(char_phone_list)
            tone_list.append(char_phone[-1])
            word_map[char_index] = word_index
            char_index = char_index + 1
            non_tone_line_phones.append(''.join(char_phone[:-1]))
        word_index = word_index + 1
    # logging.debug("phone_list:" + ' '.join(phone_list))
    return phone_list, tone_list, syl_map, word_map,non_tone_line_phones
# NOTE(review): notebook-export fragment — the statements below almost
# certainly sit inside a loop over age groups that supplies `age` and
# `tagged_words`; the flat indentation here is a best-effort guess.
age_to_tones[age] = Counter()
for tagged_word in tagged_words:
    # jyutping should be like "gaa1jau2" (two syllables), "ngo5" (one syllable) etc
    mor = tagged_word[2]
    # keep only the part before any '-' and '&' annotation markers
    jyutping, _, _ = mor.partition('-')
    jyutping, _, _ = jyutping.partition('&')
    if not jyutping:
        continue
    # use PyCantonese to parse the "jyutping" str
    try:
        jyutping_parsed_list = pc.parse_jyutping(jyutping)
    except:  # NOTE(review): bare except silently skips unparseable entries
        continue
    for jyutping_parsed in jyutping_parsed_list:
        _, _, _, tone = jyutping_parsed  # (onset, nucleus, coda, tone)
        age_to_tones[age][tone] += 1

# Creating the dataframe for plotting the desired heatmap
# ---------------------------------------------------------------
#
# The dataframe has three columns and is created by `data_dict`.

# In[17]:
def test_coda_ng():
    """The velar nasal coda "ng" is separated from the nucleus."""
    parsed = parse_jyutping("hoeng1")
    assert parsed == [("h", "oe", "ng", "1")]
def test_no_noda():  # NOTE(review): name is likely a typo for test_no_coda
    """A syllable without a coda yields an empty coda slot."""
    parsed = parse_jyutping("gaa1")
    assert parsed == [("g", "aa", "", "1")]
def test_null_input(input_):
    """Null-ish inputs parse to an empty list.

    NOTE(review): relies on a parametrize decorator (not visible in this
    view) to supply `input_`.
    """
    parsed = parse_jyutping(input_)
    assert parsed == []
def get_word_phone_list(self, word_list, using_tool):
    """
    get phone list and phone array of word_list

    Parameters:
    word_list -- list of words in a sentence ["我","是个"] without non-verbal information
    using_tool -- whether use tool (pypinyin) instead of dictionary to fetch phone sequence
    TO DO: add more functions for language support

    :return
    phone list : ph e m e j
    tone list : 1 2 3
    syl_map: [p1:s1,p2,s1,p3,s1,p4,p4]
    word_map
    non_tone_line_phones

    NOTE(review): the docstring above mentions `word_dict`, which is read in
    the body but is neither a parameter nor visibly defined here — unless it
    exists as a module-level/global name this raises NameError. Confirm.
    """
    flag = False  # NOTE(review): never used in this body
    phone_list = []
    tone_list = []
    syl_map = OrderedDict()
    word_map = OrderedDict()
    # phone index the index of phone in one sentence
    phone_index = 0
    # char index the index of char in one sentence
    char_index = 0
    word_index = 0
    non_tone_line_phones = []
    for word in word_list:
        word = word.strip()
        # get the phone or word
        if not using_tool:
            try:
                word_phone = word_dict[word]
            except Exception as e:
                # OOV word: fall back to the `jyutping` package per-character
                temp_word_phone = jyutping.get(word)
                temp_word_phone_renew = []
                # if polyphone appear, just pick first one
                for char_phone in temp_word_phone:
                    if isinstance(char_phone, list):
                        temp_word_phone_renew.append(char_phone[0])
                    else:
                        temp_word_phone_renew.append(char_phone)
                word_phone = ''.join(temp_word_phone_renew)
            # word_phone [('j', 'a', 't', '1'),
            # ('g', 'a', 'u', '2'),
            # ('s', 'e', 'i', '3'),
            # ('g', 'a', 'u', '2'),
            # ('n', 'i', 'n', '4')]
            # special-case syllabic nasals that pc.parse_jyutping rejects
            if word_phone == 'hng1':
                word_phone_list = [('h', 'ng', '1')]
            elif word_phone == 'ung2':
                word_phone_list = [('u', 'ng', '2')]
            else:
                try:
                    word_phone_list = pc.parse_jyutping(word_phone)
                except Exception as e:
                    # NOTE(review): debugger left in; on failure
                    # word_phone_list keeps its previous value
                    pdb.set_trace()
        else:
            # Mandarin path: derive phones per character via pypinyin
            word_phone_list = []
            # word = HanziConv.toSimplified(word)
            for character in pinyin(word, style=Style.TONE3):
                if not character[0][-1].isdigit():
                    # neutral tone is treated as the fifth tone
                    character[0] += '5'
                # assert character[0][-1].isdigit()
                char_phone_sequence = []
                char_phone_sequence = self.chinese_dict[character[0][:-1]].copy()
                char_phone_sequence.append(character[0][-1])
                word_phone_list.append(char_phone_sequence)
        for phone_t in word_phone_list:
            char_phone = phone_t
            # drop empty slots (e.g. a syllable with no coda)
            char_phone = [
                e_phone for e_phone in char_phone if e_phone != ''
            ]
            assert char_phone[-1].isdigit()  # last slot must be the tone digit
            char_phone_list = char_phone[:-1]  # phones without the tone
            for my_phone in char_phone_list:
                syl_map[phone_index] = char_index
                phone_index = phone_index + 1
            phone_list.extend(char_phone_list)
            tone_list.append(char_phone[-1])
            word_map[char_index] = word_index
            char_index = char_index + 1
            non_tone_line_phones.append(''.join(char_phone[:-1]))
        word_index = word_index + 1
    # logging.debug("phone_list:" + ' '.join(phone_list))
    return phone_list, tone_list, syl_map, word_map, non_tone_line_phones
def test_syllabic_nasals():
    """Standalone nasals (ng/m/n) parse with empty onset and coda."""
    # TODO assert parse_jyutping('hm4') == [('h', 'm', '', '4')]
    cases = [
        ("ng5", ("", "ng", "", "5")),
        ("m4", ("", "m", "", "4")),
        ("n3", ("", "n", "", "3")),
    ]
    for syllable, expected in cases:
        assert parse_jyutping(syllable) == [expected]