예제 #1
0
def get_all_jyutping(word):
  """Return candidate Jyutping strings for ``word``, highest priority first.

  Candidates come from ``get_all_yue(word)`` followed by the entries stored
  for ``word`` in the mapping returned by ``get_word_to_jyutping_list()``.
  Two further candidates may then be inserted at the front:

  * the string joined from ``get_first_of_all(jyutping.get(word))``, when the
    lookup result is not ``None``, contains no ``None`` entry,
    ``is_unambiguous`` accepts it, and the list built so far does not hold
    exactly one candidate;
  * the result of ``get_word_to_jyutping_corpus_mostfreq(word)`` when it is
    not ``None`` -- it is inserted last, so it ends up first.

  Every returned string has surrounding whitespace and all spaces removed.
  Duplicates are not removed.
  """
  output = list(get_all_yue(word))
  word_to_jyutping_list = get_word_to_jyutping_list()
  if word in word_to_jyutping_list:
    output.extend(word_to_jyutping_list[word])

  jyut = jyutping.get(word)
  if jyut is not None and None not in jyut:
    if is_unambiguous(jyut) and len(output) != 1:
      # No per-item clean-up here: the return statement strips spaces from
      # every candidate, including this one.
      output.insert(0, ''.join(get_first_of_all(jyut)))

  most_frequent = get_word_to_jyutping_corpus_mostfreq(word)
  if most_frequent is not None:
    output.insert(0, most_frequent)
  return [x.strip().replace(' ', '') for x in output]
예제 #2
0
    def get_word_phone_list(self, word_list, using_tool):
        """
        Convert a list of words into phone/tone sequences plus index maps.

        Parameters:
            word_list -- list of words in a sentence, e.g. ["我", "是个"],
                         without non-verbal information
            using_tool -- when true, phones are derived with ``pinyin`` and
                          ``self.chinese_dict``; otherwise the word is looked
                          up in ``word_dict`` with ``jyutping.get`` as the
                          fallback

        NOTE(review): ``word_dict`` is not a parameter of this method; it is
        resolved from an enclosing scope at call time -- confirm it exists.

        Returns a 5-tuple:
            phone_list -- every phone of the sentence, tones removed
            tone_list -- one tone digit (as a string) per syllable
            syl_map -- OrderedDict: phone index -> syllable (char) index
            word_map -- OrderedDict: syllable index -> word index
            non_tone_line_phones -- per syllable, its phones joined into one
                                    string

        Raises:
            ValueError -- if ``jyutping.get`` yields no reading for some
                          character, or a reading cannot be parsed
        """
        phone_list = []
        tone_list = []
        syl_map = OrderedDict()
        word_map = OrderedDict()
        # index of the current phone within the sentence
        phone_index = 0
        # index of the current syllable (char) within the sentence
        char_index = 0
        non_tone_line_phones = []

        for word_index, word in enumerate(word_list):
            word = word.strip()
            if not using_tool:
                try:
                    word_phone = word_dict[word]
                except Exception:
                    # NOTE(review): deliberately left broad. Any failure of
                    # the dictionary lookup has always meant "fall back to
                    # jyutping"; narrowing it could break existing callers.
                    readings = jyutping.get(word)
                    if any(reading is None for reading in readings):
                        raise ValueError(
                            'no jyutping reading for word {!r}'.format(word)
                        ) from None
                    # if a polyphone appears, just pick the first reading
                    word_phone = ''.join(
                        reading[0] if isinstance(reading, list) else reading
                        for reading in readings
                    )
                # Parsing happens for dictionary hits as well as fallbacks;
                # it used to live inside the ``except`` branch, which left
                # ``word_phone_list`` unset (or stale) for dictionary hits.
                if word_phone == 'hng1':
                    # handled by hand instead of going through the parser
                    word_phone_list = [('h', 'ng', '1')]
                elif word_phone == 'ung2':
                    word_phone_list = [('u', 'ng', '2')]
                else:
                    # expected shape: [('j', 'a', 't', '1'), ('g', 'a', 'u', '2'), ...]
                    try:
                        word_phone_list = pc.parse_jyutping(word_phone)
                    except Exception as e:
                        raise ValueError(
                            'cannot parse jyutping {!r} of word {!r}'.format(
                                word_phone, word)
                        ) from e
            else:
                word_phone_list = []
                for character in pinyin(word, style=Style.TONE3):
                    syllable = character[0]
                    if not syllable[-1].isdigit():
                        # the neutral tone is treated as the fifth tone
                        syllable += '5'
                    char_phone_sequence = self.chinese_dict[syllable[:-1]].copy()
                    char_phone_sequence.append(syllable[-1])
                    word_phone_list.append(char_phone_sequence)

            for phone_t in word_phone_list:
                # drop empty slots; the last remaining item must be the tone
                char_phone = [e_phone for e_phone in phone_t if e_phone != '']
                assert char_phone[-1].isdigit()
                char_phone_list = char_phone[:-1]
                for _ in char_phone_list:
                    syl_map[phone_index] = char_index
                    phone_index += 1
                phone_list.extend(char_phone_list)
                tone_list.append(char_phone[-1])
                word_map[char_index] = word_index
                char_index += 1
                non_tone_line_phones.append(''.join(char_phone_list))
        return phone_list, tone_list, syl_map, word_map, non_tone_line_phones
예제 #3
0
# Command-line interface: both file paths are passed straight to ``open()``
# below, so they are mandatory. Without ``required=True`` a missing option
# would surface as a TypeError from ``open(None)`` instead of a usage message.
parser = argparse.ArgumentParser(
    description=
    'Adds jyutping to all entries in the standard Chinese dictionary')
parser.add_argument('--input',
                    dest='inputPath',
                    required=True,
                    help='Input dictionary file')
parser.add_argument('--output',
                    dest='outputPath',
                    required=True,
                    help='Output dictionary file')
args = parser.parse_args()

with open(args.inputPath) as input:
    with open(args.outputPath, "w+") as output:
        lines = input.readlines()

        for line in lines:
            characters = line.split(" ")[0]
            pronunciation = jyutping.get(characters)
            if None in pronunciation:
                pronunciation = []

            for i in range(len(pronunciation)):
                if isinstance(pronunciation[i], list):
                    pronunciation[i] = "/".join(pronunciation[i])

            replPattern = r"\1 \2 [\3] {{{0}}} /\4/".format(
                " ".join(pronunciation))
            newLine = re.sub(r"(.+?) (.+?) \[(.+?)\] \/(.+)\/", replPattern,
                             line)

            # This way we can re-run on an existing dictionary to update the existing entries
            # if the first sub did not do anything. Otherwise the entry should be the same
            newLine = re.sub(r"(.+?) (.+?) \[(.+?)\] {.*?} \/(.+)\/",
예제 #4
0
    def get_word_phone_list(self, word_dict, word_list):
        """
        Convert a list of words into phone/tone sequences plus index maps.

        Parameters:
            word_dict -- mapping from a word to its jyutping string; words
                         that cannot be looked up in it are resolved with
                         ``jyutping.get`` instead
            word_list -- list of words in a sentence ["我","是个"]

        Returns a 5-tuple:
            phone_list -- every phone of the sentence, as stored in
                          ``self.lex_dict`` for each toneless syllable
            tone_list -- one tone digit (as a string) per syllable
            syl_map -- OrderedDict: phone index -> syllable (char) index
            word_map -- OrderedDict: syllable index -> word index
            non_tone_line_phones -- the toneless syllable strings, in order

        Raises:
            ValueError -- if ``jyutping.get`` yields no reading for some
                          character, or a reading cannot be parsed
            KeyError -- if a toneless syllable is missing from
                        ``self.lex_dict``
        """
        phone_list = []
        tone_list = []
        syl_map = OrderedDict()
        word_map = OrderedDict()
        phone_index = 0
        char_index = 0
        non_tone_line_phones = []

        for word_index, word in enumerate(word_list):
            word = word.strip()
            try:
                word_phone = word_dict[word]
            except (LookupError, TypeError):
                # TypeError keeps a non-subscriptable ``word_dict`` (e.g.
                # None) working as "no dictionary", as the previous broad
                # handler did.
                readings = jyutping.get(word)
                # Fail loudly instead of dropping into a debugger and then
                # exiting with status 0, which reported success.
                if any(reading is None for reading in readings):
                    raise ValueError(
                        'no jyutping reading for word {!r}'.format(word)
                    ) from None
                # if a polyphone appears, just pick the first reading
                word_phone = ''.join(
                    reading[0] if isinstance(reading, list) else reading
                    for reading in readings
                )

            if word_phone == 'hng1':
                # handled by hand instead of going through the parser
                jp = [('h', 'ng', '1')]
            elif word_phone == 'ung2':
                jp = [('u', 'ng', '2')]
            else:
                try:
                    jp = pc.parse_jyutping(word_phone)
                except Exception as e:
                    raise ValueError(
                        'cannot parse jyutping {!r} of word {!r}'.format(
                            word_phone, word)
                    ) from e

            for phone_t in jp:
                # drop empty slots; the last remaining item must be the tone
                char_phone = [e_phone for e_phone in phone_t if e_phone != '']
                assert char_phone[-1].isdigit()
                syllable = ''.join(char_phone[:-1])
                try:
                    char_phone_list = self.lex_dict[syllable]
                except KeyError as e:
                    raise KeyError(
                        'syllable {!r} of word {!r} is missing from '
                        'lex_dict'.format(syllable, word)
                    ) from e
                for _ in char_phone_list:
                    syl_map[phone_index] = char_index
                    phone_index += 1
                phone_list.extend(char_phone_list)
                tone_list.append(char_phone[-1])
                word_map[char_index] = word_index
                char_index += 1
                non_tone_line_phones.append(syllable)
        return phone_list, tone_list, syl_map, word_map, non_tone_line_phones