def safetext_token(self, instring):
    """Return the safetext form of a token.

    The terminal token (``c.TERMINAL``) is passed through unchanged; every
    other token is handed to ``naive_util.safetext``, lowercased first when
    ``self.lowercase_safetext`` holds the string ``'True'``.

    :param instring: token text to convert
    :return: ``c.TERMINAL`` for the terminal token, otherwise the result of
        ``naive_util.safetext``
    """
    ## Special handling of terminal token:
    if instring == c.TERMINAL:
        return c.TERMINAL

    # The flag is compared against the string 'True', not a boolean.
    wants_lowercase = self.lowercase_safetext == 'True'
    text = instring.lower() if wants_lowercase else instring
    return naive_util.safetext(text)
def get_custom_segments(self, word):
    """Split a lowercased word into consecutive two-character segments.

    Pairs letters together; use this function if your original text is
    written in phones instead of letters.

    Characters are paired as (0, 1), (2, 3), ...; because the pairing is done
    with ``zip``, a trailing unpaired character is dropped.

    :param word: text to segment
    :return: list with one unicode string per pair, made of the UTF-8 encoded
        safetext of both characters concatenated
    """
    letters = list(word.lower())
    leading = letters[0::2]   # characters at even positions
    trailing = letters[1::2]  # characters at odd positions

    safetext_pairs = []
    for left, right in zip(leading, trailing):
        left_safe = naive_util.safetext(left).encode("utf-8")
        right_safe = naive_util.safetext(right).encode("utf-8")
        safetext_pairs.append(unicode(left_safe + right_safe))
    return safetext_pairs
def safetext_token(self, instring):
    """
    Represent a unicode string as a unique English (ASCII) string so that it
    is easier to handle in later processing.

    :param instring: unicode string
    :return: English (ASCII) string; the terminal token ``c.TERMINAL`` is
        returned unchanged
    """
    ## Special handling of terminal token:
    if instring == c.TERMINAL:
        return c.TERMINAL

    # lowercase_safetext is matched against the string 'True', not a boolean.
    if self.lowercase_safetext == 'True':
        instring = instring.lower()
    return naive_util.safetext(instring)
def get_phonetic_segments(self, word):
    """Split a word into segments and return their wrapped safetext names.

    The lowercased word is scanned left to right. Characters are accumulated
    for as long as the accumulated string is listed in ``self.vi_consonants``;
    otherwise the accumulated segment is emitted and a new one is started.
    A ``'@'`` sentinel is used on both sides: the one in front seeds the scan
    (its segment is dropped from the result) and the one appended to the word
    forces the last real segment to be emitted.

    Each emitted segment is rendered as:
      * the safetext of every item of the matching ``self.vi_cons_phone``
        entry, when the segment is in ``self.vi_consonants``;
      * ``self.name_reps[segment]`` when the segment is a key of that mapping;
      * ``self.get_safetext(segment)`` otherwise.

    A lone ``'g'`` followed by ``i`` (plain or with one of the listed
    diacritics) is turned into ``'gi'`` before rendering. If the final
    segment is in ``self.vi_consonants`` its rendering is prefixed with
    ``"END"``. Finally every rendering has its spaces removed and is wrapped
    in underscores.

    :param word: word to segment
    :return: list of strings of the form ``"_NAME_"``
    """
    # Safetext names of the characters that turn a preceding 'g' into 'gi'.
    gi_triggers = [
        'i',
        '_LATINSMALLLETTERIWITHGRAVE_',
        '_LATINSMALLLETTERIWITHACUTE_',
        '_LATINSMALLLETTERIWITHHOOKABOVE_',
        '_LATINSMALLLETTERIWITHTILDE_',
        '_LATINSMALLLETTERIWITHDOTBELOW_',
    ]

    rendered_segments = []
    raw_segments = []
    pending = '@'
    for current in word.lower() + "@":
        # Keep growing the pending segment while it is still a known consonant.
        if pending + current in self.vi_consonants:
            pending += current
            continue

        if pending == 'g' and naive_util.safetext(current) in gi_triggers:
            pending += 'i'

        if pending in self.vi_consonants:
            phone = self.vi_cons_phone[self.vi_consonants.index(pending)]
            rendered = ''.join(self.get_safetext(unit) for unit in phone)
        elif pending in self.name_reps:
            rendered = self.name_reps[pending]
        else:
            rendered = self.get_safetext(pending)

        rendered_segments.append(rendered)
        raw_segments.append(pending)
        pending = current

    # Mark a consonant that closes the word.
    if raw_segments[-1] in self.vi_consonants:
        rendered_segments[-1] = "END" + rendered_segments[-1]

    wrapped = ["_" + name.replace(" ", "") + "_" for name in rendered_segments]
    # The first entry belongs to the leading '@' sentinel.
    return wrapped[1:]
def process_utterance(self, utt):
    """Look up a pronunciation for every target node and store it on the node.

    For each node matched by ``self.target_nodes`` the text in
    ``self.input_attribute`` is latinised, converted letter by letter to
    safetext, filtered down to letters present in ``self.phone_inventory``
    and joined with ``self.phone_delimiter``. The resulting key is resolved
    in this order, and the source is recorded in the node's ``phones_from``
    attribute:

      * ``'lex'``     -- found in ``self.entries``;
      * ``'lts'``     -- produced by ``self.get_oov_pronunciation``;
      * ``'default'`` -- ``self.backoff_pronunciation``, used when the OOV
        method returns ``None``.

    The pronunciation is written to ``self.output_attribute``.

    :param utt: utterance object providing ``xpath``
    :return: None; nodes are modified in place
    """
    for node in utt.xpath(self.target_nodes):
        assert node.has_attribute(self.input_attribute)
        word = node.get(self.input_attribute)

        ## for now, do indic->latin conversion within lexicon:--
        word_lat = latinise_indian_script_string(word)
        letters = [safetext(lett.lower()) for lett in word_lat]

        ## handle OOV phones:--
        letters = [letter for letter in letters
                   if letter in self.phone_inventory]
        lookup_key = self.phone_delimiter.join(letters)

        if lookup_key in self.entries:
            node.set('phones_from', 'lex')
            pronunciation = self.entries[lookup_key]
        else:
            pronunciation = self.get_oov_pronunciation(lookup_key)
            node.set('phones_from', 'lts')
            # Identity test: None is a singleton, and '==' could be
            # hijacked by a custom __eq__ on the returned object.
            if pronunciation is None:
                pronunciation = self.backoff_pronunciation
                node.set('phones_from', 'default')

        node.set(self.output_attribute, pronunciation)
def get_phonetic_segments(self, word):
    """Split a word into segments, mapping known consonant groups to phones.

    Every character of the lowercased word is converted with
    ``naive_util.safetext``. Adjacent items are merged while their
    concatenation is listed in ``self.vi_consonants``. A segment equal to
    ``'g'`` that is followed by ``i`` (plain or with one of the listed
    diacritics) becomes ``'gi'``; the ``i`` itself is still kept as the next
    segment. Finally each segment found in ``self.vi_consonants`` is replaced
    by the entry at the same index of ``self.vi_cons_phone``.

    :param word: word to segment
    :return: list of segments; an empty list for an empty word
    """
    # Safetext names of the characters that turn a preceding 'g' into 'gi'.
    gi_triggers = (
        'i',
        '_LATINSMALLLETTERIWITHGRAVE_',
        '_LATINSMALLLETTERIWITHACUTE_',
        '_LATINSMALLLETTERIWITHHOOKABOVE_',
        '_LATINSMALLLETTERIWITHTILDE_',
        '_LATINSMALLLETTERIWITHDOTBELOW_',
    )

    letters = [naive_util.safetext(l) for l in word.lower()]
    # Seeding the result with letters[0] used to raise IndexError on an
    # empty word; an empty word simply has no segments.
    if not letters:
        return []

    safetext_letters = [letters[0]]
    for letter in letters[1:]:
        if safetext_letters[-1] + letter in self.vi_consonants:
            safetext_letters[-1] += letter
            continue
        if safetext_letters[-1] == 'g' and letter in gi_triggers:
            safetext_letters[-1] += 'i'
        safetext_letters.append(letter)

    for position, segment in enumerate(safetext_letters):
        if segment in self.vi_consonants:
            safetext_letters[position] = self.vi_cons_phone[
                self.vi_consonants.index(segment)]
    return safetext_letters
def get_phonetic_segments(self, word):
    """
    Get the pronunciation representation of the word ``word``.

    When ``self.use_pinyin`` is set the result comes from ``pinyin.look_up``;
    otherwise each character of the lowercased word is converted with
    ``naive_util.safetext``.

    :param word: a single word
    :return: pronunciation representation
    """
    if self.use_pinyin:
        return pinyin.look_up(word)
    return [naive_util.safetext(letter) for letter in word.lower()]
def word_2_safetext(self, word):
    """Convert a word to a safetext name.

    Words made only of lowercase ASCII letters and the digits 1-6 are handled
    here: each digit is spelled out (``"1"`` -> ``"ONE"`` ... ``"6"`` ->
    ``"SIX"``) and the result is wrapped in underscores. Any other word is
    delegated to ``safetext``.

    :param word: word to convert
    :return: safetext string
    """
    plain_characters = "qwertyuiopasdfghjklmnbvcxz123456"
    for character in word:
        if character not in plain_characters:
            return safetext(word)

    # No spelled-out name contains a digit, so replacement order is irrelevant.
    digit_names = (
        ("1", "ONE"),
        ("2", "TWO"),
        ("3", "THREE"),
        ("4", "FOUR"),
        ("5", "FIVE"),
        ("6", "SIX"),
    )
    for digit, spelled in digit_names:
        word = word.replace(digit, spelled)
    return "_" + word + "_"
def process_utterance(self, utt):
    """Add one ``segment`` child per phone (or pause) to every target node.

    The node's class (``self.class_attribute``) selects the handling:

      * a class in ``self.word_classes``: the text in
        ``self.target_attribute`` is looked up (lowercased) in
        ``self.entries``; on a miss ``self.get_oov_pronunciation`` is
        used, and when that returns ``None`` the pronunciation falls back to
        ``self.backoff_pronunciation``. The pronunciation is split on single
        spaces and each phone becomes a child carrying ``pronunciation`` and
        ``phones_from`` (``'lex'``, ``'lts'`` or ``'default'``). Phones that
        came from ``'lts'`` are mapped through ``ipa2sampa`` when their UTF-8
        encoding is a key of that table.
      * a class in ``self.probable_pause_classes`` or
        ``self.possible_pause_classes``: a single child whose pronunciation
        is ``c.PROB_PAUSE`` / ``c.POSS_PAUSE`` respectively.

    :param utt: utterance object providing ``xpath``
    :return: None; nodes are modified in place
    :raises ValueError: if a node's class is in none of the three class lists
    """
    for node in utt.xpath(self.target_nodes):
        assert node.has_attribute(self.target_attribute)
        current_class = node.attrib[self.class_attribute]

        if current_class in self.probable_pause_classes \
                and current_class not in self.word_classes:
            child = Element('segment')
            child.set('pronunciation', c.PROB_PAUSE)
            node.add_child(child)
            continue
        if current_class in self.possible_pause_classes \
                and current_class not in self.word_classes:
            child = Element('segment')
            child.set('pronunciation', c.POSS_PAUSE)
            node.add_child(child)
            continue
        if current_class not in self.word_classes:
            # Previously this fell through with 'pronunciation' unbound (or,
            # worse, still holding the previous node's value).
            raise ValueError(
                'Class %r not in any of word_classes, '
                'probable_pause_classes, possible_pause_classes'
                % (current_class,))

        word = node.attrib[self.target_attribute]
        lookup_key = word.lower()
        if lookup_key in self.entries:
            phones_from = 'lex'
            pronunciation = self.entries[lookup_key]
        else:
            pronunciation = self.get_oov_pronunciation(word)
            phones_from = 'lts'
            if pronunciation is None:
                pronunciation = self.backoff_pronunciation
                phones_from = 'default'

        phones = pronunciation.split(' ')
        if phones_from == 'lts':
            # Table keys are UTF-8 encoded; unknown phones are kept as they are.
            phones = [ipa2sampa.get(x.encode('utf8'), x) for x in phones]

        for phone in phones:
            child = Element('segment')
            child.set('pronunciation', phone)
            child.set('phones_from', phones_from)
            node.add_child(child)
def get_phonetic_segments(self, word):
    """Return the safetext name of each character of the lowercased word.

    :param word: word to segment
    :return: list with one ``naive_util.safetext`` result per character
    """
    return [naive_util.safetext(letter) for letter in word.lower()]