def create_priors(priors, isym, osym, code):
    """Create a linear FST over the ';'-separated prior symbols and append a
    <sigma> (joker) placeholder state at the end.

    Args:
        priors (str): ';'-separated prior symbols.
        isym, osym: input/output symbol tables for the new FST.
        code (dict): symbol -> integer label lookup.

    Returns:
        (fst.Fst, int): the FST and the id of the <sigma> placeholder state.
    """
    prior_syms = priors.split(";")
    # init a transducer
    f = fst.Fst()
    f.set_input_symbols(isym)
    f.set_output_symbols(osym)
    s0 = f.add_state()
    f.set_start(s0)
    old = s0
    sig = "<sigma>"
    # add the priors as a linear chain, one arc per symbol
    for sym in prior_syms:
        new = f.add_state()
        f.add_arc(old, fst.Arc(code[sym], code[sym],
                               fst.Weight(f.weight_type(), 1.0), new))
        old = new
    new = f.add_state()
    # <sigma> placeholder: entry arc plus a self-loop so it can later be
    # expanded into arcs for every symbol
    f.add_arc(old, fst.Arc(code[sig], code[sig],
                           fst.Weight(f.weight_type(), 1.0), new))
    f.add_arc(new, fst.Arc(code[sig], code[sig],
                           fst.Weight(f.weight_type(), 1.0), new))
    return f, new
def generate_phone_sequence_recognition_wfst(n):
    """Generate an HMM WFST that recognises any single phone sequence in the lexicon.

    Args:
        n (int): states per phone HMM

    Returns:
        fst.Fst: the constructed WFST (log semiring)
    """
    f = fst.Fst('log')
    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)
    for _, phone in phone_table:
        if phone != '<eps>':
            tmp_state = f.add_state()
            # uniform prior 1/N over phones; in the log semiring an arc weight
            # is -log(prob), so this must be -log(1/N) = log(N).
            # (Original code used -log(N), i.e. the sign-flipped weight.)
            # NOTE(review): num_symbols() also counts <eps>; confirm whether
            # N-1 is the intended denominator.
            weight = fst.Weight('log', -math.log(1.0 / phone_table.num_symbols()))
            f.add_arc(start_state, fst.Arc(0, 0, weight, tmp_state))
            last_state = generate_phone_wfst(f, tmp_state, phone, n)
            f.set_final(last_state)
            # free (probability-one) epsilon arc back to the start so
            # sequences of phones are accepted
            weight = fst.Weight('log', -math.log(1))
            f.add_arc(last_state, fst.Arc(0, 0, weight, start_state))
    return f
def generate_word_sequence_recognition_wfst(n):
    """Generate an HMM WFST that recognises any single word sequence for words
    in the lexicon.

    Args:
        n (int): states per phone HMM

    Returns:
        fst.Fst: the constructed WFST (log semiring)
    """
    f = fst.Fst('log')
    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)
    for _, word in word_table:
        if word != '<eps>':
            tmp_state = f.add_state()
            # uniform prior 1/N over words; log-semiring arc weight is
            # -log(prob), so this must be -log(1/N) = log(N).
            # (Original code used -log(N), i.e. the sign-flipped weight.)
            # NOTE(review): num_symbols() also counts <eps>; confirm whether
            # N-1 is the intended denominator.
            weight = fst.Weight('log', -math.log(1.0 / word_table.num_symbols()))
            f.add_arc(start_state, fst.Arc(0, 0, weight, tmp_state))
            word_wfst = generate_word_wfst(f, tmp_state, word, n)
            # free epsilon arc from the word's last state back to the start
            weight = fst.Weight('log', -math.log(1.0))
            f.add_arc(list(word_wfst.states())[-1],
                      fst.Arc(0, 0, weight, start_state))
            # NOTE(review): no state is marked final here — confirm whether a
            # set_final on the word's last state is missing.
    return f
def generate_phone_wfst(f, start_state, phone, n):
    """Generate a WFST representing an n-state left-to-right phone HMM.

    Args:
        f (fst.Fst): an FST object, assumed to exist already.
        start_state (int): index of the first state, assumed to exist already.
        phone (str): the phone label.
        n (int): number of HMM states excluding start and end.

    Returns:
        int: the final state of the phone model.
    """
    out_label = phone_table.find(phone)
    # transition weights are loop-invariant: 0.1 self-loop, 0.9 forward
    self_loop_weight = fst.Weight('log', -math.log(0.1))
    forward_weight = fst.Weight('log', -math.log(0.9))
    state = start_state
    for idx in range(1, n + 1):
        in_label = state_table.find('{}_{}'.format(phone, idx))
        # self-loop on the current emitting state (no output)
        f.add_arc(state, fst.Arc(in_label, 0, self_loop_weight, state))
        # forward transition, emitting the phone label
        nxt = f.add_state()
        f.add_arc(state, fst.Arc(in_label, out_label, forward_weight, nxt))
        state = nxt
    return state
def build_refiner(isyms_fname, refiner_fname):
    """Build the refiner FST and write it to disk.

    The refiner helps extract the last two states (i.e. the single last arc)
    of a machine: any prefix of ordinary symbols is consumed in place, and
    exactly one final symbol is kept.
    """
    # read isyms and invert the table into a symbol -> label lookup
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    code = {sym: label for label, sym in input_syms}
    # build refiner
    refiner = fst.Fst()
    refiner.set_input_symbols(input_syms)
    refiner.set_output_symbols(input_syms)
    s0 = refiner.add_state()
    s1 = refiner.add_state()
    eps_label = code["<epsilon>"]
    unit = fst.Weight(refiner.weight_type(), 1.0)
    for sym, label in code.items():
        # skip <epsilon> (label 0) and special symbols with labels >= 100
        if label != 0 and label < 100:
            # either erase the symbol and stay in s0 ...
            refiner.add_arc(s0, fst.Arc(label, eps_label, unit, s0))
            # ... or keep it and accept: only the last symbol survives
            refiner.add_arc(s0, fst.Arc(label, label, unit, s1))
    refiner.set_start(s0)
    refiner.set_final(s1)
    # save refiner
    refiner.write(refiner_fname)
def generate_WFST_final_probability(n, lex, weight_fwd, weight_self, weights_final, original=False):
    """Generate an HMM WFST recognising any single word sequence for words in
    the lexicon, with a per-word final-state probability.

    Args:
        n (int): states per phone HMM
        lex: lexicon accepted by parse_lexicon
        weight_fwd (float): forward transition probability (passed through to
            generate_phone_wfst)
        weight_self (float): self-loop probability (passed through to
            generate_phone_wfst)
        weights_final (dict): word -> probability of its final state
        original (bool): True/False - original/optimized lexicon

    Returns:
        (fst.Fst, SymbolTable): the constructed WFST and the word table
    """
    f = fst.Fst('log')
    # -log(1) == 0: the "free" (probability-one) arc weight in the log semiring
    none_weight = fst.Weight('log', -math.log(1))
    lex = parse_lexicon(lex, original)
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)
    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)
    for word, phone_list in lex.items():
        # a word may have several pronunciations; each gets its own branch
        for phones in phone_list:
            initial_state = f.add_state()
            # epsilon input, word label output at the start of the branch
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight, initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)
            f.set_final(current_state)
            # free epsilon arc back to the start so word sequences are accepted
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))
            # final word state should be current state; this weighted set_final
            # overrides the unweighted one just above
            prob = weights_final[word]
            weight = fst.Weight('log', -math.log(prob))
            f.set_final(current_state, weight)
            # print(f"Current state: {current_state} for word {word} is prob {prob} with log prob{(weight)}")
    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
def build_lm(dev_fname, isyms_fname, constraints, lattice_output, refiner_fname): """ Make a lattice that maps lemmas and constraints (or priors) to an inflected version """ # rewrite constraints constraints = constraints.replace("_",";") # read isyms input_syms = fst.SymbolTable.read_text(isyms_fname) s_fin = '</s>' code = {} for ltr, c in input_syms: code[c]=ltr # init the lattice f_big = fst.Fst() f_big.set_input_symbols(input_syms) f_big.set_output_symbols(input_syms) for line in open(dev_fname,'r').readlines(): cns, lemma, inflection = line.split()[-3:] if cns == constraints: print(cns, lemma, inflection) # find idx that the strings diverge idx = 0 for i, (lm, flc) in enumerate(zip(lemma, inflection)): if lm !=flc: idx = i break f, old= create_priors(cns, input_syms, input_syms, code) keep = old for j in range(idx,len(lemma)): new = f.add_state() f.add_arc(old, fst.Arc(code[lemma[j]], code[lemma[j]], fst.Weight(f.weight_type(), 1.0), new)) old = new new = f.add_state() # the residual of the lemma is mapped to the inflection residual (indirectly) sym = lemma[idx:]+"_"+inflection[idx:] print(lemma, inflection, sym) f.add_arc(old, fst.Arc(code[sym], code[s_fin], fst.Weight(f.weight_type(), 1.0), new)) #f.add_arc(old, fst.Arc(code[inflection[idx:]], code[s_fin], fst.Weight(f.weight_type(), 1.0), new)) #f.add_arc(old, fst.Arc(code[s_fin], code[inflection[idx:]], fst.Weight(f.weight_type(), 1.0), new)) f.set_final(new) f_big.union(f) f_big = fst.determinize(f_big.rmepsilon()) # add <sigma> state in the <sigma place holder> for c, ltr in code.items(): if int(ltr)>1 and int(ltr)<36: # (hard coded) symbols of Runssian + 2 more f_big.add_arc(keep, fst.Arc(code[c], code[c], fst.Weight(f_big.weight_type(), 1.0), keep)) f_big.invert() # save lattice f_big.write(lattice_output)
def creation_automata():
    """Build a small example automaton from a nested transition dictionary,
    print it, then emit GraphViz source for it.

    The dictionary maps source-state label -> {arc label "in:out:weight" ->
    list of destination-state labels}. Relies on the module-level `automate`,
    `automate_states` and `add_automate_state`.

    Returns:
        fst.Fst: the global automaton.
    """
    transitions = {"s0": {"1:1:0": ["s0", "s1"], "2:3:1": ["s0", "s2"]}}
    # First pass: create every state (sources and destinations).
    # (.iteritems() was Python 2 only; .items() is the Python 3 equivalent.)
    for src_state_label, arcs in transitions.items():
        add_automate_state(src_state_label)
        for arc_label, set_dsts_states in arcs.items():
            # destinations are plain lists, so they are iterated directly
            for dst_state_label in set_dsts_states:
                add_automate_state(dst_state_label)
    # Second pass: add the arcs, decoding each "in:out:weight" label.
    for state_label, arcs in transitions.items():
        for arc_label, set_dsts_states in arcs.items():
            chars = arc_label.split(':')
            for dst_state_label in set_dsts_states:
                automate.add_arc(
                    automate_states[state_label],
                    fst.Arc(int(chars[0]), int(chars[1]),
                            fst.Weight(automate.weight_type(), int(chars[2])),
                            automate_states[dst_state_label]))
    automate.set_start(automate_states['s0'])
    automate.set_final(automate_states['s2'],
                       fst.Weight(automate.weight_type(), 1.5))
    print(automate)
    # GraphViz output: first the nodes with their labels ...
    i = 0
    print("digraph G {")
    for state_label, state in automate_states.items():
        index = state_label.split("s")[1]
        display_node = index + " [label = \"" + state_label + "\"]"
        i += 1
        print(display_node)
    # ... then the edges with their labels
    for src_state_label, arcs in transitions.items():
        src_index = src_state_label.split("s")[1]
        for arc_label, set_dsts_states in arcs.items():
            for dst_state_label in set_dsts_states:
                dst_index = dst_state_label.split("s")[1]
                display_edge = src_index + "->" + dst_index + " [label = \"" + arc_label + "\"]"
                print(display_edge)
    print("}")
    return (automate)
def test_simple(self):
    """Smoke test: a three-state FST with two arcs and a weighted final state."""
    machine = fst.Fst()
    states = [machine.add_state() for _ in range(3)]
    start, mid, last = states
    machine.add_arc(start, fst.Arc(1, 1, fst.Weight(machine.weight_type(), 3.0), mid))
    machine.add_arc(start, fst.Arc(1, 1, fst.Weight.One(machine.weight_type()), last))
    machine.set_start(start)
    machine.set_final(last, fst.Weight(machine.weight_type(), 1.5))
    # check the fst's state count and final weight
    self.assertEqual(machine.num_states(), 3)
    self.assertAlmostEqual(float(machine.final(last)), 1.5)
def OpenFST_Automata_Example():
    """Demonstrate the basic OpenFst API: three states, three weighted arcs,
    a start state and a weighted final state; prints the state ids and the FST."""
    machine = fst.Fst()
    first, second, third = (machine.add_state() for _ in range(3))
    wt = machine.weight_type()
    machine.add_arc(first, fst.Arc(1, 2, fst.Weight(wt, 3.0), second))
    machine.add_arc(first, fst.Arc(1, 3, fst.Weight.One(wt), third))
    machine.add_arc(second, fst.Arc(2, 1, fst.Weight(wt, 1.0), third))
    machine.set_start(first)
    machine.set_final(third, fst.Weight(wt, 1.5))
    print(first, second, third)
    print(machine)
def generate_phone_wfst(f, start_state, phone, n, state_table, phone_table, weight_fwd, weight_self):
    """Generate a WFST representing an n-state left-to-right phone HMM.

    Args:
        f (fst.Fst): an FST object, assumed to exist already.
        start_state (int): index of the first state, assumed to exist already.
        phone (str): the phone label.
        n (int): number of states of the HMM.
        state_table: symbol table for the HMM-state input labels.
        phone_table: symbol table for the phone output labels.
        weight_fwd (float or None): forward transition probability (None -> unweighted).
        weight_self (float or None): self-loop probability (None -> unweighted).

    Returns:
        int: the final state of the phone model.
    """
    current_state = start_state
    # weights are loop-invariant, so build them once (use `is None`, not `== None`)
    sl_weight = None if weight_self is None else fst.Weight(
        'log', -math.log(weight_self))  # weight for self-loop
    next_weight = None if weight_fwd is None else fst.Weight(
        'log', -math.log(weight_fwd))  # weight for forward
    for i in range(1, n + 1):
        in_label = state_table.find('{}_{}'.format(phone, i))
        # self-loop back to current state
        f.add_arc(current_state,
                  fst.Arc(in_label, 0, sl_weight, current_state))
        # transition to next state; the phone label is emitted only on the
        # final state (if outputting words instead this should be modified)
        out_label = phone_table.find(phone) if i == n else 0
        next_state = f.add_state()
        f.add_arc(current_state,
                  fst.Arc(in_label, out_label, next_weight, next_state))
        current_state = next_state
    return current_state
def Automata_Building(ref_string, levenshtein_distance, output_weight):
    """Build the Levenshtein automaton for `ref_string` as an fst.Fst.

    First represents the automaton as a dictionary, then creates all states
    (via the module-level `add_automate_state`, which records them in
    `automate_states`) and finally adds the arcs, decoding each
    "transmitted::consumed::weight" label.

    Args:
        ref_string (str): the reference string.
        levenshtein_distance (int): maximum edit distance.
        output_weight (float): weight of the final state.

    Returns:
        fst.Fst: the global automaton.
    """
    dict_automata = Levenshtein_Automata_Dico(ref_string, levenshtein_distance)
    # print(dict_automata)
    label_initial_state = "0;0"
    label_final_state = str(len(ref_string)) + ";" + str(levenshtein_distance)
    # Create every state (sources and destinations alike).
    # (.iteritems() was Python 2 only; .items() is the Python 3 equivalent.)
    state_index = 1
    for src_label, set_arcs in dict_automata.items():
        state_index = add_automate_state(src_label, state_index)
        for arc_label, dst_states in set_arcs.items():
            for dst_label in dst_states:
                state_index = add_automate_state(dst_label, state_index)
    # print(automate_states)
    # Create the arcs of the automaton.
    for src_label, set_arcs in dict_automata.items():
        for arc_label, dst_states in set_arcs.items():
            label_info = arc_label.split("::")
            transmitted_char = int(convertSymToLabel(label_info[0]))
            consummed_char = int(convertSymToLabel(label_info[1]))
            weight = int(label_info[2])
            src_state_index = automate_states[src_label][1]
            print(transmitted_char, consummed_char, weight)
            for dst_label in dst_states:
                # print(dst_label)
                dst_state_index = automate_states[dst_label][1]
                automate.add_arc(
                    src_state_index,
                    fst.Arc(transmitted_char, consummed_char,
                            fst.Weight(automate.weight_type(), weight),
                            dst_state_index))
    automate.set_start(automate_states[label_initial_state][1])
    automate.set_final(automate_states[label_final_state][1],
                       fst.Weight(automate.weight_type(), output_weight))
    automate.draw("automata.dot")
    print(automate)
    return (automate)
def Automata_Building(ref_string, levenshtein_distance, output_weight):
    """Build the Levenshtein automaton for `ref_string` as an fst.Fst.

    Variant that uses ':'-separated arc labels and a label-keyed
    `automate_states` dict (no per-state index tuples).

    Args:
        ref_string (str): the reference string.
        levenshtein_distance (int): maximum edit distance.
        output_weight (float): weight of the final state.

    Returns:
        fst.Fst: the global automaton.
    """
    levenshtein_automata = Levenshtein_Automata_Dico(ref_string,
                                                     levenshtein_distance)
    # print(levenshtein_automata)
    label_inital_state = "0;0"
    label_final_state = str(len(ref_string)) + ";" + str(levenshtein_distance)
    # Create every state (sources and destinations alike).
    # (.iteritems() was Python 2 only; .items() is the Python 3 equivalent.)
    for src_label, set_arcs in levenshtein_automata.items():
        add_automate_state(src_label)
        for arc_label, set_dsts in set_arcs.items():
            for dst_label in set_dsts:
                add_automate_state(dst_label)
    print(automate)
    # Create the arcs of the automaton, decoding "transmitted:consumed:weight".
    for src_label, set_arcs in levenshtein_automata.items():
        for arc_label, set_dsts in set_arcs.items():
            transmitted_char = arc_label.split(":")[0]
            consummed_char = arc_label.split(":")[1]
            weight = arc_label.split(":")[2]
            print(transmitted_char, consummed_char, weight)
            for dst_label in set_dsts:
                automate.add_arc(
                    automate_states[src_label],
                    fst.Arc(int(convertSymToLabel(transmitted_char)),
                            int(convertSymToLabel(consummed_char)),
                            fst.Weight(automate.weight_type(), int(weight)),
                            automate_states[dst_label]))
    automate.set_start(automate_states[label_inital_state])
    automate.set_final(automate_states[label_final_state],
                       fst.Weight(automate.weight_type(), output_weight))
    automate.draw("automata.dot")
    print(automate)
    return (automate)
def make_input_fst(query, pysym):
    """Build a zero-weight linear acceptor for `query`, close it with an
    <eps> arc into the final state, write it to 'input.fst' and return it.

    Args:
        query: iterable of symbols, each a key of `pysym`.
        pysym: symbol -> label mapping (must contain '<eps>').

    Returns:
        fst.Fst: the acceptor.
    """
    acceptor = fst.Fst()
    zero = fst.Weight(acceptor.weight_type(), 0.0)
    entry = acceptor.add_state()
    exit_state = acceptor.add_state()
    acceptor.set_start(entry)
    acceptor.set_final(exit_state, zero)
    current = entry
    # one state per query symbol, chained linearly
    for ch in query:
        nxt = acceptor.add_state()
        sym = pysym[ch]
        acceptor.add_arc(current, fst.Arc(sym, sym, zero, nxt))
        current = nxt
    eps = pysym['<eps>']
    acceptor.add_arc(current, fst.Arc(eps, eps, zero, exit_state))
    acceptor.write('input.fst')
    return acceptor
def generate_word_sequence_recognition_wfst_test(n, lex, original=False, weight_fwd=None, weight_self=None):
    """Generate an HMM WFST recognising a single word (no loop back to the
    start) for words in the lexicon.

    Args:
        n (int): states per phone HMM
        lex: lexicon accepted by parse_lexicon
        original (bool): True/False - original/optimized lexicon
        weight_fwd (float or None): forward transition probability; if both
            weights are given a log-semiring FST is built, otherwise an
            unweighted (tropical default) one
        weight_self (float or None): self-loop probability

    Returns:
        (fst.Fst, SymbolTable): the constructed WFST and the word table
    """
    # weighted variant uses the log semiring; unweighted passes None weights
    if (weight_fwd != None and weight_self != None):
        f = fst.Fst('log')
        none_weight = fst.Weight('log', -math.log(1))
    else:
        f = fst.Fst()
        none_weight = None
    lex = parse_lexicon(lex, original)
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)
    # print('output_table: {}'.format(list(output_table)))
    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)
    # -- make fst
    for word, phone_list in lex.items():
        # a word may have several pronunciations; each gets its own branch
        for phones in phone_list:
            initial_state = f.add_state()
            # epsilon input, word label output at the start of the branch
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight, initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)
            f.set_final(current_state)
            # no loop back to the start: single-word recognition only
            # f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))
    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
def make_query(self, cns, lemma):
    """Build a linear query acceptor: constraint symbols, then <sigma>,
    then the lemma's characters, then </s>.

    Args:
        cns (str): ';'-separated constraint symbols.
        lemma (str): the lemma; iterated character by character.

    Returns:
        fst.Fst: the linear acceptor over self.input_syms.
    """
    sequence = cns.split(";") + ["<sigma>"] + list(lemma) + ["</s>"]
    query_fst = fst.Fst()
    query_fst.set_input_symbols(self.input_syms)
    query_fst.set_output_symbols(self.input_syms)
    prev = query_fst.add_state()
    query_fst.set_start(prev)
    # one arc per symbol, chained linearly
    for sym in sequence:
        nxt = query_fst.add_state()
        label = self.code[sym]
        query_fst.add_arc(prev, fst.Arc(label, label,
                                        fst.Weight(query_fst.weight_type(), 1.0),
                                        nxt))
        prev = nxt
    query_fst.set_final(prev)
    return query_fst
def SimpleAutomata():
    """Add two states and a single hard-coded arc (label "2:4:1") to the
    global `automate`, then print it."""
    src_state_index = automate.add_state()   # state labelled "0;0"
    dst_state_index = automate.add_state()   # state labelled "0;1"
    arc_label = "2:4:1"
    label_string = arc_label.split(":")
    # hard-coded decode of the label fields
    consummed_char = 2    # int(label_string[0])
    transmitted_char = 4  # int(label_string[1])
    weight = 1            # int(label_string[2])
    automate.add_arc(
        src_state_index,
        fst.Arc(transmitted_char, consummed_char,
                fst.Weight(automate.weight_type(), weight),
                dst_state_index))
    print(automate)
def add_arc_to_automate(src_state_label, dst_state_label, arc_label, automate, states_dict):
    """Decode `arc_label` ("consumed:transmitted:weight") and add the
    corresponding arc between the two labelled states of `automate`.

    States are resolved (and created on demand) through `get_index` /
    `states_dict`; symbols are mapped to integer labels by `convertSymToLabel`.
    """
    src_index = get_index(src_state_label, automate, states_dict)
    dst_index = get_index(dst_state_label, automate, states_dict)
    fields = arc_label.split(":")
    consummed_char = convertSymToLabel(fields[0])
    transmitted_char = convertSymToLabel(fields[1])
    cost = int(fields[2])
    # note the (transmitted, consumed) order expected by fst.Arc here
    automate.add_arc(
        src_index,
        fst.Arc(transmitted_char, consummed_char,
                fst.Weight(automate.weight_type(), cost),
                dst_index))
def make_fst(word_sym, phone_sym, pydict_file):
    """Build a phone-to-word transducer from a pronunciation dictionary file.

    Args:
        word_sym: word symbol -> label mapping (must contain '<s>', '</s>', '<eps>').
        phone_sym: phone symbol -> label mapping (must contain '<eps>').
        pydict_file (str): path to the dictionary; each line is
            "<phones> <word>" — NOTE(review): items[0] is iterated per
            character, so each character is assumed to be one phone symbol;
            confirm against the dictionary format.

    Returns:
        fst.Fst: the constructed transducer.
    """
    with open(pydict_file, 'r') as rp:
        f = fst.Fst()
        start = f.add_state()
        end = f.add_state()
        f.set_start(start)
        f.add_arc(start, fst.Arc(phone_sym['<eps>'], word_sym['<s>'],
                                 fst.Weight(f.weight_type(), 0.0), start))  # self-loop
        f.add_arc(end, fst.Arc(phone_sym['<eps>'], word_sym['</s>'],
                               fst.Weight(f.weight_type(), 0.0), end))  # self-loop
        f.add_arc(end, fst.Arc(phone_sym['<eps>'], word_sym['<eps>'],
                               fst.Weight(f.weight_type(), 0.0), start))  # 1 --> 0
        f.set_final(end, fst.Weight(f.weight_type(), 0.0))
        for l in rp.readlines():
            items = l.strip().split(' ')
            prev_state = start
            ilabel = phone_sym['<eps>']
            olabel = word_sym['<eps>']
            for i in range(len(items[0])):
                n = f.add_state()
                pych = items[0][i]
                chch = items[1]
                ilabel = phone_sym[pych]
                # emit the word label on the first phone only, <eps> afterwards
                if (i == 0):
                    olabel = word_sym[chch]
                else:
                    olabel = word_sym['<eps>']
                f.add_arc(
                    prev_state,
                    fst.Arc(ilabel, olabel, fst.Weight(f.weight_type(), 0.0), n))
                prev_state = n
            # connect the last state with end node
            f.add_arc(
                prev_state,
                fst.Arc(phone_sym['<eps>'], olabel,
                        fst.Weight(f.weight_type(), 0.0), end))
        return f
def SimpleAutomata(ref_string, levenshtein_distance):
    """Build the Levenshtein automaton for `ref_string` directly into the
    global `automate`, state by state.

    States are labelled "<chars consumed>;<edits used>". For each state the
    applicable edit arcs are added via `add_arc_to_automate`; all states in the
    last column (every edit count) are marked final with weight 1.5.

    Args:
        ref_string (str): the reference string.
        levenshtein_distance (int): maximum edit distance.

    Returns:
        (fst.Fst, dict): the global automaton and the label -> state dict.
    """
    final_dst_state_label = str(
        len(ref_string)) + ";" + str(levenshtein_distance)
    for consummed_char_number in range(len(ref_string) + 1):
        for operations_number in range(levenshtein_distance + 1):
            src_state_label = str(consummed_char_number) + ";" + str(
                operations_number)
            # print(str(consummed_char_number != len(ref_string)) + "-" + str(operations_number == levenshtein_distance))
            print(
                str(consummed_char_number == len(ref_string)) + "-" +
                str(operations_number == levenshtein_distance))
            if (consummed_char_number == (len(ref_string))
                    and operations_number == levenshtein_distance):
                # bottom-right corner: accepting state, no outgoing edit arcs
                final_dst_state_label = src_state_label
                print("output state")
            elif (consummed_char_number == (len(ref_string))
                  and operations_number != levenshtein_distance):
                # last column: only insertion (consume '*', emit epsilon) remains
                insertion_dst_state_label = str(
                    consummed_char_number) + ";" + str(operations_number + 1)
                insertion_arc_label = "*:epsilon:1"
                add_arc_to_automate(src_state_label, insertion_dst_state_label,
                                    insertion_arc_label, automate, states_dict)
            elif (consummed_char_number != (len(ref_string))
                  and operations_number == levenshtein_distance):
                # last row: edit budget spent, only exact matches allowed
                accepting_dst_state_label = str(
                    consummed_char_number + 1) + ";" + str(operations_number)
                print(accepting_dst_state_label)
                accepting_arc_label = ref_string[
                    consummed_char_number] + ":" + ref_string[
                        consummed_char_number] + ":" + str(0)
                add_arc_to_automate(src_state_label, accepting_dst_state_label,
                                    accepting_arc_label, automate, states_dict)
            else:
                # interior state: match (cost 0), deletion, substitution and
                # insertion (cost 1 each)
                accepting_dst_state_label = str(
                    consummed_char_number + 1) + ";" + str(operations_number)
                accepting_arc_label = ref_string[
                    consummed_char_number] + ":" + ref_string[
                        consummed_char_number] + ":" + str(0)
                add_arc_to_automate(src_state_label, accepting_dst_state_label,
                                    accepting_arc_label, automate, states_dict)
                deletion_dst_state_label = str(
                    consummed_char_number + 1) + ";" + str(operations_number + 1)
                deletion_arc_label = "epsilon:" + ref_string[
                    consummed_char_number] + ":" + str(1)
                add_arc_to_automate(src_state_label, deletion_dst_state_label,
                                    deletion_arc_label, automate, states_dict)
                substitution_dst_state_label = str(
                    consummed_char_number + 1) + ";" + str(operations_number + 1)
                substitution_arc_label = "*:" + ref_string[
                    consummed_char_number] + ":" + str(1)
                add_arc_to_automate(src_state_label,
                                    substitution_dst_state_label,
                                    substitution_arc_label, automate,
                                    states_dict)
                insertion_dst_state_label = str(
                    consummed_char_number) + ";" + str(operations_number + 1)
                insertion_arc_label = "*:" + ref_string[
                    consummed_char_number] + ":" + str(1)
                add_arc_to_automate(src_state_label, insertion_dst_state_label,
                                    insertion_arc_label, automate, states_dict)
    # every state of the last column is final (any number of edits used)
    for nb_final_states in range(levenshtein_distance + 1):
        final_dst_state_label = str(
            len(ref_string)) + ";" + str(nb_final_states)
        automate.set_final(states_dict[final_dst_state_label],
                           fst.Weight(automate.weight_type(), 1.5))
    automate.draw("automata.dot")
    print(automate)
    return automate, states_dict
def Levenshtein_Automata_Dico(ref_string, levenshtein_distance):
    """Build the Levenshtein automaton for `ref_string` both as a dictionary
    {state label -> {arc label -> [destination labels]}} and as arcs of a
    local fst.Fst.

    Weight convention: 0 when a reference character is consumed (match),
    1 for insertion, deletion or substitution. Reference characters are the
    consumed side, hypothesis characters the emitted side.

    Returns:
        dict: the dictionary representation of the automaton.
    """
    # Create the automaton states
    dict_levenshtein_states = create_states_dico(ref_string,
                                                 levenshtein_distance)
    automate = fst.Fst()
    automata = {}
    weights = [0, 1, 1, 1]
    arcs_labels = []
    dst_states = []
    automata_voc = ["epsilon", "*"]
    automata_voc.extend(ref_string)
    initial_state_index = automate.add_state()  # label2int("0;0", ref_string)
    final_state_index = automate.add_state()  # label2int("5;2", ref_string)
    automate.set_start(initial_state_index)
    automate.set_final(final_state_index,
                       fst.Weight(automate.weight_type(), 1.5))
    # (.iteritems() was Python 2 only; .items() is the Python 3 equivalent.)
    for state_label, state_index in dict_levenshtein_states.items():
        nb_consummed_chars = int(
            state_label.split(";")[0])  # first field of the label
        nb_elementary_operations = int(
            state_label.split(";")[1])  # second field of the label
        set_arcs = {}
        arcs_labels = []
        char_from_ref_str = ''
        if nb_consummed_chars == len(ref_string):
            char_from_ref_str = "epsilon"
        else:
            char_from_ref_str = ref_string[nb_consummed_chars]
        # destination labels: up = one more edit, diag = edit + consume,
        # right = consume only
        up_dst_label = str(nb_consummed_chars) + ";" + str(
            nb_elementary_operations + 1)
        up_dst_index = label2int(up_dst_label, ref_string)
        # print("up", up_dst_label)
        insertion_arc_label = "*" + ":" + "epsilon" + ":" + str(1)
        insertion_split = insertion_arc_label.split(":")
        insertion_consummed_char = convertSymToLabel(insertion_split[0])
        insertion_transmitted_char = convertSymToLabel(insertion_split[1])
        insertion_weight = convertSymToLabel(insertion_split[2])
        diag_dst_label = str(nb_consummed_chars +
                             1) + ";" + str(nb_elementary_operations + 1)
        diag_dst_index = label2int(diag_dst_label, ref_string)
        # print("diag", diag_dst_label)
        deletion_arc_label = "epsilon:" + char_from_ref_str + ":" + str(
            weights[1])
        deletion_split = deletion_arc_label.split(":")
        deletion_consummed_char = convertSymToLabel(deletion_split[0])
        deletion_transmitted_char = convertSymToLabel(deletion_split[1])
        deletion_weight = convertSymToLabel(deletion_split[2])
        substitution_arc_label = "*:" + char_from_ref_str + ":" + str(
            weights[1])
        substitution_split = substitution_arc_label.split(":")
        substitution_consummed_char = convertSymToLabel(substitution_split[0])
        substitution_transmitted_char = convertSymToLabel(
            substitution_split[1])
        substitution_weight = convertSymToLabel(substitution_split[2])
        right_dst_label = str(nb_consummed_chars +
                              1) + ";" + str(nb_elementary_operations)
        right_dst_index = label2int(right_dst_label, ref_string)
        # print("right", right_dst_label)
        accepting_arc_label = char_from_ref_str + ":" + char_from_ref_str + ":" + str(
            weights[0])
        accepting_split = accepting_arc_label.split(":")
        accepting_consummed_char = convertSymToLabel(accepting_split[0])
        accepting_transmitted_char = convertSymToLabel(accepting_split[1])
        accepting_weight = convertSymToLabel(accepting_split[2])
        # True when every reference character has been consumed
        is_last_column = nb_consummed_chars == len(ref_string)
        # True when the whole edit budget has been used
        is_last_row = nb_elementary_operations == levenshtein_distance
        if is_last_column and is_last_row:
            # accepting corner state: no outgoing edit arcs
            output_arc_label = "epsilon" + ":" + "epsilon" + ":" + str(0)
            set_arcs[output_arc_label] = []
        elif is_last_column:
            # only insertion remains once the string is fully consumed
            arcs_labels.append(insertion_arc_label)
            dst_states.append(up_dst_label)
            set_arcs[insertion_arc_label] = [up_dst_label]
            automate.add_arc(
                state_index,
                fst.Arc(insertion_consummed_char, insertion_transmitted_char,
                        fst.Weight(automate.weight_type(), insertion_weight),
                        up_dst_index))
        elif is_last_row:
            # edit budget spent: only exact matches allowed
            arcs_labels.append(accepting_arc_label)
            dst_states.append(right_dst_label)
            set_arcs[accepting_arc_label] = [right_dst_label]
            automate.add_arc(
                state_index,
                fst.Arc(accepting_consummed_char, accepting_transmitted_char,
                        fst.Weight(automate.weight_type(), accepting_weight),
                        right_dst_index))
        else:
            # interior state: match, deletion, substitution and insertion
            arcs_labels.append(accepting_arc_label)
            dst_states.append(right_dst_label)
            set_arcs[accepting_arc_label] = [right_dst_label]
            automate.add_arc(
                state_index,
                fst.Arc(accepting_consummed_char, accepting_transmitted_char,
                        fst.Weight(automate.weight_type(), accepting_weight),
                        right_dst_index))
            arcs_labels.append(deletion_arc_label)
            dst_states.append(diag_dst_label)
            set_arcs[deletion_arc_label] = [diag_dst_label]
            automate.add_arc(
                state_index,
                fst.Arc(deletion_consummed_char, deletion_transmitted_char,
                        fst.Weight(automate.weight_type(), deletion_weight),
                        diag_dst_index))
            arcs_labels.append(substitution_arc_label)
            dst_states.append(diag_dst_label)
            automate.add_arc(
                state_index,
                fst.Arc(
                    substitution_consummed_char, substitution_transmitted_char,
                    fst.Weight(automate.weight_type(), substitution_weight),
                    diag_dst_index))
            arcs_labels.append(insertion_arc_label)
            dst_states.append(up_dst_label)
            automate.add_arc(
                state_index,
                fst.Arc(insertion_consummed_char, insertion_transmitted_char,
                        fst.Weight(automate.weight_type(), insertion_weight),
                        up_dst_index))
            # insertion and substitution share the same arc label
            set_arcs[substitution_arc_label] = [
                diag_dst_label, up_dst_label
            ]
        automata[state_label] = set_arcs
        # print(automata[state_label])
    print(automata)
    # Display Automata in LaTeX :
    return (automata)
def build_lm(dev_fname, isyms_fname, constraints, lattice_output): """ Make a lattice that maps lemmas and constraints (or priors) to an inflected version """ # rewrite constraints constraints = constraints.replace("_", ";") # read isyms input_syms = fst.SymbolTable.read_text(isyms_fname) s_fin = '</s>' code = {} for ltr, c in input_syms: code[c] = ltr # init the lattice f_big = fst.Fst("log") f_big.set_input_symbols(input_syms) f_big.set_output_symbols(input_syms) for line in open(dev_fname, 'r').readlines( ): # all possilbe inflections are added, regardless of the prior (applying the prior an make for a more effecifent computation) line = line.strip() lemma, inflection, cns = line.split("\t")[:-2] #print(lemma, inflection, cns) if cns == constraints: # comparing strings idx = 0 lemma = lemma.split() inflection = inflection.split() for j, (lm, flc) in enumerate(zip(lemma, inflection)): if lm != flc: idx = j break f, old = create_priors(cns, input_syms, input_syms, code) keep = old for j in range(idx, len(lemma)): new = f.add_state() f.add_arc( old, fst.Arc(code[lemma[j]], code[lemma[j]], fst.Weight(f.weight_type(), 1.0), new)) old = new new = f.add_state() # the residual of the lemma is mapped to the inflection residual (indirectly) sym = "".join(lemma[idx:]) + "_" + "".join(inflection[idx:]) f.add_arc( old, fst.Arc(code[sym], code[s_fin], fst.Weight(f.weight_type(), 1.0), new)) #f.add_arc(old, fst.Arc(code[inflection[idx:]], code[s_fin], fst.Weight(f.weight_type(), 1.0), new)) #f.add_arc(old, fst.Arc(code[s_fin], code[inflection[idx:]], fst.Weight(f.weight_type(), 1.0), new)) f.set_final(new) f_big.union(f) f_big = fst.determinize(f_big.rmepsilon()) # add <sigma> state in the <sigma place holder> for c, ltr in code.items(): if int(ltr) > 1 and int( ltr) < 51: # (hard coded) symbols of Runssian + 2 more f_big.add_arc( keep, fst.Arc(code[c], code[c], fst.Weight(f_big.weight_type(), 1.0), keep)) f_big.invert() # save lattice f_big.write(lattice_output)
def Levenshtein_Automata_Dico(ref_string, levenshtein_distance):
    """Build a Levenshtein automaton over `ref_string` as a nested dict.

    States are labelled "c;e" where c is the number of consumed reference
    characters and e the number of elementary edit operations used so far
    (bounded by `levenshtein_distance`).

    Weights convention: 0 when a reference character is consumed (match),
    1 when "*" or "epsilon" is consumed (insertion, deletion, substitution).
    For consumed/emitted characters: the reference string supplies the
    consumed characters; the hypothesis string supplies the emitted ones.

    Args:
        ref_string (str): the reference string.
        levenshtein_distance (int): maximum number of edit operations.

    Returns:
        dict: state label -> {arc label "in::out::weight" -> [dest labels]}.
    """
    # Create the automaton's states.
    dict_levenshtein_states = create_states_dico(ref_string,
                                                 levenshtein_distance)
    automata = {}
    weights = [0, 1, 1, 1]  # match, deletion, substitution, insertion
    arcs_labels = []
    dst_states = []
    automata_voc = ["epsilon", "*"]
    automata_voc.extend(ref_string)
    # FIX: .iteritems() is Python 2 only; this file is Python 3 (it uses
    # f-strings elsewhere), so use .items().
    for state_label, state_index in dict_levenshtein_states.items():
        nb_consummed_chars = int(
            state_label.split(";")[0])  # first field of the label
        nb_elementary_operations = int(
            state_label.split(";")[1])  # second field of the label
        set_arcs = {}
        arcs_labels = []
        char_from_ref_str = ''
        if nb_consummed_chars == len(ref_string):
            char_from_ref_str = "epsilon"
        else:
            char_from_ref_str = ref_string[nb_consummed_chars]
        # Candidate destinations in the (consumed, operations) grid.
        up_dst_label = str(nb_consummed_chars) + ";" + str(
            nb_elementary_operations + 1)
        # print("up", up_dst_label)
        diag_dst_label = str(nb_consummed_chars +
                             1) + ";" + str(nb_elementary_operations + 1)
        # print("diag", diag_dst_label)
        right_dst_label = str(nb_consummed_chars +
                              1) + ";" + str(nb_elementary_operations)
        # print("right", right_dst_label)
        is_last_column = nb_consummed_chars == len(ref_string)
        is_last_row = nb_elementary_operations == levenshtein_distance
        if is_last_column and is_last_row:
            # Accepting corner state: no outgoing edit arcs.
            output_arc_label = "epsilon" + "::" + "epsilon" + "::" + str(0)
            set_arcs[output_arc_label] = []
        elif is_last_column:
            # Whole reference consumed: only insertions remain.
            insertion_arc_label = "*" + "::" + "epsilon" + "::" + str(1)
            arcs_labels.append(insertion_arc_label)
            up_dst_label = str(nb_consummed_chars) + ";" + str(
                nb_elementary_operations + 1)
            dst_states.append(up_dst_label)
            set_arcs[insertion_arc_label] = [up_dst_label]
        elif is_last_row:
            # Edit budget exhausted: only exact matches remain.
            accepting_arc_label = char_from_ref_str + "::" + char_from_ref_str + "::" + str(
                weights[0])
            arcs_labels.append(accepting_arc_label)
            right_dst_label = str(nb_consummed_chars +
                                  1) + ";" + str(nb_elementary_operations)
            dst_states.append(right_dst_label)
            set_arcs[accepting_arc_label] = [right_dst_label]
        else:
            # Interior state: match, deletion, substitution and insertion.
            accepting_arc_label = char_from_ref_str + "::" + char_from_ref_str + "::" + str(
                weights[0])
            deletion_arc_label = "epsilon::" + char_from_ref_str + "::" + str(
                weights[1])
            substitution_arc_label = "*::" + char_from_ref_str + "::" + str(
                weights[1])
            # NOTE(review): insertion shares the substitution label, so both
            # end up under one dict key below — confirm this is intended.
            insertion_arc_label = substitution_arc_label
            arcs_labels.append(accepting_arc_label)
            arcs_labels.append(deletion_arc_label)
            arcs_labels.append(substitution_arc_label)
            arcs_labels.append(insertion_arc_label)
            dst_states.append(up_dst_label)
            dst_states.append(diag_dst_label)
            dst_states.append(diag_dst_label)
            dst_states.append(right_dst_label)
            set_arcs[accepting_arc_label] = [right_dst_label]
            set_arcs[deletion_arc_label] = [diag_dst_label]
            set_arcs[substitution_arc_label] = [diag_dst_label, up_dst_label]
        automata[state_label] = set_arcs
        # print(automata[state_label])
        # NOTE(review): the loop below is broken — `info` and `automate` are
        # never defined, `convertSymToLabel` comes from elsewhere, and the
        # bare `dst_states[idx]` is a no-op. It raises NameError as soon as
        # dst_states is non-empty; it needs to be rewritten or removed.
        for idx in range(len(dst_states)):
            dst_state_label = dst_states[idx]
            dst_state_index = dict_levenshtein_states[dst_state_label]
            consummed_char = convertSymToLabel(char_from_ref_str)
            dst_states[idx]
            transmitted_char = info[1]
            weight = info[2]
            automate.add_arc(
                state_index,
                fst.Arc(transmitted_char, consummed_char,
                        fst.Weight(automate.weight_type(), weight),
                        dst_state_index))
    print(automata)
    # Display Automata in LaTeX :
    return (automata)
def generate_word_sequence_recognition_wfst_bigram(n,
                                                   lex,
                                                   df_bigram_prob,
                                                   original=False,
                                                   weight_fwd=None,
                                                   weight_self=None):
    """ generate a HMM to recognise any single word sequence for words in
    the lexicon, with bigram-weighted word-to-word transition arcs

    Args:
        n (int): states per phone HMM
        lex: lexicon source passed to parse_lexicon (word -> pronunciations)
        df_bigram_prob: bigram probability table, indexed as
            df_bigram_prob['Word After', w2]['Word Before', w1] — presumably
            a pandas MultiIndex frame; verify against the caller
        original (bool): True/False - original/optimized lexicon
        weight_fwd (int): weight value
        weight_self (int): weight value of self node

    Returns:
        the constructed WFST and its word symbol table
    """
    # With explicit weights use the log semiring; otherwise fall back to
    # the default (tropical) semiring with no arc weights.
    if (weight_fwd != None and weight_self != None):
        f = fst.Fst('log')
        none_weight = fst.Weight('log', -math.log(1))  # -log(1) == 0 (unit weight)
    else:
        f = fst.Fst()
        # NOTE(review): in this branch none_weight is None and is later
        # passed to fst.Arc — confirm pywrapfst accepts None there.
        none_weight = None
    lex = parse_lexicon(lex, original)
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)
    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)
    # -- dictionaries for initial and last states
    dict_initial = {}
    dict_final = {}
    # make fst: one branch per pronunciation of each word
    for word, phone_list in lex.items():
        for phones in phone_list:
            initial_state = f.add_state()
            # -- add to initial dict (a word may have several pronunciations)
            if word in dict_initial:
                dict_initial[word].append(initial_state)
            else:
                dict_initial[word] = [initial_state]
            # -- add arcs: entering a word emits its output symbol
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone,
                                                    n, state_table,
                                                    output_table, weight_fwd,
                                                    weight_self)
            f.set_final(current_state)
            # Loop back so word sequences of any length are accepted.
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))
            # -- add to final dict
            if word in dict_final:
                dict_final[word].append(current_state)
            else:
                dict_final[word] = [current_state]
    # -- add bidirectional arcs: connect every word-final state to every
    # word-initial state, weighted by the bigram probability
    for word, last_state_list in dict_final.items(
    ):  # list of final states 4 word
        for last_state in last_state_list:  # final state from lsit
            for word_bi, initial_state_list in dict_initial.items(
            ):  # list of initial satates
                for initial_state in initial_state_list:  # state from list
                    prob = df_bigram_prob['Word After', word_bi]['Word Before',
                                                                 word]
                    if (prob == 0):
                        # Unseen bigram: huge -log penalty instead of
                        # infinity, so the arc stays usable.
                        prob = 1e10
                    else:
                        prob = -math.log(prob)
                    weight = fst.Weight('log', prob)
                    f.add_arc(
                        last_state,
                        fst.Arc(0, output_table.find(word_bi), weight,
                                initial_state))
    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
def generate_WFST_silent(n, lex, weight_fwd, weight_self, original=False):
    """ generate a HMM to recognise any single word sequence for words in
    the lexicon and includes a silence state

    Args:
        n (int): states per phone HMM
        lex: lexicon source passed to parse_lexicon (word -> pronunciations)
        original (bool): True/False - origianl/optimized lexicon
        weight_fwd (int): weight value
        weight_self (int): weight value of self node

    Returns:
        the constructed WFST and its word symbol table
    """
    f = fst.Fst('log')
    none_weight = fst.Weight('log', -math.log(1))  # -log(1) == 0 (unit weight)
    original_lex = parse_lexicon(lex, original)
    # add the silent states
    silent_word = '<silence>'
    silent_phones = ['sil_0', 'sil_1', 'sil_2', 'sil_3', 'sil_4', 'sil_5']
    silence_lex = original_lex.copy()
    silence_lex[silent_word] = [silent_phones
                                ]  # makes sure output table contains it
    # -----
    # print(f"lex: {silence_lex}")
    # Symbol tables are built from the original lexicon only; the silence
    # symbols are then appended by hand below.
    word_table, phone_table, state_table = generate_symbols_table(
        original_lex, 3)
    word_table.add_symbol(silent_word)
    for phone in silent_phones:
        state_table.add_symbol(phone)
    phone_table.add_symbol('sil')
    # print(f'state table: {list(state_table)}')
    output_table = generate_output_table(word_table, phone_table)
    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)
    # skip silent phones by using original lex: one branch per pronunciation
    for word, phone_list in original_lex.items():
        for phones in phone_list:
            initial_state = f.add_state()
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone,
                                                    n, state_table,
                                                    output_table, weight_fwd,
                                                    weight_self)
            f.set_final(current_state)
            # Loop back so word sequences of any length are accepted.
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))
    # need to add the silent state seperately: its HMM topology is fixed
    # and built by generate_silent_phone_wfst, not generate_phone_wfst.
    current_state = f.add_state()
    f.add_arc(
        start_state,
        fst.Arc(0, output_table.find(silent_word), none_weight,
                current_state))
    current_state = generate_silent_phone_wfst(f, current_state, state_table,
                                               output_table)
    f.set_final(current_state)
    f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))
    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
def generate_silent_phone_wfst(f, start_state, state_table, phone_table):
    """Attach a five-state ergodic silence HMM to `f` at `start_state`.

    Args:
        f (fst.Fst()): an FST object (log semiring), modified in place
        start_state (int): state the silence model hangs off — presumably
            the most recently added state of `f` (the slicing below relies
            on that; verify against the caller)
        state_table: symbol table providing the 'sil_1'..'sil_5' labels
        phone_table: unused here; kept for signature parity with the
            other generators

    Returns:
        the exit (final) state of the silence model
    """
    exit_state = start_state
    num_emitting = 5
    # Grow the FST by five states; the last one added becomes the exit.
    for _ in range(1, num_emitting + 1):
        exit_state = f.add_state()

    # The last six states are [start_state, four interior states, exit].
    sil = list(f.states())[-(num_emitting + 1):]

    third = 1 / 3.0
    quarter = 1 / 4.0
    # Hand-built ergodic topology:
    # (source, 'sil_k' input label, [(destination, probability), ...])
    topology = [
        (sil[0], 'sil_1', [(sil[0], 0.5), (sil[1], 0.5)]),
        (sil[1], 'sil_2', [(sil[1], third), (sil[2], third), (sil[3], third)]),
        (sil[2], 'sil_3', [(sil[1], third), (sil[2], third), (sil[3], third)]),
        (sil[3], 'sil_4', [(sil[1], quarter), (sil[2], quarter),
                           (sil[3], quarter), (sil[4], quarter)]),
        (sil[4], 'sil_5', [(sil[4], 0.5), (exit_state, 0.5)]),
    ]
    for src, label_name, transitions in topology:
        ilabel = state_table.find(label_name)
        for dst, prob in transitions:
            # Output label 0 (<eps>): silence emits no word symbol.
            f.add_arc(
                src,
                fst.Arc(ilabel, 0, fst.Weight('log', -math.log(prob)), dst))
    return exit_state