def add_start_and_end_of_sentence_symbols(fst_1):
    """
    Surround an FST with sentence boundary arcs.

    :param fst_1: FST object
    :return: The start-of-sentence FST after fst_1 and the end-of-sentence FST
             have been concatenated onto it, in that order.
    """
    def boundary_fsa(output_label):
        # Two-state machine with one arc 0 -> 1 (input label 0, given output label)
        marker = fst.Transducer()
        marker.add_arc(0, 1, 0, output_label)
        marker[1].final = True
        return marker

    # Output label 1 marks the start of a sentence, 2 marks the end
    wrapped = boundary_fsa(1)
    sentence_end = boundary_fsa(2)

    # Each concatenate() call extends `wrapped` itself
    wrapped.concatenate(fst_1)
    wrapped.concatenate(sentence_end)

    return wrapped
def Transducer(isyms=None, osyms=None, semiring=semiring):
    """Create an ``fst.Transducer``, defaulting both symbol tables to ``syms``.

    A table passed as ``None`` is replaced by the module-level ``syms``; the
    ``semiring`` argument is forwarded unchanged.
    """
    input_table = syms if isyms is None else isyms
    output_table = syms if osyms is None else osyms
    return fst.Transducer(isyms=input_table, osyms=output_table,
                          semiring=semiring)
def create_wordlist_fst(words):
    """Return the union of the letter-to-word transducers of all ``words``.

    Each word is compiled with ``gen_word_fst`` using the symbol tables of the
    union built so far, then unioned into the result.
    """
    union_fst = fst.Transducer()
    for entry in words:
        entry_fst = gen_word_fst(entry,
                                 isyms=union_fst.isyms,
                                 osyms=union_fst.osyms)
        union_fst = union_fst | entry_fst
    return union_fst
def get_replace_transducer(self):
    """Assemble the replace transducer for this rule and return it.

    The inner machine rewrites each (segment1, segment2) pair from
    ``self.target_change_tuples_list`` on an arc 0 -> 1 and passes the centre
    brackets through unchanged on both states.  It is placed between
    ``left_bracket_transducer`` and ``right_bracket_transducer``, handed to
    ``add_opt``, prefixed by a sigma-star machine that ignores the identity
    brackets, and the result is closed under ``closure()``.
    """
    symbol_table = SegmentTable().transducer_symbol_table
    inner = fst.Transducer(isyms=symbol_table, osyms=symbol_table)

    for source_segment, target_segment in self.target_change_tuples_list:
        inner.add_arc(0, 1, source_segment, target_segment)
    inner[1].final = True

    # Centre brackets loop on both states with identical input and output
    for bracket in [LEFT_CENTER_BRACKET, RIGHT_CENTER_BRACKET]:
        for state in (0, 1):
            inner.add_arc(state, state, bracket, bracket)

    opt_part = left_bracket_transducer + inner + right_bracket_transducer
    add_opt(opt_part)

    sigma_star_regex = "({})*".format("+".join(self.alphabet))
    sigma_star_dfa = get_dfa_from_regex(sigma_star_regex, sigma=self.alphabet)

    identity_brackets = set([LEFT_IDENTITY_BRACKET, RIGHT_IDENTITY_BRACKET])
    sigma_star_ignoring_identity = get_ignore_dfa(
        self.alphabet | identity_brackets,
        sigma_star_dfa,
        identity_brackets)
    id_sigma_star = pyfst_from_dfa(sigma_star_ignoring_identity)

    return (id_sigma_star + opt_part).closure()
def fstbuild(words):
    """Build a determinized letter trie that maps each word's letters to the word.

    Every word becomes a chain of fresh states leaving state 0.  Each arc reads
    one letter; all arcs emit "<epsilon>" except the last one, which emits the
    whole word.  The state that ends a word is marked final.  Both symbol
    tables are read from "ascii.syms.bin".

    One-letter words get a single arc that emits the word, and empty words are
    skipped (both previously raised IndexError).

    :param words: iterable of non-empty strings
    :return: the trie after determinize(), arc_sort_input() and remove_epsilon()
    """
    trie = fst.Transducer()
    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms

    biggest = 0  # highest state id handed out so far
    for w in words:
        if not w:
            # Nothing to read, so there is no arc that could emit the word
            continue
        last_pos = len(w) - 1
        src = 0
        for pos, letter in enumerate(w):
            biggest += 1
            # Only the arc reading the last letter outputs the word itself
            out = w if pos == last_pos else "<epsilon>"
            trie.add_arc(src, biggest, letter, out, 0)
            src = biggest
        trie[biggest].final = True

    det_trie = trie.determinize()
    det_trie.arc_sort_input()
    det_trie.remove_epsilon()
    return det_trie
def create_rule_fst(self, rule, feature_weights_dict):
    """
    Build the FST that accepts the rule's target side tokens in sequence.
    :param rule: Rule object
    :param feature_weights_dict: Dictionary of feature names and their weights
    :return: Rule FST
    """
    # The word insertion penalty is only used when it is configured and the
    # rule is not a Hiero intersection rule
    use_wip = ('word_insertion_penalty' in feature_weights_dict
               and not rule.hiero_intersection_rule)
    wip = feature_weights_dict['word_insertion_penalty'] if use_wip else None

    rule_fst = fst.Transducer()

    # One arc per target token; last_index stays -1 for an empty target side
    last_index = -1
    for last_index, token in enumerate(rule.target_side):
        self.add_arc(rule_fst, last_index, token, rule.nonterminal_coverages,
                     weight=wip)

    # The log-linear rule weight becomes the weight of the final state
    rule_fst[last_index + 1].final = helpers.loglinear_rule_weight(
        rule.feature_dict, feature_weights_dict)

    return rule_fst
def get_transducer_acceptor(string_):
    """Return a linear transducer that maps ``string_`` onto itself.

    State ``k`` is connected to state ``k + 1`` by an arc whose input and
    output are both the ``k``-th element of ``string_``; the state after the
    last element is final.  Both symbol tables are the ``SegmentTable``
    transducer symbol table.

    An empty ``string_`` yields a machine whose state 0 is final (previously
    this raised because the loop variable was never bound).

    :param string_: iterable of symbols
    :return: the acceptor-like transducer
    """
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    transducer = fst.Transducer(isyms=transducer_symbol_table,
                                osyms=transducer_symbol_table)
    end_state = 0  # stays 0 when string_ is empty
    for i, char in enumerate(string_):
        transducer.add_arc(i, i + 1, char, char)
        end_state = i + 1
    transducer[end_state].final = True
    return transducer
def levenshtein(w, editdst):
    """Build the weighted edit lattice for ``w`` with up to ``editdst`` edits.

    States are identified by (position in ``w``, edits used) and numbered in
    the order they are first requested.  Weights for letter arcs come from
    ``keyweights()`` indexed by ``(w[x], letter)``; symbols are read from
    "ascii.syms.bin".  States at position ``len(w)`` are final.  The result has
    epsilons removed and input arcs sorted.
    """
    wts = keyweights()
    trie = fst.Transducer()
    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms
    letters = [entry[0] for entry in list(letter_syms.items())]

    state_ids = {}

    def state(pos, edits):
        # Hand out consecutive ids (0, 1, 2, ...) on first request of a key
        key = str(pos) + "^" + str(edits)
        if key not in state_ids:
            state_ids[key] = len(state_ids)
        return state_ids[key]

    for x in range(len(w)):
        for y in range(editdst + 1):
            # char in word
            trie.add_arc(state(x, y), state(x + 1, y), w[x], w[x], 0)
            if y == editdst:
                continue
            # deletion
            trie.add_arc(state(x, y), state(x + 1, y + 1),
                         "<epsilon>", "<epsilon>", 1.5)
            for i in letters:
                # substitution
                trie.add_arc(state(x, y), state(x + 1, y + 1),
                             i, i, wts[w[x], i])
                # insertion
                trie.add_arc(state(x, y), state(x, y + 1),
                             i, i, wts[w[x], i])

    for y in range(editdst + 1):
        trie[state(len(w), y)].final = True

    trie.remove_epsilon()
    trie.arc_sort_input()
    return trie
def gen_word_fst(word, isyms=None, osyms=None):
    """Build a linear transducer from the letters of ``word`` to ``word``.

    Every arc reads one character and is created with output 'ε'; the arcs
    leaving the second-to-last state then get their output label replaced by
    the symbol table id of the whole word.  The last state is final.
    """
    word_fst = fst.Transducer(isyms, osyms)
    length = 0
    for length, letter in enumerate(word, 1):
        word_fst.add_arc(length - 1, length, letter, 'ε')
    word_fst[length].final = True
    for arc in word_fst[length - 1].arcs:
        arc.olabel = word_fst.osyms[word]
    return word_fst
def get_prologue_inverse_transducer():
    """Return a one-state transducer that keeps segments and maps brackets to EPSILON.

    State 0 is final and carries a self-loop ``segment:segment`` for every
    segment symbol of the ``SegmentTable`` and a self-loop ``bracket:EPSILON``
    for every entry of ``BRACKETS``.
    """
    symbol_table = SegmentTable().transducer_symbol_table
    result = fst.Transducer(isyms=symbol_table, osyms=symbol_table)

    for segment in set(SegmentTable().get_segments_symbols()):
        result.add_arc(0, 0, segment, segment)

    for bracket in BRACKETS:
        result.add_arc(0, 0, bracket, EPSILON)

    result[0].final = True
    return result
def create_ipa_fst():
    '''creates fst for converting callhome dictionary pronunciations to arabic

    Reads CALLHOME_FST as UTF-8.  Each line with exactly four
    whitespace-separated fields is taken as
    ``source_state target_state input_label output_label`` and added as an
    arc; other lines are ignored.  Byte order marks are stripped from every
    line.  State 1 is marked final.

    The file is opened in a ``with`` block so that it is closed even when a
    malformed line makes ``int()`` or ``add_arc`` raise.
    '''
    ipa_fst = fst.Transducer()
    with codecs.open(CALLHOME_FST, 'r', encoding='utf-8') as fst_file:
        for l in fst_file:
            rule = l.replace(u'\ufeff', '').split()
            if len(rule) == 4:
                ipa_fst.add_arc(int(rule[0]), int(rule[1]), rule[2], rule[3])
    ipa_fst[1].final = True
    return ipa_fst
def get_intro_transducer(sigma, introduced_set):
    """Return the closure of (sigma transducer | EPSILON-to-introduced-symbol loops).

    The second operand is a one-state machine, final at state 0, with a
    self-loop ``EPSILON:symbol`` for every symbol in ``introduced_set``.
    """
    sigma_transducer = get_sigma_transducer_for_intro(sigma)

    symbol_table = SegmentTable().transducer_symbol_table
    insertions = fst.Transducer(isyms=symbol_table, osyms=symbol_table)
    for symbol in introduced_set:
        insertions.add_arc(0, 0, EPSILON, symbol)
    insertions[0].final = True

    return (sigma_transducer | insertions).closure()
def genBigGraph(label_prob, symbols, seq_len, label='x'):
    """Build a chain of ``seq_len`` steps with one weighted arc per symbol.

    For every step ``j`` and every symbol index ``i`` an arc ``j -> j + 1`` is
    added with input ``label + str(j)``, output ``str(symbols[i])`` and weight
    ``-log(label_prob[j][i])``.  The state after the last step gets final
    weight -1.

    The symbols are materialised as a list so the function does not depend on
    ``map`` returning a list, and the final state is addressed by ``seq_len``
    so a zero-length sequence no longer hits an unbound loop variable.

    :param label_prob: indexable as ``label_prob[j][i]``, positive values
    :param symbols: sequence of output symbols (converted with ``str``)
    :param seq_len: number of steps
    :param label: prefix of the per-step input label
    :return: the transducer
    """
    t = fst.Transducer()
    symbols = [str(s) for s in symbols]
    for j in range(seq_len):
        step_label = str(label + str(j))
        for i, symbol in enumerate(symbols):
            t.add_arc(j, j + 1, step_label, symbol,
                      -math.log(label_prob[j][i]))
    t[seq_len].final = -1
    return t
def test_simple():
    """Check state count and finality of a linear transducer and a linear acceptor."""
    # Transducer reading 'hello' and writing 'olleh'
    transducer = fst.Transducer()
    for state, (in_char, out_char) in enumerate(zip('hello', 'olleh')):
        transducer.add_arc(state, state + 1, in_char, out_char)
    transducer[state + 1].final = True
    eq_(len(transducer), 6)
    ok_(transducer[5].final)

    # Acceptor for 'hello'
    acceptor = fst.Acceptor()
    for state, char in enumerate('hello'):
        acceptor.add_arc(state, state + 1, char)
    acceptor[state + 1].final = True
    eq_(len(acceptor), 6)
    ok_(acceptor[5].final)
def create_dt_fst():
    """Build a transducer from the arc list stored in DUTCH_FST_FILE.

    The file is read as UTF-8 with byte order marks stripped.  Each line with
    exactly four whitespace-separated fields is taken as
    ``source_state target_state input_label output_label``; an output label of
    'ks' is rewritten to 'k s' before the arc is added.  Other lines are
    ignored.  States 1 and 2 are marked final.

    The file is opened in a ``with`` block; it was previously never closed.
    """
    dt_fst = fst.Transducer(isyms=fst.SymbolTable('eps'),
                            osyms=fst.SymbolTable('eps'))
    with codecs.open(DUTCH_FST_FILE, 'r', encoding='utf-8') as fst_file:
        for l in fst_file:
            entry = l.replace(u'\ufeff', '').split()
            if len(entry) == 4:
                if entry[3] == 'ks':
                    entry[3] = 'k s'
                dt_fst.add_arc(int(entry[0]), int(entry[1]),
                               entry[2], entry[3])
    dt_fst[1].final = True
    dt_fst[2].final = True
    return dt_fst
def create_rule_fst(self, rule, feature_weights_dict):
    """
    Create rule FST accepting the rule id arc followed by the target side tokens.
    :param rule: Rule object
    :param feature_weights_dict: Dictionary of feature names and their weights
    :return: Rule FST
    """
    # Determine whether to use word insertion penalty
    if 'word_insertion_penalty' in feature_weights_dict and not rule.hiero_intersection_rule:
        wip = feature_weights_dict['word_insertion_penalty']
    else:
        wip = None

    # Offset rule_id by rule_id_offset to prevent clashes with Hiero rule id space
    rule_id = rule.id
    if not rule.hiero_intersection_rule:
        rule_id += self.rule_id_offset

    rule_fst = fst.Transducer()

    # Insert rule arc at the start of the transducer (rule_id:epsilon)
    rule_fst.add_arc(0, 1, int(rule_id), 0)

    # Add arcs representing target tokens one after the other.
    # Token indices start at 1 to account for the rule arc.  index starts at 0
    # so that an empty target side makes state 1 (the end of the rule arc)
    # final; starting at 1 marked state 2, which no arc reaches.
    # NOTE(review): assumes self.add_arc links state index to index + 1 - confirm.
    index = 0
    for index, token in enumerate(rule.target_side, 1):
        self.add_arc(rule_fst, index, token, rule.nonterminal_coverages, weight=wip)

    # Compute rule weight in a log linear model
    rule_weight = helpers.loglinear_rule_weight(rule.feature_dict, feature_weights_dict)

    # Add the rule weight to the final state in the FST
    rule_fst[index + 1].final = rule_weight

    if rule.hiero_intersection_rule:
        # Single-argument parenthesised form prints the same text on Python 2
        print(rule_weight)
        print(self.fst_tostring(rule_fst))

    return rule_fst
def pyfst_from_dfa(dfa):
    """Convert ``dfa`` into a transducer that copies every segment it reads.

    Each transition ``dfa.delta[s1][segment] == s2`` becomes an arc
    ``s1 -> s2`` labelled ``segment:segment``.  States in ``dfa.Final`` are
    marked final and ``dfa.Initial`` is marked initial.  Both symbol tables are
    the ``SegmentTable`` transducer symbol table.
    """
    symbol_table = SegmentTable().transducer_symbol_table
    transducer = fst.Transducer(isyms=symbol_table, osyms=symbol_table)

    # DFA state index -> transducer state index (an identity mapping)
    state_map = dict((index, index) for index, _ in enumerate(dfa.States))

    for source, outgoing in dfa.delta.items():
        for segment, target in outgoing.items():
            transducer.add_arc(state_map[source], state_map[target],
                               segment, segment)

    for final_state in dfa.Final:
        transducer[state_map[final_state]].final = True

    transducer[state_map[dfa.Initial]].initial = True
    return transducer
def make_edit(sigma):
    """
    Make an edit distance transducer with operations:
    - deletion: x:<epsilon>/1
    - insertion: <epsilon>:x/1
    - substitution: x:x/0 and x:y/1

    :param sigma: iterable of alphabet symbols (iterated twice, so it must be
                  re-iterable)
    :return: function ``distance(a, b)`` returning ``(dist, align)`` where
             ``dist`` is the integer shortest distance and ``align`` is a list
             of (input symbol, output symbol) pairs with epsilon shown as "-"
    """
    # Create common symbol table
    syms = fst.SymbolTable()

    # Create transducer: a single final state with one self-loop per operation
    edit = fst.Transducer(syms, syms)
    edit[0].final = True
    for x in sigma:
        edit.add_arc(0, 0, x, fst.EPSILON, 1)
        edit.add_arc(0, 0, fst.EPSILON, x, 1)
        for y in sigma:
            edit.add_arc(0, 0, x, y, (0 if x == y else 1))

    # Define edit distance
    def distance(a, b):
        # Compose a o edit transducer o b
        composed = fst.linear_chain(a, syms) >> edit >> fst.linear_chain(b, syms)
        # Compute distance
        distances = composed.shortest_distance(reverse=True)
        dist = int(distances[0])
        # Find best alignment
        alignment = composed.shortest_path()
        # Re-order states
        alignment.top_sort()
        # Replace <epsilon> -> "-"
        alignment.relabel({fst.EPSILON: '-'}, {fst.EPSILON: '-'})
        # Read alignment on the arcs of the transducer.  Stop at the first
        # state without an outgoing arc.  This is done with an explicit loop
        # because a StopIteration escaping from next() inside a generator
        # expression becomes RuntimeError under PEP 479.
        align = []
        for state in alignment:
            try:
                arc = next(state.arcs)
            except StopIteration:
                break
            align.append((alignment.isyms.find(arc.ilabel),
                          alignment.osyms.find(arc.olabel)))
        return dist, align

    return distance
def create_root_fst(label, int_coverage_cells):
    """
    Build the root FST: one arc 0 -> 1 whose output is the nonterminal label.
    :param label: Nonterminal transition label
    :param int_coverage_cells: Dictionary of integer coverages and associated FSTs
    :return: Root FST
    """
    root_fst = fst.Transducer(isyms=fst.SymbolTable(), osyms=fst.SymbolTable())
    root_fst.osyms[label] = int(label)

    # The input label is whatever the input symbol table stores under id 0
    input_symbol = root_fst.isyms.find(0)
    root_fst.add_arc(0, 1, input_symbol, label)
    root_fst[1].final = True

    # Register every coverage in the output symbol table under its own integer value
    for int_coverage in int_coverage_cells:
        root_fst.osyms[int_coverage] = int(int_coverage)

    return root_fst
def gen_utt_graph(labels, symdict):
    """Build a left-to-right transducer over the symbols of each label.

    For every label ``l`` in ``labels`` the symbols ``symdict[l]`` are
    converted with ``str``.  Arcs read one symbol each; the arc for the first
    symbol of a label outputs ``l + "/(" + symbol + ")"``, the others output
    the id-0 entry of a fresh symbol table followed by ``"(" + symbol + ")"``.
    State ``x`` is marked final at the end.

    NOTE(review): the statement nesting was reconstructed from a source whose
    line structure was lost; the placement of the self-loop arc and of
    ``x += 1`` (per symbol rather than per label) is the most plausible
    reading - confirm against the original file.
    """
    t2 = fst.Transducer()
    sym = fst.SymbolTable()  # only used to look up the symbol stored under id 0
    #3x3 states for this example
    count = 0  # not used below
    x = 0  # source state of the next arc
    # print labels
    for l in labels:
        symbols = symdict[l]
        symbols = map(str, symbols)
        for i in range(len(symbols)):
            if i == 0:
                # first symbol of the label: the output carries the label name
                t2.add_arc(0 + x, 1 + x, symbols[i], str(l + "/" + "(" + symbols[i] + ")"))
            else:
                t2.add_arc(0 + x, 1 + x, symbols[i], str(sym.find(0) + "(" + symbols[i] + ")"))
            # self-loop on the target state reading the same symbol
            t2.add_arc(1 + x, 1 + x, symbols[i], str(sym.find(0) + "(" + symbols[i] + ")"))
            x += 1
    t2[x].final = True
    return t2
B_full_table[dict_tags[tag]][dict_words[word]] = float( full_cfd_word_tag[tag][word]) / float(full_num) full_tag_set.remove('<s>') full_tag_set.remove('</s>') full_word_set.remove('<s>') full_word_set.remove('</s>') # build the HMM_tagger import fst import math eps = '¦Å' HMM_tagger = fst.Transducer() num_temp = num_tags - 2 for tag in full_tag_set: HMM_tagger.add_arc(0, dict_tags[tag], eps, eps, -math.log(A_full_table[0][dict_tags[tag]])) for tag in full_tag_set: i = dict_tags[tag] for word in full_word_set: HMM_tagger.add_arc(i, num_temp + i, word, tag, -math.log(B_full_table[i][dict_words[word]])) for tag1 in full_tag_set: i = dict_tags[tag1]
if __name__ == '__main__': parser = OptionParser() parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") (options, args) = parser.parse_args() if options.verbose == 1: VERBOSE = 1 startTime = time.time() t1 = fst.Transducer() t1.add_arc(0, 1, 'a', 'a') t1.add_arc(1, 2, 'b', 'b') t1.add_arc(2, 2, 'c', 'c') t1[2].final = True t2 = fst.Transducer() t2.add_arc(0, 1, 'c', 'c') t2.add_arc(1, 2, 'b', 'b') t2.add_arc(2, 3, 'a', 'a') t2[3].final = True ''' while 1 : try : line = sys.stdin.readline() except KeyboardInterrupt : break if not line : break
def create_empty_fst():
    """Return a two-state FST whose single arc 0 -> 1 has label 0 on both sides.

    State 1 is final.
    """
    single_arc_fst = fst.Transducer()
    single_arc_fst.add_arc(0, 1, 0, 0)
    single_arc_fst[1].final = True
    return single_arc_fst