def fst_intersect(m, n):
    """
    Build the intersection (product) machine of the FSTs ``m`` and ``n``.

    Every pair of arcs ``x -> y`` in ``m`` and ``z -> w`` in ``n`` whose
    labels are reported as similar by ``similar_labels_between`` may
    contribute one arc ``product_state(x, z) -> product_state(y, w)``.
    The violations of the two source labels are combined with ``otimes``.
    Only product states reachable from the joint start state are kept,
    and all of them are passed to ``FST`` both as the state set and as
    the third constructor argument.
    """
    all_arcs = set()
    arcs_from = dict(
        (product_state(left, right), set())
        for left, right in states_set_product(m, n))
    initial = product_state(m.start, n.start)

    # Compute arcs for each state pair
    for (x, y), (z, w) in states_mega_product(m, n):
        # Inputs already used by an empty-output arc for this state pair.
        seen_elisions = set()
        for label_pairs in similar_labels_between(m, x, y, n, z, w):
            # Inputs already used within this particular list of labels.
            seen_inputs = set()
            for k, l in label_pairs:
                if k.output == '':
                    # Faithfulness constraint; cares about input
                    if k.input != l.input or k.input in seen_elisions:
                        continue
                    seg = k.input
                    seen_elisions.add(seg)
                elif ((k.input == '_' or k.input == l.input)
                        and l.input not in seen_inputs):
                    # Markedness constraint
                    seg = l.input
                    seen_inputs.add(seg)
                elif l.input == '_' and k.input not in seen_inputs:
                    # Markedness constraint
                    seg = k.input
                    seen_inputs.add(seg)
                else:
                    continue

                new_arc = Arc(
                    product_state(x, z),
                    product_state(y, w),
                    Label(seg, k.output, otimes(k.violation, l.violation)))
                all_arcs.add(new_arc)
                arcs_from[new_arc.start].add(new_arc)

    # Figure out the states reachable from the start
    reachable = traverse_states(arcs_from, initial)
    return FST(reachable, initial, reachable,
               filter((lambda arc: arc.start in reachable), all_arcs), 1)
def letters_to_numbers():
    """
    Returns an FST that converts letters to numbers as specified by
    the soundex algorithm

    States: 'start', 'vowels' (last letter read was one of the dropped
    letters) and one integer state per consonant group (last letter read
    belonged to that group). Every state except 'start' is final.
    """
    consonant_groups = [
        ['b', 'f', 'p', 'v', 'B', 'F', 'P', 'V'],
        ['c', 'C', 'g', 'G', 'J', 'j', 'K', 'k', 'Q', 'q', 'S', 's',
         'X', 'x', 'Z', 'z'],
        ['d', 'D', 'T', 't'],
        ['L', 'l'],
        ['M', 'N', 'm', 'n'],
        ['R', 'r'],
    ]
    dropped = ['a', 'e', 'i', 'o', 'u', 'w', 'y', 'h',
               'A', 'E', 'I', 'O', 'U', 'W', 'Y', 'H']
    group_count = len(consonant_groups)

    fst = FST('soundex-generate')
    fst.add_state('start')
    fst.add_state('vowels')
    fst.set_final('vowels')
    for gid in range(group_count):
        fst.add_state(gid)
        fst.set_final(gid)
    fst.initial_state = 'start'

    for ch in string.ascii_letters:
        if ch in dropped:
            # The first character is always echoed; later dropped
            # letters produce no output from any state.
            fst.add_arc('start', 'vowels', ch, ch)
            fst.add_arc('vowels', 'vowels', ch, ())
            for gid in range(group_count):
                fst.add_arc(gid, 'vowels', ch, ())
            continue

        for gid in range(group_count):
            if ch not in consonant_groups[gid]:
                continue
            code = str(gid + 1)[0]
            fst.add_arc('start', gid, ch, ch)
            fst.add_arc('vowels', gid, ch, code)
            # A repeat of the same group emits nothing.
            fst.add_arc(gid, gid, ch, ())
            for other in range(group_count):
                if other != gid:
                    fst.add_arc(other, gid, ch, code)
    return fst
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    State '2' follows a leading letter (further letters are dropped);
    states '3', '4' and '5' follow the first, second and third digit.
    Digits read in state '5' are consumed without output.
    """
    fst = FST('soundex-truncate')

    # Indicate initial and final states
    names = ('1', '2', '3', '4', '5')
    for name in names:
        fst.add_state(name)
    fst.initial_state = '1'
    for name in names[1:]:
        fst.set_final(name)

    # Add the arcs
    for ch in string.ascii_letters:
        fst.add_arc('1', '2', ch, ch)
        fst.add_arc('2', '2', ch, ())

    copying_steps = (('1', '3'), ('2', '3'), ('3', '4'), ('4', '5'))
    for digit in '0123456789':
        for src, dst in copying_steps:
            fst.add_arc(src, dst, digit, digit)
        fst.add_arc('5', '5', digit, ())
    return fst
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    '1' is the initial state, '2L' follows a leading letter, and '2D',
    '3D', '4D' follow the first, second and third digit. All states are
    final. Digits read in '4D' are consumed without output.
    """
    fst = FST('soundex-truncate')

    # Indicate initial and final states
    fst.add_state('1')
    fst.initial_state = '1'
    fst.set_final('1')
    for name in ('2L', '2D', '3D', '4D'):
        fst.add_state(name)
        fst.set_final(name)

    # The leading letter is copied; any further letters are dropped.
    for ch in string.letters:
        fst.add_arc('1', '2L', ch, ch)
        fst.add_arc('2L', '2L', ch, ())

    # Add the arcs
    copying_steps = (('1', '2D'), ('2L', '2D'), ('2D', '3D'), ('3D', '4D'))
    for digit in string.digits:
        for src, dst in copying_steps:
            fst.add_arc(src, dst, digit, digit)
        fst.add_arc('4D', '4D', digit, ())
    return fst
def add_zero_padding():
    """
    Build the third fst - the zero-padding fst.

    Letters and digits are copied unchanged while looping on state '1';
    a chain of three epsilon-input arcs then emits '0' three times on
    the way to the only final state, '2'.
    """
    pad = FST('soundex-padzero')
    for name in ('1', '1a', '1b', '2'):
        pad.add_state(name)
    pad.initial_state = '1'
    pad.set_final('2')

    for ch in string.letters:
        pad.add_arc('1', '1', ch, ch)
    for value in range(10):
        digit = str(value)
        pad.add_arc('1', '1', digit, digit)

    # Epsilon arcs: each consumes nothing and outputs a single '0'.
    for src, dst in (('1', '1a'), ('1a', '1b'), ('1b', '2')):
        pad.add_arc(src, dst, (), '0')
    return pad
def letters_to_numbers():
    """
    Returns an FST that converts letters to numbers as specified by
    the soundex algorithm

    In this version the first lowercase letter is copied to the output
    and every following lowercase letter is replaced by '0'.
    """
    fst = FST('soundex-generate')

    # 'start' is the initial state, 'next' the only final state
    for name in ('start', 'next'):
        fst.add_state(name)
    fst.initial_state = 'start'
    fst.set_final('next')

    # Add the rest of the arcs
    for ch in string.ascii_lowercase:
        fst.add_arc('start', 'next', ch, ch)
        fst.add_arc('next', 'next', ch, '0')
    return fst
def computeCTreeError(cMatrix, realTree):
    """
    Score the tree inferred from ``cMatrix`` against ``realTree``.

    A pairwise distance matrix over the columns of ``cMatrix`` is filled
    with ``FST().computeDistance``, a minimum spanning tree is derived
    from it via ``generateInitialTree`` / ``computeMST``, and the value of
    ``SimulationErrorHandler().computeTreeError([mst], realTree)`` is
    returned.
    """
    nSamples = cMatrix.shape[1]

    # Compute the distance pairwise between samples
    pairwise = np.empty([nSamples, nSamples], dtype=float)
    for row in range(nSamples):
        for col in range(nSamples):
            # The distance can be computed for the entire column at once
            # using the FST
            pairwise[row, col] = FST().computeDistance(
                cMatrix[:, row], cMatrix[:, col])

    # Compute the MST
    completeGraph = generateInitialTree(pairwise, realTree.vertices)
    spanningTree = computeMST(completeGraph, realTree.vertices)

    return SimulationErrorHandler().computeTreeError([spanningTree], realTree)
def fstAlleleDistance(self, data, samples):
    """
    Compute the allele distance between every ordered pair of samples.

    ``data`` is indexed as ``data[:, i]`` to obtain the profile of sample
    ``i``; ``samples[i]`` is passed along with it to
    ``FST.computeAlleleDistance``. The first element of each result is
    stored as the message for the pair ``(i, j)``, the second as the
    distance. The distance matrix is printed before returning.

    Returns ``[messages, distance_matrix]``.
    """
    # this could have been more efficient when we pass objects
    calculator = FST()
    nSamples = data.shape[1]
    distances = np.zeros((nSamples, nSamples))
    notes = dict()

    for i in range(nSamples):
        for j in range(nSamples):
            # extract the whole copy number profile, so the whole column
            outcome = calculator.computeAlleleDistance(
                data[:, i], data[:, j], samples[i], samples[j])
            notes[(i, j)] = outcome[0]
            distances[i, j] = outcome[1]

    print("distances: ")
    print(distances)
    return [notes, distances]
def letters_to_numbers():
    """
    Returns an FST that converts letters to numbers as specified by
    the soundex algorithm

    The first lowercase letter is copied. Each later letter is mapped
    through the module-level collections ``vowels`` (no output) and
    ``grp1`` .. ``grp6`` (outputs '1' .. '6'); a letter found in none of
    them gets no 'next' -> 'next' arc.
    """
    fst = FST('soundex-generate')

    # 'start' is the initial state, 'next' the only final state
    fst.add_state('start')
    fst.add_state('next')
    fst.initial_state = 'start'
    fst.set_final('next')

    # Checked in this order; the first collection containing the letter wins.
    outputs_by_group = (
        (vowels, ()),
        (grp1, '1'),
        (grp2, '2'),
        (grp3, '3'),
        (grp4, '4'),
        (grp5, '5'),
        (grp6, '6'),
    )

    # Add the rest of the arcs
    for ch in string.ascii_lowercase:
        fst.add_arc('start', 'next', ch, ch)
        for members, emitted in outputs_by_group:
            if ch in members:
                fst.add_arc('next', 'next', ch, emitted)
                break
    return fst
def add_zero_padding():
    """
    Build the third fst - the zero-padding fst.

    State 0 follows a leading letter and states 1-3 follow the first,
    second and third digit. States 4-6 form the padding chain: epsilon
    arcs 0->4, 1->5 and 2->6 followed by 4->5->6 each emit one '0'.
    States 3 and 6 are final.
    """
    pad = FST('soundex-padzero')
    pad.add_state('start')
    for idx in range(7):
        pad.add_state(idx)
    pad.initial_state = 'start'
    pad.set_final(3)
    pad.set_final(6)

    for ch in string.letters:
        pad.add_arc('start', 0, ch, ch)

    for value in range(10):
        digit = str(value)
        pad.add_arc('start', 1, digit, digit)
        for src in (0, 1, 2):
            pad.add_arc(src, src + 1, digit, digit)

    # adding empty number arcs :
    for src in (0, 1, 2):
        pad.add_arc(src, src + 4, (), '0')
    for src in (4, 5):
        pad.add_arc(src, src + 1, (), '0')
    return pad
def fst_test(test_string):
    """
    Transduce ``test_string`` with a two-state FST over {a, b} and print
    the input together with the result.

    In "q0" an 'a' is rewritten to "ba" and moves to "q1"; every other
    transition copies its input symbol.
    """
    # This FST replaces the first 'a' in a string with an 'ba'
    transitions = {
        "q0": {"a": ["q1", "ba"], "b": ["q0", "b"]},
        "q1": {"a": ["q1", "a"], "b": ["q1", "b"]},
    }
    machine = FST(
        ["q0", "q1"],   # states
        ["a", "b"],     # input alphabet
        ["a", "b"],     # output alphabet
        "q0",           # start state
        ["q0", "q1"],   # final states
        transitions)
    print(test_string + " : " + machine.transduce_string(test_string))
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    '1' is the initial state, '2_L' follows a leading letter, and '2_N',
    '3', '4' follow the first, second and third digit. Every state except
    '1' is final. Letters after the first and digits after the third are
    consumed with empty output.
    """
    f2 = FST('soundex-truncate')

    # Indicate initial and final states (each final state is set once)
    for name in ('1', '2_N', '2_L', '3', '4'):
        f2.add_state(name)
    f2.initial_state = '1'
    for name in ('4', '2_N', '2_L', '3'):
        f2.set_final(name)

    # Add the arcs
    for letter in string.letters:
        f2.add_arc('1', '2_L', letter, letter)
        f2.add_arc('2_L', '2_L', letter, '')

    # Digit arcs are built in their own loop so that each one, including
    # '2_L' -> '2_N', is registered exactly once per digit and never
    # once per letter.
    for value in range(10):
        digit = str(value)
        f2.add_arc('2_L', '2_N', digit, digit)
        f2.add_arc('1', '2_N', digit, digit)
        f2.add_arc('2_N', '3', digit, digit)
        f2.add_arc('3', '4', digit, digit)
        f2.add_arc('4', '4', digit, '')
    return f2
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    'ste' is the initial state, 'L1' follows a leading letter, and 'N1',
    'N2', 'N3' follow the first, second and third digit. Digits read in
    'N3' are dropped. 'next1' is the only final state and is reached by
    an epsilon arc from 'L1', 'N1', 'N2' and 'N3'.
    """
    fst = FST('soundex-truncate')

    # Indicate initial and final states
    for name in ('ste', 'L1', 'N1', 'N2', 'N3', 'next1'):
        fst.add_state(name)
    fst.initial_state = 'ste'
    fst.set_final('next1')

    for ch in string.letters:
        fst.add_arc('ste', 'L1', ch, ch)

    copying_steps = (('ste', 'N1'), ('L1', 'N1'), ('N1', 'N2'), ('N2', 'N3'))
    for value in range(10):
        digit = str(value)
        for src, dst in copying_steps:
            fst.add_arc(src, dst, digit, digit)
        fst.add_arc('N3', 'N3', digit, ())

    # Epsilon arcs into the final state
    for src in ('L1', 'N1', 'N2', 'N3'):
        fst.add_arc(src, 'next1', (), ())
    return fst
def add_zero_padding():
    """
    Build the third fst - the zero-padding fst.

    '0' is the initial state, '1' follows a leading letter, and '2', '3',
    '4' follow the first, second and third digit. Epsilon chains
    '1'->'11'->'12'->'13', '2'->'21'->'22' and '3'->'4' each emit one
    '0' per arc. Final states are '4', '22' and '13'.
    """
    pad = FST('soundex-padzero')

    # Indicate initial and final states
    for name in ('0', '1', '2', '3', '4', '11', '12', '13', '21', '22'):
        pad.add_state(name)
    pad.initial_state = '0'
    for name in ('4', '22', '13'):
        pad.set_final(name)

    for ch in string.ascii_letters:
        pad.add_arc('0', '1', ch, ch)

    copying_steps = (('0', '2'), ('1', '2'), ('2', '3'), ('3', '4'))
    for digit in '0123456789':
        for src, dst in copying_steps:
            pad.add_arc(src, dst, digit, digit)

    # padding with zeros if required.
    padding_steps = (
        ('1', '11'), ('11', '12'), ('12', '13'),
        ('2', '21'), ('21', '22'),
        ('3', '4'),
    )
    for src, dst in padding_steps:
        pad.add_arc(src, dst, '', '0')
    return pad
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    State '1' copies leading letters; '2', '3' and '4' follow the first,
    second and third digit, and digits read in '4' are dropped.

    All four states are final. State '1' has to be final as well,
    otherwise a code that consists of a letter and no digits at all
    would be rejected instead of being passed through unchanged.
    """
    f2 = FST('soundex-truncate')

    # Indicate initial and final states
    for name in ('1', '2', '3', '4'):
        f2.add_state(name)
    f2.initial_state = '1'

    # Need to account for truncation scenarios where the soundex string is
    # less than four characters - including the letter-only case.
    for name in ('1', '2', '3', '4'):
        f2.set_final(name)

    # Add the arcs
    for letter in string.letters:
        f2.add_arc('1', '1', letter, letter)

    digits = [str(n) for n in range(10)]
    for src, dst in (('1', '2'), ('2', '3'), ('3', '4')):
        for digit in digits:
            f2.add_arc(src, dst, digit, digit)
    for digit in digits:
        f2.add_arc('4', '4', digit, ())
    return f2
def fst_from_prohibited_string(input_alphabet, output_alphabet,
                               banned_string, violation_name):
    """
    Build an FST that assigns one ``violation_name`` violation whenever
    ``banned_string`` is completed.

    States are the proper prefixes of ``banned_string`` (starting from
    the empty string). The arc that would complete the banned string
    carries ``Counter({violation_name: 1})`` and leads to the state chosen
    by ``longest_suffix``.
    """
    fst = FST(set([""]), "", set(), set(), "")

    # Add arcs along the prefixes; the range is empty for strings shorter
    # than two characters.
    for end in range(1, len(banned_string)):
        prefix = banned_string[0:end]
        fst.states.add(prefix)
        add_alternation_arcs(fst, banned_string[0:end - 1], prefix,
                             '_', banned_string[end - 1])

    # Send a penalty arc to the longest valid suffix
    fst.arcs.add(
        Arc(banned_string[0:-1],
            longest_suffix(banned_string, fst.states),
            Label('_', banned_string[-1], Counter({violation_name: 1}))))

    # Add loopback arcs and return
    for state in fst.states:
        for symbol in input_alphabet:
            add_elision_arc(fst, state, symbol)
        for symbol in fst.chars_not_leaving(state, output_alphabet):
            target = longest_suffix(state + symbol, fst.states)
            add_alternation_arcs(fst, state, target, '_', symbol)
    return fst
def add_zero_padding():
    """
    Build the third fst - the zero-padding fst.

    Only the entry arcs are created here: digits leave 'start' through
    ``add_numbers`` towards 'just_numbers', letters lead to
    'letter_first'. The remaining structure is delegated to
    ``build_letter_first`` and ``build_number_first``, which both receive
    the same list of state names 'e0' .. 'e5'.
    """
    # Variable aliases
    start, digits_only, after_letter = 'start', 'just_numbers', 'letter_first'
    pad_states = ['e0', 'e1', 'e2', 'e3', 'e4', 'e5']

    # Initialization
    pad = FST('soundex-padzero')
    for name in (start, digits_only, after_letter):
        pad.add_state(name)
    pad.initial_state = start

    add_numbers(pad, start, digits_only)
    for ch in string.ascii_letters:
        pad.add_arc(start, after_letter, ch, ch)

    build_letter_first(pad, pad_states, after_letter)
    build_number_first(pad, pad_states, digits_only)
    return pad
def add_zero_padding():
    """
    Build the third fst - the zero-padding fst.

    '1' is the initial state, '2' follows the leading letter(s), and '3',
    '4', '5' follow the first, second and third digit. Codes with fewer
    than three digits are padded with '0' through epsilon arcs:

    * no digit:   '2' -> '8' -> '6' -> '7'   (three zeros)
    * one digit:  '3' -> '6' -> '7'          (two zeros)
    * two digits: '4' -> '7'                 (one zero)

    Final states are '5' (already three digits) and '7' (padded). The
    padding-only states '8', '6' and '7' have no arcs that consume
    input, so a padding path can only succeed at the end of the input.
    """
    f3 = FST('soundex-padzero')
    f3.add_state('1')
    f3.initial_state = '1'
    # '8' is the extra state needed to pad a letter-only code.
    for i in range(2, 9):
        f3.add_state(str(i))
    f3.set_final('5')
    f3.set_final('7')

    for letter in string.letters:
        f3.add_arc('1', '2', letter, letter)
        f3.add_arc('2', '2', letter, letter)

    for digit in ['1', '2', '3', '4', '5', '6']:
        f3.add_arc('1', '3', digit, digit)
        f3.add_arc('2', '3', digit, digit)
        f3.add_arc('3', '4', digit, digit)
        f3.add_arc('4', '5', digit, digit)

    # Padding arcs: epsilon input, one '0' of output each.
    f3.add_arc('3', '6', '', '0')
    f3.add_arc('6', '7', '', '0')
    f3.add_arc('4', '7', '', '0')
    # Without this entry a code with no digits could never reach a final
    # state and was rejected instead of being padded with three zeros.
    f3.add_arc('2', '8', '', '0')
    f3.add_arc('8', '6', '', '0')
    return f3
def add_zero_padding():
    """
    Build the third fst - the zero-padding fst.

    State '1' copies the leading letter(s); '2', '3' and '4' follow the
    first, second and third digit. '4' is the only final state and has no
    outgoing arcs. Shorter codes reach it through an epsilon arc that
    emits the missing zeros: '000' from '1', '00' from '2', '0' from '3'.
    """
    f3 = FST('soundex-padzero')
    for state in ['1', '2', '3', '4']:
        f3.add_state(state)
    f3.initial_state = '1'
    f3.set_final('4')

    for letter in string.letters:
        f3.add_arc('1', '1', letter, letter)

    for number in range(1, 10):
        digit = str(number)
        f3.add_arc('1', '2', digit, digit)
        f3.add_arc('2', '3', digit, digit)
        f3.add_arc('3', '4', digit, digit)

    # Padding arcs (epsilon input). The arc out of '1' covers a code that
    # has no digits at all; without it such a code never reached the
    # final state and was rejected.
    f3.add_arc('1', '4', (), '000')
    f3.add_arc('2', '4', (), '00')
    f3.add_arc('3', '4', (), '0')
    return f3
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    'start' is the initial state, state 0 follows a leading letter and
    states 1-3 follow the first, second and third digit. States 0-3 are
    all final. Digits read in state 3 are dropped.
    """
    fst = FST('soundex-truncate')

    # Indicate initial and final states
    fst.add_state('start')
    for idx in range(4):
        fst.add_state(idx)
        fst.set_final(idx)
    fst.initial_state = 'start'

    # Add the arcs
    for ch in string.letters:
        fst.add_arc('start', 0, ch, ch)

    for value in range(10):
        digit = str(value)
        fst.add_arc('start', 1, digit, digit)
        for src in (0, 1, 2):
            fst.add_arc(src, src + 1, digit, digit)
        fst.add_arc(3, 3, digit, ())
    return fst
def generate_control(self):
    """
    Build the control FST: a hypercube over the arguments in
    ``self.matchers``.

    State "0" is the initial state and moves to "1" on a VERB match.
    From "1" on, each argument at position ``k`` of the argument list
    adds ``2 ** k`` to the state number when it is filled, so the state
    reached after all arguments have been filled is ``2 ** n``, the only
    final state.

    Each (state, argument) edge is registered exactly once, in the order
    in which it is first met while walking the argument permutations.
    """
    # Materialise the keys so that positions are stable and indexable.
    arguments = list(self.matchers.keys())
    last_state = 2 ** len(arguments)

    # this will be a hypercube
    control = FST()

    # zero state is for verb
    control.add_state("0", is_init=True, is_final=False)

    # inside states for the cube, except the last, accepting state
    for i in range(1, last_state):
        control.add_state(str(i), is_init=False, is_final=False)

    # last node of the hypercube
    control.add_state(str(last_state), is_init=False, is_final=True)

    # first transition
    control.add_transition(
        KRPosMatcher("VERB"),
        [ExpandOperator(self.lexicon, self.working_area)],
        "0", "1")

    # count every transition as an increase in number of state
    increase = dict((arg, 2 ** pos) for pos, arg in enumerate(arguments))
    # Different permutations share prefixes as sets, so the same edge
    # shows up many times; remember which ones were already added.
    registered = set()
    for path in permutations(arguments):
        actual_state = 1
        for arg in path:
            new_state = actual_state + increase[arg]
            edge = (actual_state, arg)
            if edge not in registered:
                registered.add(edge)
                control.add_transition(
                    self.matchers[arg],
                    [FillArgumentOperator(arg, self.working_area)],
                    str(actual_state), str(new_state))
            actual_state = new_state

    return control
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    '1' is the initial state, '2' follows a leading letter, and '3', '4',
    '5' follow the first, second and third digit. States '2'-'5' are
    final. Digits read in '5' are dropped.
    """
    fst = FST('soundex-truncate')

    # Indicate initial and final states
    names = ('1', '2', '3', '4', '5')
    for name in names:
        fst.add_state(name)
    fst.initial_state = '1'
    for name in names[1:]:
        fst.set_final(name)

    # Add the arcs
    for ch in string.letters:
        fst.add_arc('1', '2', ch, ch)

    copying_steps = (('1', '3'), ('2', '3'), ('3', '4'), ('4', '5'))
    for value in range(10):
        digit = str(value)
        for src, dst in copying_steps:
            fst.add_arc(src, dst, digit, digit)
        fst.add_arc('5', '5', digit, ())
    return fst
def french_count():
    """
    Build an FST that reads three digit symbols and outputs entries of
    ``kFRENCH_TRANS`` (and ``kFRENCH_AND``).

    Layout of the states:

    * 'start' -> '0'..'9'  : consumes the first digit, no output
    * '0'..'9' -> '10'..'19': consumes the second digit and outputs the
      words for the first digit
    * '10'..'19' -> '20'..'29': consumes the third digit and outputs the
      words for the last two digits

    States '20'..'29' and 'z' are final. The single symbol 'z' leads
    straight from 'start' to 'z' and outputs ``kFRENCH_TRANS[0]``.

    Integers are compared with ``==`` / range checks; identity (``is``)
    comparison of ints only works by accident of small-int caching.
    """
    f = FST('french')
    f.add_state('start')
    f.add_state('z')
    for i in range(30):
        f.add_state(str(i))
    f.initial_state = ('start')
    for i in range(20, 30):
        f.set_final(str(i))
    f.set_final('z')
    f.add_arc('start', 'z', ['z'], [kFRENCH_TRANS[0]])

    # First digit; its words are emitted when the second digit is read.
    for first in range(10):
        f.add_arc('start', str(first), [str(first)], [])
        if first == 0:
            first_words = []
        elif first == 1:
            first_words = [kFRENCH_TRANS[100]]
        else:
            first_words = [kFRENCH_TRANS[first], kFRENCH_TRANS[100]]
        for second_state in range(10, 20):
            f.add_arc(str(first), str(second_state),
                      [str(second_state - 10)], list(first_words))

    # Second and third digit are worded together on the last arc.
    for second_state in range(10, 20):
        second = second_state - 10
        for third_state in range(20, 30):
            third = third_state - 20
            if second == 0:
                words = [kFRENCH_TRANS[third]] if third != 0 else []
            elif second == 1:
                # 10-16 have their own entry, 17-19 are 10 + unit
                if third < 7:
                    words = [kFRENCH_TRANS[third + 10]]
                else:
                    words = [kFRENCH_TRANS[10], kFRENCH_TRANS[third]]
            elif second < 7:
                words = [kFRENCH_TRANS[second * 10]]
                if third == 1:
                    words += [kFRENCH_AND, kFRENCH_TRANS[1]]
                elif third != 0:
                    words.append(kFRENCH_TRANS[third])
            elif second == 7:
                # built on the entry for 60 plus 10..19
                words = [kFRENCH_TRANS[60]]
                if third == 0:
                    words.append(kFRENCH_TRANS[10])
                elif third == 1:
                    words += [kFRENCH_AND, kFRENCH_TRANS[11]]
                elif third < 7:
                    words.append(kFRENCH_TRANS[third + 10])
                else:
                    words += [kFRENCH_TRANS[10], kFRENCH_TRANS[third]]
            elif second == 8:
                # built on the entries for 4 and 20
                words = [kFRENCH_TRANS[4], kFRENCH_TRANS[20]]
                if third != 0:
                    words.append(kFRENCH_TRANS[third])
            else:
                # second == 9: entries for 4 and 20 plus 10..19
                words = [kFRENCH_TRANS[4], kFRENCH_TRANS[20]]
                if third < 7:
                    words.append(kFRENCH_TRANS[third + 10])
                else:
                    words += [kFRENCH_TRANS[10], kFRENCH_TRANS[third]]
            f.add_arc(str(second_state), str(third_state),
                      [str(third)], words)
    return f
def union_fst(self, list_fst):
    """
    Return a new ``FST`` built from the five components that
    ``self._merge_fst`` produces for ``list_fst`` with ``op=UNION``.
    """
    merged = self._merge_fst(list_fst, op=UNION)
    # Unpack explicitly so a result of the wrong length fails here.
    alphabet, states, start_state, accept_states, transitions = merged
    return FST(alphabet, states, start_state, accept_states, transitions)
def intersection_fst(self, list_fst):
    """
    Return a new ``FST`` built from the five components that
    ``self._merge_fst`` produces for ``list_fst`` with ``op=INTERSECT``.
    """
    merged = self._merge_fst(list_fst, op=INTERSECT)
    # Unpack explicitly so a result of the wrong length fails here.
    alphabet, states, start_state, accept_states, transitions = merged
    return FST(alphabet, states, start_state, accept_states, transitions)
_states_L1 = {"s0", "s1", "s2", "s3", "s4"} _init_state_L1 = "s0" _accept_states_L1 = [("s2", 1), ("s4", 1)] _transitions_L1 = [ ("s0", "a", "s1"), ("s0", "b", "s3"), ("s1", "a", "s1"), ("s1", "b", "s2"), ("s2", "a", "s1"), ("s2", "b", "s2"), ("s3", "a", "s4"), ("s3", "b", "s3"), ("s4", "a", "s4"), ("s4", "b", "s3") ] _fst_L1 = FST(_alphabet_L1, _states_L1, _init_state_L1, _accept_states_L1, _transitions_L1) print("check FST - L1") print(_fst_L1) assert _fst_L1.go("aaabbababaabbab")[1] assert not _fst_L1.go("aaabbbbba")[1] rand = "".join(_fst_L1.go()) print("sample:" + rand) assert _fst_L1.go(rand) # L2 _alphabet_L2 = ["a", "b"] _states_L2 = {"q0", "q1", "q2", "q3"} _init_state_L2 = "q0" _accept_states_L2 = [("q2", 3)] _transitions_L2 = [ ("q0", "a", "q1"),
def letters_to_numbers():
    """
    Returns an FST that converts letters to numbers as specified by
    the soundex algorithm

    States:

    * 'start' - initial state; the first letter is copied to the output
    * 's11'..'s66' - the first letter belonged to consonant group 1..6
    * 's1'..'s6'  - the last letter read (not the first) was in group 1..6
    * 'sv' - the last letter read was one of the dropped letters
    * 'next' - the only final state, reached by an epsilon arc from every
      state except 'start'

    A consonant outputs its group digit unless the machine is already in
    one of the two states of the same group, in which case nothing is
    output.
    """
    dropped = [
        'a', 'A', 'e', 'E', 'h', 'H', 'i', 'I', 'o', 'O', 'u', 'U', 'w',
        'W', 'y', 'Y'
    ]

    # Every state between 'start' and 'next', in creation order.
    inner_states = ('s11', 's22', 's33', 's44', 's55', 's66',
                    's1', 's2', 's3', 'sv', 's4', 's5', 's6')

    # Source states of the arcs into 'sv', in registration order.
    vowel_sources = ('s11', 's33', 's22', 's44', 's55', 's66', 'sv',
                     's1', 's2', 's3', 's4', 's6', 's5')

    # (letters, state after a first letter of the group, state after a
    #  later one, digit, source states in registration order)
    consonant_groups = (
        ("Ll", 's44', 's4', '4',
         ('s44', 's11', 's22', 's33', 's55', 's66', 's4',
          's1', 's2', 's3', 's5', 's6', 'sv')),
        ('Rr', 's66', 's6', '6',
         ('s66', 's22', 's33', 's44', 's55', 's11', 's6',
          's1', 's2', 's3', 's5', 's4', 'sv')),
        ("bfpvBFPV", 's11', 's1', '1',
         ('s11', 's22', 's33', 's44', 's55', 's66', 's1',
          's5', 's2', 's3', 's4', 'sv', 's6')),
        ("cgjkqsxzCGJKQSXZ", 's22', 's2', '2',
         ('s22', 's11', 's33', 's44', 's55', 's66', 's2',
          's5', 's1', 's3', 's4', 'sv', 's6')),
        ("mnMN", 's55', 's5', '5',
         ('s55', 's11', 's44', 's33', 's22', 's66', 's5',
          's2', 's1', 's3', 's4', 'sv', 's6')),
        ("dtDT", 's33', 's3', '3',
         ('s33', 's11', 's44', 's55', 's22', 's66', 's3',
          's2', 's1', 's5', 's4', 'sv', 's6')),
    )

    # Let's define our first FST
    f1 = FST('soundex-generate')
    f1.add_state('start')
    for name in inner_states:
        f1.add_state(name)
    f1.add_state('next')
    f1.initial_state = 'start'

    # Set all the final states
    f1.set_final('next')

    # Add the rest of the arcs
    for letter in string.ascii_letters:
        if letter in dropped:
            f1.add_arc('start', 'sv', letter, letter)
            for src in vowel_sources:
                f1.add_arc(src, 'sv', letter, ())
            continue

        for members, first_state, later_state, digit, sources in consonant_groups:
            if letter not in members:
                continue
            f1.add_arc('start', first_state, letter, letter)
            for src in sources:
                # Same group as the previous letter: emit nothing.
                same_group = src == first_state or src == later_state
                f1.add_arc(src, later_state, letter, () if same_group else digit)
            break

    # Epsilon arcs into the final state
    for name in inner_states:
        f1.add_arc(name, 'next', (), ())
    return f1
from fst import FST import argparse parser = argparse.ArgumentParser() parser.add_argument('--fst', action='store_true') parser.add_argument('--isyms', action='store_true') parser.add_argument('--osyms', action='store_true') parser.add_argument('--name', type=str, required=True) parser.add_argument('--file', type=str, required=True) if __name__ == '__main__': args = parser.parse_args() fst = FST(args.name) fst.initial_state = fst.new_state() fst.final_states.append(fst.new_state()) with open(args.file, 'r') as f: for line in f: if fst.name == 'L': word, phones = line.strip().split('\t') tokens = phones.split() elif fst.name == 'S': word = line.strip().split('\t')[0] tokens = list(word) if len(tokens) == 1: fst.add_arc(fst.initial_state, fst.final_states[0], word, tokens[0]) else: state = fst.new_state() fst.add_arc(fst.initial_state, state, word, tokens[0]) for phone in tokens[1:-1]:
def french_count():
    """
    Build an FST that reads three digit symbols and outputs entries of
    ``kFRENCH_TRANS`` (and ``kFRENCH_AND``).

    States:

    * 'start' - initial state
    * '0xx' / 'nxx' - first digit was 0 / non-zero
    * '00x' - first two digits were 0
    * 'n0x', 'n1x', 'n7x', 'n8x', 'n9x' - second digit was 0, 1, 7, 8, 9
    * 'nnx' - second digit was 2..6
    * 'last' - the only final state

    For every digit the arc out of '00x' is registered first, followed by
    one arc out of each of 'start', '0xx', 'nxx', 'n0x', 'n1x', 'nnx',
    'n7x', 'n8x' and 'n9x', in that order.
    """
    f = FST('french')
    for name in ('start', '0xx', 'nxx', '00x', 'n0x', 'n1x', 'nnx',
                 'n7x', 'n8x', 'n9x', 'last'):
        f.add_state(name)
    f.initial_state = 'start'
    f.set_final('last')

    T = kFRENCH_TRANS
    for value in range(10):
        digit = str(value)
        f.add_arc('00x', 'last', [digit], [T[value]])

        if value == 0:
            arcs = [
                ('start', '0xx', ()),
                ('0xx', '00x', ()),
                ('nxx', 'n0x', ()),
                ('n0x', 'last', ()),
                ('n1x', 'last', [T[value + 10]]),
                ('nnx', 'last', ()),
                ('n7x', 'last', [T[value + 10]]),
                ('n8x', 'last', ()),
                ('n9x', 'last', [T[value + 10]]),
            ]
        elif value == 1:
            arcs = [
                ('start', 'nxx', [T[100]]),
                ('0xx', 'n1x', ()),
                ('nxx', 'n1x', ()),
                ('n0x', 'last', [T[value]]),
                ('n1x', 'last', [T[value + 10]]),
                ('nnx', 'last', [kFRENCH_AND, T[value]]),
                ('n7x', 'last', [kFRENCH_AND, T[value + 10]]),
                ('n8x', 'last', [T[value]]),
                ('n9x', 'last', [T[value + 10]]),
            ]
        elif value < 7:
            arcs = [
                ('start', 'nxx', [T[value], T[100]]),
                ('0xx', 'nnx', [T[value * 10]]),
                ('nxx', 'nnx', [T[value * 10]]),
                ('n0x', 'last', [T[value]]),
                ('n1x', 'last', [T[value + 10]]),
                ('nnx', 'last', [T[value]]),
                ('n7x', 'last', [T[value + 10]]),
                ('n8x', 'last', [T[value]]),
                ('n9x', 'last', [T[value + 10]]),
            ]
        else:
            # 7, 8 and 9 share the unit wording and differ only in the
            # state entered and the words emitted for the second digit.
            tens_state = 'n%sx' % digit
            if value == 7:
                from_0xx, from_nxx = [T[6 * 10]], [T[6 * 10]]
            else:
                from_0xx, from_nxx = [T[4], T[20]], [T[4], T[20]]
            arcs = [
                ('start', 'nxx', [T[value], T[100]]),
                ('0xx', tens_state, from_0xx),
                ('nxx', tens_state, from_nxx),
                ('n0x', 'last', [T[value]]),
                ('n1x', 'last', [T[10], T[value]]),
                ('nnx', 'last', [T[value]]),
                ('n7x', 'last', [T[10], T[value]]),
                ('n8x', 'last', [T[value]]),
                ('n9x', 'last', [T[10], T[value]]),
            ]

        for src, dst, words in arcs:
            f.add_arc(src, dst, [digit], words)
    return f
fst_trans[char_list[i][0]] = {} fst_trans["start"][char_list[i][0]] = [ char_list[i][:1], arp_list[i][0] + " " ] for i in range(0, len(char_list)): for j in range(1, len(char_list[i])): # adding new states to fst_states fst_states.append(char_list[i][:j + 1]) # adding state transitions, where state names are string of chars in the color # so far so the 3rd state on the way to blue is named 'blu' # example in dictionary: 'ru': {'b': ['rub', 'B ']} fst_trans[char_list[i][:j + 1]] = {} fst_trans[char_list[i][:j]][char_list[i][j]] = [char_list[i][:j + 1]] # if/else accounting for silent letters at end of word if j < len(arp_list[i]): fst_trans[char_list[i][:j]][char_list[i][j]].append( arp_list[i][j] + " ") else: fst_trans[char_list[i][:j]][char_list[i][j]].append("") # Our final states are all of our input words fst_final = char_list # Declaring our fst test_fst = FST(fst_states, fst_in_alph, fst_out_alph, fst_start, fst_final, fst_trans) # Calling transduce on each color in our input for word in char_list: print test_fst.transduce_string(word)