Exemplo n.º 1
0
def soundex_convert(name_string):
    """Convert a name into its Soundex code by chaining three FSTs.

    The transducers are built in pipeline order (letters to numbers,
    truncation to three digits, zero padding) and handed to ``compose``
    together with the input; the first result returned by ``compose`` is
    joined into a single string.
    """
    stages = (
        letters_to_numbers(),
        truncate_to_three_digits(),
        add_zero_padding(),
    )
    first_result = compose(name_string, *stages)[0]
    return ''.join(first_result)
Exemplo n.º 2
0
def soundex_convert(name_string):
    """Convert a name into its Soundex code by chaining three FSTs.

    The name is first split into a list of single characters, then run
    through the letters-to-numbers, truncate-to-three-digits and
    zero-padding transducers via ``compose``. The first result returned
    by ``compose`` is joined into a single string.
    """
    characters = list(name_string)
    to_digits = letters_to_numbers()
    truncate = truncate_to_three_digits()
    pad = add_zero_padding()
    results = compose(characters, to_digits, truncate, pad)
    return ''.join(results[0])
    def parse(self, word):
        """Morphologically parse a word ending in 'ing' or 'ed'.

        Two transducers are built and applied in sequence via ``compose``:
        the first replaces a trailing 'ing' / 'ed' with a tag, the second
        deletes the 'k' that follows a vowel + 'c' before the tag (with a
        dedicated path that keeps the 'k' of a word starting with 'lick').

        e.g.
        p = Parser()
        word = ['p','a','n','i','c','k','i','n','g']
        p.parse(word)
        ---> 'panic+present participle form'
        """
        # ---- First transducer: detect the affixes 'ing' and 'ed' ----
        affix_fst = FST('morphology-parse')
        for state_name in ('start', 'i', 'n', 'g', 'e', 'd'):
            affix_fst.add_state(state_name)
        affix_fst.initial_state = 'start'
        affix_fst.set_final('d')
        affix_fst.set_final('g')

        for ch in string.ascii_lowercase:
            # Ordinary letters are copied through; 'i' and 'e' may open a
            # suffix, so they get their own arcs further down.
            if ch not in 'ie':
                affix_fst.add_arc('start', 'start', ch, ch)
            # A tentative suffix that does not continue falls back to
            # 'start' and re-emits the letters that were held back.
            if ch != 'n':
                affix_fst.add_arc('i', 'start', ch, ('i', ch))
            if ch != 'd':
                affix_fst.add_arc('e', 'start', ch, ['e', ch])
            if ch != 'g':
                affix_fst.add_arc('n', 'start', ch, ['i', 'n', ch])

        # Suffix paths: letters are swallowed, the tag is emitted on the last arc.
        for suffix_arc in (
            ('start', 'i', 'i', ''),
            ('i', 'n', 'n', ''),
            ('n', 'g', 'g', '+present participle form'),
            ('start', 'e', 'e', ''),
            ('e', 'd', 'd', '+past form'),
        ):
            affix_fst.add_arc(*suffix_arc)

        # ---- Second transducer: K-deletion ----
        k_fst = FST('morphology-parse')
        for state_name in ('start', 'vowel', 'consonant', 'c', 'k',
                           'lick_l', 'lick_i', 'lick_c', 'lick_k', 'parse'):
            k_fst.add_state(state_name)
        k_fst.initial_state = 'start'
        k_fst.set_final('parse')

        vowels = 'aeiou'
        for v in vowels:
            for origin in ('start', 'vowel', 'consonant', 'c'):
                k_fst.add_arc(origin, 'vowel', v, v)

        for ch in string.ascii_lowercase:
            # Once inside the tag, every letter is copied unchanged.
            k_fst.add_arc('parse', 'parse', ch, ch)

            if ch not in vowels:
                if ch != 'c':
                    k_fst.add_arc('vowel', 'consonant', ch, ch)
                if ch != 'l':
                    k_fst.add_arc('start', 'consonant', ch, ch)
                if ch != 'k':
                    k_fst.add_arc('c', 'consonant', ch, ch)
                k_fst.add_arc('consonant', 'consonant', ch, ch)

        for rule_arc in (
            ('vowel', 'c', 'c', 'c'),
            ('c', 'k', 'k', ''),                 # the deleted 'k'
            ('start', 'lick_l', 'l', 'l'),       # 'lick' keeps its 'k'
            ('lick_l', 'lick_i', 'i', 'i'),
            ('lick_i', 'lick_c', 'c', 'c'),
            ('lick_c', 'lick_k', 'k', 'k'),
            ('lick_k', 'parse', '+', '+'),
            ('k', 'parse', '+', '+'),
            ('consonant', 'parse', '+', '+'),
            ('parse', 'parse', ' ', ' '),
        ):
            k_fst.add_arc(*rule_arc)

        first_result = compose(word, affix_fst, k_fst)[0]
        return "".join(first_result)
	def parse(self, word):
		"""Parse an inflected form of want, sync, panic, havoc or lick.

		Two transducers are built and run in sequence through
		``fsmutils.compose``:

		* a lexicon FST that spells out each known stem and rewrites the
		  suffix boundary as ``^`` and the end of the word as ``#``
		  (e.g. ``panicked`` -> ``panic^ed#``, ``wanting`` -> ``want^ng#``);
		* a rule FST that reads that intermediate form, drops the boundary
		  symbols and suffix letters, and emits the tag.

		Only the first result returned by ``compose`` is used.

		Example:
			p = Parser()
			word = ['p', 'a', 'n', 'i', 'c', 'k', 'e', 'd']
			p.parse(word)
			---> 'panic+past form'

		Args:
			word: sequence of single-character strings spelling the word.

		Returns:
			The stem followed by ``'+past form'`` or
			``'+present participle'``, joined into one string.
		"""
		# NOTE(review): the tag emitted for -ing is '+present participle'
		# (without a trailing ' form'); confirm that this is the label
		# callers expect before changing it.

		# ---- Lexicon FST: surface form -> stem ^ suffix-remainder # ----
		lexicon_arcs = [
			# want
			('1', '2', 'w', 'w'), ('2', '3', 'a', 'a'),
			('3', '4', 'n', 'n'), ('4', '5', 't', 't'),
			('5', '6', 'e', '^'), ('6', '7', 'd', 'd'), ('7', '8', '', '#'),
			('5', '9', 'i', '^'), ('9', '10', 'n', 'n'),
			('10', '11', 'g', 'g'), ('11', '8', '', '#'),
			# sync
			('1', '12', 's', 's'), ('12', '13', 'y', 'y'),
			('13', '14', 'n', 'n'), ('14', '15', 'c', 'c'),
			('15', '16', 'e', '^'), ('16', '17', 'd', 'd'), ('17', '18', '', '#'),
			('15', '19', 'i', '^'), ('19', '20', 'n', 'n'),
			('20', '21', 'g', 'g'), ('21', '18', '', '#'),
			# panic (the inserted 'k' becomes the boundary marker)
			('1', '22', 'p', 'p'), ('22', '23', 'a', 'a'), ('23', '24', 'n', 'n'),
			('24', '25', 'i', 'i'), ('25', '26', 'c', 'c'),
			('26', '27', 'k', '^'), ('27', '28', 'e', 'e'),
			('28', '29', 'd', 'd'), ('29', '30', '', '#'),
			('27', '31', 'i', 'i'), ('31', '32', 'n', 'n'),
			('32', '33', 'g', 'g'), ('33', '30', '', '#'),
			# havoc (same shape as panic)
			('1', '34', 'h', 'h'), ('34', '35', 'a', 'a'), ('35', '36', 'v', 'v'),
			('36', '37', 'o', 'o'), ('37', '38', 'c', 'c'),
			('38', '39', 'k', '^'), ('39', '40', 'e', 'e'),
			('40', '41', 'd', 'd'), ('41', '42', '', '#'),
			('39', '43', 'i', 'i'), ('43', '44', 'n', 'n'),
			('44', '45', 'g', 'g'), ('45', '42', '', '#'),
			# lick (the 'k' belongs to the stem and is kept)
			('1', '46', 'l', 'l'), ('46', '47', 'i', 'i'),
			('47', '48', 'c', 'c'), ('48', '49', 'k', 'k'),
			('49', '50', 'e', '^'), ('50', '51', 'd', 'd'), ('51', '52', '', '#'),
			('49', '53', 'i', '^'), ('53', '54', 'n', 'n'),
			('54', '55', 'g', 'g'), ('55', '52', '', '#'),
		]
		f2 = fst.FST('lexicon')
		for i in range(1, 56):
			f2.add_state(str(i))
		f2.initial_state = '1'
		for src, dst, in_sym, out_sym in lexicon_arcs:
			f2.add_arc(src, dst, in_sym, out_sym)
		for final_state in ('8', '18', '30', '42', '52'):
			f2.set_final(final_state)

		# ---- Rule FST: stem ^ suffix-remainder # -> stem + tag ----
		rule_arcs = [
			# panic: reads '^ed#' or '^ing#'
			('1', '2', 'p', 'p'), ('2', '3', 'a', 'a'), ('3', '4', 'n', 'n'),
			('4', '5', 'i', 'i'), ('5', '6', 'c', 'c'),
			('6', '7', '^', ''), ('7', '8', 'e', ''),
			('8', '9', 'd', '+past form'), ('9', '10', '#', ''),
			('7', '11', 'i', ''), ('11', '12', 'n', ''),
			('12', '13', 'g', '+present participle'), ('13', '10', '#', ''),
			# havoc: reads '^ed#' or '^ing#'
			('1', '14', 'h', 'h'), ('14', '15', 'a', 'a'), ('15', '16', 'v', 'v'),
			('16', '17', 'o', 'o'), ('17', '18', 'c', 'c'),
			('18', '19', '^', ''), ('19', '20', 'e', ''),
			('20', '21', 'd', '+past form'), ('21', '22', '#', ''),
			('19', '23', 'i', ''), ('23', '24', 'n', ''),
			('24', '25', 'g', '+present participle'), ('25', '22', '#', ''),
			# lick: reads '^d#' or '^ng#'
			('1', '26', 'l', 'l'), ('26', '27', 'i', 'i'),
			('27', '28', 'c', 'c'), ('28', '29', 'k', 'k'),
			('29', '30', '^', ''),
			('30', '31', 'd', '+past form'), ('31', '32', '#', ''),
			('30', '33', 'n', ''),
			('33', '34', 'g', '+present participle'), ('34', '32', '#', ''),
			# sync: reads '^d#' or '^ng#'
			('1', '35', 's', 's'), ('35', '36', 'y', 'y'),
			('36', '37', 'n', 'n'), ('37', '38', 'c', 'c'),
			('38', '39', '^', ''),
			('39', '40', 'd', '+past form'), ('40', '41', '#', ''),
			# The -ing branch must leave state 39 (reached after '^'), like
			# the -ed branch; starting it at 38 made 'sync^ng#' unreadable.
			('39', '42', 'n', ''),
			('42', '43', 'g', '+present participle'), ('43', '41', '#', ''),
			# want: reads '^d#' or '^ng#'
			('1', '44', 'w', 'w'), ('44', '45', 'a', 'a'),
			('45', '46', 'n', 'n'), ('46', '47', 't', 't'),
			('47', '48', '^', ''),
			('48', '49', 'd', '+past form'), ('49', '50', '#', ''),
			# Same correction as for sync: branch from state 48, the state
			# reached after '^', so that 'want^ng#' can be consumed.
			('48', '51', 'n', ''),
			('51', '52', 'g', '+present participle'), ('52', '50', '#', ''),
		]
		f3 = fst.FST('rule')
		for i in range(1, 53):
			f3.add_state(str(i))
		f3.initial_state = '1'
		for src, dst, in_sym, out_sym in rule_arcs:
			f3.add_arc(src, dst, in_sym, out_sym)
		for final_state in ('10', '22', '32', '41', '50'):
			f3.set_final(final_state)

		output = ''.join(fsmutils.compose(word, f2, f3)[0])
		return output
		'''output = ['p','a','n','i','c','+past form']