def fstbuild(words):
    """Build a determinized FST that maps each word in *words* to itself.

    Each word gets its own chain of fresh states branching off the shared
    start state 0.  Intermediate arcs emit <epsilon> and the arc for the
    last character emits the whole word, so after determinization the
    result behaves as a prefix trie that outputs the word on its final
    character.

    Fixes an IndexError in the original for single-character words: the
    first arc was added unconditionally and the loop then indexed w[1].
    """
    trie = fst.Transducer()

    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms

    biggest = 0  # highest state id allocated so far

    for w in words:
        p = 0  # current source state; every word starts at the shared state 0
        for c, ch in enumerate(w):
            # The first arc jumps into fresh state space past `biggest`;
            # later arcs extend the chain one state at a time.
            nxt = biggest + 1 if c == 0 else p + 1
            # Only the arc for the final character emits the full word.
            out = w if c == len(w) - 1 else "<epsilon>"
            trie.add_arc(p, nxt, ch, out, 0)
            p = nxt
        biggest = max(p, biggest)
        trie[biggest].final = True

    det_trie = trie.determinize()
    det_trie.arc_sort_input()
    det_trie.remove_epsilon()

    return det_trie
# Example no. 2
def fstbuild(words):
    """Build a determinized FST mapping each word in *words* to itself.

    Variant of the trie builder that labels every intermediate arc with
    the character it consumes (input == output) and emits the whole word
    on the arc for the last character.  Each word gets its own chain of
    fresh states branching off the shared start state 0.

    Fixes an IndexError in the original for single-character words: the
    first arc was added unconditionally and the loop then indexed w[1].
    """
    trie = fst.Transducer()

    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms

    biggest = 0  # highest state id allocated so far

    for w in words:
        p = 0  # current source state; every word starts at the shared state 0
        for c, ch in enumerate(w):
            # The first arc jumps into fresh state space past `biggest`;
            # later arcs extend the chain one state at a time.
            nxt = biggest + 1 if c == 0 else p + 1
            # Intermediate arcs echo the character; the last emits the word.
            out = w if c == len(w) - 1 else ch
            trie.add_arc(p, nxt, ch, out, 0)
            p = nxt
        biggest = max(p, biggest)
        trie[biggest].final = True

    det_trie = trie.determinize()
    det_trie.arc_sort_input()

    return det_trie
def levenshtein(w, editdst):
    """Build a Levenshtein transducer for word *w* allowing up to *editdst* edits.

    States are labelled "<position in w>^<edits used>"; matching the next
    character of w advances the position at zero cost, while deletion,
    substitution and insertion arcs consume one edit.  Substitution and
    insertion weights come from keyweights() -- presumably a per-key-pair
    distance table (TODO confirm against its definition).
    """

    wts = keyweights()

    trie = fst.Transducer()

    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms
    # Flatten the symbol table into a plain list of symbol strings.
    letttup = list(letter_syms.items())
    letters = list()
    for let in letttup:
        letters.append(let[0])


    class StateCounter(object):
        # Lazily assigns dense integer ids (0, 1, 2, ...) to state labels
        # in order of first lookup.
        def __init__(self):
            self.set = {}
            self.count = -1

        def __contains__(self, obj):
            return obj in self.set

        def __getitem__(self, obj):
            if not obj in self.set:
                self.count += 1
                self.set[obj] = self.count
            return self.set[obj]

    states = StateCounter()




    for x in range(0,len(w)):
        for y in range(0, editdst+1):
            trie.add_arc(states[str(x)+"^"+str(y)], states[str(x+1)+"^"+str(y)], w[x], w[x], 0)# char in word
            if not y == editdst:
                trie.add_arc(states[str(x)+"^"+str(y)], states[str(x+1)+"^"+str(y+1)], "<epsilon>", "<epsilon>", 1.5)# deletion
                for i in letters:
                    trie.add_arc(states[str(x)+"^"+str(y)], states[str(x+1)+"^"+str(y+1)], i, i, wts[w[x], i])# substitution
                    trie.add_arc(states[str(x)+"^"+str(y)], states[str(x)+"^"+str(y+1)], i, i, wts[w[x], i])# insertion

    # Any state that has consumed all of w is final, whatever the edit count.
    for y in range(0, editdst+1):
        trie[states[str(len(w))+"^"+str(y)]].final = True



    trie.remove_epsilon()
    trie.arc_sort_input()




    return trie
# Example no. 4
 def _prepare_resource(self,dir_to_tagger,dir_to_phrase):
     '''
     Build lookup dictionaries from the tagger and constraint FSTs.

     Arguments:
         dir_to_tagger {string} -- directory holding the tagger FST plus
             its isyms.fst / osyms.fst symbol tables
         dir_to_phrase {string} -- directory holding the phrase/constraint
             FSTs plus their isyms.fst / osyms.fst symbol tables
     Returns:
         tagger_dict -- tagger_dict['${.concept}']=string list, each string is a path
         constraint_dict -- constraint_dict['${@constraint}']=list of (string path, mapped value)
     '''
     # deal with entities(tagger)
     files=os.listdir(dir_to_tagger)
     isyms=fst.read_symbols(os.path.join(dir_to_tagger,'isyms.fst'))
     osyms=fst.read_symbols(os.path.join(dir_to_tagger,'osyms.fst'))
     # The one .fst file that is not a symbol table is the lexicon itself;
     # assumes exactly one such file exists -- TODO confirm.
     filepath=os.path.join(dir_to_tagger,[each for each in files if each not in ['isyms.fst','osyms.fst'] and each.endswith('.fst')][0])
     lexicon=fst.read_std(filepath)
     lexicon.isyms=isyms
     lexicon.osyms=osyms
     self.tagger_dict=defaultdict(list)
     for each_path in lexicon.paths():
         # Label 0 is <epsilon> in the OpenFst convention; skip those arcs.
         input_string=[lexicon.isyms.find(arc.ilabel) for arc in each_path if arc.ilabel != 0]
         if len(input_string)!=1:
             raise ValueError('[Error]:error in resolving tagger name!')
         output_string=[lexicon.osyms.find(arc.olabel) for arc in each_path if arc.olabel != 0]
         self.tagger_dict[input_string[0]].append(reverse_preproc(output_string))
     # deal with constraints
     files=os.listdir(dir_to_phrase)
     isyms=fst.read_symbols(os.path.join(dir_to_phrase,'isyms.fst'))
     osyms=fst.read_symbols(os.path.join(dir_to_phrase,'osyms.fst'))
     fst_dict={}
     for each in files:
         if each not in ['isyms.fst','osyms.fst'] and each.endswith('.fst'):
             # Keyed by the first character of the filename (its level digit).
             fst_dict[each[0]]=fst.read_std(os.path.join(dir_to_phrase,each))
             fst_dict[each[0]].isyms=isyms
             fst_dict[each[0]].osyms=osyms
     self.constraint_dict=defaultdict(list)
     for each in sorted(fst_dict.keys()): # hierarchical phrase FSTs are processed in 0-1-2-... order
         tmp_fst=fst_dict[each]
         for path in tmp_fst.paths():
             name,item_list=self._get_path_and_mapped_value(path,tmp_fst)
             self.constraint_dict[name].extend(item_list)
     return (self.tagger_dict,self.constraint_dict)
# Example no. 5
def levenshtein(w, editdst):
    """Build a Levenshtein transducer for word *w* allowing up to *editdst* edits.

    States are labelled "<position in w>^<edits used>"; matching the next
    character of w advances the position at zero cost, while deletion,
    substitution and insertion arcs each consume one edit.  Substitution
    and insertion weights come from keyweights().
    """

    wts = keyweights()

    trie = fst.Transducer()

    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms
    letters = [pair[0] for pair in letter_syms.items()]

    # Dense integer ids for "pos^edits" labels, allocated on first use.
    ids = {}

    def state(pos, edits):
        label = str(pos) + "^" + str(edits)
        if label not in ids:
            ids[label] = len(ids)
        return ids[label]

    for x in range(len(w)):
        for y in range(editdst + 1):
            # Consume the next character of w without spending an edit.
            trie.add_arc(state(x, y), state(x + 1, y), w[x], w[x], 0)
            if y != editdst:
                # Deletion: skip w[x] at a fixed cost of 1.5.
                trie.add_arc(state(x, y), state(x + 1, y + 1),
                             "<epsilon>", "<epsilon>", 1.5)
                for ltr in letters:
                    # Substitution and insertion, weighted per key pair.
                    trie.add_arc(state(x, y), state(x + 1, y + 1), ltr, ltr,
                                 wts[w[x], ltr])
                    trie.add_arc(state(x, y), state(x, y + 1), ltr, ltr,
                                 wts[w[x], ltr])

    # Every state that has consumed all of w is final.
    for y in range(editdst + 1):
        trie[state(len(w), y)].final = True

    trie.remove_epsilon()
    trie.arc_sort_input()

    return trie
# Example no. 6
def generate_suggestions(prefix):
    """
    To extract suggestions, the first step is to traverse the FST in
    fstfile following the characters of the given prefix.  The state
    reached at the final letter of the prefix is saved, and the next
    part constructs an FST of the branch that grows from the saved
    state, in a BFS approach.  Then all paths are extracted from the
    acceptor in a DFS manner, with path-weight calculation, sorted by
    weight, and the top results are written as JSON.
    INPUT:
       a string
    OUTPUT:
       a json file with up to three values for the Suggestions entry
    """

    fstfile = "/Users/dudy/CSLU/summerIntern/src/prfx_tree.fst"
    sym = fst.read_symbols("/Users/dudy/CSLU/summerIntern/src/syms")
    lm = fst.read(fstfile)
    prefix = prefix.lower()

    # look for subtree given prefix: follow one matching arc per character.
    # NOTE(review): if a character has no matching arc, stateid is silently
    # left unchanged -- confirm this fallback is intended.
    stateid = 0
    for ch in prefix:
        state = lm[stateid]
        for arc in state.arcs:
            if sym.find(arc.ilabel) == ch:
                print ch
                stateid = arc.nextstate
                break

    # construct desired subtree (bfs)
    reduced = bfs(stateid, lm, sym)
    # read strings (dfs)
    top3 = dfs(reduced, sym)

    # take first three (if exists)
    suggest = []
    for (suffix, _) in top3:
        suggest.append(suffix)

    # dict it
    result = {}
    result["Suggestions:"] = suggest

    # json it
    json_file = "auto.json"
    with open(json_file, "w") as fp:
        json.dump(result, fp)
# Example no. 7
def generate_suggestions(prefix):
    """
    To extract suggestions the first step was to traverse the fst
    in fstfile following the charecters of the given prefix. From
    there the state of the final letter of prefix is saved and the next
    part constructs an fst of the branch the grows from the saved state.
    It is done in bds approach. Later, extract all paths from acceptor in
    a dfs manner is done with path weight calculation. Then all paths 
    are sorted by weights and the first three are jsoned.
    INPUT:
       a string
    OUTPUT:
       a json file with up to three values for Suggestion entry
    """    

    fstfile = "/Users/dudy/CSLU/summerIntern/src/prfx_tree.fst"
    sym = fst.read_symbols("/Users/dudy/CSLU/summerIntern/src/syms")
    lm = fst.read(fstfile)
    prefix = prefix.lower()

    # look for subtree given prefix
    stateid = 0
    for ch in prefix:
        state = lm[stateid]
        for arc in state.arcs:
            if sym.find(arc.ilabel)==ch:
                print ch
                stateid = arc.nextstate
                break

    # construct desired subtree (bds)
    reduced = bfs(stateid, lm, sym)
    # read strings (dfs)
    top3 = dfs(reduced, sym)

    # take first three (if exists)
    suggest = []
    for (suffix, _) in top3:
        suggest.append(suffix)

    # dict it    
    result = {}
    result["Suggestions:"] = suggest

    # json it
    json_file = "auto.json"
    with open(json_file, "w") as fp:
        json.dump(result, fp)
# Example no. 8
        #print fst_path
        s = a[a.index('-str') + 1]
        #print s
        s = s.strip()
        sym = a[a.index('-sym') + 1]
        #print sym
        return [s, fst_path, sym]
    except (ValueError, IndexError):
        sys.stderr.write('Usage: -fst [name of fst] -str ' \
                         '[string to encode as fst (no quotes,tokens separated by white space)] ' \
                         '-sym [binary symbol file]\n e.g. -fst sentence.fst -str hello world -sym mysym.bin\n')
        exit()


def log_linear_chain(txt, sym_f):
    """Build a linear-chain FST accepting the tokens of *txt*.

    The `__s__` / `_s_` markers are rewritten to the conventional `<s>` /
    `</s>` sentence-boundary symbols before tokenizing on whitespace.
    Each token t becomes one arc: state idx --t:t/0.0--> state idx+1.

    Fixes a NameError in the original when *txt* has no tokens: the loop
    variable `idx` was read after a loop that never ran.
    """
    txt = txt.replace('__s__', '<s>')
    txt = txt.replace('_s_', '</s>')
    tokens = txt.split()
    lc = fst.Transducer(sym_f, sym_f)
    for idx, t in enumerate(tokens):
        lc.add_arc(idx, idx + 1, t, t, 0.0)
    # End of the chain is state len(tokens) (state 0 for empty input).
    lc[len(tokens)].final = True
    return lc


if __name__ == '__main__':
    # Parse -fst/-str/-sym command-line arguments, build a linear-chain
    # FST over the tokens of the input string, and write it (with the
    # same symbol table on both tapes) to the requested path.
    [s, fst_path, sym_path] = parseargs(sys.argv)
    sym = fst.read_symbols(sym_path)
    lc = log_linear_chain(s, sym)
    lc.write(fst_path, sym, sym)
 def bs(s):
     # Look up the integer id of symbol *s* in the ASCII symbol table.
     # NOTE(review): re-reads the symbol file on every call -- could be
     # hoisted to module level; confirm before changing.
     letter_syms = fst.read_symbols("ascii.syms.bin")
     return letter_syms[s]
# Example no. 10

def add_arc_pr(sym, lmfst, fid, tid, isy, osy, wt):
    """Add an arc fid --isy:osy/wt--> tid to *lmfst*.

    *sym* (a symbol table) is unused here; it is kept for interface
    compatibility -- it was only read by debug printing that has been
    removed (the original carried the debug code as a dead triple-quoted
    string statement, deleted here).
    """
    lmfst.add_arc(fid, tid, isy, osy, wt)


if __name__ == '__main__':
    sym_e = fst.read_symbols('data/syme.bin')
    lm_txt = open('data/lm', 'r').read()
    [bs, unigrams, bigrams, trigrams] = re.split('1-grams:|2-grams:|3-grams:', lm_txt)
    unigrams = re.split('\n+', unigrams)
    bigrams = re.split('\n+', bigrams)
    trigrams = re.split('\n+', trigrams)

    lm_id = {}
    lm_id[INITIAL] = len(lm_id)
    lm_fst = fst.Transducer(sym_e, sym_e)
    lm_fst.add_state()
    lm_id[NULL] = len(lm_id)

    for uni_line in unigrams:
        if uni_line.strip() != '' and len(uni_line.split('\t')) > 1:
            [p, ng, bk] = trysplit(uni_line)
 def bs(s):
     # Look up the integer id of symbol *s* in the ASCII symbol table.
     # NOTE(review): re-reads the symbol file on every call -- could be
     # hoisted to module level; confirm before changing.
     letter_syms = fst.read_symbols("ascii.syms.bin")
     return letter_syms[s]
# Example no. 12
__author__ = 'arenduchintala'
import fst, itertools

if __name__ == '__main__':
    tokens = open('data/input', 'r').read().split()
    tokens = set(tokens)
    symf = fst.read_symbols('data/symf.bin')
    reorder_list = []
    reorder = fst.Transducer(symf, symf)
    reorder[0].final = True
    for s, v in symf.items():
        reorder.add_arc(0, 0, s, s, 0.0)
        st = set(s.split('_'))
        if len(st.intersection(tokens)) > 0:
            print 'keep', s
            reorder_list.append(s)
        else:
            print 'reject', s
    print 'filtered down to', len(set(reorder_list))
    n = 1
    c = 0
    for a, b in itertools.product(reorder_list, reorder_list):
        c += 1
        if c % 1000 == 0:
            print int(c / 1000), 'of', int((len(reorder_list) ** 2) / 1000)
        if a != b:
            reorder.add_arc(0, n, a, fst.EPSILON, 0.0)
            #print 0, n
            reorder.add_arc(n, n + 1, b, fst.EPSILON, 0.0)
            #print n + 1, n + 2
            reorder.add_arc(n + 1, n + 2, fst.EPSILON, b, 0.0)