Пример #1
0
 def apply(self, line):
     """Compose the tokenized ``line`` with the fallback transducer.

     A new ``HfstTransducer`` is built from ``self.fallbackTransducer`` and
     its weights are pushed to the end before composition, so the weight
     pushing is applied to that new object (presumably a copy -- confirm
     against the type stored in ``self.fallbackTransducer``).

     Returns the minimized composition of the tokenized line and that
     transducer.
     """
     fallback = hfst.HfstTransducer(self.fallbackTransducer)
     fallback.push_weights_to_end()

     # Turn the raw line into a linear transducer over its tokens.
     tokens = hfst.HfstTokenizer().tokenize(line)
     result = hfst.tokenized_fst(tokens)

     result.compose(fallback)
     result.minimize()
     return result
Пример #2
0
def similar_words_with_block_composition(words, transducer_path):
    """Yield ``(word, outputs)`` pairs, composing the words block by block.

    The words are processed in slices of ``block_size`` items (read from
    ``shared.config['preprocess']``). Each slice is turned into one
    transducer (the union of its words), composed with the two transducers
    returned by ``FST.load_cascade(transducer_path)``, and the resulting
    input/output string pairs are enumerated.

    Args:
        words: A sequence of strings; it must support ``len()`` and slicing.
        transducer_path: Passed unchanged to ``FST.load_cascade``.

    Yields:
        One ``(word, outputs)`` tuple per input word, in input order, where
        ``outputs`` is the list of output strings paired with ``word`` (an
        empty list if there are none). The order inside ``outputs`` is not
        defined because the pairs are collected in a set.

    Raises:
        ValueError: If the configured block size is not positive. Such a
            value previously made the loop below run forever.
        RuntimeError: If the composed transducer contains an identity or
            unknown symbol on a transition that is reached.
    """
    def _compose_block(block, delenv, right_tr, tokenizer):
        """Return union(block) composed with delenv, then with right_tr."""
        tr = hfst.empty_fst()
        for word in block:
            tr.disjunct(hfst.tokenized_fst(tokenizer.tokenize(word)))
        tr.minimize()
        # Minimize after every composition step, as the intermediate
        # result is fed straight into the next composition.
        for stage in (delenv, right_tr):
            tr.compose(stage)
            tr.minimize()
        return tr

    def _plain_symbol(symbol):
        """Map a transition symbol to the text it contributes to a string."""
        if symbol == hfst.EPSILON:
            return ''
        if symbol in (hfst.IDENTITY, hfst.UNKNOWN):
            raise RuntimeError('Illegal symbol!')
        return symbol

    def _extract_unique_io_pairs(transducer):
        """Return a dict mapping each input string to its output strings.

        The transducer is walked one transition at a time: ``frontier[s]``
        holds the (input, output) string pairs of all paths of the current
        length that end in state ``s``. Pairs sitting in a final state are
        recorded as results.

        NOTE: the walk stops only when no path can be extended, so it does
        not terminate on a transducer that contains a cycle.
        """
        tr_b = hfst.HfstBasicTransducer(transducer)
        frontier = [set() for _ in tr_b.states()]
        # Paths start in state 0 with nothing read and nothing written.
        frontier[0].add(('', ''))
        results = set()
        while any(frontier):
            next_frontier = [set() for _ in tr_b.states()]
            for state, io_pairs in enumerate(frontier):
                if not io_pairs:
                    continue
                if tr_b.is_final_state(state):
                    results |= io_pairs
                # Translate the outgoing transitions once per state rather
                # than once per (input, output) pair.
                arcs = [
                    (
                        transition.get_target_state(),
                        _plain_symbol(transition.get_input_symbol()),
                        _plain_symbol(transition.get_output_symbol()),
                    )
                    for transition in tr_b.transitions(state)
                ]
                for str_in, str_out in io_pairs:
                    for target_state, sym_in, sym_out in arcs:
                        next_frontier[target_state].add(
                            (str_in + sym_in, str_out + sym_out))
            frontier = next_frontier
        # Group the output strings by their input string.
        results_dict = {}
        for word_1, word_2 in results:
            results_dict.setdefault(word_1, []).append(word_2)
        return results_dict

    delenv, right_tr = FST.load_cascade(transducer_path)
    tok = hfst.HfstTokenizer()
    for sym in shared.multichar_symbols:
        tok.add_multichar_symbol(sym)
    # Symbols longer than one character in delenv's alphabet must also be
    # tokenized as single units.
    for sym in delenv.get_alphabet():
        if len(sym) > 1:
            tok.add_multichar_symbol(sym)
    block_size = shared.config['preprocess'].getint('block_size')
    if block_size <= 0:
        # A zero or negative step would never advance past the end of words.
        raise ValueError(
            'block_size must be a positive integer, got {!r}'.format(
                block_size))
    for start in range(0, len(words), block_size):
        block = words[start:start + block_size]
        tr = _compose_block(block, delenv, right_tr, tok)
        similar_words_for_word = _extract_unique_io_pairs(tr)
        for word in block:
            yield (word, similar_words_for_word.get(word, []))
Пример #3
0
from aligner import build_aligner
import argparse

# Shorthand for the HFST epsilon symbol.
eps = hfst.EPSILON
# Presumably the padding symbol used during alignment; the double quotes
# are part of the literal -- confirm how callers consume it.
pad = '"<P>"'

# Symbol pairs that map epsilon to epsilon and pad to pad.
eps_pair = (eps, eps)
pad_pair = (pad, pad)

tok = hfst.HfstTokenizer()
# Weighted edit-distance transducer: matching a symbol with itself costs 0,
# while the symbol-to-symbol, insertion and deletion branches cost 1 each.
levenshtein = hfst.regex('[ ?::0 | ?:?::1 | 0:?::1 | ?:0::1 | 0:0::0 ]*')

# Replacements applied by clean(): backslashes are doubled and the
# '\x84' character is dropped.
cldict = {'\\': '\\\\', '\x84': ''}


def clean(s):
    """
	Remove and escape certain characters
	"""

    for a, b in cldict.items():
        s = s.replace(a, b)