def compile(filename, outfile, feats_file):
    """Compile a rule file into an optimized-lookup HFST transducer.

    The rule list is read from *filename*, the transducer is built by
    composing the separator and padding insertions with the ``double``,
    ``string2string``, ``single`` and ``delete_aux`` steps, and the
    minimized result is written to *outfile* as ``HFST_OLW_TYPE``.

    Args:
        filename: Path of the UTF-8 rule file.  Trailing spaces and
            semicolons are stripped and the text is split on two commas
            followed by a newline.
        outfile: Path of the file the transducer is written to.
        feats_file: Passed to ``get_chars`` to obtain the characters used
            by ``double`` and ``single``.

    NOTE: the name shadows the builtin ``compile``; it is kept so existing
    callers keep working.
    """
    stderr.write('Compiling into FST...\n')
    # Context manager: the rule file is closed even if reading fails.
    with open(filename, 'r', encoding='utf8') as rule_file:
        rlist = rule_file.read().rstrip(' ;').split(',,\n')
    chars = get_chars(feats_file)
    fst = hfst.regex('0 -> "<S>"')
    fst.compose(hfst.regex('0 -> "<P>" || .#. _ ,, 0 -> "<P>" || _ .#.'))
    double(fst, chars)
    string2string(fst, rlist)
    # Delete preceding input-level symbol
    single(fst, chars)
    # Delete auxiliary symbols
    delete_aux(fst)
    # Minimize and write into .hfst file
    fst.minimize()
    fst.convert(hfst.ImplementationType.HFST_OLW_TYPE)
    ostr = hfst.HfstOutputStream(filename=outfile,
                                 type=hfst.ImplementationType.HFST_OLW_TYPE)
    try:
        ostr.write(fst)
        ostr.flush()
    finally:
        # Release the output file even when writing raises.
        ostr.close()
    stderr.write('Done.\n')
def get_fst(start_rule, end_rule, *args):
    """Build a lookup-optimized FST from a selection of the g2p twolc rules.

    ``g2p.twolc`` is compiled into ``g2p_test_from_py.tmp.hfst``; the
    transducers read back from that file are selected by position.
    Position 0 is always kept, together with ``start_rule..end_rule``
    (inclusive) and, when extra positional arguments are given, the
    inclusive range ``args[0]..args[1]``.  The selected transducers are
    compose-intersected with the regex ``?* ;``.

    Returns:
        The resulting transducer after ``lookup_optimize()``.
    """
    def note(message):
        print(message, file=sys.stderr)

    twolc_source = Path('g2p.twolc')
    compiled = Path('g2p_test_from_py.tmp.hfst')
    hfst.compile_twolc_file(twolc_source.name, compiled.name,
                            resolve_left_conflicts=True)

    note('Preparing rule transducers for composition...')
    stream = hfst.HfstInputStream(compiled.name)
    wanted = {0}
    wanted.update(range(start_rule, end_rule + 1))
    if args:
        wanted.update(range(args[0], args[1] + 1))
    selected = [rule for position, rule in enumerate(stream)
                if position in wanted]

    note('Creating universal language FST...')
    universal = hfst.regex('?* ;')
    note('Compose-intersecting rules with universal FST...')
    universal.compose_intersect(selected)
    note('Optimizing for fast lookup...')
    universal.lookup_optimize()
    return universal
def read_fst(filename="examples.fst"):
    """Read a previously stored example FST and fill the ``cfg`` symbol data.

    The transducer read from *filename* is stored in ``cfg.examples_fst``.
    Its ``x-pair_symbols`` property is split on runs of spaces; each pair
    symbol is added to ``cfg.pair_symbol_set`` and, after conversion with
    ``cfg.pairsym2sympair``, to ``cfg.symbol_pair_set``,
    ``cfg.input_symbol_set`` and ``cfg.output_symbol_set``.
    ``cfg.all_pairs_fst`` is then built as the epsilon-free, minimized
    union of one ``input:output`` transducer per symbol pair.

    Args:
        filename: Path of the stored example FST.
    """
    import hfst
    exfile = hfst.HfstInputStream(filename)
    try:
        cfg.examples_fst = exfile.read()
    finally:
        # Only one transducer is read; release the stream afterwards.
        exfile.close()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        # Curly braces are escaped with % before building the regex;
        # the original author notes that other bad characters in insym
        # still make hfst.regex fail.
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
def get_fst():
    """Compile ``g2p.twolc`` and return the lookup-optimized rule FST.

    The twolc rules are compiled into ``g2p_from_py.tmp.hfst``, every
    transducer in that file is compose-intersected with the regex
    ``?* ;``, and the optimized result is written to
    ``g2p_from_py.hfstol`` before being returned.

    NOTE: an earlier revision carried commented-out modification-time
    checks meant to skip recompilation and reload the ``.hfstol`` file;
    the function always rebuilds everything.
    """
    def note(message):
        print(message, file=sys.stderr)

    twolc_source = Path('g2p.twolc')
    compiled = Path('g2p_from_py.tmp.hfst')
    optimized = Path('g2p_from_py.hfstol')

    note('Compiling twolc rules...')
    hfst.compile_twolc_file(twolc_source.name, compiled.name,
                            resolve_left_conflicts=True)

    note('Preparing rule transducers for composition...')
    rules = list(hfst.HfstInputStream(compiled.name))

    note('Creating universal language FST...')
    universal = hfst.regex('?* ;')
    note('Compose-intersecting rules with universal FST...')
    universal.compose_intersect(rules)
    note('Optimizing for fast lookup...')
    universal.lookup_optimize()
    note('Writing out final FST...')
    universal.write_to_file(optimized.name)
    return universal
def test_tokenized(tok, pathin, pathout, exp, weight=0):
    """Check that tokenizing with *tok* yields the transducer described by *exp*.

    Args:
        tok: Tokenizer providing ``tokenize_one_level`` and ``tokenize``.
        pathin: Input string to tokenize.
        pathout: Output string, or ``None`` for one-level tokenization.
        exp: Regular expression describing the expected transducer.
        weight: Weight passed to ``hfst.tokenized_fst``.

    Raises:
        RuntimeError: If the tokenized transducer does not compare equal
            to ``hfst.regex(exp)``.
    """
    # Identity test: None is a singleton, so ``is`` is the correct check.
    one_level = pathout is None
    if one_level:
        tokenized = tok.tokenize_one_level(pathin)
    else:
        tokenized = tok.tokenize(pathin, pathout)
    if hfst.tokenized_fst(tokenized, weight).compare(hfst.regex(exp)):
        return
    if one_level:
        raise RuntimeError('test_tokenized failed with input: ' + pathin)
    raise RuntimeError('test_tokenized failed with input: ' + pathin + ", " + pathout)
def syllabify(self):
    """Build the transducer that inserts nucleus and syllable boundaries.

    Syllabic phonemes are wrapped in ``nucl_bound``, ``syl_bound`` is
    inserted between nuclei separated only by non-syllabic phonemes, and
    the whole string is surrounded by ``syl_bound``.  When
    ``self._fill_onset`` is set, an additional filter regex is composed
    in; when ``self._sonorous`` is set, a filter built from the sonority
    scale (all layers except the first, skipping empty ones) is composed
    in as well.

    Returns:
        The minimized transducer.
    """
    v = "[ " + out_prefix + " " + _build_regex(
        self._alph.get_phonemes("+syllabic")) + " ]"
    c = "[ " + out_prefix + " " + _build_regex(
        self._alph.get_phonemes("-syllabic")) + " ]"
    fill_nucl = hfst.regex(v + " -> " + nucl_bound + " ... " + nucl_bound)
    # "\\/" spells the same two characters as the former "\/", which is an
    # invalid escape sequence and triggers a warning on current Python.
    syl = hfst.regex("0 -> " + syl_bound + " \\/ " + nucl_bound + " " + c
                     + "* _ " + c + "* " + nucl_bound)
    surround = hfst.regex("?* -> " + syl_bound + " ... " + syl_bound
                          + " || .#. _ .#.")
    fill_nucl.compose(syl)
    fill_nucl.compose(surround)
    if self._fill_onset:
        no_vowstart = hfst.regex("~[ $[ \\" + v + " " + syl_bound + " "
                                 + nucl_bound + " ] ]")
        fill_nucl.compose(no_vowstart)
    if self._sonorous:
        son_scale = self._alph.get_sonority_scale()
        # One bracketed alternative per non-empty layer after the first.
        scale = ["[ " + out_prefix + " " + _build_regex(layer) + " ]"
                 for layer in son_scale[1:] if len(layer) != 0]
        # Layers in scale order after the nucleus, reversed before it.
        suffix = "".join(" " + layer + "*" for layer in scale)
        prefix = "".join(layer + "* " for layer in reversed(scale))
        son_filter = hfst.regex(syl_bound + " [ " + prefix + " " + nucl_bound
                                + " " + v + " " + nucl_bound + " " + suffix
                                + " " + syl_bound + " ]+ ")
        fill_nucl.compose(son_filter)
    fill_nucl.minimize()
    return fill_nucl
def __init__(self):
    """Build the Cyrillic-to-Latin transliteration transducer ``self.tr``.

    One replace rule is compiled per lower-case and upper-case Cyrillic
    letter, and all rules are composed into a single transducer.  The
    Latin counterpart of an upper-case letter is the lower-case
    transliteration with its first character upper-cased.
    """
    letters_cyr = 'йцукенгшщзхъфывапролджэячсмитьбюё'
    # 'sh' previously carried a leading space (' sh'); capitalising its
    # first character then upper-cased the space, so upper-case 'Ш' was
    # transliterated as lower-case 'sh' instead of 'Sh'.
    letters_lat = ['j', 'c', 'u', 'k', 'e', 'n', 'g', 'sh', 'shch', 'z', 'kh',
                   'ie', 'f', 'y', 'v', 'a', 'p', 'r', 'o', 'l', 'd', 'zh',
                   'e', 'ia', 'ch', 's', 'm', 'i', 't', '0', 'b', 'iu', 'e']
    letters_cyr += letters_cyr.upper()
    letters_lat += [trans[0].upper() + trans[1:] for trans in letters_lat]
    regexes = [hfst.regex(' {0} -> {1} || _'.format(cyr, lat))
               for cyr, lat in zip(letters_cyr, letters_lat)]
    tr = regexes[0]
    for reg in regexes[1:]:
        tr.compose(reg)
    self.tr = tr
def get_fst(src):
    """Compile the twolc file *src* and return the lookup-optimized rule FST.

    Args:
        src: Object whose ``name`` attribute is passed to
            ``hfst.compile_twolc_file`` as the source file.

    Returns:
        The regex ``?* ;`` compose-intersected with every compiled rule
        transducer, after ``lookup_optimize()``.

    NOTE(review): only the ``.name`` part of both paths is used, so the
    ``../res/`` directory of the temporary path is dropped and the
    compiled file is addressed relative to the current directory;
    confirm this is intended.
    """
    def note(message):
        print(message, file=sys.stderr)

    compiled = Path('../res/g2p_from_py.hfst')

    note('Compiling twolc rules...')
    hfst.compile_twolc_file(src.name, compiled.name,
                            resolve_left_conflicts=True)

    note('Preparing rule transducers for composition...')
    rules = list(hfst.HfstInputStream(compiled.name))

    note('Creating universal language FST...')
    universal = hfst.regex('?* ;')
    note('Compose-intersecting rules with universal FST...')
    universal.compose_intersect(rules)
    note('Optimizing for fast lookup...')
    universal.lookup_optimize()
    return universal
def get_all_forms(word, pos, language, descrpitive=True, limit_forms=-1,
                  filter_out=("#", "+Der", "+Cmp", "+Err")):
    """Return the paths of the analyser restricted to *word* and *pos*.

    The analyser obtained from ``get_transducer`` is composed with a
    regular expression made of the literal *word*, the tag ``+pos`` and
    any sequence of symbols that do not start with one of the
    *filter_out* prefixes.  Alphabet symbols containing ``@`` but not
    ``@_`` are additionally allowed anywhere in that sequence and before
    the word.

    Args:
        word: Literal string placed in braces in the regular expression.
        pos: Tag appended as ``%+pos``.
        language: Passed to ``get_transducer``.
        descrpitive: Passed to ``get_transducer`` (spelling kept for
            backward compatibility).
        limit_forms: ``max_number`` passed to ``extract_paths``.
        filter_out: Prefixes of alphabet symbols to exclude.  The default
            is a tuple so that no mutable object is shared between calls.

    Returns:
        A list of ``(text, weight)`` tuples, one per non-empty output
        line, with the weight converted to ``float``.
    """
    analyzer = get_transducer(language, descrpitive=descrpitive,
                              analyzer=True, convert_to_openfst=True,
                              cache=False)
    excluded = []
    flags = []
    for symbol in analyzer.get_alphabet():
        if any(symbol.startswith(prefix) for prefix in filter_out):
            excluded.append(__regex_escape(symbol))
        if "@" in symbol and "@_" not in symbol:
            flags.append("\"" + symbol + "\"")

    flag_string = ""
    flag_end = ""
    start_flag_end = ""
    flag_string_start = ""
    if flags:
        flag_string_start = " [ " + " | ".join(flags)
        flag_string = flag_string_start + " | "
        flag_end = "]"
        start_flag_end = "]* "
    reg_text = (flag_string_start + start_flag_end + "{" + word + "} %+"
                + pos + flag_string + " [ ? - [ " + " | ".join(excluded)
                + " ]]" + flag_end + "*")
    # The analyser is composed in place (it was requested with cache=False).
    analyzer.compose(hfst.regex(reg_text))
    text = analyzer.extract_paths(max_cycles=1, max_number=limit_forms,
                                  output='text')
    lines = text.replace("@_EPSILON_SYMBOL_@", "").split("\n")
    fields = [line.split('\t') for line in lines if line]
    return [(columns[0], float(columns[1])) for columns in fields]
def apply(self, candidates, no_penalty=False, no_pardon=False, method="matching"):
    """
    Compose the candidate set with this constraint's regex and, unless
    disabled, penalize the result.

    :param candidates: The FST generating the current candidate set;
        it is composed in place
    :param no_penalty: Skip the call to ``penalize`` if True
    :param no_pardon: Forwarded to ``penalize``
    :param method: Penalization method forwarded to ``penalize``,
        "matching" (default) or "counting"
    :return: The same candidate set FST object
    """
    constraint_fst = hfst.regex(self._regex)
    candidates.compose(constraint_fst)
    if no_penalty:
        return candidates
    penalize(candidates, n=self._n, no_pardon=no_pardon, method=method)
    return candidates
def generate(self):
    """Build and return the minimized generator transducer.

    The first stage deletes ``syl_bound`` and prefixes each alphabet
    symbol with ``in_sym``; the second stage appends an ``out_sym``
    segment to each input segment, optionally allowing ignored
    expressions, insertions and deletions.  Afterwards insertions may be
    limited, syllabification composed in, and ``word_bound`` is placed
    around the whole string.
    """
    alph = _build_regex(self._alph.get_alphabet())
    mut = _build_regex(self._mut.get_alphabet())
    has_ignore = self._ignore != ""
    ignore_alt = " | " + self._ignore if has_ignore else ""
    input_segment = "[ " + in_sym + " " + alph + " ]"

    # Stage 1: drop syllable boundaries, mark every symbol as input.
    gen = hfst.regex("[ " + syl_bound + ":0 | [ 0:" + in_sym + " " + alph
                     + " ]" + ignore_alt + " ]*")

    # Stage 2: pair each input segment with an output segment.
    mutator = hfst.regex(input_segment + " 0:[ " + out_sym + " " + mut + " ]")
    if has_ignore:
        # Ignored expressions pass through the mutator unchanged.
        mutator.disjunct(hfst.regex(self._ignore))
    if self._allow_ins:
        # Insertion: an empty input segment with an output symbol.
        insertion = ("0:[ " + in_sym + " " + no_sym + " " + out_sym + " "
                     + mut + " ]")
        mutator.disjunct(hfst.regex(insertion))
    if self._allow_del:
        # Deletion: an input segment with an empty output segment.
        deletion = input_segment + " 0:[ " + out_sym + " " + no_sym + " ]"
        mutator.disjunct(hfst.regex(deletion))
    mutator.repeat_star()
    gen.compose(mutator)

    # Optional cap on the number of insertions.
    if self._max_ins > 0:
        gen.compose(hfst.regex(at_most_n_of(ins_sym, self._max_ins)))

    # Optional syllabification.
    if self._syl is not None:
        gen.compose(self._syl.syllabify())

    # Word boundaries around the whole string.
    gen.compose(hfst.regex("?* -> " + word_bound + " ... " + word_bound
                           + " || .#. _ .#."))
    gen.minimize()
    return gen
def serial_compile(regexs):
    """Compile each regex and compose them all, smallest transducers first.

    Every expression is compiled separately.  The two transducers with
    the fewest states are then composed repeatedly (the first one is
    modified in place and re-queued) until one transducer remains.

    Args:
        regexs: Iterable of regular-expression strings.

    Returns:
        The single remaining transducer.

    Raises:
        IndexError: If *regexs* is empty.
    """
    def state_count(fst):
        return fst.number_of_states()

    # Compile each rule individually
    queue = [hfst.regex(regex) for regex in regexs]
    # Sort resulting FSTs by number of states
    queue.sort(key=state_count)
    # Compose the smallest two, re-queue the result and re-sort
    while len(queue) > 1:
        fst1, fst2 = queue[0:2]
        fst1.compose(fst2)
        queue = queue[2:] + [fst1]
        queue.sort(key=state_count)
    return queue[0]
def shuffle_with_zeros(string, target_length):
    """Return a fsa accepting *string* padded with zero symbols in every way.

    string -- the string into which zero symbols (Ø) are inserted
    target_length -- the length every accepted string must have

    When *string* is shorter than *target_length*, the missing number of
    Ø symbols is shuffled into it; otherwise the fsa of the plain string
    is returned.  The result is minimized and named after *string*.
    """
    fsa = hfst.fst(string)
    missing = target_length - len(string)
    if missing > 0:
        zeros = hfst.regex(' '.join(['Ø'] * missing))
        fsa.shuffle(zeros)
    fsa.minimize()
    fsa.set_name(string)
    if cfg.verbosity >= 30:
        print("shuffle_with_zeros:")
        print(fsa)
    return fsa
def shuffle_with_zeros(string, target_length):
    """Return a fsa accepting *string* padded with zero symbols in every way.

    string -- the string into which zero symbols (Ø) are inserted
    target_length -- the length every accepted string must have

    The string is converted with ``fs.string_to_fsa`` and measured with
    ``grapheme.length``; the original author notes that ``hfst.fst`` is
    not correct for composed graphemes.  When the string is shorter than
    *target_length*, the missing number of Ø symbols is shuffled into it.
    The result is minimized and named after *string*.
    """
    fsa = fs.string_to_fsa(string)
    missing = target_length - grapheme.length(string)
    if missing > 0:
        zeros = hfst.regex(" ".join(["Ø"] * missing))
        fsa.shuffle(zeros)
    fsa.minimize()
    fsa.set_name(string)
    if cfg.verbosity >= 30:
        print("shuffle_with_zeros:")
        print(fsa)
    return fsa
def penalize(candidates, n=10, no_pardon=False, method="matching"):
    """
    Remove losing candidates from the candidate set (modified in place).

    :param candidates: Current candidate set
    :param n: The penalization precision for the counting approach
    :param no_pardon: Do not call ``pardon`` on the result if True
    :param method: "counting" selects the counting approach; any other
        value (default "matching") selects the matching approach
    :return: The same candidate set FST object
    """
    if method == "counting":
        # Leniently compose with "exactly i marks" for i = n down to 0.
        for count in range(n, -1, -1):
            candidates.lenient_composition(
                hfst.regex(only_n_of(mark_sym, count)))
    else:
        # Remove modifications of gen, keep input characters and marks.
        strip = hfst.regex("[ [ " + in_sym + ":0 [ " + no_sym + ":0 .P. ? ] ]"
                           + " | [ " + out_sym + " ? ]:0 | " + bound_syms
                           + ":0 | " + mark_sym + " ]*")
        # Insert at least one violation mark into the string.
        insert_marks = hfst.regex("[ ?* 0:" + mark_sym + "+ ?* ]+")
        # Randomly insert new output characters.
        mutate_output = hfst.regex("[ ? | 0:? ]*")
        # Randomly scatter violation marks throughout the string.
        permute1 = hfst.regex("[ ?* " + mark_sym + ":0 ?* 0:" + mark_sym
                              + " ?* ]*")
        permute2 = hfst.regex("[ ?* 0:" + mark_sym + " ?* " + mark_sym
                              + ":0 ?* ]*")
        # Everything strictly worse than some current candidate.
        worse = candidates.copy()
        for stage in (strip, insert_marks, permute1, permute2, mutate_output):
            worse.compose(stage)
        # Subtract worse candidates from actual candidates.
        candidates.subtract(worse)
    candidates.minimize()
    if not no_pardon:
        pardon(candidates)
    return candidates
if not hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.TROPICAL_OPENFST_TYPE): sys.exit(77) hfst.set_default_fst_type(hfst.ImplementationType.TROPICAL_OPENFST_TYPE) else: raise RuntimeError('implementation format not recognized') transducers = [] istr = hfst.HfstInputStream() while not istr.is_eof(): transducers.append(istr.read()) istr.close() if not len(transducers) == 3: raise RuntimeError('Wrong number of transducers read.') i = 0 for re in ['föö:bär','0','0-0']: if not transducers[i].compare(hfst.regex(re)): raise RuntimeError('Transducers are not equivalent.') i += 1 if len(transducers) > 0: f = sys.stdout i=0 transducers[i].write_att(f) i += 1 while i < len(transducers): f.write('--\n') transducers[i].write_att(f) i += 1
# -*- coding: utf-8 -*-
"""Write three test transducers with the backend named in ``sys.argv[1]``.

Accepted backend names are ``sfst``, ``foma`` and ``openfst``.  The
script exits with status 77 when the requested backend is not available
and raises ``RuntimeError`` for any other name.
"""
import hfst
import sys

requested = sys.argv[1]
backends = {
    'sfst': hfst.ImplementationType.SFST_TYPE,
    'foma': hfst.ImplementationType.FOMA_TYPE,
    'openfst': hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
}
if requested not in backends:
    raise RuntimeError('implementation format not recognized')
if not hfst.HfstTransducer.is_implementation_type_available(backends[requested]):
    sys.exit(77)
hfst.set_default_fst_type(backends[requested])

tr1 = hfst.regex('föö:bär')
tr2 = hfst.regex('0')
tr3 = hfst.regex('0-0')

# The stream is created without a filename.
ostr = hfst.HfstOutputStream()
for transducer in (tr1, tr2, tr3):
    ostr.write(transducer)
ostr.flush()
ostr.close()
# Distances between any two consonants: cclist = featmetr(consonants, consonants, posdist, adist, adist) vowl = sorted(vowels.keys()) cons = sorted(consonants.keys()) letters = sorted(vowl + cons) # Deletion of a letter possible at a fairly high cost: dellist = ['{}:Ø::{}'.format(l,3) for l in letters] # Insertion of a letter possible at a fairly high cost: epelist = ['Ø:{}::{}'.format(l,3) for l in letters] # Doubling only after the letter, not before: dbllist = ['{} Ø:{}::{}'.format(l,l,2) for l in letters] # Shortening the second of two identical letters only: sholist = ['{} {}:Ø::{}'.format(l,l,2) for l in letters] # Individual treatment of some pairs or sequences: speclist = ['k:c::0 k::0', 'k:x s:Ø::0', 't:d s:z::0', 'Ø:d s:z::3', 'i:j::1', 'j:i::1', 'i j:Ø::0', 'i i:j::0', 'f:p Ø:h::0', 'u:v::1', 'v:u::1', 'u:w::1', 'k:c::1', '[o:Ø o:?]::5', '[ö:Ø ö:?]::5'] all = vvlist + cclist + dbllist + sholist + dellist + epelist + speclist re = '[{}]*'.format(' | '.join(all)) print(re) ## algfst = hfst.regex(re) algfile = hfst.HfstOutputStream(filename="chardist.fst") algfile.write(algfst) algfile.flush() algfile.close()
def build(self, verbosity=1):
    """
    Build the tableau FST from the submitted gen and constraints.

    Starting from a copy of ``self._gen``, every constraint is applied in
    order; for each one, lookup-optimized snapshots taken before and
    after penalization are appended to ``self._stepwise``.  Finally the
    auxiliary symbols are removed and ``self._runnable`` is optimized for
    lookup.

    :param verbosity: Amount of information to be printed during building.
        0 = print nothing, 1 = print progress in single line (default),
        2+ = print time and FST size for each constraint
    """
    start = time.time()
    # Work on an un-optimized copy; gen itself is re-optimized afterwards.
    self._gen.remove_optimization()
    self._runnable = self._gen.copy()
    self._optimize_lookup(self._gen)
    if verbosity > 1:
        print("Gen: %d states, %d arcs" % (self._runnable.number_of_states(),
                                           self._runnable.number_of_arcs()),
              flush=True)
    n = len(self._constraints)
    for (i, constraint) in enumerate(self._constraints):
        c_start = time.time()
        if verbosity == 1:
            # "\r" rewrites the same terminal line on every iteration.
            print("\rApplying constraints... (%d/%d)" % (i, n), end="",
                  flush=True)
        elif verbosity > 1:
            print("Constraint %d: " % i, end="", flush=True)
        # Step 1: compose with the constraint only (no penalization yet).
        constraint.apply(self._runnable, no_penalty=True)
        self._runnable.minimize()
        # Snapshot before penalization.
        before = self._runnable.copy()
        self._optimize_lookup(before)
        # Step 2: penalize without pardoning.
        penalize(self._runnable, constraint.n(), no_pardon=True,
                 method=self._penal_method)
        self._runnable.minimize()
        # Snapshot after penalization.
        after = self._runnable.copy()
        self._optimize_lookup(after)
        # Step 3: pardon before moving on to the next constraint.
        pardon(self._runnable)
        self._runnable.minimize()
        self._stepwise.append((before, after))
        if verbosity > 1:
            c_end = time.time()
            print("%d states, %d arcs (%.2f sec.)" %
                  (self._runnable.number_of_states(),
                   self._runnable.number_of_arcs(), c_end - c_start),
                  flush=True)
    # Clean-up rules mapping the listed prefix/boundary expressions and
    # no_sym to 0.
    finish = hfst.regex(out_prefix + " | " + word_bound + " " + syl_bound +
                        " | " + syl_bound + " " + word_bound + " | " +
                        nucl_bound + " -> 0")
    finish2 = hfst.regex(no_sym + " -> 0")
    finish.compose(finish2)
    self._runnable.compose(finish)
    self._runnable.minimize()
    if verbosity > 1:
        print("Final: %d states, %d arcs" % (self._runnable.number_of_states(),
                                             self._runnable.number_of_arcs()),
              flush=True)
    self._optimize_lookup(self._runnable)
    end = time.time()
    if verbosity > 0:
        if verbosity == 1:
            # Return to the start of the progress line before the summary.
            print("\r", end="")
        print("Build complete in %.2f seconds." % (end - start), flush=True)
types = [] if hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.SFST_TYPE): types.append(hfst.ImplementationType.SFST_TYPE) if hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.TROPICAL_OPENFST_TYPE): types.append(hfst.ImplementationType.TROPICAL_OPENFST_TYPE) if hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.FOMA_TYPE): types.append(hfst.ImplementationType.FOMA_TYPE) for type in types: if hfst.HfstTransducer.is_implementation_type_available(type): hfst.set_default_fst_type(type) # StreamIsClosedException try: tr = hfst.regex('foo') outstr = hfst.HfstOutputStream(filename='testfile') outstr.close() outstr.write(tr) except hfst.exceptions.StreamIsClosedException: print("Could not write transducer: stream to file was closed.") # TransducerIsCyclicException transducer = hfst.regex('[a:b]*') try: results = transducer.extract_paths(output='text') print("The transducer has %i paths:" % len(results)) print(results) except hfst.exceptions.TransducerIsCyclicException: print("The transducer is cyclic and has an infinite number of paths. Some of them:") results = transducer.extract_paths(output='text', max_cycles=5)
def expr(e):
    """Compile the XFST regular expression *e* and return the minimized FST."""
    compiled = hfst.regex(e)
    compiled.minimize()
    return compiled
def boundary(self, ast):
    """Return the FST of the symbol ``END``, named ``.#.``.

    The *ast* argument is not used.
    """
    end_fst = hfst.regex("END")
    end_fst.set_name(".#.")
    return end_fst
def delete_aux(fst):
    """Compose *fst* in place with a rule that deletes the auxiliary symbols.

    The symbols removed are ``<P>``, ``<S>``, ``<E>``, ``<D>`` and ``<.>``.
    """
    aux_symbols = ('"<P>"', '"<S>"', '"<E>"', '"<D>"', '"<.>"')
    deletion_rule = '[ ' + ' | '.join(aux_symbols) + ' ] -> 0'
    fst.compose(hfst.regex(deletion_rule))
# -*- coding: utf-8 -*-
"""Check ``lookup('foo')`` on small transducers for every available backend.

An optional first command-line argument is prepended to ``sys.path``
before ``hfst`` is imported.
"""
import sys
if len(sys.argv) > 1:
    sys.path.insert(0, sys.argv[1])
import hfst

# (regular expression, expected output of the only lookup result)
lookup_cases = (
    ('[foo:bar] | [?:B ?:A ?:R]', 'bar'),
    ('[f:0 o:0 o:foo]', '@_EPSILON_SYMBOL_@@_EPSILON_SYMBOL_@foo'),
    ('[foo:bar]|[f:B o:A o:R]', 'bar'),
)

for fst_type in (hfst.ImplementationType.SFST_TYPE,
                 hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
                 hfst.ImplementationType.FOMA_TYPE):
    if not hfst.HfstTransducer.is_implementation_type_available(fst_type):
        continue
    hfst.set_default_fst_type(fst_type)
    for expression, expected in lookup_cases:
        tr = hfst.regex(expression)
        result = tr.lookup('foo')
        assert len(result) == 1
        assert result[0][0] == expected
def init():
    """Initializes the module by computing several common FSTs

    Assumes that twexamp.read_fst() has read in cfg.examples_fst and
    initialized some symbol sets.

    Sets the module globals pistar_fst, pistar_fsa, diamond_sym,
    diamond_fst, trim_pre_fst and trim_post_fst, and stores copies of
    cfg.all_pairs_fst in cfg.definitions under "PAIRS" and "PI".
    """
    global pistar_fst, pistar_fsa, diamond_sym, diamond_fst
    global trim_pre_fst, trim_post_fst

    assert cfg.examples_fst, "cfg.examples_fst not loaded (by twexamp module)"
    cfg.definitions["PAIRS"] = cfg.all_pairs_fst.copy()
    cfg.definitions["PI"] = cfg.all_pairs_fst.copy()

    diamond_sym = 'DIAMOND'
    diamond_fst = hfst.regex(diamond_sym)
    # pi_fst: one symbol pair; pistar_fst: any sequence of symbol pairs.
    pi_fst = cfg.all_pairs_fst.copy()
    pistar_fst = cfg.all_pairs_fst.copy()
    pistar_fst.repeat_star()
    pistar_fst.remove_epsilons()
    pistar_fst.minimize()
    pistar_fsa = hfst.fst_to_fsa(pistar_fst, separator='^')
    # Input and output projections of the single-pair and starred FSTs.
    pi_in_fst = pi_fst.copy()
    pi_in_fst.input_project()
    pi_out_fst = pi_fst.copy()
    pi_out_fst.output_project()
    pi_in_star_fst = pistar_fst.copy()
    pi_in_star_fst.input_project()
    pi_out_star_fst = pistar_fst.copy()
    pi_out_star_fst.output_project()
    if cfg.verbosity >= 20:
        twbt.ppfst(pistar_fst, title="pistar_fst")

    # trim_pre_fst, built with the fs helpers; the commented-out XRC
    # expression below gives the intended regular expression.
    fst1 = fs.star(fs.crossprod(fs.expr("ZERO"), pi_in_fst))
    fst2 = fs.star(fs.concat(fst1, fs.expr("ZERO:BEGIN")))
    fst3 = fs.concat(fst2, pi_in_star_fst)
    fst4 = fs.star(
        fs.concat(fs.expr("ZERO:END"),
                  fs.star(fs.crossprod(fs.expr("ZERO"), pi_in_fst))))
    trim_pre_fst = fs.concat(fst3, fst4)
    trim_pre_fst.set_name("trim_pre_fst")
    #trim_pre_fst = XRC.compile(
    #    "[[ZERO .x. [PI].u]* ZERO:BEGIN]* " \
    #    "[[PI].u]* " \
    #    "[ZERO:END [ZERO .x. [PI].u]*]*"
    #)

    # trim_post_fst: same construction with the output projections and
    # the sides of each pair swapped.
    fst1 = fs.star(fs.crossprod(pi_out_fst, fs.expr("ZERO")))
    fst2 = fs.star(fs.concat(fst1, fs.expr("BEGIN:ZERO")))
    fst3 = fs.concat(fst2, pi_out_star_fst)
    fst4 = fs.star(
        fs.concat(fs.expr("END:ZERO"),
                  fs.star(fs.crossprod(pi_out_fst, fs.expr("ZERO")))))
    trim_post_fst = fs.concat(fst3, fst4)
    trim_post_fst.set_name("trim_post_fst")
    #trim_post_fst = XRC.compile(
    #    "[[[PI].l .x. ZERO]* BEGIN:ZERO]* " \
    #    "[[PI].l]* " \
    #    "[END:ZERO [[PI].l .x. ZERO]*]*"
    #)
    if cfg.verbosity >= 20:
        twbt.ppfst(trim_pre_fst)
        twbt.ppfst(trim_post_fst)
    return
for tr in r: transducers.append(tr) assert(f.closed) assert(len(transducers)) == 4 transducers = [] with open('testfile_fail.att', 'r') as f: try: r = hfst.AttReader(f, "<eps>") for tr in r: transducers.append(tr) except hfst.exceptions.NotValidAttFormatException as e: assert("1 baz baz 0.3" in e.what()) assert("line: 11" in e.what()) assert(f.closed) assert(len(transducers)) == 4 transducers = [] with open('testfile_unicode.att', 'r') as f: r = hfst.AttReader(f) for tr in r: transducers.append(tr) assert(f.closed) assert(len(transducers)) == 1 TR = hfst.regex('föö:bär::0.5') assert(TR.compare(transducers[0]))
def test_fst(input, result):
    """Check that ``hfst.fst(input)`` equals the transducer of regex *result*.

    Raises:
        RuntimeError: If the two transducers do not compare equal.
    """
    actual = hfst.fst(input)
    expected = hfst.regex(result)
    if actual.compare(expected):
        return
    raise RuntimeError('test_fst failed with input: ' + input)
print('HERE!!!') for type in types: print('\n--- Testing implementation type %s ---\n' % hfst.fst_type_to_string(type)) hfst.set_default_fst_type(type) tr1 = None tr2 = None tr3 = None type_ = hfst.ImplementationType.TROPICAL_OPENFST_TYPE ostr = hfst.HfstOutputStream(filename='foobar.hfst', type=type_) tr_ = hfst.regex('{foo}:{bar}::0.5') tr_.convert(type_) ostr.write(tr_) ostr.write(tr_) ostr.flush() ostr.close() if not os.path.isfile('foobar.hfst'): raise RuntimeError('Missing file: foobar.hfst') istr = hfst.HfstInputStream('foobar.hfst') numtr = 0 try: tr1 = istr.read() numtr += 1
if hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.TROPICAL_OPENFST_TYPE): types.append(hfst.ImplementationType.TROPICAL_OPENFST_TYPE) if hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.FOMA_TYPE): types.append(hfst.ImplementationType.FOMA_TYPE) from hfst.xerox_rules import * from hfst import regex for type in types: if hfst.HfstTransducer.is_implementation_type_available(type): hfst.set_default_fst_type(type) rule = Rule() # just testing the default constructor mapping = ( (regex('a'),regex('b')), ) rule = Rule(mapping) assert(replace(rule, False).compare(regex('a -> b'))) assert(replace(rule, True).compare(regex('a (->) b'))) mapping = ( (regex('a'),regex('b')), (regex('b'),regex('a')) ) rule = Rule(mapping) assert(replace(rule, False).compare(regex('a -> b, b -> a'))) assert(replace(rule, True).compare(regex('a (->) b, b (->) a'))) for repl_type in [(ReplaceType.REPL_UP, '||'), (ReplaceType.REPL_DOWN, '\\/'), (ReplaceType.REPL_LEFT, '\\\\'), (ReplaceType.REPL_RIGHT,'//')]: mapping1 = ( (regex('a'),regex('b')), ) context1 = ( (regex('c'),regex('c')), ) rule1 = Rule(mapping1, context1, repl_type[0])
types.append(hfst.ImplementationType.FOMA_TYPE) for type in types: print('\n--- Testing implementation type %s ---\n' % hfst.fst_type_to_string(type)) hfst.set_default_fst_type(type) tr1 = None tr2 = None tr3 = None type_ = hfst.ImplementationType.TROPICAL_OPENFST_TYPE ostr = hfst.HfstOutputStream(filename='foobar.hfst', type=type_) tr_ = hfst.regex('{foo}:{bar}::0.5') tr_.convert(type_) ostr.write(tr_) ostr.write(tr_) ostr.flush() ostr.close() if not os.path.isfile('foobar.hfst'): raise RuntimeError('Missing file: foobar.hfst') istr = hfst.HfstInputStream('foobar.hfst') numtr = 0 try: tr1 = istr.read() numtr += 1
def expand(fst):
    """Compose *fst* in place with a rule expanding every ``<S>`` symbol.

    Each ``<S>`` is rewritten as the sequence
    ``<S> <E> <.> <E> <S>``.
    """
    expansion = ' '.join(('"<S>"', '"<E>"', '"<.>"', '"<E>"', '"<S>"'))
    rule = hfst.regex('"<S>" -> [ ' + expansion + ' ]')
    fst.compose(rule)
# Each exception class is instantiated with the same three arguments;
# the instance is immediately overwritten, so presumably only the
# constructor call itself is being exercised.
e = hfst.exceptions.ContextTransducersAreNotAutomataException('foo','bar', 10)
e = hfst.exceptions.TransducersAreNotAutomataException('foo','bar', 10)
e = hfst.exceptions.StateIndexOutOfBoundsException('foo','bar', 10)
e = hfst.exceptions.TransducerHeaderException('foo','bar', 10)
e = hfst.exceptions.MissingOpenFstInputSymbolTableException('foo','bar', 10)
e = hfst.exceptions.TransducerTypeMismatchException('foo','bar', 10)
e = hfst.exceptions.EmptySetOfContextsException('foo','bar', 10)
e = hfst.exceptions.SpecifiedTypeRequiredException('foo','bar', 10)
e = hfst.exceptions.HfstFatalException('foo','bar', 10)
e = hfst.exceptions.TransducerHasWrongTypeException('foo','bar', 10)
e = hfst.exceptions.IncorrectUtf8CodingException('foo','bar', 10)
e = hfst.exceptions.EmptyStringException('foo','bar', 10)
e = hfst.exceptions.SymbolNotFoundException('foo','bar', 10)
e = hfst.exceptions.MetadataException('foo','bar', 10)
e = hfst.exceptions.FlagDiacriticsAreNotIdentitiesException('foo','bar', 10)

import hfst

# Test that importing exceptions via a package works
if hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.FOMA_TYPE) and hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.TROPICAL_OPENFST_TYPE):
    try:
        foo = hfst.regex('foo')
        bar = hfst.regex('bar')
        # Convert the operands to two different backends so that
        # concatenating them is expected to raise.
        foo.convert(hfst.ImplementationType.FOMA_TYPE)
        bar.convert(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
        foo.concatenate(bar)
        # Reached only if no exception was raised: fail the check.
        assert False
    except hfst.exceptions.TransducerTypeMismatchException as e:
        pass
def symbol_or_pair(self, ast):
    """Return the FST for a symbol, a symbol pair or a defined name.

    ``ast.token`` (stripped) is first matched as ``up:lo`` where either
    side may be empty; the sides are validated against the symbol sets
    in ``cfg``.  Otherwise it is matched as a bare name and looked up in
    ``cfg.definitions``, in the input and output symbol sets, or
    accepted as ``BEGIN``/``END``.

    Raises:
        FailedSemantics: If a pair mentions symbols or a pair not in the
            alphabet, or if the token matches none of the accepted
            forms.  ``cfg.error_message`` is set to the same message.
    """
    string = ast.token.strip()
    failmsg = []
    # up: a plain symbol or a {braced} morphophoneme (possibly empty);
    # lo: a plain symbol (possibly empty).
    pat = re.compile(
        r"""^ (?P<up>[a-zšžåäöüõA-ZÅÄÖ0-9'´`]* | \{[a-zåäöüõA-ZÅÄÖØ'´`]+\}) : (?P<lo>[a-zšžåäöüõA-ZÅÄÖØ'´`]*) $""",
        re.X)
    m = re.match(pat, string)
    if m:  # it is a pair with a colon
        up = m.group("up")
        # Curly braces are escaped with % before the text is used in a regex.
        up_quoted = re.sub(r"([{}])", r"%\1", up)
        lo = m.group("lo")
        if up and (up not in cfg.input_symbol_set):
            failmsg.append("input symbol '{}'".format(up))
        if lo and (lo not in cfg.output_symbol_set):
            failmsg.append("output symbol '{}'".format(lo))
        if up and lo and ((up, lo) not in cfg.symbol_pair_set):
            failmsg.append("symbol pair '{}'".format(string))
        if failmsg:
            cfg.error_message = " and ".join(failmsg) + " not in alphabet"
            raise FailedSemantics(cfg.error_message)
        elif up and lo:  # it is a valid pair with a colon
            result_fst = hfst.regex(up_quoted + ':' + lo)
            result_fst.set_name(string)
            return result_fst
        elif up and (not lo):
            # "up:" -- the input symbol composed with all pairs.
            result_fst = hfst.regex(up_quoted)
            result_fst.compose(cfg.all_pairs_fst)
            result_fst.set_name(string)
            return result_fst
        elif (not up) and lo:
            # ":lo" -- all pairs composed with the output symbol.
            result_fst = cfg.all_pairs_fst.copy()
            lo_fst = hfst.regex(lo)
            result_fst.compose(lo_fst)
            result_fst.set_name(string)
            return result_fst
        else:
            # A bare ":" -- a copy of all pairs, named PI.
            result_fst = cfg.all_pairs_fst.copy()
            result_fst.set_name("PI")
            return result_fst
    m = re.fullmatch(r"[a-zåäöšžüõA-ZÅÄÖØ'´`]+", string)
    if m:  # it is either a defined symbol or a surface character
        if string in cfg.definitions:
            result_fst = cfg.definitions[string].copy()
            result_fst.set_name(string)
            return result_fst
        elif (string in cfg.output_symbol_set) and (string in cfg.input_symbol_set):
            result_fst = hfst.regex(string)
            result_fst.set_name(string)
            return result_fst
        elif string in {'BEGIN', 'END'}:
            result_fst = hfst.regex(string)
            result_fst.set_name(string)
            return result_fst
    # Nothing matched (or a bare name was not recognised).
    cfg.error_message = "'" + string + "' is an invalid pair/definend symbol"
    raise FailedSemantics(cfg.error_message)
def separators(fst):
    """Compose *fst* in place with the rule ``0 -> "<S>"``."""
    separator_rule = hfst.regex('0 -> "<S>"')
    fst.compose(separator_rule)
hfst.set_default_fst_type(hfst.ImplementationType.SFST_TYPE) elif sys.argv[1] == 'foma': if not hfst.HfstTransducer.is_implementation_type_available( hfst.ImplementationType.FOMA_TYPE): sys.exit(77) hfst.set_default_fst_type(hfst.ImplementationType.FOMA_TYPE) elif sys.argv[1] == 'openfst': if not hfst.HfstTransducer.is_implementation_type_available( hfst.ImplementationType.TROPICAL_OPENFST_TYPE): sys.exit(77) hfst.set_default_fst_type(hfst.ImplementationType.TROPICAL_OPENFST_TYPE) else: raise RuntimeError('implementation format not recognized') transducers = [] try: while (True): transducers.append(hfst.read_att_transducer(sys.stdin)) except hfst.exceptions.EndOfStreamException: pass if not len(transducers) == 3: raise RuntimeError('Wrong number of transducers read.') i = 0 for re in ['föö:bär', '0', '0-0']: if not transducers[i].compare(hfst.regex(re)): raise RuntimeError('Transducers are not equivalent.') i += 1
def double(fst, chars):
    """Compose *fst* in place with the two doubling rule sets.

    The first rule inserts ``<D> <.>`` after ``<S>`` when followed by a
    symbol other than ``<S>``, and at the beginning of the string.  The
    second step is a parallel rule with one alternative per entry of
    *chars*, rewriting ``<D> <.> c`` as ``c <.> c``.
    """
    insert_markers = hfst.regex(
        '0 -> "<D>" "<.>" || "<S>" _ [ ? - "<S>" ] ,, 0 -> [ "<D>" "<.>" ] || .#. _ ')
    fst.compose(insert_markers)
    per_char_rules = ' ,, '.join(
        '"<D>" "<.>" %s -> %s "<.>" %s' % (c, c, c) for c in chars)
    fst.compose(hfst.regex(per_char_rules))
import argparse

# Epsilon symbol as provided by hfst, and the quoted padding symbol.
eps = hfst.EPSILON
pad = '"<P>"'
# Identity pairs for the two symbols above.
eps_pair = (
    eps,
    eps,
)
pad_pair = (
    pad,
    pad,
)
tok = hfst.HfstTokenizer()
# Weighted transducer: identity and 0:0 cost 0; substitution, insertion
# and deletion cost 1 each.
levenshtein = hfst.regex('[ ?::0 | ?:?::1 | 0:?::1 | ?:0::1 | 0:0::0 ]*')

# Replacements applied by clean(): backslashes are doubled and the
# character '\x84' is removed.
cldict = {
    '\\': '\\\\',
    '\x84': '',
}


def clean(s):
    """ Remove and escape certain characters

    Every key of ``cldict`` occurring in *s* is replaced by its value;
    the resulting string is returned.
    """
    for a, b in cldict.items():
        s = s.replace(a, b)
    return s
# -*- coding: utf-8 -*- import sys if len(sys.argv) > 1: sys.path.insert(0, sys.argv[1]) import hfst for type in [hfst.ImplementationType.SFST_TYPE, hfst.ImplementationType.TROPICAL_OPENFST_TYPE, hfst.ImplementationType.FOMA_TYPE]: if hfst.HfstTransducer.is_implementation_type_available(type): f = open('cats_and_dogs.prolog', 'r') F = open('tmp', 'w') tr = hfst.read_prolog_transducer(f) re = hfst.regex('{cat}') assert(tr.compare(re)) tr.write_prolog(F, True) F.write('\n') tr = hfst.read_prolog_transducer(f) re = hfst.regex('0 - 0') assert(tr.compare(re)) tr.write_prolog(F, True) F.write('\n') tr = hfst.read_prolog_transducer(f) re = hfst.regex('{dog}:{cat}::0.5') assert(tr.compare(re)) tr.write_prolog(F, True) F.write('\n') tr = hfst.read_prolog_transducer(f)
# Each exception class is instantiated with the same three arguments;
# the instance is immediately overwritten, so presumably only the
# constructor call itself is being exercised.
e = hfst.exceptions.StateIndexOutOfBoundsException('foo', 'bar', 10)
e = hfst.exceptions.TransducerHeaderException('foo', 'bar', 10)
e = hfst.exceptions.MissingOpenFstInputSymbolTableException('foo', 'bar', 10)
e = hfst.exceptions.TransducerTypeMismatchException('foo', 'bar', 10)
e = hfst.exceptions.EmptySetOfContextsException('foo', 'bar', 10)
e = hfst.exceptions.SpecifiedTypeRequiredException('foo', 'bar', 10)
e = hfst.exceptions.HfstFatalException('foo', 'bar', 10)
e = hfst.exceptions.TransducerHasWrongTypeException('foo', 'bar', 10)
e = hfst.exceptions.IncorrectUtf8CodingException('foo', 'bar', 10)
e = hfst.exceptions.EmptyStringException('foo', 'bar', 10)
e = hfst.exceptions.SymbolNotFoundException('foo', 'bar', 10)
e = hfst.exceptions.MetadataException('foo', 'bar', 10)
e = hfst.exceptions.FlagDiacriticsAreNotIdentitiesException('foo', 'bar', 10)

import hfst

# Test that importing exceptions via a package works
if hfst.HfstTransducer.is_implementation_type_available(
        hfst.ImplementationType.FOMA_TYPE
) and hfst.HfstTransducer.is_implementation_type_available(
        hfst.ImplementationType.TROPICAL_OPENFST_TYPE):
    try:
        foo = hfst.regex('foo')
        bar = hfst.regex('bar')
        # Convert the operands to two different backends so that
        # concatenating them is expected to raise.
        foo.convert(hfst.ImplementationType.FOMA_TYPE)
        bar.convert(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
        foo.concatenate(bar)
        # Reached only if no exception was raised: fail the check.
        assert False
    except hfst.exceptions.TransducerTypeMismatchException as e:
        pass
import sys, fileinput, io, hfst s2e_file = hfst.HfstInputStream("s2m.fst") s2e = s2e_file.read() # print(s2e.number_of_states()) def print_results(paths): for path in paths.strip().split('\n'): print("\t" + path.split(':')[0]) while True: res = hfst.regex("?*") # print(res) print("Enter forms of the next lemma") while True: try: line = input() except EOFError: sys.exit() l = " ".join(list(line.strip())) # print("word = " + l) a = hfst.regex(l) a.compose(s2e) a.output_project() a.minimize() a.extract_paths(max_number=10) a.minimize() nps = a.extract_paths(output='text') # print(" tentative new entries = ") # print_results(nps)
import hfst

# NOTE(review): the loop checks availability of each backend but never
# calls hfst.set_default_fst_type, so every iteration builds the compiler
# with hfst.get_default_fst_type(); confirm whether that is intended.
for fst_type in (hfst.ImplementationType.SFST_TYPE,
                 hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
                 hfst.ImplementationType.FOMA_TYPE):
    if not hfst.HfstTransducer.is_implementation_type_available(fst_type):
        continue

    comp = hfst.XreCompiler(hfst.get_default_fst_type())
    comp.set_expand_definitions(True)

    # Definitions given as an expression and as a transducer; 'Bar' is
    # defined and undefined again, so it is compared as a plain symbol.
    comp.define_xre('FooStar', '[foo]*')
    tr = hfst.regex('[foo]+')
    comp.define_transducer('FooPlus', tr)
    comp.define_xre('Bar', 'bar')
    comp.undefine('Bar')
    TR = comp.compile('FooStar a FooPlus Bar')
    TR1 = hfst.regex('[foo* a foo+ Bar]')
    assert TR1.compare(TR)

    # '.l' applied to a defined transducer is compared against 'bar'.
    tr = hfst.regex('foo:bar')
    comp.define_transducer('FooBar', tr)
    TR = comp.compile('FooBar.l')
    TR1 = hfst.regex('bar')
    assert TR1.compare(TR)
def single(fst, chars):
    """Compose *fst* in place with the rule ``"<S>" ? -> 0``.

    The *chars* argument is accepted but not used.
    """
    deletion_rule = hfst.regex('"<S>" ? -> 0')
    fst.compose(deletion_rule)