def read_fst(filename="examples.fst"):
    """Read a previously stored example FST and rebuild the symbol sets.

    The transducer read from ``filename`` is stored in ``cfg.examples_fst``.
    Its ``x-pair_symbols`` property (a space-separated list of pair-symbols,
    as written by ``read_examples``) is used to fill ``cfg.pair_symbol_set``,
    ``cfg.symbol_pair_set``, ``cfg.input_symbol_set`` and
    ``cfg.output_symbol_set``.  Finally ``cfg.all_pairs_fst`` is built as the
    union of one single-pair FST per symbol pair.

    filename -- name of the file containing the stored FST
    """
    import hfst

    exfile = hfst.HfstInputStream(filename)
    try:
        cfg.examples_fst = exfile.read()
    finally:
        # Only one transducer is read; release the underlying file handle
        # even if reading fails.
        exfile.close()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # Skip empty fields so that an empty property or stray leading/trailing
    # blanks do not produce a bogus empty pair-symbol.
    pair_symbol_lst = [pair for pair in re.split(r" +", pair_symbols) if pair]
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        insym, outsym = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        # Braces in the input symbol are quoted with '%' before the symbol is
        # embedded in the regular expression.
        # NOTE(review): no other characters are quoted, so symbols containing
        # other regex-special characters will still break hfst.regex().
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
def read_examples(filename="test.pstr", build_fsts=True):
    """Read the examples from the file whose name is 'filename'.

    The file must contain one example per line and each line consists of
    a whitespace separated sequence of pair-symbols.  Empty lines and lines
    starting with "!" are skipped.

    The examples are processed into:

    cfg.example_lst -- list of the examples as strings of pair-symbols
        separated by single spaces, in file order
    cfg.example_set -- the same normalized strings as a set
    cfg.symbol_pair_set -- set of (insym, outsym) tuples found in the examples
    cfg.input_symbol_set, cfg.output_symbol_set -- the input and output
        symbols of those pairs
    cfg.pair_symbol_set -- the pairs as pair-symbols
    cfg.examples_fst -- only if build_fsts is true: a minimized FST of all
        examples, named after the file and carrying the property
        "x-pair_symbols" (sorted "insym:outsym" items separated by spaces)

    filename -- name of the example file
    build_fsts -- whether hfst is imported and cfg.examples_fst is built
    """
    if build_fsts:
        import hfst
        examples_bfst = hfst.HfstBasicTransducer()
    # The context manager closes the file even if a malformed line makes
    # the pair-symbol conversion raise.
    with open(filename, "r") as exfile:
        for line_nl in exfile:
            line = line_nl.strip()
            if not line or line.startswith("!"):
                continue
            pairsym_lst = re.split(r"\s+", line)
            symbol_pair_lst = [cfg.pairsym2sympair(pairsym)
                               for pairsym in pairsym_lst]
            # Re-joining with single spaces normalizes the spacing.
            pair_symbol_str = " ".join(cfg.sympair2pairsym(insym, outsym)
                                       for insym, outsym in symbol_pair_lst)
            cfg.example_lst.append(pair_symbol_str)
            cfg.example_set.add(pair_symbol_str)
            if build_fsts:
                examples_bfst.disjunct(symbol_pair_lst, 0)
            for insym, outsym in symbol_pair_lst:
                cfg.symbol_pair_set.add((insym, outsym))
    if cfg.verbosity >= 30:
        print("List of examples:", cfg.example_lst)
        print("List of alphabet symbol pairs:", sorted(cfg.symbol_pair_set))
    if build_fsts:
        cfg.examples_fst = hfst.HfstTransducer(examples_bfst)
        cfg.examples_fst.set_name(filename)
        cfg.examples_fst.minimize()
        if cfg.verbosity >= 30:
            twbt.ppfst(cfg.examples_fst, False, title="Example file as FST")
    for insym, outsym in cfg.symbol_pair_set:
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
        cfg.pair_symbol_set.add(cfg.sympair2pairsym(insym, outsym))
    if build_fsts:
        # The property always uses the explicit "insym:outsym" form, also
        # for pairs whose input and output symbols are the same.
        pair_symbol_lst = [insym + ':' + outsym
                           for insym, outsym in cfg.symbol_pair_set]
        pair_symbol_str = " ".join(sorted(pair_symbol_lst))
        cfg.examples_fst.set_property("x-pair_symbols", pair_symbol_str)
    return
def read_fst(filename="examples.fst"):
    """Read a previously stored example FST and rebuild the symbol sets.

    The transducer read from ``filename`` is stored in ``cfg.examples_fst``.
    Its ``x-pair_symbols`` property (a space-separated list of pair-symbols)
    is used to fill ``cfg.pair_symbol_set``, ``cfg.symbol_pair_set``,
    ``cfg.input_symbol_set`` and ``cfg.output_symbol_set``.  Finally
    ``cfg.all_pairs_fst`` is built from the symbol pairs with
    ``pairs_to_fst``.

    filename -- name of the file containing the stored FST
    """
    exfile = hfst.HfstInputStream(filename)
    try:
        cfg.examples_fst = exfile.read()
    finally:
        # Only one transducer is read; release the underlying file handle
        # even if reading fails.
        exfile.close()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # Skip empty fields so that an empty property or stray leading/trailing
    # blanks do not produce a bogus empty pair-symbol.
    pair_symbol_lst = [pair for pair in re.split(r" +", pair_symbols) if pair]
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        insym, outsym = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
def relevant_contexts(pair_symbol):
    """Select positive and negative contexts for a given pair-symbol

    pair_symbol -- the pair-symbol for which the contexts are selected

    returns a tuple of:

    pos_context_set -- a set of contexts in the examples where the
        pair_symbol occurs

    neg_context_set -- a set of contexts where the input-symbol of the
        pair_symbol occurs with another output-symbol but so that there is
        no example in the example_set where the pair_symbol occurs in such
        a context

    Each context is a tuple (left_context, right_context) of strings of
    pair-symbols separated by single spaces.  The left context starts with
    the boundary ".#." and the right context ends with it.
    """
    input_symbol, _output_symbol = cfg.pairsym2sympair(pair_symbol)
    positive_context_set = set()
    negative_context_set = set()
    # Escape every regex metacharacter of the pair-symbols, not only braces.
    alternatives = "|".join(
        re.escape(psym) for psym in sorted(pair_symbols_for_input[input_symbol]))
    # The lookarounds restrict matches to whole space-delimited pair-symbols,
    # so a short pair-symbol cannot match inside a longer one.
    pattern = re.compile(r"(?<!\S)(?:" + alternatives + r")(?!\S)")
    for example in cfg.example_set:
        for m in pattern.finditer(example):
            i1, i2 = m.span()
            if i1 == 0:
                # Nothing precedes the match; slicing with i1 - 1 == -1 here
                # would wrongly take almost the whole example.
                left_context = ".#."
            else:
                # i1 - 1 drops the space that separates the match from
                # the preceding pair-symbol.
                left_context = ".#. " + example[:i1 - 1]
            centre = example[i1:i2]
            if i2 >= len(example):
                right_context = ".#."
            else:
                # i2 + 1 skips the space that follows the match.
                right_context = example[i2 + 1:] + " .#."
            context = (left_context, right_context)
            if centre == pair_symbol:
                positive_context_set.add(context)
            else:
                negative_context_set.add(context)
    # A context in which the pair-symbol itself is attested is not negative.
    negative_context_set = negative_context_set - positive_context_set
    return positive_context_set, negative_context_set
def context_to_output_str(pairsym_str):
    """Return the output side of a string of pair-symbols.

    pairsym_str -- pair-symbols separated by single spaces

    Each pair-symbol is converted with cfg.pairsym2sympair and the output
    symbols of the resulting pairs are concatenated without separators.
    """
    tokens = pairsym_str.split(" ")
    return "".join(
        out_part for _in_part, out_part in map(cfg.pairsym2sympair, tokens)
    )
twexamp.read_examples(filename=args.examples, build_fsts=False) if cfg.verbosity >= 5: print("--- all examples read in ---") for insym in cfg.input_symbol_set: pair_symbols_for_input[insym] = set() for insym, outsym in cfg.symbol_pair_set: pair_symbol = cfg.sympair2pairsym(insym, outsym) pair_symbols_for_input[insym].add(pair_symbol) if args.symbol: pair_set = pair_symbols_for_input[args.symbol] pair_lst = [] for pairsym in pair_set: insym, outsym = cfg.pairsym2sympair(pairsym) pair_lst.append((insym, outsym)) if cfg.verbosity >= 10: print("pair_lst:", pair_lst) else: pair_lst = sorted(cfg.symbol_pair_set) for insym, outsym in pair_lst: if len(pair_symbols_for_input[insym]) <= 1: continue pair_symbol = cfg.sympair2pairsym(insym, outsym) posi_contexts, nega_contexts = relevant_contexts(pair_symbol) pos_contexts, neg_contexts = minimal_contexts(pair_symbol, posi_contexts.copy(), nega_contexts.copy()) if len(pos_contexts) <= len(neg_contexts) or cfg.verbosity > 0: