示例#1
0
def correct_to_incorrect(x_fst, side):
    """Build the transformation used to derive negative examples for <= rules.

    Negative examples for <= rules are obtained by rewriting correct
    input:output pairs so that the chosen side may differ.  The result
    relates each correct occurrence of X to every variant of it
    (correct or incorrect), with arbitrary pair strings allowed before
    and after.

    x_fst -- the FST for the X part of the rule

    side -- "input" selects mix_input(); any other value (normally
    "output") selects mix_output()

    returns: an FST over encoded pair symbols (pairs joined with "^"),
    named "Correct to incorrect"
    """
    # Only the selected mixer is looked up and called.
    mixer = mix_input if side == "input" else mix_output
    variants_fsa = mixer(x_fst)

    x_encoded = hfst.fst_to_fsa(x_fst, separator="^")
    # After the cross product, correct X maps to all of its variations.
    x_encoded.cross_product(variants_fsa)

    # Surround the mapping with PI* on both sides.
    mapping = pistar_fsa.copy()
    for piece in (x_encoded, pistar_fsa):
        mapping.concatenate(piece)
    mapping.minimize()
    mapping.set_name("Correct to incorrect")
    return mapping
示例#2
0
def incorrect_to_correct(x_fst):
    """Build the scrambler used to derive negative examples for => rules.

    Negative examples for right-arrow rules are made by altering some
    correct occurrences of X so that the output part becomes something
    else, which is incorrect because it then appears in an unexpected
    context.

    x_fst -- FST for the center part (X) of a rule

    Returns: an encoded FST relating encoded instances of X to all
    correct and incorrect pairs sharing the same input symbol.  It is
    named "Scrambler " followed by the name of x_fst.
    """
    encoded_x = hfst.fst_to_fsa(x_fst, separator="^")

    variants = mix_output(x_fst)  # an encoded FSA at this point
    variants.cross_product(encoded_x)  # from here on a transducer

    # Allow any pair string before and after the scrambled center.
    scrambler_fst = pistar_fsa.copy()
    for piece in (variants, pistar_fsa):
        scrambler_fst.concatenate(piece)
    scrambler_fst.minimize()

    label = "Scrambler " + x_fst.get_name()
    scrambler_fst.set_name(label)
    return scrambler_fst
示例#3
0
def mix_output(x_fst):
    """Return all realizations of the input side of X as an encoded FSA.

    The input projection of X is composed with PI*, which yields every
    (correct or incorrect) realization of the input side of X.  That
    transducer is then encoded as an FSA over pair symbols so that it
    can be used as a component when turning correct examples into
    incorrect ones.

    x_fst -- the center FST (X part) of a rule; it is copied, not
    modified

    Returns [X.u .o. PI*] encoded as an FSA (pairs joined with "^").
    """
    realizations = x_fst.copy()
    realizations.input_project()
    realizations.compose(pistar_fst)
    realizations.minimize()
    return hfst.fst_to_fsa(realizations, separator="^")
示例#4
0
def init():
    """Compute the module-level FSTs shared by the rule compilation code.

    Assumes that twexamp.read_fst() has read in cfg.examples_fst and
    initialized some symbol sets.

    Sets the globals pistar_fst, pistar_fsa, diamond_sym, diamond_fst,
    trim_pre_fst and trim_post_fst, and stores copies of
    cfg.all_pairs_fst in cfg.definitions under "PAIRS" and "PI".
    """
    global pistar_fst, pistar_fsa, diamond_sym, diamond_fst
    global trim_pre_fst, trim_post_fst

    assert cfg.examples_fst, "cfg.examples_fst not loaded (by twexamp module)"

    for definition_name in ("PAIRS", "PI"):
        cfg.definitions[definition_name] = cfg.all_pairs_fst.copy()

    diamond_sym = 'DIAMOND'
    diamond_fst = hfst.regex(diamond_sym)

    # PI (one pair) and PI* (any pair string), the latter also encoded.
    one_pair_fst = cfg.all_pairs_fst.copy()
    pistar_fst = cfg.all_pairs_fst.copy()
    pistar_fst.repeat_star()
    pistar_fst.remove_epsilons()
    pistar_fst.minimize()
    pistar_fsa = hfst.fst_to_fsa(pistar_fst, separator='^')

    # Input and output projections of PI and of PI*.
    upper_fst = one_pair_fst.copy()
    upper_fst.input_project()
    lower_fst = one_pair_fst.copy()
    lower_fst.output_project()
    upper_star_fst = pistar_fst.copy()
    upper_star_fst.input_project()
    lower_star_fst = pistar_fst.copy()
    lower_star_fst.output_project()
    if cfg.verbosity >= 20:
        twbt.ppfst(pistar_fst, title="pistar_fst")

    # trim_pre_fst corresponds to the expression
    #   [[ZERO .x. [PI].u]* ZERO:BEGIN]*  [[PI].u]*
    #   [ZERO:END [ZERO .x. [PI].u]*]*
    inserted_prefix = fs.star(fs.crossprod(fs.expr("ZERO"), upper_fst))
    up_to_begin = fs.star(fs.concat(inserted_prefix, fs.expr("ZERO:BEGIN")))
    head_and_body = fs.concat(up_to_begin, upper_star_fst)
    end_marker = fs.expr("ZERO:END")
    inserted_suffix = fs.star(fs.crossprod(fs.expr("ZERO"), upper_fst))
    from_end = fs.star(fs.concat(end_marker, inserted_suffix))
    trim_pre_fst = fs.concat(head_and_body, from_end)
    trim_pre_fst.set_name("trim_pre_fst")

    # trim_post_fst corresponds to the expression
    #   [[[PI].l .x. ZERO]* BEGIN:ZERO]*  [[PI].l]*
    #   [END:ZERO [[PI].l .x. ZERO]*]*
    deleted_prefix = fs.star(fs.crossprod(lower_fst, fs.expr("ZERO")))
    up_to_begin = fs.star(fs.concat(deleted_prefix, fs.expr("BEGIN:ZERO")))
    head_and_body = fs.concat(up_to_begin, lower_star_fst)
    end_marker = fs.expr("END:ZERO")
    deleted_suffix = fs.star(fs.crossprod(lower_fst, fs.expr("ZERO")))
    from_end = fs.star(fs.concat(end_marker, deleted_suffix))
    trim_post_fst = fs.concat(head_and_body, from_end)
    trim_post_fst.set_name("trim_post_fst")

    if cfg.verbosity >= 20:
        for trimmer in (trim_pre_fst, trim_post_fst):
            twbt.ppfst(trimmer)
示例#5
0
文件: twol.py 项目: koskenni/pytwolc
# Make the requested diagnostic level available through the cfg module.
cfg.verbosity = args.verbosity
if args.recursion:
    # Change the interpreter's recursion limit only when a value was given.
    sys.setrecursionlimit(args.recursion)

# A name ending in ".fst" is loaded with read_fst(); any other name is
# handed to read_examples().
if args.examples.endswith(".fst"):
    twexamp.read_fst(args.examples)
else:
    twexamp.read_examples(args.examples)

if cfg.verbosity >= 30:
    twbt.ppfst(cfg.examples_fst, title="examples_fst")

parser = twparser.init()

# The examples encoded as an FSA, with "^" as the separator.
examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

# Input projection of the examples.
examples_up_fsa = cfg.examples_fst.copy()
examples_up_fsa.input_project()
if cfg.verbosity >= 30:
    twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

# twrule.init() is called only after the examples have been read.
twrule.init()

# State for the rule-reading loop that follows.
skip = False
all_rules_fst_lst = []
# NOTE(review): no matching close() is visible in this excerpt -- confirm
# that the file is closed further down.
rule_file = open(args.rules, 'r')
line_lst = []
for line_nl in rule_file:
    line = line_nl.split('!', maxsplit=1)[0].strip()
    if line == "START":
示例#6
0
    for transition in state:
        print('%u\t%u\t%s\t%s\t%.2f' % (index, transition.get_target_state(), transition.get_input_symbol(), transition.get_output_symbol(), transition.get_weight()), file=f)
    if fsm.is_final_state(index):
        print('%s\t%.2f' % (index, fsm.get_final_weight(index)), file=f)
    index = index + 1

# Write the printed form of fsm to f as well, then close f.
print(fsm, file=f)
f.close()

# Exercise substitute() with a symbol -> symbol mapping and with a
# (input, output) pair -> pair mapping.
tr = hfst.HfstBasicTransducer(hfst.regex('foo'))
tr.substitute({'foo':'bar'})
tr.substitute({('foo','foo'):('bar','bar')})

# Round trip: transducer -> FSA -> transducer, using '^' as the separator
# in both directions.  The result must compare equal to the original.
tr = hfst.fst({'foo':'bar'})
fst = hfst.HfstBasicTransducer(tr)
fsa = hfst.fst_to_fsa(fst, '^')
fst = hfst.fsa_to_fst(fsa, '^')
TR = hfst.HfstTransducer(fst)
assert(TR.compare(tr))

# Visit every arc returned by states_and_transitions() and edit it:
# append '>' to the input symbol, prepend '<' to the output symbol and
# lower the weight by 0.5.
tr = hfst.regex('{foo}:{bar}|{FOO}:{BAR}')
fsm = hfst.HfstBasicTransducer(tr)
net = fsm.states_and_transitions()
for state in net:
    for arc in state:
        arc.set_input_symbol(arc.get_input_symbol() + '>')
        arc.set_output_symbol('<' + arc.get_output_symbol())
        arc.set_weight(arc.get_weight() - 0.5)

for state, arcs in enumerate(fsm):
    for arc in arcs:
示例#7
0
# -d/--debug: integer level of PLY debugging output, 0 by default.
arpar.add_argument("-d",
                   "--debug",
                   help="level of PLY debugging output",
                   type=int,
                   default=0)
# -p/--parser: name of the rule parser to use, "ply" by default.
arpar.add_argument("-p",
                   "--parser",
                   help="which parser to use: ply or tatsu",
                   default="ply")
args = arpar.parse_args()

print('Reading examples from:', args.examples)
twex.read_fst(args.examples)

# The examples encoded as an FSA, with "^" as the separator.
examples_fsa = twex.EXAMPLES.copy()
examples_fsa = hfst.fst_to_fsa(examples_fsa, separator="^")

# Input projection of the examples.
examples_up_fsa = twex.EXAMPLES.copy()
examples_up_fsa.input_project()

twrl.init(args.verbosity)

# Import and initialize only the parser module that was asked for.
if args.parser == "ply":
    import plytw
    plytw.init(args.verbosity)
elif args.parser == "tatsu":
    import twolcsyntax
    twolcsyntax.init()
else:
    # NOTE(review): this branch only prints a message; whether the script
    # stops afterwards is not visible in this excerpt -- confirm.
    print("--parser must be either 'tatsu' or 'ply', not", args.parser)
示例#8
0
文件: test_hfst.py 项目: hfst/hfst
    for transition in state:
        print('%u\t%u\t%s\t%s\t%.2f' % (index, transition.get_target_state(), transition.get_input_symbol(), transition.get_output_symbol(), transition.get_weight()), file=f)
    if fsm.is_final_state(index):
        print('%s\t%.2f' % (index, fsm.get_final_weight(index)), file=f)
    index = index + 1

# Write the printed form of fsm to f as well, then close f.
print(fsm, file=f)
f.close()

# Exercise substitute() with a symbol -> symbol mapping and with a
# (input, output) pair -> pair mapping.
tr = hfst.HfstBasicTransducer(hfst.regex('foo'))
tr.substitute({'foo':'bar'})
tr.substitute({('foo','foo'):('bar','bar')})

# Round trip: transducer -> FSA -> transducer, using '^' as the separator
# in both directions.  The result must compare equal to the original.
tr = hfst.fst({'foo':'bar'})
fst = hfst.HfstBasicTransducer(tr)
fsa = hfst.fst_to_fsa(fst, '^')
fst = hfst.fsa_to_fst(fsa, '^')
TR = hfst.HfstTransducer(fst)
assert(TR.compare(tr))

# Visit every arc returned by states_and_transitions() and edit it:
# append '>' to the input symbol, prepend '<' to the output symbol and
# lower the weight by 0.5.
tr = hfst.regex('{foo}:{bar}|{FOO}:{BAR}')
fsm = hfst.HfstBasicTransducer(tr)
net = fsm.states_and_transitions()
for state in net:
    for arc in state:
        arc.set_input_symbol(arc.get_input_symbol() + '>')
        arc.set_output_symbol('<' + arc.get_output_symbol())
        arc.set_weight(arc.get_weight() - 0.5)

for state, arcs in enumerate(fsm):
    for arc in arcs:
示例#9
0
文件: twolcomp.py 项目: koskenni/twol
def _report_positive_examples(selector_fst, rule_fst):
    """Print whether rule_fst accepts all examples picked by selector_fst.

    selector_fst is modified in place: it is intersected with
    cfg.examples_fst and minimized.
    """
    selector_fst.intersect(cfg.examples_fst)
    selector_fst.minimize()
    if cfg.verbosity >= 20:
        paths = selector_fst.extract_paths(output='raw')
        print_raw_paths(paths[0:20])
    passed_fst = selector_fst.copy()
    passed_fst.intersect(rule_fst)
    if passed_fst.compare(selector_fst):
        print("All positive examples accepted")
        return
    lost_fst = selector_fst.copy()
    lost_fst.minus(passed_fst)
    lost_fst.minimize()
    print("** Some positive examples were rejected:")
    lost_paths = lost_fst.extract_paths(output='raw')
    print_raw_paths(lost_paths[0:20])


def _report_negative_examples(examples_fsa, examples_up_fsa, mixer_fst,
                              rule_fst):
    """Print whether rule_fst rejects the negative examples made by mixer_fst.

    The encoded examples are sent through mixer_fst, decoded back into
    a transducer, the genuine examples are removed, and the remainder
    is restricted to the input sides that occur in the examples.
    """
    neg_fsa = examples_fsa.copy()
    neg_fsa.compose(mixer_fst)
    neg_fsa.output_project()
    neg_fst = hfst.fsa_to_fst(neg_fsa, separator="^")
    neg_fst.minus(cfg.examples_fst)
    accepted_fst = examples_up_fsa.copy()
    accepted_fst.compose(neg_fst)
    accepted_fst.intersect(rule_fst)
    if accepted_fst.compare(hfst.empty_fst()):
        print("All negative examples rejected")
        return
    print("** Some negative examples accepted:")
    accepted_paths = accepted_fst.extract_paths(output='raw')
    print_raw_paths(accepted_paths[0:20])


def _write_fsts(filename, fst_lst):
    """Write the transducers in fst_lst to filename, always closing it."""
    stream = hfst.HfstOutputStream(filename=filename)
    try:
        for fst in fst_lst:
            stream.write(fst)
        stream.flush()
    finally:
        stream.close()


def main():
    """Compile two-level rules and test them against the examples.

    Command line driver: reads the examples (one ".fst" file or a list
    of example files), reads the rule file(s), compiles every rule that
    ends with ";" and, depending on --thorough, tests each rule against
    the positive examples and against generated negative examples.
    Optionally writes the compiled rules (--output), the examples that
    are not accepted by all rules (--lost) and the wrong strings that
    are accepted by all rules (--wrong) as FSTs.

    Exits through argparse with a usage message when no examples or no
    rule files are given.
    """
    version = cfg.timestamp(__file__)
    import argparse
    arpar = argparse.ArgumentParser(
        description="A compiler and tester for two-level rules."\
        " Version {}."\
        " See https://pytwolc.readthedocs.io/en/latest/index.html"\
        " or https://github.com/koskenni/twol"\
        " for more information.".format(version))
    arpar.add_argument(
        "-e", "--examples", action='store', nargs='+',
        help="""Either one name of a FST file that contains the examples or
            a list of names of files which contain the PSTR form examples
            used for compiling the rules.""",
        default=[None])
    arpar.add_argument(
        "-r", "--rules", action='store', nargs='+',
        help="""One or more files which contain the rules,
             either just one rule file or a file of defines
             as the first one and a part of the whole rule set
             as the second""",
        default=[None])
    arpar.add_argument(
        "-o", "--output",
        help="File to which write the compiled rules if a name is given",
        default="")
    arpar.add_argument(
        "-l", "--lost",
        help="File to which write the examples"\
        " that were not accepted by all rules"\
        " -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-w", "--wrong",
        help="file to which write the wrong strings"\
        " that are accepted by all rules -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-t", "--thorough",
        help="test each rule separately: 0 if no testing is desired,"\
        " 1 if against positive examples,"
        " 2 against both positive and negative examples."\
        " Default is 2.",
        type=int, choices=[0, 1, 2], default=2)
    arpar.add_argument(
        "--recursion",
        help="set the limit for recursion depth",
        type=int)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of  diagnostic output",
        type=int, default=0)

    args = arpar.parse_args()

    # Both list options default to [None].  Without this check a missing
    # option surfaces later as an AttributeError (None.endswith) or as a
    # TypeError when fileinput tries to open None.
    if args.examples == [None]:
        arpar.error("no examples given, use -e/--examples")
    if args.rules == [None]:
        arpar.error("no rule files given, use -r/--rules")

    cfg.verbosity = args.verbosity
    if args.recursion:
        sys.setrecursionlimit(args.recursion)

    if len(args.examples) == 1 and args.examples[0].endswith(".fst"):
        twexamp.read_fst(args.examples[0])
    else:
        twexamp.read_examples(args.examples)

    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.examples_fst, title="examples_fst")

    parser = twparser_init()

    examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

    examples_up_fsa = cfg.examples_fst.copy()
    examples_up_fsa.input_project()
    if cfg.verbosity >= 30:
        twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

    twrule.init()

    skip = False
    all_rules_fst_lst = []
    line_lst = []     # stripped lines of the rule being collected
    line_nl_lst = []  # the corresponding raw lines, for error reporting

    # The context manager closes the rule files even if compilation fails.
    with fileinput.input(files=args.rules) as rule_lines:
        # line_no counts lines cumulatively over all rule files.
        for line_no, line_nl in enumerate(rule_lines, start=1):
            if not line_lst:
                line_nl_lst = []
            line_nl_lst.append(line_nl)
            # Everything after the first "!" is a comment.
            line = line_nl.split('!', maxsplit=1)[0].strip()
            if line == "START":
                skip = False
                continue
            elif line == "STOP":
                skip = True
            if skip or (not line) or line.startswith("!"):
                continue
            line_lst.append(line)
            # A rule may span several lines; it is complete at ";".
            if not line.endswith(";"):
                continue
            rule_str = " ".join(line_lst)
            line_lst = []

            op, left, right = parse_rule(parser, rule_str, line_no,
                                         line_nl_lst)
            if op == "?" or not (left and right):
                continue

            if (args.thorough > 0 and op != "=") or cfg.verbosity > 0:
                print("\n")
                print(rule_str)

            # NOTE(review): the twrule functions receive `line`, i.e. only
            # the last physical line of a multi-line rule, not rule_str --
            # confirm that this is intended.
            if op == "=":
                if cfg.verbosity >= 10:
                    print(left, op)
                    twbt.ppfst(right)
                continue
            elif op == "=>":
                R, selector_fst, MIXe = twrule.rightarrow(
                    line, left, *right)
            elif op == "<=":
                R, selector_fst, MIXe = twrule.output_coercion(
                    line, left, *right)
            elif op == "<--":
                R, selector_fst, MIXe = twrule.input_coercion(
                    line, left, *right)
            elif op == "<=>":
                R, selector_fst, MIXe = twrule.doublearrow(
                    line, left, *right)
            elif op == "/<=":
                R, selector_fst, MIXe = twrule.center_exclusion(
                    line, left, *right)
            else:
                print("Error: not a valid type of a rule", op)
                continue

            if cfg.verbosity >= 10:
                twbt.ppfst(R)
            # Rules are only kept when some output file needs them.
            if args.lost or args.wrong or args.output:
                all_rules_fst_lst.append(R)
            if args.thorough > 0:
                _report_positive_examples(selector_fst, R)
            if args.thorough > 1 and op in {"=>", "<=", "<=>", "<--"}:
                _report_negative_examples(examples_fsa, examples_up_fsa,
                                          MIXe, R)

    if args.lost or args.wrong:
        # Everything the complete rule set allows for the example inputs.
        RESU = examples_up_fsa.copy()
        print(RESU.number_of_arcs(), "arcs in RESU")
        RESU.compose_intersect(tuple(all_rules_fst_lst))
        RESU.minimize()
    if args.lost:
        lost_positive_examples_fst = cfg.examples_fst.copy()
        lost_positive_examples_fst.minus(RESU)
        lost_positive_examples_fst.minimize()
        _write_fsts(args.lost, [lost_positive_examples_fst])
        print("wrote lost examples to", args.lost)
    if args.wrong:
        WRONG = RESU.copy()
        WRONG.subtract(cfg.examples_fst)
        WRONG.minimize()
        _write_fsts(args.wrong, [WRONG])
        print("wrote wrongly accepted examples to", args.wrong)
    if args.output:
        _write_fsts(args.output, all_rules_fst_lst)
        print("wrote {} rule transducers to {}".format(len(all_rules_fst_lst),
                                                        args.output))
    return