示例#1
0
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file
    """
    import hfst
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        #print(in_quoted, outsym)### tilts if insym contains bad chars
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
示例#2
0
def read_examples(filename="test.pstr", build_fsts=True):
    """Reads the examples from the file whose name is 'filename'.
    
    The file must contain one example per line and each line consists of
    a space separated sequence of pair-symbols.  The examples are processed into 
    """
    if build_fsts:
        import hfst
        examples_bfst = hfst.HfstBasicTransducer()
    exfile = open(filename, "r")
    for line_nl in exfile:
        line = line_nl.strip()
        if not line or line.startswith("!"):
            continue
        pairsym_lst = re.split("\s+", line)
        symbol_pair_lst = [
            cfg.pairsym2sympair(pairsym) for pairsym in pairsym_lst
        ]
        # print("symbol_pair_lst:", symbol_pair_lst) ##
        pair_symbol_str = " ".join([
            cfg.sympair2pairsym(insym, outsym)
            for insym, outsym in symbol_pair_lst
        ])
        # print("pair_symbol_lst:", pair_symbol_lst) ##
        cfg.example_lst.append(pair_symbol_str)
        cfg.example_set.add(pair_symbol_str)  # spaces normalized
        #LINE_FST = hfst.tokenized_fst(symbol_pair_lst)
        # twbt.printfst(LINE_FST, True) ##
        if build_fsts:
            examples_bfst.disjunct(symbol_pair_lst, 0)
        for insym, outsym in symbol_pair_lst:
            cfg.symbol_pair_set.add((insym, outsym))
    exfile.close()
    if cfg.verbosity >= 30:
        print("List of examples:", cfg.example_lst)
        print("List of alphabet symbol pairs:", sorted(cfg.symbol_pair_set))
    if build_fsts:
        cfg.examples_fst = hfst.HfstTransducer(examples_bfst)
        cfg.examples_fst.set_name(filename)
        cfg.examples_fst.minimize()
        if cfg.verbosity >= 30:
            twbt.ppfst(cfg.examples_fst, False,
                       title="Example file as FST")  ##
    for insym, outsym in cfg.symbol_pair_set:
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    for insym, outsym in cfg.symbol_pair_set:
        pair_symbol = cfg.sympair2pairsym(insym, outsym)
        cfg.pair_symbol_set.add(pair_symbol)
    if build_fsts:
        pair_symbol_lst = [
            insym + ':' + outsym for insym, outsym in cfg.symbol_pair_set
        ]
        pair_symbol_str = " ".join(sorted(pair_symbol_lst))
        # print("symbol pairs:", pair_symbol_str) ##
        cfg.examples_fst.set_property("x-pair_symbols", pair_symbol_str)
    return
示例#3
0
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file
    """
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
示例#4
0
def relevant_contexts(pair_symbol):
    """Select positive and negative contexts for a given pair-symbol
    
    pair_symbol -- the pair-symbol for which the contexts are selected
    
    returns a tuple of:
    
    pos_context_set -- a set of contexts in the examples where the
    pair_symbol occurs
    
    neg_context_set -- a set of contexts where the input-symbol of the
    pair_symbol occurs with another output-symbol but so that there is
    no example in the example_set where the pair_symbol occurs in such a
    context
    """
    input_symbol, output_symbol = cfg.pairsym2sympair(pair_symbol)
    positive_context_set = set()
    negative_context_set = set()
    pairsymlist = [
        re.sub(r"([}{])", r"\\\1", psym)
        for psym in pair_symbols_for_input[input_symbol]
    ]
    # print("pairsymlist:", pairsymlist) ##
    pattern = re.compile("|".join(pairsymlist))
    for example in cfg.example_set:
        for m in pattern.finditer(example):
            i1 = m.start()
            i2 = m.end()
            # print('"' + example[0:i1] +'"', '"' + example[i2:] + '"') ##
            left_context = ".#. " + example[0:i1 - 1]
            centre = example[i1:i2]
            if i2 >= len(example):
                right_context = ".#."
            else:
                right_context = example[i2 + 1:] + " .#."
            context = (left_context, right_context)
            # print(centre, context) ##
            if centre == pair_symbol:
                positive_context_set.add(context)
            else:
                negative_context_set.add(context)
    negative_context_set = negative_context_set - positive_context_set
    return positive_context_set, negative_context_set
示例#5
0
def context_to_output_str(pairsym_str):
    pairsym_lst = pairsym_str.split(" ")
    sympair_lst = [cfg.pairsym2sympair(psym) for psym in pairsym_lst]
    outsym_lst = [outsym for insym, outsym in sympair_lst]
    return "".join(outsym_lst)
示例#6
0
    
    twexamp.read_examples(filename=args.examples, build_fsts=False)
    if cfg.verbosity >= 5:
        print("--- all examples read in ---")
    
    for insym in cfg.input_symbol_set:
        pair_symbols_for_input[insym] = set()
    for insym, outsym in cfg.symbol_pair_set:
        pair_symbol = cfg.sympair2pairsym(insym, outsym)
        pair_symbols_for_input[insym].add(pair_symbol)

    if args.symbol:
        pair_set = pair_symbols_for_input[args.symbol]
        pair_lst = []
        for pairsym in pair_set:
            insym, outsym = cfg.pairsym2sympair(pairsym)
            pair_lst.append((insym, outsym))
        if cfg.verbosity >= 10:
            print("pair_lst:", pair_lst)
    else:
        pair_lst = sorted(cfg.symbol_pair_set)

    for insym, outsym in pair_lst:
        if len(pair_symbols_for_input[insym]) <= 1:
            continue
        pair_symbol = cfg.sympair2pairsym(insym, outsym)
        posi_contexts, nega_contexts = relevant_contexts(pair_symbol)
        pos_contexts, neg_contexts = minimal_contexts(pair_symbol,
                                                      posi_contexts.copy(),
                                                      nega_contexts.copy())
        if len(pos_contexts) <= len(neg_contexts) or cfg.verbosity > 0: