예제 #1
0
 def get_all_outputs(self):
     """Return the output string of every path through this object's transducer.

     Each arc's output label is resolved through the transducer symbol table
     of ``SegmentTable()``. The epsilon symbol (``u"\\u03b5"``) and the
     ``MORPHEME_BOUNDARY`` / ``WORD_BOUNDARY`` symbols are left out of the
     strings that are built.

     Returns:
         A list with one string per path, in the order ``paths()`` yields them.
     """
     fst = self.get_transducer()
     symbols = SegmentTable().transducer_symbol_table
     # Symbols that never appear in an output string.
     skipped = (u"\u03b5", MORPHEME_BOUNDARY, WORD_BOUNDARY)

     results = []
     for path in fst.paths():
         labels = (symbols.find(arc.olabel) for arc in path)
         results.append(
             "".join(label for label in labels if label not in skipped)
         )
     return results
예제 #2
0
    def get_all_outputs(self, with_noise=True):
        """Return the output string of every path through the transducer.

        The transducer comes from ``get_transducer`` (``with_noise`` is
        forwarded to it) and is passed through ``minimize_transducer`` when the
        ``"MINIMIZE_TRANSDUCER"`` configuration entry is truthy.

        Args:
            with_noise: Forwarded unchanged to ``get_transducer``.

        Returns:
            A list with one string per path. Epsilon (``u"\\u03b5"``),
            ``MORPHEME_BOUNDARY`` and ``WORD_BOUNDARY`` symbols are omitted
            from the strings.
        """
        fst = self.get_transducer(with_noise=with_noise)
        if configurations["MINIMIZE_TRANSDUCER"]:
            fst = self.minimize_transducer(fst)

        symbols = SegmentTable().transducer_symbol_table
        skipped = (u"\u03b5", MORPHEME_BOUNDARY, WORD_BOUNDARY)

        def render(path):
            # Concatenate the output symbols of one path, dropping skipped ones.
            kept = []
            for arc in path:
                label = symbols.find(arc.olabel)
                if label in skipped:
                    continue
                kept.append(label)
            return "".join(kept)

        return [render(path) for path in fst.paths()]
예제 #3
0
def get_transducer_outputs(transducer, limit=float("inf")):
    """Collect the output strings of the paths through ``transducer``.

    Each arc's output label is resolved through the transducer symbol table
    of ``SegmentTable()``; epsilon symbols (``u"\\u03b5"``) are dropped.

    Args:
        transducer: Object whose ``paths()`` yields iterables of arcs, each
            arc exposing an ``olabel`` attribute.
        limit: Maximum number of output strings to return. Defaults to no
            limit.

    Returns:
        A list of at most ``limit`` output strings, in the order ``paths()``
        yields them. A ``limit`` of zero or less gives an empty list.
    """
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    outputs = []
    if limit <= 0:
        return outputs

    for path in transducer.paths():
        symbols = (transducer_symbol_table.find(arc.olabel) for arc in path)
        outputs.append(
            "".join(symbol for symbol in symbols if symbol != u"\u03b5")
        )
        # Stop as soon as the cap is reached. The previous check
        # (`counter > limit`, tested after appending) let limit + 1 outputs
        # through.
        if len(outputs) >= limit:
            break
    return outputs
    def get_from_pyfst_transducer(cls, transducer):
        """Build a ``ParsingNFA`` from the states and arcs of ``transducer``.

        The returned object has these attributes populated:

        * ``initial_state`` / ``final_states``: state numbers (as strings) of
          the states flagged ``initial`` / ``final``.
        * ``arcs_dict``: ``{state: {output_symbol: [next_state, ...]}}``.
        * ``probabilities``: ``{state: [(output_symbol, next_state), ...]}``.
        * ``transition_matrix``: square array with one row and column per
          state, filled with ``NO_TRANSITION_IDX`` and then set, for each arc,
          to the arc's output label (``NULL_SEGMENT_IDX`` for null-segment
          arcs). A later arc between the same two states overwrites the
          earlier entry.

        Epsilon output symbols (``u"\\u03b5"``) are stored as ``NULL_SEGMENT``.

        Args:
            transducer: Transducer to convert; iterating it yields states and
                iterating a state yields its arcs.

        Returns:
            The populated ``ParsingNFA`` instance.
        """
        symbols = SegmentTable().transducer_symbol_table
        state_number = re.compile(r".*#(\w*).*")

        nfa = ParsingNFA()
        nfa.final_states = []
        outgoing = {}
        labelled_targets = {}

        state_count = len(list(transducer.states))
        matrix = np.ones((state_count, state_count)) * NO_TRANSITION_IDX

        for state in transducer:
            # str(state) looks like "<StdState #x with y arcs>"; pull out x.
            source = state_number.match(str(state)).group(1)
            if state.initial:
                nfa.initial_state = source
            if state.final:
                nfa.final_states.append(source)

            for arc in state:
                target = str(arc.nextstate)
                label = symbols.find(arc.olabel)
                if label == u"\u03b5":
                    label = NULL_SEGMENT

                # Both mappings gain the source key the first time it is seen.
                outgoing.setdefault(source, {}).setdefault(label, []).append(target)
                labelled_targets.setdefault(source, []).append((label, target))

                if label == NULL_SEGMENT:
                    matrix[int(source), int(target)] = NULL_SEGMENT_IDX
                else:
                    matrix[int(source), int(target)] = arc.olabel

        nfa.arcs_dict = outgoing
        nfa.probabilities = labelled_targets
        nfa.transition_matrix = matrix
        return nfa
예제 #5
0
def pyfst_to_dfa(transducer, alphabet):
    """Convert a pyfst transducer into a DFA over its output symbols.

    An ``NFA`` is built whose states are the transducer's states (indexed in
    iteration order) and whose transitions are labelled with each arc's
    output symbol; epsilon outputs (``u"\\u03b5"``) become
    ``FAdo.common.Epsilon``. The result of ``nfa.toDFA()`` is returned.

    Args:
        transducer: Transducer to convert; iterating it yields states and
            iterating a state yields its arcs.
        alphabet: Value assigned to the NFA's ``Sigma``.

    Returns:
        The DFA produced by ``NFA.toDFA()``.

    Raises:
        ValueError: If an arc points at a state that was not seen while
            iterating the transducer.
    """
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    state_name_pattern = re.compile(r".*#(\w*).*")

    def state_name(state):
        # str(state) looks like "<StdState #x with y arcs>"; extract x.
        return state_name_pattern.match(str(state)).group(1)

    nfa = NFA()
    nfa.Sigma = alphabet
    nfa.Initial = set()

    # Map each state name to its position so lookups are O(1) instead of
    # a linear list.index() scan per state and per arc.
    state_names = []
    index_by_name = {}
    for state in transducer:
        name = state_name(state)
        state_names.append(name)
        # setdefault keeps the first position for a repeated name, exactly
        # as list.index() did.
        state_idx = index_by_name.setdefault(name, len(state_names) - 1)
        if state.initial:
            nfa.Initial.add(state_idx)
        if state.final:
            nfa.Final.add(state_idx)

    # Arcs are handled in a second pass because an arc may target a state
    # that appears later in the iteration.
    delta = {}
    for state in transducer:
        source = index_by_name[state_name(state)]
        for arc in state:
            target_name = str(arc.nextstate)
            try:
                target = index_by_name[target_name]
            except KeyError:
                # Keep the exception type list.index() used to raise.
                raise ValueError("%r is not in list" % (target_name,))
            output_symbol = transducer_symbol_table.find(arc.olabel)
            if output_symbol == u"\u03b5":
                output_symbol = FAdo.common.Epsilon
            delta.setdefault(source, {}).setdefault(output_symbol, set()).add(target)

    nfa.delta = delta
    nfa.States = state_names

    return nfa.toDFA()