Exemplo n.º 1
0
 def _pair_from_tree(self, tree):
     if (tree.node != 'Pair'):
         raise RuntimeException('expected Pair, got ' + str(tree))
     if len(tree) == 1:
         return KimmoPair(tree[0], tree[0])
     else:
         return KimmoPair(tree[0], tree[2])
Exemplo n.º 2
0
 def from_dfa_dict(name, states, subsets):
     """Build a KimmoFSARule from a dictionary description of a DFA.

     :param name: rule name, passed through to ``KimmoFSARule``.
     :param states: mapping from a state name to a mapping of
         ``label -> target``.  The last whitespace-separated word of the
         state name is used as the state id; the state is added as final
         unless the first word starts with ``'rej'``.  A label equal to
         ``'others'`` (in any letter case) is a catch-all.
     :param subsets: passed through to ``KimmoFSARule``.
     :return: ``KimmoFSARule(name, fsa, subsets)``.
     """
     def is_others(label):
         # The catch-all label is recognised case-insensitively everywhere,
         # so 'Others' is never mistaken for a literal pair.
         return label.lower() == 'others'

     fsa = FSA()
     # Every pair named explicitly anywhere in the rule, plus the '@' pair.
     pairs = set([KimmoPair.make('@')])
     for trans in states.values():
         for label in trans:
             if not is_others(label):
                 pairs.add(KimmoPair.make(label))
     for (statename, trans) in states.items():
         parts = statename.split()
         source = parts[-1]
         if not parts[0].startswith('rej'):
             fsa.add_final(source)

         # A state called begin/Begin/1 becomes the start state only while
         # fsa.start() still reports 0; start/Start always takes over.
         if fsa.start() == 0 and source in ['begin', 'Begin', '1', 1]:
             fsa.set_start(source)
         if source in ['start', 'Start']:
             fsa.set_start(source)

         # Pairs this state handles explicitly; 'others' covers the rest.
         used_pairs = set()
         for label in trans:
             if not is_others(label):
                 used_pairs.add(KimmoPair.make(label))
         for label, target in trans.items():
             if is_others(label):
                 fsa.insert_safe(source, KimmoPair.make('@'), target)
                 for pair in pairs.difference(used_pairs):
                     fsa.insert_safe(source, pair, target)
             else:
                 fsa.insert_safe(source, KimmoPair.make(label), target)
     return KimmoFSARule(name, fsa, subsets)
Exemplo n.º 3
0
    def from_dfa_dict(name, states, subsets):
        """Build a KimmoFSARule from a dictionary description of a DFA.

        :param name: rule name, passed through to ``KimmoFSARule``.
        :param states: mapping from a state name to a mapping of
            ``label -> target``.  The last whitespace-separated word of the
            state name is used as the state id; the state is added as final
            unless the first word starts with ``'rej'``.  A label equal to
            ``'others'`` (in any letter case) is a catch-all.
        :param subsets: passed through to ``KimmoFSARule``.
        :return: ``KimmoFSARule(name, fsa, subsets)``.
        """
        def is_others(label):
            # The catch-all label is recognised case-insensitively
            # everywhere, so 'Others' is never mistaken for a literal pair.
            return label.lower() == 'others'

        fsa = FSA()
        # Every pair named explicitly anywhere in the rule, plus '@'.
        pairs = set([KimmoPair.make('@')])
        for trans in states.values():
            for label in trans:
                if not is_others(label):
                    pairs.add(KimmoPair.make(label))
        for (statename, trans) in states.items():
            parts = statename.split()
            source = parts[-1]
            if not parts[0].startswith('rej'):
                fsa.add_final(source)

            # A state called begin/Begin/1 becomes the start state only
            # while fsa.start() still reports 0; start/Start always wins.
            if fsa.start() == 0 and source in ['begin', 'Begin', '1', 1]:
                fsa.set_start(source)
            if source in ['start', 'Start']:
                fsa.set_start(source)

            # Pairs this state handles explicitly; 'others' covers the rest.
            used_pairs = set()
            for label in trans:
                if not is_others(label):
                    used_pairs.add(KimmoPair.make(label))
            for label, target in trans.items():
                if is_others(label):
                    fsa.insert_safe(source, KimmoPair.make('@'), target)
                    for pair in pairs.difference(used_pairs):
                        fsa.insert_safe(source, pair, target)
                else:
                    fsa.insert_safe(source, KimmoPair.make(label), target)
        return KimmoFSARule(name, fsa, subsets)
Exemplo n.º 4
0
 def parse_table(name, table, subsets):
     """Parse a textual FSA state table into a KimmoFSARule.

     Layout of ``table`` (split on newlines): line 0 is ignored here,
     lines 1 and 2 hold the whitespace-separated first and second
     components of each column's pair, and every non-blank line from
     line 3 on is a row ``<state><.|:> <target> <target> ...`` where
     ``:`` marks the state as final.  The first row whose state is seen
     while ``fsa.start()`` is still 0 becomes the start state.

     :param name: rule name, used in error messages and passed on.
     :param table: the table text.
     :param subsets: passed through to ``KimmoFSARule``.
     :return: ``KimmoFSARule(name, fsa, subsets)``.
     :raises ValueError: if the table has fewer than four lines, the two
         pair lines differ in length, a row cannot be parsed, or a row
         has a different number of targets than there are pairs.
     """
     lines = table.split('\n')
     if len(lines) < 4:
         raise ValueError(
             "Rule %s has too few lines to be an FSA table." % name)
     pairs1 = lines[1].strip().split()
     pairs2 = lines[2].strip().split()
     if len(pairs1) != len(pairs2):
         raise ValueError(
             "Rule %s has pair definitions that don't line up." % name)
     pairs = [KimmoPair(p1, p2) for p1, p2 in zip(pairs1, pairs2)]
     finals = []
     fsa = FSA()
     # Compiled once instead of being re-looked-up for every row.
     row_pattern = re.compile(r'(\w+)(\.|:)\s*(.*)')
     for line in lines[3:]:
         line = line.strip()
         if not line:
             continue
         groups = row_pattern.match(line)
         if groups is None:
             raise ValueError(
                 "Can't parse this line of the state table for rule %s:\n%s"
                 % (name, line))
         state, char, morestates = groups.groups()
         if fsa.start() == 0:
             fsa.set_start(state)
         if char == ':':
             finals.append(state)
         fsa.add_state(state)
         morestates = morestates.split()
         if len(morestates) != len(pairs):
             raise ValueError(
                 "Rule %s has a row of the wrong length:\n%s\ngot %d items, should be %d"
                 % (name, line, len(morestates), len(pairs)))
         for pair, nextstate in zip(pairs, morestates):
             fsa.insert_safe(state, pair, nextstate)
     fsa.set_final(finals)
     return KimmoFSARule(name, fsa, subsets)
Exemplo n.º 5
0
 def _from_yaml_dict(cls, map):
     lexicon = map.get('lexicon')
     if lexicon:
         lexicon = KimmoMorphology.load(lexicon)
     subsets = map['subsets']
     for key, value in subsets.items():
         if isinstance(value, basestring):
             subsets[key] = value.split()
     defaults = map['defaults']
     if isinstance(defaults, basestring):
         defaults = defaults.split()
     defaults = [KimmoPair.make(text) for text in defaults]
     ruledic = map['rules']
     rules = []
     for (name, rule) in ruledic.items():
         if isinstance(rule, dict):
             rules.append(KimmoFSARule.from_dfa_dict(name, rule, subsets))
         elif isinstance(rule, basestring):
             if rule.strip().startswith('FSA'):
                 rules.append(KimmoFSARule.parse_table(name, rule, subsets))
             else:
                 rules.append(KimmoArrowRule(name, rule, subsets))
         else:
             raise ValueError, "Can't recognize the data structure in '%s' as a rule: %s" % (
                 name, rule)
     return cls(subsets, defaults, rules, lexicon)
Exemplo n.º 6
0
 def _from_yaml_dict(cls, map):
     lexicon = map.get('lexicon')
     if lexicon:
         lexicon = KimmoMorphology.load(lexicon)
     subsets = {}
     if 'subsets' in map:
         map['subsets']
         for key, value in subsets.items():
             if isinstance(value, basestring):
                 subsets[key] = value.split()
     defaults = map['defaults']
     if isinstance(defaults, basestring):
         defaults = defaults.split()
     defaults = [KimmoPair.make(text) for text in defaults]
     rules = []
     return cls(subsets, defaults, rules, lexicon)
Exemplo n.º 7
0
 def _from_yaml_dict(cls, map):
     lexicon = map.get('lexicon')
     if lexicon:
         lexicon = KimmoMorphology.load(lexicon)
     subsets = {}
     if 'subsets' in map:
         map['subsets']
         for key, value in subsets.items():
             if isinstance(value, basestring):
                 subsets[key] = value.split()
     defaults = map['defaults']
     if isinstance(defaults, basestring):
         defaults = defaults.split()
     defaults = [KimmoPair.make(text) for text in defaults]
     rules = []
     return cls(subsets, defaults, rules, lexicon)
Exemplo n.º 8
0
 def complete_fsa(self, fsa, fail_state=None):
     """Return a copy of ``fsa`` with a transition for every known pair.

     Works on a deep copy; the argument is not modified.  For each state
     and each pair in ``self._pairs`` that has no transition, the
     transition of the first pair in ``sort_subsets(self._pairs,
     self._subsets)`` that is present in the state and ``includes`` the
     missing pair is reused; if none matches, the pair goes to
     ``fail_state``.  Empty target lists are also redirected to
     ``fail_state``.

     :param fsa: the automaton to complete.
     :param fail_state: state used for otherwise-undefined transitions.
         If None, a state is added with ``fsa.add_state('Fail')`` and
         given a ``'@'`` self-loop.
     :return: the completed copy, with reverse transitions rebuilt.
     """
     fsa = deepcopy(fsa)
     if fail_state is None:
         fail_state = fsa.add_state('Fail')
         fsa.insert('Fail', KimmoPair.make('@'), 'Fail')
     sorted_pairs = sort_subsets(self._pairs, self._subsets)
     for state in fsa.states():
         trans = fsa._transitions[state]
         for pair in self._pairs:
             if pair not in trans:
                 for sp in sorted_pairs:
                     if sp in trans and sp.includes(pair, self._subsets):
                         trans[pair] = trans[sp]
                         break
                 else:
                     # Only fall back to the fail state when no covering
                     # pair was found; previously this assignment ran
                     # unconditionally and overwrote the match above.
                     trans[pair] = [fail_state]
             if trans[pair] == []:
                 trans[pair] = [fail_state]
     fsa._build_reverse_transitions()
     return fsa
Exemplo n.º 9
0
 def complete_fsa(self, fsa, fail_state=None):
     """Return a copy of ``fsa`` with a transition for every known pair.

     Works on a deep copy; the argument is not modified.  For each state
     and each pair in ``self._pairs`` that has no transition, the
     transition of the first pair in ``sort_subsets(self._pairs,
     self._subsets)`` that is present in the state and ``includes`` the
     missing pair is reused; if none matches, the pair goes to
     ``fail_state``.  Empty target lists are also redirected to
     ``fail_state``.

     :param fsa: the automaton to complete.
     :param fail_state: state used for otherwise-undefined transitions.
         If None, a state is added with ``fsa.add_state('Fail')`` and
         given a ``'@'`` self-loop.
     :return: the completed copy, with reverse transitions rebuilt.
     """
     fsa = deepcopy(fsa)
     if fail_state is None:
         fail_state = fsa.add_state('Fail')
         fsa.insert('Fail', KimmoPair.make('@'), 'Fail')
     sorted_pairs = sort_subsets(self._pairs, self._subsets)
     for state in fsa.states():
         trans = fsa._transitions[state]
         for pair in self._pairs:
             if pair not in trans:
                 for sp in sorted_pairs:
                     if sp in trans and sp.includes(pair, self._subsets):
                         trans[pair] = trans[sp]
                         break
                 else:
                     # Only fall back to the fail state when no covering
                     # pair was found; previously this assignment ran
                     # unconditionally and overwrote the match above.
                     trans[pair] = [fail_state]
             if trans[pair] == []:
                 trans[pair] = [fail_state]
     fsa._build_reverse_transitions()
     return fsa
Exemplo n.º 10
0
 def _from_yaml_dict(cls, map):
     lexicon = map.get('lexicon')
     if lexicon:
         lexicon = KimmoMorphology.load(lexicon)
     subsets = map['subsets']
     for key, value in subsets.items():
         if isinstance(value, basestring):
             subsets[key] = value.split()
     defaults = map['defaults']
     if isinstance(defaults, basestring):
         defaults = defaults.split()
     defaults = [KimmoPair.make(text) for text in defaults]
     ruledic = map['rules']
     rules = []
     for (name, rule) in ruledic.items():
         if isinstance(rule, dict):
             rules.append(KimmoFSARule.from_dfa_dict(name, rule, subsets))
         elif isinstance(rule, basestring):
             if rule.strip().startswith('FSA'):
                 rules.append(KimmoFSARule.parse_table(name, rule, subsets))
             else: rules.append(KimmoArrowRule(name, rule, subsets))
         else:
             raise ValueError, "Can't recognize the data structure in '%s' as a rule: %s" % (name, rule)
     return cls(subsets, defaults, rules, lexicon)
Exemplo n.º 11
0
def _pairify(state):
    newstate = {}
    for label, targets in state.items():
        newstate[KimmoPair.make(label)] = targets
    return newstate
Exemplo n.º 12
0
    def _generate(self, pairs, state_list, morphology_state=None, word='',
    lexical=None, surface=None, features='', log=None, origsurface=None):
        feat = None
        if morphology_state:
            morph = self._morphology
            morphed = False
            for state, feat in morph.next_states(morphology_state, word):
                if feat is not None:
                    #log.addFeature(feat)
                    newfeat = combine_features(features, feat)
                else:
                    newfeat = features
                    #log.clearFeatures()
                for result in self._generate(pairs, state_list, state, '', lexical, surface, newfeat, log, origsurface):
                    #log.clearFeatures()
                    log.addFeature(feat)
                    yield result
                    return # only first result needed
                    morphed = True
                    #log.clearFeatures()
            if morphed:
                #log.clearFeatures()
                return
            lexical_chars = list(morph.valid_lexical(morphology_state,
            word, self._pair_alphabet.union(set([KimmoPair.make(x) for x in origsurface])))) + list(self._null)
        else:
            #log.clearFeatures()
            lexical_chars = None
        if lexical == '' or surface == '':
            if morphology_state is None or morphology_state.lower() == 'end':
                # check that all rules are in accepting states
                for r in range(len(self._rules)):
                    rule = self._rules[r]
                    state = state_list[r]
                    if state not in rule.fsa().finals():
                        log.clearFeatures()
                        return
                if log:
                    log.succeed(pairs)
                    #if feat is not None:
                    #    log.addFeature(feat)
                #log.clearFeatures()
                yield pairs, features
                #log.clearFeatures()
                return
            
        #print len(lexical_chars)
        npa = self._pair_alphabet.union(set([KimmoPair.make(x) for x in origsurface]))
        next_pairs = [p for p in npa if
          (lexical is None or startswith(lexical, self._pairtext(p.input()))) and
          (surface is None or startswith(surface, self._pairtext(p.output())))]
        for pair in next_pairs:
            if pair.input() == self._null and pair.output() == self._null:
                print "Warning: The pair 0:0 would be an infinite loop. Ignoring it."
                log.clearFeatures()
                continue
            if lexical_chars is not None and pair.input() not in lexical_chars:
                #log.clearLastFeature()
                continue
            new_states = state_list[:]
            for r in range(len(self._rules)):
                rule = self._rules[r]
                state = state_list[r]
                next_state = self._advance_rule(rule, state, pair)
                new_states[r] = next_state
            
            newword = word + self._pairtext(pair.input())

            if log:
                log.step(pairs, pair, self._rules, state_list, new_states,
                morphology_state, newword)
                #if feat:
                #    log.addFeature(feat)
            fail = False
            for new_state in new_states:
                if new_state is None or str(new_state) == '0'\
                or str(new_state) == 'reject':
                    fail = True
                    break
            if fail: continue
            newlex, newsurf = lexical, surface
            if lexical: newlex = lexical[len(self._pairtext(pair.input())):]
            if surface: newsurf = surface[len(self._pairtext(pair.output())):]
            for result in self._generate(pairs+[pair], new_states, morphology_state, newword, newlex, newsurf, features, log, origsurface):
                yield result
                return # only first result needed
Exemplo n.º 13
0
def _pairify(state):
    newstate = {}
    for label, targets in state.items():
        newstate[KimmoPair.make(label)] = targets
    return newstate
Exemplo n.º 14
0
    def _generate(self,
                  pairs,
                  state_list,
                  morphology_state=None,
                  word='',
                  lexical=None,
                  surface=None,
                  features='',
                  log=None,
                  origsurface=None):
        feat = None
        if morphology_state:
            morph = self._morphology
            morphed = False
            for state, feat in morph.next_states(morphology_state, word):
                if feat is not None:
                    #log.addFeature(feat)
                    newfeat = combine_features(features, feat)
                else:
                    newfeat = features
                    #log.clearFeatures()
                for result in self._generate(pairs, state_list, state, '',
                                             lexical, surface, newfeat, log,
                                             origsurface):
                    #log.clearFeatures()
                    log.addFeature(feat)
                    yield result
                    return  # only first result needed
                    morphed = True
                    #log.clearFeatures()
            if morphed:
                #log.clearFeatures()
                return
            lexical_chars = list(
                morph.valid_lexical(
                    morphology_state, word,
                    self._pair_alphabet.union(
                        set([KimmoPair.make(x)
                             for x in origsurface])))) + list(self._null)
        else:
            #log.clearFeatures()
            lexical_chars = None
        if lexical == '' or surface == '':
            if morphology_state is None or morphology_state.lower() == 'end':
                # check that all rules are in accepting states
                for r in range(len(self._rules)):
                    rule = self._rules[r]
                    state = state_list[r]
                    if state not in rule.fsa().finals():
                        log.clearFeatures()
                        return
                if log:
                    log.succeed(pairs)
                    #if feat is not None:
                    #    log.addFeature(feat)
                #log.clearFeatures()
                yield pairs, features
                #log.clearFeatures()
                return

        #print len(lexical_chars)
        npa = self._pair_alphabet.union(
            set([KimmoPair.make(x) for x in origsurface]))
        next_pairs = [
            p for p in npa
            if (lexical is None or startswith(lexical, self._pairtext(p.input(
            )))) and (surface is None
                      or startswith(surface, self._pairtext(p.output())))
        ]
        for pair in next_pairs:
            if pair.input() == self._null and pair.output() == self._null:
                print "Warning: The pair 0:0 would be an infinite loop. Ignoring it."
                log.clearFeatures()
                continue
            if lexical_chars is not None and pair.input() not in lexical_chars:
                #log.clearLastFeature()
                continue
            new_states = state_list[:]
            for r in range(len(self._rules)):
                rule = self._rules[r]
                state = state_list[r]
                next_state = self._advance_rule(rule, state, pair)
                new_states[r] = next_state

            newword = word + self._pairtext(pair.input())

            if log:
                log.step(pairs, pair, self._rules, state_list, new_states,
                         morphology_state, newword)
                #if feat:
                #    log.addFeature(feat)
            fail = False
            for new_state in new_states:
                if new_state is None or str(new_state) == '0'\
                or str(new_state) == 'reject':
                    fail = True
                    break
            if fail: continue
            newlex, newsurf = lexical, surface
            if lexical: newlex = lexical[len(self._pairtext(pair.input())):]
            if surface: newsurf = surface[len(self._pairtext(pair.output())):]
            for result in self._generate(pairs + [pair], new_states,
                                         morphology_state, newword, newlex,
                                         newsurf, features, log, origsurface):
                yield result
                return  # only first result needed