Пример #1
0
def lemma_deduplicate(matches):
    """Return the matches deduplicated by lemma, as a list.

    Accepts an iterable of AnalysisMatch objects (anything exposing a
    ``lemma_id`` attribute) and keeps exactly one match per lemma.  When
    several matches share a ``lemma_id`` the last one seen is kept, but it
    occupies the position at which that lemma first appeared.
    """
    by_lemma = OrderedDict()
    for match in matches:
        # Re-assigning an existing key keeps the key's original position.
        by_lemma[match.lemma_id] = match
    # ``values()`` is a live view on Python 3; materialise it so callers
    # really get the list the contract promises (indexable, comparable).
    return list(by_lemma.values())
Пример #2
0
def lemma_deduplicate(matches):
    """Return the matches deduplicated by lemma, as a list.

    Accepts an iterable of AnalysisMatch objects (anything exposing a
    ``lemma_id`` attribute) and keeps exactly one match per lemma.  When
    several matches share a ``lemma_id`` the last one seen is kept, but it
    occupies the position at which that lemma first appeared.
    """
    by_lemma = OrderedDict()
    for match in matches:
        # Re-assigning an existing key keeps the key's original position.
        by_lemma[match.lemma_id] = match
    # ``values()`` is a live view on Python 3; materialise it so callers
    # really get the list the contract promises (indexable, comparable).
    return list(by_lemma.values())
Пример #3
0
    def _prefix_fill(self, lemma_id, entries, suffix_tmp):
        if len(entries) == 0: return
        first_word = entries[0]

        prefix = first_word[0]

        okay = False
        while len(prefix) >= self.stem_minimum_length and not okay:
            okay = True
            if len(prefix)==0:
                break
            for wordform in entries:
                if not wordform[0].startswith(prefix):
                    okay = False
                    break
            if not okay: prefix = prefix[:-1]

        if okay:
            suffixes = [e[0][len(prefix):] for e in entries]
            tags = [tag for word,tag in entries]

            formcount = len(tags)
            suffixmap = OrderedDict.fromkeys( sorted(list(set(suffixes))) )

            for key in suffixmap.iterkeys():
                suffixmap[key] = []

            for suffix,tag in zip(suffixes,tags):
                suffixmap[suffix].append(tag)

            for key,val in suffixmap.iteritems():
                suffixmap[key] = tuple(val)

            frozenmap = tuple(suffixmap.items())
            if frozenmap in suffix_tmp:
                suffix_id = suffix_tmp[frozenmap]
            else:
                suffix_id = len(suffix_tmp)
                suffix_tmp[frozenmap] = suffix_id

            self.prefix_map[prefix].append( (lemma_id, suffix_id, formcount) )
            return prefix

        else:
            for word,tag in set(entries):
                frozenmap = (('', (tag,)),)
                if frozenmap in suffix_tmp:
                    suffix_id = suffix_tmp[frozenmap]
                else:
                    suffix_id = len(suffix_tmp)
                    suffix_tmp[frozenmap] = suffix_id

                self.prefix_map[word].append( (lemma_id, suffix_id, 1) )