示例#1
0
class Synonyms:
    """Synonym lookup backed by a thesaurus file, with a lemma-based fallback."""

    def __init__(self, filename='./data/th_uk_UA.dat.txt'):
        """Load the thesaurus and build a secondary index keyed by lemma.

        Args:
            filename: Path handed to ``parse_synonyms``.
        """
        self.lemmatizer = Lemmatizer()
        self.synonyms = parse_synonyms(filename)
        # Secondary index keyed by the lemma of each headword. When several
        # headwords share a lemma, the one that comes last in
        # ``self.synonyms`` wins.
        self.lemmed_synonyms = {
            self.lemmatizer.lemma(headword): related
            for headword, related in self.synonyms.items()
        }

    def get_synonyms(self, word):
        """Return the synonyms stored for ``word``.

        An exact match on the word takes priority; otherwise the word is
        lemmatized and looked up in the lemma index.

        Args:
            word: The word to look up.

        Returns:
            The stored synonyms, or an empty list when neither the word nor
            its lemma is known.
        """
        if word not in self.synonyms:
            base_form = self.lemmatizer.lemma(word)
            return self.lemmed_synonyms.get(base_form, [])
        return self.synonyms[word]
示例#2
0
def parse_synonyms(filename='./data/th_uk_UA.dat.txt', encoding='utf-8'):
    """Load a thesaurus file into an ordered ``word -> synonyms`` mapping.

    The file is consumed as two-line entries: a headword line (any line that
    does not start with ``'|'``) followed by one more line. Both raw lines
    are handed to ``_process``, which returns the word and its synonyms
    (presumably after stripping/splitting -- confirm against ``_process``).
    Lines starting with ``'|'`` that appear where a headword is expected are
    skipped, and a trailing headword with no following line is ignored.

    Synonyms whose lemma equals the lemma of the word itself are dropped;
    an entry is stored only if at least one synonym survives that filter.

    Args:
        filename: Path to the thesaurus text file.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform locale encoding.

    Returns:
        An ``OrderedDict`` mapping each word to its filtered list of
        synonyms.
    """
    lemmatizer = Lemmatizer()
    synonyms = OrderedDict()
    with open(filename, encoding=encoding) as f:
        headword_line = None
        for line in f:
            if headword_line is None:
                # Waiting for the start of an entry; '|' lines cannot open one.
                if not line.startswith('|'):
                    headword_line = line
                continue

            word, syns = _process(headword_line, line)
            # The entry is consumed whether or not anything gets stored.
            headword_line = None

            # Clean synonyms from too similar words (same lemma as the word).
            lemma_word = lemmatizer.lemma(word)
            syns_filtered = [s for s in syns if lemmatizer.lemma(s) != lemma_word]
            if syns_filtered:
                synonyms[word] = syns_filtered

    return synonyms
示例#3
0
 def __init__(self, filename='./data/th_uk_UA.dat.txt'):
     """Load synonyms from ``filename`` and index them by lemma as well.

     Args:
         filename: Path handed to ``parse_synonyms``.
     """
     self.lemmatizer = Lemmatizer()
     self.synonyms = parse_synonyms(filename)
     to_lemma = self.lemmatizer.lemma
     # Keyed by the lemma of each word; on a lemma collision the entry that
     # comes later in ``self.synonyms`` replaces the earlier one.
     self.lemmed_synonyms = {
         to_lemma(headword): related
         for headword, related in self.synonyms.items()
     }