def lemmatize(self, query):
    """Yield candidate lemmas for ``query``, falling back to stemming.

    Every split of ``query`` into a non-empty prefix (stem) and the
    remaining suffix (flexion) is tried, longest stem first. A split
    matches when the stem is in ``self.db_stems``, the flexion is in
    ``self.db_flex``, and the keys of ``self.db_stems[stem]`` intersect
    ``self.db_flex[flex]``; the values stored under the shared keys are
    yielded as lemmas. (Per the original author's note the shared keys
    are tuples -- not verifiable from this function alone.)

    If no split produces a lemma, the non-empty results of
    ``getStem(query, morphs)`` are yielded; if that also produces
    nothing, everything from ``stem_2_1.stemmer(query)`` is yielded.

    A diagnostic tag naming the strategy that produced the output
    ('LEMMA', 'MACHINE' or 'STEMMA') is printed to stdout.

    Args:
        query: The word form to analyse.

    Yields:
        Lemmas, or stems when the lemma lookup finds nothing. The same
        value may be yielded more than once if several splits lead to it.
    """
    found = False
    for i in range(len(query), 0, -1):
        stem, flex = query[:i], query[i:]
        if stem not in self.db_stems or flex not in self.db_flex:
            continue
        # Fetch the stem entry once: if db_stems is a persistent mapping,
        # each subscript may be a separate (costly) read.
        stem_entries = self.db_stems[stem]
        shared_keys = stem_entries.keys() & self.db_flex[flex]
        lemmas = {stem_entries[key] for key in shared_keys}
        if lemmas:
            found = True
            yield from lemmas

    if found:
        print('LEMMA')
        return

    # The dictionary lookup failed: fall back to the previous algorithm.
    for stem in getStem(query, morphs):
        if stem != '':
            found = True
            yield stem

    if found:
        print('MACHINE')
    else:
        print('STEMMA')
        yield from stem_2_1.stemmer(query)
def stemmer(query, db_stems_name=config.DATABASE_STEMS_NAME,
            db_flex_name=config.DATABASE_FLEX_NAME):
    """Yield every prefix of ``query`` that is a known stem with a fitting flexion.

    Each split of ``query`` into a non-empty prefix (stem) and the
    remaining suffix (flexion) is tried, longest stem first. The stem is
    yielded when it is a key of the stems shelf, the flexion is a key of
    the flexions shelf, and ``set(db_stems[stem]) & db_flex[flex]`` is
    non-empty.

    If no split matches, the results of ``stem_2_1.stemmer(query)`` are
    yielded instead.

    Args:
        query: The word form to stem.
        db_stems_name: Filename passed to ``shelve.open`` for the stems shelf.
        db_flex_name: Filename passed to ``shelve.open`` for the flexions shelf.

    Yields:
        Matching stems, or the fallback stemmer's output.
    """
    found = False
    # Context managers close both shelves even if the consumer abandons
    # the generator early or a lookup raises.
    with shelve.open(db_stems_name) as db_stems, \
            shelve.open(db_flex_name) as db_flex:
        for i in range(len(query), 0, -1):
            stem, flex = query[:i], query[i:]
            if (stem in db_stems
                    and flex in db_flex
                    and set(db_stems[stem]) & db_flex[flex]):
                found = True
                yield stem

    # The shelves are already closed here, as in the original, before
    # delegating to the fallback stemmer.
    if not found:
        yield from stem_2_1.stemmer(query)
def test_stem1(self):
    """stemmer('мам') yields exactly the stem 'мам'."""
    # sorted() already returns a list, so no extra list() wrapper is needed.
    self.assertEqual(sorted(stemmer('мам')), sorted(['мам']))
def test_stem9(self):
    """stemmer('мыла') yields exactly the stem 'мыл'."""
    # sorted() already returns a list, so no extra list() wrapper is needed.
    self.assertEqual(sorted(stemmer('мыла')), sorted(['мыл']))
def test_stem0(self):
    """stemmer('лаял') yields 'лаял' and 'ла', in any order."""
    # Both sides are sorted so the comparison ignores yield order.
    self.assertEqual(sorted(stemmer('лаял')), sorted(['лаял', 'ла']))
def test_stem7(self):
    """stemmer('абвгдейку') yields 'абвгдейку' and 'абвгдейк', in any order."""
    # Both sides are sorted so the comparison ignores yield order.
    self.assertEqual(
        sorted(stemmer('абвгдейку')),
        sorted(['абвгдейку', 'абвгдейк']),
    )
def test_stem8(self):
    """stemmer('мала') yields 'мала', 'м', 'ма' and 'мал', in any order."""
    # Both sides are sorted so the comparison ignores yield order.
    self.assertEqual(
        sorted(stemmer('мала')),
        sorted(['мала', 'м', 'ма', 'мал']),
    )
def test_stem6(self):
    """stemmer('а') yields exactly the single-letter stem 'а'."""
    # sorted() already returns a list, so no extra list() wrapper is needed.
    self.assertEqual(sorted(stemmer('а')), sorted(['а']))
def test_stem5(self):
    """stemmer('пам') yields 'пам' and 'п', in any order."""
    # Both sides are sorted so the comparison ignores yield order.
    self.assertEqual(sorted(stemmer('пам')), sorted(['пам', 'п']))
def test_stem4(self):
    """stemmer('ого') yields 'ого' and 'ог', in any order."""
    # Both sides are sorted so the comparison ignores yield order.
    self.assertEqual(sorted(stemmer('ого')), sorted(['ого', 'ог']))
def test_stem3(self):
    """stemmer('ба') yields 'ба' and 'б', in any order."""
    # Both sides are sorted so the comparison ignores yield order.
    self.assertEqual(sorted(stemmer('ба')), sorted(['ба', 'б']))
def test_stem2(self):
    """stemmer('бабах') yields 'бабах' and 'баб', in any order."""
    # Both sides are sorted so the comparison ignores yield order.
    self.assertEqual(sorted(stemmer('бабах')), sorted(['бабах', 'баб']))