def test_clear_caches_persistance(self):
    """clear_cache() must wipe the persisted disk cache so a fresh instance starts empty."""
    cache_dir = tempfile.mkdtemp()
    try:
        speller = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=cache_dir, cache_manager='disk_hun')
        known_suggest = speller.suggest('testing')
        known_stem = speller.stem('testing')

        # Seed the caches with fabricated entries and confirm they are served back.
        speller._suggest_cache['made-up'] = known_suggest
        self.assertEqual(speller.suggest('made-up'), known_suggest)
        speller._stem_cache['made-up'] = known_stem
        self.assertEqual(speller.stem('made-up'), known_stem)

        # Save first, then clear — the cleared state is what should survive on disk.
        speller.save_cache()
        speller.clear_cache()
        del speller

        manager = get_cache_manager('disk_hun')
        manager.deregister_all_caches()
        self.assertEqual(len(manager.cache_by_name), 0)

        # A fresh instance over the same disk cache must come up empty.
        reloaded = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=cache_dir, cache_manager='disk_hun')
        self.assertEqual(len(reloaded._suggest_cache), 0)
        self.assertEqual(len(reloaded._stem_cache), 0)
        self.assertNotEqual(reloaded.suggest('made-up'), known_suggest)
        self.assertNotEqual(reloaded.stem('made-up'), known_stem)
    finally:
        shutil.rmtree(cache_dir)  # Nuke temp content
def __stem(speller: Hunspell, plural: str) -> "list[str]":
    """Return candidate singular stems of *plural* that round-trip through pluralize().

    A Hunspell stem is accepted only when it is at most 3 characters shorter
    than the plural and pluralizing it reproduces the original word exactly.

    :param speller: Hunspell instance used to propose stems
    :param plural: the (assumed plural) word to reduce
    :return: list of accepted stems, in Hunspell's order
    """
    stems = []
    for candidate in speller.stem(plural):
        # Normalize the Dutch 'ij' ligature (U+0133) to the two-letter form so
        # pluralize() sees the same spelling convention as the input.
        # (The original replace("ij", "ij") compared 'ij' to itself — a no-op.)
        candidate = candidate.replace("\u0133", "ij")
        if len(plural) - len(candidate) <= 3:
            if pluralize(candidate) == plural:
                stems.append(candidate)
    return stems
def test_non_overlapping_caches(self):
    """Cache entries planted in one Hunspell instance must not leak into another."""
    seeded_suggest = self.h.suggest('testing')
    seeded_stem = self.h.stem('testing')

    # Plant fabricated cache entries on the primary instance; they must be served back.
    self.h._suggest_cache['made-up'] = seeded_suggest
    self.assertEqual(self.h.suggest('made-up'), seeded_suggest)
    self.h._stem_cache['made-up'] = seeded_stem
    self.assertEqual(self.h.stem('made-up'), seeded_stem)

    # A second instance with a different dictionary must not see those entries.
    other = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertNotEqual(other.suggest('made-up'), seeded_suggest)
    self.assertNotEqual(other.stem('made-up'), seeded_stem)
def test_clear_caches_persistance(hunspell):
    """clear_cache() must also discard the persisted disk cache (pytest variant)."""
    cache_dir = tempfile.mkdtemp()
    try:
        writer = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=cache_dir, cache_manager='disk_hun')
        seeded_suggest = writer.suggest('testing')
        seeded_suffix = writer.suffix_suggest('testing')
        seeded_stem = writer.stem('testing')

        # Seed every cache with a fabricated key and confirm it is served back.
        writer._suggest_cache['made-up'] = seeded_suggest
        assert writer.suggest('made-up') == seeded_suggest
        writer._suffix_cache['made-up'] = seeded_suffix
        assert writer.suffix_suggest('made-up') == seeded_suffix
        writer._stem_cache['made-up'] = seeded_stem
        assert writer.stem('made-up') == seeded_stem

        # Save first, then clear — the cleared state should be what persists.
        writer.save_cache()
        writer.clear_cache()
        del writer

        manager = get_cache_manager('disk_hun')
        manager.deregister_all_caches()
        assert len(manager.cache_by_name) == 0

        # A fresh instance over the same disk cache must start empty.
        reader = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=cache_dir, cache_manager='disk_hun')
        assert len(reader._suggest_cache) == 0
        assert len(reader._stem_cache) == 0
        assert reader.suggest('made-up') != seeded_suggest
        assert reader.suffix_suggest('made-up') != seeded_suffix
        assert reader.stem('made-up') != seeded_stem
    finally:
        shutil.rmtree(cache_dir)  # Nuke temp content
def test_non_overlapping_caches(hunspell):
    """Cache entries seeded into one instance must not be visible to another.

    The `hunspell` fixture provides the primary instance; a second instance
    with a different dictionary is created locally.
    """
    test_suggest = hunspell.suggest('testing')
    test_suffix = hunspell.suffix_suggest('testing')
    test_stem = hunspell.stem('testing')

    # Seed all three caches on the fixture instance and confirm they are used.
    hunspell._suggest_cache['made-up'] = test_suggest
    assert hunspell.suggest('made-up') == test_suggest
    hunspell._suffix_cache['made-up'] = test_suffix
    assert hunspell.suffix_suggest('made-up') == test_suffix
    hunspell._stem_cache['made-up'] = test_stem
    assert hunspell.stem('made-up') == test_stem

    # A second instance must not observe any of the seeded entries.
    h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    assert h2.suggest('made-up') != test_suggest
    # Previously missing: the suffix cache must be isolated as well
    # (the sibling persistence tests check all three caches).
    assert h2.suffix_suggest('made-up') != test_suffix
    assert h2.stem('made-up') != test_stem
def test_clear_caches_non_peristance(hunspell):
    """Without save_cache(), clear_cache() leaves nothing for a new instance to load."""
    expected_suggest = hunspell.suggest('testing')
    expected_suffix = hunspell.suffix_suggest('testing')
    expected_stem = hunspell.stem('testing')

    # Seed each cache and check the fabricated entries are returned.
    hunspell._suggest_cache['made-up'] = expected_suggest
    assert hunspell.suggest('made-up') == expected_suggest
    hunspell._suffix_cache['made-up'] = expected_suffix
    assert hunspell.suffix_suggest('made-up') == expected_suffix
    hunspell._stem_cache['made-up'] = expected_stem
    assert hunspell.stem('made-up') == expected_stem

    # Clear without saving, then replace the instance entirely.
    hunspell.clear_cache()
    del hunspell
    hunspell = Hunspell('test', hunspell_data_dir=DICT_DIR)
    assert hunspell.suggest('made-up') != expected_suggest
    assert hunspell.suffix_suggest('made-up') != expected_suffix
    assert hunspell.stem('made-up') != expected_stem
class HindiLanguage(StopWordsFromFileMixIn):
    """Hindi language support module.

    Provides stemming (via Hunspell with a bundled hi_IN dictionary),
    sentence splitting and word tokenization for Hindi text.
    """

    __slots__ = [
        # Stop words map
        '__stop_words_map',
        # Hunspell instance
        '__hindi_hunspell',
        # Word tokenizer
        '__treebank_tokenizer',
    ]

    def __init__(self):
        """Constructor.

        Locates the bundled hi_IN Hunspell dictionary next to this module,
        initializes Hunspell with it, and runs a quick stemming self-test.

        :raises McLanguageException: if the dictionary files are missing,
            Hunspell cannot be initialized, or the self-test fails.
        """
        super().__init__()

        self.__treebank_tokenizer = TreebankWordTokenizer()

        # Dictionary is expected alongside this module (fetched as a Git submodule)
        hunspell_dict_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'hindi-hunspell', 'dict-hi_IN',
        )
        if not os.path.isdir(hunspell_dict_dir):
            raise McLanguageException(
                "Hunspell dictionary directory does not exist at path: %s." % hunspell_dict_dir
            )
        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.dic')):
            raise McLanguageException("Hunspell dictionary file does not exist at path: %s" % hunspell_dict_dir)
        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.aff')):
            raise McLanguageException("Hunspell affix file does not exist at path: %s" % hunspell_dict_dir)

        try:
            self.__hindi_hunspell = Hunspell(lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize Hunspell with data directory '%s': %s" % (hunspell_dict_dir, str(ex),)
            )

        # Quick self-test to make sure that Hunspell is installed and dictionary is available
        hunspell_exc_message = """
        Hunspell self-test failed; make sure that Hunspell is installed and dictionaries are accessible, e.g. 
        you might need to fetch Git submodules by running: git submodule update --init --recursive
        """
        try:
            # Stem a known plural ("gurus") and expect the known singular ("guru")
            test_stems = self.stem_words(['गुरुओं'])
        except Exception as _:
            raise McLanguageException(hunspell_exc_message)
        else:
            if len(test_stems) == 0 or test_stems[0] != 'गुरु':
                raise McLanguageException(hunspell_exc_message)

    @staticmethod
    def language_code() -> str:
        """Return the ISO 639-1 language code for Hindi."""
        return "hi"

    @staticmethod
    def sample_sentence() -> str:
        """Return a sample Hindi sentence."""
        return (
            "ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले "
            "विष्णुवतार भगवान श्रीराम, अयोध्या के महाराज दशरथ के बड़े सपुत्र थे।"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        """Stem each word with Hunspell, falling back to the word itself.

        :param words: words to stem
        :return: stems, same length and order as *words*
        :raises McLanguageException: if *words* is None
        """
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        stems = []
        for word in words:
            if word is None or len(word) == 0:
                log.debug("Word is empty or None.")
                stem = word
            else:
                term_stems = self.__hindi_hunspell.stem(word)
                if len(term_stems) > 0:
                    # Hunspell may return several stems; use the first one
                    stem = term_stems[0]
                    if stem is None or len(stem) == 0:
                        log.debug("Stem for word '%s' is empty or None." % word)
                        stem = word
                else:
                    log.debug("Stem for word '%s' was not found." % word)
                    stem = word
            stems.append(stem)

        # Sanity check: output must be parallel to the input
        if len(words) != len(stems):
            log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

        return stems

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Split Hindi text into sentences.

        :param text: text to split
        :return: list of sentences (empty list if *text* is None)
        """
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        # Replace Hindi's "।" with line break to make tokenizer split on both "।" and period
        text = text.replace("।", "।\n\n")

        # No non-breaking prefixes needed for Hindi, so delegating to the English splitter
        en = EnglishLanguage()
        return en.split_text_to_sentences(text)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize a Hindi sentence into words, dropping punctuation-only tokens.

        :param sentence: sentence to tokenize
        :return: list of word tokens (empty list if *sentence* is None)
        """
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        # Replace Hindi's "।" (danda) with a period so the tokenizer treats it
        # as sentence-final punctuation
        sentence = sentence.replace("।", ".")

        # TweetTokenizer / sentence_splitter don't work with Hindi for whatever reason, and word_tokenize() would
        # require NLTK data to be installed which is time consuming on Travis
        tokens = self.__treebank_tokenizer.tokenize(sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # TweetTokenizer leaves punctuation in-place
        tokens = [token for token in tokens if is_word(token)]

        return tokens
def test_hunspell_stem(self):
    """Stemming via an en_US Hunspell instance returns the known stems."""
    checker = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertListEqual(checker.stem('dog'), ['dog'])
    self.assertListEqual(checker.stem('permanently'), ['permanent'])
    del checker
class HunspellTest(unittest.TestCase):
    """Unit tests for the Hunspell wrapper.

    Covers spell checking, suggestions, stemming, word addition, bulk
    (concurrent) operations, and in-memory / on-disk cache behavior.
    """

    def assertRegexpSearch(self, *args, **kwargs):
        # Bridge the Python 2/3 rename: assertRegexpMatches -> assertRegex
        if PY3:
            self.assertRegex(*args, **kwargs)
        else:
            self.assertRegexpMatches(*args, **kwargs)

    def setUp(self):
        # Fresh instance backed by the small 'test' dictionary for each test
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        try:
            del self.h
        except AttributeError:
            # Some tests delete self.h themselves (e.g. test_create_destroy)
            pass

    def assertAllIn(self, checked, expected):
        # Assert that every element of `checked` appears in `expected`
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(HunspellFilePathError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    @patch('os.path.isfile', return_value=True)
    @patch('os.access', return_value=True)
    def test_bad_path_encoding(self, *mocks):
        if PY3:
            with self.assertRaises(HunspellFilePathError):
                Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
        else:
            # Python 2 just make an illegal string instead of raising
            with captured_c_stderr_file() as caperr:
                Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
            with open(caperr, 'r') as err:
                self.assertRegexpSearch(err.read(), r'error:[^\n]*bad/[^\n]*/decoding')

    @patch('hunspell.hunspell.WIN32_LONG_PATH_PREFIX', '/not/valid')
    def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
        with captured_c_stderr_file() as caperr:
            with patch("os.name", 'nt'):
                # If python file existance checks used prefix, this would raise a HunspellFilePathError
                Hunspell('test', system_encoding='UTF-8')
        with open(caperr, 'r') as err:
            # But the Hunspell library lookup had the prefix applied
            self.assertRegexpSearch(err.read(), r'error:[^\n]*/not/valid[^\n]*')

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        # Both spellings of the input should yield the same suggestions
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_add(self):
        # Adding an out-of-vocabulary word makes it spellable and suggestable
        word = 'outofvocabularyword'
        self.assertEqual(self.h.spell(word), False)
        self.h.add(word)
        self.assertEqual(self.h.spell(word), True)
        typo = word + 'd'
        self.assertAllIn([word], self.h.suggest(typo))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        # More words than the concurrency setting exercises batched dispatch
        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })

    def test_non_overlapping_caches(self):
        # Entries planted in one instance's caches must not leak to an
        # instance using a different dictionary
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(h2.suggest('made-up'), test_suggest)
        self.assertNotEqual(h2.stem('made-up'), test_stem)

    def test_overlapping_caches(self):
        # Cache entries survive instance destruction when a new instance
        # uses the same dictionary (caches are shared per dictionary)
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.assertEqual(self.h.stem('made-up'), test_stem)

    def test_save_caches_persistance(self):
        # save_cache() must persist cache contents to disk for later instances
        temp_dir = tempfile.mkdtemp()
        try:
            h1 = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=temp_dir, cache_manager='disk_hun')
            test_suggest = h1.suggest('testing')
            test_stem = h1.stem('testing')

            h1._suggest_cache['made-up'] = test_suggest
            self.assertEqual(h1.suggest('made-up'), test_suggest)
            h1._stem_cache['made-up'] = test_stem
            self.assertEqual(h1.stem('made-up'), test_stem)

            h1.save_cache()
            del h1

            # Drop all in-memory caches so the next instance must load from disk
            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)

            h2 = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=temp_dir, cache_manager='disk_hun')
            self.assertNotEqual(len(h2._suggest_cache), 0)
            self.assertNotEqual(len(h2._stem_cache), 0)
            self.assertEqual(h2.suggest('made-up'), test_suggest)
            self.assertEqual(h2.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir)  # Nuke temp content

    def test_clear_caches_persistance(self):
        # clear_cache() after save_cache() must leave the disk cache empty
        temp_dir = tempfile.mkdtemp()
        try:
            h1 = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=temp_dir, cache_manager='disk_hun')
            test_suggest = h1.suggest('testing')
            test_stem = h1.stem('testing')

            h1._suggest_cache['made-up'] = test_suggest
            self.assertEqual(h1.suggest('made-up'), test_suggest)
            h1._stem_cache['made-up'] = test_stem
            self.assertEqual(h1.stem('made-up'), test_stem)

            h1.save_cache()
            h1.clear_cache()
            del h1

            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)

            h2 = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=temp_dir, cache_manager='disk_hun')
            self.assertEqual(len(h2._suggest_cache), 0)
            self.assertEqual(len(h2._stem_cache), 0)
            self.assertNotEqual(h2.suggest('made-up'), test_suggest)
            self.assertNotEqual(h2.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir)  # Nuke temp content

    def test_clear_caches_non_peristance(self):
        # clear_cache() without save leaves nothing for a new instance to load
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        self.h.clear_cache()
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(self.h.suggest('made-up'), test_suggest)
        self.assertNotEqual(self.h.stem('made-up'), test_stem)
class HunspellTest(unittest.TestCase):
    """Tests for the Hunspell wrapper: spelling, suggestions, stemming and bulk operations."""

    def setUp(self):
        # Fresh instance backed by the small 'test' dictionary for each test
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        try:
            del self.h
        except AttributeError:
            # test_create_destroy deletes self.h itself
            pass

    def assertAllIn(self, checked, expected):
        # Assert that every element of `checked` appears in `expected`
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        # Both spellings of the input should yield the same suggestions
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog', ))
        self.assertEqual(self.h.stem('permanently'), ('permanent', ))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog', ), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        # More words than the concurrency setting exercises batched dispatch
        checked = [
            'bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg'
        ]
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent', ),
            'dog': ('dog', )
        })
        self.assertDictEqual(
            self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
                'unrecorded': ('recorded', ),
                'permanently': ('permanent', ),
                'twigs': ('twig', ),
                'dog': ('dog', )
            })
class HunspellTest(unittest.TestCase):
    """Tests for the Hunspell wrapper covering spelling, suggestions, stemming and bulk operations."""

    def setUp(self):
        # Fresh instance backed by the small 'test' dictionary for each test
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        try:
            del self.h
        except AttributeError:
            # test_create_destroy deletes self.h itself
            pass

    def assertAllIn(self, checked, expected):
        # Assert that every element of `checked` appears in `expected`
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        # Both spellings of the input should yield the same suggestions
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        # More words than the concurrency setting exercises batched dispatch
        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })