def __init__(self): """Constructor.""" super().__init__() self.__japanese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Japanese text discard_empty=True, ) self.__english_language = EnglishLanguage() mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path() try: tagger_args = [ f'--dicdir={mecab_dictionary_path}', '--rcfile=/dev/null', f'--node-format=%m{self.__MECAB_TOKEN_POS_SEPARATOR}%h{self.__EOL_SEPARATOR}', f'--eos-format={self.__MECAB_EOS_MARK}{self.__EOL_SEPARATOR}', ] self.__mecab = MeCab.Tagger(' '.join(tagger_args)) except Exception as ex: raise McLanguageException("Unable to initialize MeCab: %s" % str(ex)) # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible." try: test_words = self.split_sentence_to_words('pythonが大好きです') except Exception as _: raise McLanguageException(mecab_exc_message) else: if len(test_words) < 2 or test_words[1] != '大好き': raise McLanguageException(mecab_exc_message)
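# Illustrative sketch (not part of the module above): exercising a MeCab tagger built with a
# custom --node-format, as the constructor above does. Assumes mecab-python3 and a default
# system dictionary are installed; --dicdir/--rcfile are omitted so system defaults apply, and
# 'XXSEPXX' stands in for the module's random __MECAB_TOKEN_POS_SEPARATOR.
import MeCab

SEPARATOR = 'XXSEPXX'  # stand-in for the random separator string

# Print "<surface><SEPARATOR><POS id>" per token and "EOS" at the end of the sentence,
# mirroring the --node-format / --eos-format arguments assembled in the constructor above.
tagger = MeCab.Tagger('--node-format=%m' + SEPARATOR + '%h\\n --eos-format=EOS\\n')

for line in tagger.parse('pythonが大好きです').strip().split('\n'):
    if SEPARATOR in line:
        surface, pos_id = line.split(SEPARATOR)
        print(surface, pos_id)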
def split_text_to_sentences(self, text: str) -> List[str]: text = decode_object_from_bytes_if_needed(text) if text is None: log.warning("Text is None.") return [] # Replace Hindi's "।" with a line break so that the tokenizer splits on both "।" and the period text = text.replace("।", "।\n\n") # No non-breaking prefixes in Hindi, so using the English file en = EnglishLanguage() return en.split_text_to_sentences(text)
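# Illustrative, self-contained sketch of the idea above: the danda ("।") is turned into a
# paragraph break so that a period-oriented sentence splitter still finds Hindi sentence
# boundaries. A simple regex stands in for the project's EnglishLanguage splitter here.
import re

def split_hindi_sentences_sketch(text: str) -> list:
    # Turn the Hindi danda into a paragraph break, as split_text_to_sentences() above does.
    text = text.replace("।", "।\n\n")
    # Stand-in splitter: break on blank lines or on sentence-final punctuation plus whitespace.
    parts = re.split(r"\n\s*\n|(?<=[.!?।])\s+", text)
    return [part.strip() for part in parts if part.strip()]

print(split_hindi_sentences_sketch("यह पहला वाक्य है। यह दूसरा वाक्य है।"))
# ['यह पहला वाक्य है।', 'यह दूसरा वाक्य है।']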
def __init__(self): """Constructor.""" super().__init__() # Text -> sentence tokenizer for Chinese text self.__chinese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Chinese text discard_empty=True, ) self.__english_language = EnglishLanguage() self.__jieba = jieba.Tokenizer() self.__jieba.cache_file = self.__CACHE_PATH if not os.path.isdir(self.__DICT_PATH): raise McLanguageException( "Jieba dictionary directory was not found: %s" % self.__DICT_PATH) if not os.path.isfile(self.__JIEBA_DICT_PATH): raise McLanguageException( "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH) if not os.path.isfile(self.__JIEBA_USERDICT_PATH): raise McLanguageException( "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH) try: self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH)) self.__jieba.load_userdict(os.path.join( self.__JIEBA_USERDICT_PATH)) except Exception as ex: raise McLanguageException("Unable to initialize Jieba: %s" % str(ex)) # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible." try: test_words = self.split_sentence_to_words('python課程') except Exception as _: raise McLanguageException(jieba_exc_message) else: if len(test_words) < 2 or test_words[1] != '課程': raise McLanguageException(jieba_exc_message)
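# Illustrative sketch of the isolated-tokenizer pattern used above, assuming only that the
# jieba package is installed. It relies on jieba's bundled default dictionary; the module
# above instead loads its own dict.txt.big / userdict.txt, so segmentation can differ.
import jieba

# A dedicated Tokenizer instance keeps dictionary state separate from the global jieba module,
# which is why the constructor above creates its own instance rather than calling jieba directly.
tokenizer = jieba.Tokenizer()
# tokenizer.load_userdict('userdict.txt')  # optional; the path used above is repository-specific

print(tokenizer.lcut('python課程', cut_all=False))  # expected to resemble ['python', '課程']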
def __init__(self): """Constructor.""" super().__init__() self.__japanese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Japanese text discard_empty=True, ) self.__english_language = EnglishLanguage() mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path() try: self.__mecab = MeCab.Tagger( '--dicdir=%(dictionary_path)s ' '--node-format=%%m%(token_pos_separator)s%%h\\n ' '--eos-format=%(eos_mark)s\\n' % { 'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR, 'eos_mark': self.__MECAB_EOS_MARK, 'dictionary_path': mecab_dictionary_path, }) except Exception as ex: raise McLanguageException("Unable to initialize MeCab: %s" % str(ex)) # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible." try: test_words = self.split_sentence_to_words('pythonが大好きです') except Exception as _: raise McLanguageException(mecab_exc_message) else: if len(test_words) < 2 or test_words[1] != '大好き': raise McLanguageException(mecab_exc_message)
def __init__(self): """Constructor.""" super().__init__() # Text -> sentence tokenizer for Chinese text self.__chinese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Chinese text discard_empty=True, ) self.__english_language = EnglishLanguage() self.__jieba = JiebaTokenizer() if not os.path.isdir(self.__DICT_PATH): raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH) if not os.path.isfile(self.__JIEBA_DICT_PATH): raise McLanguageException( "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH ) if not os.path.isfile(self.__JIEBA_USERDICT_PATH): raise McLanguageException( "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH ) try: self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH)) self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH)) except Exception as ex: raise McLanguageException("Unable to initialize Jieba: %s" % str(ex)) # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible." try: test_words = self.split_sentence_to_words('python課程') except Exception as _: raise McLanguageException(jieba_exc_message) else: if len(test_words) < 2 or test_words[1] != '課程': raise McLanguageException(jieba_exc_message)
def split_text_to_sentences(self, text: str) -> List[str]: text = decode_object_from_bytes_if_needed(text) # No non-breaking prefixes in Hausa, so using English file en = EnglishLanguage() return en.split_text_to_sentences(text)
def default_language_code() -> str: """Return default language code ('en' for English).""" return EnglishLanguage.language_code()
class LanguageFactory(object): """Language instance factory.""" # Supported + enabled language codes and their corresponding classes __ENABLED_LANGUAGES = { CatalanLanguage.language_code(): CatalanLanguage, ChineseLanguage.language_code(): ChineseLanguage, DanishLanguage.language_code(): DanishLanguage, DutchLanguage.language_code(): DutchLanguage, EnglishLanguage.language_code(): EnglishLanguage, FinnishLanguage.language_code(): FinnishLanguage, FrenchLanguage.language_code(): FrenchLanguage, GermanLanguage.language_code(): GermanLanguage, HausaLanguage.language_code(): HausaLanguage, HindiLanguage.language_code(): HindiLanguage, HungarianLanguage.language_code(): HungarianLanguage, ItalianLanguage.language_code(): ItalianLanguage, JapaneseLanguage.language_code(): JapaneseLanguage, LithuanianLanguage.language_code(): LithuanianLanguage, NorwegianLanguage.language_code(): NorwegianLanguage, PortugueseLanguage.language_code(): PortugueseLanguage, RomanianLanguage.language_code(): RomanianLanguage, RussianLanguage.language_code(): RussianLanguage, SpanishLanguage.language_code(): SpanishLanguage, SwedishLanguage.language_code(): SwedishLanguage, TurkishLanguage.language_code(): TurkishLanguage, } # Static language object instances ({'language code': language object, ... }) __language_instances = dict() @staticmethod def enabled_languages() -> set: """Return set of enabled languages (their codes).""" return set(LanguageFactory.__ENABLED_LANGUAGES.keys()) @staticmethod def language_is_enabled(language_code: str) -> bool: """Return True if language is supported + enabled, False if it's not.""" language_code = decode_object_from_bytes_if_needed(language_code) if language_code is None: log.warning("Language code is None.") return False return language_code in LanguageFactory.__ENABLED_LANGUAGES @staticmethod def language_for_code(language_code: str) -> Union[AbstractLanguage, None]: """Return language module instance for the language code, None if language is not supported.""" language_code = decode_object_from_bytes_if_needed(language_code) if not LanguageFactory.language_is_enabled(language_code): return None if language_code not in LanguageFactory.__language_instances: language_class = LanguageFactory.__ENABLED_LANGUAGES[language_code] language = language_class() LanguageFactory.__language_instances[language_code] = language return LanguageFactory.__language_instances[language_code] @staticmethod def default_language_code() -> str: """Return default language code ('en' for English).""" return EnglishLanguage.language_code() @staticmethod def default_language() -> AbstractLanguage: """Return default language module instance (English).""" return LanguageFactory.language_for_code( LanguageFactory.default_language_code())
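# Hedged usage sketch for the factory above. The import path is an assumption based on the
# class names in this code, not a confirmed module path; adjust it to the actual package.
from mediawords.languages.factory import LanguageFactory  # hypothetical import path

lang_code = 'en'
if LanguageFactory.language_is_enabled(lang_code):
    language = LanguageFactory.language_for_code(lang_code)
else:
    language = LanguageFactory.default_language()  # falls back to English

sentences = language.split_text_to_sentences("First sentence. Second sentence.")
print(sentences)                                       # ['First sentence.', 'Second sentence.']
print(language.split_sentence_to_words(sentences[0]))  # ['first', 'sentence']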
class JapaneseLanguage(StopWordsFromFileMixIn): """Japanese language support module.""" # Paths where mecab-ipadic-neologd might be located __MECAB_DICTIONARY_PATHS = [ # Ubuntu / Debian '/var/lib/mecab/dic/ipadic-neologd', # CentOS / Fedora '/usr/lib64/mecab/dic/ipadic-neologd/', # OS X '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/', ] __MECAB_TOKEN_POS_SEPARATOR = random_string( length=16) # for whatever reason tab doesn't work __MECAB_EOS_MARK = 'EOS' __slots__ = [ # MeCab instance '__mecab', # Text -> sentence tokenizer for Japanese text '__japanese_sentence_tokenizer', # English language instance for tokenizing non-Japanese (e.g. English) text '__english_language', ] @staticmethod def _mecab_ipadic_neologd_path( ) -> str: # (protected and not private because used by the unit test) """Return path to mecab-ipadic-neologd dictionary installed on system.""" mecab_dictionary_path = None candidate_paths = JapaneseLanguage.__MECAB_DICTIONARY_PATHS for candidate_path in candidate_paths: if os.path.isdir(candidate_path): if os.path.isfile(os.path.join(candidate_path, 'sys.dic')): mecab_dictionary_path = candidate_path break if mecab_dictionary_path is None: raise McLanguageException( "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths)) return mecab_dictionary_path @staticmethod def _mecab_allowed_pos_ids() -> Dict[int, str]: """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def. Definitions don't do much in the language module itself; they're used by unit tests to verify that pos-id.def didn't change in some unexpected way and we're not missing out on newly defined POSes. """ return { 36: '名詞,サ変接続,*,*', # noun-verbal 38: '名詞,一般,*,*', # noun 40: '名詞,形容動詞語幹,*,*', # adjectival nouns or quasi-adjectives 41: '名詞,固有名詞,一般,*', # proper nouns 42: '名詞,固有名詞,人名,一般', # proper noun, names of people 43: '名詞,固有名詞,人名,姓', # proper noun, family name (surname) 44: '名詞,固有名詞,人名,名', # proper noun, given name 45: '名詞,固有名詞,組織,*', # proper noun, organization 46: '名詞,固有名詞,地域,一般', # proper noun, location (general) 47: '名詞,固有名詞,地域,国', # proper noun, country name } def __init__(self): """Constructor.""" super().__init__() self.__japanese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Japanese text discard_empty=True, ) self.__english_language = EnglishLanguage() mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path() try: self.__mecab = MeCab.Tagger( '--dicdir=%(dictionary_path)s ' '--node-format=%%m%(token_pos_separator)s%%h\\n ' '--eos-format=%(eos_mark)s\\n' % { 'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR, 'eos_mark': self.__MECAB_EOS_MARK, 'dictionary_path': mecab_dictionary_path, }) except Exception as ex: raise McLanguageException("Unable to initialize MeCab: %s" % str(ex)) # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
try: test_words = self.split_sentence_to_words('pythonが大好きです') except Exception as _: raise McLanguageException(mecab_exc_message) else: if len(test_words) < 2 or test_words[1] != '大好き': raise McLanguageException(mecab_exc_message) @staticmethod def language_code() -> str: return "ja" @staticmethod def sample_sentence() -> str: return "いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす(ん)。" # noinspection PyMethodMayBeStatic def stem_words(self, words: List[str]) -> List[str]: words = decode_object_from_bytes_if_needed(words) # MeCab's sentence -> word tokenizer already returns "base forms" of every word return words def split_text_to_sentences(self, text: str) -> List[str]: """Tokenize Japanese text into sentences.""" text = decode_object_from_bytes_if_needed(text) if text is None: log.warning("Text is None.") return [] text = text.strip() if len(text) == 0: return [] # First split Japanese text japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text) sentences = [] for sentence in japanese_sentences: # Split paragraphs separated by two line breaks denoting a list paragraphs = re.split(r"\n\s*?\n", sentence) for paragraph in paragraphs: # Split lists separated by "* " list_items = re.split(r"\n\s*?(?=\* )", paragraph) for list_item in list_items: # Split non-Japanese text non_japanese_sentences = self.__english_language.split_text_to_sentences( list_item) sentences += non_japanese_sentences # Trim whitespace sentences = [sentence.strip() for sentence in sentences] return sentences def split_sentence_to_words(self, sentence: str) -> List[str]: """Tokenize Japanese sentence into words. Removes punctuation and words that don't belong to part-of-speech whitelist.""" sentence = decode_object_from_bytes_if_needed(sentence) if sentence is None: log.warning("Sentence is None.") return [] sentence = sentence.strip() if len(sentence) == 0: return [] parsed_text = self.__mecab.parse(sentence).strip() parsed_tokens = parsed_text.split("\n") allowed_pos_ids = self._mecab_allowed_pos_ids() words = [] for parsed_token_line in parsed_tokens: if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line: primary_form_and_pos_number = parsed_token_line.split( self.__MECAB_TOKEN_POS_SEPARATOR) primary_form = primary_form_and_pos_number[0] pos_number = primary_form_and_pos_number[1] if pos_number.isdigit(): pos_number = int(pos_number) if pos_number in allowed_pos_ids: words.append(primary_form) else: # Ignore all the "EOS" stuff pass return words
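# Illustrative sketch of the --node-format contract that split_sentence_to_words() above parses.
# It feeds canned MeCab-style output instead of running MeCab; the separator string and the
# part-of-speech IDs of the particles are illustrative stand-ins.
SEPARATOR = 'XXSEPXX'       # stand-in for the random __MECAB_TOKEN_POS_SEPARATOR
EOS_MARK = 'EOS'
ALLOWED_POS_IDS = {38, 40}  # plain nouns and adjectival nouns, per _mecab_allowed_pos_ids()

# Roughly what MeCab prints for 'pythonが大好きです' with --node-format=%m<SEP>%h\n --eos-format=EOS\n
parsed_text = (
    f"python{SEPARATOR}38\n"
    f"が{SEPARATOR}61\n"
    f"大好き{SEPARATOR}40\n"
    f"です{SEPARATOR}60\n"
    f"{EOS_MARK}"
)

words = []
for token_line in parsed_text.split("\n"):
    if SEPARATOR not in token_line:
        continue  # skip the EOS marker, just like the loop above
    surface, pos_id = token_line.split(SEPARATOR)
    if pos_id.isdigit() and int(pos_id) in ALLOWED_POS_IDS:
        words.append(surface)

print(words)  # ['python', '大好き'] -- satisfies the constructor's self-test expectation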
class TestEnglishLanguage(TestCase): def setUp(self): self.__tokenizer = EnglishLanguage() def test_language_code(self): assert self.__tokenizer.language_code() == "en" def test_sample_sentence(self): assert len(self.__tokenizer.sample_sentence()) def test_stop_words_map(self): stop_words = self.__tokenizer.stop_words_map() assert "the" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): input_words = ["stemming"] expected_stems = ["stem"] actual_stems = self.__tokenizer.stem_words(input_words) assert expected_stems == actual_stems def test_stem_apostrophe_normal(self): """Stemming with normal apostrophe.""" input_words = ["Katz's", "Delicatessen"] expected_stems = ['katz', 'delicatessen'] actual_stems = self.__tokenizer.stem_words(input_words) assert expected_stems == actual_stems def test_stem_apostrophe_right_single_quotation_mark(self): """Stemming with right single quotation mark.""" input_words = ["it’s", "toasted"] expected_stems = ['it', 'toast'] actual_stems = self.__tokenizer.stem_words(input_words) assert expected_stems == actual_stems def test_split_text_to_sentences_period_in_number(self): """Period in number.""" input_text = "Sentence contain version 2.0 of the text. Foo." expected_sentences = [ 'Sentence contain version 2.0 of the text.', 'Foo.', ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_text_to_sentences_may_ending(self): """'May' ending.""" input_text = "Sentence ends in May. This is the next sentence. Foo." expected_sentences = [ 'Sentence ends in May.', 'This is the next sentence.', 'Foo.', ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_text_to_sentences_punctuation(self): """Punctuation.""" input_text = "Leave the city! [Mega No!], l." expected_sentences = [ 'Leave the city!', '[Mega No!], l.', ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_text_to_sentences_unicode(self): """Basic Unicode.""" input_text = "Non Mega Não! [Mega No!], l." expected_sentences = [ 'Non Mega Não!', '[Mega No!], l.', ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_text_to_sentences_quotation(self): """Basic Unicode (with fancy Unicode quotation marks).""" input_text = """ Perhaps that’s the best thing the Nobel Committee did by awarding this year’s literature prize to a non-dissident, someone whom Peter Englund of the Swedish Academy said was “more a critic of the system, sitting within the system.” They’ve given him a chance to bust out. """ expected_sentences = [ ( 'Perhaps that’s the best thing the Nobel Committee did by awarding this year’s literature prize to a ' 'non-dissident, someone whom Peter Englund of the Swedish Academy said was “more a critic of the ' 'system, sitting within the system.”' ), 'They’ve given him a chance to bust out.', ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_text_to_sentences_two_spaces(self): """Two spaces in the middle of the sentence.""" input_text = """ Although several opposition groups have called for boycotting the coming June 12 presidential election, it seems the weight of boycotting groups is much less than four years ago.
""" expected_sentences = [ ( 'Although several opposition groups have called for boycotting the coming June 12 presidential ' 'election, it seems the weight of boycotting groups is much less than four years ago.' ), ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_text_to_sentences_nbsp(self): """Non-breaking space.""" input_text = """ American Current TV journalists Laura Ling and Euna Lee have been sentenced to 12 years of hard labor (according to CNN).\u00a0 Jillian York rounded up blog posts for Global Voices prior to the journalists' sentencing. """ expected_sentences = [ ( 'American Current TV journalists Laura Ling and Euna Lee have been sentenced to 12 years of hard labor ' '(according to CNN).' ), "Jillian York rounded up blog posts for Global Voices prior to the journalists' sentencing.", ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_text_to_sentences_no_space_after_period(self): """No space after a period.""" input_text = """ Anger is a waste of energy and what North Korea wants of you.We can and will work together and use our minds, to work this through. """ expected_sentences = [ 'Anger is a waste of energy and what North Korea wants of you.', 'We can and will work together and use our minds, to work this through.', ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_text_to_sentences_unicode_ellipsis(self): """Unicode "…".""" input_text = """ One of the most popular Brahmin community, with 28, 726 members, randomly claims: “we r clever & hardworking. no one can fool us…” The Brahmans community with 41952 members and the Brahmins of India community with 30588 members are also very popular. """ expected_sentences = [ ( 'One of the most popular Brahmin community, with 28, 726 members, randomly claims: “we r clever & ' 'hardworking. no one can fool us...”' ), ( 'The Brahmans community with 41952 members and the Brahmins of India community with 30588 members are ' 'also very popular.' ), ] actual_sentences = self.__tokenizer.split_text_to_sentences(input_text) assert expected_sentences == actual_sentences def test_split_sentence_to_words_normal_apostrophe(self): """Normal apostrophe (').""" input_sentence = "It's always sunny in Philadelphia." expected_words = ["it's", "always", "sunny", "in", "philadelphia"] actual_words = self.__tokenizer.split_sentence_to_words(input_sentence) assert expected_words == actual_words def test_split_sentence_to_words_right_single_quotation_mark(self): """Right single quotation mark (’), normalized to apostrophe (').""" input_sentence = "It’s always sunny in Philadelphia." 
expected_words = ["it's", "always", "sunny", "in", "philadelphia"] actual_words = self.__tokenizer.split_sentence_to_words(input_sentence) assert expected_words == actual_words def test_split_sentence_to_words_hyphen_without_split(self): """Hyphen without split.""" input_sentence = "near-total secrecy" expected_words = ["near-total", "secrecy"] actual_words = self.__tokenizer.split_sentence_to_words(input_sentence) assert expected_words == actual_words def test_split_sentence_to_words_hyphen_with_split_as_dash(self): """Hyphen with split (where it's being used as a dash).""" input_sentence = "A Pythagorean triple - named for the ancient Greek Pythagoras" expected_words = ['a', 'pythagorean', 'triple', 'named', 'for', 'the', 'ancient', 'greek', 'pythagoras'] actual_words = self.__tokenizer.split_sentence_to_words(input_sentence) assert expected_words == actual_words def test_split_sentence_to_words_quotes(self): """Quotation marks.""" input_sentence = 'it was in the Guinness Book of World Records as the "most difficult mathematical problem"' expected_words = [ 'it', 'was', 'in', 'the', 'guinness', 'book', 'of', 'world', 'records', 'as', 'the', 'most', 'difficult', 'mathematical', 'problem' ] actual_words = self.__tokenizer.split_sentence_to_words(input_sentence) assert expected_words == actual_words
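# The right-single-quotation-mark tests above rely on U+2019 being normalized to a plain
# apostrophe before tokenization and stemming. A minimal illustration of that normalization
# step (this shows the behaviour the tests expect, not the module's actual implementation):
def normalize_apostrophes(token: str) -> str:
    # Map the Unicode right single quotation mark (U+2019) to an ASCII apostrophe,
    # so that "it’s" and "it's" are treated identically downstream.
    return token.replace("\u2019", "'")

print(normalize_apostrophes("it’s"))  # it's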
class ChineseLanguage(StopWordsFromFileMixIn): """Chinese language support module.""" # Path to jieba dictionary(ies) __DICT_PATH = os.path.dirname(os.path.abspath(__file__)) __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big') __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt') __slots__ = [ # Stop words map '__stop_words_map', # Jieba instance '__jieba', # Text -> sentence tokenizer for Chinese text '__chinese_sentence_tokenizer', # English language instance for tokenizing non-Chinese (e.g. English) text '__english_language', ] def __init__(self): """Constructor.""" super().__init__() # Text -> sentence tokenizer for Chinese text self.__chinese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Chinese text discard_empty=True, ) self.__english_language = EnglishLanguage() self.__jieba = JiebaTokenizer() if not os.path.isdir(self.__DICT_PATH): raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH) if not os.path.isfile(self.__JIEBA_DICT_PATH): raise McLanguageException( "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH ) if not os.path.isfile(self.__JIEBA_USERDICT_PATH): raise McLanguageException( "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH ) try: self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH)) self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH)) except Exception as ex: raise McLanguageException("Unable to initialize Jieba: %s" % str(ex)) # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible." try: test_words = self.split_sentence_to_words('python課程') except Exception as _: raise McLanguageException(jieba_exc_message) else: if len(test_words) < 2 or test_words[1] != '課程': raise McLanguageException(jieba_exc_message) @staticmethod def language_code() -> str: return "zh" @staticmethod def sample_sentence() -> str: return ( "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特," "为后者的连续5个参议员任期划上句点。" ) def stem_words(self, words: List[str]) -> List[str]: words = decode_object_from_bytes_if_needed(words) # Jieba's sentence -> word tokenizer already returns "base forms" of every word return words def split_text_to_sentences(self, text: str) -> List[str]: """Tokenize Chinese text into sentences.""" text = decode_object_from_bytes_if_needed(text) if text is None: log.warning("Text is None.") return [] text = text.strip() if len(text) == 0: return [] # First split Chinese text chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text) sentences = [] for sentence in chinese_sentences: # Split paragraphs separated by two line breaks denoting a list paragraphs = re.split(r"\n\s*?\n", sentence) for paragraph in paragraphs: # Split lists separated by "* " list_items = re.split(r"\n\s*?(?=\* )", paragraph) for list_item in list_items: # Split non-Chinese text non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item) sentences += non_chinese_sentences # Trim whitespace sentences = [sentence.strip() for sentence in sentences] return sentences def split_sentence_to_words(self, sentence: str) -> List[str]: """Tokenize Chinese sentence into words.
Removes punctuation.""" sentence = decode_object_from_bytes_if_needed(sentence) if sentence is None: log.warning("Sentence to tokenize into words is None.") return [] sentence = sentence.strip() if len(sentence) == 0: return [] parsed_text = self.__jieba.lcut(sentence, cut_all=False) parsed_tokens = [x for x in parsed_text if x.strip()] words = [] for parsed_token in parsed_tokens: if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None: words.append(parsed_token) else: pass return words
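# Self-contained sketch of the word-splitting and punctuation-filtering step above, assuming
# the jieba package is installed. It uses jieba's default dictionary rather than the bundled
# dict.txt.big / userdict.txt, so the exact segmentation may differ from the module's output.
import re
import jieba

sentence = '2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行。'

# Accurate-mode segmentation, as in split_sentence_to_words() above (cut_all=False).
tokens = jieba.lcut(sentence, cut_all=False)

# Keep only tokens containing at least one word character; this drops punctuation such as "。".
words = [t for t in tokens if t.strip() and re.search(r'\w+', t, flags=re.UNICODE)]
print(words)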
class ChineseLanguage(StopWordsFromFileMixIn): """Chinese language support module.""" # Path to jieba dictionary(ies) __DICT_PATH = os.path.dirname(os.path.abspath(__file__)) __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big') __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt') __slots__ = [ # Stop words map '__stop_words_map', # Jieba instance '__jieba', # Text -> sentence tokenizer for Chinese text '__chinese_sentence_tokenizer', # English language instance for tokenizing non-Chinese (e.g. English) text '__english_language', ] def __init__(self): """Constructor.""" super().__init__() # Text -> sentence tokenizer for Chinese text self.__chinese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Chinese text discard_empty=True, ) self.__english_language = EnglishLanguage() self.__jieba = JiebaTokenizer() if not os.path.isdir(self.__DICT_PATH): raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH) if not os.path.isfile(self.__JIEBA_DICT_PATH): raise McLanguageException( "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH ) if not os.path.isfile(self.__JIEBA_USERDICT_PATH): raise McLanguageException( "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH ) try: self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH)) self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH)) except Exception as ex: raise McLanguageException("Unable to initialize Jieba: %s" % str(ex)) # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible." try: test_words = self.split_sentence_to_words('python課程') except Exception as _: raise McLanguageException(jieba_exc_message) else: if len(test_words) < 2 or test_words[1] != '課程': raise McLanguageException(jieba_exc_message) @staticmethod def language_code() -> str: return "zh" @staticmethod def sample_sentence() -> str: return ( "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特," "为后者的连续5个参议员任期划上句点。" ) def stem_words(self, words: List[str]) -> List[str]: words = decode_object_from_bytes_if_needed(words) # Jieba's sentence -> word tokenizer already returns "base forms" of every word return words def split_text_to_sentences(self, text: str) -> List[str]: """Tokenize Chinese text into sentences.""" text = decode_object_from_bytes_if_needed(text) if text is None: log.warning("Text is None.") return [] text = text.strip() if len(text) == 0: return [] # First split Chinese text chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text) sentences = [] for sentence in chinese_sentences: # Split paragraphs separated by two line breaks denoting a list paragraphs = re.split(r"\n\s*?\n", sentence) for paragraph in paragraphs: # Split lists separated by "* " list_items = re.split(r"\n\s*?(?=\* )", paragraph) for list_item in list_items: # Split non-Chinese text non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item) sentences += non_chinese_sentences # Trim whitespace sentences = [sentence.strip() for sentence in sentences] return sentences def split_sentence_to_words(self, sentence: str) -> List[str]: """Tokenize Chinese sentence into words.
Removes punctuation.""" sentence = decode_object_from_bytes_if_needed(sentence) if sentence is None: log.warning("Sentence to tokenize into words is None.") return [] sentence = sentence.strip() if len(sentence) == 0: return [] parsed_text = self.__jieba.lcut(sentence, cut_all=False) parsed_tokens = [x for x in parsed_text if x.strip()] words = [] for parsed_token in parsed_tokens: if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None: words.append(parsed_token) else: pass return words
class JapaneseLanguage(StopWordsFromFileMixIn): """Japanese language support module.""" # Paths where mecab-ipadic-neologd might be located __MECAB_DICTIONARY_PATHS = [ # Ubuntu / Debian '/var/lib/mecab/dic/ipadic-neologd', # CentOS / Fedora '/usr/lib64/mecab/dic/ipadic-neologd/', # OS X '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/', ] __MECAB_TOKEN_POS_SEPARATOR = random_string(length=16) # for whatever reason tab doesn't work __MECAB_EOS_MARK = 'EOS' __slots__ = [ # MeCab instance '__mecab', # Text -> sentence tokenizer for Japanese text '__japanese_sentence_tokenizer', # English language instance for tokenizing non-Japanese (e.g. English) text '__english_language', ] @staticmethod def _mecab_ipadic_neologd_path() -> str: # (protected and not private because used by the unit test) """Return path to mecab-ipadic-neologd dictionary installed on system.""" mecab_dictionary_path = None candidate_paths = JapaneseLanguage.__MECAB_DICTIONARY_PATHS for candidate_path in candidate_paths: if os.path.isdir(candidate_path): if os.path.isfile(os.path.join(candidate_path, 'sys.dic')): mecab_dictionary_path = candidate_path break if mecab_dictionary_path is None: raise McLanguageException( "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths) ) return mecab_dictionary_path @staticmethod def _mecab_allowed_pos_ids() -> Dict[int, str]: """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def. Definitions don't do much in the language module itself; they're used by unit tests to verify that pos-id.def didn't change in some unexpected way and we're not missing out on newly defined POSes. """ return { 36: '名詞,サ変接続,*,*', # noun-verbal 38: '名詞,一般,*,*', # noun 40: '名詞,形容動詞語幹,*,*', # adjectival nouns or quasi-adjectives 41: '名詞,固有名詞,一般,*', # proper nouns 42: '名詞,固有名詞,人名,一般', # proper noun, names of people 43: '名詞,固有名詞,人名,姓', # proper noun, family name (surname) 44: '名詞,固有名詞,人名,名', # proper noun, given name 45: '名詞,固有名詞,組織,*', # proper noun, organization 46: '名詞,固有名詞,地域,一般', # proper noun, location (general) 47: '名詞,固有名詞,地域,国', # proper noun, country name } def __init__(self): """Constructor.""" super().__init__() self.__japanese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Japanese text discard_empty=True, ) self.__english_language = EnglishLanguage() mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path() try: self.__mecab = MeCab.Tagger( '--dicdir=%(dictionary_path)s ' '--node-format=%%m%(token_pos_separator)s%%h\\n ' '--eos-format=%(eos_mark)s\\n' % { 'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR, 'eos_mark': self.__MECAB_EOS_MARK, 'dictionary_path': mecab_dictionary_path, } ) except Exception as ex: raise McLanguageException("Unable to initialize MeCab: %s" % str(ex)) # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
try: test_words = self.split_sentence_to_words('pythonが大好きです') except Exception as _: raise McLanguageException(mecab_exc_message) else: if len(test_words) < 2 or test_words[1] != '大好き': raise McLanguageException(mecab_exc_message) @staticmethod def language_code() -> str: return "ja" @staticmethod def sample_sentence() -> str: return "いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす(ん)。" # noinspection PyMethodMayBeStatic def stem_words(self, words: List[str]) -> List[str]: words = decode_object_from_bytes_if_needed(words) # MeCab's sentence -> word tokenizer already returns "base forms" of every word return words def split_text_to_sentences(self, text: str) -> List[str]: """Tokenize Japanese text into sentences.""" text = decode_object_from_bytes_if_needed(text) if text is None: log.warning("Text is None.") return [] text = text.strip() if len(text) == 0: return [] # First split Japanese text japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text) sentences = [] for sentence in japanese_sentences: # Split paragraphs separated by two line breaks denoting a list paragraphs = re.split(r"\n\s*?\n", sentence) for paragraph in paragraphs: # Split lists separated by "* " list_items = re.split(r"\n\s*?(?=\* )", paragraph) for list_item in list_items: # Split non-Japanese text non_japanese_sentences = self.__english_language.split_text_to_sentences(list_item) sentences += non_japanese_sentences # Trim whitespace sentences = [sentence.strip() for sentence in sentences] return sentences def split_sentence_to_words(self, sentence: str) -> List[str]: """Tokenize Japanese sentence into words. Removes punctuation and words that don't belong to part-of-speech whitelist.""" sentence = decode_object_from_bytes_if_needed(sentence) if sentence is None: log.warning("Sentence is None.") return [] sentence = sentence.strip() if len(sentence) == 0: return [] parsed_text = self.__mecab.parse(sentence).strip() parsed_tokens = parsed_text.split("\n") allowed_pos_ids = self._mecab_allowed_pos_ids() words = [] for parsed_token_line in parsed_tokens: if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line: primary_form_and_pos_number = parsed_token_line.split(self.__MECAB_TOKEN_POS_SEPARATOR) primary_form = primary_form_and_pos_number[0] pos_number = primary_form_and_pos_number[1] if pos_number.isdigit(): pos_number = int(pos_number) if pos_number in allowed_pos_ids: words.append(primary_form) else: # Ignore all the "EOS" stuff pass return words