Example #1
import os

from jieba import Tokenizer


class EvaluateMix(object):
    def __init__(self):
        # Jieba tokenizer with a project-specific user dictionary on top of the default one
        self.jieba = Tokenizer()
        master_dict = os.path.join(os.path.dirname(__file__), '../data/cw_dict.txt')
        self.jieba.load_userdict(master_dict)

    def test(self, text, keywords):
        """Return the fraction of keyword occurrences that come out of segmentation as single tokens."""
        seg_text = list(self.jieba.cut(text))
        total = 0
        correct = 0
        for kw in keywords:
            # occurrences in the raw text vs. occurrences as whole segmented tokens
            total += text.count(kw)
            correct += seg_text.count(kw)
        if total == 0:
            return 0.0
        return float(correct) / total
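
A minimal usage sketch for EvaluateMix, assuming '../data/cw_dict.txt' exists next to the module; the sample text and keyword list below are made up for illustration:

# Hypothetical usage; the text and keywords are illustrative only.
evaluator = EvaluateMix()
score = evaluator.test('我爱自然语言处理', ['自然语言处理', '爱'])
print('keyword recall: %.2f' % score)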
Example #2
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(),
                               'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
                """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        try:
            # Loading the dictionaries is part of the init process
            self.__jieba.set_dictionary(self.__jieba_dict_path)
            self.__jieba.load_userdict(self.__jieba_userdict_path)
        except Exception as ex:
            raise McChineseTokenizerException(
                "Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words.
        
        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]

        # Keep only tokens that contain at least one word character, i.e. drop pure punctuation
        words = [
            token for token in parsed_tokens
            if re.search(r'\w+', token, flags=re.UNICODE) is not None
        ]

        return words
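
A minimal usage sketch for McChineseTokenizer, assuming the MediaCloud resource directory with dict.txt.big and userdict.txt described above is in place:

# Hypothetical usage; the sample text is illustrative only.
tokenizer = McChineseTokenizer()
sentences = tokenizer.tokenize_text_to_sentences('今天天气很好。我们去公园吧!')
for sentence in sentences:
    print(tokenizer.tokenize_sentence_to_words(sentence))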
Example #3
class ChineseLanguage(StopWordsFromFileMixIn):
    """Chinese language support module."""

    # Path to jieba dictionary(ies)
    __DICT_PATH = os.path.dirname(os.path.abspath(__file__))
    __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big')
    __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt')

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Jieba instance
        '__jieba',

        # Text -> sentence tokenizer for Chinese text
        '__chinese_sentence_tokenizer',

        # English language instance for tokenizing non-Chinese (e.g. English) text
        '__english_language',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in jieba dictionary directory: %s" % self.__DICT_PATH
            )
        try:
            self.__jieba.set_dictionary(self.__JIEBA_DICT_PATH)
            self.__jieba.load_userdict(self.__JIEBA_USERDICT_PATH)
        except Exception as ex:
            raise McLanguageException("Unable to initialize jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that jieba is installed and its dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)

    @staticmethod
    def language_code() -> str:
        return "zh"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特,"
            "为后者的连续5个参议员任期划上句点。"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # Jieba's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Chinese sentence into words.

        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]

        # Keep only tokens that contain at least one word character, i.e. drop pure punctuation
        words = [
            token for token in parsed_tokens
            if re.search(r'\w+', token, flags=re.UNICODE) is not None
        ]

        return words
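
A minimal usage sketch for ChineseLanguage, assuming dict.txt.big and userdict.txt sit next to the module as the constructor requires:

# Hypothetical usage.
lang = ChineseLanguage()
print(lang.language_code())                              # 'zh'
sentences = lang.split_text_to_sentences(lang.sample_sentence())
words = lang.split_sentence_to_words(sentences[0])
print(lang.stem_words(words))                            # same tokens; no stemming for Chinese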
Example #4
#!/usr/bin/env python3
#
# Jieba builds a dictionary cache on every load which takes about 0.5 s so here
# we prebuild such a cache
#

import os
from jieba import Tokenizer as JiebaTokenizer

if __name__ == '__main__':
    # Keep in sync with zh/__init__.py
    cache_file = '/var/tmp/jieba.cache'

    jieba = JiebaTokenizer()
    jieba.cache_file = cache_file

    dict_base_dir = '/opt/mediacloud/src/common/python/mediawords/languages/zh/'
    dict_path = os.path.join(dict_base_dir, 'dict.txt.big')
    dict_user_path = os.path.join(dict_base_dir, 'userdict.txt')

    assert os.path.isfile(dict_path)
    assert os.path.isfile(dict_user_path)

    jieba.set_dictionary(dict_path)
    jieba.load_userdict(dict_user_path)
    jieba.initialize()

    assert os.path.isfile(cache_file)
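
A sketch of the consumer side that the "Keep in sync with zh/__init__.py" comment refers to; the actual code in zh/__init__.py may differ, but any process that sets the same cache_file and dictionary before initializing will load the prebuilt cache instead of rebuilding it:

import os
from jieba import Tokenizer as JiebaTokenizer

# Hypothetical consumer, e.g. the zh language module at import time.
dict_base_dir = '/opt/mediacloud/src/common/python/mediawords/languages/zh/'

jieba = JiebaTokenizer()
jieba.cache_file = '/var/tmp/jieba.cache'   # must match the path used by the prebuild script above
jieba.set_dictionary(os.path.join(dict_base_dir, 'dict.txt.big'))
jieba.load_userdict(os.path.join(dict_base_dir, 'userdict.txt'))
jieba.initialize()  # loads the prebuilt cache, avoiding the ~0.5 s dictionary build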