Example #1
    def __init__(self):
        self.dt = Tokenizer()
        self.dt.initialize()  # preload the dictionary to avoid UI lag
        self.name_dict = {}
        self.reversed_name_dict = {}

        self.text = None
        self._cut_result = []
        self.splited_result = []
Example #2
    def __init__(self):
        """Initialize jieba tokenizer."""

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
                """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        try:
            # loading dictionary is part of the init process
            self.__jieba.set_dictionary(os.path.join(self.__jieba_dict_path))
            self.__jieba.load_userdict(os.path.join(
                self.__jieba_userdict_path))
        except Exception as ex:
            raise McChineseTokenizerException(
                "Unable to initialize jieba: %s" % str(ex))
Example #3
class EvaluateMix(object):
    def __init__(self):
        self.jieba = Tokenizer()
        master_dict = os.path.join(os.path.dirname(__file__), '../data/cw_dict.txt')
        self.jieba.load_userdict(master_dict)

    def test(self, text, keywords):
        seg_text = list(self.jieba.cut(text))
        total = 0
        correct = 0
        for kw in keywords:
            total += text.count(kw)
            correct += seg_text.count(kw)
        if total == 0:
            return 0.0
        return float(correct) / total
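
A minimal usage sketch for the evaluator above (the sample text and keyword list are invented for illustration, and '../data/cw_dict.txt' must exist for the constructor to load): test() returns the fraction of keyword occurrences in the raw text that come out of segmentation as single tokens, i.e. a simple recall score.

evaluator = EvaluateMix()
text = "今天天气很好,我们去公园散步。"
keywords = ["天气", "公园", "散步"]
score = evaluator.test(text, keywords)  # 1.0 only if every keyword survives as one token
print("keyword recall: %.2f" % score)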
Example #4
    def __init__(self):
        file_path = os.path.abspath(__file__)
        file_dir = os.path.dirname(file_path)
        setLogLevel(0)

        self.tokenizer = Tokenizer()
        self.tokenizer.set_dictionary(
            os.path.join(
                file_dir,
                'dict.txt.big.txt'
            )
        )

        specific_tokens = [
            '_url_',
            '_num_',
            '_phone_',
            '_time_'
        ]
        self.add_words(specific_tokens)
Example #5
class jieba_api(object):
    def __init__(self):
        print("----------using jieba cut tool---------")

    def init_config(self, config):
        self.config = config
        self.dt = Tokenizer()

    def build_tool(self):
        dict_path = self.config.get("user_dict", None)
        if dict_path is not None:
            import codecs
            with codecs.open(dict_path, "r", "utf-8") as frobj:
                lines = frobj.read().splitlines()
                for line in lines:
                    self.dt.add_word(line, 10000, "<baidu>")

    def cut(self, text):
        words = list(self.dt.cut(text))
        # print(words, " ".join([word for word in words if len(word) >= 1]))
        return " ".join([word for word in words if len(word) >= 1])
Example #6
class EvaluateJieba(object):
    def __init__(self):
        self.jieba = Tokenizer()

    def test(self, text, keywords):
        seg_text = list(self.jieba.cut(text))
        total = 0
        correct = 0
        for kw in keywords:
            total += text.count(kw)
            correct += seg_text.count(kw)
        if total == 0:
            return 0.0
        return float(correct) / total
Example #7
    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException(
                "Jieba dictionary directory was not found: %s" %
                self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s"
                % self.__DICT_PATH)
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in jieba dictionary directory: %s" %
                self.__DICT_PATH)
        try:
            self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
            self.__jieba.load_userdict(os.path.join(
                self.__JIEBA_USERDICT_PATH))
        except Exception as ex:
            raise McLanguageException("Unable to initialize jieba: %s" %
                                      str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that Jieba and its dictionaries are installed and accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)
Example #8
    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in jieba dictionary directory: %s" % self.__DICT_PATH
            )
        try:
            self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
            self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH))
        except Exception as ex:
            raise McLanguageException("Unable to initialize jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that Jieba and its dictionaries are installed and accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)
Example #9
class JiebaTokenizer(BaseTokenizer):

    def __init__(self):
        file_path = os.path.abspath(__file__)
        file_dir = os.path.dirname(file_path)
        setLogLevel(0)

        self.tokenizer = Tokenizer()
        self.tokenizer.set_dictionary(
            os.path.join(
                file_dir,
                'dict.txt.big.txt'
            )
        )

        specific_tokens = [
            '_url_',
            '_num_',
            '_phone_',
            '_time_'
        ]
        self.add_words(specific_tokens)

    def cut(self, sentence):
        splitted_tokens = self.tokenizer.lcut(sentence)
        while '_' in splitted_tokens:
            splitted_tokens.remove('_')
        return splitted_tokens

    def add_word(self, word, freq=None, tag=None):
        self.tokenizer.add_word(word, freq, tag)
        self.tokenizer.suggest_freq(word, tune=True)

    def add_words(self, words, freq=None, tag=None):
        for word in words:
            self.add_word(word, freq, tag)
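
A short usage sketch, assuming 'dict.txt.big.txt' sits next to the module as the constructor expects; placeholder tokens such as '_url_' were registered via add_words(), so they should come out of cut() intact, while bare underscore tokens are stripped.

tokenizer = JiebaTokenizer()
tokens = tokenizer.cut("請到 _url_ 查詢,或撥打 _phone_")
print(tokens)  # the placeholders should survive as single tokens, e.g. '_url_' and '_phone_'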
Example #10
    def initialize_tokenizer(self):
        self.dt = Tokenizer()
        self.dt.initialize()
        self._cache_expired()
Example #11
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(),
                               'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
                """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        try:
            # loading dictionary is part of the init process
            self.__jieba.set_dictionary(os.path.join(self.__jieba_dict_path))
            self.__jieba.load_userdict(os.path.join(
                self.__jieba_userdict_path))
        except Exception as ex:
            raise McChineseTokenizerException(
                "Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words.
        
        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass
        return words
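
Assuming the Media Cloud dictionary layout above is in place, the tokenizer is typically used in two passes: text to sentences, then each sentence to words. A minimal sketch with an invented input string:

tokenizer = McChineseTokenizer()
sentences = tokenizer.tokenize_text_to_sentences("今天天气很好。我们去公园散步!")
words_per_sentence = [tokenizer.tokenize_sentence_to_words(s) for s in sentences]
# Punctuation-only tokens are dropped, so each inner list holds only word tokens.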
Example #12
def import_jieba():
    from jieba import Tokenizer
    dt = Tokenizer(dictionary=BETTER_DICT_FILEPATH)
    dt.initialize()

    return dt
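
A usage sketch for the helper above; BETTER_DICT_FILEPATH is defined elsewhere in that project, so it is taken as a given here.

dt = import_jieba()  # the dictionary is loaded eagerly by initialize()
tokens = list(dt.cut("机器学习与自然语言处理"))
print(tokens)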
Example #13
class ChineseLanguage(StopWordsFromFileMixIn):
    """Chinese language support module."""

    # Path to jieba dictionary(ies)
    __DICT_PATH = os.path.dirname(os.path.abspath(__file__))
    __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big')
    __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt')

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Jieba instance
        '__jieba',

        # Text -> sentence tokenizer for Chinese text
        '__chinese_sentence_tokenizer',

        # English language instance for tokenizing non-Chinese (e.g. English) text
        '__english_language',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in jieba dictionary directory: %s" % self.__DICT_PATH
            )
        try:
            self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
            self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH))
        except Exception as ex:
            raise McLanguageException("Unable to initialize jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that Jieba and its dictionaries are installed and accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)

    @staticmethod
    def language_code() -> str:
        return "zh"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特,"
            "为后者的连续5个参议员任期划上句点。"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # Jieba's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Chinese sentence into words.

        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass

        return words
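
A minimal sketch of the public API exercised end to end, using the module's own sample sentence; it assumes the Jieba dictionaries ship next to the module as the class-level paths expect.

lang = ChineseLanguage()
for sentence in lang.split_text_to_sentences(lang.sample_sentence()):
    words = lang.split_sentence_to_words(sentence)
    print(lang.stem_words(words))  # stem_words() is an identity pass-through for Chinese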
Example #14
# encoding=utf-8
from collections import defaultdict
from AC import AC
from jieba import Tokenizer
from nlp.sents_split import sents_split
from gensim.models import word2vec
mm = Tokenizer("dict.txt")
import json


def zh_tok(sent):
    return [x.encode('utf-8') for x in mm.cut(sent)]


def jprint(x, s=None):
    return  # debug printing disabled; drop this line to re-enable
    if s:
        print(s)
    try:
        print(json.dumps(x, ensure_ascii=False))
    except Exception:
        print(x)


antis = ["不", "没有", "一点也不", "没", "未"]


class DictBaseTag(object):
    def __init__(self, dict_path):
        self.read_dict(dict_path)

    def read_dict(self, dict_path):
Example #15
如果
單位
哪個
'''.split('\n')

words = '''
亂丟
垃圾
柏油路面
被罰
低收入戶
為何
目前
'''.split('\n')

tokenizer = Tokenizer('./dict.txt.big.txt')

for word in words:
    tokenizer.add_word(word)


def read_answer(path):
    df = pd.read_csv(path, index_col=0)
    return df['地址'].to_dict()


def read_train_data(path, x_col, y_col):
    df = pd.read_csv(path, index_col=0)
    df.dropna(inplace=True)
    col_freq = \
        df[y_col].value_counts().to_frame() \
Example #16
class Movie_Tokenizer:
    SKIP_SPACE_RE = re.compile(r"^\s*$")
    BREAK_SENTENCE_RE = re.compile(r"[。;;.……!!]")
    STOPWORDS = set()

    def __init__(self):
        self.dt = Tokenizer()
        self.dt.initialize()  # preload the dictionary to avoid UI lag
        self.name_dict = {}
        self.reversed_name_dict = {}

        self.text = None
        self._cut_result = []
        self.splited_result = []

    def set_text(self, text):
        text = text.strip()
        if self.text != text:
            self.text = text
            self._split_text()
            self._cache_expired()  # invalidate the cached cut result

    def _split_text(self):
        self.splited_result = list(
            self._filter_empty(self.BREAK_SENTENCE_RE.split(self.text)))
        return self.splited_result

    def _filter_empty(self, result):
        return list(
            filterfalse(lambda text: self.SKIP_SPACE_RE.match(text), result))

    def _generate_words_dict(self):
        d = self.name_dict
        res = set(chain.from_iterable(d.values())).union(d.keys())
        return res

    def _cache_expired(self):
        self._cut_result = []

    def cut(self):
        if self._cut_result:
            return self._cut_result
        if not self.splited_result:
            self._split_text()
        words_dict = self._generate_words_dict()
        for word in words_dict:
            self.dt.add_word(word)
        res = map(self.dt.cut, self.splited_result)
        res = list(self._filter_empty(line_cut) for line_cut in res)
        self._cut_result = res
        return res

    def add_name(self, name):
        self.name_dict.setdefault(name, set())
        self._cache_expired()

    def add_alias(self, name, alias):
        self.name_dict[name].add(alias)
        self.reversed_name_dict[alias] = name
        self._cache_expired()

    def get_alias(self, name):
        return self.name_dict[name]

    def get_names(self):
        return set(self.name_dict.keys())

    def del_name(self, name):
        for alias in self.name_dict[name]:
            del self.reversed_name_dict[alias]
        del self.name_dict[name]
        self._cache_expired()

    def del_alias(self, name, alias):
        del self.reversed_name_dict[alias]
        self.name_dict[name].discard(alias)
        self._cache_expired()

    def initialize_tokenizer(self):
        self.dt = Tokenizer()
        self.dt.initialize()
        self._cache_expired()

    def names_by_sentence(self, drop_empty=False):
        cut_result = self.cut()
        words_dict = self._generate_words_dict()
        for line in cut_result:
            # map aliases back to canonical character names
            word_set = set(
                self.reversed_name_dict.get(word) or word for word in line)
            # filter out stopwords
            word_set_without_stopwords = set(
                filter(lambda word: word not in self.STOPWORDS, word_set))
            # intersect the remaining words with the set of character names and aliases
            name_set = word_set_without_stopwords & words_dict
            if drop_empty and not name_set:
                continue
            yield name_set

    def co_present(self):
        res = defaultdict(lambda: defaultdict(int))
        for name_set in self.names_by_sentence():
            for name1, name2 in combinations(name_set, 2):
                res[name1][name2] += 1
                res[name2][name1] += 1
        return res

    def word_freq(self):
        word_list = self.cut()
        words_without_stopwords = filterfalse(lambda x: x in self.STOPWORDS,
                                              chain.from_iterable(word_list))
        res = Counter(words_without_stopwords)
        return res

    def import_name_dict(self, name_dict):
        self.name_dict = name_dict
        for name in name_dict:
            for alias in name_dict[name]:
                self.reversed_name_dict.setdefault(alias, name)
        self._cache_expired()

    def import_stopwords(self, filename="edited_baidu_stopwords.txt"):
        self.STOPWORDS = set(
            line.strip()
            for line in open(filename, encoding="utf8").readlines())
        self._cache_expired()

    def apriori(self, min_support=0.01):
        names_by_sentence = list(self.names_by_sentence(drop_empty=True))
        itemsets, rule = apriori(names_by_sentence, min_support=min_support)
        return itemsets
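
A hedged usage sketch for the class above (character names and text are invented for illustration, and the snippet's own imports are assumed to be in place): names and aliases are registered first, then per-sentence co-occurrence is counted.

mt = Movie_Tokenizer()
mt.add_name("孙悟空")
mt.add_alias("孙悟空", "悟空")
mt.add_name("唐僧")
mt.set_text("悟空保护唐僧西行。唐僧感谢悟空。")
print(dict(mt.co_present()["孙悟空"]))  # how often each other character shares a sentence with 孙悟空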
Example #17
class ChineseLanguage(StopWordsFromFileMixIn):
    """Chinese language support module."""

    # Path to jieba dictionary(ies)
    __DICT_PATH = os.path.dirname(os.path.abspath(__file__))
    __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big')
    __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt')

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Jieba instance
        '__jieba',

        # Text -> sentence tokenizer for Chinese text
        '__chinese_sentence_tokenizer',

        # English language instance for tokenizing non-Chinese (e.g. English) text
        '__english_language',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in jieba dictionary directory: %s" % self.__DICT_PATH
            )
        try:
            self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
            self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH))
        except Exception as ex:
            raise McLanguageException("Unable to initialize jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that Jieba and its dictionaries are installed and accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)

    @staticmethod
    def language_code() -> str:
        return "zh"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特,"
            "为后者的连续5个参议员任期划上句点。"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # Jieba's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Chinese sentence into words.

        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass

        return words
Example #18
#!/usr/bin/env python3
#
# Jieba builds a dictionary cache on every load which takes about 0.5 s so here
# we prebuild such a cache
#

import os
from jieba import Tokenizer as JiebaTokenizer

if __name__ == '__main__':
    # Keep in sync with zh/__init__.py
    cache_file = '/var/tmp/jieba.cache'

    jieba = JiebaTokenizer()
    jieba.cache_file = '/var/tmp/jieba.cache'

    dict_base_dir = '/opt/mediacloud/src/common/python/mediawords/languages/zh/'
    dict_path = os.path.join(dict_base_dir, 'dict.txt.big')
    dict_user_path = os.path.join(dict_base_dir, 'userdict.txt')

    assert os.path.isfile(dict_path)
    assert os.path.isfile(dict_user_path)

    jieba.set_dictionary(dict_path)
    jieba.load_userdict(dict_user_path)
    jieba.initialize()

    assert os.path.isfile(cache_file)
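
Any consumer that wants to benefit from the prebuilt cache points its own Tokenizer at the same path before the first cut. A minimal sketch, assuming the same dictionary layout as the script above:

from jieba import Tokenizer as JiebaTokenizer

jieba = JiebaTokenizer()
jieba.cache_file = '/var/tmp/jieba.cache'   # reuse the prebuilt cache instead of rebuilding it
jieba.set_dictionary('/opt/mediacloud/src/common/python/mediawords/languages/zh/dict.txt.big')
jieba.initialize()                          # fast path: the cache already exists and is newer than the dictionary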
Example #19
    def __init__(self):
        Tokenizer.__init__(self)
        self.stop_words = self.load_stop_word('./jieba/stop.txt')
Example #20
    def __init__(self):
        self.jieba = Tokenizer()
Example #21
    def init_config(self, config):
        self.config = config
        self.dt = Tokenizer()
Example #22
    def __init__(self):
        self.jieba = Tokenizer()
        master_dict = os.path.join(os.path.dirname(__file__), '../data/cw_dict.txt')
        self.jieba.load_userdict(master_dict)