def __init__(self):
    self.dt = Tokenizer()
    self.dt.initialize()  # Preload the dictionary so the UI does not freeze later
    self.name_dict = {}
    self.reversed_name_dict = {}
    self.text = None
    self._cut_result = []
    self.splited_result = []
def __init__(self): """Initialize jieba tokenizer.""" self.__jieba = JiebaTokenizer() if not os.path.isdir(self.__dict_path): raise McChineseTokenizerException(""" jieba dictionary directory was not found: %s Maybe you forgot to initialize Git submodules? """ % self.__dict_path) if not os.path.isfile(self.__jieba_dict_path): raise McChineseTokenizerException(""" Default dictionary not found in jieba dictionary directory: %s Maybe you forgot to run jieba installation script? """ % self.__dict_path) if not os.path.isfile(self.__jieba_userdict_path): raise McChineseTokenizerException(""" User dictionary not found in jieba dictionary directory: %s Maybe you forgot to run jieba installation script? """ % self.__dict_path) try: # loading dictionary is part of the init process self.__jieba.set_dictionary(os.path.join(self.__jieba_dict_path)) self.__jieba.load_userdict(os.path.join( self.__jieba_userdict_path)) except Exception as ex: raise McChineseTokenizerException( "Unable to initialize jieba: %s" % str(ex))
class EvaluateMix(object):

    def __init__(self):
        self.jieba = Tokenizer()
        master_dict = os.path.join(os.path.dirname(__file__), '../data/cw_dict.txt')
        self.jieba.load_userdict(master_dict)

    def test(self, text, keywords):
        seg_text = list(self.jieba.cut(text))
        total = 0
        correct = 0
        for kw in keywords:
            total += text.count(kw)
            correct += seg_text.count(kw)
        if total == 0:
            return 0.0
        return float(correct) / total
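# Hypothetical usage sketch (not part of the original snippet): assumes jieba is
# installed and that '../data/cw_dict.txt' exists relative to this module.
# test() returns the fraction of keyword occurrences in `text` that survive as
# whole tokens after segmentation; the sample text and keywords are illustrative.
evaluator = EvaluateMix()
score = evaluator.test('自然语言处理很有趣', ['自然语言处理'])
print('keyword recall: %.2f' % score)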
def __init__(self):
    file_path = os.path.abspath(__file__)
    file_dir = os.path.dirname(file_path)
    setLogLevel(0)
    self.tokenizer = Tokenizer()
    self.tokenizer.set_dictionary(
        os.path.join(
            file_dir,
            'dict.txt.big.txt',
        ),
    )
    specific_tokens = [
        '_url_',
        '_num_',
        '_phone_',
        '_time_',
    ]
    self.add_words(specific_tokens)
class jieba_api(object):

    def __init__(self):
        print("----------using jieba cut tool---------")

    def init_config(self, config):
        self.config = config
        self.dt = Tokenizer()

    def build_tool(self):
        dict_path = self.config.get("user_dict", None)
        if dict_path is not None:
            import codecs
            with codecs.open(dict_path, "r", "utf-8") as frobj:
                lines = frobj.read().splitlines()
                for line in lines:
                    self.dt.add_word(line, 10000, "<baidu>")

    def cut(self, text):
        words = list(self.dt.cut(text))
        # print(words, " ".join([word for word in words if len(word) >= 1]))
        return " ".join([word for word in words if len(word) >= 1])
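# Hypothetical usage sketch (not in the original snippet): the config dict is
# illustrative. Each line of an optional user dictionary is registered with a
# fixed frequency of 10000 and the "<baidu>" tag; cut() then returns the
# segmentation as a space-joined string.
api = jieba_api()
api.init_config({"user_dict": None})  # pass a path such as "user_dict.txt" to load custom words
api.build_tool()
print(api.cut("今天天气不错"))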
class EvaluateJieba(object):

    def __init__(self):
        self.jieba = Tokenizer()

    def test(self, text, keywords):
        seg_text = list(self.jieba.cut(text))
        total = 0
        correct = 0
        for kw in keywords:
            total += text.count(kw)
            correct += seg_text.count(kw)
        if total == 0:
            return 0.0
        return float(correct) / total
def __init__(self): """Constructor.""" super().__init__() # Text -> sentence tokenizer for Chinese text self.__chinese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Chinese text discard_empty=True, ) self.__english_language = EnglishLanguage() self.__jieba = JiebaTokenizer() if not os.path.isdir(self.__DICT_PATH): raise McLanguageException( "Jieba dictionary directory was not found: %s" % self.__DICT_PATH) if not os.path.isfile(self.__JIEBA_DICT_PATH): raise McLanguageException( "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH) if not os.path.isfile(self.__JIEBA_USERDICT_PATH): raise McLanguageException( "User dictionary not found in jieba dictionary directory: %s" % self.__DICT_PATH) try: self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH)) self.__jieba.load_userdict(os.path.join( self.__JIEBA_USERDICT_PATH)) except Exception as ex: raise McLanguageException("Unable to initialize jieba: %s" % str(ex)) # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working jieba_exc_message = "Jieba self-test failed; make sure that MeCab is built and dictionaries are accessible." try: test_words = self.split_sentence_to_words('python課程') except Exception as _: raise McLanguageException(jieba_exc_message) else: if len(test_words) < 2 or test_words[1] != '課程': raise McLanguageException(jieba_exc_message)
def __init__(self): """Constructor.""" super().__init__() # Text -> sentence tokenizer for Chinese text self.__chinese_sentence_tokenizer = RegexpTokenizer( r'([^!?。]*[!?。])', gaps=True, # don't discard non-Chinese text discard_empty=True, ) self.__english_language = EnglishLanguage() self.__jieba = JiebaTokenizer() if not os.path.isdir(self.__DICT_PATH): raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH) if not os.path.isfile(self.__JIEBA_DICT_PATH): raise McLanguageException( "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH ) if not os.path.isfile(self.__JIEBA_USERDICT_PATH): raise McLanguageException( "User dictionary not found in jieba dictionary directory: %s" % self.__DICT_PATH ) try: self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH)) self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH)) except Exception as ex: raise McLanguageException("Unable to initialize jieba: %s" % str(ex)) # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working jieba_exc_message = "Jieba self-test failed; make sure that MeCab is built and dictionaries are accessible." try: test_words = self.split_sentence_to_words('python課程') except Exception as _: raise McLanguageException(jieba_exc_message) else: if len(test_words) < 2 or test_words[1] != '課程': raise McLanguageException(jieba_exc_message)
class JiebaTokenizer(BaseTokenizer):

    def __init__(self):
        file_path = os.path.abspath(__file__)
        file_dir = os.path.dirname(file_path)
        setLogLevel(0)
        self.tokenizer = Tokenizer()
        self.tokenizer.set_dictionary(
            os.path.join(
                file_dir,
                'dict.txt.big.txt',
            ),
        )
        specific_tokens = [
            '_url_',
            '_num_',
            '_phone_',
            '_time_',
        ]
        self.add_words(specific_tokens)

    def cut(self, sentence):
        splitted_tokens = self.tokenizer.lcut(sentence)
        while '_' in splitted_tokens:
            splitted_tokens.remove('_')
        return splitted_tokens

    def add_word(self, word, freq=None, tag=None):
        self.tokenizer.add_word(word, freq, tag)
        self.tokenizer.suggest_freq(word, tune=True)

    def add_words(self, words, freq=None, tag=None):
        for word in words:
            self.add_word(word, freq, tag)
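# Hypothetical usage sketch (not in the original snippet): assumes a
# 'dict.txt.big.txt' dictionary file sits next to this module and that
# BaseTokenizer and setLogLevel are importable here. Placeholder tokens such as
# '_url_' are registered up front so they are expected to survive segmentation
# as single tokens; bare '_' tokens are dropped by cut().
tok = JiebaTokenizer()
print(tok.cut('请访问 _url_ 查看详情'))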
def initialize_tokenizer(self):
    self.dt = Tokenizer()
    self.dt.initialize()
    self._cache_expired()
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(), 'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""
        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
            """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
            """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
            """ % self.__dict_path)

        try:
            # Loading the dictionaries is part of the init process
            self.__jieba.set_dictionary(os.path.join(self.__jieba_dict_path))
            self.__jieba.load_userdict(os.path.join(self.__jieba_userdict_path))
        except Exception as ex:
            raise McChineseTokenizerException("Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(list_item)
                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words. Removes punctuation."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass

        return words
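# Hypothetical usage sketch (not part of the original source): assumes the zh/
# resources directory referenced above is present. The pipeline is
# text -> sentences -> words; the sample text is illustrative.
tokenizer = McChineseTokenizer()
for s in tokenizer.tokenize_text_to_sentences('今天天气不错。我们去公园吧!'):
    print(tokenizer.tokenize_sentence_to_words(s))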
def import_jieba():
    from jieba import Tokenizer

    dt = Tokenizer(dictionary=BETTER_DICT_FILEPATH)
    dt.initialize()
    return dt
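# Hypothetical usage sketch (not in the original snippet): BETTER_DICT_FILEPATH
# is defined elsewhere in the original module. Calling initialize() eagerly pays
# the dictionary-loading cost once up front instead of on the first cut() call.
dt = import_jieba()
print(list(dt.cut('今天天气不错')))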
class ChineseLanguage(StopWordsFromFileMixIn):
    """Chinese language support module."""

    # Path to jieba dictionary(ies)
    __DICT_PATH = os.path.dirname(os.path.abspath(__file__))
    __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big')
    __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt')

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Jieba instance
        '__jieba',

        # Text -> sentence tokenizer for Chinese text
        '__chinese_sentence_tokenizer',

        # English language instance for tokenizing non-Chinese (e.g. English) text
        '__english_language',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )

        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )

        try:
            self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
            self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH))
        except Exception as ex:
            raise McLanguageException("Unable to initialize jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and the Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that Jieba is installed and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)

    @staticmethod
    def language_code() -> str:
        return "zh"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特,"
            "为后者的连续5个参议员任期划上句点。"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # Jieba's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Chinese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item)
                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Chinese sentence into words.

        Removes punctuation."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass

        return words
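# Hypothetical usage sketch (not part of the original source): assumes the
# dict.txt.big and userdict.txt files ship alongside this module. Non-Chinese
# fragments are routed through the EnglishLanguage sentence splitter.
zh = ChineseLanguage()
for s in zh.split_text_to_sentences(ChineseLanguage.sample_sentence()):
    print(zh.split_sentence_to_words(s))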
# encoding=utf-8
from collections import defaultdict

from AC import AC
from jieba import Tokenizer
from nlp.sents_split import sents_split
from gensim.models import word2vec

mm = Tokenizer("dict.txt")

import json


def zh_tok(sent):
    return [x.encode('utf-8') for x in mm.cut(sent)]


def jprint(x, s=None):
    return
    if s:
        print s
    try:
        print json.dumps(x, ensure_ascii=False)
    except:
        print x


antis = ["不", "没有", "一点也不", "没", "未"]


class DictBaseTag(object):

    def __init__(self, dict_path):
        self.read_dict(dict_path)

    def read_dict(self, dict_path):
如果
單位
哪個
'''.split('\n')

words = '''
亂丟
垃圾
柏油路面
被罰
低收入戶
為何
目前
'''.split('\n')

tokenizer = Tokenizer('./dict.txt.big.txt')
for word in words:
    tokenizer.add_word(word)


def read_answer(path):
    df = pd.read_csv(path, index_col=0)
    return df['地址'].to_dict()


def read_train_data(path, x_col, y_col):
    df = pd.read_csv(path, index_col=0)
    df.dropna(inplace=True)
    col_freq = \
        df[y_col].value_counts().to_frame() \
class Movie_Tokenizer:
    SKIP_SPACE_RE = re.compile(r"^\s*$")
    BREAK_SENTENCE_RE = re.compile(r"[。;;.……!!]")
    STOPWORDS = set()

    def __init__(self):
        self.dt = Tokenizer()
        self.dt.initialize()  # Preload the dictionary so the UI does not freeze later
        self.name_dict = {}
        self.reversed_name_dict = {}
        self.text = None
        self._cut_result = []
        self.splited_result = []

    def set_text(self, text):
        text = text.strip()
        if self.text != text:
            self.text = text
            self._split_text()
            self._cache_expired()  # Invalidate the cached cut result

    def _split_text(self):
        self.splited_result = list(
            self._filter_empty(self.BREAK_SENTENCE_RE.split(self.text)))
        return self.splited_result

    def _filter_empty(self, result):
        return list(
            filterfalse(lambda text: self.SKIP_SPACE_RE.match(text), result))

    def _generate_words_dict(self):
        d = self.name_dict
        res = set(chain.from_iterable(d.values())).union(d.keys())
        return res

    def _cache_expired(self):
        self._cut_result = []

    def cut(self):
        if self._cut_result:
            return self._cut_result
        if not self.splited_result:
            self._split_text()
        words_dict = self._generate_words_dict()
        for word in words_dict:
            self.dt.add_word(word)
        res = map(self.dt.cut, self.splited_result)
        res = list(self._filter_empty(line_cut) for line_cut in res)
        self._cut_result = res
        return res

    def add_name(self, name):
        self.name_dict.setdefault(name, set())
        self._cache_expired()

    def add_alias(self, name, alias):
        self.name_dict[name].add(alias)
        self.reversed_name_dict[alias] = name
        self._cache_expired()

    def get_alias(self, name):
        return self.name_dict[name]

    def get_names(self):
        return set(self.name_dict.keys())

    def del_name(self, name):
        for alias in self.name_dict[name]:
            del self.reversed_name_dict[alias]
        del self.name_dict[name]
        self._cache_expired()

    def del_alias(self, name, alias):
        del self.reversed_name_dict[alias]
        self.name_dict[name].discard(alias)
        self._cache_expired()

    def initialize_tokenizer(self):
        self.dt = Tokenizer()
        self.dt.initialize()
        self._cache_expired()

    def names_by_sentence(self, drop_empty=False):
        cut_result = self.cut()
        words_dict = self._generate_words_dict()
        for line in cut_result:
            # Replace aliases with canonical character names
            word_set = set(
                self.reversed_name_dict.get(word) or word for word in line)
            # Filter out stop words
            word_set_without_stopwords = set(
                filter(lambda word: word not in self.STOPWORDS, word_set))
            # Keep only the intersection with the character-name dictionary
            name_set = word_set_without_stopwords & words_dict
            if drop_empty and not name_set:
                continue
            yield name_set

    def co_present(self):
        res = defaultdict(lambda: defaultdict(int))
        for name_set in self.names_by_sentence():
            for name1, name2 in combinations(name_set, 2):
                res[name1][name2] += 1
                res[name2][name1] += 1
        return res

    def word_freq(self):
        word_list = self.cut()
        words_without_stopwords = filterfalse(lambda x: x in self.STOPWORDS,
                                              chain.from_iterable(word_list))
        res = Counter(words_without_stopwords)
        return res

    def import_name_dict(self, name_dict):
        self.name_dict = name_dict
        for name in name_dict:
            for alias in name_dict[name]:
                self.reversed_name_dict.setdefault(alias, name)
        self._cache_expired()

    def import_stopwords(self, filename="edited_baidu_stopwords.txt"):
        self.STOPWORDS = set(
            line.strip()
            for line in open(filename, encoding="utf8").readlines())
        self._cache_expired()

    def apriori(self, min_support=0.01):
        names_by_sentence = list(self.names_by_sentence(drop_empty=True))
        itemsets, rule = apriori(names_by_sentence, min_support=min_support)
        return itemsets
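# Hypothetical usage sketch (not in the original snippet): the character names,
# alias, and sample text are illustrative. Names and aliases are added to the
# tokenizer's dictionary before cutting, and co_present() counts how often two
# characters appear in the same sentence.
mt = Movie_Tokenizer()
mt.add_name('孙悟空')
mt.add_alias('孙悟空', '猴王')
mt.add_name('唐僧')
mt.set_text('孙悟空保护唐僧。猴王打败了妖怪。')
print(mt.co_present())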
class ChineseLanguage(StopWordsFromFileMixIn):
    """Chinese language support module."""

    # Path to jieba dictionary(ies)
    __DICT_PATH = os.path.dirname(os.path.abspath(__file__))
    __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big')
    __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt')

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Jieba instance
        '__jieba',

        # Text -> sentence tokenizer for Chinese text
        '__chinese_sentence_tokenizer',

        # English language instance for tokenizing non-Chinese (e.g. English) text
        '__english_language',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )

        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )

        try:
            self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
            self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH))
        except Exception as ex:
            raise McLanguageException("Unable to initialize jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and the Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that Jieba is installed and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)

    @staticmethod
    def language_code() -> str:
        return "zh"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特,"
            "为后者的连续5个参议员任期划上句点。"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # Jieba's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Chinese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item)
                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Chinese sentence into words.

        Removes punctuation."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass

        return words
#!/usr/bin/env python3
#
# Jieba builds a dictionary cache on every load, which takes about 0.5 s, so
# here we prebuild such a cache
#

import os

from jieba import Tokenizer as JiebaTokenizer

if __name__ == '__main__':
    # Keep in sync with zh/__init__.py
    cache_file = '/var/tmp/jieba.cache'

    jieba = JiebaTokenizer()
    jieba.cache_file = '/var/tmp/jieba.cache'

    dict_base_dir = '/opt/mediacloud/src/common/python/mediawords/languages/zh/'
    dict_path = os.path.join(dict_base_dir, 'dict.txt.big')
    dict_user_path = os.path.join(dict_base_dir, 'userdict.txt')

    assert os.path.isfile(dict_path)
    assert os.path.isfile(dict_user_path)

    jieba.set_dictionary(dict_path)
    jieba.load_userdict(dict_user_path)
    jieba.initialize()

    assert os.path.isfile(cache_file)
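# Hypothetical companion sketch (not part of the original script): a worker
# process can reuse the prebuilt cache by pointing its own Tokenizer at the same
# cache_file before the first cut, so startup skips the ~0.5 s dictionary rebuild.
from jieba import Tokenizer as JiebaTokenizer

worker_jieba = JiebaTokenizer()
worker_jieba.cache_file = '/var/tmp/jieba.cache'
worker_jieba.set_dictionary('/opt/mediacloud/src/common/python/mediawords/languages/zh/dict.txt.big')
worker_jieba.load_userdict('/opt/mediacloud/src/common/python/mediawords/languages/zh/userdict.txt')
worker_jieba.initialize()  # loads from the cache if it is still up to date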
def __init__(self):
    Tokenizer.__init__(self)
    self.stop_words = self.load_stop_word('./jieba/stop.txt')
def __init__(self):
    self.jieba = Tokenizer()
def init_config(self, config):
    self.config = config
    self.dt = Tokenizer()
def __init__(self):
    self.jieba = Tokenizer()
    master_dict = os.path.join(os.path.dirname(__file__), '../data/cw_dict.txt')
    self.jieba.load_userdict(master_dict)