def fenci(one_string, discover_new_word=False):
    """
    Tokenize a Chinese string with jieba, keeping only all-Chinese tokens.

    one_string : input text; all whitespace is stripped before cutting
    discover_new_word : forwarded to jieba as HMM=...; when True, suspicious
        tokens (unknown, zero-frequency, longer than 3 chars, or rare 3-char
        words) are force-deleted from jieba's dictionary and re-cut
    Returns a list of Chinese tokens.
    """
    one_string = re.sub(r'\s+', '', one_string)  # drop all whitespace
    final_result = []
    temp_list = jieba.lcut(one_string, HMM=discover_new_word)
    if not discover_new_word:
        # HMM=False already yields fairly fine granularity; only re-cut
        # tokens longer than 3 characters.
        for word in temp_list:
            if not isAllZh(word):
                continue
            if len(word) > 3:
                jieba.del_word(word)
                final_result.extend(jieba.lcut(word, HMM=discover_new_word))
            else:
                final_result.append(word)
    else:
        for word in temp_list:
            if not isAllZh(word):
                continue
            # Hoisted: the original queried jieba.get_FREQ(word) up to five
            # times per token for the same answer.
            freq = jieba.get_FREQ(word)
            # Re-cut when the token is unknown (freq is None), has zero
            # frequency (multi-char), is longer than 3 chars, or is a rare
            # (< 100) 3-char word.  Equivalent to the original condition with
            # the redundant inner "freq == None" test removed.
            if (freq is None
                    or (len(word) > 1 and freq == 0)
                    or len(word) > 3
                    or (len(word) == 3 and freq < 100)):
                jieba.del_word(word)  # force jieba to forget it, then re-cut
                final_result.extend(jieba.lcut(word))
            else:
                final_result.append(word)
    return final_result
def frequency_tune():
    """Demonstrate jieba's suggest_freq(): tune word frequencies so each pair is split apart."""
    cases = [
        ('今天天气不错', ('今天', '天气')),
        ('如果放到post中将出错。', ('中', '将')),
        ('我们中出了一个叛徒', ('中', '出')),
    ]
    for sentence, pieces in cases:
        # Segmentation before tuning (HMM off for deterministic output).
        print('/'.join(jieba.cut(sentence, HMM=False)))
        joined = ''.join(pieces)
        # suggest_freq() adjusts the frequency of a single word so it can
        # (or cannot) be segmented out.
        print('%s before: %s, After: %s' % (joined, jieba.get_FREQ(joined), jieba.suggest_freq(pieces, True)))
        # Segmentation after tuning.
        print('/'.join(jieba.cut(sentence, HMM=False)))
        print('-' * 40)
def jieba_test():
    """
    Exercise jieba's user-dictionary features: load a dict file, add/delete
    words at runtime, segment mixed-language text, and tune word frequencies.
    """
    jieba.load_userdict("./dict/user_dict.txt")
    jieba.add_word('石墨烯')
    jieba.add_word('凱特琳')
    jieba.del_word('自定义词')
    sample = ("李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
              "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
              "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。")
    banner = "=" * 40
    print('/'.join(jieba.cut(sample)))
    print(banner)
    # POS-tagged segmentation of the same sample.
    for pair in pseg.cut(sample):
        print(pair.word, "/", pair.flag, ", ", end=' ')
    print("\n" + banner)
    print('/'.join(jieba.cut('easy_install is great')))
    print('/'.join(jieba.cut('python 的正则表达式是好用的')))
    print(banner)
    # Frequency tuning: make each word pair get segmented apart.
    cases = [
        ('今天天气不错', ('今天', '天气')),
        ('如果放到post中将出错。', ('中', '将')),
        ('我们中出了一个叛徒', ('中', '出')),
    ]
    for sentence, pieces in cases:
        print('/'.join(jieba.cut(sentence, HMM=False)))
        joined = ''.join(pieces)
        print('%s Before: %s, After: %s' % (joined, jieba.get_FREQ(joined), jieba.suggest_freq(pieces, True)))
        print('/'.join(jieba.cut(sentence, HMM=False)))
        print("-" * 40)
    return None
def write_dictionary_line(self, w, dict_file=None, withFlag=True):
    """
    Write one dictionary entry for a segmented word.

    w : word object with .word and .flag attributes (jieba pair)
    dict_file : open file object to write into; when None, the "word freq
        flag" line is printed to stdout instead
    withFlag : when writing to a file, True emits "word freq flag\n",
        False emits "word/ "

    Words jieba has no (non-zero) frequency for are skipped entirely.
    """
    n = jieba.get_FREQ(w.word)
    # Guard clause replaces the original nested ifs; also fixes the
    # "!= None" / "== True" anti-idioms.
    if not n:
        return
    if dict_file is None:
        print(w.word + " " + str(n) + " " + w.flag + "\n")
    elif withFlag:
        dict_file.write(w.word + " " + str(n) + " " + w.flag + "\n")
    else:
        dict_file.write(w.word + "/ ")
def main():
    """Demo of jieba.set_dictionary()/suggest_freq(): how dictionary choice changes segmentation."""
    # jieba.enable_parallel(4)
    sentence = "我们来到了北京饭店"

    def show_freqs():
        # Frequencies for the compound word and its two parts.
        print(jieba.get_FREQ('北京饭店'), jieba.get_FREQ('北京'), jieba.get_FREQ('饭店'))

    print(" ".join(jieba.cut(sentence)))
    show_freqs()
    print("=" * 10)
    # Custom dictionary that splits 北京 / 饭店 apart.
    jieba.set_dictionary('my_dict.txt')
    print(" ".join(jieba.cut('今天天气不错')))
    show_freqs()
    print(" ".join(jieba.cut(sentence)))
    print("=" * 10)
    print(" ".join(jieba.cut('藏宝阁太贵')))
    jieba.suggest_freq(('太', '贵'), True)
    print(" ".join(jieba.cut('藏宝阁太贵')))
def norm_document(self, document, test=True):
    """
    Normalize a converted document.

    document : newline-separated document text
    test : when True, re-segment each line and filter out uncommon single
        characters; when False, simply re-join the non-empty lines
    Returns the normalized document string.  In the test=True path a
    trailing newline is appended only for lines that contained at least
    one multi-character word.
    """
    norm_sentences = document.split('\n')
    sentences = ""
    if test:
        seg = Segmentation()  # project-local segmenter — TODO confirm interface
        words = seg.segment(norm_sentences)[0]
        for sentence in words:
            hasword = False
            for w in sentence:
                if w in self.SEP:
                    sentences += w
                elif len(w) == 1:
                    # Drop uncommon single characters: keep only those
                    # jieba knows with frequency > 10 (was "n != None").
                    n = jieba.get_FREQ(w)
                    if n is not None and n > 10:
                        sentences += w
                else:
                    # Multi-character word: always kept, and marks the
                    # line as containing a real word.
                    hasword = True
                    sentences += w
            if hasword:
                sentences += '\n'
    else:
        for sentence in norm_sentences:
            # Removed the pointless "s = sentence" alias from the original.
            if sentence:
                sentences += sentence + '\n'
    return sentences
def norm_document(self, document):
    """
    Normalize a converted document: re-segment each line, keep separator
    characters and multi-character words, drop uncommon single characters.

    document : newline-separated document text
    Returns the normalized text; a trailing newline is appended only for
    lines that contained at least one multi-character word.
    """
    norm_sentences = document.split('\n')
    sentences = ""
    seg = Segmentation()  # project-local segmenter — TODO confirm interface
    words = seg.segment(norm_sentences)[0]
    for sentence in words:
        hasword = False
        for w in sentence:
            if w in self.SEP:
                sentences += w
            elif len(w) == 1:
                # Drop uncommon single characters: keep only those jieba
                # knows with frequency > 10 (was the "n != None" anti-idiom).
                n = jieba.get_FREQ(w)
                if n is not None and n > 10:
                    sentences += w
            else:
                # Multi-character word: always kept, and marks the line
                # as containing a real word.
                hasword = True
                sentences += w
        if hasword:
            sentences += '\n'
    return sentences
def fun5():
    """
    User-dictionary demo: load userdict.txt, add/delete words at runtime,
    segment mixed-language text with POS tags, then tune word frequencies.
    """
    jieba.load_userdict("userdict.txt")
    jieba.add_word("石墨烯")
    jieba.add_word("凯特琳")
    jieba.del_word("自定义词")
    sample = (
        "李小福是创新办主任也是云计算方面的专家;什么是八一双鹿\n"
        "例如我输入一个带“韩玉鉴赏”的标题,在自定义词库中也增加了此词为N类\n"
        "「台中」正確應該不會被切開。mac上可以分出「石墨烯」;此時又可以分出來凱特琳了。"
    )
    print("/".join(jieba.cut(sample)))
    print("=" * 40)
    # POS-tagged segmentation of the same sample.
    for piece in pseg.cut(sample):
        print(piece.word, "/", piece.flag, ", ", end=" ")
    print("\n" + "=" * 48)
    print("/".join(jieba.cut("easy_install is great")))
    print("/".join(jieba.cut("python 的正则表达式是好用的")))
    print("=" * 40)
    # Frequency tuning: make each word pair get segmented apart.
    cases = [
        ('今天天气不错', ('今天', '天气')),
        ('如果放到post中将出错。', ('中', '将')),
        ('我们中出了一个叛徒', ('中', '出'))
    ]
    for sentence, pieces in cases:
        print("/".join(jieba.cut(sentence, HMM=False)))
        joined = ''.join(pieces)
        print("%s Before: %s, After: %s" % (joined, jieba.get_FREQ(joined), jieba.suggest_freq(pieces, True)))
        print("/".join(jieba.cut(sentence, HMM=False)))
        print("-" * 40)
def test_user_dict(self):
    """2. Custom user dictionary: load a dict file, add/delete words at runtime, and tune frequencies, logging the segmentation before/after each step."""
    topic = '添加自定义词典'
    split_line = self.get_split_line(topic)
    self.logger.debug(split_line)
    # Mixed simplified/traditional sample covering several tricky cases.
    test_sent = """李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n 例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n 台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。 """
    # Baseline segmentation with the default dictionary.
    words = jieba.cut(test_sent)
    self.logger.debug('{topic}_原始: {msg}'.format(topic=topic, msg='/'.join(words)))
    """ 调整词典:动态修改词典 """
    # Dynamically modify the dictionary, then load a custom dict file.
    userdict_path = os.path.dirname(__file__) + "/jieba_dict/dict.txt"
    jieba.add_word('石墨烯')
    jieba.add_word('凱特琳')
    jieba.del_word('自定义词')
    jieba.load_userdict(userdict_path)
    # Re-segment with the customized dictionary in effect.
    words = jieba.cut(test_sent)
    self.logger.debug('{topic}_自定义字典分词:{msg}'.format(topic=topic, msg='/'.join(words)))
    self.logger.debug('test split words' + "=" * 40)
    terms = jieba.cut('easy_install is great')
    self.logger.debug('{topic}_字典分词: {msg}'.format(topic=topic, msg='/'.join(terms)))
    # Delete a word and show that segmentation changes.
    jieba.del_word('easy_install')
    terms = jieba.cut('easy_install is great')
    self.logger.debug('{topic}_删除单词: {msg}'.format(topic=topic, msg='/'.join(terms)))
    terms = jieba.cut('python 的正则表达式是好用的')
    self.logger.debug('{topic}_单词: {msg}'.format(topic=topic, msg='/'.join(terms)))
    self.logger.debug('test frequency tune' + "=" * 40)
    # Lower the frequency of '中将' so '中'/'将' are split apart.
    word = '这里中将应该被切开'
    self.logger.debug('{topic}_调低词频之前: {msg}'.format(topic=topic, msg='/'.join(
        jieba.cut(word))))
    self.logger.debug('{topic}_调整词频: {msg}'.format(
        topic=topic,
        msg='before: {before}, after: {after}'.format(
            before=jieba.get_FREQ('中将'),
            after=jieba.suggest_freq(('中', '将'), True))))
    self.logger.debug('{topic}_调低词频之后: {msg}'.format(
        topic=topic,
        msg='/'.join(jieba.cut(word, HMM=False))))
    # Raise the frequency of '台中' so it is kept as one word.
    jieba.del_word('台中')
    word = '[台中]正确应该不会被切开'
    self.logger.debug('{topic}_调高词频之前: {msg}'.format(topic=topic, msg='/'.join(
        jieba.cut(word))))
    self.logger.debug('{topic}_调整词频: {msg}'.format(
        topic=topic,
        msg='before: {before}, after: {after}'.format(
            before=jieba.get_FREQ('台中'),
            after=jieba.suggest_freq('台中', True))))
    self.logger.debug('{topic}_调高词频之后: {msg}'.format(
        topic=topic,
        msg='/'.join(jieba.cut(word, HMM=False))))
def freq_tag(self, word):
    """Return (frequency, POS flag) for *word*; the flag is "" when jieba has no frequency for it."""
    frequency = jieba.get_FREQ(word)
    if frequency is None:
        return frequency, ""
    # Take the POS flag of the first segment (HMM off for determinism).
    first = pseg.lcut(word, HMM=False)[0]
    return frequency, first.flag
words = jieba.cut(test_sent) #print('/'.join(words)) #print("="*40) result = pseg.cut(test_sent) #for w in result: # print(w.word, "/", w.flag, ", ", end=' ') #print("\n" + "="*40) terms = jieba.cut('easy_install is great') #print('/'.join(terms)) terms = jieba.cut('python 的正则表达式是好用的') #print('/'.join(terms)) #print("="*40) # test frequency tune testlist = [ ('今天天气不错', ('今天', '天气')), ('如果放到post中将出错。', ('中', '将')), ('我们中出了一个叛徒', ('中', '出')), ] for sent, seg in testlist: # print('/'.join(jieba.cut(sent, HMM=False))) word = ''.join(seg) print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True))) print('/'.join(jieba.cut(sent, HMM=False))) # print("-"*40)
# Print the segmentation result (words/test_sent come from outside this chunk — TODO confirm).
print('/'.join(words))
print("=" * 40)
# POS-tagged segmentation of the same text, printed on one line.
result = pseg.cut(test_sent)
for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')
print("\n" + "=" * 40)
terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))
print("=" * 40)
# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]
for sent, seg in testlist:
    # Segmentation before tuning.
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    # suggest_freq() tunes the word frequency so the pair is split apart.
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    # Segmentation after tuning.
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-" * 40)
/「/台中/」/正確/應該/不會/被/切開/。/mac/上/可/分出/「/石墨烯/」/;/此時/又/可以/分出/來/凱特琳/了/。 """ # print("="*40) # # result = pseg.cut(test_sent) # # for w in result: # print(w.word, "/", w.flag, ", ", end=' ') # # print("\n" + "="*40) # # terms = jieba.cut('easy_install is great') # print('/'.join(terms)) # terms = jieba.cut('python 的正则表达式是好用的') # print('/'.join(terms)) # # print("="*40) # test frequency tune testlist = [ ('今天天气不错', ('今天', '天气')), ('如果放到post中将出错。', ('中', '将')), ('我们中出了一个叛徒', ('中', '出')), ] for sent, seg in testlist: print('/'.join(jieba.cut(sent, HMM=False))) word = ''.join(seg) print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True))) print('/'.join(jieba.cut(sent, HMM=False))) print("-"*40)
#!/usr/bin/python3 # coding: utf-8 import jieba ################################################################## ## suggest_freq(segment, tune=True) 可调节单个词语的词频, 使其能(或不能)被分出来 # suggest_freq() 每执行一次, 频率会增加 1 print(jieba.get_FREQ(('中', '将'))) # None print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False))) # 如果/放到/post/中将/出错/。 print(jieba.suggest_freq(('中', '将'), True)) # 494; 意思是 中将 两个字要分开 print(jieba.get_FREQ('中'), jieba.get_FREQ('将')) # 243191 122305 print(jieba.get_FREQ('中', '将')) # 243191; 输出的是 中 的词频 print(jieba.get_FREQ(('中', '将'))) # None, 没有意义 print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False))) # 如果/放到/post/中/将/出错/。 print(jieba.get_FREQ('台中')) print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False))) # 「/台/中/」/正确/应该/不会/被/切开 print(jieba.suggest_freq('台中', True)) # 69; 执行几次以后会增加..., print(jieba.get_FREQ('台中')) print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False))) # 「/台中/」/正确/应该/不会/被/切开 ################################################################## ## "台中"总是被切成"台 中"; P(台中) < P(台) x P(中), "台中"词频不够导致其成词概率较低 # 解决方法: 强制调高词频 # jieba.add_word('台中') 或者 jieba.suggest_freq('台中', True) ################################################################## ## test frequency tune testlist = [ ('今天天气不错', ('今天', '天气')), ('如果放到post中将出错。', ('中', '将')), ('我们中出了一个叛徒', ('中', '出')), ] for sent, seg in testlist: