def separate_xwlbotxt(file_str, level=1, coding="utf-8", add_words=None, del_words=None):
    """
    :param file_str: path to a txt file, or the text to parse directly
    :param level: result mode, 1 returns a list, 2 returns an iterator
    :param coding: encoding used when reading the file
    :param add_words: words to add to the jieba dictionary before cutting
    :param del_words: words to delete from the jieba dictionary before cutting
    :return: list or iterator of segments, or None for invalid input
    """
    parsing_str = ""
    if pathlib.Path(file_str).is_file():
        with open(file_str, "r", encoding=coding) as f:
            for line in f:
                parsing_str += line.strip()
    elif isinstance(file_str, str):
        parsing_str = file_str
    else:
        return None
    if add_words:
        for word in add_words:
            jieba.add_word(word)
    if del_words:
        for word in del_words:
            jieba.del_word(word)
    # level == 2 hands back jieba's lazy generator instead of a list
    return jieba.cut(parsing_str) if level == 2 else jieba.lcut(parsing_str)
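# Usage sketch for separate_xwlbotxt above (the sample text and custom words
# are invented; `import jieba` and `import pathlib` are assumed at module level):
demo_tokens = separate_xwlbotxt("新增采购玻尿酸一批", level=1, add_words=["玻尿酸"])
print(demo_tokens)  # a plain list of segments; level=2 would return jieba's generator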
def Initialization():
    jieba.suggest_freq('采购单', True)
    jieba.suggest_freq('采购提交', True)
    jieba.suggest_freq('玻尿酸', True)
    jieba.suggest_freq("新增采购", True)
    jieba.suggest_freq("水泥", True)
    jieba.del_word('采购提交水泥')
def better_cut(one_string, discover_new_word=False):
    one_string = re.sub(r'\s+', '', one_string)  # strip all whitespace
    final_result = []
    temp_list = jieba.lcut(one_string, HMM=discover_new_word)
    if not discover_new_word:
        # HMM=False already yields a noticeably finer granularity
        for word in temp_list:
            if not isAllZh(word):
                continue
            if len(word) > 4:
                jieba.del_word(word)  # jieba.add_word(word, freq=0) works too
                final_result.extend(jieba.lcut(word, HMM=discover_new_word))
            else:
                final_result.append(word)
    else:
        for word in temp_list:
            if not isAllZh(word):
                continue
            # if len(word) == 4:  # tune the threshold by word frequency
            #     print(word, jieba.get_FREQ(word))
            freq = jieba.get_FREQ(word)
            if (freq is None
                    or (len(word) > 1 and freq == 0)
                    or len(word) > 4
                    or (len(word) == 4 and freq < 100)):
                jieba.del_word(word)  # force a re-cut; jieba.add_word(word, freq=0) works too
                final_result.extend(jieba.lcut(word))
            else:
                final_result.append(word)
    return final_result
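# `isAllZh` is not defined in this snippet; a minimal sketch of the helper
# better_cut appears to assume (True only when every character falls in the
# CJK Unified Ideographs block):
def isAllZh(s):
    return bool(s) and all('\u4e00' <= ch <= '\u9fff' for ch in s)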
def addDictToJieba():
    # road list
    content = open('../data_crawl/finalRoads.txt', 'r').read().strip('\n')
    contentList = content.split('\n')
    print(len(contentList))
    # load the district dict
    districtNameList = grab('/home/yr/intellicredit/data/' + 'districtNameList0503')
    test_sent = ["李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿,上海市浦东区\n"]
    # cut before adding the dictionary
    # print(test_sent[0])
    words = jieba.cut(test_sent[0])
    # print('/'.join(words))
    # add the word dictionary to jieba
    for w in districtNameList[:] + contentList:
        # print(w)
        jieba.add_word(w)
    # add district names missing from the dictionary to jieba
    jieba.add_word('浦东区')
    jieba.add_word('浦东新区')
    jieba.del_word('上海市')
    jieba.add_word('兰城路')
    words = jieba.cut(test_sent[0])
def get_subword_list(big_word):
    # non-Chinese tokens and short words are returned unchanged (as a str, not a list)
    if not isZH(big_word[0]):
        return big_word
    if len(big_word) > 4:
        jieba.del_word(big_word)
        return jieba.lcut(big_word, HMM=False)
    else:
        return big_word
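# `isZH` is likewise external; a minimal single-character sketch under the
# same CJK-range assumption as isAllZh above:
def isZH(char):
    return '\u4e00' <= char <= '\u9fff'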
def __init__(self, cus_files=None):
    # validate before use; the original asserted only after iterating
    assert isinstance(cus_files, list), "cus_files must be a list of files"
    for cf in cus_files:
        for s in open(cf, 'r'):
            if s.strip():
                del_word(s.split()[0])
        load_userdict(cf)
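# Usage sketch for the loader above. jieba's userdict format is one entry per
# line, "word [freq] [POS tag]", which is why del_word(s.split()[0]) picks out
# just the word. The file name and contents here are invented.
import jieba

with open('cus.dict', 'w', encoding='utf-8') as f:
    f.write('石墨烯 10 n\n凱特琳 8 nr\n')
for s in open('cus.dict', 'r', encoding='utf-8'):
    if s.strip():
        jieba.del_word(s.split()[0])  # drop any stale entry first
jieba.load_userdict('cus.dict')       # then (re)load the custom dict
print(jieba.lcut('石墨烯是一种新材料'))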
def sentence_segmentation(self, sentence, entity1, entity2):
    jieba.add_word(entity1, freq=999999)
    jieba.add_word(entity2, freq=999999)
    seglist = list(jieba.cut(sentence, cut_all=False, HMM=False))
    jieba.del_word(entity1)
    jieba.del_word(entity2)
    return seglist
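# Standalone sketch of the pattern sentence_segmentation uses: a huge
# temporary frequency forces jieba to keep each entity as a single token,
# and del_word afterwards restores the global dictionary. The sentence and
# entity below are invented.
import jieba

entity = '李小福'
jieba.add_word(entity, freq=999999)
print(jieba.lcut('李小福是创新办主任', HMM=False))  # entity stays whole
jieba.del_word(entity)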
def test_parse_sentence(self):
    self.assertTrue(True)
    import jieba
    jieba.del_word('价格便宜')
    sent_txt = '价格便宜。'
    for sent in parser.parse2sents(sent_txt):
        print('sent: ', sent)
def load_data_and_labels(train_file_org='atec_nlp_sim_train.csv',
                         train_file_add='atec_nlp_sim_train_add.csv',
                         word_dict_file='word_dict',
                         userdict='userdict1.txt',
                         less_frequency=5):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files.
    # Load the jieba user dict; a word that occurs rarely and is not recognized
    # by the dictionary is not treated as a segmentation unit.
    jieba.load_userdict(userdict)
    word_f_less = torch.load('word_f_less')
    for word in word_f_less:
        jieba.del_word(word)
    train_data_org = list(open(train_file_org, "r", encoding='utf-8').readlines())
    train_data_add = list(open(train_file_add, "r", encoding='utf-8').readlines())
    train_data = train_data_org + train_data_add
    train_sentence = []
    for i in range(0, len(train_data)):
        if train_data[i]:
            seg_list1 = jieba.lcut(train_data[i].split('\t')[1], cut_all=False)
            seg_list2 = jieba.lcut(train_data[i].split('\t')[2], cut_all=False)
            train_sentence.append([[seg_list1, seg_list2],
                                   float(train_data[i].split('\t')[-1])])
        else:
            continue
        if i % 1000 == 0:
            print('transferring sentence to list', i, '/', len(train_data), 'done')
    # Load the preprocessed pretrained dictionary {word: vector}.
    word_dict = torch.load('word_dict')
    word_dict['padding'] = np.zeros((200), dtype=np.float64)
    word_dict['unknow'] = np.zeros((200), dtype=np.float64)
    # Build the dictionaries mapping between words and indices.
    word2ix = {word: ix for ix, word in enumerate(word_dict.keys())}
    ix2word = {ix: word for word, ix in word2ix.items()}
    new_word, word_f_less = find_new_word_and_frequency(train_sentence, word_dict, less_frequency)
    train_sentence = delete_new_word(train_sentence, word_f_less)
    word_matrix = make_word_matrix(word_dict).float()
    train_sentence, valid_sentence, test_sentence = get_train_and_valid(train_sentence)
    del train_data, train_data_add, train_data_org
    del train_file_add, train_file_org
    return train_sentence, valid_sentence, test_sentence, word_dict, word2ix, ix2word, word_matrix
def make_worldcloud(file_path):
    jieba.add_word('少女心')
    jieba.add_word('颜值')
    jieba.del_word('男主')
    jieba.del_word('女主')
    text_from_file_with_apath = io.open(file_path, 'r', encoding='UTF-8').read()
    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)  # accurate mode
    # wl_space_split = " ".join(wordlist_after_jieba)
    # print(wl_space_split)
    backgroud_Image = plt.imread('./dou1.jpg')
    # print('loaded jpg!')
    '''set wordcloud mode'''
    # stopwords = set(STOPWORDS)
    stopwords = stopwordslist('./stopwords.txt')  # path of the stopword file
    # stopwords.add("哈哈")  # add a single entry
    # stopwords.update([u'哈哈', u'就是', u'男主', u'女主', u'还是', u'电视剧'])  # add several at once
    outstr = ''
    for word in wordlist_after_jieba:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    print(outstr)
    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',  # background color
        mask=backgroud_Image,      # background image
        font_path='/Library/Fonts/华文仿宋.ttf',  # a Chinese font is required, otherwise CJK glyphs render as boxes
        max_words=300,             # maximum number of words shown
        # stopwords=stopwords,     # stopwords are already filtered above
        random_state=50,           # number of random color schemes
    )
    # wc.generate_from_text(wl_space_split)
    wc.generate_from_text(outstr)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # recolor words from the background image
    plt.imshow(wc)   # show the word cloud
    plt.axis('off')  # hide the axes
    plt.show()
    # directory of this module
    d = path.dirname(__file__)
    wc.to_file(path.join(d, "test.jpg"))
    print('word cloud!')
def init():
    del_list = ["订单", "取消"]  # words to remove
    suggest_list = ["取消订单", "单方面取消", "单方取消", "单方面违约",
                    "无故退款", "不发货", "按时发货"]  # words to add
    for word in del_list:
        jieba.del_word(word)
    for word in suggest_list:
        jieba.suggest_freq(word, True)
def shorter_chinese_cut(self, line):
    result = []
    for long_word in jieba.lcut(line, HMM=False):
        cp = ord(long_word[0])
        if self._is_chinese_char(cp) and len(long_word) > 3:
            jieba.del_word(long_word)
            result.extend(jieba.lcut(long_word))
        else:
            result.append(long_word)
    return result
def __init__(self, save_path):
    add_words_file = os.path.join(save_path, '_add_words.txt')
    del_words_file = os.path.join(save_path, '_del_words.txt')
    with open(add_words_file, "r", encoding='utf-8') as f:
        for word in f:
            word = word.strip().lstrip('\ufeff')
            jieba.add_word(word)
    with open(del_words_file, "r", encoding='utf-8') as f:
        for word in f:
            word = word.strip().lstrip('\ufeff')
            jieba.del_word(word)
def cut_text(text):
    stopwords = [
        '熟悉', '技术', '职位', '相关', '工作', '开发', '使用', '能力', '优先', '描述', '任职',
        '经验', '经验者', '具有', '具备', '以上', '善于', '一种', '以及', '一定', '进行', '能够', '我们'
    ]
    for stopword in stopwords:
        jieba.del_word(stopword)
    words = jieba.lcut(text)
    content = " ".join(words)
    return content
def __load_dict(self):
    """
    Load the yaml file holding the word sets configured for the jieba dictionary.
    """
    with open(CUT_WORD_FILE, encoding='utf-8') as f:
        word_dict = yaml.safe_load(f)
    for del_words in word_dict.get("del_word", []):
        jieba.del_word(del_words)
    for add_words in word_dict.get("add_word", []):
        jieba.add_word(add_words)
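# A sketch of the YAML layout __load_dict expects, inferred from the
# word_dict.get("del_word") / word_dict.get("add_word") calls above (the real
# CUT_WORD_FILE may carry additional keys):
#
#   add_word:
#     - 石墨烯
#     - 取消订单
#   del_word:
#     - 自定义词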
def key_name_split(input_key_name_cn):
    jieba.del_word('总金额')
    jieba.load_userdict('E:/cai_project/python/test_file/keyword_dict.txt')
    seq_list = jieba.cut(input_key_name_cn.replace('的', ''))
    split_word_list = list(seq_list)
    number_index = []
    for key in range(len(split_word_list)):
        if re.match('\\d+', split_word_list[key]):
            number_index.append(key)
        else:
            print(split_word_list[key])
    return '_'.join(split_word_list)
def setdict(self, cmd):
    arr = cmd.split(':')
    if len(arr) == 3:
        if arr[1] == 'add':
            jieba.add_word(arr[2])
            return '添加词典【' + arr[2] + '】成功!'
        elif arr[1] == 'del':
            jieba.del_word(arr[2])
            return '删除词典【' + arr[2] + '】成功!'
        else:
            return '错误的命令!'
    else:
        return '错误的命令!'
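# The handler above expects commands shaped "<prefix>:add:<word>" or
# "<prefix>:del:<word>"; e.g., on some instance `bot` (name invented):
#   bot.setdict('dict:add:石墨烯')    # returns the success message for adding 石墨烯
#   bot.setdict('dict:del:自定义词')  # returns the success message for deleting it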
def word_seg(s):  # tukai
    source = read_data_test('source.csv')
    for i in range(860):
        source[i] = source[i][0]
    add = ['汪建', '姜广策', '藻酸双酯钠'] + source
    delete = ['了', '的']
    for x in add:
        jieba.add_word(x)
    for x in delete:
        jieba.del_word(x)
    str_list = list(jieba.cut(s, cut_all=False, HMM=False))
    return str_list
def word_seg(s):
    add = ['汪建', '姜广策', '藻酸双酯钠', '盘口'] + source
    delete = ['了', '的']
    for x in add:
        jieba.add_word(x)
    for x in delete:
        jieba.del_word(x)
    str_list = list(jieba.cut(s, cut_all=False, HMM=False))
    s_list = [x for x in str_list if x not in stop_word]
    return s_list
async def reply(session: CommandSession):
    message = session.state.get('message').replace(' ', '')
    if message in ('电话问题', '电脑问题', '网络问题'):
        table = '58_robot_1'
        answer = await database_search(session, table, message)
    elif 'add-' in message:
        key = message.split('-')[1]
        if key != '':
            with open(base_dir + '/dict.txt', 'a') as k:
                k.write(key + ' ' + '10' + '\n')
            jieba.add_word(key)
            answer = '关键词已经激活'
        else:
            answer = '请按照此格式激活:add-关键词'
    elif 'del-' in message:
        dict_txt = []
        key = message.split('-')[1]
        if key != '':
            with open(base_dir + '/dict.txt', 'r') as fp:
                for i in fp.readlines():
                    if key in i:
                        jieba.del_word(key)
                    else:
                        dict_txt.append(i)
            with open(base_dir + '/dict.txt', 'w+') as fp:
                for i in dict_txt:
                    fp.write(i)
            answer = '关键词已经删除'
        else:
            answer = '请按照此格式删除:del-关键词'
    else:
        table = '58_robot_2'
        des = supplement
        answer = await database_search(session, table, message) + '\n' + des
    if EXPR_DONT_UNDERSTAND not in answer:
        print(answer)
        await session.send(answer)
    msg_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # write the chat record to the database
    try:
        cursor.execute(
            'insert into 58_robot_3 (im, time, question, answer) values ("QQ", "{}", "{}", "{}")'
            .format(msg_time, message, answer))
        db.commit()
    except Exception:
        db.rollback()
def cloud(text, itemid):
    # output image path
    fileroute = imageRoute + itemid + ".jpg"
    # jieba segmentation; del_word the noise terms first
    # (the original list had a missing comma, silently concatenating
    # '这里' and '我们', and duplicated entries; both are fixed here)
    removes = [
        '团购', '点评', '但是', '还是', '感觉', '就是', '而且', '没有', '还有', '不过', '知道',
        '什么', '比较', '这里', '我们', '以前', '一下', '一次', '不是', '时候', '老板', '里面',
        '很多', '朋友', '一个', '这家', '购点评', '有点', '觉得', '东西', '个人', '绵阳',
        '这个', '下次', '因为', '位置'
    ]
    for w in removes:
        jieba.del_word(w)
    words = jieba.lcut(text)
    cuted = ' '.join(words)
    # generate the word cloud
    fontpath = "SourceHanSansCN-Regular.otf"
    # backgroud_Image = plt.imread('cloud.jpg')
    wc = WordCloud(
        background_color='black',  # background color
        # mask=backgroud_Image,    # background image
        max_words=33,              # maximum number of words shown
        stopwords=STOPWORDS,       # stopword set
        font_path=fontpath,        # a CJK font is required to render Chinese
        max_font_size=300,         # largest font size
        min_font_size=50,          # smallest font size
        # random_state=42,         # number of random color schemes
        mode='RGBA',
        colormap='Spectral',
        collocations=False,        # avoid repeated words
        width=1590,
        height=1205,
        margin=20,                 # image size and word spacing
    )
    wc.generate(cuted)
    # image_colors = ImageColorGenerator(backgroud_Image)
    # wc.recolor(color_func=image_colors)
    fig, ax = plt.subplots()
    plt.figure(dpi=100)
    plt.imshow(wc, interpolation='catrom', vmax=1000)
    plt.axis('off')
    height, width = wc.height, wc.width  # at dpi=300, image size = height * width
    fig.set_size_inches(width / 100.0 / 3.0, height / 100.0 / 3.0)
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
    plt.subplots_adjust(top=1, bottom=0, left=0, right=1, hspace=2, wspace=2)
    plt.margins(0, 0)
    plt.savefig(fileroute)
    # plt.show()
    return fileroute
def jieba_test():
    jieba.load_userdict("./dict/user_dict.txt")
    jieba.add_word('石墨烯')
    jieba.add_word('凱特琳')
    jieba.del_word('自定义词')
    test_sent = ("李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
                 "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
                 "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。")
    words = jieba.cut(test_sent)
    print('/'.join(words))
    print("=" * 40)
    result = pseg.cut(test_sent)
    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')
    print("\n" + "=" * 40)
    terms = jieba.cut('easy_install is great')
    print('/'.join(terms))
    terms = jieba.cut('python 的正则表达式是好用的')
    print('/'.join(terms))
    print("=" * 40)
    # test frequency tuning
    testlist = [
        ('今天天气不错', ('今天', '天气')),
        ('如果放到post中将出错。', ('中', '将')),
        ('我们中出了一个叛徒', ('中', '出')),
    ]
    for sent, seg in testlist:
        print('/'.join(jieba.cut(sent, HMM=False)))
        word = ''.join(seg)
        print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
        print('/'.join(jieba.cut(sent, HMM=False)))
        print("-" * 40)
    return None
def __init__(self, del_file="jieba_del.txt", areacode_file="areacode.txt",
             area_json_file="china_city_area.json",
             baidu_ak="eXiTVqhBbnU7TeF3WrtGAvxXkIUXBRwg"):
    self.cities = dict()
    with open(os.path.join(self.current_path, del_file), encoding="utf8") as file:
        for word in file:
            jieba.del_word(word.strip())  # strip the trailing newline before deleting
    with open(os.path.join(self.current_path, areacode_file), encoding="utf8") as file:
        for line in file:
            city, areacode = line.replace("\r", "").replace("\n", "").split(",")
            self.cities[city] = areacode
    with open(os.path.join(self.current_path, area_json_file), encoding="utf8") as file:
        self.area = json.load(file)
    self.geocoder_url_template = ("http://api.map.baidu.com/geocoder/v2/"
                                  "?address={addr}&output=json&ak=" + baidu_ak)
    self.city_url_template = ("http://api.map.baidu.com/geocoder/v2/?ak=" + baidu_ak
                              + "&location={lat},{lng}&output=json")
    self.session = requests.session()
def getContent(table_name):
    db = dbHandle()
    jieba.del_word("电影")
    jieba.del_word("导演")
    jieba.del_word("没有")
    jieba.del_word("影片")
    jieba.del_word("看到")
    query_sql = "select comment_content from {0}".format(table_name)
    texts = db.query_db(query_sql)
    text = ''
    for t in texts:
        text += t[0]
    # result = jieba.analyse.textrank(text, topK=1000, withWeight=True)
    result = jieba.cut(text, cut_all=True)
    wl_space_split = " ".join(result)
    return wl_space_split
def del_sentiment_dict():
    for word in sentiment_emotion_dict:
        jieba.del_word(word)
    for word in sentiment_privative_dict:
        jieba.del_word(word)
    for word in sentiment_transitional_dict:
        jieba.del_word(word)
    for word in sentiment_degree_dict:
        jieba.del_word(word)
def process(test_path, result_path):
    jieba.load_userdict("./user.dict")
    jieba.del_word('元用')
    word2idx = load_word2idx("./word2idx.dict")
    source_inputs = []
    target_inputs = []
    all_lineno = []
    with open(test_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            lineno, sen1, sen2 = line.encode('utf-8').decode('utf-8-sig').strip().split('\t')
            idx1 = [word2idx.get(w, word2idx['<UNK>']) for w in jieba.cut(sen1) if w.strip()]
            idx2 = [word2idx.get(w, word2idx['<UNK>']) for w in jieba.cut(sen2) if w.strip()]
            all_lineno.append(lineno)

            def standard_length(idx):
                # truncate or pad with <UNK> to exactly seq_length tokens
                if len(idx) > seq_length:
                    idx = idx[:seq_length]
                else:
                    for i in range(len(idx), seq_length):
                        idx.append(word2idx['<UNK>'])
                return idx

            source_inputs.append(standard_length(idx1))
            target_inputs.append(standard_length(idx2))
    model = build_model(word2idx)
    model.load_weights('./atec.model')
    logits = model.predict([source_inputs, target_inputs])
    predicts = np.reshape(np.argmax(logits, axis=1), -1)
    with open(result_path, 'r', encoding='utf-8') as f:
        results = []
        for line in f:
            _, score = line.strip().split('\t')
            results.append(int(score))
    print(classification_report(results, predicts))
def label(file, wordlist, classifier):
    lf = []
    namelist4text = []
    # keyword(file)
    countr = 0
    countf = 0
    list4nn = discorverynewword.dis_new_word(file, wordlist)
    for inn in list4nn:
        pred = classifier.predict(tran.input_word(wordlist, file, inn))
        if int(pred[0]) != 0:
            namelist4text.append(inn)
    # namelist4text = list4nn
    for inn in namelist4text:
        jieba.add_word(inn, freq=1000, tag='nn')
    with open('./novel/' + file) as f:
        str4text = f.read()
    # split into blocks on blank lines (two consecutive newlines)
    ls4block = str4text.strip().split('\n\n')
    for block in ls4block:
        lb = dealblock(file, block, wordlist, namelist4text)
        lf.append(lb)
    # each entry of lf is one block; write the labeling result for the file
    with open('./result/label_' + file, 'w') as f:
        for each in lf:
            f.write('\n')
            if each != -1:
                countr += 1
                for r in each:
                    f.write(r + '\n')
            else:
                countf += 1
    for inn in namelist4text:
        jieba.del_word(inn)
        print('delete word ' + inn + ' successful!')
    print(str(countr + countf) + ' blocks complete!')
    return countr, countf
def Get_fenci(self):
    # jieba.add_word('石墨烯')  # dynamically add custom words
    jieba.add_word('凱特琳')
    jieba.del_word('自定义词')
    jieba.add_word("易风化")
    filtered_tokens = []
    test_sent = ""
    for i in range(1, 2):
        Data_path = path + "he" + ".txt"
        # read as text; the original joined a bytes object, which fails on Python 3
        test_sent = open(Data_path, encoding='utf-8').read()
        print(test_sent)
        filtered_tokens.append([each for each in jieba.cut(test_sent)])
    print('-' * 40)
    print(json.dumps(filtered_tokens))
    print("=" * 40)
def key_name_split(input_key_name_cn):
    get_oradata = GetOracleData()
    jieba.del_word('总金额')
    jieba.load_userdict('E:/cai_project/python/test_file/keyword_dict.txt')
    seq_list = jieba.cut(input_key_name_cn.replace('的', ''))
    split_word_list = list(seq_list)
    standard_word_cn = []
    business_word_list = []
    kpi_word_list = []
    aggr_word_list = []
    cycle_word_list = []
    for key in range(len(split_word_list)):
        if re.match('\\d+', split_word_list[key]):
            split_word_list.append(split_word_list[key])
        else:
            root_result = get_oradata.get_root_word(split_word_list[key])
            standard_word_cn.append(root_result['WORD_NAME_CN'])
            # a numeric token directly before a cycle modifier is folded into it
            if key > 0 and re.match('\\d+', split_word_list[key - 1]) \
                    and root_result['WORD_TYPE'] == '周期修饰词':
                root_result['WORD_NAME_EN'] = split_word_list[key - 1] + root_result['WORD_NAME_EN']
                root_result['WORD_NAME_CN'] = split_word_list[key - 1] + root_result['WORD_NAME_CN']
                root_result['WORD_NAME_EN_ABBR'] = split_word_list[key - 1] + root_result['WORD_NAME_EN_ABBR']
                cycle_word_list.append(root_result)
            elif root_result['WORD_TYPE'] == '周期修饰词':    # cycle modifier
                cycle_word_list.append(root_result)
            elif root_result["WORD_TYPE"] == '业务修饰词':   # business modifier
                business_word_list.append(root_result)
            elif root_result["WORD_TYPE"] == '指标修饰词':   # KPI modifier
                kpi_word_list.append(root_result)
            elif root_result["WORD_TYPE"] == '聚合修饰词':   # aggregation modifier
                aggr_word_list.append(root_result)
    get_oradata.conn_close()
    standard_word_list = business_word_list + kpi_word_list + aggr_word_list + cycle_word_list
    standard_word_en = [x['WORD_NAME_EN'] for x in standard_word_list]
    standard_word_en_abbr = [x['WORD_NAME_EN_ABBR'] for x in standard_word_list]
    # print('_'.join(standard_word_en))
    # print('_'.join(standard_word_en_abbr))
    return '_'.join(standard_word_en), '_'.join(standard_word_en_abbr)
def cleanup_data(data_path: str):
    """Clean up data to the desired format."""
    stopwords = get_stopwords(path=f'{data_path}/stopwords.txt')
    parse_strategy = None
    with open(f'{data_path}/dictionary', 'r', encoding='utf-8') as dictionary, \
            open(f'{data_path}/not_word', 'r', encoding='utf-8') as not_word:
        dictionary_lines = dictionary.readlines()
        not_word_lines = not_word.readlines()
        for dictionary_line in dictionary_lines:
            jieba.add_word(dictionary_line.strip())
        for not_word_line in not_word_lines:
            jieba.del_word(not_word_line.strip())
    with open(f'{data_path}/ant_train', 'r', encoding='utf-8') as ant_train, \
            open(f'{data_path}/ant_train_add', 'r', encoding='utf-8') as ant_train_add, \
            open(f'{data_path}/epidemic_dev.csv', 'r', encoding='utf-8') as epidemic_dev, \
            open(f'{data_path}/epidemic_train.csv', 'r', encoding='utf-8') as epidemic_train, \
            open(f'{data_path}/icqmc_train.txt', 'r', encoding='utf-8') as icqmc_train, \
            open(f'{data_path}/icqmc_dev.txt', 'r', encoding='utf-8') as icqmc_dev, \
            open(f'{data_path}/icqmc_test.txt', 'r', encoding='utf-8') as icqmc_test, \
            open(f'{data_path}/simtrain_to05sts.txt', encoding='utf-8') as simtrain:
        ant_train_lines = ant_train.readlines() + ant_train_add.readlines()
        epidemic_lines = epidemic_dev.readlines()[1:] + epidemic_train.readlines()[1:]
        icqmc_lines = icqmc_train.readlines()[1:] + icqmc_dev.readlines()[1:] + icqmc_test.readlines()[1:]
    sentences = []
    sentences += cleanup_corpus(epidemic_lines, ',', 2, 5, stopwords)
    sentences += cleanup_corpus(ant_train_lines, '\t', 1, 4, stopwords)
    sentences += cleanup_corpus(icqmc_lines, '\t', 0, 3, stopwords)
    with open('./preprocessed/data', 'a+', encoding='utf-8') as ant_file:
        for sentence in sentences:
            ant_file.write(f'{sentence}\n')
def analyze_suggestions_2():
    fontpath = 'SourceHanSansCN-Regular.otf'
    content = open('Suggestions.txt', 'r').read()
    # words to remove
    removes = ['最好', '考虑', '可以', '孩子', '不能', '不要', '希望', '主要', '离家近', '学生']
    for rm in removes:
        jieba.del_word(rm)
    # words to add (the original list had a missing comma, silently
    # concatenating '等车时间' and '固定线路'; fixed here)
    adds = [
        '安全第一', '确保安全', '运行时间太长', '减少时间', '减少换乘', '缩短时间', '等车时间',
        '固定线路', '固定班次', '准时', '时间不要太长', '步行距离短', '准点', '票价合理',
        '公交车站', '公交站台', '站点', '小区门口', '附近'
    ]
    for add in adds:
        jieba.add_word(add)
    words = jieba.cut(content, cut_all=False)  # accurate mode
    # print("Default Mode: " + "/ ".join(words))
    cuted_words = ' '.join(words)
    print(cuted_words)
    wc = WordCloud(
        font_path=fontpath,        # font
        background_color="white",  # background color
        max_words=120,             # maximum number of words shown
        max_font_size=500,         # largest font size
        min_font_size=20,          # smallest font size
        random_state=42,           # random seed
        collocations=True,
        width=1600,
        height=1200,
        margin=10,  # image size and word spacing; pair with plt.figure(dpi=...) scaling to take effect
    )
    wc.generate(cuted_words)
    wc.to_file('./2.jpg')
def del_word_dict(word):
    """Remove a word from the jieba dictionary."""
    jieba.del_word(word)
cuttest("长春市长春节讲话") cuttest("结婚的和尚未结婚的") cuttest("结合成分子时") cuttest("旅游和服务是最好的") cuttest("这件事情的确是我的错") cuttest("供大家参考指正") cuttest("哈尔滨政府公布塌桥原因") cuttest("我在机场入口处") cuttest("邢永臣摄影报道") cuttest("BP神经网络如何训练才能在分类时增加区分度?") cuttest("南京市长江大桥") cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究") cuttest('长春市长春药店') cuttest('邓颖超生前最喜欢的衣服') cuttest('胡锦涛是热爱世界和平的政治局常委') cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪') cuttest('一次性交多少钱') cuttest('两块五一套,三块八一斤,四块七一本,五块六一条') cuttest('小和尚留了一个像大和尚一样的和尚头') cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') cuttest('AT&T是一件不错的公司,给你发offer了吗?') cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') cuttest('枪杆子中出政权') cuttest('张三风同学走上了不归路') cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。') cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。') jieba.del_word('很赞') cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?')
# coding: utf-8
import jieba


def cuttest(sentence):
    seg_list = jieba.cut(sentence, cut_all=False, HMM=True)
    # cut_all=False is accurate mode (the original label said "full mode")
    print("Accurate mode: " + "/ ".join(seg_list))


cuttest('我需要廉租房')
cuttest('据说这位语言学家去参加神马学术会议了')
cuttest('小明硕士毕业于中国科学院计算所,后在日本京都大学深造')
cuttest('他来到了网易杭研大厦')
# jieba.add_word('湖南')
# jieba.add_word('长沙市')
jieba.del_word('湖南长沙市')
cuttest('湖南长沙市天心区')
# cuttest(u'自然语言处理')
'''
Created on 2015-05-11

@author: BFD474
'''
from __future__ import print_function, unicode_literals

import sys
import jieba
import jieba.posseg as pseg

sys.path.append("../")

jieba.load_userdict("userdict.txt")
jieba.add_word('石墨烯')
jieba.add_word('凱特琳')
jieba.del_word('自定义词')

test_sent = (
    "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
    "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
    "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
)
words = jieba.cut(test_sent)
print('/'.join(words))
print("=" * 40)
result = pseg.cut(test_sent)
for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')
# -*- coding:utf-8 -*-
# User: rudy
# Time: 2015/11/01
import MySQLdb
import jieba
import pandas as pd
from pandas import Series, DataFrame

conn = MySQLdb.connect(host="115.28.149.242", user="******", passwd="***",
                       db="test", charset="utf8")
cursor = conn.cursor()

jieba.load_userdict('./foobar.txt')
jieba.del_word('web')

sql = 'SELECT jd FROM lagou_source'
cursor.execute(sql)
result = cursor.fetchall()

# count case-folded segments of length >= 3
ci_arr = {}
for item in result:
    temp_arr = jieba.lcut(item[0])
    for key in temp_arr:
        if len(key) >= 3:
            temp = key.lower()
            if temp in ci_arr:
                ci_arr[temp] += 1
            else:
                ci_arr[temp] = 1
is_first = True
test_sent = (
    "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿,上海市浦东区\n",
    "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n",
    "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
)
print(test_sent[0])
words = jieba.cut(test_sent[0])
print('/'.join(words))
print("=" * 40)
# delete a word from the dictionary, then re-cut
jieba.del_word('云计算')
jieba.add_word('浦东区')
words = jieba.cut(test_sent[0])
print('/'.join(words))
"""
result = pseg.cut(test_sent)
for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')
print("\n" + "=" * 40)
terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')