def segment(self, cleaned_sentances: list, use_hmm: bool = False) -> list:
    jieba.enable_parallel(self.num_worker)
    cleaned_sentances = [
        ' '.join(jieba.lcut(i, HMM=use_hmm)) for i in cleaned_sentances
    ]
    jieba.disable_parallel()
    return cleaned_sentances
def main(data_dir, file_dict, surfix, dry_run_dict):
    encoder_path = '{}/{}_encoder_source.txt'.format(data_dir, surfix)
    decoder_path = '{}/{}_decoder_source.txt'.format(data_dir, surfix)
    source_sentences, target_sentences = merge_blanks(
        os.path.join(data_dir, file_dict['source']),
        os.path.join(data_dir, file_dict['target']))

    print('String Preprocessing')
    source_sentences = str_utils_en.text_cleaning(source_sentences)
    target_sentences = str_utils_ch.text_cleaning(target_sentences)
    print('Double check source={}, target={}'.format(len(source_sentences),
                                                     len(target_sentences)))

    print('Word segmentation')
    jieba.initialize()
    jieba.disable_parallel()
    with ProcessingPool(nodes=min(os.cpu_count(), 5)) as pool:
        source_sentences = pool.map(
            lambda x: [i.strip() for i in x.strip().lower().split(' ') if len(i) >= 1],
            source_sentences)
    with ProcessingPool(nodes=min(os.cpu_count(), 5)) as pool:
        target_sentences = pool.map(
            lambda x: [
                i.strip() for i in jieba.cut(x.strip(), cut_all=False)
                if len(i) >= 1
            ],
            target_sentences)
    print('Triple check source={}, target={}'.format(len(source_sentences),
                                                     len(target_sentences)))

    source_sentences, target_sentences = filter_sample(source_sentences,
                                                       target_sentences)
    print('Triple check source={}, target={}'.format(len(source_sentences),
                                                     len(target_sentences)))

    print('Writing pair into encoder and decoder source at {}'.format(data_dir))
    with open(encoder_path, 'w', encoding='utf-8') as fe, \
         open(decoder_path, 'w', encoding='utf-8') as fd:
        for encoder_source, decoder_source in zip(source_sentences,
                                                  target_sentences):
            fe.write(' '.join(encoder_source).lower())
            fe.write('\n')
            fd.write(' '.join(decoder_source).lower())
            fd.write('\n')

    # better sub tokenizer can be used to generate dictionary
    dump_dictionary(data_dir, source_sentences, prefix='source', debug=True,
                    dry_run=dry_run_dict)
    dump_dictionary(data_dir, target_sentences, prefix='target', debug=True,
                    dry_run=dry_run_dict)
def cut_word(sentence, parallel=False, processnum=2):
    if parallel:
        # Enable parallel segmentation; the argument is the number of worker
        # processes. Not supported on Windows.
        jieba.enable_parallel(processnum=processnum)
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
        # Disable parallel segmentation.
        jieba.disable_parallel()
    else:
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
    return word_list
def text_processing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Walk through every class folder.
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files.
        j = 1
        for file in files:
            if j > 100:  # Cap at 100 sample files per class to avoid exhausting memory; remove the cap to read everything.
                break
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            # The ubiquitous jieba Chinese word segmentation.
            jieba.enable_parallel(4)  # Enable parallel mode with 4 processes; not supported on Windows.
            word_cut = jieba.cut(raw, cut_all=False)  # Exact mode; returns an iterable generator.
            word_list = list(word_cut)  # Turn the generator into a list of unicode words.
            jieba.disable_parallel()  # Disable parallel mode.

            data_list.append(word_list)  # Training-set list.
            class_list.append(folder.decode('utf-8'))  # Class label.
            j += 1

    # Crude train/test split.
    data_class_list = zip(data_list, class_list)
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
    # This could also be done with scikit-learn:
    # train_data_list, test_data_list, train_class_list, test_class_list = sklearn.cross_validation.train_test_split(data_list, class_list, test_size=test_size)

    # Count word frequencies into all_words_dict.
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if all_words_dict.has_key(word):
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # Sort by frequency in descending order (sorted() needs a list).
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1],
                                  reverse=True)
    all_words_list = list(zip(*all_words_tuple_list)[0])

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
def MakeSentences(self):
    sentences = []
    files = self.getFilePathList()
    jieba.enable_parallel(8)
    for file in files:
        print(file)
        sentences += self.readFile(file)
    jieba.disable_parallel()
    return sentences
def DT(DT):
    results = session.query.filter(
        or_(session.documenttype == DT,
            session.region.like('%' + DT + '%'),
            session.court.like('%' + DT + '%'))).all()
    if not results:
        return render_template('worldcloudStatistics.html')

    # Collect the keywords of every result into one string.
    keyword = ""
    for res in results:
        keyword += str(res.keyword)

    # Word segmentation.
    fe = '|'.join(jieba.cut(keyword))
    santi_words = [x for x in jieba.cut(fe) if len(x) >= 0]
    jieba.disable_parallel()

    # Extract the most frequent keywords.
    c = Counter(santi_words).most_common(1000)
    keys = ""
    for word in c:
        if word[0].isdigit():
            continue  # Skip purely numeric tokens.
        else:
            keys += str(word)

    # Build the word cloud.
    font = 'app/static/HYQiHei-55J.ttf'  # Font path (HYQiHei here).
    color_mask = plt.imread(
        "app/static/keyword/china.jpg")  # Read the mask/template image.
    cloud = WordCloud(font_path=font,
                      background_color='white',
                      mask=color_mask,
                      max_words=200,
                      max_font_size=200,
                      width=3000,
                      height=3000,
                      random_state=42)  # Word-cloud parameters: font, mask, white background, word-count and font-size limits.
    # word_cloud = cloud.generate(fe)
    cloud.generate(fe)

    # Recolor the cloud based on the colors of the mask image.
    image_colors = ImageColorGenerator(color_mask)
    plt.imshow(cloud)
    # Hide the axes.
    plt.axis('off')
    # Draw the recolored word cloud.
    plt.figure()
    plt.imshow(cloud.recolor(color_func=image_colors))
    plt.axis('off')

    # Save the images.
    word_cloud2 = cloud.generate(str(keys))  # Generate the word-cloud data.
    # wcould = "分词词云_"
    wcould2 = "cloud"
    img = wcould2 + ".jpg"
    l = 'app/static/keyword/'
    word_cloud2.to_file(l + '/' + img)
    cloud.to_file(l + '/' + 'cloudword.png')
    return render_template('worldcloudStatistics.html', val1=time.time())
def get_cut_word_cixing(arg, parallel=False, num=1):
    if not parallel:
        s = arg
        res = psg.cut(s)
        return {x.word: x.flag for x in res}
    else:
        filename = arg
        s = open(filename).read()
        jieba.enable_parallel(parallel)
        res = psg.cut(s)
        jieba.disable_parallel()
        return {x.word: x.flag for x in res}
def _tokenize(self, sentence, cut_all, cut_for_search, HMM, enable):
    if enable[0]:
        print('use multiprocessing')
        jieba.enable_parallel(enable[1])
    else:
        jieba.disable_parallel()
    if not cut_for_search:
        sentence_temp = ' '.join(jieba.cut(sentence, cut_all, HMM))
        return sentence_temp
    else:
        sentence_temp = ' '.join(jieba.cut_for_search(sentence, HMM))
        return sentence_temp
def text_processing(folder_path, test_size=0.2):  # test_size controls the train/test split
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Traverse all class folders.
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files.
        files_number = 1
        for file in files:
            if files_number > 100:  # Avoid exhausting memory: only sample 100 files per class.
                break
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            jieba.enable_parallel(4)  # Parallel mode with 4 processes.
            word_cut = jieba.cut(raw, cut_all=False)  # Exact mode.
            word_list = list(word_cut)  # Turn the generator into a list; every word is unicode.
            jieba.disable_parallel()  # Close parallel mode.

            data_list.append(word_list)  # Training-set list.
            class_list.append(folder.decode('utf-8'))  # Class label.
            files_number += 1

    # Divide into train and test sets (sklearn could also be used here).
    data_class_list = zip(data_list, class_list)
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # Collect word frequencies into all_words_dict.
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if all_words_dict.has_key(word):
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # Sort words by frequency in descending order (sorted() needs a list).
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1],
                                  reverse=True)
    all_words_list = list(zip(*all_words_tuple_list)[0])

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
def DT(DT):
    Court = session.query.with_entities(session.court).distinct().all()
    Region = session.query.with_entities(session.region).distinct().all()
    document_Type = session.query.with_entities(session.document_Type).distinct().all()
    results = session.query.filter(DT == session.document_Type).all()
    if not results:
        return render_template('词云统计.html', court=Court, region=Region,
                               document_Type=document_Type)

    num = random.randint(0, 10000)
    H = '关键字/'
    char_txt = H + str(num) + ".txt"
    for row in results[0].keyword:
        fname1 = row[0]
        # print(fname1)
        m = open(char_txt, 'a')
        m.write(str(fname1))

    santi_text = open(char_txt, 'rb').read()
    fe = '|'.join(jieba.cut(santi_text))
    santi_words = [x for x in jieba.cut(fe) if len(x) >= 2]
    jieba.disable_parallel()

    c = Counter(santi_words).most_common(1000)
    f = open(char_txt, "w")
    word = ['']
    for word in c:
        if word[0].isdigit():
            del word
        else:
            f.write(str(word))
    f.close()
    m = open(char_txt, 'r').read()

    # Build the word cloud.
    font = 'HYQiHei-55J.ttf'  # Font path (HYQiHei here).
    color_mask = imread("photo.jpg")  # Read the mask/template image.
    cloud = WordCloud(font_path=font, background_color='white', mask=color_mask,
                      max_words=100, max_font_size=50, width=5000,
                      height=5000)  # Word-cloud parameters: font, mask, white background, at most 100 words, max font size 50.
    # word_cloud = cloud.generate(fe)
    word_cloud2 = cloud.generate(str(m))  # Generate the word-cloud data.
    # wcould = "分词词云_"
    # cy = wcould + str(num) + ".jpg"
    # word_cloud.to_file(cy)  # Save the word cloud as an image.
    # print("word cloud done...")
    wcould2 = "词云"
    img = wcould2 + ".jpg"
    l = 'static/keyword'
    word_cloud2.to_file(l + '/' + img)
    return render_template('词云统计.html', court=Court, region=Region,
                           document_Type=document_Type, val1=time.time())
def getOneSong(self, lyric, id):
    try:
        # Flip the status to "generating" first, to lock the row.
        self.dbManager.execute(
            "update rap_music163 set status = 2 where status = 1 and id = '"
            + str(id) + "'")

        # jieba word segmentation.
        print len(lyric)
        # Enable parallel mode:
        # jieba.enable_parallel(4)
        # Disable parallel mode:
        jieba.disable_parallel()
        words = [x for x in jieba.cut(lyric) if len(x) >= 2]
        jieba.disable_parallel()

        from collections import Counter
        count = Counter(words).most_common(20)
        print count
        for vo in count:
            word = vo[0]
            number = vo[1]
            # Increment the score of this word in the sorted set.
            self.r.zincrby(self.sortedSetKey, word, number)
        print self.r.zcard(self.sortedSetKey)

        # # Extract keywords:
        # tags = jieba.analyse.extract_tags(lyric, topK=3)
        # print u"keywords:"
        # print " ".join(tags)

        # For every word, decide whether to insert or update in the database; Redis is a better fit.
        self.dbManager.execute(
            "update rap_music163 set status = 3 where status = 2 and id = '"
            + str(id) + "'")
    except Exception as err:
        # Print the exception traceback.
        exstr = traceback.format_exc()
        print exstr
        c.Log('{} : {}'.format("Error 901", err))
def main():
    xgkData = [
        w.strip() for w in codecs.open('xgk_seg.txt', 'r',
                                       encoding='utf-8').readlines()
    ]
    model = gensim.models.KeyedVectors.load_word2vec_format('wiki_text.vector',
                                                            binary=False)
    # print(xgkData[0])
    jieba.enable_parallel(2)
    for index in range(len(xgkData)):
        words = xgkData[index]
        words = jieba.cut(words)
        words = list(set(words))
        wordvecs = getWordVecs(words, model)
        data_vecs = pd.DataFrame(wordvecs)
        data_vecs.to_csv('vec/xgk_vec_' + str(index) + '.csv', index=False)
    jieba.disable_parallel()
def _cut_words(self, fromCache=True):
    if fromCache:
        wordFrags = pkl_load("wordFrags.pkl")
    else:
        wordFragsList = list()
        with DataBase() as db:
            newsID, newsData = db.get_news()
        jieba.enable_parallel(4)
        for news in show_status(newsData, "cut words"):
            frags = jieba.cut(news, cut_all=False)
            words = [frag for frag in frags if (frag not in self.stopWords)
                     and (not frag.isspace() and (not frag.isdigit()))]
            wordFragsList.append(words)
        jieba.disable_parallel()
        wordFrags = dict(zip(newsID, wordFragsList))
        # Persist the newly built mapping (the original call passed only the
        # filename; pkl_dump(obj, path) is assumed here).
        pkl_dump(wordFrags, "wordFrags.pkl")
    return wordFrags
def text_processing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Walk through every class folder.
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files.
        for file in files:
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            raw = raw.strip()
            # jieba parallel mode.
            jieba.enable_parallel(4)
            word_list = jieba.lcut(raw)
            jieba.disable_parallel()
            data_list.append(word_list)
            class_list.append(folder)

    # Simple train/test split.
    data_class_list = list(zip(data_list, class_list))
    # Shuffle the list randomly.
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = list(zip(*train_list))
    test_data_list, test_class_list = list(zip(*test_list))

    # Count the frequency of every word.
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            all_words_dict.setdefault(word, 0)
            all_words_dict[word] += 1

    # Sort the words by frequency in descending order.
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1],
                                  reverse=True)
    all_words_list = list(list(zip(*all_words_tuple_list))[0])

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
def text_processing(folder_path, test_size=0.2):
    """Read every text file under folder_path, segment it with jieba, and
    split the result into training and test sets.

    :param folder_path: root directory, one sub-folder per class
    :param test_size: fraction of samples held out for testing
    :return: word list sorted by frequency, plus the train/test data and class lists
    """
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        for file in files:
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            # try:
            #     with codecs.open(os.path.join(new_folder_path, file), 'r', 'GB18030') as fp:
            #         raw = fp.read()
            # except UnicodeDecodeError:
            #     pass
            jieba.enable_parallel(2)
            word_cut = jieba.cut(raw, cut_all=False)
            word_list = list(word_cut)
            # print(word_list)
            jieba.disable_parallel()
            data_list.append(word_list)
            class_list.append(folder)

    train_data_list, test_data_list, train_class_list, test_class_list = train_test_split(
        data_list, class_list, test_size=test_size)

    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1],
                                  reverse=True)
    all_words_list = list(zip(*all_words_tuple_list))[0]

    # return all_words_list
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
def count_words(self, lwords):
    """Count word/POS occurrences.

    :param lwords: list of text contexts to segment
    :return: dwords, a Counter; key: 'word#flag', value: frequency
    """
    jieba.enable_parallel(10)  # Start multiple worker processes.
    word_flags = []  # {word#flag: freq}, {word: sex}
    for context in lwords:
        for sub in norm_seg(context):
            w = sub.word
            if self.oGWF.isGeneralWord(w.encode('utf-8')) or w.strip() == '':
                continue
            if len(w) >= int(self.word_length):
                key = '%s#%s' % (w, sub.flag)
                word_flags.append(key)
    logger.info('count is starting')
    jieba.disable_parallel()
    dwords = Counter(word_flags)
    return dwords
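A minimal usage sketch for count_words, assuming an instance `counter` of the enclosing class and an already-loaded list of documents (both names are illustrative); it relies only on the documented 'word#flag' key format of the returned Counter:

    docs = ["今天天气不错", "明天继续学习自然语言处理"]  # illustrative input
    dwords = counter.count_words(docs)
    for key, freq in dwords.most_common(10):
        word, flag = key.rsplit('#', 1)  # split the 'word#flag' key back apart
        print(word, flag, freq)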
def test_key_words_with_jieba(type='jieba'):
    with open(
            '/Users/zhaowei/Desktop/八爪鱼/yeyonglong_enterprise_name/names2.txt'
    ) as f:
        a = f.readline()
        print(a)
        key_words = []
        jieba.enable_parallel(2)
        if type == 'jieba':
            key_words = [x for x in jieba.cut(a) if len(x) > 1]
        elif type == 'jieba_fast':
            key_words = [x for x in jieba_fast.cut(a) if len(x) > 1]
        print(key_words)
        jieba.disable_parallel()
        # Get the most frequent words.
        num = 20
        most_words = Counter(key_words).most_common(num)
        print('高频词汇{}:{}'.format(str(num), most_words))
def cut_word(sentence, parallel=False, processnum=2):
    if parallel:
        # Enable parallel segmentation; the argument is the number of worker
        # processes. Not supported on Windows.
        jieba.enable_parallel(processnum=processnum)
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
        # Disable parallel segmentation.
        jieba.disable_parallel()
    else:
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)

    # Remove stop words.
    stopwords = [
        line.strip() for line in open(file='../resource/ChineseStopwords.txt',
                                      mode='r',
                                      encoding='UTF-8').readlines()
    ]
    new_word_list = []
    for word in word_list:
        if word not in stopwords:
            new_word_list.append(word)
    return new_word_list
def text_processing(folder_path, test_size=0.2):  # test_size=0.2: 80% training set, 20% test set
    folder_list = os.listdir(folder_path)  # All files and folders under this path.
    data_list = []   # All words.
    class_list = []  # Class labels; each text file belongs to one folder, so the folder name is used as the class.

    # Walk through every class folder.
    for folder in folder_list:
        # os.path.join concatenates the two parts, e.g. folder_path "." and folder "hello" give "./hello".
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files.
        j = 1
        for file in files:
            if j > 100:  # Cap at 100 sample files per class to avoid exhausting memory; remove the cap to read everything.
                break
            with open(os.path.join(new_folder_path, file), 'r') as f:
                raw = f.read()
            # Segment the content with jieba.
            jieba.enable_parallel(4)  # Enable parallel mode with 4 processes; not supported on Windows.
            # cut_all=True is full mode, cut_all=False is exact mode (the default).
            # Full mode would yield e.g. "清华大学 / 华大"; exact mode yields only "清华大学".
            word_cut = jieba.cut(raw, cut_all=False)
            # jieba.cut returns an iterable generator.
            word_list = list(word_cut)  # Turn the generator into a list of unicode words.
            jieba.disable_parallel()  # Disable parallel mode.

            data_list.append(word_list)
            class_list.append(folder.decode('utf-8'))  # Class label.
            j += 1

    # Crude train/test split.
    data_class_list = zip(data_list, class_list)
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
def jieba_config(userdict=None, config=None, wordlist=None, parallel=False, p=0):
    """
    Use load_userdict() to load your dictionary, or use add_word/del_word to
    add or delete the words in a word list.

    :param userdict: A list of dictionary file names
    :param config: 'A' or 'D', add_word or del_word
    :param wordlist: A list of specific words
    :param parallel: Whether to enable multiprocessing
    :param p: Process number
    :return: An error string, or None on success
    """
    if userdict:
        for file in userdict:
            load_userdict(file)
    if config == 'A':
        if wordlist:
            for word in wordlist:
                add_word(word)
        else:
            return 'Wordlist require'
    elif config == 'D':
        if wordlist:
            for word in wordlist:
                del_word(word)
        else:
            return 'Wordlist require'
    else:
        return 'Invalid config content'
    if parallel:
        if p >= 0:
            enable_parallel(p)
        else:
            return 'Invalid p content'
    elif not parallel:
        disable_parallel()
    else:
        return 'Invalid parallel content'
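A possible call of jieba_config, assuming it is imported alongside jieba's load_userdict/add_word/del_word/enable_parallel/disable_parallel helpers; the dictionary file name and the words are only illustrative:

    # Load a user dictionary, register two extra words, and turn on 4 worker processes (POSIX only).
    err = jieba_config(userdict=['user_dict.txt'], config='A',
                       wordlist=['自然语言处理', '词云'], parallel=True, p=4)
    if err:
        print(err)  # the function returns an error string when the arguments are inconsistent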
def process_text(self, test_size: float = 0.2):
    """Process the text in the corpus.

    Args:
        test_size (float): fraction of samples used for the test set

    Returns:
        sorted_words (List): words sorted by frequency, highest first
        train_words_list (List): training texts
        test_words_list (List): test texts
        train_class_list (List): training class labels
        test_class_list (List): test class labels
    """
    words_list, class_list = [], []
    for folder in os.listdir(self.folder_path):
        for text_file in os.listdir(self.folder_path / folder):
            file_path = self.folder_path / folder / text_file
            with open(file_path) as f:
                content = f.read()
            jieba.enable_parallel(4)  # Enable parallel segmentation.
            segs = jieba.lcut(content, cut_all=False)  # Exact-mode segmentation.
            jieba.disable_parallel()
            words_list.append(segs)
            class_list.append(folder)

    # Split into training and test sets.
    train_words_list, test_words_list, train_class_list, test_class_list = train_test_split(
        words_list, class_list, test_size=test_size, random_state=0)

    # Count word frequencies.
    word_count = Counter()
    for words in train_words_list:
        word_count.update(words)

    # Sort the words by frequency, highest first.
    sorted_words = sorted(word_count.keys(), key=lambda x: word_count[x],
                          reverse=True)
    return sorted_words, train_words_list, test_words_list, train_class_list, test_class_list
def parse_article(self, response):
    item = ArticleItem()
    for key in list(parser_config['all_spider'].keys()):
        try:
            item[key] = response.xpath(parser_config['all_spider'][key]).extract()[0].encode('utf-8') if len(
                response.xpath(parser_config['all_spider'][key]).extract()) > 0 else ''
        except:
            traceback.print_exc()
    for key in list(parser_config[self.name].keys()):
        try:
            item[key] = response.xpath(parser_config[self.name][key]).extract()[0].encode('utf-8') if len(
                response.xpath(parser_config[self.name][key]).extract()) > 0 else ''
        except:
            traceback.print_exc()

    # Used when the URL alone cannot tell whether this is an article page.
    if item['content_original'] == '':
        self.log('*** not article url for %s' % response._url.encode('utf-8'))
        return

    item['fromURL'] = response._url.encode('utf-8')
    item['creat_date'] = time.strftime("%Y/%m/%d %H:%M:%S")
    item['content_clear'] = del_html_attr(item['content_original']).encode('utf-8')
    item['lenth'] = len(item['content_clear'].replace(' ', ''))

    jieba.enable_parallel(20)
    cn_str = get_CN_str(item['content_clear'])
    words = [x.encode('utf-8') for x in jieba.cut_for_search(cn_str)]
    article_keywords = [x for x in words if len(x) >= len('标签')]
    article_descr = [x for x in words if len(x) >= len('分词短语')]
    article_note = [x for x in words if len(x) >= len('分词文章摘要')]
    jieba.disable_parallel()

    article_keywords = Counter(article_keywords).most_common(20)
    article_descr = Counter(article_descr).most_common(10)
    article_note = Counter(article_note).most_common(5)
    item['keywords_by_app'] = ','.join([c[0] for c in article_keywords])
    item['descr_by_app'] = ','.join([c[0] for c in article_descr])
    item['note_by_app'] = ','.join([c[0] for c in article_note])
    return item
import jieba
# import jieba.posseg as pseg
import re
from operator import itemgetter, attrgetter, methodcaller
import time

jieba.enable_parallel(4)  # Enable parallel segmentation; the argument is the number of concurrent processes.

content = open(
    'iter.txt',
    'rb').read()  # GuiZhou reports as the input, both finding and diagnosis.
start_time = time.time()
jieba.load_userdict("coronary_dict.txt")
words = jieba.lcut(content)
# words = pseg.lcut(content)  # The default is exact mode.
elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print(elapsed_time)
jieba.disable_parallel()  # Disable parallel segmentation.

# words_sort = sorted(words, key=attrgetter('flag'))
# words_set = sorted(set(words), key=attrgetter('flag'))
words_set = set(words)
for word in words_set:  # in words_all:
    m_number = re.search(r"(\d*\.\d+|\d+)+", word)
    if m_number is None:
        print(word)
        # print(word.word, word.flag)
def testcase():
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
    cuttest("雷猴回归人间。")
    cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
    cuttest("我需要廉租房")
    cuttest("永和服装饰品有限公司")
    cuttest("我爱北京天安门")
    cuttest("abc")
    cuttest("隐马尔可夫")
    cuttest("雷猴是个好网站")

if __name__ == "__main__":
    testcase()
    jieba.set_dictionary("foobar.txt")
    print "================================"
    testcase()

jieba word segmentation

1. Segmentation

jieba.cut takes three arguments: the string to segment; cut_all, which controls whether to use full mode; and HMM, which controls whether to use the HMM model.
jieba.cut_for_search takes two arguments: the string to segment and whether to use the HMM model. It is suited to building inverted indexes for search engines and segments at a finer granularity.
The string to segment may be a unicode/UTF-8 string or a GBK string. Note: passing a GBK string directly is not recommended, because it may be wrongly decoded as UTF-8.
jieba.cut and jieba.cut_for_search both return an iterable generator; loop over it with for to get each segmented word (unicode), or use jieba.lcut and jieba.lcut_for_search to get a list directly.
jieba.Tokenizer(dictionary=DEFAULT_DICT) creates a custom tokenizer, which allows using different dictionaries at the same time. jieba.dt is the default tokenizer, and all global segmentation functions are mappings of that tokenizer.

# encoding=utf-8
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # Full mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # Exact mode

seg_list = jieba.cut("他来到了网易杭研大厦")  # The default is exact mode
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # Search-engine mode
print(", ".join(seg_list))

2. Adding a custom dictionary

Loading a dictionary

Developers can provide their own custom dictionary to include words that are not in the jieba vocabulary. jieba can discover new words on its own, but adding new words yourself guarantees a higher accuracy.
Usage: jieba.load_userdict(file_name)  # file_name is a file-like object or the path of the custom dictionary
The dictionary format is the same as dict.txt: one word per line; each line has up to three space-separated fields whose order must not change: the word, its frequency (optional), and its part-of-speech tag (optional). If file_name is a path or a file opened in binary mode, the file must be UTF-8 encoded.
When the frequency is omitted, an automatically computed frequency that guarantees the word can be segmented out is used.

Adjusting the dictionary

Use add_word(word, freq=None, tag=None) and del_word(word) to modify the dictionary dynamically at runtime.
Use suggest_freq(segment, tune=True) to tune the frequency of a single word so that it can (or cannot) be segmented out.
Note: the automatically computed frequencies may be ineffective when the HMM new-word discovery feature is used.

>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
如果/放到/post/中将/出错/。
>>> jieba.suggest_freq(('中', '将'), True)
494
>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
如果/放到/post/中/将/出错/。
>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
「/台/中/」/正确/应该/不会/被/切开
>>> jieba.suggest_freq('台中', True)
69
>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
「/台中/」/正确/应该/不会/被/切开

3. Keyword extraction

TF-IDF based keyword extraction

import jieba.analyse

jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())
- sentence is the text to extract from
- topK is how many of the highest TF/IDF-weighted keywords to return, default 20
- withWeight controls whether the keyword weights are returned as well, default False
- allowPOS restricts the result to the given part-of-speech tags; the default is empty, i.e. no filtering
jieba.analyse.TFIDF(idf_path=None) creates a new TFIDF instance; idf_path is the IDF frequency file.

TextRank based keyword extraction

jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) can be used directly; the interface is the same, but note the default part-of-speech filter.
jieba.analyse.TextRank() creates a custom TextRank instance.
Algorithm paper: TextRank: Bringing Order into Texts
Basic idea:
- segment the text from which keywords are to be extracted
- build a graph from word co-occurrence within a fixed window (default 5, adjustable via the span attribute)
- compute PageRank over the graph nodes; note that the graph is undirected and weighted
(A short usage sketch follows the Tokenize section below.)

4. Part-of-speech tagging

jieba.posseg.POSTokenizer(tokenizer=None) creates a custom tagger; the tokenizer argument specifies the internal jieba.Tokenizer to use. jieba.posseg.dt is the default POS-tagging tokenizer.
It tags the part of speech of every word after segmentation, using a tag set compatible with ictclas.
Usage example:

>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for word, flag in words:
...     print('%s %s' % (word, flag))

5. Parallel segmentation

Principle: split the target text by line, distribute the lines to several Python processes to segment in parallel, then merge the results, which gives a considerable speed-up.
It is based on Python's built-in multiprocessing module and does not currently support Windows.
Usage:
jieba.enable_parallel(4)  # Enable parallel mode; the argument is the number of parallel processes
jieba.disable_parallel()  # Disable parallel mode
6. Tokenize: return the start and end position of each word in the original text

Note: the input only accepts unicode.

Default mode:

result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
    print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))

word 永和        start: 0    end:2
word 服装        start: 2    end:4
word 饰品        start: 4    end:6
word 有限公司    start: 6    end:10

Search mode:

result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
for tk in result:
    print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))

word 永和        start: 0    end:2
word 服装        start: 2    end:4
word 饰品        start: 4    end:6
word 有限        start: 6    end:8
word 公司        start: 8    end:10
word 有限公司    start: 6    end:10
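As referenced in the keyword-extraction section above, a minimal sketch of calling both extractors on one sentence; the sentence itself is made up, the two functions are the jieba.analyse calls documented above:

import jieba.analyse

text = "自然语言处理是人工智能的一个重要方向,分词是其中的基础步骤。"  # illustrative input
# TF-IDF based keywords, with weights
print(jieba.analyse.extract_tags(text, topK=5, withWeight=True))
# TextRank based keywords, keeping only place names, nouns, verbal nouns and verbs
print(jieba.analyse.textrank(text, topK=5, withWeight=False,
                             allowPOS=('ns', 'n', 'vn', 'v')))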
def disable_parallel():
    jieba.disable_parallel()
print '获取词性----------------------------'
import jieba.posseg as psg
# print [(x.word, x.flag) for x in psg.cut(s)]
for x in psg.cut(s):
    print x.word + " " + x.flag + ",",

print '\n只获取名词--------------------------'
# print [(x.word, x.flag) for x in psg.cut(s) if x.flag.startswith('n')]
for x in psg.cut(s):
    if x.flag.startswith('n'):
        print x.word + " " + x.flag + ",",
print ''

# Parallel segmentation.
# Enable parallel mode; the argument is the number of concurrent processes.
jieba.enable_parallel(5)
santi_text = open('./santi.txt').read()
print len(santi_text)
santi_words = [x for x in jieba.cut(santi_text) if len(x) >= 2]
# Disable parallel mode.
jieba.disable_parallel()

# Get the top-n most frequent words. Still using the full text of "The Three-Body Problem" above,
# the 20 most frequent words in the segmentation result can be obtained like this:
from collections import Counter
c = Counter(santi_words).most_common(20)
print type(c)
for each in c:
    print each[0] + u'' + str(each[1]) + ',',
print ''
print c
def fun4():
    # Disable parallel mode.
    jieba.disable_parallel()
    # Enable parallel mode.
    jieba.enable_parallel(4)
# ==========================================================
# jieba word segmentation
# parallel
jieba.enable_parallel(1)  # start: the number of parallel processes; here one is enough.
jieba.set_dictionary("dict_for_jieba.txt")  # set the dictionary path

trainData = []
for s in train_file["String"]:
    trainData.append("/".join(jieba.cut(s)))
print len(trainData)
print trainData[0]
jieba.disable_parallel()  # turn off the worker processes
# ============================================================
# TF-IDF: extract features.
# train_data = np.array(trainData["String"], dtype=np.float64)
train_target = np.array(train_file["Value"], dtype=np.float64)
# print train_target
# print train_data[1]
# print train_target[1]
#
# test_data = np.array(test_file["String"], dtype=np.float64)
# print test_data[0]
# =======================================================
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
...     print w.word, w.flag
...
我 r
爱 v
北京 ns
天安门 ns

Feature 5: parallel segmentation

Principle: split the target text by line, distribute the lines to several Python processes to segment in parallel, then merge the results, which gives a considerable speed-up.
It is based on Python's built-in multiprocessing module and does not currently support Windows.
Usage:
jieba.enable_parallel(4)  # Enable parallel mode; the argument is the number of parallel processes
jieba.disable_parallel()  # Disable parallel mode

Example:

import urllib2
import sys, time
import sys
sys.path.append("../../")
import jieba

jieba.enable_parallel(4)

url = sys.argv[1]
content = open(url, "rb").read()
t1 = time.time()
words = list(jieba.cut(content))
t2 = time.time()
tm_cost = t2 - t1
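The timing example above stops right after computing tm_cost; a plausible final step, reusing the same variables and the snippet's Python 2 style, is simply to report the throughput:

print 'speed %s bytes/second' % (len(content) / tm_cost)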
def text_processing(file_path):
    mentality_list_num = {"happy_love": 0, "sad_guilty": 0, "angry_hatred": 0,
                          "surprise_afraid": 0, "other_emotion": 0}
    positive_num = 0
    negative_num = 0
    neutral_num = 0
    # Number of comments in the archive that contain each feature word (keyword), used for the IDF computation.
    words_dict_in_total = {}
    # Total number of comments in the archive, used later in the IDF computation.
    comments_in_total = get_num_of_comments_in_total(file_path)
    # All comments in the archive, used in the IDF computation.
    comments = get_total_comments_list(file_path)

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read()

    # Normalise the data and split it into individual news items.
    lines = re.sub(r"\d+-\d+-\d+", "", lines)
    lines = lines.replace(' ', "")
    lines = lines.split("******")

    for line in lines:
        # Split off the comment section; everything below works on the comments only.
        line = line.split("**评论区**")
        try:
            # Strip the comment markers of the form "*评论*\d+:".
            line[1] = re.sub("\*\w+\*\d+:", "", line[1])

            # Segment the text with jieba:
            # enable parallel mode (argument = number of processes),
            # exact mode returns an iterable generator,
            # turn the generator into a list of unicode words,
            # then disable parallel mode.
            jieba.enable_parallel(4)
            word_cut = jieba.cut(line[1], cut_all=False)
            word_list = list(word_cut)
            jieba.disable_parallel()

            all_words_dict = {}
            mentality_words_dict = {}
            emotion_words_dict = {}
            # Filter the jieba output once, then count the words into all_words_dict.
            for word in word_list:
                if not word.isdigit() and word not in stopwords_set and 1 < len(word) < 5:
                    if word in all_words_dict:
                        all_words_dict[word] += 1
                    else:
                        all_words_dict[word] = 1

            # Total number of words, used to compute the TF term.
            word_num = len(word_list)
            for word in all_words_dict:
                if word not in words_dict_in_total:
                    count = 0
                    for comment in comments:
                        if word in comment:
                            count += 1
                    words_dict_in_total[word] = count
                # Mentality weight = TF * IDF (see the stand-alone sketch after this function):
                #   TF  -> keyword count in this news item / total word count in this news item
                #   IDF -> log(total comments in the archive / (comments containing the keyword + 1))
                mentality_words_dict[word] = all_words_dict[word] / word_num * math.log(
                    comments_in_total / (words_dict_in_total[word] + 1))

            # Sort from high to low to get the per-news mentality word list.
            mentality_words_list = sorted_words_dict(mentality_words_dict)
            # Keep the top 20 words as the feature-word list.
            mentality_feature_words = words_dict(mentality_words_list)

            # Use the feature words to classify the news item: the score quantifies the audience mentality,
            # i.e. add the TF-IDF value for words in the positive set and subtract it for words in the negative set.
            score = 0
            mentality_score_list = {"happy_love": 0, "sad_guilty": 0,
                                    "angry_hatred": 0, "surprise_afraid": 0}
            for unit in mentality_feature_words:
                if unit in test_positive_set:
                    score += mentality_words_dict[unit]
                if unit in test_negative_set:
                    score -= mentality_words_dict[unit]
            # Based on the score, count the news item as positive, neutral or negative.
            if score > 0:
                positive_num += 1
            elif score == 0:
                neutral_num += 1
            else:
                negative_num += 1

            # Use the feature words to quantify the audience emotion per category:
            # if a word belongs to an emotion set, add its TF-IDF value to that category.
            for unit in mentality_feature_words:
                if unit in happy_love_set:
                    mentality_score_list["happy_love"] += mentality_words_dict[unit]
                if unit in sad_guilty_set:
                    mentality_score_list["sad_guilty"] += mentality_words_dict[unit]
                if unit in angry_hatred_set:
                    mentality_score_list["angry_hatred"] += mentality_words_dict[unit]
                if unit in surprise_afraid_set:
                    mentality_score_list["surprise_afraid"] += mentality_words_dict[unit]

            # If every entry in mentality_score_list is 0, count the fifth category, other_emotion.
            if (mentality_score_list["happy_love"] == mentality_score_list["sad_guilty"]
                    == mentality_score_list["angry_hatred"]
                    == mentality_score_list["surprise_afraid"] == 0):
                mentality_list_num["other_emotion"] += 1
            # Otherwise find the largest entry and increment its counter; ties share the increment equally
            # (two tied categories each get 1/2, and so on).
            else:
                number = 0
                top_item_list = []
                for item in mentality_score_list:
                    if mentality_score_list[item] == mentality_score_list[
                            max(mentality_score_list, key=mentality_score_list.get)]:
                        number += 1
                        top_item_list.append(item)
                for item in mentality_score_list:
                    if item in top_item_list:
                        mentality_list_num[item] += 1 / number
        except:
            pass
    return positive_num, neutral_num, negative_num, mentality_list_num
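The weight assigned in the loop above follows the TF-IDF scheme spelled out in the comments: TF is the keyword count in one news item divided by that item's total word count, and IDF is log(total comments in the archive / (comments containing the keyword + 1)). A stand-alone sketch of just that formula, with illustrative names and numbers:

import math

def tf_idf_weight(count_in_news, words_in_news, comments_in_total, comments_with_word):
    # TF: keyword occurrences in this news item / total words in this news item
    tf = count_in_news / words_in_news
    # IDF: log(total comments in the archive / (comments containing the keyword + 1))
    idf = math.log(comments_in_total / (comments_with_word + 1))
    return tf * idf

# e.g. a word seen 3 times in a 200-word news item, in an archive of 500 comments
# of which 24 mention the word: 3/200 * log(500/25) = 0.015 * 2.9957... ≈ 0.0449
print(tf_idf_weight(3, 200, 500, 24))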
def main():
    # Basic segmentation functions.
    segs = jieba.cut('我在学习自然语言处理')  # Exact mode
    print(list(segs))
    segs = jieba.cut('我在学习自然语言处理', cut_all=True)  # Full mode
    print(list(segs))
    segs = jieba.cut_for_search(  # Search-engine mode
        '小明硕士毕业于中国科学院计算所,后在哈佛大学深造')
    print(list(segs))
    segs = jieba.lcut('小明硕士毕业于中国科学院计算所,后在哈佛大学深造')  # lcut returns a list
    print(segs)
    print(jieba.lcut('如果放到旧字典中将出错。'))
    jieba.suggest_freq(('中', '将'), True)  # Tune the word frequency so that this split can be produced
    print(jieba.lcut('如果放到旧字典中将出错。'))
    print('-' * 100)

    # TF-IDF keyword extraction.
    root_path = Path('/media/bnu/data/nlp-practice/jieba-tutorials')
    with open(root_path / 'NBA.txt') as f:
        lines = f.read()
    tags = jieba.analyse.extract_tags(lines, topK=20, withWeight=False, allowPOS=())
    print(tags)
    with open(root_path / '西游记.txt') as f:
        lines = f.read()
    tags = jieba.analyse.extract_tags(lines, topK=20, withWeight=False, allowPOS=())
    print(tags)
    print('-' * 100)

    # TextRank keyword extraction.
    with open(root_path / 'NBA.txt') as f:
        lines = f.read()
    tags = jieba.analyse.textrank(lines, topK=20, withWeight=False, allowPOS=('ns', 'n'))
    print(tags)
    with open(root_path / '西游记.txt') as f:
        lines = f.read()
    tags = jieba.analyse.textrank(lines, topK=20, withWeight=False,
                                  allowPOS=('ns', 'n', 'vn', 'v'))
    print(tags)
    print('-' * 100)

    # Part-of-speech tagging.
    pseg = jieba.posseg.cut('我爱自然语言处理')
    for word, pos in pseg:
        print(word, pos)
    print('-' * 100)

    # Parallel segmentation.
    jieba.enable_parallel(4)
    with open(root_path / '西游记.txt') as f:
        lines = f.read()
    t1 = time.time()
    seg = list(jieba.cut(lines))
    t2 = time.time()
    print('Parallel Speed {} bytes/sec'.format(len(lines) / (t2 - t1)))

    jieba.disable_parallel()
    with open(root_path / '西游记.txt') as f:
        lines = f.read()
    t1 = time.time()
    segs = list(jieba.cut(lines))
    t2 = time.time()
    print('Non-Parallel Speed {} bytes/sec'.format(len(lines) / (t2 - t1)))
    print('-' * 100)

    # Start and end position of each word in the original text.
    tokens = jieba.tokenize('自然语言处理非常有用')  # Default mode
    for token in tokens:
        print('{}\t\t start: {} \t\t end: {}'.format(token[0], token[1], token[2]))
    tokens = jieba.tokenize('自然语言处理非常有用', mode='search')  # Search mode
    print('-' * 100)
    for token in tokens:
        print('{}\t\t start: {} \t\t end: {}'.format(token[0], token[1], token[2]))
def get_file_cut_word_parallel(filename, parallel=2):
    file_text = open(filename).read()
    jieba.enable_parallel(parallel)
    file_words = [x for x in jieba.cut(file_text) if len(x) >= 2]
    jieba.disable_parallel()
    return file_words
def text_processing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Walk through every class folder.
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files.
        j = 1
        for file in files:
            if j > 100:  # Cap at 100 sample files per class to avoid exhausting memory; remove the cap to read everything.
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as fp:
                raw = fp.read()
            # The ubiquitous jieba Chinese word segmentation.
            # jieba.enable_parallel(4)  # Enable parallel mode with 4 processes; not supported on Windows.
            # (Enabling it here raised NotImplementedError: jieba: parallel mode only supports posix system.)
            word_cut = jieba.cut(raw, cut_all=False)  # Exact mode; returns an iterable generator.
            word_list = list(word_cut)  # Turn the generator into a list of unicode words.
            jieba.disable_parallel()  # Disable parallel mode.

            data_list.append(word_list)  # Training-set list.
            # class_list.append(folder.decode('utf-8')) raises
            # AttributeError: 'str' object has no attribute 'decode' in Python 3.
            class_list.append(folder)  # Class label.
            j += 1

    # Crude train/test split.
    data_class_list = zip(data_list, class_list)
    # print(data_class_list)
    # In Python 3 a zip object has no len(), so it must be turned into a list first:
    data_class_list = list(data_class_list)
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
    # This could also be done with scikit-learn:
    # train_data_list, test_data_list, train_class_list, test_class_list = sklearn.cross_validation.train_test_split(data_list, class_list, test_size=test_size)

    # Count word frequencies into all_words_dict.
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            # all_words_dict.has_key(word) raises
            # AttributeError: 'dict' object has no attribute 'has_key';
            # has_key existed in Python 2 and was removed in Python 3,
            # so `if dict.has_key(word):` becomes `if word in dict:`.
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # Sort by frequency in descending order (sorted() needs a list).
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1],
                                  reverse=True)
    # all_words_list = zip(*all_words_tuple_list)[0] raises
    # TypeError: 'zip' object is not subscriptable, because in Python 3 the zip
    # object must be wrapped in list() before indexing:
    all_words_list = list(zip(*all_words_tuple_list))[0]

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list