def __input_data(sentence1, sentence2, dtype="word", input_length=20):
    data_left_sentence = []
    data_right_sentence = []
    for s1, s2 in zip(sentence1, sentence2):
        if dtype == "word":
            # consecutive "*" characters in a sentence stand for a number
            star = re.compile(r"\*+")
            data_left_sentence.append([
                word2index[word]
                for word in list(jieba.cut(star.sub("1", s1)))
                if word in word2index
            ])
            data_right_sentence.append([
                word2index[word]
                for word in list(jieba.cut(star.sub("1", s2)))
                if word in word2index
            ])
        if dtype == "char":
            data_left_sentence.append(
                [char2index[char] for char in s1 if char in char2index])
            data_right_sentence.append(
                [char2index[char] for char in s2 if char in char2index])
    # pad sentences in the corpus to a uniform length
    data_left_sentence = pad_sequences(data_left_sentence, maxlen=input_length)
    data_right_sentence = pad_sequences(data_right_sentence, maxlen=input_length)
    return [data_left_sentence, data_right_sentence]
def __iter__(self):
    with open(model_dir + "atec_nlp_sim_train.csv", "r", encoding="utf8") as atec:
        for line in atec:
            lineno, s1, s2, label = line.strip().split("\t")
            yield list(jieba.cut(s1)) + list(jieba.cut(s2))
    with open("resources/wiki_corpus/wiki.csv", "r", encoding="utf8") as wiki:
        for line in wiki:
            title, doc = line.strip().split("|")
            for sentence in doc.split("#"):
                if len(sentence) > 0:
                    # keep only tokens whose first character is a CJK ideograph
                    yield [
                        word for word in list(jieba.cut(sentence))
                        if word and 0x4E00 <= ord(word[0]) <= 0x9FA5
                    ]
def __load_data(dtype="word", input_length=20, w2v_length=VECTOR_LENGTH):
    filename = os.path.join(MODEL_DIR, "%s_%d_%d" % (dtype, input_length, w2v_length))
    if os.path.exists(filename):
        return pd.read_pickle(filename)

    data_left_sentence = []
    data_right_sentence = []
    labels = []
    for line in open(ANT_NLP_FILE_PATH, "r", encoding="utf8"):
        line_number, sentence1, sentence2, label = line.strip().split("\t")
        # consecutive "*" characters in a sentence stand for a number
        star = re.compile(r"\*+")
        sentence1 = remove_punctuation(star.sub("1", sentence1))
        sentence2 = remove_punctuation(star.sub("1", sentence2))
        if dtype == "word":
            data_left_sentence.append([
                word2index[word] for word in list(jieba.cut(sentence1))
                if word in word2index
            ])
            data_right_sentence.append([
                word2index[word] for word in list(jieba.cut(sentence2))
                if word in word2index
            ])
        if dtype == "char":
            data_left_sentence.append([
                char2index[char] for char in sentence1 if char in char2index
            ])
            data_right_sentence.append([
                char2index[char] for char in sentence2 if char in char2index
            ])
        labels.append(int(label))

    logging.info('length of featured sentence is %d', len(data_left_sentence))

    # pad sentences in the corpus to a uniform length
    data_left_sentence = pad_sequences(data_left_sentence, maxlen=input_length)
    data_right_sentence = pad_sequences(data_right_sentence, maxlen=input_length)
    labels = np.array(labels)

    pd.to_pickle((data_left_sentence, data_right_sentence, labels), filename)
    return (data_left_sentence, data_right_sentence, labels)
def load_data_and_labels_multiclass(sql, stops_words):
    """
    Loads data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    engine = create_engine(
        "mysql+pymysql://***:***@***.*.*.*:3306/FlatWhite?charset=utf8",
        encoding='utf-8')

    # Load stop words
    with codecs.open(stops_words, "r", "utf-8") as file:
        stops_words = [line.strip() for line in file.readlines()]

    # Load data from files
    data = pd.read_sql(sql, con=engine)
    x_corpus = data['corpus'].tolist()

    # Map the actual labels to one-hot labels
    labels = sorted(set(data['label'].tolist()))
    one_hot = np.eye(len(labels), dtype=int)
    label_dict = dict(zip(labels, one_hot))

    x_raw = [[item for item in jieba.cut(s) if item not in stops_words]
             for s in x_corpus]
    y_raw = data['label'].map(lambda s: label_dict[s]).tolist()
    return x_raw, y_raw, data
def filter_alphabet(one_dict):
    """
    Filter out alphabetic entries, which are mostly gene sequences and the like.
    :param one_dict:
    :return:
    """
    one_pattern = r'[a-zA-Z]{3,}'
    pattern = re.compile(one_pattern)
    for key in list(one_dict):
        max_char = key.split("-")[-1]  # take the largest (last) character of the key
        if max_char.isalpha():
            # only process keys ending in a letter; garbled entries such as "??????adfadsf" are skipped
            re_res = pattern.findall(one_dict[key])
            re_res_length = len(re_res)
            seg_res = jieba.cut(one_dict[key].strip(), cut_all=True)
            seg_res_list = [i for i in list(seg_res) if i != '']
            seg_res_list_length = len(seg_res_list)
            if seg_res_list_length == 0:
                continue
            elif seg_res_list_length * 0.5 <= re_res_length:
                one_dict.pop(key)
    return one_dict
def count():  # count word frequencies
    with open(r'C:\Users\Administrator\Desktop\python计字频\斗破苍穹.txt', 'r') as f:
        text = f.read()
    words = jieba.cut(text)
    word = [x for x in words if len(x) > 1]
    counter = Counter(word)  # renamed to avoid shadowing the built-ins str and dict
    print(counter)
def eval(infile, ime):
    tot = 0
    corr = 0
    tot_sen = 0
    corr_sen = 0
    corpus = []
    with open(infile, "r", encoding="utf-8") as inf:
        for line in tqdm(inf.readlines()):
            line = punc.sub('\n', line)
            line = line.split('\n')
            for item in line:
                if len(item):
                    corpus.append(item)
    tot_sen = len(corpus)
    for item in tqdm(corpus):
        pinyin = []
        item = filtrate.sub('', item)
        charlist = jieba_fast.cut(item)
        for word in charlist:
            pinyin.extend(list(lazy_pinyin(word)))
        try:
            res = ime.predictio(' '.join(pinyin))
            dis = Levenshtein.distance(res, item)
            tot += len(item)
            corr += (len(item) - dis)
            if dis == 0:
                corr_sen += 1
        except:
            pass
    print("Prediction precision: (word)%f%%, (sentence)%f%%" %
          ((corr * 100 / tot), (corr_sen * 100 / tot_sen)))
    return corr * 100 / tot
def word_cut(self, sentences):
    if self.language == 'ch':
        func = lambda line: [i.strip() for i in jieba.cut(line, cut_all=False)]
    else:
        func = lambda line: line.split(" ")
    ## TODO: remove stop words or mark stop words
    t0 = time.time()
    word_cut = []
    for line in tqdm(sentences):
        try:
            words = func(line)
            if self.language == 'ch':
                words = [
                    i for i in words
                    if (not i.isdigit()) and (i not in self.stop_words)
                ]
            else:
                words = [
                    i for i in words
                    if (not i.isdigit()) and (i not in self.stop_words) and (len(i) > 1)
                ]
            if len(words) > 1:
                word_cut.append(words)
        except Exception as e:
            print(line)
            print(e)
            continue
    print('Single Process time {:.0f}'.format(time.time() - t0))
    return word_cut
def pre_process(df, train_mode=True):
    tokenize = lambda s: list(jieba.cut(star.sub("X", s)))
    df["words1"] = df["sent1"].apply(tokenize)
    df["words2"] = df["sent2"].apply(tokenize)
    if train_mode:
        df.to_csv(clean_path, sep="\t", index=False, encoding="utf8")
    return df
def make_segment_file(): print("seement file start") jieba.suggest_freq('沙瑞金', True) jieba.suggest_freq('田国富', True) jieba.suggest_freq('高育良', True) jieba.suggest_freq('侯亮平', True) jieba.suggest_freq('钟小艾', True) jieba.suggest_freq('陈岩石', True) jieba.suggest_freq('欧阳菁', True) jieba.suggest_freq('易学习', True) jieba.suggest_freq('王大路', True) jieba.suggest_freq('蔡成功', True) jieba.suggest_freq('孙连城', True) jieba.suggest_freq('季昌明', True) jieba.suggest_freq('丁义珍', True) jieba.suggest_freq('郑西坡', True) jieba.suggest_freq('赵东来', True) jieba.suggest_freq('高小琴', True) jieba.suggest_freq('赵瑞龙', True) jieba.suggest_freq('林华华', True) jieba.suggest_freq('陆亦可', True) jieba.suggest_freq('刘新建', True) jieba.suggest_freq('刘庆祝', True) with open("./in_the_name_of_people.txt") as f: document = f.read() d_cut = jieba.cut(document) res = " ".join(d_cut) with open("./segment_doc.txt", "w") as f: f.write(res) print("segment file ok")
def make_cut(model_file):
    with open(model_file, "r", encoding="utf-8") as f:
        doc = f.read()
    d_cut = " ".join(jieba.cut(doc))
    with open("wiki_cut.txt", "w", encoding="utf-8") as f:
        f.write(d_cut)
    return d_cut
def testcut(testD, stopword, dictionary, similarity):
    s_id, t_id = [], []
    for i in testD.index:
        text = testD.loc[i].values[0].strip()
        text = re.sub('[\"*\【\】\[\]\s*]', '', text)  # strip special symbols
        text = re.sub('\([a-zA-Z]+://[^\s]*\)', '', text)  # strip URLs
        text = re.sub('\d+\.*\d*', '', text)  # strip numbers
        text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", '', text)  # strip punctuation
        temp = list(jieba.cut(text, HMM=True))
        word_list = temp
        # optional stop-word filtering (disabled in the original):
        # word_list = [word for word in temp if word not in stopword]
        test_corpus = dictionary.doc2bow(word_list)
        similarity.num_best = 21
        # ids (1-based) of the 21 most similar documents
        temp_id = [int(item[0]) + 1 for item in similarity[test_corpus]]
        if i not in temp_id:
            t_id.extend(temp_id[:20])
        else:
            temp_id.remove(i)
            t_id.extend(temp_id)
        s_id.extend([i] * 20)
    dfre = pd.DataFrame({'source_id': s_id, 'target_id': t_id})
    return dfre
def tcutword(data, stopword):
    corpora_documents = []
    for i in data.index:
        text = data.loc[i].values[0].strip()
        text = re.sub('[\"*\【\】\[\]\s*]', '', text)  # strip special symbols
        text = re.sub('\([a-zA-Z]+://[^\s]*\)', '', text)  # strip URLs
        text = re.sub('\d+\.*\d*', '', text)  # strip numbers
        text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", '', text)  # strip punctuation
        temp = list(jieba.cut(text, HMM=True))
        word_list = temp
        # optional stop-word filtering (disabled in the original):
        # word_list = [word for word in temp if word not in stopword]
        corpora_documents.append(word_list)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(ttext) for ttext in corpora_documents]
    similarity = similarities.Similarity('-Similarity-index', corpus,
                                         num_features=99999999)
    return dictionary, similarity
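# A minimal sketch of how tcutword and testcut above might be wired together:
# index one set of texts, then query it with another. The CSV file names,
# separator, and column layout below are assumptions for illustration only,
# not taken from the original code.
import pandas as pd

source_df = pd.read_csv("source_texts.csv", sep="\t", index_col="id")  # hypothetical corpus to index
query_df = pd.read_csv("query_texts.csv", sep="\t", index_col="id")    # hypothetical texts to match
stopword = []  # stop-word filtering is disabled inside both functions anyway

dictionary, similarity = tcutword(source_df, stopword)
pairs = testcut(query_df, stopword, dictionary, similarity)
pairs.to_csv("similar_pairs.csv", index=False)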
def get_processed_content_from_content(event_dict):
    cat_content = event_dict['title'] + \
        ('.' if event_dict['lang'] == 'en' else '。') + \
        event_dict['content']
    stop_words = ['。', ' ', ',', '.', ',', '的', '-', '了', '新冠', '病毒', '、',
                  '研究', '和', '在', '发现', '中', '患者', '冠状病毒', '与', '肺炎',
                  '团队', '人员', '(', ')', '是', '该', '对', '为']
    return [word for word in jieba.cut(cat_content) if word not in stop_words]
def get_processed_content(brief):
    """first get content by _id, then concatenate title and content,
    then use jieba to cut the sentences, return list of words"""
    event_dict = get_content_by_id(get_id(brief))
    # parentheses limit the conditional to the separator character,
    # so title and content are always both included
    cat_content = event_dict['title'] + \
        ('.' if event_dict['lang'] == 'en' else '。') + \
        event_dict['content']
    return [word for word in jieba.cut(cat_content)]
def extract_two(self, file=None):
    fn = codecs.open(file, 'r+', encoding='utf-8')
    string_data = fn.read()
    fn.close()

    # text pre-processing
    pattern = re.compile(
        '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~。“”、:?,【】!()——↓0-9a-zA-Z\.\.\.\.\.\.]+'
    )
    # pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"')
    string_data = re.sub(pattern, '', string_data)  # drop characters matching the pattern
    string_data = string_data.replace('\n', '')
    string_data = string_data.replace('\u3000', '')
    string_data = string_data.replace('\r', '')
    string_data = string_data.replace(' ', '')
    logging.info(string_data)

    # word segmentation
    seg_list_exact = jieba.cut(string_data, cut_all=False)  # accurate-mode segmentation
    object_list = []
    remove_words_custom = [
        u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都', u'。',
        u' ', u'、', u'中', u'在', u'了', u'通常', u'如果', u'我们', u'需要', u'月', u'日'
    ]  # custom removal list
    remove_words = self.parse_multiple_files(
        ['中文停用词表.txt', '哈工大停用词表.txt', '四川大学机器智能实验室停用词库.txt', '百度停用词表.txt'])
    remove_words = remove_words_custom + remove_words
    for word in seg_list_exact:  # iterate over every token
        if word not in remove_words:  # keep tokens that are not in the removal list
            logging.info('\n')
            logging.info(word)
            object_list.append(word)
    logging.info(object_list)

    # word-frequency statistics
    word_counts = collections.Counter(object_list)  # count token frequencies
    word_counts_top10 = word_counts.most_common(10)  # ten most frequent tokens
    print(word_counts_top10)  # sanity check

    # word-cloud rendering
    font_path = r'C:\Windows\Fonts\simfang.ttf'
    mask = np.array(Image.open('background.jpg'))  # background mask for the word cloud
    wc = wordcloud.WordCloud(
        background_color='white',  # background color
        font_path=font_path,       # font
        mask=mask,                 # mask image
        max_words=200,             # maximum number of words shown
        max_font_size=200,         # maximum font size
        scale=80                   # scale factor; larger values give a sharper image
    )
    wc.generate_from_frequencies(word_counts)  # build the word cloud from the frequency dict
    image_colors = wordcloud.ImageColorGenerator(mask)  # derive a color scheme from the mask
    wc.recolor(color_func=image_colors)  # recolor the cloud to match the mask
    plt.figure()
    plt.imshow(wc)       # show the word cloud
    plt.axis('off')      # hide the axes
    plt.show()           # display the figure
    wc.to_file("bb.jpg")  # save the image to a file
def cut(file, outfile):
    with open(file, mode='r', encoding="utf-8") as f:
        document = f.read()
        document_cut = jieba.cut(document)
        result = ' '.join(document_cut)
    with open(outfile, mode='w', encoding="utf-8") as outF:
        outF.write(result)
    print("File has been segmented!")
def jieba_cut(self, text):
    '''
    2020-06-03: ywz_replace raised "IndexError: string index out of range"
    on the input 'İrem 艾丽', hence the try/except below.
    '''
    try:
        text = self.ywz_replace(text)
    except:
        pass
    return list(jieba.cut(text))
def clean_data(file, out_file):
    with open(file, mode="r", encoding="utf-8") as f:
        doc = f.read()
    doc_cut = jieba.cut(doc)
    res = " ".join(doc_cut)
    with open(out_file, mode='w', encoding='utf-8') as f2:
        f2.write(res)
def my_wordcloud(filename):
    # map every listed punctuation character to a space
    punct = str.maketrans({c: " " for c in "!.,:;-?※></()=,、。/[]《》"})
    plt.rcParams['font.sans-serif'] = 'PingFang TC'  # set the font

    # load stop words
    stop = [line.strip() for line in open('stopwords.txt').readlines()]
    print('number of stop words:', len(stop))

    all_segs = []
    with open(filename) as file:
        for line in file:
            line = line.translate(punct)
            segs = line.split(' ')
            for anyy in segs:
                if len(anyy.strip()) > 2:
                    all_segs.append(anyy.strip())
    print(len(all_segs))

    jieba.load_userdict('userdict.txt')
    word_appear_times = {}
    for i in all_segs:
        for anyy in list(jieba.cut(i, cut_all=True)):
            anyy = anyy.lower()
            if anyy not in stop and len(anyy.strip()) > 2:
                if anyy not in word_appear_times:
                    word_appear_times[anyy] = 1
                else:
                    word_appear_times[anyy] += 1
            else:
                continue

    word_appear_times_ordered = sorted(word_appear_times.items(),
                                       key=lambda x: x[1], reverse=True)
    top150 = word_appear_times_ordered[0:150]
    top150_word = ' '.join([x[0] for x in top150])
    print(top150_word)

    cloud_mask = np.array(Image.open("cloud_mask.png"))
    wc = WordCloud(colormap='RdYlGn',
                   mask=cloud_mask,
                   max_words=150,
                   background_color="black",
                   scale=4,
                   font_path='/System/Library/Fonts/PingFang.ttc')
    # generate the word cloud
    wc.generate(top150_word)
    wc.to_file(f'{filename[:-4]}.jpg')
def stop_word(line):
    data_line = line.strip()
    wordList = jieba_fast.cut(data_line)  # wordList is a generator
    outStr = ''
    for word in wordList:
        if word not in stopword:
            outStr += word
            outStr += ' '
    lineOut = outStr.strip().encode('utf-8')
    return lineOut
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('data/stopwords.txt')
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr
def make_segment_file(file_path):
    print("start seg file")
    jieba.load_userdict("./seg_dict.txt")
    with open(file_path) as f:
        document = f.read()
        d_cut = jieba.cut(document)
        res = " ".join(d_cut)
    with open("./segment_wiki.txt", "w") as f:
        f.write(res)
    print("segment file ok")
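# The segmented, space-separated file written above is the usual input format for
# training word vectors. A minimal follow-up sketch using gensim's LineSentence;
# gensim itself, the 4.x keyword vector_size (older releases use size), and all
# hyperparameter values are assumptions, not part of the original snippet.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

model = Word2Vec(LineSentence("./segment_wiki.txt"),
                 vector_size=100, window=5, min_count=5, workers=4)
model.save("wiki_word2vec.model")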
def extract_keyword_from_prodname(prod_name, stopwords=stopwords_path):
    try:
        word_list = jieba.cut(prod_name, cut_all=True)
        stop_words_list = get_stopwords(stopwords)
        if stop_words_list:
            word_list = [word.strip() for word in word_list
                         if word.strip() not in stop_words_list]
        word_list = ' '.join(word_list)
    except:
        word_list = 'None'
    return word_list
def split(stop_lists, data):
    word_list = []
    seg_list = jieba.cut(data, cut_all=False)
    list_str = " ".join(seg_list)
    for word in list_str.split(" "):
        if word.strip().lower() not in stop_lists and len(word.strip()) > 1:
            word_list.append(word)
    write_file(
        "/Users/red/Desktop/temp/news/data/word/" + str(uuid.uuid4()) + ".txt",
        word_list)
def get_topK(text, topK):
    text = jieba_fast.cut(text)
    result_str = []
    for word in text:
        if word not in stopwords and word != '\t':
            result_str.append(word)
    count = Counter(result_str)
    topk_words = count.most_common(topK)
    per_data = build_info_list(topk_words)
    return per_data
def __iter__(self):
    with open(ANT_NLP_FILE_PATH, "r", encoding="utf8") as atec:
        logging.info('generating word corpus, processing file %s', ANT_NLP_FILE_PATH)
        for line in atec:
            line_code, s1, s2, label = line.strip().split("\t")
            s1 = utils.remove_punctuation(s1)
            s2 = utils.remove_punctuation(s2)
            yield list(jieba.cut(s1)) + list(jieba.cut(s2))
    for file in extract_wiki.list_all_files(PROCESSED_WIKI_FILE_PATH):
        logging.info('generating word corpus, processing file %s', file)
        with open(file, 'r', encoding="utf8") as wiki:
            for line in wiki:
                line = utils.remove_punctuation(line)
                if len(line) > 0:
                    # CJK unified ideographs fall in the Unicode range [0x4E00, 0x9FA5]
                    yield [
                        word for word in list(jieba.cut(line))
                        if word and 0x4E00 <= ord(word[0]) <= 0x9FA5
                    ]
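# The two __iter__ methods above are streaming corpus iterators that yield one
# token list per sentence, which is the shape gensim's Word2Vec expects. A minimal
# sketch of that wiring; the wrapper class name WordCorpus is hypothetical, and
# the gensim 4.x keyword vector_size (older releases use size) plus all
# hyperparameter values are assumptions.
from gensim.models import Word2Vec

corpus = WordCorpus()  # hypothetical class whose __iter__ is defined above
model = Word2Vec(sentences=corpus, vector_size=256, window=5, min_count=3, workers=4)
model.wv.save_word2vec_format("word_vectors.txt")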
def kmeans_spiltouttxtfile(needspiltfile, spilttype=10, stopwords=r"./stop_words_ch.txt"):
    f1 = open(needspiltfile, "r", encoding='utf-8', errors='ignore')
    middlespiltfilt = fh.get_path_file_subpath(needspiltfile) + "/" + \
        fh.get_path_file_completebasename(needspiltfile) + "temp"
    f2 = open(middlespiltfilt, 'w', encoding='utf-8', errors='ignore')
    for line in f1:
        seg_list = jieba.cut(line, cut_all=False)
        w = (" ".join(seg_list)).replace("\t\t\t", "\t")
        f2.write(w)
    f1.close()
    f2.close()

    # read the segmented lines back; titles is a list with one segmented title per element
    titles = open(middlespiltfilt, encoding='utf-8', errors='ignore').read().split('\n')

    # helper for loading the stop-word list
    def get_custom_stopwords(stop_words_file):
        with open(stop_words_file, encoding='utf-8') as f:
            stopwords = f.read()
        stopwords_list = stopwords.split('\n')
        custom_stopwords_list = [i for i in stopwords_list]
        return custom_stopwords_list

    # load the stop words
    stop_words_file = stopwords
    stopwords = get_custom_stopwords(stop_words_file)

    # build the term-count matrix: segmented titles minus stop words, in a form KMeans accepts
    from sklearn.feature_extraction.text import CountVectorizer
    count_vec = CountVectorizer(stop_words=stopwords)
    km_matrix = count_vec.fit_transform(titles)

    # run the clustering; spilttype controls the number of clusters
    from sklearn.cluster import KMeans
    num_clusters = spilttype
    km = KMeans(n_clusters=num_clusters)
    km.fit(km_matrix)
    clusters = km.labels_.tolist()  # one cluster label per title
    return clusters
def textSplit(content):
    dic = {}
    splitedText = ""
    seqlist = jieba_fast.cut(content)
    for word in seqlist:
        splitedText = splitedText + word + " "
        if word not in dic:
            dic[word] = 1
        else:
            dic[word] += 1
    wordCloud(splitedText)
    saveJson(dic, "报告全文")
    saveSheet(dic, "报告全文")
def word_seg(input_file, output_file, mode):
    if mode == 'word':
        jieba.load_userdict(dict_path)
    with open(output_file, 'w') as f, open(input_file, 'r') as fi:
        for l in fi:
            # remove all whitespace characters
            l = ''.join(l.split())
            if mode == 'char':
                f.write(' '.join(list(l)) + '\n')
            else:
                seg = jieba.cut(l, cut_all=False)
                f.write(' '.join(seg) + '\n')
import jieba_fast as jieba

c = '小明硕士毕业于中国科学院计算所,后在日本京都大学深造'
print(" ".join(jieba.cut(c, cut_all=True)))   # full mode
print(" ".join(jieba.cut(c, cut_all=False)))  # accurate mode (the default)
print(" ".join(jieba.cut_for_search(c)))      # search-engine mode
import csv
import jieba_fast

kw = '把字刻在石头上'
print(' '.join(jieba_fast.cut(kw)))
print(' '.join(jieba_fast.cut_for_search(kw)))