def build_wordcloud(msg):
    msg = ','.join([m.split(DELIMITER)[-1] for m in msg])
    words = list(jieba.cut(msg))
    for i in range(len(words)):
        if words[i] in stopwords:
            words[i] = ''
    wc = WC(font_path='./resource/msyh.ttf',  # required for Chinese text; otherwise the glyphs are garbled
            background_color='white',
            width=1000,
            height=800).generate(' '.join(words))
    wc.to_file('./output/chat/message_cloud.png')
    return './output/chat/message_cloud.png'
def make_wc(self):
    with open('/home/suifeng/文档/text.txt', 'r') as f:
        text = f.read()
    # Join the segmented words with spaces so WordCloud can tokenize them
    # (joining with '' would undo the segmentation)
    text_split = ' '.join(jieba.cut(text))
    wc = WC(
        background_color='black',  # background color
        max_words=100,             # maximum number of words
        max_font_size=80,          # maximum font size
        random_state=35,           # random state controlling the word layout
        font_path='MSYH.TTC'       # font path; a CJK font is required (it can be copied from a Windows system)
    ).generate(text_split)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file('/home/suifeng/图片/python词云图.png')
def wc():
    # Read the novel's contents
    file = 'C:/Users/26015/Desktop/金瓶梅/金瓶梅.txt'
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
    # Generate the word cloud
    wordcloud = WC(
        font_path='C:/Windows/Fonts/simkai.ttf',
        width=1400,
        height=700
    ).generate(content)
    # plt.imshow(wordcloud)
    # plt.axis('off')
    # plt.show()
    # Save the image
    wordcloud.to_file("金瓶梅.jpg")
def __init__(self, movie, stop_words, srt_folder):
    self.movie = movie
    self.stop_words = stop_words
    self.subtitle = Subtitle.get_from_movie(movie, srt_folder)
    lines = [line.content for line in self.subtitle.get_lines()]
    self.words = tokenize_text(' '.join(lines))
    self.wordcloud_title = f"{self.movie.original_title} ({self.movie.release_date})"
    self.filename = os.path.join(CONFIG['PNG_FOLDER'], f"{self.wordcloud_title}.png")
    # Related to issue_5: duplicated words in the word cloud.
    # collocations=False stops WordCloud from adding two-word phrases,
    # which is what produced the repeated words.
    self.cloud = WC(background_color="white", max_words=200,
                    stopwords=set(self.stop_words),
                    width=1280, height=720,
                    collocations=False)
    self.cloud.generate(self.words)
    if not os.path.exists(CONFIG['PNG_FOLDER']):
        os.mkdir(CONFIG['PNG_FOLDER'])
def topic_wordcloud(self, figname: str = "wordcloud"):
    """
    Display the topic with the WordCloud library.
    Applies Laplace smoothing, comparing the vocabulary from the date of
    interest with the vocabulary from the same date the year before.
    """
    from wordcloud import WordCloud as WC
    import matplotlib.pyplot as plt

    prev_yr_voc = self._prev_voc
    prev_day_voc = self._voc.previous_day()
    prev_yr_voc.voc.update(prev_day_voc)
    self._voc = self.laplace_smoothing(self._voc, prev_yr_voc)
    # Create the word cloud
    wc = WC().generate_from_frequencies(self._voc)
    plt.imshow(wc)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(figname)
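# A minimal sketch of what the laplace_smoothing step above could look like; this is
# an assumption for illustration, not the class's actual implementation, and it works
# on plain dicts of counts rather than Vocabulary objects. Each token is weighted by
# the ratio of its probability on the date of interest to its add-one (Laplace)
# smoothed probability in the reference vocabulary, so tokens that became unusually
# frequent dominate the cloud and unseen reference tokens do not divide by zero.
def laplace_smoothing_sketch(voc, ref_voc):
    cur_total = sum(voc.values())
    ref_total = sum(ref_voc.values()) + len(voc)  # one count added per token of interest
    return {tok: (cnt / cur_total) / ((ref_voc.get(tok, 0) + 1) / ref_total)
            for tok, cnt in voc.items()}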
def __init__(self):
    self._min_word_len = 1
    self._text_processors = [
        replace_abbr_not(),
        # remove_len_less_than(3),
        sentence_tokenize(),
    ]
    self._sentence_processors = [
        word_tokenize(),
        remove_stop_words(),
    ]
    self._words_processors = [
        word_lemmatizing(),
        correct_spelling(),
    ]
    self._word_list = []
    self._wc = WC(
        max_words=100,
        font_path=None,
        background_color='white',
        # width=400, height=200,
        mask=cv2.imread('word_cloud.mask0.png'))
def word_cloud_1(image, df1, number_clusters, selected_adjectives):
    for i in range(number_clusters):
        df_words = df1[df1['own_cluster'] == i]
        df_words = df_words.adjectives
        flat_text = [item for sublist in df_words for item in sublist]
        text = ""
        for word in flat_text:
            if word.lower() in selected_adjectives:
                text = text + " " + word
        wordcloud = WC(mask=image, background_color="white",
                       contour_width=3, contour_color="black").generate(text)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.suptitle("Most numerous words in cluster".upper())
        plt.title(str(i))
        plt.axis("off")
        plt.show()
    return
def wc2():
    # Read the novel's contents
    file = 'C:/Users/26015/Desktop/金瓶梅/金瓶梅.txt'
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
    cut_content = jieba.cut(content.strip())
    # Load the stop words
    stopwords = stopwordslist('stopwords.txt')
    outstr = ''
    for word in cut_content:
        if word not in stopwords and word != '\t':
            outstr = outstr + word + " "
    # Generate the word cloud
    wordcloud = WC(
        font_path='C:/Windows/Fonts/simkai.ttf',
        width=1400,
        height=700
    ).generate(outstr)
    # Save the image
    wordcloud.to_file("金瓶梅2.jpg")
def weibo_cloud(self):
    """
    Weibo word segmentation and word cloud.
    :return:
    """
    try:
        # Fetch the Weibo data
        data_count.main()
        # Read the data file
        with open("./data/weibo/data-quting.txt", "r", encoding="utf-8") as f:
            wl_space_split = f.read()
        # print(wl_space_split)
        self.fig.clf()  # clear the previous plot
        ax = self.fig.add_subplot(111)
        # Set a Chinese font
        font = r'C:\Windows\Fonts\simfang.ttf'
        # Generate a word cloud from the segmented text
        my_wordcloud = WC(
            collocations=False,
            font_path=font,
            width=3000,
            height=2000,
            background_color="white").generate(wl_space_split)
        # Display the word cloud
        ax.imshow(my_wordcloud)
        # Hide the x- and y-axis ticks
        ax.axis("off")
        self.canvas.draw()  # TODO: drawing starts here
    except Exception as e:
        print(e)
token.counter  ## SHOW - number of times each bigram and word appears
bigrams = {k: v for k, v in token.counter.items() if k.count("~")}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),
#  ('to~the', 7339),
#  ('on~the', 4883),
#  ('and~the', 4843)]
bigrams  ## SHOW the selection

# Word cloud
wc = WC().generate_from_frequencies(bigrams)
plt.imshow(wc)
plt.axis("off")
plt.tight_layout()  ## SHOW
plt.savefig("books_wordcloud.png", dpi=300)  ## SKIP

# Histogram
hist = defaultdict(list)
_ = [hist[v].append(k) for k, v in bigrams.items()]
plt.plot(np.log([len(hist.get(i, [0])) for i in range(1, 2000)]))
plt.grid()
plt.xlabel("Frequency")
plt.ylabel("Number of bigrams (log-scale)")
plt.tight_layout()
plt.plot(n, v, '.')
plt.plot(n, k * n**beta)
plt.legend(['Measured', 'Predicted'])
plt.grid()
plt.xlabel('Number of tokens')
plt.ylabel('Vocabulary Size')
plt.tight_layout()
plt.savefig('heaps_law2.png', dpi=300)

# Activities
date = dict(year=2022, month=1, day=10)
voc = Vocabulary(date, lang='Es', country='MX')
words = {k: v for k, v in voc.voc.items() if not k.count('~')}
wc = WC().generate_from_frequencies(words)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('wordcloud_mx.png', dpi=300)

## Zipf's Law - $$f=\frac{c}{r}$$
countries = ['MX', 'CO', 'ES', 'AR', 'PE', 'VE', 'CL',
             'EC', 'GT', 'CU', 'BO', 'DO', 'HN', 'PY',
             'SV', 'NI', 'CR', 'PA', 'UY']
vocs = Parallel(n_jobs=-1)(
    delayed(Vocabulary)(date, lang='Es', country=country)
    for country in tqdm(countries))
words = [{k: v
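# The Zipf's-law section above states f = c / r for the word of rank r. A minimal
# sketch of estimating c from a dict of word counts (plain dicts assumed here, not
# the Vocabulary API): since log f = log c - log r, the least-squares estimate of
# log c is the mean of log f + log r over all ranks.
import numpy as np

def fit_zipf_c(counts):
    freqs = np.array(sorted(counts.values(), reverse=True), dtype=float)
    ranks = np.arange(1, freqs.size + 1)
    return float(np.exp(np.mean(np.log(freqs) + np.log(ranks))))

# e.g. c = fit_zipf_c(words); predicted = c / np.arange(1, len(words) + 1)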
words.insert(0, '<s>')
words.append('</s>')
_ = [(a, b) for a, b in zip(words, words[1:])]
bigrams.update(_)

words = set([a for a, b in bigrams])
words2 = set([b for a, b in bigrams])
prev = dict()
for (a, b), v in bigrams.items():
    try:
        prev[a] += v
    except KeyError:
        prev[a] = v
wc = WC().generate_from_frequencies(prev)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()

P = defaultdict(Counter)
for (a, b), v in bigrams.items():
    nxt = P[a]
    nxt[b] = v / prev[a]
wc = WC().generate_from_frequencies(P['<s>'])
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('wordcloud_prob_start.png', dpi=300)
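# P above is the estimated bigram model P(b | a) = count(a, b) / count(a). A quick,
# assumed usage example (not part of the original): follow the most probable
# transition from '<s>' until '</s>' to read off the model's most likely opening.
def greedy_sentence(P, max_len=20):
    word, out = '<s>', []
    for _ in range(max_len):
        if not P[word]:  # no observed continuation
            break
        word = P[word].most_common(1)[0][0]  # argmax over P(next | word)
        if word == '</s>':
            break
        out.append(word)
    return ' '.join(out)

# print(greedy_sentence(P))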
#! -*- coding: utf-8 -*-
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import ImageColorGenerator, WordCloud as WC

text = open('files/python_en.txt').read()
# scipy.misc.imread was removed from SciPy; load the mask with PIL instead
back_coloring = np.array(Image.open('python_logo.png'))

wc = WC(background_color='white', mask=back_coloring,
        random_state=42, margin=2)
wc.generate(text)

# Recolor the words with the colors of the mask image
image_colors = ImageColorGenerator(back_coloring)
wc.recolor(color_func=image_colors)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
wc.to_file('image_colored_wordcloud.png')
#! -*- coding: utf-8 -*-
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud as WC

text = open('files/python_en.txt').read()
alice_mask = np.array(Image.open('logo.png'))

wc = WC(background_color='white', mask=alice_mask,
        contour_width=2, contour_color='steelblue')
wc.generate(text)

plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
wc.to_file('masked_wordcloud.png')
X = tm.transform(_)
m = LogisticRegression(multi_class='multinomial').fit(X, y[tr])
# m = LinearSVC().fit(X, y[tr])
_ = [D[x][0] for x in val]
hy[val] = m.predict(tm.transform(_))
ci = bootstrap_confidence_interval(y, hy)
ci

D = list(tweet_iterator('../../../datasets/semeval/semeval2017_En_train.json'))
tm = TextModel(token_list=[-1]).fit(D)
id2word = {v: k for k, v in tm.model.word2id.items()}
_ = {id2word[k]: v for k, v in tm.model.wordWeight.items()}
# Word cloud of the IDF weights
wc = WC().generate_from_frequencies(_)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_idf.png', dpi=300)

# Word cloud of the term frequencies
cnt = Counter()
_ = [cnt.update(tm.tokenize(x)) for x in D]
wc = WC().generate_from_frequencies(cnt)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_tf.png', dpi=300)

perf = load_model('dataset/performance.gz')
#! -*- coding: utf-8 -*-
import jieba
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import ImageColorGenerator, WordCloud as WC

text = open('files/python_zh.txt').read()
cn_text = ' '.join(jieba.cut(text))

font_path = '/usr/share/fonts/adobe-source-han-sans-cn/SourceHanSansCN-Normal.otf'
# scipy.misc.imread was removed from SciPy; load the mask with PIL instead
back_coloring = np.array(Image.open('images/python_logo.png'))

wc = WC(font_path=font_path, background_color='white',
        mask=back_coloring, random_state=42, margin=2)
wc.generate(cn_text)

image_colors = ImageColorGenerator(back_coloring)
wc.recolor(color_func=image_colors)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
wc.to_file('images/wordcloud_cn.png')
_ = " & ".join(map(lambda x: "{:0.4f}".format(x), w))
print(r"{} \\".format(_))

# Product of the marginals: the joint distribution expected under independence
R_m = W.sum(axis=1)
C_m = W.sum(axis=0)
ind = np.dot(np.atleast_2d(R_m).T, np.atleast_2d(C_m))
for w in (W - ind):
    _ = " & ".join(map(lambda x: "{:0.4f}".format(x), w))
    print(r"{} \\".format(_))

# Example of the [bigrams](#tab:bivariate-distribution)
_ = [(bigram, [index[x] for x in bigram.split("~")])
     for bigram in bigrams]
_ = {key: co_occurrence[i, j] for key, (i, j) in _}
wc = WC().generate_from_frequencies(_)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('wordcloud_us.png', dpi=300)

M = co_occurrence.sum(axis=1)

# How far a bigram's co-occurrence deviates from what independence would predict
def get_diff(key):
    a, b = [index[x] for x in key.split('~')]
    if a == b:
        return -M[a] * M[b]
    return co_occurrence[a, b] - M[a] * M[b]
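# A tiny worked example of the independence check above, with assumed data (not from
# the original): for a joint distribution W, `ind` is the outer product of the
# marginals, i.e. the joint we would observe if rows and columns were independent,
# so W - ind vanishes exactly when there is no association.
import numpy as np

W = np.array([[0.30, 0.10],
              [0.45, 0.15]])  # rows and columns here happen to be independent
R_m = W.sum(axis=1)           # row marginals: [0.4, 0.6]
C_m = W.sum(axis=0)           # column marginals: [0.75, 0.25]
ind = np.outer(R_m, C_m)      # expected joint under independence
print(W - ind)                # ~0 everywhere, so no association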
# Stemming the tweets
from nltk.stem.porter import *
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])

# Rejoining the tweets
combined['clean_tweet'] = tokenized_tweet.apply(lambda x: ' '.join(w for w in x))
words = ' '.join([word for word in combined['clean_tweet']])

# Generating the word cloud
from wordcloud import WordCloud as WC
cloud = WC(width=800, height=500, random_state=21, max_font_size=21).generate(words)
plt.figure(figsize=(10, 7))
plt.imshow(cloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Generating the word cloud for the normal (label == 0) tweets
normal_words = ' '.join([word for word in combined['clean_tweet'][combined['label'] == 0]])
normal_wordcloud = WC(width=800, height=500, random_state=21,
                      max_font_size=110).generate(normal_words)
plt.figure(figsize=(10, 7))
plt.imshow(normal_wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
def review_analysis(src_file_path, rst_file_path,
                    font_file_path='./font/SeoulNamsanM.ttf'):
    """
    Purchase-review data processing module.
    :param src_file_path: path to the raw purchase-review data
    :param rst_file_path: path where the processed result is saved
    :param font_file_path: path to the font file
    """
    import pandas as pd
    from pandas import DataFrame as df
    import re
    from statistics import mean
    from konlpy.tag import Okt
    from collections import Counter
    from wordcloud import WordCloud as WC

    f = open(src_file_path, 'rt', encoding='utf-8', errors='ignore')
    rev = f.read()
    f.close()

    ptrn_html_code = re.compile(r'&[a-zA-Z]+?;')             # HTML entity pattern
    ptrn_html_tag = re.compile(r'</?[a-zA-Z]+?>')            # HTML tag pattern
    ptrn_all_clean = re.compile(r'[^0-9a-zA-Z가-힣.,:/\s]')  # other unwanted characters
    for cleaner in [ptrn_html_code, ptrn_html_tag, ptrn_all_clean]:
        rev = re.sub(cleaner, '', rev)

    re_phrase = r'^([AB]):([a-z]+?)관련/점수:([1-5])점/내용:([0-9a-zA-Z가-힣.,:\s]+?)$'
    ptrn_phrase = re.compile(re_phrase, re.M)
    rev_values = re.findall(ptrn_phrase, rev)
    models, topics, scores, texts = zip(*rev_values)
    df_rev = df({
        'model': models,
        'topic': topics,
        'score': scores,
        'text': texts
    })
    df_rev['score'] = pd.to_numeric(df_rev['score'], downcast='integer', errors='ignore')
    df_rev_mean = df_rev.pivot_table(values='score', index='model',
                                     columns='topic', aggfunc=mean)
    df_rev_diff = df_rev_mean.diff(periods=-1)
    topic_diff_max = df_rev_diff.loc['A', :].idxmax()  # topic where A performs best relative to B
    topic_diff_min = df_rev_diff.loc['A', :].idxmin()  # topic where A performs worst relative to B

    writer = pd.ExcelWriter(rst_file_path, engine='xlsxwriter')  # create the Excel writer
    df_rev_mean.to_excel(writer, sheet_name='score_mean')
    df_rev_diff.to_excel(writer, sheet_name='score_diff')

    m_chart = writer.book.add_chart({'type': 'radar'})  # radar chart of the per-topic mean scores
    m_cat = ['score_mean', 0, 1, 0, 5]
    m_val_a = ['score_mean', 1, 1, 1, 5]
    m_val_b = ['score_mean', 2, 1, 2, 5]
    m_chart.add_series({'name': 'A', 'categories': m_cat, 'values': m_val_a})
    m_chart.add_series({'name': 'B', 'categories': m_cat, 'values': m_val_b})
    m_chart.set_title({'name': '토픽별 점수 평균'})
    writer.sheets['score_mean'].insert_chart(3, 0, m_chart)

    d_chart = writer.book.add_chart({'type': 'column'})  # column chart of the per-topic score differences
    d_cat = ['score_diff', 0, 1, 0, 5]
    d_val = ['score_diff', 1, 1, 1, 5]
    d_chart.add_series({'name': 'A-B', 'categories': d_cat, 'values': d_val})
    d_chart.set_title({'name': '토픽별 점수 평균 차이 (A-B)'})
    writer.sheets['score_diff'].insert_chart(2, 0, d_chart)

    def get_kwds(x):
        okt = Okt()  # create the morphological analyzer
        kwds = okt.pos(x, norm=True, stem=True)
        kwds_filtered = [x for x, y in kwds if y in ['Noun', 'Adjective', 'Verb']]
        return kwds_filtered

    df_rev.loc[:, 'kwd'] = df_rev.loc[:, 'text'].map(get_kwds)
    df_rev.to_excel(writer, sheet_name='reviews')

    for topic in [topic_diff_max, topic_diff_min]:
        for model in ['A', 'B']:
            df_filtered = df_rev[(df_rev['model'] == model) & (df_rev['topic'] == topic)]
            list_kwd = df_filtered['kwd'].sum()
            dict_kwd_cnt = Counter(list_kwd)
            df_kwd_cnt = df({
                'kwd': list(dict_kwd_cnt.keys()),
                'cnt': list(dict_kwd_cnt.values())
            })
            df_kwd_cnt = df_kwd_cnt.sort_values(by='cnt', ascending=False)
            sheet_name = f'kwd_cnt_{model}_{topic}'
            df_kwd_cnt.to_excel(writer, sheet_name=sheet_name, index=False)
            wc = WC(max_words=40, font_path=font_file_path, background_color='white')
            wc.generate_from_frequencies(dict_kwd_cnt)
            rst_png_file_path = f'./sample/kwd_cnt_{model}_{topic}.png'
            wc.to_file(rst_png_file_path)
            writer.sheets[sheet_name].insert_image(3, 4, rst_png_file_path)
    writer.save()