def make_word_cloud(product, sentiment):
    if sentiment == "all":
        pos, neg = get_top_five_phrases(product, sentiment)
        pos.index = range(len(pos))
        neg.index = range(len(neg))
        pos_words_array = []
        neg_words_array = []
        for i in range(len(pos)):
            pos_words_array.append((pos["vocab"][i].upper(), float(pos["count"][i])))
        for i in range(len(neg)):
            neg_words_array.append((neg["vocab"][i].upper(), float(neg["count"][i])))

        wc = WordCloud(background_color="white", max_words=2000, max_font_size=300,
                       random_state=42)

        # generate word cloud for positive phrases
        positive_name = '../app/static/img/pos_wordcloud.png'
        wc.generate_from_frequencies(dict(pos_words_array))  # recent wordcloud versions expect a dict
        wc.recolor(color_func=pos_color_func, random_state=3)
        wc.to_file(positive_name)

        # generate word cloud for negative phrases
        negative_name = '../app/static/img/neg_wordcloud.png'
        wc.generate_from_frequencies(dict(neg_words_array))
        wc.recolor(color_func=neg_color_func, random_state=3)
        wc.to_file(negative_name)

        return positive_name, negative_name
def generate_word_cloud(img_bg_path, top_words_with_freq, font_path, to_save_img_path,
                        background_color='white'):
    # read the background image
    img_bg = imread(img_bg_path)

    # create the word cloud object
    wc = WordCloud(font_path=font_path,                 # font to use
                   background_color=background_color,   # background color, white by default
                   max_words=500,                       # show at most 500 words
                   mask=img_bg,                         # background image mask
                   max_font_size=50,                    # largest font size
                   random_state=30,                     # random seed
                   width=1000,                          # image width
                   margin=5,                            # spacing between words
                   height=700)                          # image height

    # build the cloud from top_words_with_freq
    wc.generate_from_frequencies(top_words_with_freq)

    # render the cloud with matplotlib
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

    # if the background image has vivid colors, the two lines below derive a color
    # function from it and recolor the cloud to match the image's palette
    # img_bg_colors = ImageColorGenerator(img_bg)
    # plt.imshow(wc.recolor(color_func=img_bg_colors))

    # save the cloud as an image file
    wc.to_file(to_save_img_path)
def draw_tag_cloud(users_tokens):
    from PIL import Image
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, ImageColorGenerator

    trump_coloring = np.array(Image.open("pics/trump.png"))

    freqs = get_full_frequencies(users_tokens)

    wc = WordCloud(max_words=2000, mask=trump_coloring,
                   max_font_size=40, random_state=42)
    wc.generate_from_frequencies(freqs)  # recent wordcloud versions expect a dict, not dict_items

    image_colors = ImageColorGenerator(trump_coloring)

    # plt.imshow(wc)
    # plt.axis("off")
    #
    # plt.figure()
    # recolor wordcloud and show;
    # we could also give color_func=image_colors directly in the constructor
    plt.imshow(wc.recolor(color_func=image_colors))
    # plt.imshow(trump_coloring, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
def generate_image(words, image):
    graph = np.array(image)
    wc = WordCloud(font_path=os.path.join(CUR_DIR, 'fonts/simhei.ttf'),
                   background_color='white',
                   max_words=MAX_WORDS,
                   mask=graph)
    wc.generate_from_frequencies(words)
    image_color = ImageColorGenerator(graph)
    return wc, image_color
def wcloud(wf, color, save_as=None):
    """Create a word cloud based on word frequencies, `wf`, using a color
    function from `wc_colors.py`

    Parameters
    ----------
    wf : dict
        mapping of token to value (recent wordcloud versions expect a dict)
    color : function
        from `wc_colors.py`
    save_as : str
        filename

    Returns
    -------
    None
    """
    wc = WordCloud(background_color=None, mode='RGBA',
                   width=2400, height=1600, relative_scaling=0.5,
                   font_path='/Library/Fonts/Futura.ttc')
    wc.generate_from_frequencies(wf)
    plt.figure()
    plt.imshow(wc.recolor(color_func=color, random_state=42))
    plt.axis("off")
    if save_as:
        plt.savefig(save_as, dpi=300, transparent=True)
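# A minimal usage sketch for wcloud() above, assuming it runs in the same module
# (so WordCloud and plt are in scope) and the macOS font path it hard-codes exists.
# The frequency dict and the inline color function are made up; real color
# functions would come from wc_colors.py.
freqs = {'python': 10.0, 'cloud': 6.0, 'words': 3.0}

def plain_grey(word, font_size, position, orientation, random_state=None, **kwargs):
    # paint every word the same grey
    return 'hsl(0, 0%, 40%)'

wcloud(freqs, plain_grey, save_as='demo_cloud.png')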
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
    """
    calculate and show hot words of Job Description (JD)
    :param jd_dir:
    :return:
    """
    if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
        print('Error! No valid content in {0}'.format(jd_dir))
        sys.exit(0)
    else:
        jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

        for k, v in jd_and_dir.items():
            text = "".join(pd.read_excel(v)['详情描述'])
            jieba.analyse.set_stop_words(STOPWORDS_PATH)
            jieba.load_userdict(USER_CORPUS)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30,
                                                                withWeight=True, allowPOS=())

            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
            print(frequencies)

            # circular mask: white (255) outside a radius-130 circle
            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300,
                                  background_color="white", repeat=False, mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            import matplotlib.pyplot as plt
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
def generate_image(files, src_image):
    content = get_content(files)
    graph = np.array(Image.open(src_image))
    wc = WordCloud(font_path=os.path.join(CUR_DIR, 'fonts/simhei.ttf'),
                   background_color='white',
                   max_words=MAX_WORDS,
                   mask=graph)
    words = process_text(content)
    wc.generate_from_frequencies(words)
    image_color = ImageColorGenerator(graph)
    return wc, image_color
def create_Cloud(self, data):
    print('creating wordpair graph...')
    self.twitter_mask = np.array(Image.open(path.join(path.dirname(__file__),
                                                      'MASK/twitter_mask.png')))
    for word in data:
        wordcloud = WordCloud(font_path=path.join(path.dirname(__file__),
                                                  'FONT/CabinSketch-Bold.ttf'),
                              relative_scaling=.5,
                              width=1800,
                              height=1400,
                              stopwords=None,
                              mask=self.twitter_mask)
        # data[word] is already a word -> frequency dict, which is what
        # recent wordcloud versions expect
        wordcloud.generate_from_frequencies(data[word])
        wordcloud.to_file(path.join(path.dirname(__file__), 'WORDPAIRS/' + word + '.png'))
    return
def create_wordcloud(wordcloud_data):
    mask = imread(MASK_PATH)
    wordcloud = WordCloud(max_words=1000,
                          mask=mask,
                          stopwords=None,
                          margin=10,
                          random_state=1,
                          font_path=FONT_PATH,
                          prefer_horizontal=1.0,
                          width=WORD_CLOUD_WIDTH,
                          height=WORD_CLOUD_HEIGHT,
                          background_color='black',
                          mode='RGBA')
    word_importance_list = [(dct['word'], dct['importance']) for dct in wordcloud_data['words']]
    partisanship_list = [dct['partisanship'] for dct in wordcloud_data['words']]
    # note: the extra keyword is consumed by a customized WordCloud here;
    # the stock generate_from_frequencies() does not accept it
    kwargs = {'word_partisanship': partisanship_list}
    wordcloud.generate_from_frequencies(word_importance_list, **kwargs)
    return wordcloud
def generateWordCloud():
    words_old = [  # some words to visualize
        {'word': 'this', 'size': 55, 'color': COLOR_RED,
         'font': '\'Indie Flower\', cursive', 'angle': '45'},
        {'word': 'Test', 'size': 73, 'color': COLOR_BLUE,
         'font': '\'Open Sans\', sans-serif', 'angle': '-30'},
        {'word': 'kinDA', 'size': 153, 'color': COLOR_GREEN,
         'font': '\'Indie Flower\', cursive', 'angle': '-150'},
        {'word': 'WERKS', 'size': 33, 'color': COLOR_PURPLE,
         'font': '\'Open Sans\', sans-serif', 'angle': '90'},
    ]

    # Read the whole text.
    words = [
        ('chipotle', 55), ('McDonalds', 15), ('burgerking', 12), ('wendies', 41),
        ('using', 1), ('font', 2), ('randomize', 1), ('yet', 1), ('HHBs', 1),
        ('knowledge', 1), ('generator', 1), ('everything', 3), ('implementation', 2),
        ('simple', 2), ('might', 1), ('pixel', 1), ('real', 1), ('designs', 1),
        ('good', 1), ('without', 1), ('checking', 1), ('trees', 2), ('famous', 1),
        ('boxes', 1), ('every', 1), ('optimal', 1), ('front', 1), ('integer', 1),
        ('bit', 2), ('now', 2), ('easily', 1), ('shape', 1), ('fs', 1), ('stuff', 1),
        ('found', 1), ('works', 1), ('view', 1), ('right', 1), ('force', 1),
        ('generation', 3), ('hard', 1), ('back', 1), ('second', 1), ('sure', 1),
        ('Hopefully', 1), ('portrait', 1), ('best', 1), ('really', 2), ('speed', 1),
        ('method', 2), ('dataset', 2), ('figuring', 1), ('modify', 1),
        ('understanding', 1), ('represented', 1), ('come', 1), ('generate', 2),
        ('last', 2), ('fit', 1), ('Tweak', 1), ('study', 1), ('studied', 1),
        ('turn', 1), ('place', 2), ('isn', 1), ('uses', 2), ('implement', 1),
        ('sprites', 1), ('adjustable', 1), ('render', 1), ('color', 2), ('one', 1),
        ('fashion', 1), ('fake', 1), ('cloud', 5), ('size', 2), ('guess', 1),
        ('working', 1), ('Separate', 1), ('sake', 1), ('placing', 1), ('brute', 1),
        ('least', 2), ('insider', 1), ('lot', 1), ('basic', 1), ('prototype', 1),
        ('start', 1), ('empty', 1), ('sort', 1), ('testing', 1), ('spiral', 1),
        ('overlapping', 1), ('else', 1), ('controller', 1), ('part', 2),
        ('somewhat', 1), ('varying', 1), ('MySQL', 1), ('quad', 2), ('copy', 1),
        ('also', 1), ('bundled', 1), ('word', 9), ('algorithm', 2), ('typography', 1),
        ('will', 1), ('fll', 1), ('following', 2), ('bet', 1), ('perfecting', 1),
        ('proved', 1), ('orientation', 2), ('wordle', 1), ('JavaScript', 1),
        ('collision', 2), ('reads', 1), ('want', 1), ('ready', 1), ('compressing', 1),
        ('apparently', 1), ('check', 1), ('inefficient', 1), ('preferably', 1),
        ('end', 2), ('thing', 2), ('efficient', 1), ('make', 3), ('note', 1),
        ('python', 3), ('need', 3), ('complex', 1), ('instead', 1),
        ('hierarchical', 1), ('used', 1), ('ft', 1), ('see', 1), ('though', 2),
        ('moving', 1), ('preliminary', 1), ('data', 1), ('fm', 1), ('Figure', 2),
        ('database', 1), ('author', 1), ('together', 1), ('think', 1), ('provide', 1),
        ('definitely', 1), ('time', 1), ('position', 2), ('model', 2), ('D3', 1),
    ]

    alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))
    burrito_mask = np.array(Image.open(path.join(d, "burrito2.png")))
    print(alice_mask.shape)
    print(burrito_mask.shape)

    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white",
                          max_words=1500,
                          mask=burrito_mask)
    wordcloud.generate_from_frequencies(dict(words))  # recent wordcloud versions expect a dict

    # The PIL way (if you don't have matplotlib)
    image = wordcloud.to_image()
    # words = wordcloud.process_text(text)
    # image.show()
    return serveImg(image)
def makeImage(frequencies):
    alice_mask = np.array(Image.open("alice_mask.png"))

    wc = WordCloud(background_color="white", max_words=1000, mask=alice_mask)
    # generate word cloud from a dict of word -> frequency
    wc.generate_from_frequencies(frequencies)

    # show
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()
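# A minimal driver for makeImage() above, assuming "alice_mask.png" sits next to
# the script; the token list is made up, with frequencies counted via Counter.
from collections import Counter

tokens = "down the rabbit hole down down down".split()
makeImage(Counter(tokens))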
def _save_word_cloud_img(frequencies, file_path):
    """
    Save a word cloud image to the given file path.
    Reference: http://amueller.github.io/word_cloud/index.html
    :param frequencies: dict mapping word -> frequency
    :param file_path: path of the image file
    """
    # a Japanese font path must be configured correctly
    font_path = config.JAPANESE_FONT_PATH
    wc = WordCloud(background_color='white', max_font_size=320,
                   font_path=font_path, width=900, height=500)
    wc.generate_from_frequencies(frequencies)
    wc.to_file(file_path)
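# A short usage sketch for _save_word_cloud_img() above; the frequency dict is
# made up, and config.JAPANESE_FONT_PATH is assumed to point at a font that can
# render the words being drawn.
freqs = {'犬': 12, '猫': 9, '鳥': 4}
_save_word_cloud_img(freqs, 'animals_wordcloud.png')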
def create_word_cloud(df, mask_file, font_path):
    mask = np.array(Image.open(mask_file))
    wc = WordCloud(relative_scaling=0.5,
                   mask=mask,
                   prefer_horizontal=1.0,
                   background_color='white',
                   font_path=font_path)
    # df holds (word, frequency) rows; recent wordcloud versions expect a dict
    wc.generate_from_frequencies(dict(df.values))
    return wc
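# A minimal call of create_word_cloud() above, assuming a two-column DataFrame of
# (word, frequency) rows plus a mask image and font file on disk; the file names
# here are placeholders.
import pandas as pd

df = pd.DataFrame([('alpha', 10), ('beta', 6), ('gamma', 2)])
wc = create_word_cloud(df, 'mask.png', 'DejaVuSans.ttf')
wc.to_file('cloud.png')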
def word_cloud(dictionary, topic_index, topic_word):
    wd = {}
    b_1 = np.argsort(topic_word[topic_index, :])[::-1]
    cloud_word = [str(dictionary[i]) + ' ' for i in b_1]  # built but unused below
    for j in b_1:
        # normalized weight of each word within the topic
        wd[str(dictionary[j])] = topic_word[topic_index, j] / np.sum(topic_word[topic_index, :])
    huaji = imread('250px.png')  # loaded but not used as a mask here
    wc = WordCloud(width=1920, height=1080, background_color="white")
    wc.generate_from_frequencies(wd)  # recent wordcloud versions expect a dict
    plt.figure()
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def test_generate_from_frequencies():
    # test that generate_from_frequencies() takes input argument dicts
    wc = WordCloud(max_words=50)
    words = wc.process_text(THIS)
    result = wc.generate_from_frequencies(words)

    assert_true(isinstance(result, WordCloud))
def get_comments():
    productURL = 'https://sclub.jd.com/comment/'\
                 'productPageComments.action?'\
                 'productId=11461683&score=0&'\
                 'sortType=3&page='
    pageURL = '&pageSize=10&isShadowSku=0&callback=fetchJSON_comment98vv14008'
    for i in range(399):
        i = str(i)
        url = productURL + i + pageURL
        print(url)
        html = requests.get(url).content
        time.sleep(0.2)
        with open(r"jd_books.txt", "ab") as f:
            f.write(html)

    # visualize the collected information
    html = open("jd_books.txt", encoding="utf-8").read()
    print(html)
    content = re.findall(r'"content":(.*?),', html)
    content_list = []
    for i in content:
        if "img" not in i:
            content_list.append(str(i))

    # word cloud visualization
    contents = ''.join(content_list)
    contents_rank = jieba.analyse.extract_tags(contents, topK=40, withWeight=True)
    key_words = dict()
    for i in contents_rank:
        key_words[i[0]] = i[1]
    print(key_words)

    # visualize
    wc = WordCloud(font_path='/System/Library/Fonts/PingFang.ttc',
                   background_color='White',
                   max_words=50)
    wc.generate_from_frequencies(key_words)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
def test_generate_from_frequencies():
    # test that generate_from_frequencies() takes input argument of class
    # 'dict_items'
    wc = WordCloud(max_words=50)
    words = wc.process_text(THIS)
    items = words.items()
    result = wc.generate_from_frequencies(items)

    assert_true(isinstance(result, WordCloud))
def generatewordcloud(freqTable, inputImageFileName, outputImageFileName):
    global stopwordshearing
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    img = Image.open(inputImageFileName)
    img = img.resize((980, 1080), Image.ANTIALIAS)
    sl = STOPWORDS | stopwordshearing
    speakerArray = np.array(img)
    wc = WordCloud(background_color="white", max_words=1000, mask=speakerArray,
                   stopwords=sl, random_state=42)
    wc.generate_from_frequencies(freqTable)
    # print(wc.words_)

    # create coloring from image
    image_colors = ImageColorGenerator(speakerArray)
    wc.recolor(color_func=image_colors)
    wc.to_file(outputImageFileName)
def WordCloudTopic(items, imagePath=None):
    # Generate a word cloud image
    if imagePath:
        alice_coloring = np.array(Image.open(imagePath))
        stopwords = set(STOPWORDS)
        stopwords.add("said")  # set.add() returns None, so it cannot be inlined below
        wc = WordCloud(background_color="white", max_words=200,
                       mask=alice_coloring, stopwords=stopwords,
                       max_font_size=300)
        # generate word cloud
        wc.generate_from_frequencies(items)
        image_colors = ImageColorGenerator(alice_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
    else:
        wc = WordCloud(background_color="white", max_words=300,
                       max_font_size=40, random_state=42)
        wordcloud = wc.generate_from_frequencies(items)
        plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def generate_image(files, image_name):
    content = ''
    for f in files:
        content += open(f, encoding='utf-8').read()  # read as text so the strings concatenate
        content += '\n'

    graph = np.array(Image.open(image_name))
    wc = WordCloud(font_path=os.path.join(CUR_DIR, 'fonts/simhei.ttf'),
                   background_color='white',
                   max_words=MAX_WORDS,
                   mask=graph)
    words = process_text(content)
    print(len(words))
    wc.generate_from_frequencies(words)
    image = ImageColorGenerator(graph)

    plt.imshow(wc.recolor(color_func=image))
    plt.axis("off")
    plt.figure()
    plt.imshow(graph, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
def generate_word_cloud(top_words_with_freq, font_path, to_save_img_path, background_color='white'):
    # create the word cloud object
    wc = WordCloud(font_path=font_path,                 # font to use
                   background_color=background_color,   # background color, white by default
                   max_words=100,                       # show at most 100 words
                   max_font_size=80,                    # largest font size
                   random_state=50,                     # random seed
                   width=500,                           # image width
                   margin=2,                            # spacing between words
                   height=300)                          # image height
    # build the cloud from top_words_with_freq
    wc.generate_from_frequencies(top_words_with_freq)
    # render the cloud with matplotlib
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    # save the cloud as an image file
    wc.to_file(to_save_img_path)
def plotCloud():
    while True:
        ipt = input('Topic:')
        if ipt == 'exit()':
            break
        try:
            topic_id = int(ipt)  # gensim expects an integer topic id
        except ValueError:
            print('invalid type')
        else:
            cloud_word_tuple = LDA.get_topic_terms(topic_id, topn=50)
            cloud_word = [str(dictionary[i[0]]) + ' ' for i in cloud_word_tuple]  # built but unused below
            wd = {}
            for i in cloud_word_tuple:
                wd[str(dictionary[i[0]])] = i[1]
            huaji = imread('250px.jpg')  # loaded but not used as a mask here
            wc = WordCloud()
            wc.generate_from_frequencies(wd)  # recent wordcloud versions expect a dict
            plt.figure()
            plt.imshow(wc)
            plt.axis('off')
            plt.show()
class MyWordCloud(object):
    def __init__(self):
        self.stopwords = {}
        self.seg_list = []
        self.m_wordcloud = None

    def StopWord(self, filename):
        # build a stop-word dict, one word per line in the file
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.rstrip()
                if word:
                    self.stopwords[word] = 1
        return self.stopwords

    def WordCut(self, stopwords, inputfile):
        with open(inputfile) as f:
            text = f.readlines()
        text = r' '.join(text)
        seg_generator = jieba.cut(text)
        self.seg_list = [i for i in seg_generator if i not in stopwords]
        self.seg_list = [i for i in self.seg_list if i != u' ']
        self.seg_list = r' '.join(self.seg_list)
        return self.seg_list

    def GenWordCloud(self, seg_list=None, font_path=None, background_color="black",
                     margin=5, width=1800, height=800, flag=1):
        self.m_wordcloud = WordCloud(font_path=font_path,
                                     background_color=background_color,
                                     margin=margin, width=width, height=height)
        if flag == 0:
            # in this branch seg_list must be a dict of word -> frequency
            self.m_wordcloud = self.m_wordcloud.generate_from_frequencies(seg_list)
        else:
            # in this branch seg_list is a whitespace-joined string of tokens
            self.m_wordcloud = self.m_wordcloud.generate(seg_list)
        return self.m_wordcloud
def save_word_cloud(subreddit, frequencies, stopwords=STOPWORDS):
    try:
        # download images for subreddit
        download_images(['--score', MIN_SCORE, '--num', NUM_PHOTOS,
                         '--sort-type', 'topall', subreddit, subreddit])

        # get a list of downloaded file names
        coloring = []
        for file in ext_files(subreddit, 'jpg') + ext_files(subreddit, 'png'):
            base_file = os.path.basename(file)
            filename = os.path.join(BASE_DIR, subreddit, base_file)
            # get the number of colors in the image and compare
            image = Image.open(filename)
            w, h = get_image_size(filename)
            if w > WIDTH and h > HEIGHT and all_colors(filename) > COLORS \
                    and num_colors(filename) >= DOMINANT_COLORS:
                coloring = np.array(image)
                break

        shutil.rmtree(subreddit)

        if not len(coloring):
            # get previews for gifs
            coloring = get_gif_coloring(subreddit)

        if not len(coloring):
            raise Exception('No suitable image found')

        wc = WordCloud(font_path=os.path.join(BASE_DIR, 'fonts', 'Viga-Regular.otf'),
                       background_color="white", width=WIDTH, height=HEIGHT,
                       max_words=500, mask=coloring, min_font_size=18)

        # generate word cloud
        wc.generate_from_frequencies(frequencies)

        # create coloring from image
        image_colors = ImageColorGenerator(coloring)

        # recolor wordcloud and show
        # we could also give color_func=image_colors directly in the constructor
        plt.imshow(wc.recolor(color_func=image_colors))
        plt.axis("off")
        fig = plt.gcf()

        # save wordcloud for subreddit
        fig.savefig('{}.png'.format(subreddit), transparent=True)
        return "generated image for {}".format(subreddit)
    except Exception as e:
        print(str(e))
def make_word_cloud(content):
    # read the mask image
    d = path.dirname(__file__)
    # alice_mask = np.array(Image.open(path.join(d, "mask/terran.jpg")))
    mask = np.array(Image.open(path.join(d, mask_img)))

    wc = WordCloud(background_color="white", max_words=1000, mask=mask)
    # give the absolute dir for the font ttf file
    wc.font_path = abs_font_dir
    # brush options: {'shoujin_brush.ttf', 'Japan_brush.ttf',
    #                 'qingke_fangzheng.ttf', 'KouzanBrushFont.ttf'}

    # generate word cloud from frequencies
    wc.generate_from_frequencies(content)
    # wc.generate(text)

    # store to file
    wc.to_file(path.join(d, "img/output.png"))
    # store to static folder in web server
    # wc.to_file(path.join(d, "../static/output.png"))

    # show
    plt.imshow(wc)
    plt.axis("off")
    plt.figure()
    plt.imshow(mask, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
def main():
    # read the whole text
    reader = csv.reader(open('/Users/kudari/workspaces/final_homework/output/test.csv', 'r'))
    d = {}
    for k, v in reader:
        d[k] = int(v)

    # read the pic; the path is absolute, so there is no need to join it with
    # anything (and `d` is the frequency dict by this point)
    coloring = np.array(Image.open("/Users/kudari/workspaces/final_homework/pic2.jpg"))

    # set stopwords
    stopwords = set(STOPWORDS)

    # create a wordcloud
    wc = WordCloud(
        max_font_size=88,
        background_color='white',
        font_path="/Users/kudari/workspaces/final_homework/Fins-Regular.otf",
        width=1000,
        height=860,
        mask=coloring,  # sets the cloud shape
        stopwords=stopwords,
        max_words=500,
    )
    # generate the cloud
    wc.generate_from_frequencies(frequencies=d)

    # create coloring from image
    image_colors = ImageColorGenerator(coloring)

    # show
    plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis("off")
    plt.show()
    wc.to_file("/Users/kudari/workspaces/final_homework/test2.png")
def plot_wordcloud_with_property(topicWeightedWords, topicsByProperty):
    figure(figsize=(16, 40))
    for idx, topic in enumerate(topicWeightedWords):
        wc = WordCloud(background_color="white")
        img = wc.generate_from_frequencies(
            {word: weight for weight, word in topic})  # recent wordcloud versions expect a dict
        subplot(len(topicWeightedWords), 2, 2 * idx + 1)
        imshow(img)
        axis('off')

        subplot(len(topicWeightedWords), 2, 2 * idx + 2)
        plot(topicsByProperty[:, idx])
        axis([10, 100, 0, 1.0])
        title('Topic #%2d' % (idx))
def phrase2pic(phrase_file, out_png, font_path, mask_file):
    phrase_dict = txt2dict(phrase_file)
    pic_address = path.abspath(mask_file)
    pic = imread(pic_address)                 # read the mask image
    pic_color = ImageColorGenerator(pic)      # build a color function from the image
    wc = WordCloud(background_color='white',  # construct the WordCloud object
                   mask=pic,
                   width=750,
                   height=750,
                   max_font_size=80,
                   random_state=30,
                   font_path=font_path,
                   max_words=500,
                   min_font_size=2,
                   color_func=pic_color)
    wc.generate_from_frequencies(phrase_dict)
    # wc.generate(new_textlist)  # alternative: build the cloud from raw text
    plt.figure()                 # plot
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    wc.to_file(out_png)          # save the image
def colorWordCould(e_dist):
    from PIL import Image
    import numpy as np
    bcimg = Image.open('D:/PS素材教程/漫威/mmexport148828636136611.png')
    bd = np.array(bcimg)
    wcld = WordCloud(
        background_color='white',  # background color
        mask=bd,
        max_words=2000,            # maximum number of words shown
        max_font_size=50,          # largest font size
        random_state=30,           # number of random states, i.e. color schemes
    )
    wordc = wcld.generate_from_frequencies(e_dist)
    ims = wordc.to_image()
    ims.show()
def create_wordcloud(self, filename=None):
    '''
    create a wordcloud of the top words in a cluster
    '''
    plt.figure()
    for idx, topic in enumerate(self.topic_weights):
        wc = WordCloud(background_color="white")
        ww = {word: weight for word, weight in topic.items()}  # recent wordcloud versions expect a dict
        img = wc.generate_from_frequencies(ww)
        plt.subplot(len(self.topic_weights), 2, 2 * idx + 1)
        plt.axis('off')
        plt.imshow(img)
    if filename is None:
        plt.show()
    else:
        plt.savefig(filename, dpi=300)
        plt.close()
wc = WordCloud(width=1000, height=1000, background_color="black", max_words=100,
               mask=sword_mask)
wc.generate(words_no_characters)
wc.recolor(color_func=grey_color_func, random_state=3)
wc.to_file("word_cloud/no_characters_wc_sword2.png")

wc = WordCloud(width=1000, height=1000, background_color="black", max_words=100,
               mask=sword_mask)
wc.generate_from_frequencies(word_could_dict)
wc.recolor(color_func=grey_color_func, random_state=3)
wc.to_file("word_cloud/characters_wc_sword2.png")

wc = WordCloud(width=1000, height=1000, background_color="white", max_words=100,
               mask=throne_mask)
wc.generate(words)
wc.to_file("word_cloud/words_wc_throne.png")

wc = WordCloud(width=1000, height=1000, background_color="white", max_words=100,
# -*- coding:utf-8 -*-
from scipy.misc import imread
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
from collections import Counter

files = open('yanjiang.txt', encoding='utf-8', errors='ignore').read()
text_jieba = list(jieba.cut(files))
c = Counter(text_jieba)
common_c = c.most_common(100)

bg_pic = imread('b.png')
wc = WordCloud(font_path='1.4.ttf', background_color='red', width=1000,
               height=800, mask=bg_pic, max_words=2000, max_font_size=1000)
wc.generate_from_frequencies(dict(common_c))

# render and show the image
plt.figure()
plt.imshow(wc)
plt.axis('off')
plt.show()

# save the image
wc.to_file('anne.jpg')
# -*- coding: utf-8 -*-
"""
Demo: feed a set of (word, frequency) pairs to generate_from_frequencies
and produce a word cloud image.
"""
# import the wordcloud and matplotlib modules
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# a small table of word frequencies
text_dict = (('you', 3000), ('love', 3500), ('I', 3000), ('Li', 2800),
             ('Phone', 3000), ('my', 300), ('mu~', 2000), ('mu~~', 510),
             ('heart', 500), ('sweet', 180))

# generate the word cloud (generate_from_frequencies expects a dict)
wc = WordCloud()
wordcloud = wc.generate_from_frequencies(frequencies=dict(text_dict))

# display the word cloud image
plt.figure(num="demo", figsize=(5, 6), dpi=500, facecolor='w', edgecolor='w')
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

# save the image
wc.to_file('outputFiles/demo1_output.jpg')
plt.close()
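# A variation on the demo above, as a minimal sketch: the same kind of cloud built
# from raw text via collections.Counter instead of a hand-written frequency table
# (the text and the outputFiles/ directory are assumptions).
from collections import Counter
from wordcloud import WordCloud

text = "love you love I love Phone you I"
wc = WordCloud()
wc.generate_from_frequencies(Counter(text.split()))  # Counter is a dict subclass
wc.to_file('outputFiles/demo1_counter_output.jpg')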
if other_drugs[i] in filt_absList[j]:
    freq = FreqDist(filt_absList[j])
    other_freq_df = pd.DataFrame(list(freq.items()), columns=["Word", "Frequency"])

# sort in order of frequency, most common first
app_freq_df.sort_values(by=['Frequency'], ascending=False, inplace=True)
other_freq_df.sort_values(by=['Frequency'], ascending=False, inplace=True)

## -------- Wordcloud of word frequency ---------------------
d = {}
for a, x in app_freq_df.values:
    d[a] = x

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear", cmap='RdBu')
plt.axis("off")
plt.title('approved drugs')
plt.show()

d = {}
for a, x in other_freq_df.values:
    d[a] = x

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
        occurence.append([k, v])
    occurence.sort(key=lambda x: x[1], reverse=True)
    return occurence, keyCount

occurence, dum = wordCounter(movies_ds, 'genres', genreNames)

# Create the dictionary to produce a wordcloud of the movie genres
genres = dict()
trunc_occurences = occurence[0:18]
for name in trunc_occurences:
    genres[name[0]] = name[1]

# Create and display the wordcloud
genre_wordcloud = WordCloud(width=1000, height=400, background_color='white')
genre_wordcloud.generate_from_frequencies(genres)
f, ax = plot.subplots(figsize=(16, 8))
plot.imshow(genre_wordcloud, interpolation="bilinear")
plot.axis('off')
plot.show()

# Break up the big genre string into a string array
movies_ds['genres'] = movies_ds['genres'].str.split('|')
# Convert genres to string value
movies_ds['genres'] = movies_ds['genres'].fillna("").astype('str')

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies_ds['genres'])
adv_uni_dict = {}
for i in range(len(sorted_adv_unigrams)):
    adv_uni_dict[''.join(sorted_adv_unigrams[i][0])] = sorted_adv_unigrams[i][1]

adv_bi_dict = {}
for i in range(len(sorted_adv_bigrams)):
    adv_bi_dict[''.join(sorted_adv_bigrams[i][0])] = sorted_adv_bigrams[i][1]

# instantiate a wordcloud object: show at most the 30 highest-frequency words,
# with the given image height, width and background color
wordCloud = WordCloud(max_words=30, height=1000, width=1500, background_color='white')

# generate the word clouds and store them as image files in the current project location
poswc_unigrams = wordCloud.generate_from_frequencies(pos_uni_dict)
poswc_unigrams.to_file('poswc_unigrams.png')
conwc_unigrams = wordCloud.generate_from_frequencies(con_uni_dict)
conwc_unigrams.to_file('conwc_unigrams.png')
advwc_unigrams = wordCloud.generate_from_frequencies(adv_uni_dict)
advwc_unigrams.to_file('advwc_unigrams.png')
poswc_bigrams = wordCloud.generate_from_frequencies(pos_bi_dict)
poswc_bigrams.to_file('poswc_bigrams.png')
conwc_bigrams = wordCloud.generate_from_frequencies(con_bi_dict)
conwc_bigrams.to_file('conwc_bigrams.png')
advwc_bigrams = wordCloud.generate_from_frequencies(adv_bi_dict)
advwc_bigrams.to_file('advwc_bigrams.png')

# to view in console using matplotlib
# plt.title('Pro Unigrams words')
# plt.imshow(poswc_unigrams, interpolation='bilinear')
c = Counter(text_jieba)    # count tokens
word = c.most_common(500)  # keep the top 500

bg_pic = imread('src.jpg')
wc = WordCloud(
    # font_path='C:\Windows\Fonts\微软雅黑.TTF',  # a font that can render Chinese
    background_color='white',  # background color
    max_words=2000,            # maximum number of words shown
    mask=bg_pic,               # background image mask
    max_font_size=200,         # largest font size
    random_state=20            # number of random states, i.e. color schemes
)
wc.generate_from_frequencies(dict(word))  # generate the word cloud
print(dict(word))
wc.to_file('result.jpg')

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(bg_pic, cmap=plt.cm.gray)
plt.axis("off")
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('./dat/news_word_top.csv')
df.set_index('word', inplace=True)
freq_dict = df.to_dict()['count']

font_path = './font/NotoSansKR-Regular.otf'
wc = WordCloud(background_color='white', max_words=1000,
               font_path=font_path, width=1920, height=1080)
wc.generate_from_frequencies(freq_dict)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
    f.write(content_str)

#------------------------------------------------------------
# 4) morphological analysis of the collected text
#------------------------------------------------------------
# extract only the nouns from the collected news bodies
nlp = Okt()
nouns = nlp.nouns(news_content)

count = Counter(nouns)         # count noun frequencies
most = count.most_common(100)  # take the 100 most frequent words

# reshape the result into the form the word cloud expects
# --> {"word": frequency, "word": frequency ...}
tags = {}
for n, c in most:
    if len(n) > 1:
        tags[n] = c

#------------------------------------------------------------
# 5) build the word cloud from the collected result
#------------------------------------------------------------
# create the word cloud object
wc = WordCloud(font_path="NanumGothic", max_font_size=200,
               width=1200, height=800, background_color='#ffffff')
wc.generate_from_frequencies(tags)    # build from the prepared dictionary
wc.to_file("news_%s.png" % datetime)  # save the word cloud image
lyric += f.read()
result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
keywords = dict()
for i in result:
    keywords[i[0]] = i[1]
print(keywords)

image = Image.open('Mrs.jpeg')
graph = np.array(image)
wc = WordCloud(font_path='DroidSansFallback.ttf', background_color='White',
               max_words=50, mask=graph)
wc.generate_from_frequencies(keywords)
image_color = ImageColorGenerator(graph)
plt.imshow(wc.recolor(color_func=image_color))
plt.axis("off")
plt.show()
wc.to_file('output.png')

X = []
Y = []
for key in keywords:
    X.append(key)
    Y.append(keywords[key])
num = len(X)
text = konlpy.utils.read_txt('이용수할머니기자회견문.txt', encoding=u'utf-8')
nouns = okt.nouns(text)

words = []
for i in nouns:
    if len(i) > 1:
        words.append(i)

count = Counter(words)
most = count.most_common(100)

tags = {}
for i, j in most:
    tags[i] = j

wc = WordCloud(font_path='NANUMSQUARE.TTF', width=1200, height=1200,
               scale=2.0, max_font_size=250)
gen = wc.generate_from_frequencies(tags)

plt.figure()
plt.imshow(gen, interpolation='bilinear')
wc.to_file('mh2.png')
plt.close()
corpus_es = [
    ' '.join(data_es_it[(data_es_it.user_location == candidate)].text.tolist())
    for candidate in text_cloud
]

cv = CountVectorizer(stop_words=spanish_stopwords, ngram_range=(1, 3))
X = cv.fit_transform(corpus_es)
X = X.toarray()

bow = pd.DataFrame(X, columns=cv.get_feature_names())
bow.index = text_cloud

text_es = bow.loc['spain'].sort_values(ascending=False)[:4000]
text2_dict = bow.loc['spain'].sort_values(ascending=False).to_dict()

# create the WordCloud object
wordcloud = WordCloud(min_word_length=3, background_color='white')
wordcloud.generate_from_frequencies(text2_dict)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# create a dictionary for Italian
text_cloud_it = data_es_it.user_location.unique()
corpus_it = [
    ' '.join(data_es_it[(data_es_it.user_location == candidate)].text.tolist())
    for candidate in text_cloud_it  # use the Italian location list defined above
]

# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words=italian_stopwords, ngram_range=(1, 3))
han.pos(s_park[0])
han.nouns(s_park[0])

# step4: plotting
# word cloud for Park
s_park_noun = extract_nouns(s_park)
count = Counter(s_park_noun)
tags = count.most_common(100)

# draw the word cloud with WordCloud and matplotlib
font_path = "C:/WINDOWS/Fonts/NANUMGOTHIC.TTF"
wc = WordCloud(font_path=font_path, background_color='white',
               width=800, height=600)
cloud = wc.generate_from_frequencies(dict(tags))

plt.figure(figsize=(10, 8))
plt.axis('off')
plt.imshow(cloud)
plt.savefig('park.png', dpi=600)
plt.show()

# word cloud for Moon
s_moon_noun = extract_nouns(s_moon)
count = Counter(s_moon_noun)
tags = count.most_common(100)

# draw the word cloud with WordCloud and matplotlib
font_path = "C:/WINDOWS/Fonts/NANUMGOTHIC.TTF"
wc = WordCloud(font_path=font_path, background_color='white', width=800,
def get_chapter_cloud(chapters, chapter):
    chapter_cloud_data = []
    unique_words = set(chapter)
    for word in unique_words:
        weight = utility.tf_idf(word, chapters, chapter)
        chapter_cloud_data.append((word, int(weight * 100)))
    weight = lambda element: element[1]
    chapter_cloud_data.sort(key=weight, reverse=True)
    return chapter_cloud_data


if __name__ == '__main__':
    book = utility.get_text_file_as_list('shrek.txt')
    chapters = utility.split_by_delimiter(book, "#" * 10)
    preprocessed_chapters = [preprocess_text(chapter) for chapter in chapters]
    cloud_data = prepare_word_cloud_data(preprocessed_chapters)
    for i, data in enumerate(cloud_data):
        wc = WordCloud(background_color="white", max_words=2000,
                       contour_width=3, contour_color='steelblue')
        wc.generate_from_frequencies(dict(data[5:]))
        wc.to_file(f'clouds/shrek_cloud{i}.png')

    # subexercise 5
    preprocessed_book = preprocess_text(book)
    cloud = get_chapter_cloud(preprocessed_book, preprocessed_book)
    wc = WordCloud(background_color="white", max_words=2000,
                   contour_width=3, contour_color='steelblue')
    wc.generate_from_frequencies(dict(cloud[15:]))
    wc.to_file('clouds/book_tf_idf.png')
plt.xticks(np.arange(0, (2 * top_features)), feature_names[top_coefficients],
           rotation=60, ha="right")
plt.show()

plot_coefficients()

### Create Word Cloud #######
mystopwords = set(stopwords.words('english'))

# Read a text file and calculate frequency of words in it
with open("C:/Users/becky/Desktop/deceptions.txt", "r") as f:
    words = f.read().split()

data = dict()
for word in words:
    word = word.lower()
    if word in mystopwords:  # skip stop words when counting
        continue
    data[word] = data.get(word, 0) + 1

word_cloud = WordCloud(
    background_color=background_color,
    width=width,
    height=height,
    collocations=False,
    stopwords=mystopwords
)
word_cloud.generate_from_frequencies(data)
word_cloud.to_file('image7.png')
def plot_word_cloud_single(output_dir, grades, index, color=None):
    import seaborn as sns

    if color is not None:
        colormap = sns.dark_palette(color, as_cmap=True)
    else:
        colormap = None

    # Scrap non-interesting contrasts
    contrasts = list(filter(
        lambda x: 'effects_of_interest' not in x and 'gauthier' not in x,
        grades))[:15]

    frequencies_cat = defaultdict(lambda: 0.)
    frequencies_single = defaultdict(lambda: 0.)
    occurences = defaultdict(lambda: 0.)
    for contrast in contrasts:
        grade = grades[contrast]
        study, contrast = contrast.split('::')
        contrast = contrast.replace('_', ' ').replace('&', ' ').replace('-', ' ')
        terms = contrast.split(' ')
        cat_terms = []
        for term in terms:
            if term == 'baseline':
                break
            if term in ['vs']:
                break
            cat_terms.append(term)
        for term in cat_terms:
            frequencies_single[term] += grade
            occurences[term] += 1
        cat_terms = ' '.join(cat_terms)
        frequencies_cat[cat_terms] += grade

    frequencies_single = {term: freq / math.sqrt(occurences[term])
                          for term, freq in frequencies_single.items()}

    width, height = (900, 450)
    wc = WordCloud(prefer_horizontal=1,
                   background_color="rgba(255, 255, 255, 0)",
                   width=width, height=height,
                   colormap=colormap,
                   relative_scaling=0.7)
    wc.generate_from_frequencies(frequencies=frequencies_single)
    wc.to_file(join(output_dir, 'wc_single_%i.png' % index))

    width, height = (900, 300)
    wc = WordCloud(prefer_horizontal=1,
                   background_color="rgba(255, 255, 255, 0)",
                   width=width, height=height, mode='RGBA',
                   colormap=colormap,
                   relative_scaling=0.8)
    wc.generate_from_frequencies(frequencies=frequencies_cat)
    wc.to_file(join(output_dir, 'wc_cat_%i.png' % index))

    width, height = (1200, 300)
    wc = WordCloud(prefer_horizontal=1,
                   background_color="rgba(255, 255, 255, 0)",
                   width=width, height=height, mode='RGBA',
                   colormap=colormap,
                   relative_scaling=0.8)
    wc.generate_from_frequencies(frequencies=frequencies_cat)
    wc.to_file(join(output_dir, 'wc_cat_%i_wider.png' % index))
def display(input_dict, display_type='first_appear'):
    if display_type == 'first_appear':
        input_dict = dict(sorted(input_dict.items(), key=lambda x: x[1][0]))
        print('Appearance Sequence: ')
        seq = pd.Series(key for key in input_dict.keys())
        print(seq)
    elif display_type == 'print_name_dict':
        global file_line_count
        if len(input_dict) >= 1:
            input_dict = dict(sorted(input_dict.items(), key=lambda x: x[0]))
            print("Characters and their information: ")
            name = []
            first_appearance = []
            mentioned = []
            for key, value in input_dict.items():
                name.append(key)
                first_appearance.append(round(value[0] / file_line_count * 100, 2))
                mentioned.append(len(value))
            df_out = pd.DataFrame()
            df_out['Name'] = name
            df_out['FirstMentioned(%)'] = first_appearance
            df_out['Mentioned'] = mentioned
            print(df_out)
    elif display_type == 'print_family_name_dict':
        print("Family Name Dictionary:")
        for family_name, info in input_dict.items():
            mat = "{:25}\t{:5}\t{:}"
            family_member = [member[0] for member in info[1]]
            print(mat.format(family_name, str(info[0]), " / ".join(family_member)))
    elif display_type == 'print_appearance_dict(after projection)':
        for line, appear_member in input_dict.items():
            if len(appear_member):
                mat = "{:<5}: {:}"
                print(mat.format(line, " / ".join(appear_member)))
    elif display_type == 'characters_word_cloud':
        wc = WordCloud(background_color='white')
        wc.generate_from_frequencies(input_dict)
        plt.figure()
        plt.imshow(wc)
        plt.axis('off')
        plt.show()
    elif display_type == 'characters_relation_graph(csv generate)':
        characters_relation_extract(input_dict)
        with open(save_csv_path + file_name[0:file_name.find(".")] + '_node.csv',
                  'w', encoding='utf-8') as file:
            file.write('id,label,weight\n')
            for name, freq in freq_dict.items():
                file.write(name + ',' + name + ',' + str(freq) + '\n')
        with open(save_csv_path + file_name[0:file_name.find(".")] + '_edge.csv',
                  'w', encoding='utf-8') as file:
            file.write('source,target,weight\n')
            for name, freq in relation_score_dict.items():
                file.write(name[0] + ',' + name[1] + ',' + str(freq) + '\n')
    elif display_type == 'train_relation_classifier':
        if build_relation_record_dict:
            if generate_graph:
                lengths = []
                scores = []
                for length in range(20, len(relation_record_dict), 10):
                    lengths.append(length)
                    score = relation_predict.train(relation_record_dict,
                                                   file_name_prefix,
                                                   name_dict.keys(), length, True)
                    scores.append(score)
                pylab.plot(lengths, scores, '-bo')
                pylab.title('Related Word Freq Classifier Performance '
                            'with Varying Relation Set Size')
                pylab.xlabel('Relation Set Size')
                pylab.ylabel('Accuracy(Cross Validation)')
                pylab.show()
                with open(save_csv_path + file_name[0:file_name.find(".")] + '_scores.csv',
                          'w', encoding='utf-8') as file:
                    file.write('feature set size,score\n')
                    for i in range(len(lengths)):
                        file.write(str(lengths[i]) + ',' + str(scores[i]) + '\n')
            else:
                res = input('Load an existing feature set (saves about a minute)? [y/n]')
                res = res == 'y'
                relation_predict.train(relation_record_dict, file_name_prefix,
                                       name_dict.keys(), best_length, res)
        else:
            print("[WARNING] Need to build relation record dict first")
def mk_wordcloud(words, filename_out, strings_exclude):
    """
    words : string
    filename_out : string
    strings_exclude : list, e.g. ['xxx', 'yyy']
        words to remove from the text because they contribute little meaning
    """
    WNL = nltk.WordNetLemmatizer()
    text = words

    # Lowercase and tokenize
    text = text.lower()

    # Remove single quote early since it causes problems with the tokenizer.
    text = text.replace("'", "")

    # Remove numbers from text
    # remove_digits = str.maketrans('', '', digits)
    # text = text.translate(remove_digits)

    tokens = nltk.word_tokenize(text)
    text1 = nltk.Text(tokens)

    # Remove extra chars and remove stop words.
    text_content = [
        "".join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
        for word in text1
    ]

    # set the stopwords list
    stopwords_wc = set(STOPWORDS)
    customised_words = strings_exclude
    new_stopwords = stopwords_wc.union(customised_words)
    text_content = [word for word in text_content if word not in new_stopwords]

    # After the punctuation above is removed it still leaves empty entries in the list.
    text_content = [s for s in text_content if len(s) != 0]

    # Best to get the lemmas of each word to reduce the number of similar words
    text_content = [WNL.lemmatize(t) for t in text_content]

    nltk_tokens = nltk.word_tokenize(text)

    bigrams_list = list(nltk.bigrams(text_content))
    # print(bigrams_list)
    dictionary2 = [" ".join(tup) for tup in bigrams_list]
    # print(dictionary2)

    # Using count vectoriser to view the frequency of bigrams
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    bag_of_words = vectorizer.fit_transform(dictionary2)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    # print(words_freq[:100])

    # Generating wordcloud and saving as an image
    words_dict = dict(words_freq)
    WC_height = 1000
    WC_width = 1500
    WC_max_words = 200
    wordCloud = WordCloud(
        max_words=WC_max_words,
        height=WC_height,
        width=WC_width,
        stopwords=new_stopwords,
    )
    wordCloud.generate_from_frequencies(words_dict)
    plt.title("Most frequently occurring bigrams connected by same colour and font size")
    plt.imshow(wordCloud, interpolation="bilinear")
    plt.axis("off")
    # plt.show()
    return wordCloud.to_file(filename_out)
def main(city, keyword, region, pages):
    '''
    main function
    '''
    csv_filename = 'zl_' + city + '_' + keyword + '.csv'
    txt_filename = 'zl_' + city + '_' + keyword + '.txt'
    headers = ['job', 'years', 'education', 'salary', 'company', 'scale', 'job_url']
    salaries = []

    write_csv_headers(csv_filename, headers)
    for i in range(pages):
        '''
        fetch all job postings on this page and write them to the csv file
        '''
        job_dict = {}
        html = get_one_page(city, keyword, region, i)
        items = parse_one_page(html)
        for item in items:
            html = get_detail_page(item.get('job_url'))
            job_detail = get_job_detail(html)

            job_dict['job'] = item.get('job')
            job_dict['years'] = job_detail.get('years')
            job_dict['education'] = job_detail.get('education')
            job_dict['salary'] = item.get('salary')
            job_dict['company'] = item.get('company')
            job_dict['scale'] = job_detail.get('scale')
            job_dict['job_url'] = item.get('job_url')

            # clean the data: keep only Chinese characters so punctuation and
            # other noise do not distort the word frequency counts
            pattern = re.compile(r'[一-龥]+')
            filterdata = re.findall(pattern, job_detail.get('requirement'))
            write_txt_file(txt_filename, ''.join(filterdata))
            write_csv_rows(csv_filename, headers, job_dict)

    sal = read_csv_column(csv_filename, 3)
    # skip the header row and convert to int, building a new list
    for i in range(len(sal) - 1):
        # a salary of '0' means the posting said "negotiable"; exclude it
        if not sal[i] == '0':
            salaries.append(int(sal[i + 1]))

    plt.hist(salaries, bins=10)
    plt.show()

    content = read_txt_file(txt_filename)
    segment = jieba.lcut(content)
    words_df = pd.DataFrame({'segment': segment})
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep=" ", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # configure the word cloud
    color_mask = imread('background.jfif')
    wordcloud = WordCloud(
        font_path="simhei.ttf",    # a font that can render Chinese
        background_color="white",  # background color
        max_words=100,             # maximum number of words shown
        mask=color_mask,           # background image mask
        max_font_size=100,         # largest font size
        random_state=42,
        width=1000,
        height=860,
        # width/height set the default image size, but with a mask the saved
        # image follows the mask's size; margin is the spacing around words
        margin=2,
    )

    # build the cloud: generate() takes raw text, while generate_from_frequencies()
    # takes precomputed word frequencies, as used here
    word_frequence = {x[0]: x[1] for x in words_stat.head(100).values}
    wordcloud.generate_from_frequencies(word_frequence)

    # derive colors from the background image
    image_colors = ImageColorGenerator(color_mask)
    # recolor
    wordcloud.recolor(color_func=image_colors)
    # save the image
    wordcloud.to_file('output.png')

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def print_word_cloud(frequencies):
    wc = WordCloud(background_color='white', width=1000, height=800)
    wc.generate_from_frequencies(frequencies)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
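# A minimal driver for print_word_cloud() above; the token list is made up and
# the frequencies come from collections.Counter.
from collections import Counter

print_word_cloud(Counter(['data', 'data', 'cloud', 'word', 'word', 'word']))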
def WordCloud_Color(pd_csv, video_id):
    pwd = os.path.dirname(__file__)
    jieba.set_dictionary(pwd + '/dict.txt.big_new.txt')
    # also load our custom dictionary built from Wikipedia, self_define_dict.txt,
    # so proper nouns such as 蔡英文 and 韓國瑜 are segmented correctly
    # (the custom dictionary must be saved with utf-8 encoding)
    jieba.load_userdict(pwd + '/self_define_dict.txt')

    # read stopwords.txt and build the stopwords dict
    stopwords = {}
    with open(pwd + r'/test_dict_stop.txt', 'r', encoding='UTF-8') as file:
        for st_word in file.readlines():
            st_word = st_word.strip()  # strip surrounding whitespace
            stopwords[st_word] = 1

    FilePath = pwd
    ImgPath = pwd + r'/static/images'
    wd_dict = {}

    # the full news dump; some characters (e.g. the 堃 in 游錫堃) are not
    # encodable with python's defaults, hence encoding='utf-8-sig' when reading
    for j, content in enumerate(pd_csv['clean_con']):
        # strip this boilerplate script fragment that udn embeds in article bodies
        content = content.replace(
            '''domready(function() {if ( !countDownStatus ) getCountdown();if ( !highChartsStatus ) getHighcharts();});domready(function() {var channelId = 2;var actArea = "poll_at_story_0_v773";var actCode = "v773";var actTemplate = "bar2";var elemDiv = document.createElement('div');elemDiv.id = actArea;elemDiv.className ='vote_body area';var scr = document.getElementById(actArea+'_script');scr.parentNode.insertBefore(elemDiv, scr);$.getScript('/funcap/actIndex.jsp?actCode=' + actCode + '&channelId=' + channelId , function() {actTemplate = eval('objAct_' + actCode + '.d1.actTemplate');$.ajaxSetup({ cache: true });$.getScript('/func/js/' + actTemplate + '_min.js?2019122401', function() {$.ajaxSetup({ cache: false });piepkg();loadTemplateJs(actTemplate);eval(actTemplate + 'view("' + '#' + actArea + '");')})});});''',
            '')
        content = content.strip('').strip('\n').strip('')  # strip whitespace and line breaks around the article
        seg_con_list = jieba.cut(content)
        # filter jieba's output with the stopwords dict
        for wd in seg_con_list:
            wd = wd.strip('')
            if is_alphabet(wd) != True:
                if stopwords.get(wd) == None and len(wd) > 1:
                    if wd_dict.get(wd) == None:
                        wd_dict[wd] = 1   # first occurrence of the word
                    else:
                        wd_dict[wd] += 1  # seen before: increment the count
        # after each article is done, move on to the next one
    print("Video ID: {}".format(video_id))

    # === deal with similarity_dict ===
    fw = open(pwd + r'/similarity_dict.txt', 'r', encoding='utf-8-sig')
    sy_list = []
    while True:
        line = fw.readline()
        b = line.strip('\n').strip(' ')
        a = b.split(',')
        sy_list.append(a)
        if not line:
            break
    fw.close()
    sy_list.pop()  # drop the trailing empty list

    # merge the counts of each synonym group under its first entry
    ncount = 0
    for n, syn in enumerate(sy_list):
        for i in range(len(syn)):
            ncount += wd_dict.get(syn[i], 0)
            if wd_dict.get(syn[i]) != None:
                del wd_dict[syn[i]]
        wd_dict[syn[0]] = ncount
        ncount = 0
    print(wd_dict)
    # del wd_dict['不拘']

    # ===== generate the word cloud ======
    def random_color_func(word=None, font_size=None, position=None,
                          orientation=None, font_path=None, random_state=None):
        # HSL color reference: http://csscoke.com/2015/01/01/rgb-hsl-hex/
        h = randint(0, 240)
        # s = int(100.0 * 255.0 / 255.0)
        s = randint(70, 100)
        l = int(100.0 * float(randint(60, 120)) / 255.0)
        return "hsl({}, {}%, {}%)".format(h, s, l)

    # the font is set to a CJK-capable face; put the font file in the same
    # working directory as the program
    font = pwd + '/NotoSansCJKtc-Black.otf'
    # wordcloud = WordCloud(background_color='white', font_path=font, scale=5)
    wordcloud = WordCloud(background_color='white', font_path=font,
                          max_font_size=50, min_font_size=10, scale=10,
                          max_words=500)
    # the word frequencies go in as a dict (wd_dict)
    my_wordcloud = wordcloud.generate_from_frequencies(frequencies=wd_dict)
    # draw the word cloud
    my_wordcloud.recolor(color_func=random_color_func)
    plt.axis("off")
    wordcloud.to_file(ImgPath + '/{}.png'.format(video_id))
def generateTfIdfWordClouds(pathToMemex):
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  # [:500]

    print("\taggregating texts into documents...")
    docList = []
    docIdList = []

    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))
        docId = citeKey
        doc = " ".join(docData.values())

        # clean doc
        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
        doc = re.sub('\W+', ' ', doc)
        doc = re.sub('_+', ' ', doc)
        doc = re.sub('\d+', ' ', doc)
        doc = re.sub(' +', ' ', doc)

        # update lists
        docList.append(doc)
        docIdList.append(docId)

    print("\t%d documents generated..." % len(docList))

    # PART 2: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    stopWords = functions.loadMultiLingualStopWords(["deu", "eng", "fre"])
    vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=2, max_df=0.5,
                                 stop_words=stopWords)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(countVectorized)  # generates a sparse matrix

    print("\tconverting and filtering tfidf data...")
    tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList,
                              columns=vectorizer.get_feature_names())
    tfidfTable = tfidfTable.transpose()
    tfidfTableDic = tfidfTable.to_dict()
    tfidfTableDic = filterTfidfDictionary(tfidfTableDic, 0.03, "more")
    # tfidfTableDic = json.load(open("/Users/romanovienna/Dropbox/6.Teaching_New/BUILDING_MEMEX_COURSE/_memex_sandbox/_data/results_tfidf_publications.dataJson"))

    # PART 4: generating wordclouds
    print("\tgenerating wordclouds...")
    wc = WordCloud(width=1000, height=600, background_color="white",
                   random_state=2, relative_scaling=0.5,
                   # color_func=lambda *args, **kwargs: (179, 0, 0))  # single color
                   # colormap="copper")  # Oranges, Reds, YlOrBr, YlOrRd, OrRd; copper
                   colormap="autumn")  # binary, gray
    # https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html

    counter = len(tfidfTableDic)
    citeKeys = list(tfidfTableDic.keys())
    random.shuffle(citeKeys)

    for citeKey in citeKeys:
        savePath = functions.generatePublPath(pathToMemex, citeKey)
        savePath = os.path.join(savePath, "%s_wCloud.jpg" % citeKey)

        if not os.path.isfile(savePath):
            wc.generate_from_frequencies(tfidfTableDic[citeKey])

            # plotting
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            # plt.show()  # this line shows the plot
            plt.savefig(savePath, dpi=200, bbox_inches='tight')

            print("\t%s (%d left...)" % (citeKey, counter))
            counter -= 1
        else:
            print("\t%s --- already done" % (citeKey))
            counter -= 1
overlap.render('主要城市评论数_平均分.html')

# word cloud
tomato_str = ' '.join(tomato_com['comment'])
words_list = []
word_generator = jieba.cut_for_search(tomato_str)
for word in word_generator:
    words_list.append(word)
words_list = [k for k in words_list if len(k) > 1]

back_color = imread('tomato.jpg')  # read the mask image
stopwords = set(STOPWORDS)
stopwords.add('苟利国')  # set.add() returns None, so build the set first
wc = WordCloud(
    background_color='white',  # background color
    max_words=200,             # maximum number of words
    mask=back_color,           # draw the cloud in this shape; width and height are ignored when set
    max_font_size=300,         # largest font size
    stopwords=stopwords,       # the built-in stop words plus our addition
    font_path="C:/Windows/Fonts/STFANGSO.ttf",
    random_state=42,           # random seed for reproducible layout and colors
    # width=1000,              # image width
    # height=860               # image height
)
tomato_count = Counter(words_list)
wc.generate_from_frequencies(tomato_count)

# derive colors from the source image
image_colors = ImageColorGenerator(back_color)

# draw the word cloud
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis('off')
wc.to_file(path.join(d, "词云.png"))
from re import match
from wordcloud import WordCloud, STOPWORDS

data = pd.read_csv("C:/Users/user/Desktop/2020_text_mining/jobkorea_data.csv")

komoran = Komoran()
%time komoran_nouns = komoran.nouns(''.join(str(data['답변'].fillna(''))))
komoran_nouns[-10:]

DBA = data.loc[data['직무분야'] == "ERP·시스템분석·설계", "답변"]
nouns = komoran.nouns(''.join(str(DBA.fillna(''))))
nouns = [n for n in nouns if len(n) > 1]
nouns = [n for n in nouns if not (match('^[0-9]', n))]
count = Counter(nouns)
top = count.most_common(40)

# remove stop words
stopwords = set(STOPWORDS)
stopwords.add('제가')

wordcloud = WordCloud(font_path='C:/Users/user/Desktop/2020_text_mining/NanumGothic.ttf',
                      background_color='white', width=800, height=600,
                      stopwords=stopwords)
cloud = wordcloud.generate_from_frequencies(dict(top))

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud)
plt.tight_layout(pad=0)
plt.axis('off')
plt.show()
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],  # picks up the loop index below
                  prefer_horizontal=1.0)

topics = optimal_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

# Here we also visualized the first 4 topics in our document along with the top 10
# keywords. Each keyword's corresponding weight is shown by the size of the text.
#
# Based on the visualization, we see the following topics:
# - Topic 0: Employer Quality
# - Topic 1: Management Quality
def apply_words(self, words, evaluation, options):
    'WordCloud[words_List, OptionsPattern[%(name)s]]'
    ignore_case = self.get_option(options, 'IgnoreCase', evaluation) == Symbol('True')

    freq = dict()
    for word in words.leaves:
        if not isinstance(word, String):
            return
        py_word = word.get_string_value()
        if ignore_case:
            key = py_word.lower()
        else:
            key = py_word
        record = freq.get(key, None)
        if record is None:
            freq[key] = [py_word, 1]
        else:
            record[1] += 1

    max_items = self.get_option(options, 'MaxItems', evaluation)
    if isinstance(max_items, Integer):
        py_max_items = max_items.get_int_value()
    else:
        py_max_items = 200

    image_size = self.get_option(options, 'ImageSize', evaluation)
    if image_size == Symbol('Automatic'):
        py_image_size = (800, 600)
    elif image_size.get_head_name() == 'System`List' and len(image_size.leaves) == 2:
        py_image_size = []
        for leaf in image_size.leaves:
            if not isinstance(leaf, Integer):
                return
            py_image_size.append(leaf.get_int_value())
    elif isinstance(image_size, Integer):
        size = image_size.get_int_value()
        py_image_size = (size, size)
    else:
        return

    # inspired by http://minimaxir.com/2016/05/wordclouds/
    import random
    import os

    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        return self.default_colors[random.randint(0, 7)]

    font_base_path = os.path.dirname(os.path.abspath(__file__)) + '/../fonts/'
    font_path = os.path.realpath(font_base_path + 'AmaticSC-Bold.ttf')
    if not os.path.exists(font_path):
        font_path = None

    from wordcloud import WordCloud
    wc = WordCloud(width=py_image_size[0], height=py_image_size[1],
                   font_path=font_path, max_font_size=300, mode='RGB',
                   background_color='white', max_words=py_max_items,
                   color_func=color_func, random_state=42, stopwords=set())
    # freq maps key -> [original word, count]; wordcloud expects word -> count
    wc.generate_from_frequencies({word: count for word, count in freq.values()})

    image = wc.to_image()
    return Image(numpy.array(image), 'RGB')
def word_count(email, keyword, savedDate, optionList, analysisName):
    # fetch the preprocessing result from mongo
    doc = getPreprocessing(email, keyword, savedDate, optionList)[0]
    nTokens = getPreprocessing(email, keyword, savedDate, optionList)[1]
    doc = sum(doc, [])  # flatten the nested list
    # print(doc, nTokens)

    vectorizer = CountVectorizer(analyzer='word', max_features=int(optionList), tokenizer=None)
    words = vectorizer.fit(doc)
    words_fit = vectorizer.fit_transform(doc)
    word_list = vectorizer.get_feature_names()  # = sorted(vectorizer.vocabulary_)
    # print("vectorizer vocabulary:", word_list, '\nfrequencies:', words_fit.toarray().sum(axis=0))
    count_list = words_fit.toarray().sum(axis=0)

    df = pd.DataFrame()
    df["words"] = word_list
    df["count"] = count_list
    count_list = list([int(x) for x in count_list])
    df = df.sort_values(by=['count'], axis=0, ascending=False)

    # dict_words = dict(zip(word_list, count_list))
    dict_words = df.set_index('words').T.to_dict('records')  # type: list
    dict_words = dict_words[0]
    print("word frequency result\n", df, '\n', dict_words)

    ## save as a CSV file
    # with open('wc_csvfile.csv', 'w') as f:
    #     w = csv.writer(f)
    #     for k, v in dict_words.items():
    #         w.writerow([k, v])

    ## draw the bar chart
    FONT_PATH = 'TextMining/NanumBarunGothic.ttf'
    fontprop = fm.FontProperties(fname=FONT_PATH, size=8)
    plt.figure(figsize=(20, 5))
    plt.bar(word_list, count_list)
    plt.xticks(rotation=40, ha='right', fontproperties=fontprop)
    plt.savefig('wc_barchart.jpg')

    ## word cloud visualization
    wordcloud = WordCloud(
        font_path=FONT_PATH,
        width=1500,
        height=1000,
        background_color="white",
    )
    wordcloud = wordcloud.generate_from_frequencies(dict_words)
    # plt.savefig('wordcould.png', bbox_inches='tight')
    print("word-frequency wordcloud output file was created.")
    wordcloud.to_file('wc_wordcloud.jpg')

    # print the file sizes to check that the binaries stored in Mongo match
    # the bar chart and word cloud image files
    from os.path import getsize
    BarFile = 'wc_barchart.jpg'
    WcFile = 'wc_wordcloud.jpg'
    bar_file_size = getsize(BarFile)  # wc_barchart.jpg: 95129
    wc_file_size = getsize(WcFile)    # wc_wordcloud.jpg: 223997
    print('File Name: %s \tFile Size: %d' % (BarFile, bar_file_size))
    print('File Name: %s \tFile Size: %d' % (WcFile, wc_file_size))

    ### store in Mongo ###
    client = MongoClient(host='localhost', port=27017)
    # print('connected to MongoDB.')
    db = client.textMining
    nTokens = optionList
    now = datetime.datetime.now()
    # print("time: ", now, '\n', now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"))  # format

    ## store the bar chart image in Mongo as binary
    print("\nstoring the frequency analysis result in MongoDB as a bar chart.")
    fs = gridfs.GridFS(db, 'count')  # creates count.files and count.chunks
    with open(BarFile, 'rb') as f:
        contents = f.read()
        fs.put(contents, filename='wc_bar')

    ## store the word cloud image binary in the count.files & count.chunks collections
    print("storing the frequency analysis result in MongoDB as a word cloud.\n")
    with open(WcFile, 'rb') as f:
        contents = f.read()
        fs.put(contents, filename='wc_wordcloud')

    barBinary = getBinaryImage(bar_file_size, analysisName)
    wcBinary = getBinaryImage(wc_file_size, analysisName)

    doc = {
        "userEmail": email,
        "keyword": keyword,
        "savedDate": savedDate,
        "analysisDate": datetime.datetime.now(),
        # "duration": ,
        "nTokens": nTokens,
        "resultJson": json.dumps(dict_words, ensure_ascii=False),
        "resultBar": barBinary,
        "resultWC": wcBinary,
        # "resultCSV": ,
    }
    db.count.insert_one(doc)
    print("saved to MongoDB.")
    return dict_words

# word_count('*****@*****.**', '북한', "2021-07-08T11:46:03.973Z", 100, 'count')
# 5. preprocessing + word count: drop one-syllable words and numbers
nouns_count = {}

# count the words
for noun in nouns_word:
    if len(noun) > 1 and not (match('^[0-9]', noun)):
        # key[noun] = value[occurrence count]
        nouns_count[noun] = nouns_count.get(noun, 0) + 1

nouns_count
len(nouns_count)  # 19143

# 6. WordCloud
# 6-1) top 20 words
word_count = Counter(nouns_count)  # dict
top20_word = word_count.most_common(20)
top20_word

# 6-2) wordcloud
wc = WordCloud(font_path='C:/Windows/Fonts/malgun.ttf',
               width=800, height=600,
               max_words=100, max_font_size=200,
               background_color='white')

wc_result = wc.generate_from_frequencies(dict(top20_word))
# wc_result  # <wordcloud.wordcloud.WordCloud at 0x1e5dc3f5a48>

plt.figure(figsize=(12, 8))
plt.imshow(wc_result)
plt.axis('off')  # remove the x/y axis frame
plt.show()
stopwords = pd.read_csv('ChineseStopwords.txt', index_col=False,
                        quoting=3, sep="\t")
words_df = words_df[~words_df.segment.isin(stopwords)]  # drop stop words

# count word frequencies
words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": np.size})  # aggregate
words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)  # sort by frequency, descending
# a = words_stat.head()
# print(a)

# initialize a word cloud
wordcloud = WordCloud(
    font_path='./font/simhei.ttf',
    background_color='white',
    max_font_size=230,
    mask=graph,
)

# take the top 1000 high-frequency words as a frequency dict
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
wordcloud = wordcloud.generate_from_frequencies(word_frequence)
wordcloud.to_file(outpath)  # save the image

plt.imshow(wordcloud)
plt.axis("off")
plt.show()
lda = LatentDirichletAllocation(n_components=total_topics, max_iter=15,
                                learning_method='online', learning_offset=15,
                                random_state=1234)
ldaTransform = lda.fit_transform(td)

# number of terms we want per topic
terms_count = 25

# loop over the LDA components to get each topic and its highest-probability terms
for idx, topic in enumerate(lda.components_):
    print('Topic# ', idx + 1)
    abs_topic = abs(topic)
    topic_terms_sorted = [[terms[i], topic[i]]
                          for i in abs_topic.argsort()[:-terms_count - 1:-1]]
    topic_words = []
    for i in range(terms_count):
        topic_words.append(topic_terms_sorted[i][0])
    print(','.join(word for word in topic_words))
    print("")

    dict_word_frequency = {}
    for i in range(terms_count):
        dict_word_frequency[topic_terms_sorted[i][0]] = topic_terms_sorted[i][1]

    wcloud = WordCloud(background_color="white", mask=None, max_words=100,
                       max_font_size=60, min_font_size=10, prefer_horizontal=0.9,
                       contour_width=3, contour_color='black')
    wcloud.generate_from_frequencies(dict_word_frequency)
    plt.imshow(wcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig("Topic#" + str(idx + 1), format="png")