def make_worldcloud(file_path):
    """Generate a word cloud from a UTF-8 text file, colored by a background image.

    Reads the file, segments the Chinese text with jieba, renders the cloud over
    the background image, shows it, and saves h11.jpg next to this module.
    """
    # Fix: use a context manager so the file handle is always closed.
    with open(file_path, 'r', encoding='UTF-8') as f:
        text_from_file_with_apath = f.read()
    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
    wl_space_split = " ".join(wordlist_after_jieba)
    print(wl_space_split)
    backgroud_Image = plt.imread('D:/下载/123.jpg')  # background image path
    print('加载图片成功')
    # Word-cloud style settings.
    stopwords = STOPWORDS.copy()
    stopwords.add("如果")  # blocked words; more can be added
    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',        # background color
        mask=backgroud_Image,
        font_path='D:/下载/simsun.ttc',  # CJK font file; required for Chinese glyphs
        stopwords=stopwords,             # BUG FIX: the set was built but never passed in
        max_font_size=400,
        random_state=50,
    )
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # glyph colors taken from the background image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    # Save next to this module.
    d = path.dirname(__file__)
    wc.to_file(path.join(d, "h11.jpg"))
    print('生成云词成功')
def parse_comment():
    """Word cloud of the 3rd comma-separated field of `file_name`.

    NOTE(review): `file_name` is a module-level name not defined here — confirm
    it is set before calling.
    """
    comments = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    try:
        for line in lines:
            comment = line.split(',')[2]
            if comment:
                comments.append(comment)
    except Exception as e:  # a malformed row aborts the scan (kept from original)
        print(e)
    comment_after_split = jieba.cut(str(comments), cut_all=False)
    # BUG FIX: join with a space — ''.join re-glued the jieba tokens, so
    # WordCloud could not see word boundaries in the Chinese text.
    words = ' '.join(comment_after_split)
    # filter out uninformative filler words
    stopwords = STOPWORDS.copy()
    for w in ('电影', '一部', '一个', '没有', '什么', '有点', '感觉', '毒液',
              '就是', '觉得'):
        stopwords.add(w)
    bg_image = plt.imread('venmo1.jpg')
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=bg_image, font_path='STKAITI.TTF', stopwords=stopwords,
                   max_font_size=400, random_state=50)
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def create_word_cloud_b64(word_list):
    """Render `word_list` as a word cloud and return it base64-encoded as JPEG.

    Returns False on any failure (logged). NOTE(review): uses `cStringIO`, so
    this is Python-2 code; under Python 3 it would need io.BytesIO.
    """
    random.shuffle(word_list)  # NOTE: mutates the caller's list in place
    words = " ".join(word_list)
    # Extend the builtin stopword set with web/URL noise terms.
    stopwords = STOPWORDS.copy()
    stopwords.add("http")
    stopwords.add("https")
    stopwords.add("localstorage")
    stopwords.add("com_0")
    stopwords.add("new")
    stopwords.add("http_www")
    stopwords.add("sourceid")
    try:
        # regexp keeps words containing apostrophes, @ and dots as single tokens
        wordcloud = WordCloud(stopwords=stopwords, max_words=150,
                              background_color="white", height=700, width=1200,
                              margin=0, regexp=r"\w[\w'@\.]+").generate(words)
        plt.axis("off")
        plt.imshow(
            wordcloud)  # this must be here, or the wc image is blank
        # Capture the matplotlib figure into an in-memory JPEG buffer.
        wordcloud_bytes = cStringIO.StringIO()
        plt.savefig(wordcloud_bytes, bbox_inches='tight', format='jpg')
        wordcloud_bytes.seek(0)
        wordcloud_b64 = base64.b64encode(wordcloud_bytes.read())
        return wordcloud_b64
    except Exception as e:
        logging.error("Problem making word cloud: {}".format(e))
        return False
def word_cloud(csv_file, stopwords_path, pic_path):
    """Word cloud of the `content` column of <csv_file>.csv in the current dir.

    `csv_file` is the base name (no extension); the output image is named
    <csv_file>_词云图.png. `stopwords_path` is a newline-separated stopword file
    and `pic_path` the mask/color image.
    """
    pic_name = csv_file + "_词云图.png"
    # Fix: build the CSV path with os.path.join instead of manual backslash
    # concatenation plus a pointless '\\' -> '\\\\' doubling pass; this also
    # stops shadowing the module-level `path` name.
    csv_path = os.path.join(os.path.abspath(os.curdir), csv_file + ".csv")
    d = pd.read_csv(csv_path, engine='python', encoding='utf-8')
    content = []
    for i in d['content']:
        try:
            i = translate(i)
        except AttributeError:
            continue  # skip rows whose content is not translatable (e.g. NaN)
        else:
            content.append(i)
    comment_after_split = jieba.cut(str(content), cut_all=False)
    wl_space_split = " ".join(comment_after_split)
    backgroud_Image = plt.imread(pic_path)
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for i in f.readlines():
            stopwords.add(i.strip('\n'))
    # (redundant f.close() removed — the with-block closes the file)
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image, font_path="C:\simhei.ttf",
                   stopwords=stopwords, max_font_size=400, random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # glyph colors from the mask image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)
def generate_word_cloud(data):
    """Render `data` (signatures) as a word cloud over love.jpg and save it."""
    # jieba segmentation, joined with spaces so WordCloud sees token boundaries.
    tokens = jieba.cut(str(data))
    words = ' '.join(tokens)
    print('签名:', words)
    # Suppress markup/emoji artefacts.
    stopwords = STOPWORDS.copy()
    for noise in ("span", "class", "emoji", "emoji2764"):
        stopwords.add(noise)
    # Background/mask image.
    bg_img = plt.imread('love.jpg')
    # Word-cloud parameters.
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=bg_img, stopwords=stopwords, max_font_size=400,
                   random_state=50, font_path='STKAITI.TTF')
    # Feed the segmented text into the cloud.
    wc.generate_from_text(words)
    # Draw, then persist.
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file('词云图.jpg')
def wordCl(strs):
    """Segment `strs` with jieba and display it as a word cloud over BACKGROUNDIMG."""
    stopwords = STOPWORDS.copy()
    # Fix: the original wrapped the joined string in a one-element list and
    # joined it again — a no-op round trip; segment and join once. The long
    # block of commented-out stopword additions was dead code and is removed.
    new_text = " ".join(jieba.cut(strs))
    imagename = path.join(path.dirname(__file__), BACKGROUNDIMG)  # mask image path
    coloring = imread(imagename)  # mask image pixels
    wordcloud = WordCloud(stopwords=stopwords, min_font_size=10, mask=coloring,
                          font_path="msyh.ttf", scale=24,
                          background_color='white').generate(new_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def create_wordcloud(self, text):
    """生成词云 — render `text` as a word cloud recolored from IMAGE_PATH and save it."""
    # Pixel data of the mask image (also the color source).
    mask_pixels = np.array(Image.open(IMAGE_PATH))
    # Stopword set: builtin list plus a few extra terms.
    stopwords = STOPWORDS.copy()
    stopwords.update(["其次", "不錯看", "其实"])
    cloud = WordCloud(
        font_path=TTF_PATH,  # font path (required for Chinese glyphs)
        width=500,
        height=400,
        stopwords=stopwords,
        max_font_size=100,
        random_state=30,
        min_font_size=10,
        background_color="white").generate(text.replace("\n", ""))
    # Recolor glyphs using the mask image's palette.
    cloud.recolor(color_func=ImageColorGenerator(mask_pixels))
    # Render to an image and persist.
    cloud.to_image().save(SAVE_IMAGE_PATH)
    print("词云图【成功】")
def main(save_files=False, db_filename='../output/database.sqlite'):
    """Build and show a word cloud of paper titles from the SQLite database.

    When `save_files` is true, also writes the PNG to ../files/title_cloud.png.
    """
    # Pull every paper row; column 1 holds the title text.
    conn = sqlite3.connect(db_filename)
    cursor = conn.cursor()
    cursor.execute('''SELECT * FROM Papers''')
    paper_content = cursor.fetchall()
    conn.close()
    titles = ''.join(row[1] for row in paper_content)
    # A Marvin Minsky mask
    mask = np.array(Image.open("../files/minsky_mask.png"))
    wc = WordCloud(background_color="white", max_words=2000, mask=mask,
                   stopwords=STOPWORDS.copy())
    # Generate word cloud
    wc.generate(titles)
    if save_files:
        wc.to_file("../files/title_cloud.png")
    # Show word cloud
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
def generate_wordcloud():
    """Word cloud of the 3rd comma-separated field of data/comments.txt over bg.jpg."""
    comments = []
    with open('data/comments.txt', 'r', encoding='utf-8') as f:
        rows = f.readlines()
    try:
        for row in rows:
            field = row.split(',')[2]
            if field != '':
                comments.append(field)
    except Exception as e:  # a malformed row aborts the scan, as in the original
        print(e)
    # Segment and space-join so WordCloud sees token boundaries.
    words = ' '.join(jieba.cut(str(comments), cut_all=False))
    # Filler words to suppress.
    stopwords = STOPWORDS.copy()
    for w in ('电影', '一部', '一个', '没有', '什么', '有点', '感觉', '毒液',
              '就是', '觉得'):
        stopwords.add(w)
    bg_image = plt.imread('bg.jpg')
    wc = WordCloud(background_color='lightblue', mask=bg_image,
                   font_path='STKAITI.TTF', stopwords=stopwords,
                   max_font_size=400, random_state=50)
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def hope():
    """Word cloud of the movie script in input.txt, shaped by the stormtrooper mask.

    Shows both a grey-recolored and the default-colored rendering, and saves
    a_new_hope.png next to this module.
    """
    d = path.dirname(__file__)
    # read the mask image
    # taken from
    # http://www.stencilry.org/stencils/movies/star%20wars/storm-trooper.gif
    mask = imread(path.join(d, "stormtrooper_mask.png"))
    # Fix: close the script file deterministically instead of leaking the handle.
    with open(path.join(d, 'input.txt')) as f:
        text = f.read()
    # adding movie script specific stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add("int")
    stopwords.add("ext")
    wc = WordCloud(max_words=1000, mask=mask, stopwords=stopwords, margin=10,
                   random_state=1).generate(text)
    # store default colored image
    default_colors = wc.to_array()
    plt.title("Custom colors")
    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3))
    wc.to_file("a_new_hope.png")
    plt.axis("off")
    plt.figure()
    plt.title("Default colors")
    plt.imshow(default_colors)
    plt.axis("off")
    plt.show()
def WorldCloud_pic(text_path, pic_path, font_path):
    """Word cloud of `text_path` shaped/colored by `pic_path`, drawn with `font_path`.

    Shows the figure and saves 词云.jpg next to this module.
    """
    # Fix: use a context manager so the file handle is always closed.
    with open(text_path, 'r', encoding='UTF-8') as f:
        text = f.read()
    word_list = jieba.cut(text, cut_all=False)
    wl_space_split = " ".join(word_list)
    print(wl_space_split)
    backgroud_Image = plt.imread(pic_path)
    print('加载图片成功!')
    stopwords = STOPWORDS.copy()  # start from the library's builtin stopword set
    stopwords.add("哈哈")  # extra blocked words can be added here
    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',  # background color
        mask=backgroud_Image,      # mask image
        font_path=font_path,       # CJK font; required so Chinese renders as glyphs, not boxes
        max_words=600,             # cap on rendered words
        stopwords=stopwords,
        max_font_size=400,         # largest glyph size
        random_state=50,           # number of random color schemes
    )
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # glyph colors from the background image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    d = path.dirname(__file__)  # directory of this module
    wc.to_file(path.join(d, "词云.jpg"))
    print('生成词云成功!')
def SetStopWords(self):
    """停词设置 — return the builtin STOPWORDS plus the word '电影'."""
    blocked = STOPWORDS.copy()
    blocked.add("电影")
    return blocked
def get_word_cloud(comments):
    """Render `comments` as a word cloud over circle.png; show and save 词云图.jpg."""
    # Segment (precise mode) and space-join for WordCloud.
    words = ' '.join(jieba.cut(str(comments), cut_all=False))
    # Suppress the film's own title and filler words.
    stopwords = STOPWORDS.copy()
    for w in ('哪吒', '电影', '我命', '不由'):
        stopwords.add(w)
    bg_img = plt.imread('circle.png')
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=bg_img, stopwords=stopwords, max_font_size=200,
                   random_state=50, font_path='STKAITI.TTF')
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file('词云图.jpg')
def wcplot(text, n):
    """Plot a word cloud of `text` using mask image #n from the data2 directory.

    NOTE(review): relies on module-level `img_dict`, `color_func`, `font_path`
    and `input_movie` — confirm they are defined before calling.
    """
    data_dir = 'data2'
    # Resolve the mask image chosen by index n.
    mask_path = os.path.join(os.getcwd(), data_dir, img_dict[n])
    mask = imread(mask_path, mode='L')  # greyscale read
    # Script-specific stopwords (scene markers).
    stopwords = STOPWORDS.copy()
    stopwords.add("int")
    stopwords.add("ext")
    wc = WordCloud(background_color="white", max_words=200, mask=mask,
                   stopwords=stopwords, random_state=3,
                   font_path=font_path).generate(text)
    wc.recolor(color_func=color_func, random_state=3)
    plt.figure(figsize=(10, 10), facecolor='k')
    plt.imshow(wc)
    # Output name derived from the movie title, slashes made filename-safe.
    pic_name = '{}.jpg'.format(input_movie.replace('/', '_'))
    wc.to_file(pic_name)
    plt.axis("off")
    plt.show()
def word_cloud(csv_file, stopwords_path, pic_path):
    """Word cloud of the job_desc column of `csv_file`, shaped/colored by `pic_path`.

    The output image replaces the CSV's extension with _词云图.png; stopwords are
    read one-per-line from `stopwords_path`.
    """
    pic_name = csv_file[:-4] + "_词云图.png"
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    content = d['job_desc'].values
    comment_after_split = jieba.cut(str(content), cut_all=False)
    wl_space_split = " ".join(comment_after_split)
    background_image = plt.imread(pic_path)
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            stopwords.add(line.strip('\n'))
    # Fix: removed the redundant f.close() — the with-block closes the file.
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=background_image, font_path="simhei.ttf",
                   stopwords=stopwords, max_font_size=400, random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(background_image)
    wc.recolor(color_func=img_colors)  # glyph colors from the mask image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)
def make_cloud(words, image, size=10, filename='figures/cloud.png',
               max_words=200, horizontal=0.8):
    """Word cloud of tweet text `words`, shaped and colored by the `image` logo.

    Saves the figure (size x size inches) to `filename`.
    """
    # Remove URLs, 'RT' text, screen names, etc
    my_stopwords = ['RT', 'amp', 'lt']
    cleaned = ' '.join(w for w in words.split() if w not in my_stopwords)
    # Same noise terms again as WordCloud stopwords.
    stopwords = STOPWORDS.copy()
    for w in ('RT', 'amp', 'lt'):
        stopwords.add(w)
    # Load up a logo as a mask & color image
    logo = imread(image)
    # Generate colors
    image_colors = ImageColorGenerator(logo)
    # Generate plot
    wc = WordCloud(stopwords=stopwords, mask=logo, color_func=image_colors,
                   scale=0.8, max_words=max_words, background_color='white',
                   random_state=42, prefer_horizontal=horizontal)
    wc.generate(cleaned)
    plt.figure(figsize=(size, size))
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(filename)
def signatures_cloud():
    """Word cloud of the 6th comma-separated field (signature) of wechatfriends.txt.

    Saves the image as 个性签名云图.jpg; nothing is displayed on screen.
    """
    signatures = []
    with open('wechatfriends.txt', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
    # Fix: dropped the redundant f.close() (the with-block closes the file)
    # and a duplicated stopwords.add('span') below.
    for row in rows:
        signature = row.split(',')[5]
        if signature != '':
            signatures.append(signature)
    words = ' '.join(jieba.cut(str(signatures), cut_all=False))
    # Suppress markup/emoji artefacts left in the signatures.
    stopwords = STOPWORDS.copy()
    for w in ('span', 'class', 'emoji', 'emoji1f334', 'emoji1f388',
              'emoji1f33a', 'emoji1f33c', 'emoji1f633'):
        stopwords.add(w)
    bg_image = plt.imread('moon.jpeg')
    wc = WordCloud(width=1000, height=1000, background_color='white',
                   mask=bg_image, font_path='simhei.ttf', stopwords=stopwords,
                   max_font_size=400, random_state=50)
    wc.generate_from_text(words)
    plt.axis('off')
    wc.to_file('个性签名云图.jpg')
def gen_tag_cloud(frequencies, filename):
    """生成标签云 — render `frequencies` (word -> weight) twice and save both PNGs.

    One image keeps the default colors, the other is recolored from view.jpg.
    """
    base_dir = path.dirname(__file__)
    mask = np.array(Image.open(path.join(base_dir, "view.jpg")))
    stopwords = STOPWORDS.copy()
    wc = WordCloud(background_color="white", max_words=2000, mask=mask,
                   stopwords=stopwords, margin=10, random_state=42,
                   font_path="msyh.ttf", width=1280,
                   height=1024).fit_words(frequencies)
    image_colors = ImageColorGenerator(mask)
    # Default-colored rendering.
    plt.imshow(wc)
    plt.axis("off")
    plt.figure()
    wc.to_file(filename + "_tag_cloud_default.png")
    # Mask-colored rendering.
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    plt.figure()
    wc.to_file(filename + "_tag_cloud_colored.png")
def generate_wordcloud():
    """Word cloud from the first ':'-field of maoyan.csv, saved to output/词云图.jpg."""
    comments = []
    with open('maoyan.csv', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
    for row in rows:
        comment = row.split(':')[0]
        if comment != '':
            comments.append(comment)
    comment_after_split = jieba.cut(str(comments), cut_all=False)
    # BUG FIX: join with a space — ''.join re-glued the jieba tokens, so
    # WordCloud could not see word boundaries in the Chinese text.
    words = ' '.join(comment_after_split)
    print(words)
    # Suppress the film's own title and filler words.
    stopwords = STOPWORDS.copy()
    stopwords.add('电影')
    stopwords.add('一出')
    stopwords.add('好戏')
    stopwords.add('有点')
    bg_image = plt.imread('123.jpg')
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=bg_image, stopwords=stopwords, max_font_size=400,
                   random_state=50, font_path='STKAITI.TTF')
    wc.generate_from_text(words)
    wc.to_file('output/词云图.jpg')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def make_worldcloud(file_path):
    """Word cloud of a UTF-8 text file over ./douban.jpg, recolored from the image.

    Shows the figure and saves h11.jpg next to this module.
    """
    # Fix: use a context manager so the file handle is always closed.
    with open(file_path, 'r', encoding='UTF-8') as f:
        text_from_file_with_apath = f.read()
    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
    wl_space_split = " ".join(wordlist_after_jieba)
    print(wl_space_split)
    backgroud_Image = plt.imread('./douban.jpg')
    print('加载图片成功!')
    # Word-cloud style settings.
    stopwords = STOPWORDS.copy()
    stopwords.add("哈哈")
    stopwords.add("还是")  # more blocked words can be added
    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',   # background color
        mask=backgroud_Image,       # mask image
        font_path='E:\simsun.ttf',  # CJK font; required so Chinese renders as glyphs, not boxes
        max_words=300,              # cap on rendered words
        stopwords=stopwords,
        max_font_size=400,          # largest glyph size
        random_state=50,            # number of random color schemes
    )
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # glyph colors from the background image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    d = path.dirname(__file__)  # directory of this module
    wc.to_file(path.join(d, "h11.jpg"))
    print('生成词云成功!')
def ciyun(filepath):
    """Word cloud from field 5 of ';'-separated rows in `filepath`, over 1.jpg.

    Saves yanxi_wordcloud.jpg when the input file is yanxi.txt.
    """
    comment = []
    with open(filepath, 'r') as f:
        for row in f.readlines():
            fields = row.split(';')
            if len(fields) == 5:  # keep only well-formed 5-field rows
                comment.append(fields[4].replace('\n', ''))
    comment2 = json.dumps(comment, ensure_ascii=False)  # keep Chinese readable
    print("comment2", comment2)
    # jieba segmentation, space-joined for WordCloud.
    wl_space_split = " ".join(jieba.cut(str(comment2), cut_all=False))
    print("wl_space_split", wl_space_split)
    # Background/mask image.
    backgroud_Image = plt.imread('1.jpg')
    # Blocked filler words; extend as needed.
    stopwords = STOPWORDS.copy()
    for w in ('剧情', '一部', '一个', '没有', '什么', '有点', '这部', '这个',
              '不是', '真的', '感觉', '觉得', '还是', '女主', '皇后', '贵妃',
              '于妈', '就是', '可以'):
        stopwords.add(w)
    # Word-cloud parameters: font, background color, max glyph size, mask shape.
    wc = WordCloud(background_color='white', mask=backgroud_Image,
                   font_path='DroidSansFallback.ttf', stopwords=stopwords,
                   max_font_size=400, random_state=50)
    wc.generate_from_text(wl_space_split)
    wc.recolor(color_func=ImageColorGenerator(backgroud_Image))
    # Save locally, named after the input file.
    if filepath == 'yanxi.txt':
        wc.to_file('yanxi_wordcloud.jpg')
def ciyun(data):
    """Word cloud of data['comments'] over shenteng.jpg, saved to shenteng_wordcloud.jpg."""
    comment = jieba.cut(str(data['comments']), cut_all=False)  # segmentation
    wl_space_split = " ".join(comment)
    backgroud_Image = plt.imread('shenteng.jpg')  # background/mask image
    stopwords = STOPWORDS.copy()
    # Fix: removed the leftover debug statement that printed
    # help(STOPWORDS.copy()) — it dumped the full pydoc page on every call.
    # Blocked filler words; a downloadable stopword table could replace these.
    for w in ('电影', '一部', '一个', '没有', '什么', '有点', '这部', '这个',
              '不是', '真的', '感觉', '觉得', '还是', '特别', '非常', '可以',
              '因为', '为了', '比较'):
        stopwords.add(w)
    # Word-cloud parameters: font, background color, max glyph size, mask shape.
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image,
                   font_path='C:\Windows\Fonts\simhei.ttf',
                   stopwords=stopwords, max_font_size=400, random_state=50)
    print(wl_space_split)
    wc.generate_from_text(str(wl_space_split))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(r'shenteng_wordcloud.jpg')
def makeWordCloud(text):
    """Render `text` as a 1600x800 word cloud and display it."""
    stopwords = STOPWORDS.copy()
    cloud = WordCloud(max_words=2000, stopwords=stopwords, margin=5,
                      random_state=1, width=1600,
                      height=800).generate(text)
    plt.imshow(cloud)
    plt.show()
def cloud(self):
    """Fetch comments via self.getComments(), segment them, show and save a word cloud."""
    comments = self.getComments()
    print(str(comments))
    # Precise-mode segmentation (cut_all=False), space-joined for WordCloud.
    words = " ".join(jieba.cut(str(comments), cut_all=False))
    print(type(words))
    print(''.join(words))
    # Blocked filler words.
    stopwords = STOPWORDS.copy()
    for w in ('电影', '一部', '一个', '没有', '什么', '有点', '这部', '这个',
              '不是', '真的', '感觉', '觉得', '还是', '但是', '就是', '他们',
              '可能', '应该', '怎么', '大家'):
        stopwords.add(w)
    # Background/mask image.
    bg_image = plt.imread('g.jpg')
    # font = '/System/Library/Assets/com_apple_MobileAsset_Font5/6bb29eea6a5b99f3100a5e3f862e6457103557de.asset/AssetData/Hannotate.ttc'
    font = '/System/Library/Assets/com_apple_MobileAsset_Font5/4cecce0dd640f147de4d0e4155a97d3cdf47971e.asset/AssetData/Xingkai.ttc'
    # Canvas size, background color, mask shape, font, stopwords, max glyph size.
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=bg_image, font_path=font, stopwords=stopwords,
                   max_font_size=400, random_state=50)
    # Feed the segmented text into the cloud.
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')  # hide the axes
    plt.show()
    # Persist locally.
    wc.to_file('myecharts/词云图.jpg')
def draw_cloud(comments):
    """Word cloud of the non-null entries of comments['content'].

    Saves ./WordCloud.png and shows the figure. `comments` is expected to be a
    pandas DataFrame-like object with a 'content' column.
    """
    data = comments['content']
    print("由于数据量比较大,分词这里稍微有点慢,请耐心等候")
    # Idiom fix: `not pd.isnull(item)` instead of `== False`, and a
    # comprehension instead of a manual append loop.
    comment_data = [item for item in data if not pd.isnull(item)]
    words = ' '.join(jieba.cut(str(comment_data), cut_all=False))
    # Custom stopwords: franchise names and filler words.
    stopwords = STOPWORDS.copy()
    for w in ('复仇者联盟', '复联', '联盟', '复仇', '电影', '一部', '一个',
              '没有', '什么', '有点', '感觉', '就是', '觉得', '但是', '自己',
              '我们', '真的', '可以', '非常', '还是', '还有', '这部'):
        stopwords.add(w)
    # Adjust the font path to your own machine (e.g. 'simfang.ttf').
    fortpath = './SourceHanSansCN-Normal-2.otf'
    wc = WordCloud(width=1000, height=700, background_color='#000000',
                   font_path=fortpath, scale=5, stopwords=stopwords,
                   max_font_size=200)
    wc.generate_from_text(words)
    plt.figure(figsize=(10, 8))
    plt.imshow(wc)
    plt.axis('off')
    plt.savefig('./WordCloud.png')
    plt.show()
def ciyun(filepath):
    # Python 2 code (print statements): word cloud of field 4 of comma-separated
    # review rows, recolored from the 1.jpg background; the output file name
    # depends on which input file was processed.
    comment = []
    with open(filepath, 'r') as f:
        rows = f.readlines()
        for row in rows:
            # keep only well-formed 5-field rows; field 4 is the review text
            if len(row.split(',')) == 5:
                comment.append(row.split(',')[4].replace('\n', ''))
    comment2 = json.dumps(comment, ensure_ascii=False)  # keep Chinese readable
    print "comment2", comment2
    comment_after_split = jieba.cut(str(comment2), cut_all=False)
    wl_space_split = " ".join(comment_after_split)
    print "wl_space_split", wl_space_split
    # background/mask image
    backgroud_Image = plt.imread('1.jpg')
    stopwords = STOPWORDS.copy()
    # blocked filler words; extend as needed
    stopwords.add("电影")
    stopwords.add("一部")
    stopwords.add("一个")
    stopwords.add("没有")
    stopwords.add("什么")
    stopwords.add("有点")
    stopwords.add("这部")
    stopwords.add("这个")
    stopwords.add("不是")
    stopwords.add("真的")
    stopwords.add("感觉")
    stopwords.add("觉得")
    stopwords.add("还是")
    # word-cloud parameters:
    # font, background color, max glyph size, and the mask shape
    wc = WordCloud(width=1024, height=768, background_color='white', mask=backgroud_Image, font_path='DroidSansFallbackFull.ttf', stopwords=stopwords, max_font_size=400, random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)
    #plt.imshow(wc)
    #plt.axis('off')  # hide the axes (display disabled)
    #plt.show()
    # save locally, named after the input file
    if filepath == 'xie_zheng.txt':
        wc.to_file('xie_zheng_ciyun.jpg')
    elif filepath == 'yaoshen.txt':
        wc.to_file('yaoshen_ciyun.jpg')
def plotTwiiterWordCloud():
    # Python 2 script entry: reads a tweet trace (', '-separated, field 6 is the
    # tweet text) from sys.argv[2], counts term frequencies, and renders a word
    # cloud shaped by the csgo-icon mask.
    args = sys.argv
    tracefile = open(args[2], 'r')
    # count the lines for the tqdm progress bar, then rewind
    nLines = sum(1 for line in tracefile)
    tracefile.seek(0)
    dictTerms = dict()  # term -> occurrence count
    blacklist = STOPWORDS.copy()
    blacklist.add('rt')
    # strip punctuation, keeping @ and & (meaningful in tweets)
    punctuation = set(string.punctuation)
    punctuation.remove('@')
    punctuation.remove('&')
    # punctuation.remove('#')
    for line in tqdm(tracefile, total=nLines):
        try:
            linesplited = line.split(', ')
            tweet = linesplited[6].lower()
            for p in punctuation:
                tweet = tweet.replace(p, '')
            terms = tweet.split(' ')
            for t in terms:
                # skip 1-char tokens, anything URL-like, and blacklisted terms
                if (len(t) > 1) and 'http' not in t and (t not in blacklist):
                    try:
                        dictTerms[t] += 1
                    except KeyError:
                        dictTerms[t] = 1
        except IndexError:
            print 'IndexError'
    # defensive second pass: drop any blacklisted term that slipped through
    for t in blacklist:
        try:
            del dictTerms[t]
        except KeyError:
            continue
    # most frequent first; keep only terms appearing more than once
    popularTerms = sorted(dictTerms.keys(), key=lambda w: dictTerms[w], reverse=True)
    popularTerms = [p for p in popularTerms if (dictTerms[p]) > 1]
    print len(popularTerms)
    text = list()
    terms = ''
    # rebuild a pseudo-text where each term is repeated `count` times,
    # so WordCloud's frequency counting reproduces the tallies
    for p in popularTerms:
        text.append((p, dictTerms[p]))
        for i in range(dictTerms[p]):
            terms += ' ' + p
    # print terms
    maskfile = 'csgo-icon'
    mask = imread(maskfile + '.jpg')
    # mask=mask
    wc = WordCloud(mask=mask, background_color='white', width=1280, height=720).generate(terms)  # max_words=10000
    default_colors = wc.to_array()
    plt.figure()
    plt.imshow(default_colors)
    plt.axis('off')
    plt.savefig(maskfile + '-wordcloud.png', dpi=500, bbox_inches='tight', pad_inches=0)  # bbox_inches='tight'
    plt.show()
def makeCloud(text, imgFile, words):
    """ Makes a word cloud and stores it in a jpeg file """
    # Builtin stopwords plus the caller-supplied exclusions.
    excluded = STOPWORDS.copy()
    excluded.update(words)
    cloud = WordCloud(max_words=NUM_OF_WORDS, width=WIDTH, height=HEIGHT,
                      stopwords=excluded).generate(text)
    # Render, preview, and persist.
    picture = cloud.to_image()
    picture.show()
    picture.save(imgFile + '.jpeg')
def get_stop_words(self):
    """Return STOPWORDS plus the movie's name and the lines of stopword.txt.

    Fix: strip the trailing line terminator on every line — the original only
    trimmed lines containing '\r\n' and added plain '\n'-terminated lines
    unchanged, so those stopwords could never match a real word.
    """
    stopwords = STOPWORDS.copy()
    # Block the movie's own title.
    stopwords.add(self.movie.movie_name)
    with open('stopword.txt', 'r', encoding='utf_8_sig', newline='') as f:
        for line in f.readlines():
            stopwords.add(line.rstrip('\r\n'))
    return stopwords
def paint_word_cloud():
    """Word cloud from the 3rd ':'-field of ./dataSource.txt, saved to ./man.jpg."""
    comments = []
    with open('./dataSource.txt', 'r', encoding='utf-8') as f:
        rows = f.readlines()
    try:
        for row in rows:
            field = row.split(':')[2]
            if field != '':
                comments.append(field)
        print(comments)
    except Exception as e:  # a malformed row aborts the scan, as in the original
        print(e)
    # Segment and space-join so WordCloud sees token boundaries.
    words = ' '.join(jieba.cut(str(comments), cut_all=False))
    print(words)
    # Drop uninformative filler words.
    stopwords = STOPWORDS.copy()
    for w in ('电影', '一部', '一个', '没有', '什么', '有点', '感觉', '海王',
              '就是', '觉得', '但是'):
        stopwords.add(w)
    bg_image = plt.imread('./cloudBack.jpg')
    print('load image success')
    print(words)
    wc = WordCloud(
        # width/height are ignored when a mask is set
        background_color='white',
        mask=bg_image,  # cloud shape
        # CJK font path; required so Chinese renders as glyphs, not boxes
        font_path='/Users/lichuang.lc/Desktop/python/out/STZHONGS.TTF',
        stopwords=stopwords,
        max_font_size=400,  # defaults to the image height if omitted
        random_state=50)    # number of color schemes
    wc.generate_from_text(words)
    wc.to_file('./man.jpg')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def writeFreq(text, outFile, words):
    """ Writes frequencies of words into the specified file """
    # Builtin stopwords plus the caller-supplied exclusions.
    excluded = STOPWORDS.copy()
    excluded.update(words)
    cloud = WordCloud(max_words=NUM_OF_WORDS, stopwords=excluded)
    # NOTE(review): assumes process_text returns (word, count) pairs — true for
    # old wordcloud releases; newer versions return a dict. Confirm the pinned
    # library version.
    for word, count in cloud.process_text(text):
        outFile.write(word + ',' + str(count) + '\n')
def main():
    """Word cloud of CVPR 2015 paper titles, saved as a PNG and displayed."""
    d = os.path.dirname(__file__)
    DOC_NAME = "cvpr2015papers.txt"
    # Fix: close the file deterministically instead of leaking the handle.
    with open(os.path.join(d, DOC_NAME)) as f:
        text = f.read()
    # adding computer vision specific stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add("image")
    wc = WordCloud(max_words=300, stopwords=stopwords, width=800, height=400)
    wc.generate(text)
    wc.to_file(os.path.join(d, "cvpr2015wordcloud.png"))
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
def generate_word_cloud(text, mask_filename):
    """Render `text` over the given mask image and return the temp PNG's path."""
    base_dir = path.dirname(__file__)
    mask = imread(path.join(base_dir, mask_filename))
    # IRC/meeting-log specific stopwords.
    stopwords = STOPWORDS.copy()
    for noise in ("info", "meetbot", "supybot"):
        stopwords.add(noise)
    wc = WordCloud(max_words=1000, mask=mask, stopwords=stopwords,
                   margin=10, random_state=1).generate(text)
    # Write to a unique temporary file and hand its path back to the caller.
    _, tmpfilename = tempfile.mkstemp('-wordcloud.png')
    wc.to_file(tmpfilename)
    return tmpfilename
def get_wordcloud(self, filmname):
    """Build, show, and save a word cloud from the stored comments of `filmname`.

    Returns False when the movie is not found in the database.
    """
    movieid = MovieHelper().select(1, {"name": filmname})
    if movieid is None:  # idiom fix: `is None`, not `== None`
        return False
    datalist = CommentHelper().select(conditions={"movieid": movieid})
    commentlist = []
    for data in datalist:
        # BUG FIX: the original tested `!= "" or != ","`, which is always
        # true; both conditions must hold to skip empty/comma-only rows.
        if data[1] != "" and data[1] != ",":
            commentlist.append(data[1])
    comments_after_split = jieba.cut(str(commentlist), cut_all=False)
    wordlist = "".join(comments_after_split)
    # Blocked filler words.
    stopwords = STOPWORDS.copy()
    for w in ('电影', '一部', '一个', '没有', '什么', '有点', '这部', '这个',
              '不是', '真的', '感觉', '觉得', '还是', '但是', '就是'):
        stopwords.add(w)
    bg_image = np.array(Image.open('bg.jpg'))
    # Canvas size, background color, mask shape, font, stopwords, max glyph size.
    wc = WordCloud(width=2048, height=768, background_color='white',
                   mask=bg_image, font_path='STKAITI.TTF', stopwords=stopwords,
                   max_font_size=800, random_state=50)
    wc.generate_from_text(wordlist)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    wc.to_file("词云.jpg")
def show_fen_ci_qianmin():
    """Word cloud of WeChat friends' signatures (column 5 of friends_data)."""
    # jieba: Chinese word segmentation library
    import jieba
    # matplotlib: 2D plotting; used here only to read the background image
    import matplotlib.pyplot as plt
    # wordcloud: word-cloud renderer; if pip install fails, see
    # https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
    # python -m pip install wordcloud-1.5.0-cp37-cp37m-win32.whl
    from wordcloud import WordCloud, STOPWORDS

    # Collect every non-empty signature.
    signatures = []
    with open(friends_data, mode='r', encoding='utf-8') as f:
        for row in f.readlines():
            sig = row.split(',')[5]
            if sig != '':
                signatures.append(sig)
    # Precise-mode segmentation (cut_all=False), joined with spaces.
    words = ' '.join(jieba.cut(str(signatures), cut_all=False))
    # Suppress markup/emoji artefacts left in the signatures.
    stopwords = STOPWORDS.copy()
    for noise in ('span', 'class', 'emoji', 'emoji1f334', 'emoji1f388',
                  'emoji1f33a', 'emoji1f33c', 'emoji1f633'):
        stopwords.add(noise)
    # Background/mask image.
    bg_image = plt.imread(current_dir + '/010-wechat-bg.jpg')
    # Canvas size, background color, mask shape, font, stopwords, max glyph size.
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=bg_image, font_path='STKAITI.TTF', stopwords=stopwords,
                   max_font_size=400, random_state=50)
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')
    # Persist locally.
    wc.to_file(current_dir + '/个性签名词云图.jpg')
def to_wordcloud(text):
    """Render `text` as a word cloud, save it as maoyan_<code>.jpg, and show it.

    NOTE(review): `code` is a module-level name — confirm it is set before calling.
    """
    # Suppress the genre's own filler words.
    stopwords = STOPWORDS.copy()
    stopwords.update(['电影', '电影院', '影片', 'IMAX'])
    wc = WordCloud(width=1024, height=768, background_color='white',
                   font_path='simhei.ttf', stopwords=stopwords,
                   max_font_size=400, random_state=50)
    wc.generate_from_text(text)
    wc.to_file('maoyan_' + code + '.jpg')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def mk_wordcloud():
    """Word cloud of test.txt over the xpj.jpg mask, recolored from the image."""
    # Fix: use a context manager so the file handle is always closed.
    with open('test.txt', 'r', encoding='utf-8') as f:
        text_content = f.read()
    wordlist_space = ' '.join(jieba.cut(text_content, cut_all=False))
    background_image = plt.imread('xpj.jpg')
    print('加载图片')
    # Blocked filler words.
    stopwords = STOPWORDS.copy()
    for w in ("还是", "但是", "不是", "就是", "没有", "知道", "因为", "看到",
              "还有", "觉得", "有点", "这么", "其实", "一个", "为什么", "开始",
              "不要", "本来", "虽然", "出来"):
        stopwords.add(w)
    wc = WordCloud(
        width=750,
        height=1335,
        background_color='white',
        mask=background_image,
        font_path='C:\Windows\Fonts\simsun.ttc',
        max_words=400,       # cap on rendered words
        stopwords=stopwords,
        max_font_size=400,   # largest glyph size
        random_state=50,     # number of random color schemes
    )
    wc.generate_from_text(wordlist_space)
    img_color = ImageColorGenerator(background_image)  # background palette
    wc.recolor(color_func=img_color)  # glyph colors from the mask image
    plt.imshow(wc)
    plt.axis('off')  # hide the axes
    plt.show()
def generate_wordcloud(text):
    """Render `text` as a large word cloud with randomized HSL colors.

    Saves tmp.png and displays the figure.
    """
    def hsl_color(word, font_size, position, orientation, random_state=None,
                  **kwargs):
        """
        To change colors change the range for random ints below:
        Hue values are between 0 and 360
        Follows rainbow: Red Orange Yellow Green Blue Indigo Violet
                          0   50    100   150  200   250   300   360
        """
        return "hsl(%d, %d%%, %d%%)" % (
            random.randint(0, 150),   # hue
            500,                      # saturation
            random.randint(80, 120),  # lightness
        )

    # Extend the builtin stopwords with a few speech-transcript fillers.
    stopwords = STOPWORDS.copy()
    for w in ("us", "one", "will", "u"):
        stopwords.add(w)
    seed = random.randint(1, 100)
    wc = WordCloud(
        max_words=100, stopwords=stopwords, margin=10, random_state=seed,
        width=2000, height=1200
    ).generate(text)
    plt.figure(figsize=(32, 20), dpi=100)
    plt.imshow(wc.recolor(color_func=hsl_color, random_state=1))
    # Save image
    outfilename = "tmp.png"
    wc.to_file(outfilename)
    plt.axis("off")
    plt.show()
def makeWC(theText, mask_image, mw):
    """Return a WordCloud of `theText` shaped by `mask_image`, capped at `mw` words."""
    SW = STOPWORDS.copy()
    mywords = ['and', 'the', 'to', 'by', 'in', 'of', 'up', 'Facebook',
               'Twitter', 'Pinterest', 'Flickr', 'Google', 'Instagram',
               'login', 'Login', 'Log', 'website', 'Website', 'Contact',
               'contact', 'twitter', 'Branding', 'Tweet', 'pic', 'location',
               'Details'] + list(bad_words())
    # Idiom fix: don't build a throwaway list comprehension just for .add()
    # side effects — a set update does the same in one call.
    SW.update(mywords)
    wordcloud = WordCloud(
        relative_scaling=0,
        prefer_horizontal=random.uniform(0.5, 1),  # random layout bias per call
        stopwords=SW,
        background_color='black',
        max_words=mw,
        mask=mask_image
    ).generate(theText)
    return wordcloud
def main():
    """Aggregate survey responses from alldata.csv per stakeholder and per
    question, generate word clouds for each, and pickle the result.

    NOTE(review): Python 2 code -- str.translate(None, string.punctuation)
    only works on py2 byte strings; confirm before porting.
    """
    data = {}
    data['questions'] = []
    data['stakeholders'] = []
    data['wordclouds'] = {}
    stopwords = STOPWORDS.copy()
    # Per-question stopwords: words that merely echo the question itself.
    sphere_stopwords = {
        '1a': ['live', 'born', 'year', 'yrs', 'since'],
        '1b': ['live', 'born', 'year', 'yrs', 'moved', 'since'],
        '2a': ['time'],
        '2b': [],
        '3': [],
        '4a': ['bangalore', 'advantage', 'advantages'],
        '4b': ['bangalore', 'challenge', 'challenges'],
        '4c': ['work'],
        '4d': ['dependency', 'dependencies', 'external'],
        '4e': ['area', 'bangalore'],
        '4f': [],
        '5b': ['food', 'air'],
        '6a': ['end', 'user', 'enduser'],
        '6b': ['month', 'income', 'per', 'household'],
        '6c': ['interact', 'interaction', 'end', 'user', 'enduser'],
        '6d': ['design', 'end', 'user', 'enduser'],
        '7a': ['quality', 'control', 'challenge'],
        '7b': ['end', 'user', 'access', 'challenge'],
        '7c': [],
        '7d': [],
        # Original had a duplicated 'resource'; second entry was almost
        # certainly meant to be the plural (cf. the v2 script's '8a').
        '8a': ['tool', 'resource', 'resources'],
        '8b': ['average', 'age', 'team'],
        '8c': [],
        '8d': [],
        '8e': [],
        '9a': [],
        '9b': [],
        '10a': [],
        '10b': [],
        '10c': [],
        '10d': [],
        '10e': [],
        '10f': [],
        '11a': [],
        '11b': [],
        '11c': [],
        '12a': [],
        '12b': [],
        '12c': [],
        '13a': [],
        '13b': [],
        '13c': [],
        '13d': [],
        '13e': [],
        '13f': [],
        '13g': [],
        '13h': [],
        '13i': [],
        '14a': [],
        '14b': [],
    }
    # Filter out standalone words 2 letters or shorter.
    shortword = re.compile(r'\W*\b\w{1,2}\b')
    with open('alldata.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            stakeholder = row['Code']
            data['stakeholders'].append(stakeholder)
            data[stakeholder] = {}
            data[stakeholder]['alltext'] = ''
            for key in row:
                if key != 'Code':
                    question = key
                    if question not in data['questions']:
                        data['questions'].append(question)
                        data[question] = ''
                    # Lowercase, strip punctuation, drop short words,
                    # then stem each remaining token.
                    raw_response = shortword.sub(
                        '', row[key].lower().translate(None, string.punctuation))
                    stemmed_response = ' '.join(
                        [stem(word) for word in raw_response.split()])
                    data[stakeholder][question] = row[key]
                    # Accumulate stemmed text per stakeholder and per question.
                    data[stakeholder]['alltext'] += stemmed_response + ' '
                    data[question] += stemmed_response + ' '
    # Generate word clouds.
    for question in sorted(data['questions']):
        # Fixed: original used `is not '5a'` -- identity, not equality.
        if question != '5a':
            try:
                s = stopwords.union(set(sphere_stopwords[question]))
                data['wordclouds'][question] = \
                    WordCloud(stopwords=s).generate(data[question])
            except Exception:  # e.g. empty text raises ValueError in generate()
                print(question)
    for stakeholder in data['stakeholders']:
        try:
            data['wordclouds'][stakeholder] = \
                WordCloud(stopwords=stopwords).generate(data[stakeholder]['alltext'])
        except Exception:
            print(stakeholder)
    # Close the pickle file deterministically (original leaked the handle).
    with open('alldata_v2.pickle', 'wb') as out:
        pickle.dump(data, out)
# Build one corpus string from every document body in the `links` collection.
cursor = links.find({}, {"body": 1})
# str.join is O(n); the original `test = test + body` loop was quadratic.
test = "".join(document['body'] for document in cursor)

# Load custom stopwords, one per line; strip each line as it is added.
# (The original added raw newline-terminated lines and then ran
# `stopwords = map(str.strip, stopwords)`, which on Python 3 replaces the
# set with a lazy map iterator.)
with codecs.open("text_mining/my_stopwords.txt", "r", encoding="utf-8") as f:
    read_data = f.readlines()
stopwords = STOPWORDS.copy()
for data in read_data:
    stopwords.add(data.strip())

# Mask image gives the cloud its shape.
mask_choko = np.array(Image.open("text_mining/chokomag.png"))
wordcloud = WordCloud(
    stopwords=stopwords, background_color="black",
    max_words=10000, mask=mask_choko).generate(test)
def main():
    """Load pickled survey data, aggregate stemmed text per stakeholder and
    per question, and write one word-cloud PNG per key into clouds/.

    NOTE(review): Python 2 code -- str.translate(None, string.punctuation)
    only works on py2 byte strings; confirm before porting.
    """
    with open('alldata.pickle', 'rb') as infile:
        data = pickle.load(infile)
    data['wordclouds'] = {}
    stopwords = STOPWORDS.copy()
    # Per-question stopwords (words echoing the question) plus a 'common'
    # blocklist applied to every cloud.
    sphere_stopwords = {
        'common': ['sam', 'mayu', 'mani'],
        '1a': ['live', 'born', 'year', 'years', 'yrs', 'since', 'bangalore'],
        '1b': ['live', 'born', 'year', 'yrs', 'moved', 'since', 'bangalore'],
        '2a': ['balance', 'time'],
        '2b': ['inspiration'],
        '3': ['lifestyle'],
        '4a': ['bangalore', 'advantage', 'advantages'],
        '4b': ['bangalore', 'challenge', 'challenges'],
        '4c': ['work'],
        '4d': ['dependency', 'dependencies', 'external'],
        '4e': ['area', 'bangalore'],
        '4f': ['measures'],
        '5b': ['food', 'air'],
        '6a': ['end', 'user', 'enduser'],
        '6b': ['month', 'income', 'per', 'household'],
        '6c': ['interact', 'interaction', 'end', 'user', 'enduser'],
        '6d': ['design', 'end', 'user', 'enduser'],
        '7a': ['quality', 'control', 'challenge'],
        '7b': ['end', 'user', 'access', 'challenge'],
        '7c': [],
        '7d': [],
        '8a': ['tool', 'tools', 'resource', 'resources'],
        '8b': ['average', 'age', 'years', 'team', 'people'],
        '8c': ['fund', 'funding', 'funded', 'money'],
        '8d': ['tech', 'technology'],
        '8e': ['office', 'location', 'work', 'space'],
        '9a': ['skill', 'skills'],
        '9b': ['training'],
        '10a': ['active', 'internal', 'collaboration', 'collaborate'],
        '10b': ['active', 'external', 'collaboration', 'collaborate'],
        '10c': ['lead', 'leads', 'learning', 'collaboration', 'collaborate'],
        '10d': ['part', 'formal', 'collaboration', 'collaborate', 'platform'],
        '10e': ['culture', 'open', 'share', 'sharing', 'sector'],
        '10f': ['share', 'shares', 'shared'],
        '11a': ['partner', 'partners', 'partnership', 'partnerships'],
        '11b': ['criteria', 'partner', 'partners', 'partnership', 'partnerships'],
        '11c': ['partner', 'partners', 'partnership', 'partnerships', 'sector'],
        '12a': ['monitoring', 'evaluation', 'method', 'methods', 'impact'],
        '12b': ['goal', 'next', 'year', 'years'],
        '12c': ['impact', 'studies', 'data', 'shared'],
        '13a': ['entrepreneur'],
        '13b': ['start', 'starting', 'startup'],
        '13c': ['entrepreneur', 'entrepreneurs', 'interact', 'interaction'],
        '13d': ['entrepreneur', 'entrepreneurs', 'role', 'local', 'needs'],
        '13e': ['advantage', 'advantages', 'local', 'entrepreneur',
                'entrepreneurs'],
        '13f': ['barrier', 'barriers', 'entry', 'local', 'entrepreneur',
                'entrepreneurs'],
        '13g': ['challenge', 'challenges', 'local', 'entrepreneur',
                'entrepreneurs'],
        '13h': ['entrepreneur', 'entrepreneurs', 'fail'],
        '13i': ['resource', 'resources', 'need', 'needed', 'strengthen',
                'local', 'entrepreneur', 'entrepreneurs'],
        '14a': ['recommend', 'stakeholder', 'stakeholders'],
        '14b': ['map', 'visual'],
    }
    # Filter out standalone words 2 letters or shorter.
    shortword = re.compile(r'\W*\b\w{1,2}\b')
    questions = []
    text = {}
    for stakeholder in data:
        # 'wordclouds' is bookkeeping added above, not a stakeholder record;
        # the original iterated over it too and treated it as data.
        if stakeholder == 'wordclouds':
            continue
        text[stakeholder] = ''
        for question in data[stakeholder]:
            if question not in questions:
                questions.append(question)
                text[question] = ''
            # Join the response fragments, lowercase, strip punctuation,
            # drop very short words.
            response = shortword.sub(
                '',
                ' '.join(data[stakeholder][question]).lower()
                    .translate(None, string.punctuation))
            text[stakeholder] += response + ' '
            text[question] += response + ' '
    # Generate one cloud per question.
    for question in sorted(questions):
        # Fixed: original used `is not '5a'` -- identity, not equality.
        if question != '5a':
            try:
                s = stopwords.union(
                    set(sphere_stopwords[question] + sphere_stopwords['common']))
                wordcloud = WordCloud(
                    stopwords=s, width=1600, height=800,
                    background_color='white').generate(text[question])
                wordcloud.to_file(
                    'clouds/' + question + '-'
                    + str(len(text[question].split())) + 'words.png')
            except Exception:  # e.g. empty text raises ValueError in generate()
                print(question)
    # And one cloud per stakeholder.
    for stakeholder in data:
        if stakeholder == 'wordclouds':
            continue
        try:
            s = stopwords.union(set(sphere_stopwords['common']))
            wordcloud = WordCloud(
                stopwords=s, width=1600, height=800,
                background_color='white').generate(text[stakeholder])
            wordcloud.to_file(
                'clouds/' + stakeholder + '-'
                + str(len(text[stakeholder].split())) + 'words.png')
        except Exception:
            print(stakeholder)