def parse_args(arguments):
    """Parse CLI arguments; return (WordCloud kwargs dict, input text, output file)."""
    parser = make_parser()
    args = parser.parse_args(arguments)
    if args.colormask and args.color:
        raise ValueError('specify either a color mask or a color function')

    args = vars(args)

    # The 'text' argument is an already-open file object; close it after reading.
    with args.pop('text') as f:
        text = f.read()

    if args['stopwords']:
        with args.pop('stopwords') as f:
            args['stopwords'] = {line.strip() for line in f.readlines()}

    if args['mask']:
        mask_file = args.pop('mask')
        args['mask'] = np.array(Image.open(mask_file))

    # Default is random coloring; a color mask or a single color overrides it.
    color_func = wc.random_color_func
    colormask = args.pop('colormask')
    color = args.pop('color')
    if colormask:
        color_func = wc.ImageColorGenerator(np.array(Image.open(colormask)))
    if color:
        color_func = wc.get_single_color_func(color)
    args['color_func'] = color_func

    imagefile = args.pop('imagefile')
    return args, text, imagefile
def __main__():
    """Render an English word cloud shaped and colored by fish.jpg."""
    # Directory containing this script; fall back to the CWD when __file__ is
    # undefined (e.g. in a REPL).  BUGFIX: the original tested
    # "__file__" in locals(), which is always False inside a function body.
    d = os.path.dirname(__file__) if "__file__" in globals() else os.getcwd()

    # Read the source text.
    with open(os.path.join(d, 'English.txt'), encoding='utf-8') as file:
        text = file.read()

    font_path = os.path.join(d, 'GABRWFFR.TTF')
    pic = Image.open(os.path.join(d, 'fish.jpg'))
    mask = np.array(pic)

    # Build a color scheme from the mask image itself.
    pic_color = wordcloud.ImageColorGenerator(mask)
    w = wordcloud.WordCloud(font_path=font_path,
                            background_color='white',
                            mask=mask,
                            contour_width=3,
                            contour_color="blue")
    w.generate(text)
    # Recolor so word colors follow the source picture.
    w.recolor(color_func=pic_color)

    # Show the original image and the cloud side by side.
    plt.figure()
    plt.subplot(1, 2, 1)
    plt.imshow(pic)
    plt.subplot(1, 2, 2)
    plt.imshow(w, interpolation="bilinear")
    plt.show()
    w.to_file(os.path.join(d, 'fish.png'))
def create_word_cloud():
    """Build a Douban-movie word cloud shaped by background1.png, colored
    from color.jpg, and save it under dataout/."""
    # Common Chinese function words removed during cleaning.
    stop_words = [
        '就是', '不是', '但是', '还是', '只是', '这样', '这个', '一个', '什么',
        '电影', '没有'
    ]
    # Configure the cloud: shape mask, transparent RGBA background, SimHei font.
    wc = wordcloud.WordCloud(mask=imread('lianxi/background1.png'),
                             background_color=None,
                             stopwords=stop_words,
                             max_words=250,
                             scale=4,
                             mode='RGBA',
                             min_font_size=10,
                             max_font_size=70,
                             random_state=42,
                             font_path="C:\\Windows\\Fonts\\SimHei.TTF")
    wc.generate(cut_word())

    # Recolor words from the palette of color.jpg.
    palette_img = imread('lianxi/color.jpg')
    wc.recolor(color_func=wordcloud.ImageColorGenerator(np.array(palette_img)))

    plt.figure(figsize=(20, 20))
    plt.rcParams['font.family'] = 'SimHei'
    plt.imshow(wc)
    plt.axis("off")  # hide the axes around the cloud
    plt.savefig('dataout/图7豆瓣电影词语云.png')
def show_cloud(self):
    """Read job.txt, count occurrences of selected tech keywords, and show a
    word cloud shaped/colored by wordcloud_bg.jpg."""
    # Read the whole corpus; the context manager closes the file even on error.
    with open('job.txt') as fn:
        string_data = fn.read()

    # Strip punctuation/control characters before segmentation.
    pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"')
    string_data = re.sub(pattern, '', string_data)

    # Accurate-mode segmentation with jieba.
    seg_list_exact = jieba.cut(string_data, cut_all=False)

    # Keep-list mode: only these keywords are counted.
    filter_words = [
        u'Hadoop', u'Spark', u'Hive', u'Flink', u'Hbase', u'hdfs', u'Python'
    ]
    object_list = [word for word in seg_list_exact if word in filter_words]

    # Frequency statistics.
    word_counts = collections.Counter(object_list)
    word_counts_top10 = word_counts.most_common(30)  # top 30 (name kept for compat)
    print(word_counts_top10)  # sanity check

    # Render the cloud on the background image, colored from the same image.
    mask = np.array(Image.open('./wordcloud_bg.jpg'))
    wc = wordcloud.WordCloud(
        font_path="./SourceHanSerifSC-Bold.otf",  # CJK-capable font
        mask=mask,
        max_words=30,
        max_font_size=400
    )
    wc.generate_from_frequencies(word_counts)
    image_colors = wordcloud.ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    plt.imsave("./out.jpg", wc)
def make_word_cloud(object_list):
    """Turn a token list into a word cloud shaped by the global `background`
    image and display it with matplotlib.

    :param object_list: list of words to tally and draw
    """
    print('\n开始制作词云……')  # progress hint
    freq = collections.Counter(object_list)
    mask = numpy.array(Image.open(background))  # shape/color template
    wc = wordcloud.WordCloud(
        font_path='file/simfang.ttf',  # FangSong font (CJK support)
        background_color='white',
        mask=mask,                     # width/height are ignored once mask is set
        max_words=number,              # global display-word limit
        max_font_size=150
    )
    wc.generate_from_frequencies(freq)
    # Recolor words from the mask image's palette.
    wc.recolor(color_func=wordcloud.ImageColorGenerator(mask))

    plt.figure('词云')
    plt.subplots_adjust(top=0.99, bottom=0.01, right=0.99, left=0.01,
                        hspace=0, wspace=0)  # trim figure margins
    plt.imshow(wc, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis('off')
    print('制作完成!')
    plt.show()
def show_cloud():
    """Build and display a word cloud from all stored comments."""
    # Collect comment texts, stripping HTML tags and newlines.  Joining once
    # avoids the quadratic cost of += string concatenation in a loop.
    fragments = []
    for comment in database.get_comments():
        fragments.append(
            re.sub('<[^<]+?>', '', comment['text']).replace('\n', '').strip())
    total_comment_text = "".join(fragments)

    # Cut sentences into short words.
    wordlist = jieba.lcut(total_comment_text)
    wordliststr = " ".join(wordlist)

    font = os.path.join(os.path.dirname(__file__), "word_cloud_yahei.ttf")
    # Background image: defines both the shape and the color scheme.
    mask = np.array(Image.open('word_cloud_mask.png'))

    wd = wordcloud.WordCloud(scale=8,
                             width=1920,
                             height=1080,
                             font_path=font,
                             mask=mask,
                             max_font_size=100,
                             min_font_size=12,
                             background_color="white",
                             stopwords=get_stop_words()).generate(wordliststr)
    image_colors = wordcloud.ImageColorGenerator(mask)
    wd.recolor(color_func=image_colors)  # colors from the background image

    plt.figure()
    plt.imshow(wd)
    plt.axis("off")
    plt.show()
def cloudplot():
    """Heart-shaped word cloud from AnalyzeData() frequencies; shows the
    default-colored cloud, the image-recolored cloud, and the raw template."""
    # Template image: provides both the outline and the color palette.
    target_coloring = imread(r'C:\Users\ctrl\Desktop\heart.jpg')

    # Cloud built from frequencies, constrained to the template's shape.
    word_cloud = WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf',
                           background_color="white",
                           max_words=2000,
                           mask=target_coloring).generate_from_frequencies(
                               AnalyzeData())

    # Color distribution sampled from the template image.
    image_color = wordcloud.ImageColorGenerator(target_coloring)

    # Figure 1: default colors.
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.figure()
    # Figure 2: recolored to match the template image.
    plt.imshow(word_cloud.recolor(color_func=image_color))
    plt.axis("off")
    plt.figure()
    # Figure 3: the raw template for comparison.
    plt.imshow(target_coloring, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()

    word_cloud.to_file(filename + '.png')
def get_wordcloud():
    """Build a word cloud from JD product comments (jd_comment.txt)."""
    words_str = ""
    with open("jd_comment.txt") as f:
        for line in f:
            # Drop digits, punctuation and other noise characters.
            line = re.sub(
                u"[0-9\s+.!/,$%^*()?;;:-【】\"\']+|[+—!,;:。?、~@#¥%…&*()><-]+",
                "", line)
            if line == "":
                continue
            line = line.replace("\n", "")  # strip the trailing newline
            seg_list = jieba.cut(line, cut_all=False)
            words_str += (" ".join(seg_list))

    stopwords = stopwordslist()
    words = [
        word for word in words_str.split(" ")
        if word not in stopwords and len(word) > 1
    ]
    # Counter(iterable) tallies directly — no manual increment loop needed.
    word_counts = Counter(words)

    mask = np.array(PIL.Image.open(r'./background.jpg'))
    wc = WordCloud(font_path=r'C:\Windows\Fonts\SimHei.TTF',
                   max_words=2000,
                   mask=mask,
                   repeat=False)
    wc.generate_from_frequencies(word_counts)
    # Optional: color the words from the background image.
    image_colors = wordcloud.ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)
    wc.to_file("词云.jpg")
def question_41():
    """Word cloud for the 汽车之家论坛 corpus, shaped/colored from bk.jpg."""
    data_name = '汽车之家论坛'
    # Close the file deterministically instead of leaking the handle.
    with open('{}_cut.txt'.format(data_name), 'r', encoding='utf-8') as f:
        text = f.read()

    alice_coloring = imread("bk.jpg")  # shape + color template
    wc = wordcloud.WordCloud(background_color="white",
                             width=800,
                             height=600,
                             mask=alice_coloring,
                             max_font_size=20,
                             random_state=1,
                             max_words=100,
                             font_path='C:\\Windows\\msyh.ttf')
    wc.generate(text)
    image_colors = wordcloud.ImageColorGenerator(alice_coloring)

    plt.axis("off")
    plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors))  # recolor from template
    plt.axis("off")
    plt.figure(dpi=300)
    plt.axis("off")
    wc.to_file('{}_word_cloud.png'.format(data_name))
def create_word_cloud(self, text):
    """Tokenize *text*, count word frequencies and render a word cloud
    shaped and colored by self.wc_background.

    :param text: raw input text (tokenized via self.tokenization)
    """
    tokens = self.tokenization(text)
    counts = collections.Counter(tokens)              # word-frequency table
    word_counts_topK = counts.most_common(self.topK)  # top-K words
    # print(word_counts_topK)

    mask = np.array(Image.open(self.wc_background))   # shape/color template
    wc = wordcloud.WordCloud(
        font_path=self.font_path,
        mask=mask,
        max_words=self.max_words,
        max_font_size=self.max_font_size
    )
    wc.generate_from_frequencies(counts)
    # Color the words from the background image.
    wc.recolor(color_func=wordcloud.ImageColorGenerator(mask))

    plt.imshow(wc)
    plt.axis('off')
    plt.savefig(os.path.join(self.save_path, self.wc_name))
    plt.show()
def title():
    """Word cloud of article titles from collection1, shaped by CR.jpg."""
    import jieba
    import jieba.analyse

    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stop_words.txt')

    # Gather every title from the collection.
    titles = [record5['title'] for record5 in collection1.find({})]
    joined = "".join(titles)

    # Full-mode segmentation, space-joined for WordCloud.generate().
    word_space_split = " ".join(jieba.cut(joined, cut_all=True))

    stopwords = set()  # no extra stopwords beyond jieba's stopword file
    mask2 = np.array(Image.open('CR.jpg'))  # shape template
    wc2 = wordcloud.WordCloud(
        background_color="white",
        font_path="fonts/simkai.ttf",
        stopwords=stopwords,
        mask=mask2,
        max_words=200,
        max_font_size=100
    )
    wc2.generate(word_space_split)
    # Color the words from the background image.
    wc2.recolor(color_func=wordcloud.ImageColorGenerator(mask2))

    plt.imshow(wc2)
    plt.axis('off')
    plt.show()
def word_cloud(segment, title):
    """Render a word cloud from *segment* and save it under static/wordcloud/.

    :param segment: list of tokens to count
    :param title: unused here; kept for interface compatibility
    :return: (relative image url, top-10 (word, count) pairs)
    """
    counts = collections.Counter(segment)
    top10 = counts.most_common(10)
    print(top10)

    root_dir = current_app.config['ROOT_DIR']
    # Background image: defines shape and color scheme.
    mask = np.array(Image.open(root_dir + '/static/background.jpg'))
    wc = wordcloud.WordCloud(
        font_path=root_dir + '/static/STFQLBYTJW.ttf',
        mask=mask,
        max_words=200,
        max_font_size=100
    )
    wc.generate_from_frequencies(counts)
    wc.recolor(color_func=wordcloud.ImageColorGenerator(mask))

    # Timestamped filename avoids stale browser caches.
    r_img_url = '/static/wordcloud/wordcloud' + str(time()) + '.png'
    wc.to_file(root_dir + r_img_url)
    print('图片生成成功!')
    return r_img_url, top10
def words_cloud(self, chin_list: list, eng_list: list, rm_words=None):
    """Render a word cloud from Chinese + English tokens, excluding rm_words.

    :param chin_list: Chinese tokens
    :param eng_list: English tokens
    :param rm_words: words to drop; defaults to a built-in stopword list
    """
    if rm_words is None:
        rm_words = [
            '', 'and', 'to', 'the', 'a', 'for', 'our', 'we', 'with', 'in',
            'of', 'We', '的', 'be', 'that', 'their', 'as', 'on', 'an', 'is',
            '和', 'have', 'are', 'by', 'most', '-', 'where', 'its'
        ]
    # Filter out stopwords before counting.
    words_list = [w for w in chin_list + eng_list if w not in rm_words]
    word_counts = collections.Counter(words_list)

    mask = np.array(Image.open('./wordcloud.jpg'))  # shape/color template
    wc = wordcloud.WordCloud(
        font_path='/r2/dockerfile/py2/fonts/simhei.ttf',  # CJK font
        mask=mask,
        scale=4,
        background_color='white',
        max_words=2000,
        max_font_size=100)
    wc.generate_from_frequencies(word_counts)
    # Recolor once from the mask image (the original called recolor twice
    # with the same deterministic ImageColorGenerator — redundant work).
    image_colors = wordcloud.ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')
    wc.to_file(f'{self.path}/word_cloud.jpg')
def main():
    """Word cloud of an earnings-call transcript, shaped and colored by the
    company logo (ticker hard-coded to 'gm')."""
    base_logo_path = "../../assets/publiccompany/logos/%s.png"
    base_transcript_path = "../../assets/publiccompany/transcripts/%s.txt"
    ticker = "gm"

    # The logo doubles as mask and color source.
    img = Image.open(base_logo_path % ticker)
    img = img.resize((1280, 1080))
    img_color = np.array(img)
    img_color_gen = wordcloud.ImageColorGenerator(img_color)

    # Extend the stock stopword set with earnings-call boilerplate.
    # (Plain loop: a list comprehension used only for its side effects is an
    # anti-pattern.)
    stop_words = wordcloud.STOPWORDS
    for word in ["quarter", "think", "customer", "growth", "Azure"]:
        stop_words.add(word)

    img_mask = np.array(img)

    # Read the transcript; close the file deterministically.
    with open(base_transcript_path % ticker) as f:
        src_text = f.read()

    wc = wordcloud.WordCloud(width=1280,
                             height=720,
                             mask=img_mask,
                             stopwords=stop_words,
                             max_words=1000)
    wc.generate(src_text)
    wc.recolor(color_func=img_color_gen)  # colors follow the logo
    wc.to_file("wc.jpg")
def makewordcloud(self):
    """Extract weighted keywords from comment.csv with jieba TF-IDF and show
    a word cloud shaped/colored by index.png."""
    # Concatenate the comment column (column 1, one comment per row).
    content = ''
    with open('comment.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            content += row[1]
            content = content + '\n'

    jieba.analyse.set_stop_words('stop.txt')
    tags = jieba.analyse.extract_tags(content, topK=100, withWeight=True)

    # Scale float weights to ints for generate_from_frequencies.
    # (The original stored str(int(...)) and immediately overwrote it with the
    # int — that dead store is removed.)
    word_freq = {v: int(n * 10000) for v, n in tags}

    mask = np.array(Image.open('index.png'))  # shape template
    wc = wordcloud.WordCloud(
        font_path='C:/Windows/Fonts/simhei.ttf',  # CJK font
        mask=mask,
        max_words=2000,
        max_font_size=120,
        background_color='white')
    wc.generate_from_frequencies(word_freq)
    # Color the words from the background image.
    wc.recolor(color_func=wordcloud.ImageColorGenerator(mask))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def cloudplot():
    """Alice-shaped word cloud from AnalyzeData() frequencies; shows the
    default-colored cloud, the image-recolored cloud, and the template."""
    # Template defines the overall shape and palette.
    target_coloring = imageio.imread(r'data\alice.png')

    word_cloud = WordCloud(
        font_path=r'C:\windows\Fonts\simhei.ttf',
        background_color="white",
        max_words=2000,
        mask=target_coloring).generate_from_frequencies(AnalyzeData())

    # Color distribution sampled from the template image.
    image_color = wordcloud.ImageColorGenerator(target_coloring)

    # Default-colored cloud.
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.figure()
    # Recolored to follow the template image.
    plt.imshow(word_cloud.recolor(color_func=image_color))
    plt.axis("off")
    plt.figure()
    # Raw template for reference.
    plt.imshow(target_coloring, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()

    word_cloud.to_file(filename + '.png')
def get_wordcloud(self, imagePath, words_str, store_path=''):
    """Generate a word cloud shaped by the image at *imagePath*, show the
    default and image-recolored versions next to the source image, and save
    the result to *store_path* (falling back to self.OUTPUTPATH).

    :param imagePath: path of the background/shape image
    :param words_str: space-separated words to draw
    :param store_path: output path; '' uses self.OUTPUTPATH
    """
    # The background image sets the cloud's silhouette; without a mask the
    # cloud defaults to a rectangle.
    background = plt.imread(imagePath)
    img = Image.open(imagePath)
    width = img.width
    height = img.height

    # font_path points at a CJK-capable font (wordcloud only bundles Latin);
    # max/min_font_size bound the largest and smallest rendered words.
    wc = wordcloud.WordCloud(mask=background,
                             font_path=self.FONTPATH,
                             background_color='white',
                             width=width,
                             height=height,
                             max_font_size=400,
                             min_font_size=5)
    alice_coloring = np.array(img)
    image_colors = wordcloud.ImageColorGenerator(alice_coloring)

    wc.generate(words_str)

    # Three panels: default colors, recolored, and the raw image.
    fig, axes = plt.subplots(1, 3)
    axes[0].imshow(wc, interpolation="bilinear")
    # color_func could also be passed straight to the constructor.
    axes[1].imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
    axes[2].imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
    for ax in axes:
        ax.set_axis_off()
    plt.show()

    wc.to_file(store_path or self.OUTPUTPATH)
def main():
    # Interactive word-cloud builder: reads a text file, optionally a shape
    # image, then loops letting the user tweak settings until satisfied.
    print("@@@@@@@@欢迎来到词云生成器@@@@@@@@")
    file_name = input("请输入txt文档所在路径: ")
    file_info = readFile(file_name)
    print("文件内容读取成功")
    txt = cutWords(file_info)
    main_choice = getChoice("是否要改变词云形成形状?<1>是 <2>否 ", 1, 2)
    color_mask = None
    if main_choice == 1:
        file_name = input("请输入形状图片所在路径: ")
        color_mask = getImage(file_name)
        print("获取形状图片信息成功")
        # Color scheme sampled from the shape image.
        image_colors = wordcloud.ImageColorGenerator(color_mask)
    plt.ion()  # interactive mode so the preview window doesn't block input()
    while True:
        plt.close(1)  # close the previous preview before redrawing
        back_color = input("请输入背景颜色的英文单词(red, black, white等): ")
        scale = getChoice("请输入图片清晰度(1比较模糊, 4较为清晰) ", 1, 100)
        choice = getChoice("是否使用默认字体<1>是 <2>否 ", 1, 2)
        font_path = "msyh.ttc"  # default font unless the user picks another
        if choice == 2:
            font_path = getFont()
        repeat = False
        choice = getChoice("是否允许词汇重复出现<1>是 <2>否 ", 1, 2)
        if choice == 1:
            repeat = True
        max_words = getChoice("请输入最大出现词汇数量(默认200): ", 1, 99999)
        print("正在生成词云,时间可能较长,请稍后。。。(P.S. 显示时图片可能较模糊,但保存后图片会很清晰)")
        w = wordcloud.WordCloud(font_path=font_path,
                                background_color=back_color,
                                width=1000,
                                height=700,
                                mask=color_mask,
                                scale=scale,
                                repeat=repeat,
                                max_words=max_words)
        w.generate(txt)
        if main_choice == 1:
            # Optionally recolor the words from the shape image's palette.
            choice = getChoice("词云底色是否要与形状图片相同<1>是 <2>否 ", 1, 2)
            if choice == 1:
                w.recolor(color_func=image_colors)
        print("生成图片中。。。")
        plt.figure(1)
        plt.imshow(w)
        plt.axis('off')
        plt.pause(0.001)  # give the figure a tick to render in interactive mode
        plt.show()
        if getChoice("是否满意<1>是 <2>否 ", 1, 2) == 1:
            break
    name = input("请输入保存图片名: ")
    print("图片保存中。。。")
    w.to_file(name + ".png")
    name = input("图片保存成功!")
def extract_two(self, file=None):
    """Segment a Chinese text file, drop stopwords, count frequencies and
    render a word cloud over background.jpg; also saves it to bb.jpg.

    :param file: path of the UTF-8 text file to process
    """
    # Read the corpus; the context manager closes the handle even on error.
    with codecs.open(file, 'r+', encoding='utf-8') as fn:
        string_data = fn.read()

    # Strip punctuation, digits, Latin letters and ellipsis runs.
    pattern = re.compile(
        '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~。“”、:?,【】!()——↓0-9a-zA-Z\.\.\.\.\.\.]+'
    )
    string_data = re.sub(pattern, '', string_data)
    string_data = string_data.replace('\n', '')
    string_data = string_data.replace('\u3000', '')
    string_data = string_data.replace('\r', '')
    string_data = string_data.replace(' ', '')
    logging.info(string_data)

    # Accurate-mode segmentation.
    seg_list_exact = jieba.cut(string_data, cut_all=False)
    object_list = []
    # Custom removals on top of the four published stopword lists.
    remove_words_custom = [
        u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都',
        u'。', u' ', u'、', u'中', u'在', u'了', u'通常', u'如果', u'我们',
        u'需要', u'月', u'日'
    ]
    remove_words = self.parse_multiple_files(
        ['中文停用词表.txt', '哈工大停用词表.txt',
         '四川大学机器智能实验室停用词库.txt', '百度停用词表.txt'])
    remove_words = remove_words_custom + remove_words
    for word in seg_list_exact:
        if word not in remove_words:
            logging.info('\n')
            logging.info(word)
            object_list.append(word)
    logging.info(object_list)

    # Frequency statistics.
    word_counts = collections.Counter(object_list)
    word_counts_top10 = word_counts.most_common(10)
    print(word_counts_top10)  # sanity check

    # Render on the background image, colored from the same image.
    font_path = r'C:\Windows\Fonts\simfang.ttf'
    mask = np.array(Image.open('background.jpg'))
    wc = wordcloud.WordCloud(
        background_color='white',
        font_path=font_path,
        mask=mask,
        max_words=200,
        max_font_size=200,
        scale=80  # higher scale => sharper (and much larger) output
    )
    wc.generate_from_frequencies(word_counts)
    image_colors = wordcloud.ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)
    plt.figure()
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file("bb.jpg")
def parse_args(arguments):
    """Parse command-line *arguments* for the wordcloud CLI and return the
    processed argparse namespace: text read, stopwords loaded, mask decoded,
    and the color choice resolved into args.color_func.

    Raises ValueError when both --colormask and --color are given.
    """
    description = ('A simple command line interface for wordcloud module.')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--text', metavar='file', type=argparse.FileType(), default='-',
                        help='specify file of words to build the word cloud (default: stdin)')
    parser.add_argument('--stopwords', metavar='file', type=argparse.FileType(),
                        help='specify file of stopwords (containing one word per line) to remove from the given text after parsing')
    parser.add_argument('--imagefile', metavar='file', type=argparse.FileType('w'), default='-',
                        help='file the completed PNG image should be written to (default: stdout)')
    parser.add_argument('--fontfile', metavar='path', dest='font_path',
                        help='path to font file you wish to use (default: DroidSansMono)')
    parser.add_argument('--mask', metavar='file', type=argparse.FileType(),
                        help='mask to use for the image form')
    parser.add_argument('--colormask', metavar='file', type=argparse.FileType(),
                        help='color mask to use for image coloring')
    parser.add_argument('--relative_scaling', type=float, default=0, metavar='rs',
                        help=' scaling of words by frequency (0 - 1)')
    parser.add_argument('--margin', type=int, default=2, metavar='width',
                        help='spacing to leave around words')
    parser.add_argument('--width', type=int, default=400, metavar='width',
                        help='define output image width')
    parser.add_argument('--height', type=int, default=200, metavar='height',
                        help='define output image height')
    parser.add_argument('--color', metavar='color',
                        help='use given color as coloring for the image - accepts any value from PIL.ImageColor.getcolor')
    parser.add_argument('--background', metavar='color', default='black', type=str, dest='background_color',
                        help='use given color as background color for the image - accepts any value from PIL.ImageColor.getcolor')
    # BUGFIX: this help string contained a literal newline in the source,
    # which is a syntax error; it is now a single valid string.
    parser.add_argument('--no_collocations', action='store_true',
                        help='do not add collocations (bigrams) to word cloud (default: add unigrams and bigrams)')
    args = parser.parse_args(arguments)

    if args.colormask and args.color:
        raise ValueError('specify either a color mask or a color function')

    # Read the input text, closing the file handle afterwards.
    with args.text:
        args.text = args.text.read()

    if args.stopwords:
        with args.stopwords:
            args.stopwords = set(map(str.strip, args.stopwords.readlines()))

    if args.mask:
        args.mask = np.array(Image.open(args.mask))

    # Default is random coloring; a color mask or a single color overrides it.
    color_func = wc.random_color_func
    if args.colormask:
        image = np.array(Image.open(args.colormask))
        color_func = wc.ImageColorGenerator(image)
    if args.color:
        color_func = wc.get_single_color_func(args.color)

    args.collocations = not args.no_collocations
    args.color_func = color_func
    return args
def wordCount(url):
    """Fetch the article at *url* via it.contentInfo, segment it, count word
    frequencies and render a word cloud saved next to the extracted text."""
    fileName = url[url.rfind('/') + 1:url.rfind('.')]
    it.contentInfo(url, True)

    # Read the extracted article text; the context manager closes the file.
    with open('../../fileInfo/' + fileName + '.txt') as fn:
        string_data = fn.read()

    # Strip punctuation/control characters.
    pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"')
    string_data = re.sub(pattern, '', string_data)

    # Accurate-mode segmentation.
    seg_list_exact = jieba.cut(string_data, cut_all=False)
    # Custom stopword list.
    remove_words = [
        u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都',
        u'。', u' ', u'、', u'中', u'在', u'了', u'通常', u'如果', u'我们',
        u'需要', u'…', u':', u'“', u'”', u':', u'但', u'型', u'被'
    ]
    object_list = [word for word in seg_list_exact if word not in remove_words]

    # Frequency statistics.
    word_counts = collections.Counter(object_list)
    word_counts_top10 = word_counts.most_common(10)
    # BUGFIX: the original used a Python 2 print statement
    # (`print item[0], item[1]`), a syntax error under Python 3.
    for item in word_counts_top10:
        print(item[0], item[1])

    # Render the cloud on the background template, colored from it.
    mask = np.array(Image.open('./timg.jpeg'))
    wc = wordcloud.WordCloud(
        font_path='simhei.ttf',  # CJK font
        mask=mask,
        width=800,
        height=600,
        max_words=200,
        max_font_size=100
    )
    wc.generate_from_frequencies(word_counts)
    image_colors = wordcloud.ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)

    # Save, then display without axes.
    plt.imsave('../../fileInfo/' + fileName + '.png', wc)
    plt.imshow(wc)
    plt.axis('off')
def show_novel():
    """Analyze the novel 全职高手: bar-chart the top-20 words and render a
    word cloud shaped by test.jpg and colored from color.jpg."""
    # Load stopwords into a list.
    stop_list = []
    with open('materials/停用词.txt', 'r') as f:
        for line in f.readlines():
            stop_list.append(line.strip())

    # Load the novel text (GBK encoded).
    with open('materials/全职高手.txt', 'r', encoding='gbk') as f:
        text = f.read()

    # The custom dictionary must be UTF-8 encoded.
    jieba.load_userdict('materials/词库字典.txt')

    def txt_cut(f):
        # Segment, dropping stopwords and single characters.
        return [w for w in jieba.cut(f) if w not in stop_list and len(w) > 1]

    textcut = txt_cut(text)

    # Quick frequency summary: top 20 words.
    word_count = pd.Series(textcut).value_counts().sort_values(
        ascending=False)[0:20]

    # Bar chart of the top-20 counts.
    fig = plt.figure(figsize=(15, 8))
    x = word_count.index.tolist()
    y = word_count.values.tolist()
    sns.barplot(x, y, palette='BuPu_r')
    plt.title('词频Top20')
    plt.ylabel('count')
    sns.despine(bottom=True)
    plt.savefig('词频统计.png', dpi=400)
    plt.show()

    # Word cloud with transparent RGBA background, shaped by test.jpg.
    fig_cloud = plt.figure(figsize=(15, 5))
    cloud = wordcloud.WordCloud(font_path='materials/simkai.ttf',
                                mask=imread('materials/test.jpg'),
                                mode='RGBA',
                                background_color=None).generate(
                                    ' '.join(textcut))
    # Recolor from the palette image color.jpg.
    img = imread('materials/color.jpg')
    cloud.recolor(color_func=wordcloud.ImageColorGenerator(np.array(img)))

    plt.imshow(cloud)
    plt.axis('off')
    plt.savefig('wordcloud.png', dpi=400)
    plt.show()
def plotWordColud(user_id):
    """Build two word clouds for a user: one from their own weibos and one
    from their friends' weibos (friendship derived from Edges.csv).

    :param user_id: id whose posts and neighborhood are plotted
    """
    user_text, friend_text = "", ""
    with open("weibo.csv", 'r') as file:
        reader = csv.reader(file)
        weibos = list(reader)
    with open("Edges.csv", 'r') as file:
        reader = csv.reader(file)
        edges = list(reader)

    # Friends = every node sharing an (undirected) edge with user_id.
    friends = []
    for edge in edges:
        if edge[0] == user_id:
            if edge[1] not in friends:
                friends.append(edge[1])
        if edge[1] == user_id:
            if edge[0] not in friends:
                friends.append(edge[0])

    # Split weibo texts into the user's own vs friends'.
    for weibo in weibos:
        if weibo[1] == user_id:
            user_text += weibo[2]
        if weibo[1] in friends:
            friend_text += weibo[2]

    user_ = ",".join(segment(user_text))
    friend_ = ",".join(segment(friend_text))

    # Shape and colors both come from logo.jpg.
    # (The unused `jpg = imread('logo.jpg')` local was removed.)
    mask = np.array(Image.open('logo.jpg'))
    image_colors = wordcloud.ImageColorGenerator(mask)
    wc = WordCloud(background_color="white",
                   max_words=200,
                   min_font_size=10,
                   max_font_size=35,
                   width=400,
                   font_path="/Users/wu/Downloads/msyh/msyh.ttf",
                   mask=mask,
                   color_func=image_colors)

    # Render and save both clouds with the same configuration.
    wc.generate(user_)
    file_path = "./wordCloud/user_.png"
    wc.to_file(file_path)
    wc.generate(friend_)
    file_path = "./wordCloud/friend_.png"
    wc.to_file(file_path)
def getKeyWords(self):
    """Count keywords across self.weibos, record the top non-generic pairs in
    self.keywors, and save a word cloud colored by cc.jpg."""
    text = ""
    for weibo in self.weibos:
        text += weibo['text']

    # Tally words of length >= 2.
    segs = jieba.cut(text)
    wcdict = {}
    for word in segs:
        if len(word) == 1:
            continue
        wcdict[word] = wcdict.get(word, 0) + 1
    wcls = list(wcdict.items())
    wcls.sort(key=lambda x: x[1], reverse=True)

    # Generic words excluded from the reported keywords.
    xx = [
        '他们', '没有', '自己', '一个', '什么', '这样', '知道', '我们', '这个',
        '这些', '不过', '已经', '要是', '觉得', '那样', '而且', "微博", "转发",
        "通过", "现在", "有人", "时候"
    ]
    for pair in wcls[:10]:
        if pair[0] not in xx:
            # NOTE(review): attribute name "keywors" (sic) kept — it is part
            # of the object's external state.
            self.keywors.append(pair)

    # Word cloud shaped and colored by cc.jpg.
    # (The unused `jpg = imread('cc.jpg')` local was removed.)
    mask = np.array(Image.open('cc.jpg'))
    image_colors = wordcloud.ImageColorGenerator(mask)

    # Repeat each word up to 30 times; yc_text only gates cloud generation.
    text_list = []
    for (word, cnt) in wcls:
        times = min(cnt, 30)
        for i in range(times):
            text_list.append(word)
    yc_text = ",".join(text_list)
    if len(yc_text) > 0:
        wc = WordCloud(background_color="white",
                       max_words=300,
                       min_font_size=15,
                       repeat=False,
                       max_font_size=50,
                       width=400,
                       font_path="/Users/wu/Downloads/msyh/msyh.ttf",
                       mask=mask,
                       color_func=image_colors)
        # Frequencies come straight from the tally, not from yc_text.
        wc.generate_from_frequencies(dict(wcls))
        file_path = "./derived/kw_" + self.user_id + ".png"
        wc.to_file(file_path)
def parse(self, response):
    """Scrapy parse callback: collect article paragraph text, count words via
    sta(), and display a word cloud shaped by py.png."""
    # Gather all paragraph text fragments under .detail_text; join once
    # instead of quadratic string +=.
    fragments = []
    for quote in response.css('.detail_text'):
        for text in quote.css('p::text').extract():
            fragments.append(text)
    content = "".join(fragments)

    # Word statistics -> {word: count}.  The original threaded the pair list
    # through an empty tuple / list / tuple before dict(); dict() on the
    # most_common() pairs is equivalent.
    freq = dict(Counter(sta(content)).most_common())

    mask = np.array(Image.open(path.join(path.dirname(__file__), "py.png")))
    # Convert each mask pixel through transform_format (defined on self).
    transformed_mask = np.ndarray((mask.shape[0], mask.shape[1]), np.int32)
    for i in range(len(mask)):
        transformed_mask[i] = list(map(self.transform_format, mask[i]))

    cloud = wordcloud.WordCloud(
        background_color="#000",
        width=1000,
        height=800,
        mask=transformed_mask,
        collocations=False).generate_from_frequencies(freq)
    image_colors = wordcloud.ImageColorGenerator(mask)

    plt.figure()
    # Recolor from the original (untransformed) mask image.
    plt.imshow(cloud.recolor(color_func=image_colors),
               interpolation="bilinear")
    plt.axis("off")
    plt.show()
def word_cloud(wordsCnt, i):
    """Render the frequency mapping *wordsCnt* as a word cloud shaped by
    wordcloud.jpg; displays it and writes 'WordCloud<i>'.

    :param wordsCnt: {word: count} mapping
    :param i: suffix used in the output file name
    """
    template = np.array(Image.open('wordcloud.jpg'))  # shape/color template
    wc = wordcloud.WordCloud(
        font_path='C:/Windows/Fonts/simhei.ttf',  # CJK-capable font
        mask=template,
        max_words=200,
        max_font_size=100
    )
    wc.generate_from_frequencies(wordsCnt)
    # Color the words from the template image.
    wc.recolor(color_func=wordcloud.ImageColorGenerator(template))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    # NOTE(review): the output name carries no image extension — presumably
    # intentional; confirm with callers.
    wc.to_file('WordCloud' + str(i))
def draw_word_cloud(word, graph):
    """Draw a word cloud from the text *word*, shaped and colored by the
    image array *graph*; saves word9.png and shows the recolored cloud.

    :param word: text to feed to WordCloud.generate
    :param graph: image array used as mask and color source
    """
    font_path = 'C:\\Windows\\Fonts\\Deng.ttf'  # CJK-capable font
    cloud = wordcloud.WordCloud(font_path=font_path,
                                background_color='white',
                                max_words=200,
                                mask=graph,
                                scale=1.5,
                                random_state=10)
    cloud.generate(word)
    palette = wordcloud.ImageColorGenerator(graph)
    # Save first (default colors), then display the recolored version.
    cloud.to_file('word9.png')
    plt.imshow(cloud.recolor(color_func=palette))
    plt.axis('off')
    plt.show()
def get_word_cloud(self, chinese_slice=False, stopwords=None, image_out_name=None):
    """Draw a heart-shaped word cloud from self.chat_content.

    :param chinese_slice: Whether Use jieba to slice the sentences.
    :param stopwords: a set include some words to exclude.
    :param image_out_name: output file name (defaults to 'word-heart.png').
    """
    font_path = self.font_path
    if image_out_name is None:
        image_out_name = 'word-heart.png'

    if chinese_slice:
        # Join with commas, segment with jieba, then re-join with spaces.
        joined = ",".join(self.chat_content)
        text = " ".join(jieba.lcut(joined))
        image_out_name = 'zh-' + image_out_name  # mark the sliced variant
    else:
        text = " ".join(self.chat_content)

    mk = imageio.imread("heart.png")  # mask: defines the heart outline

    # scale=2 raises the render resolution for a crisper image.
    w = wordcloud.WordCloud(width=1000,
                            height=700,
                            background_color='white',
                            font_path=font_path,
                            mask=mk,
                            scale=2,
                            stopwords=stopwords,
                            contour_width=1,
                            contour_color='red')
    w.generate(text)

    # Color the words from the background image's palette and show it.
    image_colors = wordcloud.ImageColorGenerator(mk)
    plt.imshow(w.recolor(color_func=image_colors))
    plt.axis("off")
    plt.show()

    # Export the cloud to the current folder.
    w.to_file(image_out_name)
def word_cloud_generator(self, font_name="ShangShouXiuYuanTi.ttf"):
    """Render self.text as a word cloud shaped by self.img_name and save it
    under static/resume/images/.

    :param font_name: font file name under font_base_path
    :return: path of the generated image
    """
    origin_img_path = BASE_DIR + '/static/backgrounds/' + self.img_name
    new_img_path = BASE_DIR + '/static/resume/images/' + self.img_name
    mask_image = imread(origin_img_path, flatten=False)

    if self.color_adaption:
        # Take word colors from the background image itself.
        palette = wc.ImageColorGenerator(mask_image)
        word_pic = WordCloud(font_path=font_base_path + font_name,
                             background_color=self.background_color,
                             mask=mask_image,
                             color_func=palette).generate(self.text)
    else:
        word_pic = WordCloud(font_path=font_base_path + font_name,
                             background_color=self.background_color,
                             mask=mask_image).generate(self.text)

    imsave(new_img_path, word_pic)
    return new_img_path
def plot(self):
    """Render self.word_counts as a word cloud shaped/colored by self.mask,
    write it to self.output_pic_path, then display it."""
    cloud = wordcloud.WordCloud(
        background_color='white',
        font_path='/System/Library/Fonts/Hiragino Sans GB.ttc',  # CJK font
        mask=self.mask,  # shape template
        max_words=200,
        max_font_size=80,
        scale=128        # very high scale => sharp (and large) output
    )
    cloud.generate_from_frequencies(self.word_counts)
    # Recolor from the mask image's palette.
    cloud.recolor(color_func=wordcloud.ImageColorGenerator(self.mask))
    cloud.to_file(self.output_pic_path)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()