def do_charts():
    nvshen = pd.read_csv('nvshen.csv')
    nvshen.sort_values('weight_score', ascending=False, inplace=True)

    bar = Bar()
    count_top = nvshen['count'][0:10].values.tolist()
    count_bottom = nvshen['count'][-10:-1].values.tolist()
    count = [''.join(list(filter(str.isdigit, i))) for i in count_top] + \
            [''.join(list(filter(str.isdigit, i))) for i in count_bottom]
    name_top = nvshen['name'][0:10]
    name_bottom = nvshen['name'][-10:-1]
    name = name_top.values.tolist() + name_bottom.values.tolist()
    score_top = nvshen["weight_score"][0:10]
    score_bottom = nvshen["weight_score"][-10:-1]
    score = score_top.values.tolist() + score_bottom.values.tolist()

    bar.add_xaxis(name)
    bar.add_yaxis("女神得分/百分制", score, gap="0%")
    bar.add_yaxis("打分人数/万", count, gap="0%")
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title="女神大会", subtitle="女神大排名-top10"),
        datazoom_opts=opts.DataZoomOpts(is_show=True, orient="vertical"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
        toolbox_opts=opts.ToolboxOpts())
    bar.render('女神大排名-top10.html')

    word_name = nvshen['name'].values.tolist()
    word_value = nvshen["weight_score"].values.tolist()
    words = [i for i in zip(word_name, word_value)]
    wordcloud = WordCloud()
    wordcloud.add("", words, word_size_range=[10, 40], shape='circle')
    wordcloud.render("女神词云.html")
def wordcloud_render(self, length_num=0):
    self.jieba_cut(length_num)
    c = WordCloud()
    c.add('', self.word_tuple_result, shape=SymbolType.DIAMOND)
    c.set_global_opts(title_opts=opts.TitleOpts(title=self.title),
                      legend_opts=opts.LegendOpts(is_show=self.is_show))
    return c
def word():
    w = WordCloud()
    import jieba
    txt = ''
    with open('data.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            txt += filter_emoji(row[0]).replace('[', '').replace(']', '') \
                .replace(' ', '').replace(' ', '').replace("'", '').replace(',', '')
    txt = jieba.lcut(txt)
    count = {}
    for wo in txt:
        if wo in count:
            count[wo] += 1
        else:
            count[wo] = 1
    w.add(series_name="热点分析", data_pair=list(count.items()), word_size_range=[6, 66])
    w.set_global_opts(
        title_opts=opts.TitleOpts(
            title="热点分析", title_textstyle_opts=opts.TextStyleOpts(font_size=23)),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
    w.render()
def ci_yun():
    wordcloud = WordCloud()
    new = list(zip(data['省区平台'], k))
    words = new
    wordcloud.add("", words, word_size_range=[20, 100], shape="diamond")
    wordcloud.set_global_opts(title_opts=opts.TitleOpts(title="热销平台"))
    wordcloud.render("../graph/5.4(热销平台).html")
def show_word_cloud(document):
    # Punctuation to filter out
    left_words = ['.', ',', '?', '!', ';', ':', '\'', '(', ')']
    # Build the dictionary
    dic = corpora.Dictionary([document])
    # Compute the frequency of each word
    words_set = dic.doc2bow(document)
    # Build the word list and the matching frequency list
    words, frequences = [], []
    for item in words_set:
        key = item[0]
        frequence = item[1]
        word = dic.get(key=key)
        if word not in left_words:
            words.append(word)
            frequences.append(frequence)
    # Render the word cloud with pyecharts
    word_cloud = WordCloud(width=1000, height=620)
    word_cloud.add(name='Alice\'s word cloud', attr=words, value=frequences,
                   shape='circle', word_size_range=[20, 100])
    word_cloud.render()
def drawWordCloud(words, name):
    # Chart initialization options
    init_opts = opts.InitOpts(page_title=name)
    wc = WordCloud(init_opts=init_opts)
    # Title options
    title = opts.TitleOpts(title=name, pos_left="50%")
    toolbox_opts = opts.ToolboxOpts(
        orient="vertical",
        pos_bottom="40%",
        pos_left="90%",
    )
    wc.set_global_opts(
        title_opts=title,
        toolbox_opts=toolbox_opts,
    )
    wc.add(
        "",
        words,
        word_size_range=[20, 300],
        shape="diamond",
        textstyle_opts=opts.TextStyleOpts(font_family="cursive"),
    )
    wc.render("{0}.html".format(name))
def ci_yun(self, your_id, your_data):
    new = list(zip(your_id, your_data))
    words = new
    wordcloud = WordCloud(
        init_opts=opts.InitOpts(width="1920px", height='960px'))
    wordcloud.add('', words, word_size_range=[20, 100])
    wordcloud.set_global_opts(title_opts=opts.TitleOpts(title='新型冠状病毒词云图'))
    wordcloud.render(self.filename + "词云图.html")
def show_word_charts():
    word1 = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))
    word1.add("", [*zip(key_words.words, key_words.num)],
              word_size_range=[20, 200], shape=SymbolType.DIAMOND)
    word1.set_global_opts(title_opts=opts.TitleOpts('完美关系豆瓣短评词云图'),
                          toolbox_opts=opts.ToolboxOpts())
    word1.render()
def generate_word_cloud(word_list, path_name=None):
    wordcloud = WordCloud()
    wordcloud.add('词云图', tuple(zip(word_list.index, word_list)),
                  word_size_range=[20, 100])
    wordcloud.set_global_opts(title_opts=opts.TitleOpts(title='电影评论'))
    wordcloud.render(path=path_name)
    print(f'Generate word cloud file done: {path_name}')
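# The helper above zips word_list.index with its values, which suggests word_list is a
# pandas Series (for example the output of value_counts()). A hedged usage sketch; the
# sample comments and the output file name are made up for illustration only.
import pandas as pd

comments = pd.Series(["好看", "好看", "一般", "精彩", "精彩", "精彩"])  # made-up tokens
word_counts = comments.value_counts()  # index = word, value = frequency
generate_word_cloud(word_counts, path_name="movie_comments.html")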
def wordcloud_analysis(df):
    key_words = get_keyword(df)
    word = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))
    word.add('', [*zip(key_words.words, key_words.num)],
             word_size_range=[20, 200], shape='diamond')
    word.set_global_opts(title_opts=opts.TitleOpts(title='短评词云图'),
                         toolbox_opts=opts.ToolboxOpts())
    return word
def create_wordcloud(self, json_file, title='词云', column='title'):
    text = self.get_text(json_file, column=column)
    clear_word = self.split_word(text)
    data = self.word_counter(clear_word)
    wd = WordCloud()
    wd.add(series_name=title, data_pair=data, word_size_range=[40, 150])
    wd.set_global_opts(
        title_opts=opts.TitleOpts(
            title="你的文章词云", subtitle="基于你的博客数据生成",
            title_textstyle_opts=opts.TextStyleOpts(font_size=23)),
        tooltip_opts=opts.TooltipOpts(is_show=True))
    # wd.render_notebook()
    wd.render(self.path + os.sep + 'topic_wordcloud.html')
def word_cloud(self, text):
    all_words = [word.lower() for word in jieba.cut(text, cut_all=False)
                 if word not in self.stopwords and len(word) > 1]
    counter = {k: v for k, v in Counter(all_words).items() if v > 1}
    wordcloud = WordCloud()
    wordcloud.add("Word cloud", list(counter.items()),
                  word_size_range=[10, 100], shape='diamond')
    _name = uuid.uuid4()
    wordcloud.render(os.path.join(
        self.base_dir, 'templates/vis/word_cloud/{}.html'.format(_name)))
    return _name
def wc():
    myWordCloud = WordCloud("绘制词云", width=1000, height=620)
    name = ['Sam S Club', 'Macys', 'Amy Schumer', 'Jurassic World',
            'Charter Communications', 'Chick Fil A', 'Planet Fitness',
            'Pitch Perfect', 'Express', 'Home', 'Johnny Depp', 'Lena Dunham',
            'Lewis Hamilton', 'KXAN', 'Mary Ellen Mark', 'Farrah Abraham',
            'Rita Ora', 'Serena Williams', 'NCAA baseball tournament',
            'Point Break']
    value = [10000, 6181, 4386, 4055, 2467, 2244, 1898, 1484, 1112, 965,
             847, 582, 555, 550, 462, 366, 360, 282, 273, 265]
    myWordCloud.add("", name, value, word_size_range=[20, 100])
    return myWordCloud.render_embed()
def generateWoldCloud(keyPhrases):
    """
    Generate the word cloud that visualizes the key phrases.
    :param keyPhrases: in the format [(content, frequency), ...]
    :return: WordCloud
    """
    logger.debug("Start to generate the word cloud")
    myWordCloud = WordCloud()
    myWordCloud.add('', keyPhrases, shape='circle')
    return myWordCloud
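# A possible call site for the helper above, as a hedged sketch: the key phrases and the
# output file name are made up for illustration. render() writes a standalone HTML file and
# render_embed() returns the chart markup; both are standard pyecharts chart methods.
key_phrases = [("word cloud", 42), ("pyecharts", 17), ("key phrase", 9)]
chart = generateWoldCloud(key_phrases)
chart.render("key_phrases.html")
# html_snippet = chart.render_embed()  # alternative: embed the chart in another page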
def visualizationPart(model_xgb):
    # Plot feature importance.
    # Arguments: the tree model, bar height, max number of (sorted) features to show,
    # the x-axis label, and grid=False to hide the grid lines.
    # importance_type: 'weight' = how often a feature appears in the trees,
    # 'gain' = average gain when splitting on the feature,
    # 'cover' = share of samples covered by splits on the feature.
    xgb.plot_importance(model_xgb, height=0.5, importance_type='gain',
                        max_num_features=10, xlabel='Gain Split', grid=False)
    plt.show()

    # Plot the tree structure.
    # Arguments: the tree model, the tree index (0-9), yes_color = line color of "true" branches.
    xgb.to_graphviz(model_xgb, num_trees=1, yes_color='#638e5e',
                    no_color='#a40000').view()

    # Collect the importance scores.
    importance = model_xgb.get_booster().get_score(importance_type='weight', fmap='')
    tuples = [(k, importance[k]) for k in importance]
    tuples = sorted(tuples, key=lambda x: x[1], reverse=True)
    labels, values = zip(*tuples)
    print(importance)
    print(tuples)
    print(labels)
    print(values)

    # Word cloud.
    mywordcloud = WordCloud()
    # Available shapes include 'circle', 'cardioid', 'diamond', 'triangle-forward',
    # 'triangle' and 'pentagon'; the default is 'circle'.
    mywordcloud.add('', tuples, shape='pentagon')
    # Render the chart to the given path.
    mywordcloud.render('词云.html')

    # Donut (ring) pie chart.
    circular_pie_chart = (
        Pie(init_opts=opts.InitOpts(width="1600px", height="1000px"))  # chart size
        .add(
            series_name="特征重要性",
            data_pair=[list(z) for z in zip(labels, values)],
            radius=["15%", "50%"],  # inner and outer radius of the ring
            center=["30%", "40%"],  # chart position: left and top offsets
            label_opts=opts.LabelOpts(is_show=True),  # show values and percentages
        )
        .set_global_opts(legend_opts=opts.LegendOpts(
            pos_left="left", orient="vertical"))  # legend on the left, vertical layout
        .set_series_opts(tooltip_opts=opts.TooltipOpts(
            trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"))
    )
    circular_pie_chart.render('环形饼图.html')
def get_word_pic(items):
    from pyecharts.globals import SymbolType
    from pyecharts.charts import WordCloud
    my_wd = WordCloud()
    my_wd.add('title', items, word_size_range=[10, 100], shape=SymbolType.DIAMOND)
    my_wd.set_global_opts(title_opts=opts.TitleOpts(title="词频分布示意图"),
                          toolbox_opts=opts.ToolboxOpts(),
                          tooltip_opts=opts.TooltipOpts())
    return my_wd
def wordcloud(self, name=None, values=None, width=1200, height=620):
    """
    Word cloud chart.
    :param name: list of words
    :param values: list of word weights, matching `name`
    :param width: chart width, default 1200
    :param height: chart height, default 620
    :return: the word cloud chart
    """
    value = values
    # Pass width/height as keywords; positional arguments would be taken as title/subtitle.
    wordcloud = WordCloud(width=width, height=height)
    wordcloud.add('', name, value, word_size_range=[20, 100])
    self.page.add(wordcloud)
    self.page.render()
def show_word_charts(df):
    # Register key words so the tokenizer does not split them.
    key_words = ['男主', '女主', '郑京浩', '金素妍']
    for i in key_words:
        jieba.add_word(i)

    # Tokenize the user comments.
    text = df['comment'].str.cat(sep='。')  # join the comments with a full stop
    # Top 100 words by weight, returned as a list of (word, weight) tuples.
    word_num = jieba.analyse.extract_tags(text, topK=100, withWeight=True, allowPOS=())

    # Stop words: words filtered out during text processing.
    stop_words = []
    with open('files/stopwords.txt', mode='r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            stop_words.append(line.strip())
    stop_words_extend = [
        '韩剧', '男二', '真的', '还是', '这部', '完全', '一部', '撑起', '最后',
        '就算', '但是', '不错', '觉得', '这么', '简直', '男女', '实在', '那么',
        '一集', '虽然', '郑敬', '各种', '爱上', '这个', '整部', '时候', '看过',
        '有点', '居然', '不要', '评分', '主角', '素妍', '现在', '果然', '怎么',
        '部分', '在于', '...', 'bug', '30', 'CP'
    ]
    stop_words.extend(stop_words_extend)  # append adds one element, extend adds a list

    # Drop the stop words.
    word_num_selected = []
    for i in word_num:
        if i[0] not in stop_words:
            word_num_selected.append(i)

    # Draw the word cloud.
    df_keywords = pd.DataFrame(word_num_selected, columns=['word', 'num'])
    word1 = WordCloud(init_opts=opts.InitOpts(width='1000px', height='750px'))
    word1.add(series_name='关键词',
              data_pair=[*zip(df_keywords['word'], df_keywords['num'])],
              shape=SymbolType.ARROW, pos_top='0%', pos_left='0%')
    word1.set_global_opts(title_opts=opts.TitleOpts('用户评论词云图'),
                          toolbox_opts=opts.ToolboxOpts())
    word1.render('charts/word_chart.html')
def show_word_cloud(document):
    left_words = ['.', ',', '?', '!', ';', ':', '\'', '(', ')']
    dic = corpora.Dictionary([document])
    words_set = dic.doc2bow(document)
    words, frequences = [], []
    for item in words_set:
        key = item[0]
        frequence = item[1]
        word = dic.get(key=key)
        if word not in left_words:
            words.append(word)
            frequences.append(frequence)
    # word_cloud = WordCloud(width=1000, height=620)
    word_cloud = WordCloud()
    word_cloud.add("", list(zip(words, frequences)), shape='circle',
                   word_size_range=[20, 100])
    word_cloud.render('46.Alice/words.html')
def analysis_short_comment(self):
    cut_words = ""
    for line in open(self.short_comments_file, 'r', encoding='utf-8'):
        line = line.strip('\n')
        # Strip punctuation and other noise characters with a regex.
        line = re.sub(r"[A-Za-z0-9\:\·\—\,\。\“ \”\....]", "", line)
        # cut_all=False is precise mode, cut_all=True is full mode.
        seg_list = jieba.cut(line, cut_all=False)
        cut_words += (" ".join(seg_list))
    all_words = cut_words.split()
    c = Counter()
    for x in all_words:
        if len(x) > 1 and x != '\r\n':
            c[x] += 1
    words = c.most_common(500)  # the 500 most frequent words
    logger.logger.debug(words)
    wordcloud = WordCloud()
    wordcloud.add("", words, word_size_range=[5, 100], shape='circle')
    wordcloud.set_global_opts(title_opts=opts.TitleOpts(title="隐秘的角落 短评"))
    wordcloud.render(self.path + "\\short_comment.html")
def draw_view(url):
    try:
        sql = """SELECT cb_name_id, cb_hot from changed_bond where cb_hot > 5 order by cb_hot desc limit 1000"""
        cur = get_cursor(sql)
        rows = cur.fetchall()
        words = []
        for row in rows:
            words.append((row[0].replace('转债', ''), row[1]))

        chart = WordCloud(
            opts.InitOpts(height='1000px', width='1424px',
                          theme=ThemeType.MACARONS, chart_id='cb_wordcloud'))
        chart.add(
            series_name="",
            # the data pairs
            data_pair=words,
            # gap between words
            word_gap=5,
            # word size range
            word_size_range=[5, 100],
            # shape="cardioid",
            is_draw_out_of_bound=True,
            rotate_step=1,
            # A background mask image can be supplied; omit the parameter for the default background.
            # mask_image='购物车.jpg'
        )
        chart.set_global_opts(
            title_opts=opts.TitleOpts(
                title="", title_textstyle_opts=opts.TextStyleOpts(font_size=23)),
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
        html = chart.render_embed('template.html', env)
        return '可转债热度分析', views.nav_utils.build_analysis_nav_html(url), html
    except Exception as e:
        print("processing failed: ", e)
        raise e
def companyTypeWordCloud():
    with open('./datas/20年第四季度.csv', 'r') as csvfile:
        reader = csv.reader(csvfile)
        column = [row[4] for row in reader]
    object_list = []
    for word in column:  # collect every token
        object_list.append(word)
    # print(object_list)
    word_counts = collections.Counter(object_list)  # word frequency statistics
    word_counts_top10 = word_counts.most_common(10)  # the 10 most frequent words
    print(word_counts)  # print for inspection

    word_cloud = WordCloud()
    # Keep tokens of length >= 2 and take the 200 most frequent ones.
    data = dict(
        sorted({k: v for k, v in word_counts.items() if len(k) >= 2}.items(),
               key=lambda x: x[1], reverse=True)[:200])
    print(data)
    # add(series_name, data_pair): pass the (word, count) pairs as a single data_pair list.
    word_cloud.add("", list(data.items()))
    word_cloud.render('./html/companyType_20.html')
def show_word_cloud(document):
    left_words = ['.', ',', '?', '!', ';', ':', '\'', '(', ')']
    # Build the dictionary
    dic = corpora.Dictionary([document])
    # Compute the frequency of each word
    words_set = dic.doc2bow(document)
    # Build the word list and the matching frequency list
    words, frequences = [], []
    for item in words_set:
        key = item[0]
        frequence = item[1]
        word = dic.get(key=key)
        if word not in left_words:
            words.append(word)
            frequences.append(frequence)
    # Render the word cloud with pyecharts
    word_cloud = WordCloud(width=1000, height=620)
    word_cloud.add(series_name='Alice\'s word cloud', attr=words, value=frequences,
                   word_size_range=[20, 100])
    word_cloud.render()
from pyecharts.charts import WordCloud
from jieba import analyse

# Extract key words from the text with the TextRank algorithm.
textrank = analyse.textrank
text = open('111.txt', 'r', encoding='gbk').read()
keywords = textrank(text, topK=30)
list1 = []
tup1 = ()
# Print the extracted key words.
for keyword, weight in textrank(text, topK=30, withWeight=True):
    print('%s %s' % (keyword, weight))
    tup1 = (keyword, weight)  # key word and its weight
    list1.append(tup1)        # collect into the list

mywordcloud = WordCloud()
mywordcloud.add('', list1, word_size_range=[20, 100])
mywordcloud.render('wordclound.html')
# Filter out the stop words (tail of the keyword-extraction helper; its def line is not part of this excerpt).
word_num_selected = []
for i in word_num:
    if i[0] not in stop_words:
        word_num_selected.append(i)
    else:
        pass
return word_num_selected


key_words = get_comment_word(df_all)
key_words = pd.DataFrame(key_words, columns=['words', 'num'])
print(key_words.head())

from pyecharts.charts import WordCloud
word = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))
word.add("", [*zip(key_words.words, key_words.num)],
         word_size_range=[20, 200], shape='diamond')
word.set_global_opts(title_opts=opts.TitleOpts(title="职位需求关键词云图"),
                     toolbox_opts=opts.ToolboxOpts())

# Build the chart page.
from pyecharts.charts import Page
page = Page()
page.add(regBar, slBar, shBar, eduPie, word)
page.render('C++工作区域分布.html')
def analyze_signature():
    # Collect the personal signatures.
    data = []
    for user in friends:
        # Strip the WeChat emoji markup, i.e. <span class.*?</span>, by replacing
        # the regex matches in user.signature with an empty string.
        new_signature = re.sub(re.compile(r"<span class.*?</span>", re.S), "",
                               user.signature)
        # Keep only single-line signatures; drop multi-line ones.
        if (len(new_signature.split('\n')) == 1):
            data.append(new_signature)

    # Join the signature list into one string.
    data = '\n'.join(data)

    # Tokenize by calling a web API.
    # jieba or snownlp is not used here because the result could not be packaged
    # into an exe file, or the packaged file would be very large.
    postData = {
        'data': data,
        'type': 'exportword',
        'arg': '',
        'beforeSend': 'undefined'
    }
    response = post('http://life.chacuo.net/convertexportword', data=postData)
    data = response.text.replace('{"status":1,"info":"ok","data":["', '')
    # Decode the response.
    data = data.encode('utf-8').decode('unicode_escape')
    # Post-process the returned JSON string of tokenization results.
    data = data.split("=====================================")[0]
    # Split the result into a list; the tokens are separated by two spaces.
    data = data.split('  ')

    # Remove meaningless tokens.
    stop_words_list = [
        ',', ',', '、', 'the', 'a', 'is', '…', '·', 'э', 'д', 'э', 'м', 'ж',
        'и', 'л', 'т', 'ы', 'н', 'з', 'м', '…', '…', '…', '…', '…', '、', '.',
        '。', '!', '!', ':', ':', '~', '|', '▽', '`', 'ノ', '♪', 'a', 'b',
        'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
        'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D',
        'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
        'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5',
        '6', '7', '8', '9', '\'', '‘', '’', '“', '”', '的', '了', '是', '你',
        '我', '他', '她', '=', '\r', '\n', '\r\n', '\t', '以下关键词', '[', ']',
        '{', '}', '(', ')', '(', ')', 'span', '<', '>', 'class', 'html', '?',
        '就', '于', '下', '在', '吗', '嗯'
    ]
    tmp_data = []
    for word in data:
        if (word not in stop_words_list):
            tmp_data.append(word)
    data = tmp_data

    # Count word frequencies into the dict signature_dict.
    signature_dict = {}
    for index, word in enumerate(data):
        print(u'正在统计好友签名数据,进度%d/%d,请耐心等待……' % (index + 1, len(data)))
        if (word in signature_dict.keys()):
            signature_dict[word] += 1
        else:
            signature_dict[word] = 1

    # Draw the word cloud.
    name = [x for x in signature_dict.keys()]
    value = [x for x in signature_dict.values()]
    wordcloud = WordCloud('微信好友个性签名词云图')
    wordcloud.add("", name, value, shape='star', word_size_range=[1, 100])
    wordcloud.render('data/好友个性签名词云.html')
# Fragment: word-frequency counting. The enclosing loop and the
# `if w not in word_dict:` test precede this excerpt.
        word_dict[w] = 1
    else:
        word_dict[w] += 1

stop = pd.read_csv("Chinese_Stopwords.txt", encoding='utf-8', header=None)
stop.columns = ['word']
stop = [' '] + list(stop.word)
for i in range(len(stop)):
    if stop[i] in word_dict:
        word_dict.pop(stop[i])

word_dict_sort = sorted(word_dict.items(), key=lambda x: x[1])
words = word_dict_sort[-100:]
wordcloud = WordCloud()
wordcloud.add("", words, word_size_range=[20, 100], shape='circle')
wordcloud.render_notebook()


def speech_cut(speech):
    word_list = []
    for word in word_dict_sort:
        words = pseg.cut(word[0])
        for w, flag in words:
            if flag == speech:
                word_list.append(word)
    return word_list


verb_word = speech_cut('v')
wordcloud = WordCloud()
from jieba.analyse import *
from pyecharts.charts import WordCloud
import os

os.chdir(r"E:\work\tpp\samsung\2021年\07月\W27")
with open('新建文本文档.txt', encoding="utf-8") as f:
    data = f.read()

dataAnlysed = []
for keyword, weight in textrank(data, withWeight=True, topK=11):
    if keyword == "程序":
        keyword = "小程序"
    dataAnlysed.append((keyword, weight))

dataAnlysed1 = [x for x in dataAnlysed if not (x[0] in ["督导"])]
# dataAnlysed1 = [x for x in dataAnlysed if not (x[0] in ["对比","方面","苹果","用户","手机","介绍","支持","没有","效果","优势"] )]
# # print(dataAnlysed)
print(dataAnlysed1)

wordcloud = WordCloud()
wordcloud.add("", dataAnlysed1, shape="cardioid",
              word_size_range=[20, 100], rotate_step=180)
wordcloud.render('q1.html')
("面相分析", 47), ("手相", 32), ("公益", 90), ("花鸟市场", 1446), ("风水", 93), ("面相分析", 47), ("手相", 32), ("公益", 90), ("花鸟市场", 1446), ("风水", 93), ("面相分析", 47), ("手相", 32), ("公益", 90), ("花鸟市场", 1446), ("风水", 93), ("面相分析", 47), ("手相", 32), ("公益", 90), ("花鸟市场", 1446), ("风水", 93), ("面相分析", 47), ("手相", 32), ("公益", 90), ] ''' # 创建词云图 wc = WordCloud() # 使用词云图的添加功能(需要呈现的内容和图片) wc.add("", words, word_size_range=[12, 12], mask_image="heart.jpg") wc.render()
from pyecharts.charts import WordCloud
import pyecharts.options as opts
import pandas as pd

post_data = pd.read_csv('post_data.csv')
post_data2 = post_data.groupby(by=['category']).agg({
    'views': sum
}).reset_index()
data = [
    list(e) for e in zip(post_data2['category'],
                         [str(e) for e in post_data2['views']])
]

wordcloud = WordCloud()
wordcloud.add(series_name='', data_pair=data, word_size_range=[20, 100])
wordcloud.render('wordcloud.html')
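# Most of the snippets above share the same pyecharts (v1+) pattern:
# add(series_name, data_pair) takes a list of (word, weight) pairs, set_global_opts()
# configures title/tooltip, and render() writes an HTML file. A minimal, self-contained
# sketch of that pattern for reference; the data pairs and file name below are made up.
from pyecharts import options as opts
from pyecharts.charts import WordCloud

pairs = [("python", 100), ("pyecharts", 60), ("wordcloud", 45), ("jieba", 30)]  # made-up data

chart = WordCloud()
chart.add(series_name="", data_pair=pairs, word_size_range=[20, 100], shape="circle")
chart.set_global_opts(title_opts=opts.TitleOpts(title="demo"),
                      tooltip_opts=opts.TooltipOpts(is_show=True))
chart.render("demo_wordcloud.html")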