def create_wc(words_in):
    """Build a grey-recoloured WordCloud from word frequencies.

    Parameters
    ----------
    words_in : list of tuple
        Words to plot, with their corresponding frequencies.

    Returns
    -------
    wc : WordCloud() object
        The generated word cloud after recolouring.
    """
    # Rendering options for the cloud (transparent RGBA canvas, 800x400)
    cloud_options = {
        'background_color': None,
        'mode': 'RGBA',
        'width': 800,
        'height': 400,
        'prefer_horizontal': 1,
        'relative_scaling': 0.5,
        'min_font_size': 25,
        'max_font_size': 80,
    }
    blank_cloud = WordCloud(**cloud_options)
    wc = blank_cloud.generate_from_frequencies(words_in)

    # Replace the default colours with the grey colour function,
    # using a fixed random_state
    wc.recolor(color_func=_grey_color_func, random_state=3)

    return wc
def get_tagcloud(self, tags, tag_limit=None):
    """Render the most frequent tags as a PNG word cloud, base64-encoded.

    Parameters
    ----------
    tags : sequence of dict
        Each entry is read through its ``'tag_name'`` and ``'count'`` keys.
    tag_limit : int, optional
        Maximum number of tags to draw. All tags are drawn when falsy.

    Returns
    -------
    Whatever ``encode_file_to_base64`` returns for the saved PNG when
    called with the ``"data:image/png;base64,"`` prefix.

    Notes
    -----
    Empty ``tags`` is not handled here: at the latest, ``max()`` over the
    empty count list raises ``ValueError``.
    """
    tag_limit = tag_limit or len(tags)
    # Keep only the `tag_limit` most frequent tags
    top_tags = sorted(tags, key=lambda kv: -kv['count'])[:tag_limit]
    tag_dict = {t['tag_name']: t['count'] for t in top_tags}

    # Floor division keeps the fallback height an integer on Python 3
    # (plain "/" would produce a float there).
    height = self.tagcloud_height or 30 * len(top_tags) // 2 + 10
    font_path = os.path.join(settings.STATIC_ROOT, 'fonts', 'OpenSans-Regular.ttf')

    # Generate a word cloud image
    wordcloud = WordCloud(
        background_color='white',
        min_font_size=10,
        max_font_size=60,
        width=self.tagcloud_width,
        height=height,
        font_path=font_path,
    ).generate_from_frequencies(tag_dict)

    # Split the range [0, max count] into one band per available colour;
    # thresholds are stored from the highest band down to 0.
    tag_counts = [t['count'] for t in top_tags]
    palette_size = len(self.color_selection)
    step = float(max(tag_counts)) / palette_size
    thresholds = list(reversed([int(round(i * step)) for i in range(palette_size)]))

    def get_color(word, font_size, position, orientation, random_state=None, **kwargs):
        # First (i.e. highest) threshold the word's count reaches picks the colour
        index = next((i for i, t in enumerate(thresholds) if tag_dict[word] >= t), 0)
        return self.color_selection[index]

    wordcloud.recolor(color_func=get_color)

    image = wordcloud.to_image()
    filepath = self.get_write_to_path(ext="png")
    image.save(filepath)
    return encode_file_to_base64(filepath, "data:image/png;base64,")
def make_word_cloud(product, sentiment):
    """Write positive and negative phrase word clouds for ``product``.

    Work is only done when ``sentiment`` equals ``"all"``; in that case the
    two image paths ``(positive_name, negative_name)`` are returned.
    Any other value returns ``None``.
    """
    if sentiment != "all":
        return None

    def _weighted_phrases(frame):
        # Relabel the rows 0..n-1 so each row can be looked up by position
        frame.index = range(0, len(frame))
        return [(frame["vocab"][row].upper(), float(frame["count"][row]))
                for row in range(0, len(frame))]

    pos, neg = get_top_five_phrases(product, sentiment)
    pos_words_array = _weighted_phrases(pos)
    neg_words_array = _weighted_phrases(neg)

    # One WordCloud instance is reused for both images
    wc = WordCloud(background_color="white", max_words=2000,
                   max_font_size=300, random_state=42)

    # generate word cloud for positive
    positive_name = '../app/static/img/pos_wordcloud.png'
    wc.generate_from_frequencies(pos_words_array)
    wc.recolor(color_func=pos_color_func, random_state=3)
    wc.to_file(positive_name)

    # generate word cloud for negative
    negative_name = '../app/static/img/neg_wordcloud.png'
    wc.generate_from_frequencies(neg_words_array)
    wc.recolor(color_func=neg_color_func, random_state=3)
    wc.to_file(negative_name)

    return positive_name, negative_name
def test_recolor_too_small():
    """recolor() must raise when the colour image is smaller than the canvas."""
    # 20x20 colour source versus a 30x30 word-cloud canvas
    small_image = Image.new('RGB', size=(20, 20))
    colouring = np.array(small_image)

    cloud = WordCloud(width=30, height=30, random_state=0, min_font_size=1)
    wc = cloud.generate(THIS)

    image_colors = ImageColorGenerator(colouring)
    expected_message = 'ImageColorGenerator is smaller than the canvas'
    with pytest.raises(ValueError, match=expected_message):
        wc.recolor(color_func=image_colors)
def word_cloud(csv_file, stopwords_path, pic_path):
    """Draw a mask-shaped word cloud from the ``content`` column of a CSV.

    Parameters
    ----------
    csv_file : str
        CSV base name without the ``.csv`` extension, resolved against the
        current working directory. It is also used to name the output image.
    stopwords_path : str
        UTF-8 text file whose lines are added to the stopword set.
    pic_path : str
        Image read with ``plt.imread`` and used both as the cloud mask and
        as the colour source.

    The cloud is shown with matplotlib and saved as
    ``<csv_file>_词云图.png``.
    """
    pic_name = csv_file + "_词云图.png"
    # os.path.join replaces the hand-built "\\" separator (and the
    # backslash doubling), so the path is also valid outside Windows.
    csv_path = os.path.join(os.path.abspath(os.curdir), csv_file + ".csv")
    d = pd.read_csv(csv_path, engine='python', encoding='utf-8')

    content = []
    for entry in d['content']:
        try:
            entry = translate(entry)
        except AttributeError:
            # Entries translate() cannot process are skipped
            continue
        content.append(entry)

    # NOTE(review): the *string representation of the list* is segmented
    # (brackets and quotes included); kept unchanged to preserve output.
    comment_after_split = jieba.cut(str(content), cut_all=False)
    wl_space_split = " ".join(comment_after_split)

    backgroud_Image = plt.imread(pic_path)

    stopwords = STOPWORDS.copy()
    # The with-block closes the file; no explicit close() is needed
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for line in f:
            stopwords.add(line.strip('\n'))

    # Raw string: "\s" is an invalid escape sequence in a normal literal
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image, font_path=r"C:\simhei.ttf",
                   stopwords=stopwords, max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)

    # Colour the words from the mask image
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)

    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)
def wordCloud(text_array,name,keyword=""):
    """Build a mask-coloured word cloud plus an HTML legend of its words.

    Parameters
    ----------
    text_array : iterable of str
        Texts that are concatenated into the cloud's input.
    name : str
        Base name of the PNG written under ``static/tool/img/``; also used
        in the CSS class of each legend link.
    keyword : str, optional
        When non-empty, its second space-separated token is taken and only
        texts containing that token are kept.

    Returns
    -------
    tuple
        ``(img_tag, list_html)``: an ``<img>`` tag embedding the PNG as a
        base64 data URI, and ``<li>`` items listing up to 50 words in
        layout order, each coloured like the word in the cloud.
    """
    # "!=" instead of "is not": identity comparison with a literal is
    # implementation-dependent.
    if keyword != "":
        # NOTE(review): a keyword without a space raises IndexError here;
        # confirm callers always pass "<prefix> <term>".
        keyword = keyword.split(" ")[1]
        text_array = [text for text in text_array if keyword in text]

    cloud_text = "".join(text + " " for text in text_array)

    # Work on a copy so the shared STOPWORDS set is not modified
    stopwords = set(STOPWORDS)
    stopwords.update(['police', 'traffic', 'sir'])

    image_mask = os.path.join(BASE_DIR, 'static/tool/img/nebula.png')
    coloring = imread(image_mask)
    wordcloud = WordCloud(stopwords=stopwords,background_color="white",mask=coloring,ranks_only=True,max_words=50).generate(cloud_text)
    filename = os.path.join(BASE_DIR, 'static/tool/img/'+name+'.png')
    image_colors = ImageColorGenerator(coloring)
    wordcloud.recolor(color_func=image_colors)
    wordcloud.to_file(filename)

    # Close the file deterministically instead of leaking the handle.
    # NOTE(review): str.encode('base64') exists on Python 2 only.
    with open(filename, 'rb') as image_file:
        data_uri = image_file.read().encode('base64').replace('\n', '')
    img_tag = '<img src="data:image/png;base64,{0}" style="height:400px;">'.format(data_uri)

    # Map rank (1-based, layout order) -> word and hex colour.
    # Assumes the last field of a layout entry is an "rgb(r, g, b)" string
    # and the first is a (word, ...) tuple -- TODO confirm for the
    # installed wordcloud version.
    words_colours = {}
    for count, lo in enumerate(wordcloud.layout_, 1):
        channels = lo[len(lo)-1][4:][:-1].split(',')
        color_hex = '#%02x%02x%02x' % tuple(int(x) for x in channels)
        words_colours[count] = {'word': lo[0][0], 'color': color_hex}

    # List every available word up to 50 (previously the last entry was
    # dropped whenever fewer than 51 words were laid out).
    shown = min(50, len(words_colours))
    items = []
    for i in range(1, shown + 1):
        items.append(
            '<li class="list-group-item" ><a class="cloud-key-'+name+'" href="#" style="color:'+words_colours[i]['color']+'">'
            + "#"+str(i)+" "+words_colours[i]['word']+'</a></li>'
        )
    list_html = "".join(items)
    return (img_tag,list_html)
def generateWordCloud(node, contribs, maskImg=None, wordsToShow=100, normalize=True, normMin=0, normMax=1):
    """Build a grey-recoloured word cloud from a node's strongest contributions.

    Parameters
    ----------
    node : hashable
        Key into ``contribs``.
    contribs : mapping
        Maps a node to a sequence of ``(word, value)`` pairs that should be
        sorted from high to low value.
    maskImg : optional
        Passed to ``WordCloud`` as ``mask``.
    wordsToShow : int
        Maximum number of words taken from the dominant end of the list.
    normalize : bool
        When true the selected pairs go through ``normalizeContrib``.
    normMin, normMax :
        Forwarded to ``normalizeContrib``.

    Returns
    -------
    WordCloud
        The generated cloud, recoloured with ``gray_color_func``.

    Notes
    -----
    Writes one entry per selected word into ``wordColorMap`` (defined
    outside this block). An empty contribution list raises ``IndexError``.
    """
    contrib = contribs[node]
    limit = min(len(contrib), wordsToShow)

    # contrib should be sorted high to low, so the two ends carry the
    # extreme values; compare their magnitudes to find the dominant side.
    maxVal = abs(contrib[0][1])
    minVal = abs(contrib[-1][1])

    if maxVal > minVal:
        # use front
        newContrib = [contrib[i] for i in range(limit)]
    else:
        # use back: walk from the last element inwards (the previous code
        # walked the first `limit` elements instead) and flip the sign,
        # since the most significant values were negative.
        last = len(contrib) - 1
        newContrib = [(contrib[i][0], -1 * contrib[i][1])
                      for i in range(last, last - limit, -1)]

    # Always continue with the selected subset, also when not normalizing
    contrib = newContrib
    if normalize:
        contrib = normalizeContrib(contrib, normMin, normMax)

    # Assign each word a colour value derived from its contribution
    for word, val in contrib:
        wordColorMap[word] = int(round(200*(1-val)))

    # generate text
    text = generateText(contrib, min(len(contrib), wordsToShow))

    # gen word cloud
    wc = WordCloud(background_color="white", max_words=2000, mask=maskImg)
    wc.generate(text)
    wc.recolor(color_func=gray_color_func)
    return wc
def wd_cloud(request):
    """Django view: report playlists matching ``queryword`` and redraw the cloud image.

    Reads the ``queryword`` GET parameter, looks up ``SongList`` rows whose
    name contains it, fetches each playlist's track list over HTTP and
    reports songs that occur in more than one of them. A word cloud built
    from all playlist names is written to ``static/images/cloud.png``.

    Returns a ``JsonResponse`` with the keys ``lists_contain_queryword``
    and ``songs_appaer_manytimes``.
    """
    base_path = path.dirname(__file__)
    font_path = path.join(base_path, 'static/fonts/simsun.ttc')

    # Concatenate every playlist name into one long string
    text = ','.join([list_.list_name for list_ in SongList.objects.all()])
    # For more on jieba usage, see the original author's GitHub
    topK = 160
    tags = jieba.analyse.extract_tags(text, topK=topK, withWeight=True)
    text = ','.join([tag[0] for tag in tags])

    queryword = request.GET.get('queryword')
    # objects.filter returns a QuerySet while objects.get returns a single
    # object, hence filter here; exclude() would select the complement.
    res = SongList.objects.filter(list_name__contains=queryword)

    # Check whether any song appears in several of the matching playlists
    links = ['http://music.163.com/api/playlist/detail?id=' + str(r.list_id) for r in res]
    json_texts = [json.loads(requests.get(link).text) for link in links]

    id_lists = list()
    name_dict = dict()
    for payload in json_texts:
        for track in payload['result']['tracks']:
            song_id = track['id']
            song_name = track['name']
            id_lists.append(song_id)
            name_dict[song_id] = song_name

    # songs_appear_manytimes is a list of tuples, each holding a song's
    # name, the number of times it appears and its id
    songs_appear_manytimes = list()
    d = collections.Counter(id_lists)
    for k in d:
        if d[k] <= 1:
            continue
        songs_appear_manytimes.append((name_dict[k], d[k], k))
        print(k)
        print(d[k])
        print(name_dict[k])

    json_response = {
        'lists_contain_queryword': [(r.list_name, r.list_link) for r in res],
        'songs_appaer_manytimes': songs_appear_manytimes,
    }
    pprint(json_response)

    # Mask: cropped, rotated logo; colours: a second rotated image
    region = (32, 107, 992, 661)
    mask_image = Image.open(path.join(base_path, "static/images/nike-logo.jpg"))
    mask = np.array(mask_image.crop(region).rotate(90))
    style_image = Image.open(path.join(base_path, "static/images/a.png"))
    mulan_style = np.array(style_image.rotate(90))
    color_style = ImageColorGenerator(mulan_style)

    wordcloud = WordCloud(font_path=font_path, mask=mask, background_color='white',
                          max_words=400, width=400, height=800, max_font_size=50,
                          min_font_size=20, relative_scaling=.9, scale=2.0)
    wordcloud = wordcloud.generate(text)
    wordcloud.recolor(color_func=color_style)

    cloud_img_path = path.join(base_path, "static/images/cloud.png")
    wordcloud.to_file(cloud_img_path)
    return JsonResponse(json_response)
def test_check_errors():
    """Using a WordCloud before generate() must fail with a helpful error.

    ``to_html`` is expected to raise ``NotImplementedError``; converting to
    an array or recolouring is expected to raise a ``ValueError`` whose
    message contains "call generate".
    """
    wc = WordCloud()
    assert_raises(NotImplementedError, wc.to_html)

    try:
        np.array(wc)
    except ValueError as e:
        assert_true("call generate" in str(e))
    else:
        # Failure message typo fixed ("didn'm" -> "didn't")
        raise AssertionError("np.array(wc) didn't raise")

    try:
        wc.recolor()
    except ValueError as e:
        assert_true("call generate" in str(e))
    else:
        raise AssertionError("wc.recolor didn't raise")
def test_check_errors():
    """A WordCloud used before generate() must raise informative errors."""
    cloud = WordCloud()

    # HTML export raises NotImplementedError
    with pytest.raises(NotImplementedError):
        cloud.to_html()

    # Array conversion before generate()
    try:
        np.array(cloud)
    except ValueError as err:
        assert "call generate" in str(err)
    else:
        raise AssertionError("np.array(wc) didn't raise")

    # Recolouring before generate()
    try:
        cloud.recolor()
    except ValueError as err:
        assert "call generate" in str(err)
    else:
        raise AssertionError("wc.recolor didn't raise")
def make_wordle_from_mallet(word_weights_file, num_topics, words, TopicRanksFile, outfolder, font_path, dpi):
    """Generate one word-cloud image ("wordle") per topic from Mallet output.

    Parameters
    ----------
    word_weights_file, words :
        Forwarded to ``get_wordlewords`` to obtain the text of a topic.
    num_topics : int
        Topics ``0 .. num_topics - 1`` are rendered.
    TopicRanksFile :
        Forwarded to ``get_topicRank`` for the rank shown in the title.
    outfolder : str
        Directory receiving ``wordle_tpNNN.png``; created if missing.
    font_path : str
        Font passed to ``WordCloud``.
    dpi :
        Resolution passed to ``plt.savefig``.
    """
    print("\nLaunched make_wordle_from_mallet.")

    # The output folder does not depend on the topic: create it once
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    for topic in range(0, num_topics):
        ## Gets the text for one topic.
        text = get_wordlewords(words, word_weights_file, topic)
        wordcloud = WordCloud(font_path=font_path, width=1600, height=1200,
                              background_color="white", margin=4).generate(text)
        rank = get_topicRank(topic, TopicRanksFile)
        figure_title = "topic "+ str(topic) + " ("+str(rank)+"/"+str(num_topics)+")"

        # Only the recoloured cloud ends up visible, so it is drawn once;
        # the earlier default-colour render and the duplicate imshow calls
        # were covered by the final image and have been dropped.
        wordcloud.recolor(color_func=get_color_scale, random_state=3)
        plt.imshow(wordcloud)
        plt.title(figure_title, fontsize=30)
        plt.axis("off")

        ## Saving the image file.
        figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png"
        # os.path.join works whether or not outfolder ends with a separator
        plt.savefig(os.path.join(outfolder, figure_filename), dpi=dpi)
        plt.close()
    print("Done.")
def generateWordCloud(node, contribs, wordsToShow=None, normalize=True, normMin=0, normMax=1):
    """Render, save and display a circle-masked word cloud for one node.

    The node's contributions (optionally passed through
    ``normalizeContrib``) are turned into text with ``generateText``.
    The cloud is written to ``node.png`` next to this module and shown
    with matplotlib.
    """
    contrib = contribs[node]
    if normalize:
        contrib = normalizeContrib(contrib, normMin, normMax)

    # Text that feeds the word cloud
    text = generateText(contrib, wordsToShow)

    # Mask image lives next to this module
    module_dir = path.dirname(__file__)
    mask_path = path.join(module_dir, "black_circle_mask_whitebg.png")
    circle_mask = imread(mask_path)

    # Build the cloud and store it to file
    wc = WordCloud(background_color="white", max_words=2000, mask=circle_mask)
    wc.generate(text)
    wc.to_file(path.join(module_dir, "node.png"))

    # Display; the custom colour function is currently switched off
    useColorFunc = False  # normalize
    if not useColorFunc:
        plt.imshow(wc)
    else:
        plt.imshow(wc.recolor(color_func=pos_neg_color_func))
    plt.axis("off")
    plt.show()
def test_recolor():
    """recolor() keeps word positions, changes colours, and is repeatable when seeded."""
    wc = WordCloud(max_words=50)
    wc.generate(THIS)

    array_before = wc.to_array()
    wc.recolor()
    array_after = wc.to_array()

    # the same pixels must be filled before and after recolouring
    filled_before = array_before.sum(axis=-1) != 0
    filled_after = array_after.sum(axis=-1) != 0
    assert_array_equal(filled_before, filled_after)

    # ... while the colours themselves must differ
    total_difference = np.abs(array_before - array_after).sum()
    assert_greater(total_difference, 10000)

    # recolouring twice with the same random_state gives the same image
    wc.recolor(random_state=10)
    wc_again = wc.to_array()
    recoloured = wc.recolor(random_state=10)
    assert_array_equal(wc_again, recoloured)
def run_yt():
    """Produce the YouTube word cloud and bar charts.

    Writes ``popularWordsYT.png`` (logo-coloured word cloud),
    ``wordUseCompYT.png`` (top-10 title words compared against the
    Dailymotion counts) and ``barGraphYT.png`` (scaled theta values).
    """
    yt = ds.acquire_youtube()
    ytimg = imread("ytlogo.png")

    # Word cloud shaped and coloured by the logo image
    wc = WordCloud(mask=ytimg)
    image_colors = ImageColorGenerator(ytimg)
    wc.generate(word_list_yt(ds.mean(yt[0]), ds.standard_deviation(yt[0])))
    plt.imshow(wc.recolor(color_func = image_colors))
    plt.axis("off")
    plt.savefig('popularWordsYT.png', bbox_inches = 'tight', dpi = 200)

    # Ten most frequent words, scaled per 1000 videos
    words, vidcount = ds.word_count_yt('title', ds.mean(yt[0]), ds.standard_deviation(yt[0]))
    labels = sorted(words, key=words.get, reverse=True)[:10]
    data1 = [1000 * words[w]/vidcount for w in labels]

    # Same words looked up in the Dailymotion counts
    words, vidcount = ds.word_count_dailymotion( 0, 0 )
    data2 = [1000 * words[w]/vidcount for w in labels]
    create_dualbargraph(data1, data2, labels, 'wordUseCompYT.png')

    # Scale the thetas in place before plotting them
    Theta = da.yt_thetas()
    for position, value in enumerate(Theta):
        Theta[position] = value/10000
    print(Theta)
    create_bargraph(Theta,('duration', 'date created', 'y-intercept'), 'barGraphYT.png')
def run():
    """Show and save a word cloud of ``words2.txt`` shaped by ``bg.jpg``.

    The text is segmented with jieba, single-character tokens are dropped,
    and the cloud takes both its mask and its colours from ``bg.jpg``.
    The result is displayed and written to ``words_result3.png``.
    """
    # Context manager so the file handle is closed (it used to be leaked)
    with open(u'words2.txt', 'r') as source:
        raw_text = source.read()

    # Keep only tokens longer than one character
    kept = [w for w in jieba.cut(raw_text) if len(w) > 1]
    text = r' '.join(kept)

    bg = np.array(Image.open('bg.jpg'))
    wordcloud = WordCloud(
        background_color = 'white',
        mask = bg,
        font_path='C:/Windows/Fonts/simkai.ttf',
    ).generate(text)

    # Recolour the words from the background image
    image_colors=ImageColorGenerator(bg)
    plt.imshow(wordcloud.recolor(color_func=image_colors))
    plt.axis('off')
    plt.show()
    wordcloud.to_file('words_result3.png')
def create_wordcloud(corpus, output, stopword_dict):
    """Save a word cloud of the 100 leading frequency entries of ``corpus``.

    Frequencies come from ``build_lex_dic`` / ``get_total_words`` /
    ``build_freq_list``; the figure is written to ``output``.
    """
    lex_dic = build_lex_dic(corpus, stopword_dict=stopword_dict)
    total_words = get_total_words(lex_dic)
    ordered_freq_list = build_freq_list(lex_dic, total_words)

    # Frameless figure whose single axes fills the whole canvas
    fig = plt.figure(figsize=(10, 8), frameon=False)
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)

    top_entries = ordered_freq_list[0:100]
    cloud = WordCloud(
        width=1000,
        height=800,
        max_words=100,
        background_color='white',
        relative_scaling=0.7,
        random_state=15,
        prefer_horizontal=0.5,
    )
    wordcloud = cloud.generate_from_frequencies(top_entries)
    wordcloud.recolor(random_state=42, color_func=my_color_func)

    ax.imshow(wordcloud)
    fig.savefig(output, facecolor='white')
def draw_tag_cloud(users_tokens):
    """Display a word cloud of token frequencies, shaped and coloured by ``pics/trump.png``."""
    from PIL import Image
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, ImageColorGenerator

    # The same image serves as mask and as colour source
    mask_image = Image.open("pics/trump.png")
    trump_coloring = np.array(mask_image)

    freqs = get_full_frequencies(users_tokens)
    freq_pairs = freqs.items()

    wc = WordCloud(max_words=2000, mask=trump_coloring,
                   max_font_size=40, random_state=42)
    wc.generate_from_frequencies(freq_pairs)

    # Recolour the cloud from the image, then show it
    image_colors = ImageColorGenerator(trump_coloring)
    recoloured = wc.recolor(color_func=image_colors)
    plt.imshow(recoloured)
    plt.axis("off")
    plt.show()
def wcloud(wf, color, save_as=None):
    """Plot a word cloud of the frequencies `wf`, coloured by `color`.

    Parameters
    ----------
    wf : list
        (token, value) tuples
    color : function
        colour function from `wc_colors.py`
    save_as : str
        filename; the figure is only saved when this is truthy

    Returns
    -------
    None
    """
    cloud_options = dict(
        background_color=None,
        mode='RGBA',
        width=2400,
        height=1600,
        relative_scaling=0.5,
        font_path='/Library/Fonts/Futura.ttc',
    )
    wc = WordCloud(**cloud_options)
    wc.generate_from_frequencies(wf)

    plt.figure()
    recoloured = wc.recolor(color_func=color, random_state=42)
    plt.imshow(recoloured)
    plt.axis("off")

    if not save_as:
        return
    plt.savefig(save_as, dpi=300, transparent=True)
def make_wordcloud(user_name, sent_words, sentiment):
    """Write a sentiment word cloud image for ``user_name``.

    ``sent_words`` holds ``(word, weight)`` pairs. Each word is repeated
    ``int(100 * |weight|) - 1`` times in the text fed to the cloud.
    Nothing is written (and ``None`` is returned) when that text is empty.
    The image goes to ``static/img/<sentiment>_wordcloud_<user_name>.png``.
    """
    # Leading-space-separated repetitions, exactly as many as range(1, n) yields
    text = "".join(
        " " + word[0]
        for word in sent_words
        for _ in range(1, int(100 * np.abs(word[1])))
    )
    if not text:
        return

    wc = WordCloud(background_color="white", max_words=10, margin=10,
                   random_state=1).generate(text)

    # 'pos' gets the positive palette, anything else the negative one
    color_func = pos_color_func if sentiment == 'pos' else neg_color_func
    wc.recolor(color_func=color_func, random_state=3)

    target = os.path.join('static/img/', sentiment+"_wordcloud_"+user_name+".png")
    wc.to_file(target)
def run_dm():
    """Produce the Dailymotion word cloud and bar charts.

    Writes ``popularWordsDM.png`` (logo-coloured word cloud),
    ``wordUseCompDM.png`` (top-10 words compared against the unfiltered
    counts) and ``barGraphDM.png`` (theta values).
    """
    dm = ds.acquire_dailymotion()
    dmimg = imread("dmlogo.png")

    # Word cloud shaped and coloured by the logo image
    wc = WordCloud(mask=dmimg)
    image_colors = ImageColorGenerator(dmimg)
    wc.generate(word_list_dailymotion(ds.mean(dm[0]), ds.standard_deviation(dm[0])))

    # Plot the recoloured cloud and save it
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    plt.savefig('popularWordsDM.png', bbox_inches = 'tight', dpi = 200)

    # Ten most frequent words, scaled per 1000 videos
    words, vidcount = ds.word_count_dailymotion(ds.mean(dm[0]), ds.standard_deviation(dm[0]))
    labels = sorted(words, key=words.get, reverse=True)[:10]
    data1 = [1000 * words[w]/vidcount for w in labels]

    # Same words looked up in the counts obtained with arguments (0, 0)
    words, vidcount = ds.word_count_dailymotion( 0, 0 )
    data2 = [1000 * words[w]/vidcount for w in labels]

    create_dualbargraph(data1, data2, labels, 'wordUseCompDM.png')
    create_bargraph(da.dm_thetas(),('fans','duration','date created', 'y-intercept'), 'barGraphDM.png')
def cloudplot(person):
    """Return a PNG word cloud of the email text selected for ``person``.

    ``+`` characters in ``person`` are replaced by spaces before the text
    is fetched with ``GetTextRange``. The cloud is masked and coloured by
    ``static/img/hillarylogo.jpg`` and sent back with ``send_file`` as
    ``image/png``.
    """
    person = re.sub(r'\+', ' ', person)
    text = GetTextRange(Emails, person)
    text = rmBoring(rmNonAlpha(text)).decode('ascii', 'ignore')

    plt.clf()
    d = path.dirname(path.abspath(__file__))
    hilcolor = np.array(Image.open(path.join(d, "static/img/hillarylogo.jpg")))

    # set.add() returns None, so `stopwords=STOPWORDS.add("said")` passed
    # None to WordCloud. Build the set explicitly instead, which also
    # leaves the shared STOPWORDS set untouched.
    stopwords = set(STOPWORDS)
    stopwords.add("said")

    wc = WordCloud(background_color="white", max_words=150, mask=hilcolor,
                   stopwords=stopwords,
                   max_font_size=80, random_state=42,
                   relative_scaling = 0.5)
    wc.generate(text)

    # Colour the words from the logo image
    image_colors = ImageColorGenerator(hilcolor)
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")

    # Render the current figure into an in-memory buffer for the response
    fig = plt.gcf()
    img = StringIO.StringIO()
    fig.savefig(img)
    img.seek(0)
    return send_file(img, mimetype='image/png')
def generate_word_cloud(text, mask_filename):
    """Write a grey word cloud of ``text`` to a temporary PNG and return its path.

    Parameters
    ----------
    text : str
        Text the cloud is generated from.
    mask_filename : str
        Mask image, resolved relative to this module's directory.

    Returns
    -------
    str
        Path of the temporary ``*-wordcloud.png`` file. The file is not
        removed here; the caller owns it.
    """
    import os  # local import: needed only to close the mkstemp descriptor

    d = path.dirname(__file__)
    mask = imread(path.join(d, mask_filename))

    # Extra stopwords on top of the defaults
    stopwords = STOPWORDS.copy()
    stopwords.update(("info", "meetbot", "supybot"))

    wc = WordCloud(max_words=1000, mask=mask, stopwords=stopwords, margin=10,
                   random_state=1).generate(text)
    wc.recolor(color_func=grey_color_func, random_state=3)

    # mkstemp returns an open OS-level descriptor; close it so it is not
    # leaked. wc.to_file() reopens the file by name.
    fd, tmpfilename = tempfile.mkstemp('-wordcloud.png')
    os.close(fd)
    wc.to_file(tmpfilename)
    return tmpfilename
def generatewordcloud(freqTable, inputImageFileName, outputImageFileName):
    """Write a word cloud of ``freqTable`` shaped and coloured by an image.

    Parameters
    ----------
    freqTable :
        Word frequencies passed to ``WordCloud.generate_from_frequencies``.
    inputImageFileName : str
        Image resized to 980x1080 and used as mask and colour source.
    outputImageFileName : str
        Destination of the rendered cloud.
    """
    # Let PIL load image files that are truncated
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name. Fall back for very old Pillow.
    resample = getattr(Image, "LANCZOS", None)
    if resample is None:
        resample = Image.ANTIALIAS

    img = Image.open(inputImageFileName)
    img = img.resize((980,1080), resample)

    # `stopwordshearing` is a module-level set that is only read here,
    # so no `global` declaration is required.
    sl = STOPWORDS | stopwordshearing
    speakerArray = np.array(img)

    wc = WordCloud(background_color="white", max_words=1000, mask=speakerArray,
                   stopwords=sl, random_state=42)
    wc.generate_from_frequencies(freqTable)

    # create coloring from image
    image_colors = ImageColorGenerator(speakerArray)
    wc.recolor(color_func=image_colors)
    wc.to_file(outputImageFileName)
def post_process():
    """Save word clouds for the first two topics of ``data/avg_samples.npz``.

    Loads array ``arr_1`` from the sample file and the vocabulary from
    ``data/vocab.pkl``, keeps the 20 largest entries of every row, and
    writes ``data/topic_1.pdf`` and ``data/topic_2.pdf``.

    The former "prism" comparison block was removed: it read
    ``phi_prism``, whose assignment is commented out, and none of its
    results were used.
    """
    theta_pb = np.load('data/avg_samples.npz')
    phi = theta_pb['arr_1']
    print(phi.shape)

    # Binary mode and a context manager: required for pickle on Python 3
    # and closes the handle, which used to be leaked.
    # NOTE(review): pickle.load must only be used on trusted files.
    with open('data/vocab.pkl', 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    # Invert the vocabulary mapping (value -> key)
    inv = dict((v, k) for k, v in vocab.items())

    # Sort every row of phi ascending and keep the last 20 columns
    axis = 1
    index = list(np.ix_(*[np.arange(i) for i in phi.shape]))
    index[axis] = phi.argsort(axis)
    # A tuple index is required by current numpy for this kind of lookup
    a = phi[tuple(index)][:,-20:]
    idx_l = index[axis][:,-20:].tolist()
    words = [[inv[i] for i in subl] for subl in idx_l]

    # (word, weight) pairs, largest weight first
    # NOTE(review): fit_words receives a list of pairs here; newer
    # wordcloud releases expect a dict -- confirm the installed version.
    # topic 1
    freq1 = list(reversed(list(zip(words[0], list(a[0,:])))))
    # topic 2
    freq2 = list(reversed(list(zip(words[1], list(a[1,:])))))

    wc = WordCloud(background_color="white", width=400, height=400,
                   random_state=1234).fit_words(freq1)
    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3))
    plt.axis("off")
    plt.savefig('data/topic_1.pdf', format='pdf')
    plt.close()

    plt.imshow(wc.fit_words(freq2).recolor(color_func=grey_color_func, random_state=3))
    plt.axis("off")
    plt.savefig('data/topic_2.pdf', format='pdf')
    plt.close()
def generatewordcloud(speaker, inputImageFileName, outputImageFileName):
    """Write a word cloud of everything ``speaker`` said, shaped by an image.

    Parameters
    ----------
    speaker :
        Value matched against the ``Speaker`` column of the module-level
        ``data`` frame.
    inputImageFileName : str
        Image resized to 980x1080 and used as mask and colour source.
    outputImageFileName : str
        Destination of the rendered cloud.
    """
    speakerData = data[data.Speaker == speaker]

    # Join once instead of growing the string row by row
    allText = "".join(str(row['Text']) + " " for _, row in speakerData.iterrows())
    print (allText)

    # Let PIL load image files that are truncated
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name. Fall back for very old Pillow.
    resample = getattr(Image, "LANCZOS", None)
    if resample is None:
        resample = Image.ANTIALIAS

    img = Image.open(inputImageFileName)
    img = img.resize((980,1080), resample)
    speakerArray = np.array(img)

    wc = WordCloud(background_color="white", max_words=1000, mask=speakerArray,
                   stopwords=STOPWORDS)
    wc.generate(allText)

    # create coloring from image
    image_colors = ImageColorGenerator(speakerArray)
    wc.recolor(color_func=image_colors)
    wc.to_file(outputImageFileName)
def generate(text):
    """Write ``cloud.png``: a word cloud of ``text`` using a random mask and font.

    Mask and font are picked at random from ``resources/masks`` and
    ``resources/fonts`` (one directory above this module's directory);
    the words are coloured from the chosen mask image.
    """
    package_root = os.path.dirname(os.path.dirname(__file__))
    resources = os.path.join(package_root, "resources")
    masks = os.path.join(resources, "masks")
    fonts = os.path.join(resources, "fonts")

    # The mask is chosen before the font, keeping the order of random draws
    mask_name = random.choice(os.listdir(masks))
    mask = numpy.array(Image.open(os.path.join(masks, mask_name)))
    font_name = random.choice(os.listdir(fonts))

    cloud = WordCloud(
        font_path=os.path.join(fonts, font_name),
        background_color="#1A1A1A",
        mask=mask,
        scale=2,
        max_words=None,
        relative_scaling=0.5,
        prefer_horizontal=1.0
    )
    cloud.generate(text)

    # Colour every word from the mask image
    cloud.recolor(color_func=ImageColorGenerator(mask))
    cloud.to_file("cloud.png")
def make_wc_custom(self, mask, text, max):
    """Render a mask-coloured word cloud into an in-memory PNG.

    Parameters
    ----------
    mask :
        Image opened with ``PIL.Image.open``; used as mask and colour source.
    text : iterable of str
        Pieces joined with single spaces to form the cloud's input.
    max : int
        Passed to ``WordCloud`` as ``max_words``.

    Returns
    -------
    BytesIO or str
        The PNG buffer rewound to position 0, or ``str(exception)`` when
        anything in the process raises.
    """
    try:
        coloring = np.array(PIL.Image.open(mask))
        cloud = WordCloud(width=1024, height=768, max_words=max, mask=coloring)
        cloud = cloud.generate(' '.join(text))
        # Take the word colours from the mask image
        cloud = cloud.recolor(color_func=ImageColorGenerator(coloring))
        rendered = cloud.to_image()
        buffer = BytesIO()
        rendered.save(buffer, 'png')
        buffer.seek(0)
    except Exception as e:
        # Errors are reported to the caller as text rather than raised
        return str(e)
    return buffer
def wordcloud(wordSource):
    """Write ``wordcloud.png``, a grey word cloud of the given categories.

    Parameters
    ----------
    wordSource : iterable
        Items written one per line to ``catagory.txt`` (in the current
        working directory). The same text feeds the cloud.
    """
    # Build the text once, then persist it. Previously the file was written
    # without being closed and read back through a different path, so the
    # cloud could be built from incomplete or stale content.
    category_text = "".join("%s\n" % item for item in wordSource)
    with open("catagory.txt", 'w') as category_file:
        category_file.write(category_text)

    # Extra words to exclude; a copy keeps the shared STOPWORDS set unchanged
    stopwords = set(STOPWORDS)
    stopwords.update(("chronic", "disease", "obstructive", "status"))

    # generate word cloud
    cloud = WordCloud(stopwords=stopwords,
                      background_color="white",
                      width = 650,
                      height = 250).generate_from_text(category_text)

    # recolour and save the word cloud as PNG
    cloud.recolor(color_func=grey_color_func, random_state=3)
    cloud.to_file("wordcloud.png")
def generateImage():
    """Return a grey word-cloud image of the latest stdin text, scaled to half the screen.

    Reads the module-level ``stdin_input`` (last entry, or ``""`` when it
    is empty) and ``screensize``. Returns the resized PIL image.
    """
    # get the latest text
    text = stdin_input[-1] if len(stdin_input) > 0 else ""

    # Raw string: "\W", "\F" and "\V" are invalid escape sequences in a
    # normal literal (the resulting path is unchanged).
    wc = WordCloud(font_path=r'C:\Windows\Fonts\Verdana.ttf',
                   width=750, height=400).generate(text)
    wc = wc.recolor(color_func=grey_color_func, random_state=3)
    img = wc.to_image()

    scale = 0.5  # fraction of the whole screen
    size = tuple(int(i * scale) for i in screensize)

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name. Fall back for very old Pillow.
    resample = getattr(Image, "LANCZOS", None)
    if resample is None:
        resample = Image.ANTIALIAS
    img = img.resize(size, resample)
    return img
if word not in stopwords: new_text.append(word) counter = Counter(new_text) pprint.pprint(counter.most_common(50)) # 绘图 font_path = '/System/Library/fonts/PingFang.ttc' mask = imread(MASKFILE) img_color = ImageColorGenerator(mask) other_stopwords = ['这是'] wordcloud = WordCloud( font_path=font_path, margin=2, # 设置页面边缘 mask=mask, scale=2, max_words=200, # 最多词个数 min_font_size=4, # 最小字体大小 random_state=42, background_color='white', # 背景颜色 max_font_size=150, # 最大字体大小 ) wordcloud.generate_from_frequencies(counter) wordcloud.recolor(color_func=img_color) plt.imshow(wordcloud, interpolation='bilinear') plt.axis('off') #plt.show() plt.savefig(SAVEIMGFILE)
stopwords = stopwords ) wc = wc.generate(text)#生成词云 save_path = path.join(pwd, "profile.png") wc.to_file(save_path) #生成一张黑色背景,词云为彩色的图片 #灰色到白色过度颜色随机 def gray_color_func(word, font_size, position, orientation, random_stata=None, **kwargs): #颜色HSL(色度、饱和度、亮度),亮度随机 return 'hsl(0, 0%%, %d%%)' % random.randint(60, 100) #展示图片 plt.imshow(wc) plt.axis("off") #又画一张图 plt.figure() #灰色 wc_gray = wc.recolor(color_func = gray_color_func, random_state=None) plt.imshow(wc_gray) plt.axis("off") #画一张图 plt.figure() plt.imshow(alice_mask, cmap=plt.cm.gray) plt.axis("off") plt.show()
def run_visualize():
    """Build the "Image Analytics" Excel dashboard from ImageDescription.xlsx.

    Reads several sheets of ``~/Downloads/Image Analytics/ImageDescription.xlsx``,
    renders one chart (or a "not available" placeholder) per dashboard
    section to a PNG, inserts each PNG into the "Dashboard" worksheet of
    ``Image Analytics Data Visualization.xlsx`` and finally deletes the
    temporary PNG files.

    After every section a check or cross mark is written to a cell of
    ``wb.sheets[0]``.
    NOTE(review): ``wb`` is not defined in this function -- presumably a
    module-level workbook handle; verify against the rest of the file.
    """
    # Status marks written to wb after each section
    check = "\u2714"
    wrong = "\u2716"
    filepath = os.path.join(os.path.expanduser("~"), "Downloads/Image Analytics")
    # --- Output workbook, worksheet and the section header cells ---
    workbook=xlsxwriter.Workbook(filepath+"/Image Analytics Data Visualization.xlsx")
    worksheet=workbook.add_worksheet("Dashboard")
    worksheet.hide_gridlines(2)
    merge_format_title = workbook.add_format({'bold': 1,'font_size':20,'font':"Georgia",'border': 10,'align': 'center','valign': 'vcenter',})
    merge_format = workbook.add_format({'bold': 1,'font_size':10,'font':"Georgia",'border': 1,'align': 'center','valign': 'vcenter',})
    worksheet.merge_range('C2:V5', 'Dashboard for Image Analytics', merge_format_title)
    worksheet.merge_range('C7:K8', 'Image and Tweets Statistics', merge_format)
    worksheet.merge_range('C26:K27', 'Label Analysis', merge_format)
    worksheet.merge_range('C53:K54', 'Safe Search Properties', merge_format)
    worksheet.merge_range('C72:K73', 'Word Cloud of Tweets', merge_format)
    worksheet.merge_range('C90:K91', 'Logo Analysis', merge_format)
    worksheet.merge_range('N7:V8', 'Statistics of Collected Images', merge_format)
    worksheet.merge_range('N26:V27', 'Web Search Properties', merge_format)
    worksheet.merge_range('N53:V54', 'Facial Expression', merge_format)
    worksheet.merge_range('N72:V73', 'Word Cloud of Image Text', merge_format)
    worksheet.merge_range('N90:V91', 'Landmark Analysis', merge_format)
    plt.style.use("seaborn")
    plt.rcParams["font.family"] = "Georgia"
    # --- Input sheets ---
    label = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Label")
    web = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Web Search")
    safe = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Safe Search")
    face = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Face")
    logo = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Logo")
    landmark = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Landmark")
    text_img = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Text")
    # --- Section "Image and Tweets Statistics" (cell C10) ---
    plt.figure(figsize=(11,6))
    try:
        stats = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Stats")
        x = []
        # The first bar is labelled by whichever of the two columns exists
        if "No of Tweets" in stats.columns:
            x = ("No of Tweets","Total Images","Unique Images")
            stats = pd.Series.from_array([int(stats["No of Tweets"].values),int(stats["Total Images"].values),int(stats["Unique Images"].values)])
        else:
            x = ("No of Links","Total Images","Unique Images")
            stats = pd.Series.from_array([int(stats["No of Links"].values),int(stats["Total Images"].values),int(stats["Unique Images"].values)])
        ax = stats.plot(kind="bar",fontsize=17,rot=0)
        ax.set_xticklabels(x)
        # Write each bar's height above the bar
        for bar in ax.patches:
            ax.annotate(str(bar.get_height()), (bar.get_x() + bar.get_width() / 2, bar.get_height()), ha='center', va='bottom', fontsize=14)
        plt.ylabel("No of occurrences",fontsize=17)
        plt.yticks(fontsize=14)
        plt.savefig(filepath+"/stats.png",bbox_inches='tight')
        worksheet.insert_image("C10",filepath+"/stats.png",{'x_scale': 0.63, 'y_scale': 0.62})
        wb.sheets[0].range('E27').value = check
    except xlrd.biffh.XLRDError:
        # On this xlrd error a placeholder message image is inserted instead
        plt.plot([],[])
        plt.text(-0.06, 0,"In Local System Image Analytics Real Tweet Stats not available",fontdict={'family': 'Georgia','color':'red','weight': 'normal','size': 25})
        plt.axis("off")
        plt.savefig(filepath+"/stats.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("C10",filepath+"/stats.png",{'x_scale': 0.61, 'y_scale': 0.62})
        wb.sheets[0].range('E27').value = wrong
        pass
    # --- Section "Statistics of Collected Images" (cell N10): non-null counts per sheet column ---
    plt.figure(figsize=(11,6))
    x = ("Label","Face","Logo","Landmark","Text","Web Search","Safe Search")
    stats_img = pd.Series.from_array([label["Label Names"].count(),face["File Name"].count(),logo["Logo Names"].count(),landmark["Landmark Names"].count(),text_img["Text"].count(),web["Web Search Properties"].count(),safe["File Name"].count()])
    ax = stats_img.plot(kind="bar",fontsize=14,rot=0)
    ax.set_xticklabels(x)
    for bar in ax.patches:
        ax.annotate(str(bar.get_height()), (bar.get_x() + bar.get_width() / 2, bar.get_height()), ha='center', va='bottom', fontsize=14)
    plt.ylabel("No of occurrences",fontsize=17)
    plt.yticks(fontsize=14)
    plt.savefig(filepath+"/stats_properties.png",bbox_inches='tight')
    worksheet.insert_image("N10",filepath+"/stats_properties.png",{'x_scale': 0.65, 'y_scale': 0.62})
    # --- Section "Label Analysis" (cell C29): ten most frequent label names ---
    plt.figure(figsize=(11,6))
    ax = label["Label Names"].value_counts().nlargest(10).plot(kind="bar")
    plt.xticks(fontsize=16,rotation=65)
    for bar in ax.patches:
        ax.annotate(str(bar.get_height()), (bar.get_x() + bar.get_width() / 2, bar.get_height()), ha='center', va='bottom', fontsize=14)
    plt.yticks(fontsize=14)
    plt.ylabel("No of occurrences",fontsize=18)
    plt.savefig(filepath+"/label.png",bbox_inches='tight')
    worksheet.insert_image("C29",filepath+"/label.png",{'x_scale': 0.64, 'y_scale': 0.62})
    wb.sheets[0].range('F27').value = check
    # --- Section "Web Search Properties" (cell N29): ten most frequent values ---
    plt.figure(figsize=(11,6))
    ax = web["Web Search Properties"].value_counts().nlargest(10).plot(kind="bar")
    plt.xticks(fontsize=16,rotation=65)
    for bar in ax.patches:
        ax.annotate(str(bar.get_height()), (bar.get_x() + bar.get_width() / 2, bar.get_height()), ha='center', va='bottom', fontsize=14)
    plt.yticks(fontsize=14)
    plt.ylabel("No of occurrences",fontsize=18)
    plt.savefig(filepath+"/websearch.png",bbox_inches='tight')
    worksheet.insert_image("N29",filepath+"/websearch.png",{'x_scale': 0.65, 'y_scale': 0.62})
    wb.sheets[0].range('G27').value = check
    # --- Section "Word Cloud of Tweets" (cell C75) ---
    sw= set(STOPWORDS)
    plt.figure(figsize=(13,6))
    try:
        content = pd.read_excel(filepath+"/ImageDescription.xlsx",sheet_name="Text Analytics")
        text = process_text("".join(content["CONTENT"]))
        tweet = WordCloud(font_path=fm.findfont("Georgia"),background_color="#EEEEEE",max_words=2000,normalize_plurals= True,stopwords=sw, width=1500, height=750).generate(text=text)
        tweet.recolor(color_func=rgb_func)
        plt.imshow(tweet)
        plt.axis("off")
        plt.savefig(filepath+"/tweet.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("C75",filepath+"/tweet.png",{'x_scale': 0.61, 'y_scale': 0.58})
        wb.sheets[0].range('H27').value = check
    except xlrd.biffh.XLRDError:
        # On this xlrd error a placeholder message image is inserted instead
        plt.plot([],[])
        plt.text(-0.05, 0,"In Local System Image Analytics Tweets not available",fontdict={'family': 'Georgia','color':'red','weight': 'normal','size': 25})
        plt.axis("off")
        plt.savefig(filepath+"/tweet.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("C75",filepath+"/tweet.png",{'x_scale': 0.56, 'y_scale': 0.59})
        wb.sheets[0].range('H27').value = wrong
        pass
    # --- Section "Word Cloud of Image Text" (cell N75) ---
    plt.figure(figsize=(13,6))
    if not text_img.empty:
        text = process_text("".join(text_img["Text"]))
        img_text = WordCloud(font_path=fm.findfont("Georgia"),background_color="#EEEEEE",max_words=2000,normalize_plurals= True,stopwords=sw, width=1500, height=750).generate(text=text)
        img_text.recolor(color_func=rgb_func)
        plt.imshow(img_text)
        plt.axis("off")
        plt.savefig(filepath+"/img_text.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("N75",filepath+"/img_text.png",{'x_scale': 0.61, 'y_scale': 0.58})
        wb.sheets[0].range('I27').value = check
    else:
        # Empty sheet: insert a "Data Not Available" placeholder
        plt.pie([],labels=[])
        plt.text(-0.35, 0,"Data Not Available",fontdict={'family': 'Georgia','color':'red','weight': 'normal','size': 25})
        plt.axis("off")
        plt.savefig(filepath+"/img_text.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("N75",filepath+"/img_text.png",{'x_scale': 0.58, 'y_scale': 0.59})
        wb.sheets[0].range('I27').value = wrong
    # --- Section "Safe Search Properties" (cell C56): four pie charts in a 2x2 grid ---
    plt.figure(figsize=(11,6))
    plt.subplot(2,2,1)
    safe["Adult"].value_counts().plot(kind="pie",autopct='%1.1f%%',startangle=0,fontsize=14)
    pylab.ylabel('')
    plt.title("Adult",fontsize=15,fontweight="bold")
    plt.axis("equal")
    plt.subplot(2,2,2)
    safe["Medical"].value_counts().plot(kind="pie",autopct='%1.1f%%',startangle=0,fontsize=14)
    pylab.ylabel('')
    plt.title("Medical",fontsize=15,fontweight="bold")
    plt.axis("equal")
    plt.subplot(2,2,3)
    safe["Spoof"].value_counts().plot(kind="pie",autopct='%1.1f%%',startangle=0,fontsize=14)
    pylab.ylabel('')
    plt.title("Spoof",fontsize=15,fontweight="bold")
    plt.axis("equal")
    plt.subplot(2,2,4)
    safe["Violence"].value_counts().plot(kind="pie",autopct='%1.1f%%',startangle=0,fontsize=14)
    plt.title("Violence",fontsize=15,fontweight="bold")
    pylab.ylabel('')
    plt.axis("equal")
    plt.savefig(filepath+"/safe.png",bbox_inches='tight',facecolor="#EEEEEE")
    worksheet.insert_image("C56",filepath+"/safe.png",{'x_scale': 0.69, 'y_scale': 0.62})
    wb.sheets[0].range('J27').value = check
    # --- Section "Facial Expression" (cell N56): Anger, Joy and Surprise pies ---
    plt.figure(figsize=(11,6))
    if not face.empty:
        plt.subplot(221)
        face["Anger"].value_counts().plot(kind="pie",autopct='%1.1f%%',startangle=0,fontsize=14)
        pylab.ylabel('')
        plt.title("Anger",fontsize=15,fontweight="bold")
        plt.axis("equal")
        plt.subplot(222)
        face["Joy"].value_counts().plot(kind="pie",autopct='%1.1f%%',startangle=0,fontsize=14)
        pylab.ylabel('')
        plt.title("Joy",fontsize=15,fontweight="bold")
        plt.axis("equal")
        plt.subplot(212)
        face["Surprise"].value_counts().plot(kind="pie",autopct='%1.1f%%',startangle=0,fontsize=14)
        pylab.ylabel('')
        plt.title("Surprise",fontsize=15,fontweight="bold")
        plt.axis("equal")
        plt.savefig(filepath+"/face.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("N56",filepath+"/face.png",{'x_scale': 0.69, 'y_scale': 0.61})
        wb.sheets[0].range('K27').value = check
    else:
        # Empty sheet: insert a "Data Not Available" placeholder
        plt.pie([],labels=[])
        plt.text(-0.35, 0,"Data Not Available",fontdict={'family': 'Georgia','color':'red','weight': 'normal','size': 25})
        plt.savefig(filepath+"/face.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("N56",filepath+"/face.png",{'x_scale': 0.69, 'y_scale': 0.62})
        wb.sheets[0].range('K27').value = wrong
    # --- Section "Logo Analysis" (cell C93): ten most frequent logo names ---
    plt.figure(figsize=(11,6))
    if not logo.empty:
        ax = logo["Logo Names"].value_counts().nlargest(10).plot(kind="bar")
        plt.xticks(fontsize=16,rotation=65)
        for bar in ax.patches:
            ax.annotate(str(bar.get_height()), (bar.get_x() + bar.get_width() / 2, bar.get_height()), ha='center', va='bottom', fontsize=14)
        plt.yticks(fontsize=14)
        plt.ylabel("No of occurrences",fontsize=18)
        plt.savefig(filepath+"/logo.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("C93",filepath+"/logo.png",{'x_scale': 0.64, 'y_scale': 0.63})
        wb.sheets[0].range('L27').value = check
    else:
        # Empty sheet: insert a "Data Not Available" placeholder
        plt.plot([],[])
        plt.axis("off")
        plt.text(-0.022, 0,"Data Not Available",fontdict={'family': 'Georgia','color':'red','weight': 'normal','size': 25})
        plt.savefig(filepath+"/logo.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("C93",filepath+"/logo.png",{'x_scale': 0.65, 'y_scale': 0.64})
        wb.sheets[0].range('L27').value = wrong
    # --- Section "Landmark Analysis" (cell N93): ten most frequent landmark names ---
    plt.figure(figsize=(11,6))
    if not landmark.empty:
        ax = landmark["Landmark Names"].value_counts().nlargest(10).plot(kind="bar")
        plt.xticks(fontsize=16,rotation=65)
        for bar in ax.patches:
            ax.annotate(str(bar.get_height()), (bar.get_x() + bar.get_width() / 2, bar.get_height()), ha='center', va='bottom', fontsize=14)
        plt.yticks(fontsize=14)
        plt.ylabel("No of occurrences",fontsize=18)
        plt.savefig(filepath+"/landmark.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("N93",filepath+"/landmark.png",{'x_scale': 0.64, 'y_scale': 0.63})
        wb.sheets[0].range('M27').value = check
    else:
        # Empty sheet: insert a "Data Not Available" placeholder
        plt.plot([],[])
        plt.axis("off")
        plt.text(-0.022, 0,"Data Not Available",fontdict={'family': 'Georgia','color':'red','weight': 'normal','size': 25})
        plt.savefig(filepath+"/landmark.png",bbox_inches='tight',facecolor="#EEEEEE")
        worksheet.insert_image("N93",filepath+"/landmark.png",{'x_scale': 0.65, 'y_scale': 0.64})
        wb.sheets[0].range('M27').value = wrong
    # Closing the workbook comes before the PNG files are deleted
    workbook.close()
    # Best-effort cleanup: the first OSError stops the remaining removals
    try:
        os.remove(filepath+"/stats.png")
        os.remove(filepath+"/stats_properties.png")
        os.remove(filepath+"/label.png")
        os.remove(filepath+"/websearch.png")
        os.remove(filepath+"/tweet.png")
        os.remove(filepath+"/img_text.png")
        os.remove(filepath+"/safe.png")
        os.remove(filepath+"/face.png")
        os.remove(filepath+"/logo.png")
        os.remove(filepath+"/landmark.png")
    except OSError:
        pass
from PIL import Image
from nltk.corpus import stopwords

# --- Word cloud for clean comments -----------------------------------------
stop_words = set(stopwords.words())

# Load the mask picture and keep a single colour channel (index 1).
clean_mask = np.array(Image.open("../input/images/safe-zone.png"))[:, :, 1]

# Rows flagged as clean, and their comment texts.
subset = train_data[train_data.clean == 1]
text = subset.comment_text.values

wc = WordCloud(
    background_color='black',
    max_words=2000,
    mask=clean_mask,
    stopwords=stop_words,
)
wc.generate(" ".join(text))

plt.figure(figsize=(20, 10))
plt.axis('off')
plt.title('Words frequent in clean comments', fontsize=20)
recoloured = wc.recolor(colormap='viridis', random_state=17)
plt.imshow(recoloured, alpha=0.98)
plt.show()


# In[17]:

from PIL import Image
from nltk.corpus import stopwords

# --- Word cloud set-up for toxic comments -----------------------------------
stop_words = set(stopwords.words())

# The variable keeps the name ``clean_mask`` although it now holds the mask
# loaded from nuclear.png.
clean_mask = np.array(Image.open("../input/images/nuclear.png"))[:, :, 1]

# Rows flagged as toxic, and their comment texts.
subset = train_data[train_data.toxic == 1]
text = subset.comment_text.values

wc = WordCloud(
    background_color='black',
    max_words=2000,
    mask=clean_mask,
    stopwords=stop_words,
)
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba

# Read the input text. The context manager closes the file handle promptly
# instead of leaving it open until garbage collection.
with open('xyj.txt') as text_file:
    text = text_file.read()

# Chinese word segmentation: join the tokens from jieba with spaces so that
# WordCloud can split on whitespace.
text = ' '.join(jieba.cut(text))
print(text[:100])

# Build the cloud, using the picture as the mask and a transparent
# background (mode='RGBA' together with background_color=None).
mask = np.array(Image.open("color_mask.png"))
wc = WordCloud(mask=mask, font_path='Hiragino.ttf',
               mode='RGBA', background_color=None).generate(text)

# Derive the word colours from the mask picture.
image_colors = ImageColorGenerator(mask)
wc.recolor(color_func=image_colors)

# Display the word cloud.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

# Save it to a file.
wc.to_file('wordcloud4.png')
margin=2, #边缘 ranks_only=None, prefer_horizontal=0.9, mask=background_Image, #背景图形,如果想根据图片绘制,则需要设置 color_func=None, max_words=200, #显示最多的词汇量 stopwords=None, #停止词设置,修正词云图时需要设置 random_state=None, background_color='#ffffff', #背景颜色设置,可以为具体颜色,比如:white或者16进制数值。 font_step=1, mode='RGB', regexp=None, collocations=True, normalize_plurals=True, contour_width=0, colormap='viridis', #matplotlib色图,可以更改名称进而更改整体风格 contour_color='Blues', repeat=False, scale=2, min_font_size=10, max_font_size=200) wc.generate_from_text(text) wc.recolor(color_func=img_colors) # 显示并存储图像 plt.imshow(wc, interpolation='bilinear') plt.axis('off') plt.tight_layout() wc.to_file('sanguo.png') plt.show()
async def _(event):
    """Turn the replied-to media into a word cloud and send it back.

    The replied message's media is downloaded (animated stickers and videos
    are first converted to ``wc.png`` by external tools), an edge-aware mask
    is derived from the picture, and the text in
    ``userbot/utils/styles/alice.txt`` is rendered into that mask, coloured
    from the picture itself.

    Returns ``None`` on success or on a validation failure; on a processing
    error, returns the result of editing the event with the error text.
    """
    if not event.reply_to_msg_id:
        await event.edit("`Mohon Balas Ke Media Apapun Petercord`")
        return
    reply_message = await event.get_reply_message()
    if not reply_message.media:
        await event.edit("`Mohon Balas Ke Gambar/Sticker/Video Petercord`")
        return

    await event.edit("`Mendownload Media.....`")
    if reply_message.photo:
        await bot.download_media(reply_message, "wc.png")
    elif (DocumentAttributeFilename(file_name="AnimatedSticker.tgs")
          in reply_message.media.document.attributes):
        await bot.download_media(reply_message, "wc.tgs")
        os.system("lottie_convert.py wc.tgs wc.png")
    elif reply_message.video:
        video = await bot.download_media(reply_message, "wc.mp4")
        # Result is discarded in the original as well; kept in case the
        # parse is relied on as a validity check.
        extractMetadata(createParser(video))
        os.system("ffmpeg -i wc.mp4 -vframes 1 -an -s 480x360 -ss 1 wc.png")
    else:
        await bot.download_media(reply_message, "wc.png")

    try:
        await event.edit("`Sedang Memproses....`")
        # Close the text file deterministically.
        with open("userbot/utils/styles/alice.txt", encoding="utf-8") as fh:
            text = fh.read()

        image_color = np.array(Image.open("wc.png"))
        image_mask = image_color.copy()
        # Pixels whose channels sum to zero become 255 in the mask.
        image_mask[image_mask.sum(axis=2) == 0] = 255
        # Mean gradient magnitude over the first three channels; strong
        # edges are also set to 255 in the mask.
        edges = np.mean(
            [
                gaussian_gradient_magnitude(image_color[:, :, i] / 255.0, 2)
                for i in range(3)
            ],
            axis=0,
        )
        image_mask[edges > 0.08] = 255

        wc = WordCloud(
            max_words=2000,
            mask=image_mask,
            max_font_size=40,
            random_state=42,
            relative_scaling=0,
        )
        wc.generate(text)
        image_colors = ImageColorGenerator(image_color)
        wc.recolor(color_func=image_colors)
        wc.to_file("wc.png")

        await event.client.send_file(
            event.chat_id,
            "wc.png",
            reply_to=event.reply_to_msg_id,
        )
        await event.delete()
    except Exception as e:
        # Exception, not BaseException: task cancellation and
        # KeyboardInterrupt must propagate instead of being shown as text.
        return await event.edit(str(e))
    finally:
        # Single cleanup point for success, failure, and cancellation.
        # NOTE(review): this removes every matching file in the working
        # directory, not only the ones created here.
        os.system("rm *.png *.mp4 *.tgs *.webp")
# Text column used as the word-cloud input.
hashtags_column = df["place_name"].values
all_hashtags = join_data(hashtags_column)
stopwords = []

# https://www.flaticon.com/free-icon/world-map_290185#term=world%20map&page=3&position=19
image_name = "earth-globe.png"
mask = np.array(Image.open(image_name))
wordcloud = WordCloud(stopwords=stopwords, background_color="white",
                      mode="RGBA", max_words=1000,
                      mask=mask).generate(",".join(all_hashtags))

# Create the colouring from the image.
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[7, 7])
plt.imshow(wordcloud.recolor(color_func=image_colors),
           interpolation="bilinear")
plt.axis("off")

# Store to file.
plt.savefig("wordcloud.png", format="png")

# Drop the columns not needed for the per-place sentiment summary
# ("place_type" was listed twice in the original label list).
df_sentiment = df.drop(['created_at', 'text', 'place_type', "hashtags",
                        "place_fullname", "place_country_code",
                        "place_country"], axis=1)
df_sentiment = df_sentiment.dropna(subset=['place_name'])
sort_by_city = df_sentiment.sort_values("place_name")
print(sort_by_city.head(n=100))

# Group by city and take the mean. Several columns must be selected with a
# list: the bare-tuple form ['polarity', 'subjectivity'] is rejected by
# pandas >= 1.0.
sum_data = (df_sentiment
            .groupby(['place_name'])[['polarity', 'subjectivity']]
            .mean()
            .reset_index())
print(sum_data)

fig = plt.figure()
ax = fig.add_subplot(111)
bfreq = nltk.FreqDist(bwords)

# Tokenise the second text and keep only words that are not stop words or
# common words, are longer than two characters and contain no apostrophe.
awords = nltk.word_tokenize(atext)
awords = [
    word
    for word in awords
    if word not in stopwords
    and word not in commonwords
    and len(word) > 2
    and "'" not in word
]
afreq = nltk.FreqDist(awords)

# Both clouds share the same mask picture and settings.
cloud = np.array(Image.open('cloud.jpg'))
cloud_options = dict(background_color='white', stopwords=stopwords,
                     width=500, height=500, mask=cloud, max_words=1000)
bwc = WordCloud(**cloud_options).generate(btext)
awc = WordCloud(**cloud_options).generate(atext)

# Show each cloud in turn with its own colormap and title.
for word_cloud, cmap_name, heading in ((bwc, 'viridis', 'Brian'),
                                       (awc, 'winter', 'Annie')):
    plt.imshow(word_cloud.recolor(colormap=cmap_name),
               interpolation='bilinear')
    plt.title(heading)
    ax = plt.gca()
    ax.axis('off')
    plt.tight_layout()
    plt.show()
def test_recolor_too_small_set_default():
    """Recolouring must not raise when a default colour is supplied.

    The colour image is 20x20 while the cloud canvas is 30x30.
    """
    tiny_image = Image.new('RGB', size=(20, 20))
    colouring = np.array(tiny_image)

    cloud = WordCloud(max_words=50, width=30, height=30, min_font_size=1)
    cloud.generate(THIS)

    generator = ImageColorGenerator(colouring, default_color=(0, 0, 0))
    cloud.recolor(color_func=generator)
# --- Word cloud for "Earl of Sandwich" tips ---------------------------------
# Tokenise, remove stop words, then count words through the RDD API.
tokenized_earl = Tokenizer(inputCol="text", outputCol="words")
tWords_earl = tokenized_earl.transform(earl)
SWremover_earl = StopWordsRemover(inputCol="words", outputCol="filtered")
SWremoved_earl = SWremover_earl.transform(tWords_earl).select("filtered")
FWords_earl = SWremoved_earl.rdd.flatMap(once)
WCount_earl = FWords_earl.reduceByKey(operator.add)
# Keep the 400 highest counts.
FreqWords_earl = WCount_earl.sortBy(lambda t: t[1], ascending=False).take(400)
FreqWordDict_earl = dict(FreqWords_earl)

mask_earl = np.array(Image.open("visualization/likesimba.png"))
wordcloud_earl = WordCloud(
    width=1600, height=800, background_color="white", max_words=1000,
    mask=mask_earl).generate_from_frequencies(FreqWordDict_earl)
image_colors_earl = ImageColorGenerator(mask_earl)
title_earl = 'Most frequent words from tips review for Earl of Sandwich'
plt.figure(figsize=[20, 10], facecolor='k')
plt.imshow(wordcloud_earl.recolor(color_func=image_colors_earl),
           interpolation="bilinear")
plt.title(title_earl, size=25, y=1.01)
plt.axis("off")
plt.savefig("visualization/earl.png", format="png")

# --- Word cloud for restaurants in Ontario (location based) ------------------
# createOrReplaceTempView replaces the deprecated registerTempTable.
df_yelp_tip.createOrReplaceTempView("df_yelp_tip")
ontario = spark.sql("""SELECT * FROM df_yelp_tip where state == 'ON' """)
ontario = ontario.select("text")
tokenized_ontario = Tokenizer(inputCol="text", outputCol="words")
# Use the Ontario tokenizer here; the original called tokenized_earl,
# leaving tokenized_ontario unused.
tWords_ontario = tokenized_ontario.transform(ontario)
SWremover_ontario = StopWordsRemover(inputCol="words", outputCol="filtered")
SWremoved_ontario = SWremover_ontario.transform(tWords_ontario).select("filtered")
FWords_ontario = SWremoved_ontario.rdd.flatMap(once)
# Full constructor signature, for reference:
#
# WordCloud(font_path=None, width=400, height=200, margin=2, ranks_only=None,
#           prefer_horizontal=0.9, mask=None, scale=1, color_func=None,
#           max_words=200, min_font_size=4, stopwords=None, random_state=None,
#           background_color='black', max_font_size=None, font_step=1,
#           mode='RGB', relative_scaling='auto', regexp=None,
#           collocations=True, colormap=None, normalize_plurals=True,
#           contour_width=0, contour_color='black', repeat=False,
#           include_numbers=False, min_word_length=0)
#
# Notes:
# * max_font_size, max_words and background_color are the primary arguments
#   used to manipulate the wordcloud.
# * contour_width and contour_color are used to create an outline around the
#   cloud.
# * background_color=None combined with mode='RGBA' makes the background
#   transparent.
# * stopwords=None does not mean stopwords are kept: it means the default
#   built-in stopword list is used. To keep stopwords in the wordcloud, pass
#   an empty list.
cloud = WordCloud(
    background_color='white',
    max_words=200,
    mask=mask,
    stopwords=stops,
    mode='RGB',
)
cloud.generate(full_text)

fig, ax = plt.subplots(figsize=(12.5, 7.5))

# The recolouring happens here. For a default-coloured wordcloud, show
# ``cloud`` directly instead:
#     ax.imshow(cloud, interpolation='bilinear')
recoloured = cloud.recolor(color_func=image_colors)
ax.imshow(recoloured, interpolation='bilinear')
ax.axis('off')
plt.show()
plt.close()

# Export the wordcloud to a file.
export_path = r'.\images\wordcloud.jpg'
cloud.to_file(export_path)
def grey_color_func(word, font_size, position, orientation,
                    random_state=None, **kwargs):
    """Return a random grey ``hsl(...)`` colour string for one word.

    Lightness is drawn from 60..100. When ``recolor`` supplies a seeded
    ``random_state`` it is used, so ``random_state=3`` below gives a
    reproducible colouring; otherwise the global ``random`` module is used.
    """
    rng = random_state if random_state is not None else random
    return "hsl(0, 0%%, %d%%)" % rng.randint(60, 100)


# Canvas size is read interactively.
w = int(input("Width: "))
h = int(input("Height: "))
wordcloud = WordCloud(width=w, height=h,
                      background_color='black',
                      min_font_size=5,
                      stopwords=stopwords,
                      random_state=42).generate(finalText)

# Plot the wordcloud image in greyscale. The original drew the un-recoloured
# cloud first and immediately painted over it; only the final image is drawn.
plt.figure(figsize=(8, 2), facecolor=None)
plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3),
           interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig("wordcloud.png")
plt.show()
# Configure the cloud.
wc = WordCloud(
    background_color="white",  # background colour
    font_path=font_path,       # font to render with
    max_words=1000,            # maximum number of words
    mask=back_coloring,        # mask picture
    max_font_size=100,         # largest font size
    width=1000,
    height=860,
    margin=2,
)
# Feed the frequencies, skipping the first 20 entries of over_list.
word_frequencies = dict(over_list[20:])
wc.fit_words(word_frequencies)

# Show the picture.
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()

# Save the picture.
script_dir = path.dirname(__file__)
wc.to_file(path.join(script_dir, "C:/Users/TTT/Desktop/杂/wordcloud1.png"))

# Recolour using colour values generated from the mask picture.
image_color = ImageColorGenerator(back_coloring)
recoloured = wc.recolor(color_func=image_color)
plt.imshow(recoloured)
plt.axis('off')  # hide the axes

# Draw the mask picture itself in a new figure.
plt.figure()
plt.imshow(back_coloring, cmap=plt.cm.gray)
plt.axis('off')
plt.show()

wc.to_file(path.join(script_dir, "C:/Users/TTT/Desktop/杂/wordcloud2.png"))
for feature in row['features']: text = " ".join([text, "_".join(feature.strip().split(" "))]) text_da = " ".join( [text_da, "_".join(row['display_address'].strip().split(" "))]) #text_desc = " ".join([text_desc, row['description']]) text = text.strip() text_da = text_da.strip() text_desc = text_desc.strip() plt.figure(figsize=(12, 6)) wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text) wordcloud.recolor(random_state=0) plt.imshow(wordcloud) plt.title("Wordcloud for features", fontsize=30) plt.axis("off") plt.show() # wordcloud for display address plt.figure(figsize=(12, 6)) wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text_da) wordcloud.recolor(random_state=0) plt.imshow(wordcloud) plt.title("Wordcloud for Display Address", fontsize=30)
plt.figure(figsize=(10, 10)) plt.axis('off') plt.pause(2) count = 0 for words in text.split(" "): txt += words if count > 100: wc.generate(txt) count=0 plt.imshow(wc) plt.pause(0.1) else: count = count + 1 plt.show() """ # create coloring from image image_colors = ImageColorGenerator(parrot_color) wc.recolor(color_func=image_colors) plt.figure(figsize=(10, 10)) plt.imshow(wc, interpolation="bilinear") wc.to_file("parrot_new.png") plt.figure(figsize=(10, 10)) plt.title("Original Image") plt.imshow(parrot_color) plt.figure(figsize=(10, 10)) plt.title("Edge map") plt.imshow(edges) plt.show()
def TXT2WC():
    """Build a word cloud from the text file and picture chosen in the UI.

    Reads the picture name, text file name, output name and "colour from
    picture" flag from ``var1``..``var4``. Always writes
    ``img/<name>-Str.jpg`` (cloud shaped by the picture) and, unless the flag
    is ``'n'``, also ``img/<name>-Col.jpg`` (cloud coloured from the picture).
    Sets the module-level ``stopwords_path`` as a side effect.
    """
    global stopwords_path

    # Directory of this file. ``__file__`` is undefined when the line is run
    # interactively in an IDE; use path.dirname('.') in that case.
    d = path.dirname(__file__)

    isCN = 1  # Chinese word segmentation is enabled by default
    pic_name = var1.get()
    file_name = var2.get()
    new_name = var3.get()
    back_col = var4.get()

    back_coloring_path = "file/{}".format(pic_name)  # mask picture
    text_path = 'file/{}'.format(file_name)          # text to analyse
    font_path = 'file/msyhbd.ttf'                    # font with Chinese glyphs
    stopwords_path = 'file/stopwords1893.txt'        # stop-word list

    # Output 1: shaped by the mask picture only.
    imgname1 = "{}-Str.jpg".format(new_name)
    # Output 2: coloured from the mask picture. Built directly rather than
    # with imgname1.replace("Str", "Col"), which would also rewrite any
    # "Str" occurring inside the user-supplied name.
    imgname2 = "{}-Col.jpg".format(new_name)

    my_words_list = ['路明非']  # extra words for the jieba dictionary
    back_coloring = imread(path.join(d, back_coloring_path))

    # Word cloud settings. width/height are the default canvas size and
    # margin is the spacing around words.
    wc = WordCloud(
        font_path=font_path,
        background_color="white",
        max_words=400,
        mask=back_coloring,
        max_font_size=350,
        random_state=42,
        scale=10,
        width=1366,
        height=768,
        margin=2,
    )

    add_word(my_words_list)

    # Close the text file deterministically.
    with open(path.join(d, text_path), 'r', encoding='utf-8') as text_file:
        text = text_file.read()
    if isCN:
        text = jiebaclearText(text)

    # generate() takes the whole text; precomputed frequencies could be
    # passed to generate_from_frequencies() instead.
    wc.generate(text)

    wc.to_file(path.join(d, "img/{}".format(imgname1)))

    # Show the shape-only cloud.
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()

    if back_col != 'n':
        # Recolour from the mask picture, show, and save the second picture.
        image_colors = ImageColorGenerator(back_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
        wc.to_file(path.join(d, "img/{}".format(imgname2)))
        plt.axis("off")
        plt.show()
def main():
    """Find the keywords and display them as a word cloud with matplotlib.

    Does the keyword work only when ``listeFichier`` is non-empty: extracts
    keywords with the method selected in the UI, optionally exports them,
    displays them, then renders a word cloud shaped and coloured by
    ``fond.png``.
    """
    global listeFichier

    if len(listeFichier) > 0:
        # Parameter passed to the extraction methods; 7 for English,
        # 9 otherwise.
        l1 = 9
        if langue.get() == 'anglais':
            l1 = 7
        print(listeFichier)

        if methode.get() == 'Intervalle':
            ListeMotCle = methodeIntervalle(l1)
        else:
            # Page-by-page method.
            ListeMotCle = methodePageParPage(l1)

        SuppTemp()  # remove temporary file(s)
        ListeMotCle = tirets(ListeMotCle)
        ListeMotCleAffichage = remake(ListeMotCle)
        if export.get() != 0:
            AddFichierCVS(ListeMotCleAffichage)
        AffichageMotCle(ListeMotCleAffichage)

        # Build the word string (with repetitions). Each word is preceded by
        # a space, exactly as the original incremental concatenation did.
        text = "".join(" " + mot for mot in ListeMotCle)

        stopwords = set(STOPWORDS)
        stopwords.add("said")

        alice_coloring = np.array(Image.open(path.join(d, "fond.png")))
        wc = WordCloud(background_color="white", max_words=2000,
                       mask=alice_coloring, stopwords=stopwords,
                       max_font_size=400, random_state=42)
        # Generate the word cloud.
        wc.generate(text)

        # Create the colouring from the image.
        image_colors = ImageColorGenerator(alice_coloring)

        # Show the default-coloured cloud.
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.figure()
        # Recolour the cloud and show it. color_func=image_colors could also
        # be given directly to the constructor.
        plt.imshow(wc.recolor(color_func=image_colors),
                   interpolation="bilinear")
        plt.axis("off")
        plt.show()

    # NOTE(review): the collapsed source does not show whether this call sat
    # inside the ``if`` above; it is placed at function level here. Confirm
    # against the original file.
    MAJAffFic()
pass BigBag = BagOfWords + BagOfHashes stop_words = set(stopwords.words('english')) rawWord = [w for w in BigBag if w.lower() not in stop_words] IgnoreThese = ["yr-old", "here's", "year-old", "thi", "let's"] rawWord = [w for w in rawWord if w.lower() not in IgnoreThese] text = ' '.join(rawWord) import matplotlib.pyplot as plt import numpy as np from PIL import Image ##install wordcloud from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator #%% mask = np.array(Image.open("MD.png")) # random_state=7 wordcloud = WordCloud(background_color="white", mask=mask, collocations=False, stopwords=STOPWORDS, max_font_size=65).generate_from_text(text) image_colors = ImageColorGenerator(mask) # Open a plot of the generated image. plt.figure(figsize=(12, 24)) image = wordcloud.recolor(color_func=image_colors) plt.imshow(image, interpolation="bilinear") plt.title('WordCloud - Python') plt.axis("off") plt.show() plt.imsave('WordCloud.png', image)
# movie script of "a new hope"
# http://www.imsdb.com/scripts/Star-Wars-A-New-Hope.html
# May the lawyers deem this fair use.
# The context manager closes the file instead of leaking the handle.
with open("a_new_hope.txt") as script_file:
    text = script_file.read()

# Preprocess the text a little bit.
text = text.replace("HAN", "Han")
text = text.replace("LUKE'S", "Luke")

# Add movie-script-specific stopwords.
stopwords = STOPWORDS.copy()
stopwords.add("int")
stopwords.add("ext")

wc = WordCloud(max_words=1000, mask=mask, stopwords=stopwords, margin=10,
               random_state=1).generate(text)

# Store the default-coloured image before recolouring.
default_colors = wc.to_array()

plt.title("Custom colors")
plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3))
wc.to_file("a_new_hope.png")
plt.axis("off")

plt.figure()
plt.title("Default colors")
plt.imshow(default_colors)
plt.axis("off")
plt.show()
data = myfile.read() final = rake_words(data) final = str(final) print(final) alice_coloring = np.array(Image.open("maxresdefault_burned.png")) stopwords = set(STOPWORDS) stopwords.add("said") wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring, stopwords=stopwords, max_font_size=40, random_state=42) # generate word cloud wc.generate(final) wc.to_file("C:\\Users\\sidharth.m\\Desktop\\srk_tweetcolor.jpg") # create coloring from image image_colors = ImageColorGenerator(alice_coloring) # show fig, axes = plt.subplots(1, 3) axes[0].imshow(wc, interpolation="bilinear") # recolor wordcloud and show # we could also give color_func=image_colors directly in the constructor axes[1].imshow(wc.recolor(color_func=image_colors), interpolation="bilinear") axes[2].imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear") for ax in axes: ax.set_axis_off() plt.show()
def topic_word_art(self, topic_idx=None, n_words=20, save_file=None, color_func=None,
                   random_state=1, fig_row_size=16, **kwargs):
    """Draw a word cloud for one topic, or a grid of clouds for all topics.

    Parameters
    ----------
    topic_idx : int or None
        Topic to draw on the current axes. ``None`` draws every topic in a
        grid of subplots by calling this method once per topic.
    n_words : int
        Number of leading rows of ``self.topic_word_weights[topic_idx]`` to
        use.
    save_file :
        Accepted and forwarded to the recursive calls; not otherwise used
        by this method.
    color_func : callable, str, tuple or None
        A wordcloud colour function, a colour string used for every word,
        or a tuple of three numbers converted to ``"rgb(r, g, b)"``.
        ``None`` falls back to ``self.word_art_params['color_func']`` and
        then ``self.topic_color[topic_idx]``.
    random_state :
        Passed to ``WordCloud`` and to ``recolor``.
    fig_row_size : number
        Figure width when drawing the grid.
    **kwargs :
        Extra ``WordCloud`` keyword arguments; they override
        ``self.wordcloud_params``.

    Raises
    ------
    TypeError
        If ``color_func`` is neither callable, a string, nor a tuple.

    Notes
    -----
    This code targets Python 2 (``basestring``, ``dict.iteritems``).
    """
    if topic_idx is None:
        ncols = int(floor(sqrt(self.n_topics)))
        nrows = int(ceil(self.n_topics / float(ncols)))
        # Force true division: under Python 2, ``ncols / nrows`` on two ints
        # truncates to 0 whenever nrows > ncols, giving a zero-height figure.
        ncols_to_nrows_ratio = ncols / float(nrows)
        plt.figure(figsize=(fig_row_size, ncols_to_nrows_ratio * fig_row_size))
        for i in range(self.n_topics):
            plt.subplot(nrows, ncols, i + 1)
            self.topic_word_art(topic_idx=i, n_words=n_words, save_file=save_file,
                                color_func=color_func, random_state=random_state,
                                **kwargs)
        plt.gcf().subplots_adjust(wspace=.1, hspace=.1)
    else:
        kwargs = dict(self.wordcloud_params, **kwargs)

        if color_func is None:
            color_func = self.word_art_params.get(
                'color_func', self.topic_color[topic_idx])
        if isinstance(color_func, tuple):
            color_func = "rgb({}, {}, {})".format(*map(int, color_func))
        # Deliberately ``if`` rather than ``elif``: a tuple has just been
        # turned into a string and must be wrapped below as well.
        if isinstance(color_func, basestring):
            color = color_func

            def color_func(word, font_size, position, orientation,
                           random_state=None, **kwargs):
                return color
        elif not callable(color_func):
            # The original built this exception without raising it, and its
            # message used "()" where a "{}" placeholder was intended.
            raise TypeError("Unrecognized hsl_color type ({})".format(
                type(color_func)))

        wc = WordCloud(random_state=random_state, **kwargs)
        wc.fit_words([(self.word_preprocessor(k), v)
                      for k, v in self.topic_word_weights[topic_idx].
                      iloc[:n_words].to_dict().iteritems()])
        plt.imshow(
            wc.recolor(color_func=color_func, random_state=random_state))
        plt.grid(False)
        plt.xticks([])
        plt.yticks([])
from wordcloud import WordCloud, ImageColorGenerator
from matplotlib import pyplot as plt
import random
from palettable.colorbrewer.sequential import YlGnBu_9

# Read the text (the context manager closes the file) and join the tokens
# from jieba with spaces.
with open('test.txt', 'r', encoding='UTF-8-sig') as text_file:
    text = text_file.read()
text = ' '.join(jieba.cut(text))

# Paste the icon onto a white canvas, using the icon itself as the paste
# mask, then convert the result to an array for WordCloud.
icon_path = 'icon.png'
icon = Image.open(icon_path)
mask = Image.new("RGB", icon.size, (255, 255, 255))
mask.paste(icon, icon)
mask = np.array(mask)


def color_func(word, font_size, position, orientation,
               random_state=None, **kwargs):
    """Pick one of the nine YlGnBu_9 palette colours at random.

    Uses the ``random_state`` supplied by ``recolor`` when there is one, so
    the seed given below makes the colouring reproducible; otherwise falls
    back to the global ``random`` module.
    """
    rng = random_state if random_state is not None else random
    return tuple(YlGnBu_9.colors[rng.randint(0, 8)])


font_path = 'SNsanafonGyou.ttf'
wc = WordCloud(font_path=font_path, background_color="black", max_words=2000,
               mask=mask, max_font_size=300, random_state=1)
wc.generate_from_text(text)
wc.recolor(color_func=color_func, random_state=2)

output_path = 'wordcloud.png'
wc.to_file(output_path)

# Default figure size for matplotlib figures.
plt.rcParams["figure.figsize"] = (25, 25)
plt.imshow(wc)
plt.axis("off")
plt.show()
def word_cloud_bigram(bigrams, mask):
    """Render a word cloud of bigram frequencies inside a mask picture.

    Parameters
    ----------
    bigrams : iterable of str
        Documents passed to ``CountVectorizer(ngram_range=(2, 2))``.
    mask : str
        File name of the mask picture under ``data/img``. Three known file
        names select their own colour scheme and contour colour; any other
        name gets the grey scheme.

    Returns
    -------
    tuple
        ``(cloud_of_words, len_bigrams)``: the recoloured ``WordCloud`` and
        ``len(bigrams)``.
    """
    len_bigrams = len(bigrams)
    print(" * Were computed a total of {} bigrams from dataset.\n".format(len_bigrams))

    # mask file name -> (hsl template, lightness range, contour colour).
    # One table replaces four copy-pasted closures and avoids rebinding the
    # selector function's own name.
    schemes = {
        "mapa_brasil_mask.png":
            ("hsl(190, 40%%, %d%%)", (30, 60), (219, 236, 240)),  # sky
        "thumbs_down_mask_3.png":
            ("hsl(0, 80%%, %d%%)", (30, 60), (250, 209, 209)),    # fake
        "thumbs_up_mask.png":
            ("hsl(130, 40%%, %d%%)", (30, 60), (219, 240, 223)),  # real
    }
    grey_scheme = ("hsl(0, 0%%, %d%%)", (60, 100), (219, 236, 240))
    template, (lightest, darkest), color_cont = schemes.get(mask, grey_scheme)

    def color_func(word, font_size, position, orientation,
                   random_state=None, **kwargs):
        return template % random.randint(lightest, darkest)

    # Count every bigram over the whole input.
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    bag_of_words = vectorizer.fit_transform(bigrams)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    words_dict = dict(words_freq)

    mask_array = np.array(Image.open(os.path.join("data", "img", mask)))

    cloud_of_words = WordCloud(height=1080,
                               width=1080,
                               max_font_size=110,
                               collocations=False,
                               background_color="white",
                               mask=mask_array,
                               contour_width=3,
                               contour_color=color_cont)
    cloud_of_words.generate_from_frequencies(words_dict)
    cloud_of_words.recolor(color_func=color_func, random_state=3)

    plt.figure(figsize=(12, 10))
    plt.imshow(cloud_of_words, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    return cloud_of_words, len_bigrams
# Configure the cloud, then generate it from the text.
wc = WordCloud(
    background_color='white',                 # background colour
    max_words=400,                            # maximum number of words shown
    mask=bg,                                  # mask picture
    max_font_size=60,                         # largest font size
    random_state=42,
    font_path='C:/Windows/Fonts/simkai.ttf',  # font
)
wc.generate(text1)

# Font properties for the picture.
my_font = fm.FontProperties(fname='C:/Windows/Fonts/simkai.ttf')

# Colour generator based on the mask picture.
image_colors = ImageColorGenerator(bg)

# Draw the recoloured cloud and hide its axes.
recoloured = wc.recolor(color_func=image_colors)
plt.imshow(recoloured)
plt.axis('off')

# Open a second figure and draw the mask picture there, without axes.
plt.figure()
plt.axis('off')
plt.imshow(bg, cmap=plt.cm.gray)

# Save the cloud.
wc.to_file('man.png')
# Read the whole file, keeping the list of lines as well as the joined text.
with open(file_path, 'r') as f:
    lines = f.readlines()
text = ''.join(lines)

# Keyword weights from the project's Keywords helper.
txt_freq = Keywords(text).tf_idf()

image_path = "/home/zxingoo/supertags/tags/material/shapes/pikachu.png"
mask = np.array(Image.open(image_path))

wc = WordCloud(
    font_path,                 # font
    background_color="black",  # background colour
    max_words=400,             # maximum number of words shown
    mask=mask,                 # mask picture
)
wc.generate_from_frequencies(txt_freq)

# Recolour with the colour function built from the same picture, then show.
picture_colours = image_colors_func(image_path)
plt.imshow(wc.recolor(color_func=picture_colours), interpolation="bilinear")

wc.to_file("/home/zxingoo/supertags/tags/temp/cartoon4.png")
def make_word_cloud(imagemaskurl, relative_scaling, nwords, text, title,
                    customstopwords, width, height, color, colormap,
                    maxfont, minfont, scale):
    """Render a word cloud and return it as a list of page components.

    Parameters
    ----------
    imagemaskurl : str or None
        Optional mask picture, either a ``data:image...;base64,`` URI or a
        URL fetched with ``requests``. When it cannot be loaded the cloud is
        drawn without a mask and ``text`` is replaced by
        ``'Invalid Image Mask!'``.
    relative_scaling, nwords, colormap, maxfont, minfont, scale :
        Forwarded to ``WordCloud`` (``nwords`` as ``max_words``, ``maxfont``
        and ``minfont`` as ``max_font_size`` and ``min_font_size``).
    text : str
        Text to build the cloud from.
    title : str
        Heading placed above the picture.
    customstopwords : iterable of str
        Extra stop words; each is added as-is, with ``s`` and with ``'s``.
    width, height : int
        Canvas size; replaced by the mask's size when a mask is loaded.
    color : str
        Background colour; ``''`` means ``None``.

    Returns
    -------
    list
        ``[H2(title), Img(<PNG data URI>)]``.
    """
    if imagemaskurl is not None and imagemaskurl != '':
        try:
            if imagemaskurl.startswith('data:image'):
                # The original referenced an undefined name ``imagemask``
                # here; the resulting NameError was swallowed by a bare
                # ``except`` and every data URI was reported as invalid.
                imgstr = re.search(r'base64,(.*)', imagemaskurl).group(1)
                b = base64.b64decode(imgstr)
            else:
                r = requests.get(imagemaskurl)
                b = r.content
            image_bytes = io.BytesIO(b)
            im = Image.open(image_bytes).convert('RGBA')
            # Paste onto an opaque white canvas, using the picture itself as
            # the paste mask.
            canvas = Image.new('RGBA', im.size, (255, 255, 255, 255))
            canvas.paste(im, mask=im)
            mask = np.array(canvas)
            width, height = im.size
        except Exception:
            # Best effort: any failure loading the mask degrades to a
            # mask-less cloud carrying an error message.
            mask = None
            text = 'Invalid Image Mask!'
    else:
        mask = None

    from wordcloud import STOPWORDS
    stop_words = list(STOPWORDS)
    for word in customstopwords:
        stop_words.append(word)
        stop_words.append(word + 's')
        stop_words.append(word + "'s")

    if color == '':
        color = None

    cloud = WordCloud(width=width, height=height, mask=mask,
                      background_color=color, stopwords=stop_words,
                      max_words=nwords, colormap=colormap,
                      max_font_size=maxfont, min_font_size=minfont,
                      random_state=42, scale=scale, mode='RGBA',
                      relative_scaling=relative_scaling).generate(text)

    # Colour the words from the mask picture when there is one. Failures
    # are ignored and the colormap colours are kept.
    if mask is not None:
        try:
            coloring = ImageColorGenerator(mask)
            cloud.recolor(color_func=coloring)
        except Exception:
            pass

    # Encode the rendered picture as a PNG data URI.
    image = cloud.to_image()
    byte_io = io.BytesIO()
    image.save(byte_io, 'PNG')
    byte_io.seek(0)
    data_uri = base64.b64encode(byte_io.getvalue()).decode('utf-8').replace('\n', '')
    src = 'data:image/png;base64,{0}'.format(data_uri)

    # Frequency bar chart, sorted by descending weight. The figure is built
    # but not returned: the component that displayed it is disabled.
    x = np.array(list(cloud.words_.keys()))
    y = np.array(list(cloud.words_.values()))
    order = np.argsort(y)[::-1]
    x = x[order]
    y = y[order]
    trace = go.Bar(x=x, y=y)
    layout = go.Layout(margin=go.Margin(l=10, r=00),
                       title='Relative frequency of words/bigrams')
    fig = go.Figure(data=[trace], layout=layout)

    children = [
        H2(title, className='card-title'),
        Img(src=src, width=image.size[0], height=image.size[1],
            style={'maxWidth': '100%', 'height': 'auto',
                   'margin': '0 auto', 'display': 'block'}),
    ]
    return children
my_wordcloud3 = WordCloud(
    background_color='white',
    max_words=5000,
    mask=coloring,
    max_font_size=50,
    random_state=20,
    scale=2,
    font_path='C:/Windows/Fonts/simhei.ttf',
).generate(nicknametext)


# In[16]:

# Two colour generators built from the same picture.
image_colors = ImageColorGenerator(coloring)
image_colors2 = ImageColorGenerator(coloring)


# In[17]:

# One figure per cloud: recolour, draw, hide the axes, save.
# Only the second and third files are saved with dpi=600.
for current_cloud, colour_source, target, save_options in (
        (my_wordcloud, image_colors, 'D:/4.jpg', {}),
        (my_wordcloud2, image_colors2, 'D:/5.jpg', {'dpi': 600}),
        (my_wordcloud3, image_colors, 'D:/6.jpg', {'dpi': 600}),
):
    plt.figure(figsize=(16, 9))
    plt.imshow(current_cloud.recolor(color_func=colour_source))
    plt.imshow(current_cloud)
    plt.axis('off')
    plt.savefig(target, **save_options)
def main(city, keyword, region, pages):
    """Scrape job listings, then plot a salary histogram and a word cloud.

    For each of ``pages`` result pages, every listing's detail page is
    fetched; one CSV row per job is appended to ``zl_<city>_<keyword>.csv``
    and the Chinese characters of its requirement text are appended to
    ``zl_<city>_<keyword>.txt``. Afterwards the salary column is shown as a
    histogram, and the requirement text is segmented with jieba, filtered
    through ``stopwords.txt`` and rendered as a word cloud saved to
    ``output.jpg``.

    Parameters
    ----------
    city, keyword, region
        Search parameters passed to ``get_one_page``; ``city`` and
        ``keyword`` also name the output files.
    pages : int
        Number of result pages to fetch (page indices ``0 .. pages-1``).
    """
    csv_filename = 'zl_' + city + '_' + keyword + '.csv'
    txt_filename = 'zl_' + city + '_' + keyword + '.txt'
    headers = ['职位', '经验', '学历', '公司', '规模', '月薪', '地点']
    write_csv_headers(csv_filename, headers)

    # Clean the text: keep only runs of Chinese characters so punctuation
    # and other symbols do not distort the word-frequency statistics.
    # [\u4E00-\u9FD5] is the Unicode range of Chinese characters.
    chinese_run = re.compile(r'[\u4E00-\u9FD5]+')

    for page in range(pages):
        # Collect every job on this page and write it to the CSV file.
        listing_html = get_one_page(city, keyword, region, page)
        for item in parse_one_page(listing_html):
            detail_html = get_detail_page(item.get('job_url'))
            job_detail = get_job_detail(detail_html)
            # A fresh dict per job, so no value can leak between rows.
            job_dict = {
                '职位': item.get('职位'),
                '经验': job_detail.get('years'),
                '学历': job_detail.get('education'),
                '公司': item.get('公司'),
                '规模': job_detail.get('scale'),
                '月薪': item.get('月薪'),
                '地点': item.get('地点'),
            }
            # A missing requirement is treated as empty text.
            requirement = job_detail.get('requirement') or ''
            write_txt_file(txt_filename,
                           ''.join(chinese_run.findall(requirement)))
            write_csv_rows(csv_filename, headers, job_dict)

    # --- Salary statistics ---
    # Skip the header cell ('月薪'), blank cells, and '0', which marks a
    # "negotiable" salary. Every remaining value, including the last row,
    # is converted to int.
    salaries = [
        int(value)
        for value in read_csv_column(csv_filename, 5)
        if value and value not in ('0', '月薪')
    ]
    # Show the distribution as a histogram.
    plt.hist(salaries, bins=10)
    plt.show()

    # --- Word frequencies of the job descriptions ---
    content = read_txt_file(txt_filename)
    words_df = pd.DataFrame({'segment': jieba.lcut(content)})
    # Drop common words.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep=" ", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # size() + reset_index(name=...) yields the same segment/计数 table as
    # the old dict-renaming agg(), which current pandas no longer accepts.
    words_stat = (
        words_df.groupby('segment')
        .size()
        .reset_index(name='计数')
        .sort_values(by='计数', ascending=False)
    )

    # --- Word cloud ---
    color_mask = imread('china.jfif')
    wordcloud = WordCloud(
        font_path='simhei.ttf',      # a font that can render Chinese
        background_color="white",    # background colour
        max_words=100,               # maximum number of words shown
        mask=color_mask,             # image used as the cloud mask
        max_font_size=100,           # largest font size
        random_state=42,
        # Default image size and word margin; per the original author's
        # note, the saved image follows the mask's size when one is used.
        width=1000, height=860, margin=2,
    )
    # generate() accepts raw text; frequencies are already computed here, so
    # generate_from_frequencies is used, e.g. {'经验': 34, '开发': 34, ...}.
    word_frequence = {
        word: count for word, count in words_stat.head(100).values
    }
    wordcloud.generate_from_frequencies(word_frequence)
    # Take the colours from the mask image and recolour the cloud.
    image_colors = ImageColorGenerator(color_mask)
    wordcloud.recolor(color_func=image_colors)
    # Save, then display.
    wordcloud.to_file('output.jpg')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
text = jiebaClearText(text) # 生成词云, 可以用generate输入全部文本(wordcloud对中文分词支持不好,建议启用中文分词),也可以我们计算好词频后使用generate_from_frequencies函数 cY.generate(text) #从背景图片生成颜色值 imageColors = ImageColorGenerator(backColoring) #绘制词云 plt.figure() plt.imshow(cY) plt.axis("off") plt.show() #保存图片 cY.to_file(path.join(dangQian, imgName1)) imageColors = ImageColorGenerator(backColoring) plt.imshow(cY.recolor(color_func= imageColors)) plt.axis("off") #绘制背景图片为颜色图片 plt.figure() plt.imshow(backColoring, cmap= plt.cm.gray) plt.axis('off') plt.show() #保存图片 cY.to_file(path.join(dangQian, imgName2))
def generatewordcloud(party, inputImageFileName, outputImageFileName):
    """Build an image-shaped word cloud from one party's text and save it.

    Rows of the module-level ``data`` frame whose ``Party`` equals ``party``
    are selected; their ``Text`` values are lower-cased and concatenated,
    and a few hyphenated terms are normalised so they are kept as single
    words. The cloud is masked and coloured by the input image, which is
    resized to 980x1080.

    Parameters
    ----------
    party
        Value compared against ``data.Party`` to select rows.
    inputImageFileName : str
        Path of the image used as mask and colour source.
    outputImageFileName : str
        Path the rendered word cloud is written to.
    """
    speakerData = data[data.Party == party]

    # Join once instead of growing a string row by row.
    allText = "".join(str(text).lower() + " " for text in speakerData['Text'])
    for needle, replacement in (("e-mail", "email"),
                                ("e- mail", "email"),
                                ("op-ed", "oped")):
        allText = allText.replace(needle, replacement)

    # Let PIL load image files that are truncated (process-wide setting).
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    # The context manager closes the file handle once the pixels are copied.
    # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
    with Image.open(inputImageFileName) as img:
        speakerArray = np.array(img.resize((980, 1080), Image.LANCZOS))

    wc = WordCloud(background_color="white", max_words=1000,
                   mask=speakerArray,
                   stopwords=STOPWORDS | stopwordshearing,
                   random_state=42)
    wc.generate(allText)
    # Function-call form works on both Python 2 and Python 3.
    print(wc.words_)

    # Colour the words from the image, then write the result.
    image_colors = ImageColorGenerator(speakerArray)
    wc.recolor(color_func=image_colors)
    wc.to_file(outputImageFileName)