import requests
from lxml import etree
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def Get_Movie_Comment(url):
    html = requests.get(url).content
    page = etree.HTML(html)
    comment_url = page.xpath("//*[@id='comments-section']/div[1]/h2/span/a/@href")[0]
    print("Comment_URL : " + comment_url)
    html = requests.get(comment_url).content
    page = etree.HTML(html)
    filename = 'jieba_data.txt'
    # 'w' creates the file if it does not exist and truncates any existing content before writing!
    with open(filename, 'w', encoding="utf8") as f:
        comment_info = []
        info = page.xpath("//*[@id='comments']")[0]
        for k in range(1, 21):
            # comment = {}
            # comment['name'] = info.xpath("./div[" + str(k) + "]/div[2]/h3/span[2]/a/text()")[0]
            # comment['href'] = info.xpath("./div[" + str(k) + "]/div[2]/h3/span[2]/a/@href")[0]
            comment = info.xpath("./div[" + str(k) + "]/div[2]/p/text()")[0]
            f.write(comment + "\n")
            comment_info.append(comment)
            print("=======================================================")
            print(comment)
    text_from_file_with_apath = open('jieba_data.txt', encoding='UTF-8').read()
    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
    wl_space_split = " ".join(wordlist_after_jieba)
    wc = WordCloud()
    wc.font_path = "simhei.ttf"  # SimHei, a Chinese sans-serif font
    my_wordcloud = wc.generate(wl_space_split)
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()
    return comment_info
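# A minimal usage sketch, assuming the target is a Douban-style movie page whose
# id='comments-section' block links to a full-comments page; the URL below is a
# hypothetical example, not from the original.
if __name__ == '__main__':
    comments = Get_Movie_Comment('https://movie.douban.com/subject/1292052/')
    print('Fetched %d comments' % len(comments))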
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator


def build_word_cloud(text, back_coloring_path=None, font_path=None, txt_freq=None, scale=5):
    """
    text: content of the word cloud
    back_coloring_path: path of a background image to use
    font_path: path of the font to use
    txt_freq: word-frequency weights; overrides the text content
    scale: image resolution
    """
    # Configure the word cloud
    wc = WordCloud(
        background_color="white",  # background color
        max_words=2000,            # maximum number of words shown
        max_font_size=100,         # maximum font size
        random_state=42,
        scale=scale,
        width=1000,
        height=860,
        margin=2,  # default image size; with a background image the saved image follows that image's size; margin is the spacing around words
    )
    wc.font_path = font_path if font_path else 'SIMHEI.TTF'
    back_coloring = None
    if back_coloring_path:
        back_coloring = np.array(Image.open(back_coloring_path))  # the original used imread()
        wc.mask = back_coloring
    if text:
        wc.generate(text)
    if txt_freq:
        wc.generate_from_frequencies(txt_freq)
    # Draw the word cloud with its default colors
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    wc.to_file('WordCloudDefautColors.png')
    # Draw the word cloud recolored from the background image
    if back_coloring_path:
        image_colors = ImageColorGenerator(back_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
    else:
        plt.imshow(wc)
    plt.axis("off")
    # Draw the background image itself, in grayscale
    plt.figure()
    if back_coloring_path:
        plt.imshow(back_coloring, cmap=plt.cm.gray)
    else:
        plt.imshow(wc)
    plt.axis("off")
    plt.show()
    # Save the image
    wc.to_file('WordCloudColorsByImg.png')
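# A usage sketch for build_word_cloud (the file names are hypothetical): first a
# plain text cloud, then a frequency-driven cloud shaped and colored by an image.
if __name__ == '__main__':
    build_word_cloud(open('article.txt', encoding='utf-8').read())
    build_word_cloud(None, back_coloring_path='bg.png',
                     txt_freq={'python': 10, 'wordcloud': 5, 'jieba': 3})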
import jieba
from wordcloud import WordCloud, STOPWORDS


def generate_word_cloud(word_text, stopword_list=(), is_chinese=False):
    stopword_set = set(STOPWORDS)
    for item in stopword_list:
        stopword_set.add(item)
    word_cloud = WordCloud(
        # mask=np.array(Image.open('assert/background.png')),
        stopwords=stopword_set,
        width=1000,
        height=618,
        max_words=3000,
        prefer_horizontal=0.99,
        # random_state=2333,
        # max_font_size=200,
        background_color='#EEEEEE',
    )
    if is_chinese:
        word_cloud.font_path = '../assert/msyh.ttf'
        word_cloud.max_words = 1000
        word_text = ' '.join(jieba.cut(word_text))  # segment Chinese text first
    word_cloud.generate(word_text)
    return word_cloud
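# Usage sketch for generate_word_cloud (file names hypothetical): the returned
# WordCloud object can then be saved or plotted by the caller.
cloud = generate_word_cloud(open('novel.txt', encoding='utf-8').read(),
                            stopword_list=['chapter'], is_chinese=True)
cloud.to_file('novel_cloud.png')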
from os import path

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# mask_img and abs_font_dir are module-level settings defined elsewhere
# (a hypothetical sketch follows the function)


def make_word_cloud(content):
    # Read the mask image
    d = path.dirname(__file__)
    # alice_mask = np.array(Image.open(path.join(d, "mask/terran.jpg")))
    mask = np.array(Image.open(path.join(d, mask_img)))
    wc = WordCloud(background_color="white", max_words=1000, mask=mask)
    # Give the absolute path of the font .ttf file; brush options include
    # 'shoujin_brush.ttf', 'Japan_brush.ttf', 'qingke_fangzheng.ttf', 'KouzanBrushFont.ttf'
    wc.font_path = abs_font_dir
    # Generate the word cloud from a {word: frequency} mapping
    wc.generate_from_frequencies(content)
    # wc.generate(text)
    # Store to file
    wc.to_file(path.join(d, "img/output.png"))
    # To serve from a web app, store to the static folder instead:
    # wc.to_file(path.join(d, "../static/output.png"))
    # Show the cloud and the grayscale mask
    plt.imshow(wc)
    plt.axis("off")
    plt.figure()
    plt.imshow(mask, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
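# Hypothetical values for the two module-level names assumed above (both paths
# are placeholders, not from the original):
mask_img = 'mask/terran.jpg'               # mask image, relative to this file
abs_font_dir = '/usr/share/fonts/lth.ttf'  # absolute path of the font file
# make_word_cloud({'python': 12, 'wordcloud': 8, 'mask': 5})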
import jieba.analyse
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def poltword(p):
    # dftolist(data) is the text list; cf.dftostr(data[p]) flattens it into one string
    tags = jieba.analyse.extract_tags(cf.dftostr(data[p]), topK=20)
    print("\n".join(tags))
    # words_ls = jieba.cut(cf.dftostr(data[p]), cut_all=True)
    # words_split1 = " ".join(words_ls)
    words_split2 = " ".join(tags)
    wc1 = WordCloud(width=1980, height=1680)
    wc1.font_path = "msyh.ttf"
    wc2 = WordCloud(
        width=400,
        height=200,
        background_color="#ffffff",  # background color
        max_words=500,       # maximum number of words (default 200)
        max_font_size=60,    # maximum font size
        min_font_size=10,    # minimum font size (default 4)
        colormap='bone',     # string or matplotlib colormap, default "viridis"
        random_state=10,     # number of random generation states, i.e. color schemes
        font_path='simhei.ttf')
    # my_wordcloud1 = wc1.generate(words_split1)
    my_wordcloud2 = wc2.generate(words_split2)
    plt.imshow(my_wordcloud2)
    plt.axis("off")
    plt.show()
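# poltword relies on two names outside this excerpt: `data` (the text
# collections) and the helper module `cf`. A hypothetical stand-in, only for
# trying the function out:
class cf:  # stand-in for the project-local cf module
    @staticmethod
    def dftostr(texts):
        return ' '.join(map(str, texts))

data = {'reviews': ['这家店的菜很好吃', '服务态度也很不错']}
# poltword('reviews')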
import random
import re
from os import path

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud


# The original excerpt begins mid-function; the signature below follows the
# keyword arguments WordCloud passes to a color_func, and h/s are assumed to be
# fixed hue/saturation values (they are not present in the excerpt).
def random_color_func(word=None, font_size=None, position=None, orientation=None,
                      font_path=None, random_state=None):
    h = 200  # hue (assumed value)
    s = 100  # saturation (assumed value)
    l = int(100.0 * float(random.randint(40, 140)) / 255.0)  # adjust lightness here
    return "hsl({}, {}%, {}%)".format(h, s, l)


content = open('lyrics.txt', encoding='utf-8').read()
# re.LOCALE cannot be combined with a str pattern in Python 3; re.ASCII keeps the
# intent of stripping ASCII word characters while leaving the Chinese text intact
p = re.compile(r'\w*', re.A)
newcontent = p.sub('', content)
newcontent = newcontent.replace('.', '')
newcontent = newcontent.replace('作词', '')  # strip the "lyricist" credit
newcontent = newcontent.replace('作曲', '')  # strip the "composer" credit
final = jieba.cut(newcontent, cut_all=False)
space_split = " ".join(final)
d = path.dirname(__file__)
# background = imread(path.join(d, "heart.png"))  # for a custom background image, add mask=background to cloudy
cloudy = WordCloud(
    background_color="white",  # background color
    max_words=250,             # maximum number of words shown
    max_font_size=50,          # maximum font size
    random_state=30,
    scale=2,
    relative_scaling=0.5,
    color_func=random_color_func,
)
cloudy.font_path = "Consolas+YaHei+hybrid.ttf"
my_wordcloud = cloudy.generate(space_split)
plt.imshow(my_wordcloud)
plt.axis("off")
plt.savefig('wordcloud.png', dpi=1000)
plt.show()
# coding:utf-8
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

# The mask image path was left empty in the original; supply one here
background_Image = plt.imread('')
text_from_file_with_apath = open('text.txt', 'rb').read()
wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)
# font_path must be set before generate() for Chinese text to render
my_wordcloud = WordCloud(background_color="white",
                         margin=0,
                         width=512,
                         height=512,
                         mask=background_Image,
                         max_words=2000,
                         max_font_size=64,
                         random_state=42,
                         font_path='msyh.ttf').generate(wl_space_split)
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
import os

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator


def generate_wordcloud(text_path, mask_path=None, width=400, height=400, lan='en',
                       font_path=None, want_words=None, stop_words=None, path_to_save='.'):
    """
    Generate a word cloud shaped by the mask picture you provide, plus a word
    cloud colored by that picture.
    :param text_path: text used to generate the words
    :param mask_path: picture you want the cloud to take the shape of
    :param width: width of the word cloud picture, if mask_path is not provided
    :param height: height of the word cloud picture, if mask_path is not provided
    :param lan: the language of your text
    :param font_path: if lan is 'cn', a Chinese font must be provided
    :param want_words: special words you don't want jieba to split
    :param stop_words: words you don't want to show up in your word cloud
    :param path_to_save: the directory (!! a directory, not a file) to save your word cloud in
    :return: no return
    """
    want_words = want_words or []
    stop_words = stop_words or []
    image_colors = None
    wc = WordCloud(background_color='white', max_words=1000, max_font_size=400, random_state=42)
    # check paths
    if not os.path.isfile(text_path):
        print('## text_path is invalid !!')
        return
    if lan == 'cn':
        # for Chinese text, a Chinese font path is required
        if font_path is None or not os.path.isfile(font_path):
            print('## chinese font_path cannot be null !!')
            return
        text = _generate_cn_words(text=open(text_path).read(),
                                  want_words=want_words,
                                  stop_words=stop_words)
        wc.font_path = font_path
    else:
        text = open(text_path).read()
        if font_path is not None and os.path.isfile(font_path):
            wc.font_path = font_path
    if mask_path is None and width > 0 and height > 0:
        wc.height = height
        wc.width = width
    elif os.path.isfile(mask_path):
        mask = np.array(Image.open(mask_path))
        wc.mask = mask
        image_colors = ImageColorGenerator(mask)
    else:
        print('## mask_path is invalid !!')
        return
    wc.generate(text=text)
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis('off')
    plt.show()
    if os.path.isdir(path_to_save):
        wc.to_file(os.path.join(path_to_save, 'words.png'))
    else:
        print('## path_to_save is invalid !!')
        return
    if mask_path is not None:
        # overlay: replace non-white word pixels with the mask picture's colors
        img1 = Image.open(os.path.join(path_to_save, 'words.png'))
        img2 = Image.open(mask_path)
        width = img1.size[0]
        height = img1.size[1]
        for i in range(0, width):
            for j in range(0, height):
                data1 = img1.getpixel((i, j))
                data2 = img2.getpixel((i, j))
                if data1[0] <= 250 or data1[1] <= 250 or data1[2] <= 250:
                    img1.putpixel((i, j), (data2[0], data2[1], data2[2], 255))
                # if (data1[0] == 255 and data1[1] == 255 and data1[2] == 255):
                #     img1.putpixel((i, j), (205, 205, 205, 255))
        plt.imshow(img1)
        plt.axis('off')
        plt.show()
        img1.save(os.path.join(path_to_save, 'wordcloud.png'))
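# _generate_cn_words is called above but not defined in this excerpt; a minimal
# sketch of what it might do, assuming jieba: register the words that must not
# be split, segment the text, and drop the stop words.
def _generate_cn_words(text, want_words, stop_words):
    for w in want_words:
        jieba.add_word(w)  # keep these words as single tokens
    words = [w for w in jieba.cut(text) if w.strip() and w not in stop_words]
    return ' '.join(words)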
import os

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# f_path and singer_id are assumed to be set by the caller; creat_folder,
# get_html, get_singer_info, get_lyric, write_text and all_lyric are
# project-local helpers (a sketch of all_lyric follows below)
creat_folder(f_path)
lyric_path = f_path
folder_path = f_path + '/'
# singer_id = 2116  # Eason Chan
start_url = 'http://music.163.com/artist?id={}'.format(singer_id)
html = get_html(start_url)
singer_infos = get_singer_info(html)
# Fetch the lyrics, skipping songs that have none (instrumentals have no
# lyrics, and a missing lyric would otherwise crash the program)
for singer_info in singer_infos:
    lyric = get_lyric(singer_info[1])
    if lyric is None:
        continue
    write_text(singer_info[0], lyric, folder_path)
all_lyric(lyric_path)
os.chdir(lyric_path)
text = ''
# Join all the lyrics into a single string to make segmentation easier
with open('all_lyric.txt', 'r', encoding='utf-8') as f:
    for line in f:
        text += line
words_ls = jieba.cut(text, cut_all=True)  # segment with jieba
words_split = ' '.join(words_ls)
wc = WordCloud(width=1980, height=1680)
wc.font_path = "simhei.ttf"
my_wordcloud = wc.generate(words_split)  # generate the word cloud
plt.imshow(my_wordcloud)
plt.axis("off")       # hide the axes
plt.savefig('lyric')  # save the image
plt.show()
# print('end')
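# all_lyric is one of the project-local helpers used above; a plausible sketch,
# assuming it merges every per-song lyric file in the folder into all_lyric.txt:
def all_lyric(folder):
    names = [n for n in os.listdir(folder)
             if n.endswith('.txt') and n != 'all_lyric.txt']
    with open(os.path.join(folder, 'all_lyric.txt'), 'w', encoding='utf-8') as out:
        for name in names:
            with open(os.path.join(folder, name), encoding='utf-8') as f:
                out.write(f.read() + '\n')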
import copy

import nltk
from nltk.collocations import BigramCollocationFinder
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def unknowncoll(filename='unknownwords.p', stem=False):
    """
    Word cloud from sentiment analysis.

    Finds the bi-collocation of unknown words (words without sentiment) and
    displays the 10 most common words based on frequency in a word cloud,
    colored green for words seen mostly in positive sentiments and red for
    the opposite. Comparison is made on all comments concatenated.
    -> filename: name of the file to load unknown words from
    -> stem: stem the words
    """
    # scraper, fixer, unknownsent, green_color_func and red_color_func are
    # project-local helpers; a sketch of the two color functions follows below
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    subreddits = scraper.load_data('sub-reddits.txt')
    fullcomment = []
    print('building comment')
    for name, data in subreddits.items():
        for sub_id, sub in data.items():
            fullcomment += [fixer(comment, True, stem).split() for comment in sub.comments]
    print('getting unknowns')
    unknownwords = unknownsent(filename)
    # flatten the comment structure
    fullcomment = [word for comment in fullcomment for word in comment]
    basefinder = BigramCollocationFinder.from_words(fullcomment)
    count = 0
    for unknown, unknownscore in unknownwords:
        finder = copy.copy(basefinder)
        print('\n' + unknown)
        # keep only bigrams that contain the unknown word
        finder.apply_ngram_filter(lambda w1, w2: unknown != w1 and unknown != w2)
        wordcloud = WordCloud()
        wordcloud.font_path = r'C:\Windows\Fonts\comic.ttf'
        # trick the wordcloud into accepting custom input
        wordcloud.generate('generate')
        colls = finder.score_ngrams(bigram_measures.raw_freq)
        colls = colls[:10]
        maximum = colls[1][1]
        # build the (word, score) pairs for the word cloud
        cloudwords = [(word, score) for ((word, _), score) in colls if word != unknown]
        cloudwords += [(word, score) for ((_, word), score) in colls if word != unknown]
        # normalize the scores
        cloudwords = [(word, score / maximum) for (word, score) in cloudwords]
        # tricking, part 2: fit_words expects a {word: frequency} dict
        wordcloud.fit_words(dict(cloudwords))
        wordcloud.to_image()
        if unknownscore > 0:
            wordcloud = wordcloud.recolor(color_func=green_color_func, random_state=3)
        else:
            wordcloud = wordcloud.recolor(color_func=red_color_func, random_state=3)
        count += 1
        plt.figure(count)
        plt.title(unknown)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig('plots/' + unknown + '.png', bbox_inches='tight')
        plt.close()
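# green_color_func and red_color_func are referenced above but not defined in
# this excerpt; a minimal sketch of what they could look like, following the
# keyword-argument signature WordCloud passes to color functions (random_state
# arrives as a random.Random instance when set via recolor):
def green_color_func(word=None, font_size=None, position=None, orientation=None,
                     font_path=None, random_state=None):
    return "hsl(120, 80%%, %d%%)" % random_state.randint(25, 50)  # shades of green


def red_color_func(word=None, font_size=None, position=None, orientation=None,
                   font_path=None, random_state=None):
    return "hsl(0, 80%%, %d%%)" % random_state.randint(25, 50)  # shades of red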