示例#1
0
def Get_Movie_Comment(url):
    """Scrape the first 20 short comments of a movie page, save them to
    'jieba_data.txt', render a word cloud, and return the comments.

    :param url: movie detail-page URL
    :return: list of the comment strings that were written to disk
    """
    html = requests.get(url).content
    page = etree.HTML(html)
    # Anchor pointing at the dedicated comments page.
    comment_url = page.xpath(
        "//*[@id='comments-section']/div[1]/h2/span/a/@href")[0]
    print("Comment_URL : " + comment_url)

    html = requests.get(comment_url).content
    page = etree.HTML(html)

    comment_info = []
    filename = 'jieba_data.txt'
    # 'w' creates the file if missing and truncates any existing content.
    with open(filename, 'w', encoding="utf8") as f:
        info = page.xpath("//*[@id='comments']")[0]
        for k in range(1, 21):
            comment = info.xpath("./ div[" + str(k) +
                                 "] / div[2] / p/text()")[0]
            f.write(comment + "\n")
            # Bug fix: the list was never populated, so the function
            # always returned an empty result.
            comment_info.append(comment)
            print("=======================================================")
            print(comment)

    # Re-read the saved comments; the original leaked this file handle.
    with open(filename, encoding='UTF-8') as fh:
        text_from_file_with_apath = fh.read()

    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
    wl_space_split = " ".join(wordlist_after_jieba)
    wc = WordCloud()
    wc.font_path = "simhei.ttf"  # SimHei: covers CJK glyphs
    my_wordcloud = wc.generate(wl_space_split)
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()
    return comment_info
示例#2
0
def build_word_cloud(text, back_coloring_path=None, font_path=None, txt_freq=None, scale=5):
    """Render word clouds and save two PNG files.

    text: raw text for the cloud
    back_coloring_path: optional background image used as mask and color source
    font_path: font file to use (defaults to 'SIMHEI.TTF')
    txt_freq: word->weight mapping; overrides `text` when supplied
    scale: rendering scale (image sharpness)
    """
    wc = WordCloud(
        background_color="white",   # canvas color
        max_words=2000,             # cap on rendered words
        max_font_size=100,          # largest font size
        random_state=42,            # deterministic layout
        scale=scale,
        # Default canvas size; when a mask image is set, the saved image
        # follows the mask's dimensions. margin = spacing between words.
        width=1000, height=860, margin=2,
    )

    wc.font_path = font_path if font_path else 'SIMHEI.TTF'

    back_coloring = None
    if back_coloring_path:
        back_coloring = imread(back_coloring_path)
        wc.mask = back_coloring

    if text:
        wc.generate(text)

    # Frequencies take precedence when both text and txt_freq are given.
    if txt_freq:
        wc.generate_from_frequencies(txt_freq)

    # Bug fix: the original computed ImageColorGenerator here and discarded
    # the result (it was recomputed below before any use) — removed.

    # First figure: default-colored cloud.
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    wc.to_file('WordCloudDefautColors.png')

    # Recolor the cloud from the background image, if one was provided.
    if back_coloring_path:
        image_colors = ImageColorGenerator(back_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
    else:
        plt.imshow(wc)
    plt.axis("off")

    # Second figure: show the background image itself (or the cloud again).
    plt.figure()
    if back_coloring_path:
        plt.imshow(back_coloring, cmap=plt.cm.gray)
    else:
        plt.imshow(wc)
    plt.axis("off")
    plt.show()
    # Save the image-colored version.
    wc.to_file('WordCloudColorsByImg.png')
示例#3
0
def generate_word_cloud(word_text, stopword_list=(), is_chinese=False):
    """Build and return a WordCloud for *word_text*.

    word_text: the text to render
    stopword_list: extra stopwords merged into wordcloud's STOPWORDS
    is_chinese: when True, segment the text with jieba and switch to a
        CJK-capable font
    """
    # Default changed from `list()` (a shared mutable default) to `()`;
    # the argument is only iterated, so callers are unaffected.
    stopword_set = set(STOPWORDS) | set(stopword_list)
    word_cloud = WordCloud(
        stopwords=stopword_set,
        width=1000,
        height=618,
        max_words=3000,
        prefer_horizontal=0.99,  # almost no vertically-drawn words
        background_color='#EEEEEE',
    )
    if is_chinese:
        # CJK text needs an explicit font; jieba inserts the spaces the
        # tokenizer expects.
        word_cloud.font_path = '../assert/msyh.ttf'
        word_cloud.max_words = 1000
        word_text = ' '.join(jieba.cut(word_text))
    word_cloud.generate(word_text)
    return word_cloud
def make_word_cloud(content):
    """Render a word cloud from a word->frequency mapping.

    Writes the image to img/output.png next to this module, then displays
    the cloud and the mask image with matplotlib.

    content: mapping of word -> frequency, passed to
        generate_from_frequencies.
    """
    d = path.dirname(__file__)
    # The mask image controls the cloud's shape. `mask_img` is a
    # module-level relative path — TODO confirm it is defined at import.
    mask = np.array(Image.open(path.join(d, mask_img)))

    wc = WordCloud(background_color="white", max_words=1000, mask=mask)
    # `abs_font_dir` must be an absolute path to a .ttf font file so the
    # cloud renders non-ASCII glyphs correctly.
    wc.font_path = abs_font_dir

    wc.generate_from_frequencies(content)

    # Persist the rendered cloud next to this module.
    wc.to_file(path.join(d, "img/output.png"))

    # Show the cloud, then the raw mask in a second figure.
    plt.imshow(wc)
    plt.axis("off")
    plt.figure()
    plt.imshow(mask, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
示例#5
0
def poltword(p):
    """Extract the top keywords of column *p* from the module-level `data`
    and draw them as a word cloud.

    p: column key into `data`; cf.dftostr flattens the column to a string.
    """
    # Top-20 keywords ranked by TF-IDF.
    tags = jieba.analyse.extract_tags(cf.dftostr(data[p]), topK=20)
    print("\n".join(tags))

    # Bug fix: the original also built a second WordCloud (wc1, with a
    # trailing space in its font path) that was never used — removed.
    keyword_text = " ".join(tags)
    wc = WordCloud(
        width=400,
        height=200,
        background_color="#ffffff",  # canvas color
        max_words=500,   # word cap (library default is 200)
        max_font_size=60,
        min_font_size=10,  # library default is 4
        colormap='bone',   # string or matplotlib colormap, default="viridis"
        random_state=10,   # number of random color schemes
        font_path='simhei.ttf')

    cloud = wc.generate(keyword_text)
    plt.imshow(cloud)
示例#6
0
    l = int(100.0 * float(random.randint(40, 140)) / 255.0)  # 在此处修改亮度
    return "hsl({}, {}%, {}%)".format(h, s, l)


# Read the lyrics; the original leaked this file handle.
with open('lyrics.txt', encoding='utf-8') as f:
    content = f.read()

# Bug fix: re.L (LOCALE) is invalid with str patterns on Python 3 and
# raises ValueError. re.A makes \w ASCII-only, reproducing the original
# intent of stripping ASCII word characters while keeping CJK text.
p = re.compile(r'\w*', re.A)
newcontent = p.sub('', content)
newcontent = newcontent.replace('.', '')
newcontent = newcontent.replace('作词', '')  # "lyricist" credit token
newcontent = newcontent.replace('作曲', '')  # "composer" credit token

final = jieba.cut(newcontent, cut_all=False)
space_split = " ".join(final)

d = path.dirname(__file__)
# For a custom shape, load an image and pass mask=background to WordCloud.
cloudy = WordCloud(
    background_color="white",
    max_words=250,         # cap on rendered words
    max_font_size=50,      # largest font size
    random_state=30,       # deterministic layout
    scale=2,
    relative_scaling=0.5,  # balance frequency vs. rank in sizing
    color_func=random_color_func,
)
cloudy.font_path = "Consolas+YaHei+hybrid.ttf"
my_wordcloud = cloudy.generate(space_split)
plt.imshow(my_wordcloud)
plt.axis("off")
plt.savefig('wordcloud.png', dpi=1000)
plt.show()
示例#7
0
#coding:utf-8
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

# NOTE(review): the original called plt.imread('') with an empty path,
# which raises immediately. Set a real image path to use a mask, or keep
# it empty for a plain rectangular cloud.
background_image_path = ''
background_Image = (plt.imread(background_image_path)
                    if background_image_path else None)

with open('text.txt', 'rb') as f:
    text_from_file_with_apath = f.read()
wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)

# Bug fix: font_path must be given before generate(); the original
# assigned it afterwards, which had no effect on the rendered cloud.
my_wordcloud = WordCloud(background_color="white",
                         margin=0,
                         width=512,
                         height=512,
                         mask=background_Image,
                         max_words=2000,
                         max_font_size=64,
                         random_state=42,
                         font_path='msyh.ttf').generate(wl_space_split)
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
示例#8
0
文件: word_cloud.py 项目: XuLw/pypypy
def generate_wordcloud(text_path,
                       mask_path=None,
                       width=400,
                       height=400,
                       lan='en',
                       font_path=None,
                       want_worlds=None,
                       stop_words=None,
                       path_to_save='.'):
    """
    generate a word cloud of the mask picture you provide and a word cloud color by your picture
        :param text_path: use to generate words
        :param mask_path: picture you want to show
        :param width: the width of the word cloud picture, if mask_path is not provided
        :param height: the height of the word cloud picture, if mask_path is not provided
        :param lan: the language of your text
        :param font_path: if lan is 'cn', a chinese font must provide
        :param want_worlds:  the special word you don't want to separate
        :param stop_words:  the words you don't want to show up in your word cloud
        :param path_to_save: the directory you want to save your word cloud (!! is directory not file)
        :return: no return
    """
    # Defaults changed from mutable [] to None (shared-mutable-default
    # pitfall); callers see identical behavior.
    if want_worlds is None:
        want_worlds = []
    if stop_words is None:
        stop_words = []

    image_colors = None
    wc = WordCloud(background_color='white',
                   max_words=1000,
                   max_font_size=400,
                   random_state=42)
    # check path
    if not os.path.isfile(text_path):
        print('## text_path is invalid !!')
        return

    if lan == 'cn':
        # Chinese needs a valid font. Guard font_path=None explicitly:
        # os.path.isfile(None) raises TypeError.
        if not font_path or not os.path.isfile(font_path):
            print('## chinese font_path cannot be null !!')
            return
        with open(text_path) as fh:
            text = _generate_cn_words(text=fh.read(),
                                      want_words=want_worlds,
                                      stop_words=stop_words)
        wc.font_path = font_path
    else:
        with open(text_path) as fh:
            text = fh.read()
        # Bug fix: the original condition was inverted ("if NOT isfile"),
        # so a valid font was only applied when the path was invalid.
        if font_path and os.path.isfile(font_path):
            wc.font_path = font_path

    if mask_path is None and width > 0 and height > 0:
        wc.height = height
        wc.width = width
    elif os.path.isfile(mask_path):
        mask = np.array(Image.open(mask_path))
        wc.mask = mask
        image_colors = ImageColorGenerator(mask)
    else:
        print('## mask_path is invalid !!')
        return

    wc.generate(text=text)
    # With image_colors=None, recolor falls back to the cloud's own
    # color function.
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis('off')
    plt.show()

    if os.path.isdir(path_to_save):
        wc.to_file(os.path.join(path_to_save, 'words.png'))
    else:
        print('## path_to_save is invalid !!')
        return

    if mask_path is not None:
        # Blend the mask image into the non-background pixels of the
        # saved cloud. NOTE(review): assumes the mask image is at least
        # as large as the cloud and is RGB/RGBA — TODO confirm.
        img1 = Image.open(os.path.join(path_to_save, 'words.png'))
        img2 = Image.open(mask_path)
        width = img1.size[0]
        height = img1.size[1]

        for i in range(0, width):
            for j in range(0, height):
                data1 = img1.getpixel((i, j))
                data2 = img2.getpixel((i, j))
                # Pixels that are not pure white belong to a word.
                if data1[0] <= 250 or data1[1] <= 250 or data1[2] <= 250:
                    img1.putpixel((i, j), (data2[0], data2[1], data2[2], 255))

        plt.imshow(img1)
        plt.axis('off')
        plt.show()
        img1.save(os.path.join(path_to_save, 'wordcloud.png'))
示例#9
0
    creat_folder(f_path)
    lyric_path = f_path
    foler_path = f_path + '/'
    #singer_id=2116#eason
    start_url = 'http://music.163.com/artist?id={}'.format(singer_id)
    html = get_html(start_url)
    singer_infos = get_singer_info(html)
    #获取歌词,同时判断是否有歌词(纯音乐没有歌词),没有歌词会导致程序异常停止(没有歌词就没有必要获取歌词了)
    for singer_info in singer_infos:
        lyric = get_lyric(singer_info[1])
        if lyric == None:
            continue
        write_text(singer_info[0], lyric, foler_path)
    all_lyric(lyric_path)
    os.chdir(lyric_path)
    text = ''
    #将所有的歌词拼成一个字符串,方便后面进行分词
    with open('all_lyric.txt', 'r', encoding='utf-8') as f:
        for line in f:
            text += line
    words_ls = jieba.cut(text, cut_all=True)  #利用jieba进行分词
    words_split = ' '.join(words_ls)
    wc = WordCloud(width=1980, height=1680)
    wc.font_path = "simhei.ttf"
    my_wordcloud = wc.generate(words_split)  #生成词云
    plt.imshow(my_wordcloud)
    plt.axis("off")  #关闭坐标轴
    plt.savefig('lyric')  #save image
    plt.show()
    # print('end')
示例#10
0
def unknowncoll(filename='unknownwords.p', stem=False):
    """
    Word cloud from sentiment analysis.
    
    Finds the bi-collocation of unknown words (words without sentiment) 
    and displays the 10 most common words based on frequency in a word-cloud, 
    colored green for words seen mostly in positive sentiments and red 
    for the opposite. Comparison is made on all comments concatenated
    
    -> filename: name of the file to load unknown words from
    -> stem: stem the words
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    subreddits = scraper.load_data('sub-reddits.txt')
    fullcomment = []
    
    print 'building comment'
    for name, data in subreddits.items():
        for sub_id, sub in data.items():
            fullcomment += [fixer(comment, True, stem).split() for comment in sub.comments]

    print 'getting unknowns'
    unknownwords = unknownsent(filename)
    
    #flatten the comment structure
    fullcomment = [word for comment in fullcomment for word in comment]
    
    basefinder = BigramCollocationFinder.from_words(fullcomment)
    count = 0
    
    for unknown, unknownscore in unknownwords:
        finder = copy.copy(basefinder)
        
        print '\n' + unknown
        #only bigrams that contain the unknown word
        finder.apply_ngram_filter(lambda w1, w2: unknown != w1 and unknown != w2)
        
        wordcloud = WordCloud()
        wordcloud.font_path = 'C:\Windows\Fonts\comic.ttf'
        #trick the wordcloud to accept custom input
        wordcloud.generate('generate')
        
        colls = finder.score_ngrams(bigram_measures.raw_freq)
        colls = colls[:10]        
        maximum = colls[1][1]
        
        #generate the tuple (word, score) for the wordcloud
        cloudwords = [(word, score) for ((word, _), score) in colls if word != unknown]
        cloudwords += [(word, score) for ((_, word), score) in colls if word != unknown]
        
        #normalize the scores
        cloudwords = [(word, score / maximum) for (word, score) in cloudwords]
        
        #tricking part 2.
        wordcloud.fit_words(cloudwords)
        wordcloud.to_image()
        if(unknownscore > 0):
            wordcloud = wordcloud.recolor(color_func=green_color_func, random_state=3)
        else:
            wordcloud = wordcloud.recolor(color_func=red_color_func, random_state=3)
        
        count += 1
        plt.figure(count)
        plt.title(unknown)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig('plots/' + unknown + '.png', bbox_inches='tight')
        plt.close()