Пример #1
0
def main():
    """Build two word clouds from the Wikipedia 'Web scraping' article.

    Saves a plain rectangular cloud to wordcloud1.png and a thumb-shaped
    cloud (masked by upvote.png) to upvote_wordcloud.png.
    """
    wiki = wikipedia.page('Web scraping')
    text = wiki.content
    # Strip section headings like "== History ==" before rendering.
    text = re.sub(r'==.*?==+', '', text)
    # Bug fix: replace newlines with spaces; deleting them outright glued
    # the last word of one line onto the first word of the next.
    text = text.replace('\n', ' ')

    # Plain cloud on a black background.
    wordcloud = WordCloud(width=3000,
                          height=2000,
                          random_state=1,
                          background_color='black',
                          colormap='Set2',
                          collocations=False,
                          stopwords=STOPWORDS).generate(text)
    wordcloud.to_file("wordcloud1.png")

    # Thumb-shaped cloud: upvote.png supplies the mask shape.
    mask = np.array(Image.open('upvote.png'))
    wordcloud = WordCloud(width=3000,
                          height=2000,
                          random_state=1,
                          background_color='black',
                          colormap='Set2',
                          collocations=False,
                          stopwords=STOPWORDS,
                          mask=mask).generate(text)
    # Bug fix: write to a new file; saving back to upvote.png would
    # overwrite the mask image itself.
    wordcloud.to_file("upvote_wordcloud.png")
Пример #2
0
def wordCloud():
    """Render a word cloud of tag frequencies shaped by an image mask.

    Reads one tag per line from a text file, prints per-word frequencies
    from a CSV (column "Tag"), then saves and shows the masked cloud.
    """
    # Bug fix: `with` closes the handle; the original leaked it.
    with open("path/to/file", "r") as fh:
        tags = fh.read().split("\n")

    # CSV used only to display how often each word occurs.
    dataset = pd.read_csv("path/to/file/file_name.csv")
    print(
        pd.Series(np.concatenate([word.split()
                                  for word in dataset.Tag])).value_counts())

    # Counter replaces the manual increment loop with identical results.
    tagFreq = Counter(tags)

    # Cloud mask from the local machine.
    mask = np.array(
        Image.open(path.join("path/to/mask/file", "mask_file_name.png")))

    wordcloud = WordCloud(width=1000,
                          height=900,
                          background_color='white',
                          relative_scaling=.8,
                          mask=mask).generate_from_frequencies(tagFreq)
    # Save the word cloud as a png file, then display it.
    wordcloud.to_file("myfile.png")
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Пример #3
0
def get_wordcloud(image, font, sw, word, result):
    """Generate a masked word cloud recolored from the mask image.

    Args:
        image: array used both as the shape mask and the color source.
        font: path to a .ttf/.ttc font file.
        sw: stopwords collection passed straight to WordCloud.
        word: text the cloud is generated from.
        result: output image path for to_file().
    """
    wordcloud = WordCloud(scale=15, font_path=font, mask=image, stopwords=sw,
                          background_color='white', max_words=80000,
                          max_font_size=10, random_state=42)
    wordcloud.generate(word)
    # Recolor the words to match the mask image's own colors.
    img_colors = ImageColorGenerator(image)
    plt.imshow(wordcloud.recolor(color_func=img_colors))
    # Bug fix: the original drew the un-recolored cloud on top here,
    # discarding the recolor; that redundant imshow is removed.
    plt.axis('off')
    plt.show()
    wordcloud.to_file(result)
    print('Task Done!')
Пример #4
0
def Plot_WordCloud(text, catdog):
    """Render, display and save a word cloud for *text* named after *catdog*."""
    # Bug fix: the original string had no {} placeholder, so the
    # .format(catdog) argument never appeared in the message.
    print("\nOK. The wordcloud for {}\n".format(catdog))
    wordcloud = WordCloud(background_color="black", width=1600,
                          height=800).generate(text)
    # Display the generated image on a black figure background.
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    wordcloud.to_file(graph_directory + catdog + ".png")
    def create_topic_wordcloud_img(self, topic, frec):
        """Create and save a word-cloud image for one topic.

        Args:
            topic: topic identifier used in the output filename.
            frec: word frequencies as (word, weight) pairs, e.g.
                [('hi', 0.01), ('bye', 0.5), ('wow', 0.3)].
        """
        import wordcloud

        # Bug fix: don't rebind the name `wordcloud` — the original
        # shadowed the module it had just imported.
        cloud = wordcloud.WordCloud(
            prefer_horizontal=1,
            ranks_only=True,
            background_color='white',
            mask=imread('topic_distribution_visualization/red-circle.png'))
        # fit_words expects a {word: frequency} mapping; dict() converts
        # the (word, weight) pair list without changing its content.
        cloud.fit_words(dict(frec))
        picture_path = '%stopic_wordcloud/topic%s.png' % (
            self._viz_output_path, topic)
        cloud.to_file(picture_path)
Пример #6
0
def show_wordcloud(data, title=None):
    """Show and save a small word cloud built from str(data).

    Args:
        data: any object; its str() form is fed to the cloud.
        title: optional figure title.
    """
    wordcloud = WordCloud(
        background_color='beige',
        stopwords=stopwords_c,
        max_words=60,
        max_font_size=30,
        scale=3,
        random_state=1  # fixed seed for a reproducible layout
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()
    # Bug fix: the original literal mixed "\\" with a bare "\C" (an
    # invalid escape sequence); the raw string keeps the identical path
    # value with every backslash literal.
    wordcloud.to_file(r"D:\1. Merene\NLP\Challenge 4\C_W_Cloud_st.png")
Пример #7
0
    def get_wc_data(self, request, pk='reviewID'):
        """Build one word-cloud image per month (Jan-Dec) for a business.

        Saves each cloud as ./images/image{i}.png, records a
        WordCloudPhoto row per month, and returns all rows serialized.
        """
        stopwords = set(STOPWORDS)
        photos = []

        for i in range(12):
            reviews = YelpReviews.objects.filter(business=pk,
                                                 date__month=i + 1)
            # Bug fix: start fresh each month — the original accumulated
            # text across iterations, so December's cloud contained the
            # whole year's reviews.
            month_text = "".join(x.review for x in reviews)
            if not month_text:
                # WordCloud.generate raises on empty input; skip months
                # with no reviews instead of crashing.
                continue

            wordcloud = wc(stopwords=stopwords,
                           max_words=25,
                           background_color="white").generate(month_text)
            wordcloud.to_file("./images/image{}.png".format(i))
            photo = WordCloudPhoto(title="x",
                                   image="images/image{}.png".format(i))
            photo.save()
            photos.append(photo)

        # Bug fix: serialize every month's photo; the original passed a
        # single instance with many=True, which the serializer cannot
        # iterate, and dropped all but the last month anyway.
        serializer = WordCloudPhotoSerializer(photos, many=True)
        response = {'message': 'Word Cloud Data', 'result': serializer.data}
        return Response(response, status=status.HTTP_200_OK)
Пример #8
0
def generate_pic(frequency, name):
    """Render a frequency mapping as a word cloud and save it as a PNG.

    Args:
        frequency: mapping of word -> weight used to size each word.
        name: output file stem; the image is written to "<name>.png".
    """
    cloud = WordCloud(
        width=900,                        # image width, default 400 px
        height=383,                       # image height, default 200 px
        background_color='black',         # canvas color, default black
        font_path='Microsoft Yahei.ttf',  # font file to render with
        font_step=1,                      # font-size step, default 1
        min_font_size=4,                  # smallest font size, default 4
        max_font_size=None,               # largest size; auto from height
        max_words=30,                     # word cap, default 200
        scale=15,                         # >1 = denser, sharper output
        prefer_horizontal=0.9,            # fraction laid out horizontally
        relative_scaling=0,               # 0: size by rank; 1: by frequency
        collocations=False,               # drop two-word collocations
        mask=None)
    # Size the words from the supplied frequencies, then save.
    cloud.generate_from_frequencies(frequency)
    cloud.to_file('%s.png' % (name))
Пример #9
0
"""爬取「后浪」弹幕"""
import requests
import re
import wordcloud
import os
os.chdir(r'.\Module\requests\images')
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
}
res = requests.get(
    "https://api.bilibili.com/x/player/pagelist?bvid=BV1FV411d7u7&jsonp=jsonp",
    headers=headers,
    verify=False)
cid = res.json()['data'][0]['cid']
print(cid)
danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
result = requests.get(danmu_url, headers=headers, verify=False).content.decode(
    'utf-8')  # 如果不设置'verify=False',会引起 SSLError
pattern = re.compile('<d.*?>(.*?)</d>')
danmu_list = pattern.findall(result)
wordcloud = wordcloud.WordCloud(font_path='msyh.ttc', width=900,
                                height=1600).generate("".join(danmu_list))
wordcloud.to_file('wordcloud.png')
Пример #10
0
# Perf fix: fetch the stopword list once and put it in a set — the
# original called stopwords.get_words() (twice!) for every single word,
# doing an O(n) list scan each time.
stop_words = set(stopwords.get_words())
wordmap = {}
for w in wordList:
    # Equivalent to the original if/elif pair: count w unless its
    # lowercase form is a stopword.
    if w.lower() not in stop_words:
        wordmap[w] = wordmap.get(w, 0) + 1
t = OrderedDict(sorted(wordmap.items(), key=lambda x: x[1], reverse=True))
print(type(t))
# Render the frequency map as a word cloud image.
wordcloud = WordCloud(
    width=1000, height=500).generate_from_frequencies(wordmap)
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
wordcloud.to_file("cloud.png")
plt.close()
print("done")
with open(file_name, 'a') as outfile:
    outfile.seek(0)
    outfile.truncate()
    outfile.write("\n" + "Favortes Given" + "\n")
    for key in favorites_given:
        outfile.write(str(users[key].encode('utf-8') + " ".encode("utf-8") +
                      str(favorites_given[key]).encode("utf-8") + "\n".encode("utf-8"),"utf-8"))
    outfile.write("\n" + "Favorites Received" + "\n")
    for key in favorites_received:
        outfile.write(str(users[key].encode('utf-8') + " ".encode("utf-8") +
                      str(favorites_received[key]).encode("utf-8") + "\n".encode("utf-8"),"utf-8"))
    outfile.write("\n" + "Ratio" + "\n")
    for key in favorites_received:
Пример #11
0
    j = j + 1
    text += " " + col[6]
    if "Central Time (US & Canada)" in col[5]: text1 += " " + col[6]
    if "Eastern Time (US & Canada)" in col[5]: text2 += " " + col[6]
    if "Pacific Time (US & Canada)" in col[5]: text3 += " " + col[6]
counter = 2
# Shared stopword set: standard STOPWORDS plus URL fragments and
# surrogate-pair artifacts left behind by emoji in the tweet text.
stopwords = set(STOPWORDS)
stopwords.update(["https", "http", "ud83d", "ude2d", "ude02", "co"])


def _make_cloud(source_text, filename):
    """Generate a white-background cloud from source_text and save it."""
    cloud = WordCloud(stopwords=stopwords,
                      max_font_size=50,
                      max_words=100,
                      background_color="white").generate(source_text)
    cloud.to_file(filename)
    return cloud


# One cloud for all tweets plus one per US time zone. The four results
# stay bound to the same names the script used before, in case later
# code references them.
wordcloud = _make_cloud(text, "first_review.png")
wordcloud1 = _make_cloud(text1, "second_review.png")
wordcloud2 = _make_cloud(text2, "third_review.png")
wordcloud3 = _make_cloud(text3, "fourth_review.png")
Пример #12
0
# Directory of per-cluster CSV exports; one word cloud per file.
cluster_content = '/Users/liziyang/Downloads/CaseWestern-master/cluster_content'
content_list = os.listdir(cluster_content)

# Boilerplate phrases stripped from the report text before clouding.
# NOTE: removing 'the' first means the later 'the suspect'/'the victim'
# entries match nothing, and 'victim' already covers 'victim states' —
# this mirrors the original (ineffective) replacement order.
_BOILERPLATE = ('and', 'the', 'victim', 'the suspect', 'the victim',
                'sex crime', 'victim states')

for elem in content_list:
    complete_name = os.path.join(cluster_content, elem)
    cluster_name = elem.replace('.csv', '')
    print(cluster_name)
    # Bug fix: `with` closes the handle; the original leaked it.
    with open(complete_name, 'r') as fh:
        f = fh.read().lower()
    # Bug fix: str.replace returns a new string — the original discarded
    # every result, so none of the phrases were actually removed.
    for phrase in _BOILERPLATE:
        f = f.replace(phrase, '')

    wordcloud = WordCloud(background_color="white",
                          width=1000,
                          height=860,
                          margin=2,
                          stopwords=stopwords).generate(f)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(cluster_name)
    plt.show()
    # Bug fix: every iteration overwrote test.png; name each saved image
    # after its cluster instead.
    wordcloud.to_file(cluster_name + '.png')