Exemplo n.º 1
0
def main(city, keyword, region, pages):
    """Crawl job listings, dump them to CSV/TXT files, then plot a salary
    histogram and a word cloud of the job requirements.

    :param city: city name used in the search and in the output file names.
    :param keyword: job keyword to search for.
    :param region: region filter passed to the page fetcher.
    :param pages: number of result pages to crawl.
    """
    csv_filename = 'zl_' + city + '_' + keyword + '.csv'
    txt_filename = 'zl_' + city + '_' + keyword + '.txt'
    headers = [
        'job', 'years', 'education', 'salary', 'company', 'scale', 'job_url'
    ]
    salaries = []

    write_csv_headers(csv_filename, headers)
    # Regex keeping only CJK characters, so punctuation etc. cannot skew
    # the word-frequency statistics.  Compiled once instead of per item.
    pattern = re.compile(r'[一-龥]+')
    for i in range(pages):
        # Fetch every job on this page and append it to the CSV file.
        html = get_one_page(city, keyword, region, i)
        items = parse_one_page(html)
        for item in items:
            html = get_detail_page(item.get('job_url'))
            job_detail = get_job_detail(html)

            # Fresh dict per row so rows can never leak values into
            # each other.
            job_dict = {
                'job': item.get('job'),
                'years': job_detail.get('years'),
                'education': job_detail.get('education'),
                'salary': item.get('salary'),
                'company': item.get('company'),
                'scale': job_detail.get('scale'),
                'job_url': item.get('job_url'),
            }

            filterdata = re.findall(pattern, job_detail.get('requirement'))
            write_txt_file(txt_filename, ''.join(filterdata))
            write_csv_rows(csv_filename, headers, job_dict)

    sal = read_csv_column(csv_filename, 3)
    # Skip the header row and convert the rest to int.
    # BUG FIX: the original tested sal[i] but appended sal[i + 1], so the
    # '0' filter was applied to the wrong element.
    for value in sal[1:]:
        # A salary of '0' means the posting said "negotiable"; skip it.
        if value != '0':
            salaries.append(int(value))

    plt.hist(
        salaries,
        bins=10,
    )
    plt.show()

    content = read_txt_file(txt_filename)
    segment = jieba.lcut(content)
    words_df = pd.DataFrame({'segment': segment})

    stopwords = pd.read_csv("stopwords.txt",
                            index_col=False,
                            quoting=3,
                            sep=" ",
                            names=['stopword'],
                            encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Word-frequency table.  FIX: renaming after .agg avoids the
    # dict-of-name aggregation form that pandas removed in 0.25.
    words_stat = words_df.groupby(by=['segment'])['segment'].agg(
        numpy.size).rename("计数").reset_index()
    words_stat = words_stat.sort_values(by=["计数"], ascending=False)

    # Word-cloud appearance settings.
    color_mask = imread('background.jfif')
    wordcloud = WordCloud(
        font_path="simhei.ttf",  # font that can render Chinese glyphs
        background_color="white",  # background colour
        max_words=100,  # maximum number of words shown
        mask=color_mask,  # the mask image defines the cloud's shape
        max_font_size=100,  # largest font size
        random_state=42,
        width=1000,
        height=860,
        margin=2,  # word margin; with a mask the saved image keeps the mask's size
    )

    # Build the frequency dict for the top 100 words and render from it.
    word_frequence_dict = {x[0]: x[1] for x in words_stat.head(100).values}

    wordcloud.generate_from_frequencies(word_frequence_dict)
    # Derive per-position colours from the background image.
    image_colors = ImageColorGenerator(color_mask)
    # Recolor the cloud with those colours.
    wordcloud.recolor(color_func=image_colors)
    # Save and display.
    wordcloud.to_file('output.png')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Exemplo n.º 2
0
    return segList


def deleteFile(fileName):
    """Delete *fileName* if it is an existing regular file; otherwise no-op."""
    if not os.path.isfile(fileName):
        return
    os.remove(fileName)


# Stop-word file for the cloud (the import call below is disabled).
stopDict = '/home/web_dev/tools/word_cloud/stopword.txt'
# importStopword(stopDict)
d = path.dirname(__file__)
# NOTE(review): `fileName`, `imageName` and `processChinese` are defined
# earlier in this module — confirm before moving this section.
text = open(path.join(d, fileName), encoding='gbk', errors='ignore').read()
text = processChinese(text)
background = imageio.imread('/home/web_dev/tools/word_cloud/test.png')

wc = WordCloud(font_path='/home/web_dev/tools/word_cloud/msyh.ttf',
               background_color='white',
               max_words=100,
               mask=background,
               random_state=42)
wc.generate(text)
imgColors = ImageColorGenerator(background)  # built but never applied below
plt.figure()
plt.imshow(wc)
plt.axis('off')
# plt.show()

# Save the rendered cloud, give the caller time to pick it up, then
# remove the uploaded source file.
wc.to_file(imageName)
time.sleep(5)
deleteFile(fileName)
Exemplo n.º 3
0
# Neutral words (label == 0)
neutral_data = pnn_data['label'] == 0
neutral_data = pnn_data[neutral_data]
neutral_data = neutral_data.drop('label', axis=1)
print(neutral_data)
# Positive words (label == 1)
positive_data = pnn_data['label'] == 1
positive_data = pnn_data[positive_data]
positive_data = positive_data.drop('label', axis=1)
print(positive_data)

##############################################################################
# Twitter-bird-shaped cloud
twit_coloring = np.array(Image.open('./data/twit.png'))
from wordcloud import ImageColorGenerator
image_colors = ImageColorGenerator(twit_coloring)  # built but unused in this section

# Negative-word cloud
# NOTE(review): `pnn_data`, `negative_data` and `font_location` are
# defined earlier in the file.
negative_words = ' '.join([word for word in negative_data['title']])
negative_wc = WordCloud(font_path=font_location,
                        background_color='white',
                        width=1000,
                        height=500,
                        random_state=20,
                        max_font_size=120,
                        mask=twit_coloring,
                        colormap='Reds').generate(negative_words)

fig, ax = plt.subplots(figsize=(12, 6))
plt.imshow(negative_wc, interpolation='bilinear')
plt.axis('off')
Exemplo n.º 4
0
import codecs
from collections import Counter
# Read in a txt file.
import jieba as jieba
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
# NOTE(review): scipy.misc.imread was removed in SciPy 1.2, so this only
# runs on old SciPy; imageio.imread is the usual replacement — confirm
# before upgrading.
from scipy.misc import imread

# Punctuation/common particles to ignore; not referenced in this section.
rem = [u',', u'、', u'。', u'的', u'和', '\u3000', '\n']

comment_text = codecs.open('/home/dmrf/文档/毛概/19大报告.txt', 'r', 'utf-8').read()
# jieba segmentation: without segmenting first, a correct Chinese word
# cloud cannot be generated from the raw text.

cut_text = " ".join(jieba.cut(comment_text))

bg_pic = imread('../Pic/gongchandang.png')
wordcloud = WordCloud(mask=bg_pic,
                      font_path='../Font/SourceHanSansSC-Heavy.otf',
                      background_color='white',
                      max_words=200,
                      mode='RGBA').generate(cut_text)
d = wordcloud.words_
print(d)
# Derive colour values from the background image (unused below).
image_colors = ImageColorGenerator(bg_pic)
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
wordcloud.to_file("19da.png")
Exemplo n.º 5
0
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import io

# --- content ---
# FIX: the original leaked the file handle (io.open(...).read() with no
# close); a context manager closes it deterministically.
with io.open('endgame.txt', mode="r", encoding="UTF-8") as source:
    text = source.read()
stopword = set(STOPWORDS)

# --- appearance ---
# For a plain rectangular cloud, drop the mask line below and the
# mask=custom_mask argument.
custom_mask = np.array(Image.open('batman.png'))
wc = WordCloud(background_color='yellow',
               max_words=100,
               contour_width=10,
               contour_color='black',
               max_font_size=100,
               stopwords=stopword,
               mask=custom_mask)
wc.generate(text)
colours = ImageColorGenerator(custom_mask)

wc.recolor(color_func=colours)  # comment this out to keep random word colours

# --- plotting ---
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
Exemplo n.º 6
0
# https://orig00.deviantart.net/edb5/f/2015/246/d/e/panda_we_bare_bears_by_yocatglitter-d98a6sj.png
# NOTE(review): `d` (base directory) and `text` are defined earlier in
# the file.
bears_coloring = np.array(Image.open(path.join(d, "minnie.jpg")))

stopwords = set(STOPWORDS)
stopwords.add("said")

wc = WordCloud(background_color="white",
               max_words=2000,
               mask=bears_coloring,
               stopwords=stopwords,
               max_font_size=40,
               random_state=42)
# generate word cloud
wc.generate(text)

# create coloring from image
image_colors = ImageColorGenerator(bears_coloring)

# show the shaped (but randomly coloured) cloud
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.figure()
# recolor wordcloud and show
# we could also give color_func=image_colors directly in the constructor
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.figure()
# finally show the original mask image in grayscale
plt.imshow(bears_coloring, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
plt.show()
Exemplo n.º 7
0
# Collect the friends' signature strings into siglist.
siglist = []
for indedx,friend in enumerate(friends):
    sigture = friend['Signature']
    # Only if the friend actually has a signature.
    if len(friend['Signature']) > 0:
        # Strip emoji markup fragments (cleanup is incomplete; a regular
        # expression would do this more reliably).
        sigture = sigture.replace('span','').replace('class','').replace('emoji','').replace('< =','').replace('"','').replace('</>','').replace('>','')
        siglist.append(sigture)

# Join the siglist elements into one string.
text = ''.join(siglist)

# jieba segmentation (supports full / accurate / default / new-word /
# search-engine modes).  jieba.cut takes the string to segment and a
# flag selecting full mode.
word_list = jieba.cut(text, cut_all=True)
# Join the tokens with spaces.
word_space_split = ' '.join(word_list)
# The glyph colours follow the background image at this path.
coloring = np.array(Image.open("D:/PythonWorkplace/WeChat/image.png"))
# font_path: font file; random_state: one PIL colour per word;
# scale: canvas scaling factor; max_font_size: largest font drawn.
# If mask is empty a 2-D mask is used; otherwise width/height are ignored
# and the cloud takes the mask's shape.
my_wordcloud = WordCloud(background_color="white", max_words=2000,
                         mask=coloring, max_font_size=150, random_state=42, scale=3,
                         font_path="C:/Windows/Fonts/simkai.ttf").generate(word_space_split)
# Canvas colours from the background image.
image_colors = ImageColorGenerator(coloring)
plt.imshow(my_wordcloud.recolor(color_func=image_colors))
# NOTE(review): recolor() mutates the cloud in place, so this second
# imshow redraws the same recoloured image — presumably redundant.
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
Exemplo n.º 8
0
def image_color_func(path: str):
    """Return an ImageColorGenerator seeded with the image at *path*."""
    pixels = np.array(Image.open(path))
    return ImageColorGenerator(pixels)
Exemplo n.º 9
0
async def _(event):
    """Telegram handler: build a word cloud shaped and coloured by the
    replied-to media and send it back to the chat."""
    if not event.reply_to_msg_id:
        await event.edit("`Are you mad Bish! Reply to Any media..`")
        return
    reply_message = await event.get_reply_message()
    if not reply_message.media:
        await event.edit("`Reply to a image/sticker/video`")
        return
    await event.edit("`Downloading Media..`")
    if reply_message.photo:
        # Plain photo: download directly as the mask image.
        await bot.download_media(
            reply_message,
            "wc.png",
        )
    elif (DocumentAttributeFilename(file_name="AnimatedSticker.tgs")
          in reply_message.media.document.attributes):
        # Animated sticker: download as .tgs, then convert to .png.
        await bot.download_media(
            reply_message,
            "wc.tgs",
        )
        os.system("lottie_convert.py wc.tgs wc.png")
    elif reply_message.video:
        # Video: grab a single 480x360 frame at the 1-second mark.
        video = await bot.download_media(
            reply_message,
            "wc.mp4",
        )
        extractMetadata(createParser(video))
        os.system("ffmpeg -i wc.mp4 -vframes 1 -an -s 480x360 -ss 1 wc.png")
    else:
        # Any other media (e.g. static sticker/document): try it as an image.
        await bot.download_media(
            reply_message,
            "wc.png",
        )
    try:
        await event.edit("`Processing...`")
        text = open("userbot/alice.txt", encoding="utf-8").read()
        image_color = np.array(Image.open("wc.png"))
        # NOTE(review): [::1, ::1] is a no-op slice — presumably intended
        # as an optional downsampling hook (e.g. [::3, ::3]); confirm.
        image_color = image_color[::1, ::1]
        image_mask = image_color.copy()
        # Pixels whose RGB sum is 0 (fully black) are set to 255 so the
        # mask treats them as background.
        image_mask[image_mask.sum(axis=2) == 0] = 255
        # Mean gradient magnitude over the three channels: high-gradient
        # (edge) pixels are also masked out so words avoid edges.
        edges = np.mean(
            [
                gaussian_gradient_magnitude(image_color[:, :, i] / 255.0, 2)
                for i in range(3)
            ],
            axis=0,
        )
        image_mask[edges > 0.08] = 255
        wc = WordCloud(
            max_words=2000,
            mask=image_mask,
            max_font_size=40,
            random_state=42,
            relative_scaling=0,
        )
        wc.generate(text)
        # Colour the words from the original (unmasked) image.
        image_colors = ImageColorGenerator(image_color)
        wc.recolor(color_func=image_colors)
        wc.to_file("wc.png")
        await event.client.send_file(
            event.chat_id,
            "wc.png",
            reply_to=event.reply_to_msg_id,
        )
        await event.delete()
        # Clean up all temporary media files.
        os.system("rm *.png *.mp4 *.tgs *.webp")
    except BaseException as e:
        os.system("rm *.png *.mp4 *.tgs *.webp")
        return await event.edit(str(e))
Exemplo n.º 10
0
def get_url():
    """Scrape the Ayodhya Kanda verses from valmikiramayan.net, save the
    Sanskrit text to ``ayodhyakanda.txt`` and render a word cloud of the
    token frequencies to ``wordcloud_ayodhya.png``.

    The number of verses per sarga is read from ``AyodhyaKand.xlsx``.
    """
    excel_file = 'AyodhyaKand.xlsx'
    df = pd.read_excel(excel_file, header=2, usecols=['Verses'])
    values = df.values.tolist()
    # Flatten the single-element rows into a flat list of verse counts.
    number_of_verses = [item for sublist in values for item in sublist]

    sarga_number = 1
    number_of_chapters = 119
    j = 0

    text = ''

    ls = []
    ls_hindi = []

    while number_of_chapters >= sarga_number:
        url = requests.get(
            f"http://www.valmikiramayan.net/utf8/ayodhya/sarga{sarga_number}/ayodhyasans{sarga_number}.htm"
        )
        soup = BeautifulSoup(url.content, 'html.parser')

        for k in range(0, number_of_verses[j]):
            try:
                # Pull the k-th verse block and strip its HTML tags.
                match = soup.findAll(class_='SanSloka')[k]
                match = str(match)
                tags = re.compile(r'<[^>]+>')
                match = tags.sub('', match)

                ls.append(match)
            except (IndexError, ValueError):
                pass
        j += 1
        sarga_number += 1
        print(sarga_number)

    # Keep only entries with no Latin letters (i.e. the Devanagari text).
    for letter in ls:
        if not re.search(r'[a-zA-Z]', letter):
            ls_hindi.append(letter)

    text = ' '.join(ls_hindi)
    # BUG FIX: str.strip returns a new string; the original discarded it.
    text = text.strip(' ')
    text = text.replace('\n\n', '')
    # Drop Devanagari digits, line breaks and stray punctuation.
    re_text = (re.sub('[०१२३४५६७८९\\n\\r\|\-]+', '', text))
    tokenizer = TokenizeSentence('sanskrit')

    tokens_ls = []
    for i in re_text.split():
        if len(i) > 3:
            tokens_re = tokenizer.tokenize(i)
            tokens_ls.append(tokens_re)

    tokens = [item for sublist in tokens_ls for item in sublist]

    # Frequency count of the non-stop-word tokens.
    text_new = ([token for token in tokens if token not in STOPS_LIST])
    d = defaultdict(int)
    for i in text_new:
        d[i] += 1

    # BUG FIX: the file was previously opened at the top of the function
    # and never closed; the context manager closes it reliably.
    with open("ayodhyakanda.txt", "w", encoding="utf-8") as file:
        file.write(text)

    mask = np.array(Image.open("bg_1.jpg"))
    image_colors = ImageColorGenerator(mask)
    # BUG FIX: the original font path began with an invisible U+202A
    # (LEFT-TO-RIGHT EMBEDDING) character, which breaks the path lookup.
    wc = WordCloud(font_path="C:\\Users\\BB\\Desktop\\Sanskrit2003.ttf",
                   background_color="#000000",
                   max_words=150,
                   width=1920,
                   height=1080,
                   mask=mask,
                   random_state=1).fit_words(d)
    wc.recolor(color_func=image_colors)
    wc.to_file('wordcloud_ayodhya.png')
Exemplo n.º 11
0
          words = ' '.join(get_all_tweets(user,consumer_key, consumer_secret, access_key, access_secret))

          # remove URLs, RTs, and twitter handles
          no_urls_no_tags = " ".join([word for word in words.split()
                                      if 'http' not in word
                                          and not word.startswith('@')
                                          and word.lower()  != 'rt'
                                          and word.lower() != 'n'
                                          and word.lower()  != 'w'
                                          and word.lower()  != 'u'
                                          and word.lower() != 'got'
                                      ])
          if image:
            twitter_mask = imread('./'+image, flatten=flatten)
            if flatten is False:
              image_colors = ImageColorGenerator(twitter_mask)
            wordcloud = WordCloud(
                                  font_path=font,
                                  stopwords=STOPWORDS,
                                  background_color=background_color,
                                  width=1800,
                                  height=1400,
                                  mask=twitter_mask
                                 ).generate(no_urls_no_tags)
          else:
            wordcloud = WordCloud(
                      font_path=font,
                      stopwords=STOPWORDS,
                      background_color=background_color,
                      width=1800,
                      height=1400
def main():
    """Fetch the first 10 comment pages of the first now-playing movie,
    segment the Chinese text and render it as a word cloud."""
    # Collect the first 10 pages of comments for the first movie.
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentList.append(commentList_temp)

    # Join the collected pages into a single string.
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # Keep only Chinese characters (removes punctuation etc.).
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    # Chinese word segmentation with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Drop stop words.  quoting=3 disables quote handling entirely.
    stopwords = pd.read_csv("stopwords.txt",
                            index_col=False,
                            quoting=3,
                            sep="\t",
                            names=['stopword'],
                            encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Word-frequency table.  FIX: the dict-renaming form of .agg() used
    # before ({"计数": numpy.size}) was removed in pandas 0.25; aggregate
    # then rename instead.
    words_stat = words_df.groupby(by=['segment'])['segment'].agg(
        numpy.size).rename("计数").reset_index()
    words_stat = words_stat.sort_values(by=["计数"], ascending=False)

    bg_pic = numpy.array(Image.open("alice_mask.png"))

    # Render the cloud from the top-1000 word frequencies.
    wordcloud = WordCloud(font_path="simhei.ttf",
                          background_color="white",
                          max_font_size=80,
                          width=2000,
                          height=1800,
                          mask=bg_pic,
                          mode="RGBA")
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)

    image_colors = ImageColorGenerator(bg_pic)  # mask-derived colours (not applied)

    plt.imshow(wordcloud)  # display the cloud
    plt.axis("off")
    plt.show()
    wordcloud.to_file('show_Chinese.png')  # save the cloud to disk
Exemplo n.º 13
0
import wordcloud

# Read the raw text to visualise.
with open(r'./word.txt', 'r', encoding='utf-8') as f:
    content_list = f.read()
    print(type(content_list))

from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
#from scipy.misc import
import imageio

# Mask image; it also supplies the word colours below.
back_group = imageio.imread('back.png')
wc = WordCloud(
    background_color='white',
    mask=back_group,
    font_path='C:\Windows\Fonts\simkai.ttf',
    max_font_size=60,
    min_font_size=5,
)

wc_color = wc.generate(content_list)
# Recolor the cloud from the mask image's colours, then draw and save it.
ciyun_color = ImageColorGenerator(back_group)
new_color = wc.recolor(color_func=ciyun_color)
plt.imshow(new_color)
plt.axis('off')
plt.savefig('ciyun_pic.jpg')
def test_recolor_too_small_set_default():
    """Recolouring from an all-black 20x20 image must fall back to the
    supplied default colour without raising."""
    tiny_image = np.array(Image.new('RGB', size=(20, 20)))
    cloud = WordCloud(max_words=50, width=30, height=30).generate(THIS)
    recolorer = ImageColorGenerator(tiny_image, default_color=(0, 0, 0))
    cloud.recolor(color_func=recolorer)
Exemplo n.º 15
0
# Build the cloud straight from the full text with generate().
# (wordcloud's built-in tokenizer handles Chinese poorly — enable a
# Chinese segmenter, or compute frequencies and call
# generate_from_frequencies instead.)
wc.generate(text1)
def random_color_func(word=None,
                      font_size=None,
                      position=None,
                      orientation=None,
                      font_path=None,
                      random_state=None):
    """Colour callback for WordCloud: a random warm hue (0-50) at full
    saturation with a randomised lightness.  All parameters are accepted
    for the WordCloud color_func protocol and ignored."""
    hue = randint(0, 50)
    saturation = int(100.0 * 255.0 / 255.0)  # constant 100
    lightness = int(100.0 * float(randint(60, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)


image_colors = ImageColorGenerator(back_coloring)

plt.figure()
# 以下代码显示图片
plt.imshow(wc.recolor(color_func=random_color_func), interpolation="bilinear")
plt.axis("off")
plt.savefig(imgname1, dpi=500)
plt.show()
# 绘制词云
image_colors = ImageColorGenerator(back_coloring)
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.savefig(imgname2, dpi=500)
plt.show()
Exemplo n.º 16
0
    "s6": 3,
    "json": 3,
    "chez scheme": 3,
    "GIMP": 3,
    "gcc": 3,
    "debian": 3,
    "printf(\"Linux\")": 3,
    "echo Linux": 3,
    "(println \"Linux\")": 3,
    "console.log \"Linux\"": 3,
    "(format t \"Linux\")": 3,
    "writeln(\"Linux\")": 3,
    "PRINT \"Linux\"": 3,
    "(insert \"Linux\")": 3,
    "fmt.Println(\"Linux\")": 3,
    "main = putStrLn \"Linux\"": 3,
    "(display \"Linux\")": 3,
    "System.out.println(\"Linux\");": 3,
    "document.write(\"Linux\")": 3,
    "type [Linux]": 3,
    "disp('Linux')": 3,
    "<?php\necho \"Linux\"\n?>": 3,
    "Ubuntu": 3
}
# NOTE(review): `pic` and `jsondict` are defined above this section.
# The second argument is ImageColorGenerator's default_color, used where
# the image supplies no colour (per the wordcloud API).
image_colors = ImageColorGenerator(pic, [255, 255, 255])
wc = WordCloud(background_color="white", mask=pic)
wc.generate_from_frequencies(jsondict)

plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.show()
Exemplo n.º 17
0
from wordcloud import WordCloud, ImageColorGenerator  # word-cloud library

# 1. Read the txt source text.
text = open(r'十九大.txt', "r").read()

# 2. jieba segmentation: cut_all=True is full mode, False (default) is
# accurate mode.
cut_text = jieba.cut(text, cut_all=False)
result = "/".join(cut_text)  # tokens must be joined with a separator or the cloud cannot be drawn

# 3. Initialise the custom background image.
image = Image.open(r'樱花540.jpg')
graph = np.array(image)

# 4. Generate the word-cloud image.
# With a custom background the cloud size follows the image's pixel size.
wc = WordCloud(font_path=r"yahei.ttf",
               background_color='white',
               max_font_size=200,
               mask=graph)
wc.generate(result)

# 5. Colour the glyphs using the background image's colours.
image_color = ImageColorGenerator(graph)  # colour values from the background image
wc.recolor(color_func=image_color)
wc.to_file(r"wordcloud.png")  # saved at the mask's size; sharper than the preview below

# 6. Show the picture.
plt.figure("词云图")  # figure (window) title
plt.imshow(wc)  # show the cloud as an image
plt.axis("off")  # hide the axes
plt.show()
# NOTE(review): `t`, `font` and `df` below are defined earlier in the
# file — confirm before moving this section.
index = np.arange(3)
bar_width = 0.3
plt.bar(index, t, width=0.3, color='g', label=u'热度指标变化')
plt.xlabel(u'阶段(3-6,7-8,9-12)', fontdict=font)
plt.ylabel(u'hot_index', fontdict=font)
plt.title(u'热度指标变化', fontdict=font)
plt.legend(loc=2)
plt.savefig('d:/picture/hot.png')
plt.show()
# 3.3 word cloud of the video tags
print(df['视频标签'])
labs = str(list(df['视频标签']))
labs = ''.join(jieba.cut(labs))
cloud_mask = np.array(Image.open("d:/timg.jpg"))
# NOTE(review): `wc` above is a WordCloud *instance*, which is not
# callable — this later section presumably expects `wc` to be the
# WordCloud class itself; confirm.
wordcloud = wc(font_path='c:/Users/Windows/fonts/simkai.ttf',
               stopwords=['考试', '数学', '搞笑'],
               max_words=50,
               background_color="white",
               mask=cloud_mask)
wcd = wordcloud.generate(labs)
image_colors = ImageColorGenerator(cloud_mask)
wordcloud.recolor(color_func=image_colors)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
wcd.to_file("d:/picture/kaoyan.png")
# Scatter plot of hot_index over time.
plt.plot_date(df.index, df['hot_index'])
plt.savefig('d:/picture/data_hot_index.png')
plt.show()
Exemplo n.º 19
0
Arquivo: main.py Projeto: rlaokok/bbs
def img_test():
    """Flask view: build (or reuse) a word-cloud PNG for the posted text.

    GET renders the upload form; POST expects ``img_name``, ``date`` and
    ``textarea`` form fields and returns the image path as JSON.
    """
    if request.method == 'GET':
        return render_template("selenium.html")
    if request.method == 'POST':
        PATH = os.path.dirname(__file__)
        mask_name = request.form  # form data of the POST request
        suffix = mask_name['date'][:-3]
        img_path = f"static/img/{mask_name['img_name']}_{suffix}_test.png"
        print(mask_name)
        import codecs
        if mask_name['img_name'] == "human" or mask_name['img_name'] == "dog" \
            or mask_name['img_name'] == "bird":
            if not os.path.exists(os.path.join(PATH, img_path)):
                text = mask_name['textarea']
                if not text:
                    return json.dumps({"noData": "no"})
                else:
                    # POS-tag the Korean text with Okt and keep content
                    # words, excluding a hand-picked list of filler tokens.
                    okt = Okt()
                    morphs = [okt.pos(text)]
                    tags = []
                    words = []

                    for morph in morphs:
                        for word, tag in morph:
                            if tag in ["Noun", "verb", "Adjective", "Number"] and ('것' not in word) \
                                    and ('이' not in word) and ('약' not in word) and ("고" not in word) and \
                                    ("등" not in word) and ("있다" not in word) and ("수" not in word) \
                                    and ("뎌" not in word) and ("기자" not in word):
                                words.append(word)
                                tags.append(tag)

                    count = Counter(words)
                    test_words = dict(count.most_common())
                    from matplotlib import pyplot as plt
                    import matplotlib
                    matplotlib.use('agg')  # headless backend for server use
                    from wordcloud import WordCloud, ImageColorGenerator
                    import nltk  # Natural Language Toolkit  http://www.nltk.org/
                    from nltk.corpus import stopwords

                    # Mask image selected by the requested shape name.
                    mask = np.array(
                        Image.open(
                            os.path.join(
                                PATH,
                                f"static/img/{mask_name['img_name']}.png")))
                    image_color = ImageColorGenerator(mask)
                    wordcloud = WordCloud(
                        font_path='C:/Windows/Fonts/malgun.ttf',
                        background_color='white',
                        colormap="Accent_r",
                        mask=mask,
                        color_func=
                        image_color,  #lambda *args, **kwargs: "black",
                        width=1500,
                        height=1000).generate_from_frequencies(test_words)

                    # NOTE(review): this inner exists-check repeats the
                    # outer one and nothing creates the file in between,
                    # so the else branch looks unreachable — confirm.
                    if not os.path.exists(os.path.join(PATH, img_path)):
                        plt.imshow(wordcloud.recolor(color_func=image_color),
                                   interpolation="bilinear")
                        plt.axis('off')
                        plt.savefig(os.path.join(PATH, img_path))
                        plt.close()
                        img_path2 = img_path
                    else:
                        img_path2 = img_path
                    return json.dumps(str(img_path2))
            else:
                return json.dumps(str(img_path))
Exemplo n.º 20
0
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
# Read the lyrics and segment them with jieba.
# (The original comments were mojibake — GBK text decoded as Latin-1 —
# and have been replaced with English.)
with open(r"C:\Users\john\Desktop\jay.txt","r",encoding="utf-8") as f:
    text=f.read()
cut_text=jieba.cut(text)
result=" ".join(cut_text)
# Load the background/mask image.
image=np.array(Image.open(r"C:\Users\john\Desktop\jay.jpg"))
# Word-cloud settings.
wc=WordCloud(font_path=r"C:\Windows\Fonts\STZHONGS.TTF",
             background_color="white",
             width=500,
             height=350,
             max_font_size=50,
             min_font_size=10,
             mask=image
             )
wc.generate(result)
# Recolor the words from the background image's palette.
image_colors=ImageColorGenerator(image)
# BUG FIX: the original called plt.show(wc.recolor(...)) — plt.show()
# takes no image argument.  Recolor first, then draw with imshow.
wc.recolor(color_func=image_colors)
# Display the picture.
plt.imshow(wc)
plt.axis("off")
plt.show()
# Save the picture.
wc.to_file(r"C:\Users\john\Desktop\jay1.png")


Exemplo n.º 21
0
backgroud_Image = plt.imread('xin.jpeg')
matplotlib.rcParams['figure.figsize'] = (12.0, 12.0)  # figure size
# FIX: the original built a throw-away WordCloud here and then
# immediately overwrote it with the instance below; the dead instance
# (and surrounding commented-out code) has been removed.
wordcloud = WordCloud(
    background_color='white',  # background colour
    mask=backgroud_Image,  # shape mask image
    font_path=
    '/System/Library/Fonts/STHeiti Light.ttc',  # required so Chinese renders as glyphs, not boxes
    max_words=2000,  # maximum number of words drawn
    stopwords=STOPWORDS,  # stop-word set
    max_font_size=150,  # largest font size
    random_state=30  # number of random colour schemes
)
img_colors = ImageColorGenerator(backgroud_Image)
print(img_colors)

word_frequence = {x[0]: x[1] for x in top_1000}  # top-1000 most frequent words
print(word_frequence)
wordcloud = wordcloud.fit_words(word_frequence)
# Recolor the cloud with the background image's colours.
wordcloud.recolor(color_func=img_colors)
plt.imshow(wordcloud)
plt.show()
Exemplo n.º 22
0
# Turn the (word, weight) pairs in `result` into a frequency dict.
# NOTE(review): `result` is defined earlier in the file.
keywords = dict()
for i in result:
    keywords[i[0]] = i[1]

from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# Candidate mask images; only the LAST assignment takes effect.
imgname = '中国地图.jpg'
imgname = '胡总.jpg'
imgname = '习大大作报告.jpg'
imgname = '人民.jpg'
imgname = '共产党.jpg'
image = Image.open(imgname)

graph = np.array(image)

wc = WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
               background_color='white',
               max_words=1000,
               mask=graph)
wc.generate_from_frequencies(keywords)
image_color = ImageColorGenerator(graph)

plt.imshow(wc)
# recolor() mutates wc in place; this second imshow draws the
# image-coloured version.
plt.imshow(wc.recolor(color_func=image_color))
plt.axis("off")
# plt.savefig(imgname, dpi=1024)
plt.show()
Exemplo n.º 23
0
stopwords = set(STOPWORDS)
stopwords.add("said")

# The mask parameter sets the shape of the cloud.
# NOTE(review): `alice_coloring` and `text` are defined earlier in the file.
wc = WordCloud(background_color="white",
               max_words=2000,
               mask=alice_coloring,
               stopwords=stopwords,
               max_font_size=100,
               min_font_size=16,
               random_state=42)
# generate word cloud
wc.generate(text)  # WordCloud.generate tokenises the English text itself

# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)

# show
# With only the mask set, the cloud takes the image's shape.
plt.imshow(wc, interpolation="bilinear")  # shaped cloud
plt.axis("off")
plt.figure()
# recolor wordcloud and show
# we could also give color_func=image_colors directly in the constructor
# — that way the cloud follows the source image's colour layout directly.
plt.imshow(wc.recolor(color_func=image_colors),
           interpolation="bilinear")  # shaped and image-coloured cloud
plt.axis("off")
plt.figure()
plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")  # the original mask image
Exemplo n.º 24
0
import jieba
words = ''
# Segment "Dream of the Red Chamber" line by line, dropping two
# high-frequency narration words ('道' and '笑') as we go.
# NOTE: encoding=" utf-8" is normalised by the codec lookup, but it is
# left as-is to avoid changing runtime strings beyond the bug fix below.
with open('C:\\Users\\lenovo\\Downloads\\红楼梦.txt', "r",
          encoding=" utf-8") as hlm:
    for line in hlm.readlines():
        line = line.strip('\n')
        words += ''.join(jieba.cut(line))
        words = words.replace('道', '')
        words = words.replace('笑', '')
        words += ''
# Read the mask image.
bimg = plt.imread("C:\\Users\\lenovo\\Desktop\\555.jpg")

# BUG FIX: set.add() returns None, so the original effectively passed
# stopwords=None and relied on the library's fallback.  Mutate the set
# first, then pass it explicitly.
STOPWORDS.add('说着')
wordcloud = WordCloud(relative_scaling=0.8,
                      background_color=None,
                      mode="RGBA",
                      mask=bimg,
                      font_path='C:/Windows/Fonts/msyh.ttc',
                      stopwords=STOPWORDS).generate(words)

# Recolor the cloud from the mask image, then draw and save it.
image_colors = ImageColorGenerator(bimg)
wordcloud.recolor(color_func=image_colors)
plt.imshow(wordcloud, interpolation='nearest')
plt.axis("off")
plt.savefig('C:\\Users\\lenovo\\Desktop\\521.jpg')
Exemplo n.º 25
0
    max_font_size=150,# 设置字体最大值  
    random_state=30# 设置有多少种随机生成状态,即有多少种配色方案  
)

# Filter out generic high-frequency keywords before drawing.
# NOTE(review): `text`, `wc`, `ipath` and `font_set` are defined earlier
# in the file.
key_words = ["他们","我们","这种","已经","但是","这个","一个","会主","资产阶级","无产阶级","自己"]
for key in key_words:
    text = text.replace(key,"")
wc.generate_from_text(text)
print('开始加载文本')

#graph = backgroud_Image
# Colour the glyphs with the gradient of the background image.
image = Image.open(ipath)
graph = np.array(image)
img_colors = ImageColorGenerator(graph)
wc.recolor(color_func=img_colors)

# Show the word-cloud figure.
plt.figure(1)
plt.title("马克思",fontproperties=font_set)
plt.imshow(wc)
# Hide the x/y axis ticks.
plt.axis('off')
plt.show()
#time.sleep(3)
plt.close()

# Show the word cloud a second time in a fresh figure.
plt.figure(2)
plt.imshow(wc)
Exemplo n.º 26
0
def nube_palabras(texto,
                  n_grama=1,
                  n_terminos=100,
                  graficar=True,
                  dim_figura=(10, 10),
                  hor=0.6,
                  titulo='Términos más frecuentes',
                  ubicacion_archivo='',
                  forma=None,
                  color_fondo='white',
                  color_contorno='blue',
                  grosor_contorno=0,
                  colores_forma=False,
                  semilla=1234,
                  devolver_nube=False,
                  mask=None):
    """Plot or export a word (n-gram) cloud built from an input text.

    :param texto: (str) Input text to analyze.
    :param n_grama: (int) Default 1. Size of the n-grams to count (1 = words, 2 = bigrams, ...).
    :param n_terminos: (int) Default 100. Number of most frequent n-grams to include in the cloud.
    :param graficar: (bool) Default True. Whether to display the figure in the current IDE/session.
    :param dim_figura: (float, float) Default (10, 10). Width and height of the figure.
    :param hor: (float between 0 and 1) Proportion of terms drawn horizontally; 0 draws all
        terms vertically, 1 draws all horizontally, intermediate values mix both.
    :param titulo: (str) Default 'Términos más frecuentes'. Title of the word cloud.
    :param ubicacion_archivo: (str) Default '' (empty). Path to export the figure as an image
        (jpg extension recommended); if empty, nothing is exported.
    :param forma: (str, numpy array or None) Default None. Image file path or numpy array giving
        the desired shape of the cloud; if None, the terms are laid out in a circle.
    :param color_fondo: (str or tuple) Default 'white'. Background color — a color name Python
        recognizes, a hex code, or an (R, G, B) tuple.
    :param color_contorno: (str or tuple) Default 'blue'. Color of the shape's contour line.
    :param grosor_contorno: (int) Default 0. Width of the contour line; 0 draws no contour.
    :param colores_forma: (bool) Default False. If True, color the words with the colors
        extracted from the shape image/array.
    :param semilla: (int) Default 1234. Random seed; affects word placement and colors, so use
        the same seed to reproduce a cloud.
    :param devolver_nube: (bool) Default False. If True, return the WordCloud object.
    :param mask: Deprecated alias of `forma`, kept for backward compatibility.
    :return: WordCloud object, only if devolver_nube=True.
    """
    # Backward compatibility: 'mask' was renamed to 'forma'; warn and forward the value.
    if forma is None and mask is not None:
        msj = """El parámetro 'mask' ha sido reemplazado por el parámetro 'forma'.
            Futuras versiones de la librería no contarán con el parámetro 'mask'."""
        warnings.warn(msj, DeprecationWarning, stacklevel=2)
        forma = mask
    # BUG FIX: 'colores_nube' used to be assigned only inside the 'else' branch
    # below, so calling with the default circular mask and colores_forma=True
    # raised a NameError at the recolor check. Initialize it up front.
    colores_nube = None
    # Default: a circular mask to lay out the cloud
    if forma is None:
        x, y = np.ogrid[:600, :600]
        forma = (x - 300)**2 + (y - 300)**2 > 260**2
        forma = 255 * forma.astype(int)
    else:
        # If an image file path was given, load it
        if isinstance(forma, str):
            forma = cv2.imread(forma)
        # If a color image was given, keep its colors and convert to grayscale
        if len(forma.shape) == 3:
            colores_nube = ImageColorGenerator(forma)
            forma = cv2.cvtColor(forma, cv2.COLOR_BGR2GRAY)
        # Threshold to remove noise and watermarks from the mask
        forma = cv2.threshold(forma, 0, 255,
                              cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        # If most of the image is black, invert the mask
        if np.mean(forma) < 100:
            forma = 255 - forma
        # Morphological "closing" to fill holes in the mask image
        kernel = np.ones((5, 5), np.uint8)
        forma = 255 - cv2.morphologyEx(255 - forma, cv2.MORPH_CLOSE, kernel)
    # Frequencies of the 'n_terminos' most common n-grams
    dictu = frecuencia_ngramas(texto, n_grama, n_terminos)
    # Build the word cloud
    nube = WordCloud(background_color=color_fondo,
                     contour_color=color_contorno,
                     prefer_horizontal=hor,
                     mask=forma,
                     random_state=semilla,
                     contour_width=grosor_contorno)
    figura = nube.generate_from_frequencies(dictu)
    # Optionally recolor the cloud with the colors of the shape image
    if colores_forma and colores_nube is not None:
        nube = nube.recolor(color_func=colores_nube)
    # Return the cloud object so the caller can plot it differently
    if devolver_nube:
        return figura
    else:
        # Plot and/or save the generated image
        grafica_nube(figura, dim_figura, titulo, ubicacion_archivo, graficar)
Exemplo n.º 27
0
         trump_New_York_Times_info[key]['score'])
    )  #Navigates through the nested dictionary to find desires variables and puts desired variables into the table columns

conn.commit()  # persist the rows inserted above
cur.close()  # close the database cursor

# New York Times word cloud
d = path.dirname(
    __file__
)  # directory of this script, so the cloud can be saved next to it
text = open(path.join(d, 'trump.rtf')).read(
)  # read the text file containing the print headlines
trump_mask = np.array(
    Image.open(path.join(d, "trump_pic.png"))
)  # image used both as the cloud's mask and as the color source
trump_colors = ImageColorGenerator(
    trump_mask
)  # color generator that samples colors from the mask image
stopwords = set(
    STOPWORDS
)  # stop words, i.e. words to EXCLUDE from the cloud
cloud = WordCloud(mask=trump_mask, stopwords=stopwords).generate(
    text)  # build the word cloud from the headline text
plt.imshow(cloud.recolor(color_func=trump_colors), interpolation='bilinear'
           )  # recolor the cloud with the colors sampled from the picture
plt.axis('off')  # no axes
plt.figure()  # new figure
cloud.to_file(
    path.join(d, 'trump.png')
)  # save the word cloud as trump.png in the script's directory
Exemplo n.º 28
0
words_df = pandas.DataFrame({'segment': segment})
words_df.head()
# Stop-word list: one word per line
stopwords = pandas.read_csv("/hwj/dorahacks/stopwords.txt",
                            index_col=False,
                            quoting=3,
                            sep="\t",
                            names=['stopword'],
                            encoding="utf8")
# Drop the high-frequency function words we do not want in the cloud
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# Count the occurrences of each segment.
# FIX: passing a renaming dict to SeriesGroupBy.agg ({"计数": numpy.size})
# was deprecated in pandas 0.20 and later removed; size() is the supported
# equivalent and yields the same (segment, 计数) table after reset_index.
words_stat = words_df.groupby('segment').size().to_frame("计数")
words_stat = words_stat.reset_index().sort_values(by="计数", ascending=False)

import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
#%matplotlib inline
# FIX: scipy.misc.imread was removed in SciPy 1.2; matplotlib's imread
# loads the image as a numpy array the same way for this use.
bimg = plt.imread('/hwj/dorahacks/1.jpg')
wordcloud = WordCloud(background_color="black",
                      mask=bimg,
                      font_path='/hwj/dorahacks/STFANGSO.ttf')
#wordcloud=wordcloud.fit_words(words_stat.head(4000).itertuples(index=False))
words = words_stat.set_index("segment").to_dict()
wordcloud = wordcloud.fit_words(words["计数"])
bimgColors = ImageColorGenerator(bimg)
plt.axis("off")
plt.imshow(wordcloud.recolor(color_func=bimgColors))
plt.show()
Exemplo n.º 29
0
# the matplotlib way:
import matplotlib.pyplot as plt
import os
import shutil

imgdir = r"/Users/yangjie/Desktop/moon2.jpg"
dirs, filename = os.path.split(imgdir)
name, ext = os.path.splitext(filename)
destDir = os.path.join(dirs, name)
# BUG FIX: the original called the undefined names removedirs()/mkdirs()
# (os.removedirs/os.makedirs were intended); the bare `except: pass` hid the
# first NameError and the unguarded mkdirs() call then crashed. Recreate the
# output directory with the proper stdlib calls instead.
shutil.rmtree(destDir, ignore_errors=True)
os.makedirs(destDir, exist_ok=True)

import numpy as np
from PIL import Image

mask = np.array(Image.open(imgdir))
image_colors = ImageColorGenerator(mask)  # colors sampled from the mask image
# take relative word frequencies into account, lower max_font_size
#wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
# Render the same cloud at several scales and save each as a PNG
for scale in range(1, 100, 3):
    wordcloud = WordCloud(
        scale=scale,
        mask=mask,
        font_path='/Users/yangjie/Documents/env/python/simhei.ttf',
        max_font_size=80,
        relative_scaling=.5).fit_words(frequencies)
    plt.figure(dpi=100)
    # plt.imshow(wordcloud)
    plt.axis("off")
    plt.imshow(wordcloud.recolor(color_func=image_colors))
    #     plt.imshow(wordcloud.recolor(color_func=grey_color_func))
    plt.savefig(destDir + str(r"/result_graf_%d.png" % scale))
Exemplo n.º 30
0
# Extract the top-50 keywords from `string` with jieba's TF-IDF extractor
result = jieba.analyse.extract_tags(string, topK=50, withWeight=False)
# pprint.pprint(result)

# Join the keywords into a single comma-separated string for the cloud
text_string = ','.join(result)

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from matplotlib import pyplot as plt
from matplotlib.pyplot import imread
import random
from os import path

print(text_string)
# NOTE(review): `dir` shadows the builtin; harmless in this script but
# worth renaming if this code is reused
dir = path.dirname(__file__)
backgroundImage = np.array(Image.open(path.join(dir, 'sp_background.jpg')))
image_color = ImageColorGenerator(backgroundImage)
wc = WordCloud(
    width=400,
    height=300,
    margin=2,
    ranks_only=None,
    prefer_horizontal=0.9,
    mask=backgroundImage,
    color_func=None,
    max_words=200,  # 显示最多的词汇量
    stopwords=None,  # 停止词设置,修正词云图时需要设置
    random_state=None,
    background_color='#ffffff',  # 背景颜色设置,可以为具体颜色,比如:white或者16进制数值。
    font_step=1,
    mode='RGB',
    regexp=None,