Exemplo n.º 1
0
def parse_args(arguments):
    """Parse command-line arguments into word-cloud settings.

    Returns a 3-tuple ``(args, text, imagefile)``:

    * ``args`` -- dict of the remaining parsed options, with ``color_func``
      added, ``colormask``/``color``/``text``/``imagefile`` removed, and
      ``stopwords``/``mask`` replaced by loaded data when they were given;
    * ``text`` -- everything read from the ``text`` option;
    * ``imagefile`` -- the value of the ``imagefile`` option.

    Raises:
        ValueError: if both ``colormask`` and ``color`` are supplied.
    """
    namespace = make_parser().parse_args(arguments)

    if namespace.colormask and namespace.color:
        raise ValueError('specify either a color mask or a color function')

    args = vars(namespace)

    with args.pop('text') as text_file:
        text = text_file.read()

    if args['stopwords']:
        with args.pop('stopwords') as stopwords_file:
            # One stop word per line; surrounding whitespace is dropped.
            args['stopwords'] = {
                line.strip() for line in stopwords_file.readlines()
            }

    if args['mask']:
        mask_source = args.pop('mask')
        args['mask'] = np.array(Image.open(mask_source))

    # Default coloring; overridden below by a color mask or a single color.
    color_func = wc.random_color_func
    colormask = args.pop('colormask')
    color = args.pop('color')
    if colormask:
        color_func = wc.ImageColorGenerator(np.array(Image.open(colormask)))
    if color:
        color_func = wc.get_single_color_func(color)
    args['color_func'] = color_func

    imagefile = args.pop('imagefile')
    return args, text, imagefile
Exemplo n.º 2
0
def __main__():
    """Build an image-colored word cloud from ``English.txt``.

    Reads ``English.txt``, ``GABRWFFR.TTF`` and ``fish.jpg`` from the script
    directory, generates a word cloud masked by the picture, recolors it
    with the picture's colors, shows picture and cloud side by side, and
    writes the cloud to ``fish.png`` in the same directory.
    """
    # Script directory. ``__file__`` is a module-level name, so it must be
    # looked up in globals(): inside a function locals() never contains it,
    # which made the previous check always fall back to the working directory.
    d = os.path.dirname(__file__) if "__file__" in globals() else os.getcwd()
    # Read the source text.
    with open(os.path.join(d, 'English.txt'), encoding='utf-8') as file:
        text = file.read()
    font_path = os.path.join(d, 'GABRWFFR.TTF')
    pic = Image.open(os.path.join(d, 'fish.jpg'))
    mask = np.array(pic)
    # Create the coloring function from the image.
    pic_color = wordcloud.ImageColorGenerator(mask)

    w = wordcloud.WordCloud(font_path=font_path,
                            background_color='white',
                            mask=mask,
                            contour_width=3,
                            contour_color="blue")
    w.generate(text)
    # Recolor the word cloud with the picture's colors.
    w.recolor(color_func=pic_color)

    # Left: original picture; right: generated cloud.
    plt.figure()
    plt.subplot(1, 2, 1)
    plt.imshow(pic)
    plt.subplot(1, 2, 2)
    plt.imshow(w, interpolation="bilinear")
    plt.show()

    w.to_file(os.path.join(d, 'fish.png'))
Exemplo n.º 3
0
def create_word_cloud():
    """Render the text returned by ``cut_word()`` as a word cloud PNG.

    The cloud is shaped by ``lianxi/background1.png``, recolored from
    ``lianxi/color.jpg`` and saved through matplotlib to
    ``dataout/图7豆瓣电影词语云.png``.
    """
    # Words excluded from the cloud (data cleaning).
    excluded_words = [
        '就是', '不是', '但是', '还是', '只是', '这样', '这个', '一个', '什么', '电影', '没有'
    ]
    # Cloud configuration: shape mask, transparent RGBA background, font,
    # size limits and a fixed random seed.
    cloud_options = dict(
        mask=imread('lianxi/background1.png'),
        background_color=None,
        stopwords=excluded_words,
        max_words=250,
        scale=4,
        mode='RGBA',
        min_font_size=10,
        max_font_size=70,
        random_state=42,
        font_path="C:\\Windows\\Fonts\\SimHei.TTF",
    )
    cloud = wordcloud.WordCloud(**cloud_options)
    # Generate the cloud from the segmented text.
    cloud.generate(cut_word())

    # Take the word colors from a second image.
    color_source = imread('lianxi/color.jpg')
    palette = wordcloud.ImageColorGenerator(np.array(color_source))
    cloud.recolor(color_func=palette)

    plt.figure(figsize=(20, 20))
    plt.rcParams['font.family'] = 'SimHei'
    # Draw the cloud without axes and save the figure.
    plt.imshow(cloud)
    plt.axis("off")
    plt.savefig('dataout/图7豆瓣电影词语云.png')
Exemplo n.º 4
0
    def show_cloud(self):
        """Count selected technology keywords in ``job.txt`` and plot them.

        Reads ``job.txt``, strips tabs, newlines and some punctuation,
        segments the text with jieba, keeps only the keywords in
        ``filter_words``, prints the 30 most common ones, then draws a word
        cloud shaped and colored by ``./wordcloud_bg.jpg``, shows it and
        saves it to ``./out.jpg``.
        """
        # Read the whole file; the context manager closes it even on error.
        with open('job.txt') as fn:
            string_data = fn.read()

        # Text clean-up. Raw string so the regex escapes are not parsed as
        # (invalid) string escapes.
        pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')
        string_data = pattern.sub('', string_data)

        # Segment in jieba's precise mode.
        seg_list_exact = jieba.cut(string_data, cut_all=False)

        # Keep-list mode: only these keywords are counted.
        filter_words = {
            u'Hadoop', u'Spark', u'Hive', u'Flink', u'Hbase', u'hdfs',
            u'Python'
        }
        object_list = [word for word in seg_list_exact if word in filter_words]

        # Word frequencies; print the 30 most common for inspection.
        word_counts = collections.Counter(object_list)
        word_counts_top30 = word_counts.most_common(30)
        print(word_counts_top30)

        # The background image provides both the shape and the colors.
        mask = np.array(Image.open('./wordcloud_bg.jpg'))

        wc = wordcloud.WordCloud(
            font_path="./SourceHanSerifSC-Bold.otf",  # font file
            mask=mask,  # shape image
            max_words=30,  # maximum number of words shown
            max_font_size=400  # largest font size
        )

        wc.generate_from_frequencies(word_counts)  # build from the counts
        image_colors = wordcloud.ImageColorGenerator(mask)
        wc.recolor(color_func=image_colors)  # color words like the image

        plt.imshow(wc)  # draw the cloud
        plt.axis('off')  # hide the axes
        plt.show()  # display the figure
        plt.imsave("./out.jpg", wc)
Exemplo n.º 5
0
def make_word_cloud(object_list):
    """Draw a word cloud of the items in ``object_list`` and show it.

    Each occurrence of an item counts once toward its frequency. The image
    named by the module-level ``background`` gives the cloud its shape and
    colors; the module-level ``number`` caps the number of words shown.
    """
    print('\n开始制作词云……')  # progress message
    frequencies = collections.Counter(object_list)
    shape = numpy.array(Image.open(background))  # shape/color image
    cloud = wordcloud.WordCloud(
        font_path='file/simfang.ttf',
        background_color='white',
        mask=shape,
        max_words=number,
        max_font_size=150,
    )

    cloud.generate_from_frequencies(frequencies)
    # Color every word like the corresponding area of the background image.
    palette = wordcloud.ImageColorGenerator(shape)
    cloud.recolor(color_func=palette)

    plt.figure('词云')  # window title
    # Shrink the margins so the cloud fills the window.
    margins = dict(top=0.99, bottom=0.01, right=0.99, left=0.01,
                   hspace=0, wspace=0)
    plt.subplots_adjust(**margins)
    plt.imshow(cloud, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    print('制作完成!')  # progress message
    plt.show()
Exemplo n.º 6
0
def show_cloud():
    """Show a word cloud built from every comment in the database.

    Comment texts have HTML tags and newlines removed, are concatenated,
    segmented with jieba and rendered with ``word_cloud_mask.png`` as both
    shape and color source.
    """
    # Collect all comments, stripping HTML tags and newlines from each.
    cleaned_comments = [
        re.sub('<[^<]+?>', '', comment['text']).replace('\n', '').strip()
        for comment in database.get_comments()
    ]
    total_comment_text = "".join(cleaned_comments)

    # Cut the text into words and join them with spaces for WordCloud.
    wordliststr = " ".join(jieba.lcut(total_comment_text))

    font = os.path.join(os.path.dirname(__file__), "word_cloud_yahei.ttf")
    # Background image that defines the shape of the cloud.
    shape = np.array(Image.open('word_cloud_mask.png'))
    stop_words = get_stop_words()

    # Build the cloud.
    cloud = wordcloud.WordCloud(
        scale=8,
        width=1920,
        height=1080,
        font_path=font,
        mask=shape,
        max_font_size=100,
        min_font_size=12,
        background_color="white",
        stopwords=stop_words,
    )
    cloud = cloud.generate(wordliststr)

    # Word colors come from the background image.
    cloud.recolor(color_func=wordcloud.ImageColorGenerator(shape))

    plt.figure()
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
Exemplo n.º 7
0
def cloudplot():
    """Plot a frequency-based word cloud and save it as a PNG.

    Draws the cloud in its default colors, then recolored from the template
    image, then the template image itself, and finally writes the cloud to
    ``filename + '.png'`` (``filename`` is not defined in this function).
    """
    import matplotlib.pyplot as plt

    # Template image: used as the mask and as the color source.
    template = imread(r'C:\Users\ctrl\Desktop\heart.jpg')

    # Build the cloud from the frequencies returned by AnalyzeData().
    cloud_builder = WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf',
                              background_color="white",
                              max_words=2000,
                              mask=template)
    word_cloud = cloud_builder.generate_from_frequencies(AnalyzeData())

    # Color function derived from the template image.
    template_colors = wordcloud.ImageColorGenerator(template)

    # 1) cloud with default colors
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.figure()

    # 2) cloud recolored to follow the template's color distribution
    recolored = word_cloud.recolor(color_func=template_colors)
    plt.imshow(recolored)
    plt.axis("off")
    plt.figure()

    # 3) the template image itself
    plt.imshow(template, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()

    word_cloud.to_file(filename + '.png')
Exemplo n.º 8
0
def get_wordcloud():  # word-cloud generation
    """Build a word cloud from ``jd_comment.txt`` and save it as ``词云.jpg``.

    Each line is cleaned with a regex, segmented with jieba, filtered
    against ``stopwordslist()`` and against one-character tokens, counted,
    and rendered with ``./background.jpg`` as shape and color source.
    """
    # Compiled once instead of once per line. The runtime pattern text is
    # unchanged (``\\s`` only spells the backslash explicitly).
    # NOTE(review): ":-【" inside the first character class is a range, so it
    # also removes ASCII letters; confirm that this is intended.
    noise = re.compile(
        u"[0-9\\s+.!/,$%^*()?;;:-【】\"\']+|[+—!,;:。?、~@#¥%…&*()><-]+")

    # Collect tokens directly. The previous version concatenated the
    # space-joined tokens of consecutive lines without a separator, which
    # glued the last token of one line to the first token of the next.
    tokens = []
    with open("jd_comment.txt") as f:
        for line in f:
            # The pattern's ``\s`` already removes the trailing newline.
            line = noise.sub("", line)
            if line == "":
                continue
            tokens.extend(jieba.cut(line, cut_all=False))

    stopwords = stopwordslist()
    # Frequency count of the tokens that survive the filters.
    word_counts = Counter(
        word for word in tokens
        if word not in stopwords and len(word) > 1
    )

    mask = np.array(PIL.Image.open(r'./background.jpg'))
    wc = WordCloud(font_path=r'C:\Windows\Fonts\SimHei.TTF',
                   max_words=2000,
                   mask=mask,
                   repeat=False)
    wc.generate_from_frequencies(word_counts)
    # Optional step: make the word colors follow the background image.
    image_colors = wordcloud.ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)
    wc.to_file("词云.jpg")
Exemplo n.º 9
0
def question_41():
    """Build a word cloud from ``汽车之家论坛_cut.txt`` and save it as a PNG.

    The cloud is shaped and colored by ``bk.jpg`` and written to
    ``汽车之家论坛_word_cloud.png``.
    """
    data_name = '汽车之家论坛'

    # Context manager so the file handle is closed (it was leaked before).
    with open('{}_cut.txt'.format(data_name), 'r',
              encoding='utf-8') as text_file:
        text = text_file.read()
    alice_coloring = imread("bk.jpg")

    wc = wordcloud.WordCloud(background_color="white",
                             width=800,
                             height=600,
                             mask=alice_coloring,
                             max_font_size=20,
                             random_state=1,
                             max_words=100,
                             font_path='C:\\Windows\\msyh.ttf')

    wc.generate(text)
    # Color function derived from the mask image.
    image_colors = wordcloud.ImageColorGenerator(alice_coloring)

    plt.axis("off")
    plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    plt.figure(dpi=300)
    plt.axis("off")
    wc.to_file('{}_word_cloud.png'.format(data_name))
Exemplo n.º 10
0
    def create_word_cloud(self, text):
        """
        Draw, save and show a word cloud for ``text``.

        :param text: input handed to ``self.tokenization``; the tokens it
            returns are counted to obtain the word frequencies
        :return: None
        """
        word_list = self.tokenization(text)
        word_counts = collections.Counter(word_list)  # word frequencies

        # The background image provides both the shape and the colors.
        mask = np.array(Image.open(self.wc_background))
        wc = wordcloud.WordCloud(
            font_path=self.font_path,  # font file
            mask=mask,  # shape image
            max_words=self.max_words,  # maximum number of words shown
            max_font_size=self.max_font_size  # largest font size
        )

        wc.generate_from_frequencies(word_counts)  # build from the counts
        image_colors = wordcloud.ImageColorGenerator(mask)
        wc.recolor(color_func=image_colors)  # color words like the image
        plt.imshow(wc)  # draw the cloud
        plt.axis('off')  # hide the axes
        # Save before show(), as in the original call order.
        plt.savefig(os.path.join(self.save_path, self.wc_name))
        plt.show()  # display the figure
Exemplo n.º 11
0
def title():
    """Show a word cloud of every ``title`` field found in ``collection1``.

    Titles are concatenated, segmented with jieba in full mode and rendered
    with ``CR.jpg`` as shape and color source.
    """
    import jieba
    import jieba.analyse

    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stop_words.txt')

    # Gather the title of every record in the collection.
    titles = [record['title'] for record in collection1.find({})]
    joined_titles = "".join(titles)

    # Full-mode segmentation, joined with spaces for WordCloud.
    word_space_split = " ".join(jieba.cut(joined_titles, cut_all=True))

    no_stopwords = set()  # an empty set is passed as the stop-word list
    shape = np.array(Image.open('CR.jpg'))  # shape/color image
    cloud = wordcloud.WordCloud(
        background_color="white",
        font_path="fonts/simkai.ttf",
        stopwords=no_stopwords,
        mask=shape,
        max_words=200,
        max_font_size=100,
    )
    cloud.generate(word_space_split)  # build the cloud from the text
    # Color every word like the corresponding area of the image.
    palette = wordcloud.ImageColorGenerator(shape)
    cloud.recolor(color_func=palette)

    plt.imshow(cloud)  # draw the cloud
    plt.axis('off')  # hide the axes
    plt.show()  # display the figure
Exemplo n.º 12
0
def word_cloud(segment, title):
    """
    Generate a word-cloud image and save it under the app's static folder.

    :param segment: iterable of words; each occurrence counts once toward
        that word's frequency
    :param title: not used by this function
    :return: tuple of the image URL (relative to ``ROOT_DIR``) and the list
        of the 10 most common ``(word, count)`` pairs
    """
    frequencies = collections.Counter(segment)
    top_ten = frequencies.most_common(10)
    print(top_ten)

    # Assets and output live below the configured application root.
    root_dir = current_app.config['ROOT_DIR']
    shape = np.array(Image.open(root_dir + '/static/background.jpg'))
    cloud = wordcloud.WordCloud(
        font_path=root_dir+'/static/STFQLBYTJW.ttf',
        mask=shape,
        max_words=200,
        max_font_size=100,
    )

    cloud.generate_from_frequencies(frequencies)
    # Color every word like the corresponding area of the background image.
    palette = wordcloud.ImageColorGenerator(shape)
    cloud.recolor(color_func=palette)

    # The current timestamp is embedded in the output file name.
    r_img_url = '/static/wordcloud/wordcloud' + str(time()) + '.png'
    cloud.to_file(root_dir + r_img_url)
    print('图片生成成功!')
    return r_img_url, top_ten
Exemplo n.º 13
0
 def words_cloud(self, chin_list: list, eng_list: list, rm_words=None):
     """Build a word cloud from two word lists and save it as a JPEG.

     :param chin_list: list of words; concatenated with ``eng_list``
     :param eng_list: list of words; concatenated after ``chin_list``
     :param rm_words: words to exclude; a built-in default list is used
         when this is ``None``
     :return: None; the image is written to ``{self.path}/word_cloud.jpg``
     """
     if rm_words is None:
         rm_words = [
             '', 'and', 'to', 'the', 'a', 'for', 'our', 'we', 'with', 'in',
             'of', 'We', '的', 'be', 'that', 'their', 'as', 'on', 'an', 'is',
             '和', 'have', 'are', 'by', 'most', '-', 'where', 'its'
         ]
     # Set for O(1) membership tests instead of scanning a list per word.
     remove_words = set(rm_words)
     word_counts = collections.Counter(
         word for word in chin_list + eng_list if word not in remove_words)

     mask = np.array(Image.open('./wordcloud.jpg'))
     wc = wordcloud.WordCloud(
         font_path='/r2/dockerfile/py2/fonts/simhei.ttf',  # font file
         mask=mask,
         scale=4,
         background_color='white',
         max_words=2000,
         max_font_size=100)
     wc.generate_from_frequencies(word_counts)
     image_colors = wordcloud.ImageColorGenerator(mask)
     # Recolor once (it was applied twice in a row before) and draw.
     plt.imshow(wc.recolor(color_func=image_colors),
                interpolation="bilinear")
     plt.axis('off')
     wc.to_file(f'{self.path}/word_cloud.jpg')
Exemplo n.º 14
0
def main():
    """Render the ``gm`` transcript as a word cloud shaped by its logo.

    Loads the logo, resizes it to 1280x1080, uses it as both mask and color
    source, generates the cloud from the transcript text and writes it to
    ``wc.jpg``.
    """
    base_logo_path = "../../assets/publiccompany/logos/%s.png"
    base_transcript_path = "../../assets/publiccompany/transcripts/%s.txt"
    ticker = "gm"

    img = Image.open(base_logo_path % ticker)
    img = img.resize((1280, 1080))
    # The same pixel data serves as color source and as mask.
    img_array = np.array(img)
    img_color_gen = wordcloud.ImageColorGenerator(img_array)

    # Work on a copy so the library-wide STOPWORDS set is not mutated.
    stop_words = set(wordcloud.STOPWORDS)
    stop_words.update(["quarter", "think", "customer", "growth", "Azure"])

    # Context manager so the transcript file is closed after reading.
    with open(base_transcript_path % ticker) as transcript:
        src_text = transcript.read()

    wc = wordcloud.WordCloud(width=1280,
                             height=720,
                             mask=img_array,
                             stopwords=stop_words,
                             max_words=1000)
    wc.generate(src_text)
    wc.recolor(color_func=img_color_gen)
    wc.to_file("wc.jpg")
Exemplo n.º 15
0
    def makewordcloud(self):
        """Show a word cloud of the keywords extracted from ``comment.csv``.

        The second column of every CSV row is collected, the top 100
        weighted tags are extracted with jieba (stop words from
        ``stop.txt``), and the cloud is shaped and colored by ``index.png``.
        """
        # Second column of every row, one entry per line.
        with open('comment.csv', 'r') as f:
            content = ''.join(row[1] + '\n' for row in csv.reader(f))

        jieba.analyse.set_stop_words('stop.txt')
        tags = jieba.analyse.extract_tags(content, topK=100, withWeight=True)
        # Scale the weights to integers; the earlier ``str(...)`` assignment
        # was dead code, overwritten on the very next line.
        word_freq = {word: int(weight * 10000) for word, weight in tags}

        mask = np.array(Image.open('index.png'))  # shape/color image
        wc = wordcloud.WordCloud(
            font_path='C:/Windows/Fonts/simhei.ttf',  # font file
            mask=mask,  # shape image
            max_words=2000,  # maximum number of words shown
            max_font_size=120,  # largest font size
            background_color='white')

        wc.generate_from_frequencies(word_freq)  # build from the weights
        image_colors = wordcloud.ImageColorGenerator(mask)
        wc.recolor(color_func=image_colors)  # color words like the image
        plt.imshow(wc)  # draw the cloud
        plt.axis('off')  # hide the axes
        plt.show()  # display the figure
Exemplo n.º 16
0
def cloudplot():
    """Plot a frequency-based word cloud and save it as a PNG.

    Draws the cloud in its default colors, then recolored from the template
    image, then the template image itself, and finally writes the cloud to
    ``filename + '.png'`` (``filename`` is not defined in this function).
    """
    # Template image: used as the mask and as the color source.
    template = imageio.imread(r'data\alice.png')

    # Build the cloud from the frequencies returned by AnalyzeData().
    cloud_builder = WordCloud(
        font_path=r'C:\windows\Fonts\simhei.ttf',
        background_color="white",
        max_words=2000,
        mask=template,
    )
    word_cloud = cloud_builder.generate_from_frequencies(AnalyzeData())

    # Color function derived from the template image.
    template_colors = wordcloud.ImageColorGenerator(template)

    # 1) cloud with default colors
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.figure()

    # 2) cloud recolored to follow the template's color distribution
    recolored = word_cloud.recolor(color_func=template_colors)
    plt.imshow(recolored)
    plt.axis("off")
    plt.figure()

    # 3) the template image itself
    plt.imshow(template, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()

    word_cloud.to_file(filename + '.png')
Exemplo n.º 17
0
    def get_wordcloud(self, imagePath, words_str, store_path=''):
        """Generate, display and save a word cloud shaped by an image.

        :param imagePath: path of the image used as mask and color source
        :param words_str: text handed to ``WordCloud.generate``
        :param store_path: output file; ``self.OUTPUTPATH`` is used when
            this is empty
        """
        # The image is read twice: once via matplotlib for the mask, once
        # via PIL for its dimensions and the color array.
        background = plt.imread(imagePath)
        img = Image.open(imagePath)

        # font_path must point to a font that can render the words;
        # max/min_font_size bound the size of the drawn words.
        cloud = wordcloud.WordCloud(
            mask=background,
            font_path=self.FONTPATH,
            background_color='white',
            width=img.width,
            height=img.height,
            max_font_size=400,
            min_font_size=5,
        )

        alice_coloring = np.array(img)
        image_colors = wordcloud.ImageColorGenerator(alice_coloring)

        cloud.generate(words_str)  # build the cloud

        # Three panels: default colors, image colors, the source image.
        fig, axes = plt.subplots(1, 3)
        default_ax, recolored_ax, source_ax = axes
        default_ax.imshow(cloud, interpolation="bilinear")
        recolored_ax.imshow(cloud.recolor(color_func=image_colors),
                            interpolation="bilinear")
        source_ax.imshow(alice_coloring, cmap=plt.cm.gray,
                         interpolation="bilinear")
        for panel in axes:
            panel.set_axis_off()
        plt.show()

        # Write the (recolored) cloud to the requested location.
        cloud.to_file(store_path or self.OUTPUTPATH)
Exemplo n.º 18
0
def main():
    """Interactive console word-cloud generator.

    Asks for a text file and optionally a shape image, then repeatedly asks
    for rendering options, generates and displays a cloud, and loops until
    the user is satisfied. The last cloud is saved as ``<name>.png``.
    """
    print("@@@@@@@@欢迎来到词云生成器@@@@@@@@")
    text_path = input("请输入txt文档所在路径: ")
    raw_text = readFile(text_path)
    print("文件内容读取成功")
    txt = cutWords(raw_text)

    # Optional shape image; it also provides the optional color scheme.
    use_shape = getChoice("是否要改变词云形成形状?<1>是 <2>否 ", 1, 2) == 1
    color_mask = None
    if use_shape:
        shape_path = input("请输入形状图片所在路径: ")
        color_mask = getImage(shape_path)
        print("获取形状图片信息成功")
        image_colors = wordcloud.ImageColorGenerator(color_mask)

    plt.ion()
    while True:
        plt.close(1)  # drop the preview of the previous round
        back_color = input("请输入背景颜色的英文单词(red, black, white等): ")
        scale = getChoice("请输入图片清晰度(1比较模糊, 4较为清晰) ", 1, 100)

        if getChoice("是否使用默认字体<1>是 <2>否 ", 1, 2) == 2:
            font_path = getFont()
        else:
            font_path = "msyh.ttc"

        repeat = getChoice("是否允许词汇重复出现<1>是 <2>否 ", 1, 2) == 1

        max_words = getChoice("请输入最大出现词汇数量(默认200): ", 1, 99999)

        print("正在生成词云,时间可能较长,请稍后。。。(P.S. 显示时图片可能较模糊,但保存后图片会很清晰)")
        cloud = wordcloud.WordCloud(font_path=font_path,
                                    background_color=back_color,
                                    width=1000,
                                    height=700,
                                    mask=color_mask,
                                    scale=scale,
                                    repeat=repeat,
                                    max_words=max_words)
        cloud.generate(txt)

        # Recoloring is only offered when a shape image was supplied.
        if use_shape and getChoice("词云底色是否要与形状图片相同<1>是 <2>否 ", 1,
                                   2) == 1:
            cloud.recolor(color_func=image_colors)

        print("生成图片中。。。")
        plt.figure(1)
        plt.imshow(cloud)
        plt.axis('off')
        plt.pause(0.001)
        plt.show()

        satisfied = getChoice("是否满意<1>是 <2>否 ", 1, 2) == 1
        if satisfied:
            break

    name = input("请输入保存图片名: ")
    print("图片保存中。。。")
    cloud.to_file(name + ".png")
    # input() shows the final message and waits for Enter.
    input("图片保存成功!")
Exemplo n.º 19
0
    def extract_two(self, file=None):
        """Build a word cloud from a UTF-8 text file and save it to bb.jpg.

        The text is stripped of punctuation, digits, Latin letters and
        whitespace, segmented with jieba, filtered against a custom
        stop-word list plus the lists loaded by
        ``self.parse_multiple_files``, counted, and rendered with
        ``background.jpg`` as shape and color source.

        :param file: path of the UTF-8 text file to read
        """
        # Read-only access is enough, and the context manager closes the
        # file even if read() raises.
        with codecs.open(file, 'r', encoding='utf-8') as fn:
            string_data = fn.read()

        # Text clean-up: remove every character matched by the pattern.
        pattern = re.compile(
            '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~。“”、:?,【】!()——↓0-9a-zA-Z\.\.\.\.\.\.]+'
        )
        string_data = re.sub(pattern, '', string_data)
        string_data = string_data.replace('\n', '')
        string_data = string_data.replace('\u3000', '')
        string_data = string_data.replace('\r', '')
        string_data = string_data.replace(' ', '')
        logging.info(string_data)

        # Segment in jieba's precise mode.
        seg_list_exact = jieba.cut(string_data, cut_all=False)
        object_list = []
        remove_words_custom = [
            u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都', u'。',
            u' ', u'、', u'中', u'在', u'了', u'通常', u'如果', u'我们', u'需要', u'月',
            u'日'
        ]  # custom stop words
        remove_words = self.parse_multiple_files(
            ['中文停用词表.txt', '哈工大停用词表.txt', '四川大学机器智能实验室停用词库.txt', '百度停用词表.txt'])
        # Frozen set: O(1) lookup per token instead of scanning the merged
        # list. Assumes the loaded entries are hashable strings.
        remove_words = frozenset(remove_words_custom + remove_words)
        for word in seg_list_exact:
            if word not in remove_words:  # keep only non-stop-words
                logging.info('\n')
                logging.info(word)
                object_list.append(word)
        logging.info(object_list)

        # Word frequencies; print the 10 most common for inspection.
        word_counts = collections.Counter(object_list)
        word_counts_top10 = word_counts.most_common(10)
        print(word_counts_top10)

        font_path = r'C:\Windows\Fonts\simfang.ttf'
        mask = np.array(Image.open('background.jpg'))  # shape/color image
        # NOTE(review): scale=80 multiplies the output size by 80 in each
        # dimension; confirm this is affordable for the mask in use.
        wc = wordcloud.WordCloud(
            background_color='white',  # background color
            font_path=font_path,  # font file
            mask=mask,  # shape image
            max_words=200,  # maximum number of words shown
            max_font_size=200,  # largest font size
            scale=80  # output scaling factor
        )
        wc.generate_from_frequencies(word_counts)  # build from the counts
        image_colors = wordcloud.ImageColorGenerator(mask)
        wc.recolor(color_func=image_colors)  # color words like the image
        plt.figure()
        plt.imshow(wc)  # draw the cloud
        plt.axis('off')  # hide the axes
        plt.show()  # display the figure
        wc.to_file("bb.jpg")  # write the image file
Exemplo n.º 20
0
def parse_args(arguments):
    """Parse the word-cloud command-line arguments.

    Returns the ``argparse`` namespace with ``text`` replaced by the file's
    contents, ``stopwords`` replaced by a set of stripped lines, ``mask``
    replaced by an image array, and ``collocations`` and ``color_func``
    added.

    Raises:
        ValueError: if both ``--colormask`` and ``--color`` are given.
    """
    description = ('A simple command line interface for wordcloud module.')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--text', metavar='file', type=argparse.FileType(), default='-',
        help='specify file of words to build the word cloud (default: stdin)')
    parser.add_argument('--stopwords', metavar='file', type=argparse.FileType(),
        help='specify file of stopwords (containing one word per line) to remove from the given text after parsing')
    # NOTE(review): the image is written by the caller; text mode 'w' is
    # kept here, confirm whether binary mode is needed there.
    parser.add_argument('--imagefile', metavar='file', type=argparse.FileType('w'), default='-',
        help='file the completed PNG image should be written to (default: stdout)')
    parser.add_argument('--fontfile', metavar='path', dest='font_path',
        help='path to font file you wish to use (default: DroidSansMono)')
    # Image files are opened in binary mode: Image.open needs bytes, and a
    # text-mode handle yields str on Python 3.
    parser.add_argument('--mask', metavar='file', type=argparse.FileType('rb'),
        help='mask to use for the image form')
    parser.add_argument('--colormask', metavar='file', type=argparse.FileType('rb'),
        help='color mask to use for image coloring')
    parser.add_argument('--relative_scaling', type=float, default=0,
        metavar='rs', help=' scaling of words by frequency (0 - 1)')
    parser.add_argument('--margin', type=int, default=2,
        metavar='width', help='spacing to leave around words')
    parser.add_argument('--width', type=int, default=400,
        metavar='width', help='define output image width')
    parser.add_argument('--height', type=int, default=200,
        metavar='height', help='define output image height')
    parser.add_argument('--color', metavar='color',
        help='use given color as coloring for the image - accepts any value from PIL.ImageColor.getcolor')
    parser.add_argument('--background', metavar='color', default='black', type=str, dest='background_color',
        help='use given color as background color for the image - accepts any value from PIL.ImageColor.getcolor')
    parser.add_argument('--no_collocations', action='store_true',
        help='do not add collocations (bigrams) to word cloud (default: add unigrams and bigrams)')
    args = parser.parse_args(arguments)

    if args.colormask and args.color:
        raise ValueError('specify either a color mask or a color function')

    with args.text:
        args.text = args.text.read()

    if args.stopwords:
        with args.stopwords:
            args.stopwords = set(map(str.strip, args.stopwords.readlines()))

    if args.mask:
        # np.array() forces the pixel data to load, so the handle can be
        # closed right after.
        with args.mask:
            args.mask = np.array(Image.open(args.mask))

    # Default coloring; overridden below by a color mask or a single color.
    color_func = wc.random_color_func
    if args.colormask:
        with args.colormask:
            image = np.array(Image.open(args.colormask))
        color_func = wc.ImageColorGenerator(image)

    if args.color:
        color_func = wc.get_single_color_func(args.color)

    args.collocations = not args.no_collocations

    args.color_func = color_func
    return args
Exemplo n.º 21
0
def wordCount(url):
    """Count the words of the text behind ``url`` and draw a word cloud.

    The file name is the part of ``url`` between its last ``/`` and its last
    ``.``. After ``it.contentInfo(url, True)`` is called, the text is read
    from ``../../fileInfo/<name>.txt``, cleaned, segmented with jieba and
    counted; the 10 most common words are printed and the cloud is saved to
    ``../../fileInfo/<name>.png``.

    :param url: address whose last path component names the text file
    """
    fileName = url[url.rfind('/') + 1:url.rfind('.')]
    it.contentInfo(url, True)
    # Read the whole file; the context manager closes it even on error.
    with open('../../fileInfo/' + fileName + '.txt') as fn:
        string_data = fn.read()

    # Text clean-up. Raw string so the regex escapes are not parsed as
    # (invalid) string escapes.
    pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')
    string_data = re.sub(pattern, '', string_data)

    # Segment in jieba's precise mode.
    seg_list_exact = jieba.cut(string_data, cut_all=False)
    # Custom stop words, as a frozen set for O(1) membership tests.
    remove_words = frozenset([
        u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都', u'。',
        u' ', u'、', u'中', u'在', u'了', u'通常', u'如果', u'我们', u'需要', u'…', u':',
        u'“', u'”', u':', u'但', u'型', u'被'
    ])
    object_list = [word for word in seg_list_exact if word not in remove_words]

    # Word frequencies and the 10 most common words.
    word_counts = collections.Counter(object_list)
    word_counts_top10 = word_counts.most_common(10)
    # A single formatted string prints the same on Python 2 and 3; the bare
    # ``print a, b`` statement is a syntax error on Python 3.
    for item in word_counts_top10:
        print('%s %s' % (item[0], item[1]))

    # The background image provides both the shape and the colors.
    mask = np.array(Image.open('./timg.jpeg'))
    wc = wordcloud.WordCloud(
        font_path='simhei.ttf',  # font file
        mask=mask,  # shape image
        width=800,
        height=600,
        max_words=200,  # maximum number of words shown
        max_font_size=100  # largest font size
    )
    # Build the cloud from the counts.
    wc.generate_from_frequencies(word_counts)
    # Color every word like the corresponding area of the background image.
    image_colors = wordcloud.ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)
    # Save the image.
    plt.imsave('../../fileInfo/' + fileName + '.png', wc)
    # Draw the cloud without axes.
    plt.imshow(wc)
    plt.axis('off')
Exemplo n.º 22
0
def show_novel():
    """Plot the top-20 word frequencies and a word cloud for a novel.

    Reads the stop words, the GBK-encoded novel and a jieba user dictionary
    from ``materials/``, saves the bar chart to ``词频统计.png`` and the word
    cloud to ``wordcloud.png``, and shows both figures.
    """
    # Stop words as a set: the membership test below runs once per token of
    # the whole novel, so a list scan would be needlessly slow.
    with open('materials/停用词.txt', 'r') as f:
        stop_words = {line.strip() for line in f}

    # Load the novel text.
    with open('materials/全职高手.txt', 'r', encoding='gbk') as f:
        text = f.read()

    # Load the user dictionary (must be UTF-8 encoded).
    jieba.load_userdict('materials/词库字典.txt')

    # Segment the text, dropping stop words and one-character tokens.
    def txt_cut(f):
        return [w for w in jieba.cut(f) if w not in stop_words and len(w) > 1]

    textcut = txt_cut(text)

    # The 20 most frequent words.
    word_count = pd.Series(textcut).value_counts().sort_values(
        ascending=False)[0:20]

    # Draw and save the bar chart.
    fig = plt.figure(figsize=(15, 8))
    x = word_count.index.tolist()
    y = word_count.values.tolist()
    # x/y passed by keyword: current seaborn no longer accepts the data
    # vectors positionally.
    sns.barplot(x=x, y=y, palette='BuPu_r')
    plt.title('词频Top20')
    plt.ylabel('count')
    sns.despine(bottom=True)
    plt.savefig('词频统计.png', dpi=400)
    plt.show()

    # Build the word cloud from the segmented text.
    fig_cloud = plt.figure(figsize=(15, 5))
    # font_path selects the font file; mask gives the cloud its shape.
    cloud = wordcloud.WordCloud(font_path='materials/simkai.ttf',
                                mask=imread('materials/test.jpg'),
                                mode='RGBA',
                                background_color=None).generate(
                                    ' '.join(textcut))

    # Take the word colors from a second image.
    img = imread('materials/color.jpg')
    cloud_colors = wordcloud.ImageColorGenerator(np.array(img))
    cloud.recolor(color_func=cloud_colors)

    # Draw, save and show the cloud.
    plt.imshow(cloud)
    plt.axis('off')
    plt.savefig('wordcloud.png', dpi=400)
    plt.show()
Exemplo n.º 23
0
def plotWordColud(user_id):
    """Write word-cloud images for a user's posts and for their friends' posts.

    Reads ``weibo.csv`` (column 1 is compared with *user_id*, column 2 is
    used as the post text) and ``Edges.csv`` (columns 0 and 1 are the two
    ends of an edge).  Every id sharing an edge with *user_id* counts as a
    friend.  The concatenated texts are passed through ``segment`` and
    rendered to ``./wordCloud/user_.png`` and ``./wordCloud/friend_.png``.

    :param user_id: id to match against the CSV columns (compared with ``==``
        against the strings produced by ``csv.reader``).
    """
    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly.
    with open("weibo.csv", 'r', newline='') as file:
        weibos = list(csv.reader(file))

    with open("Edges.csv", 'r', newline='') as file:
        edges = list(csv.reader(file))

    # A set gives de-duplication and constant-time membership tests below.
    friends = set()
    for edge in edges:
        if edge[0] == user_id:
            friends.add(edge[1])
        if edge[1] == user_id:
            friends.add(edge[0])

    # join() instead of repeated ``+=`` keeps concatenation linear.
    user_text = "".join(weibo[2] for weibo in weibos if weibo[1] == user_id)
    friend_text = "".join(weibo[2] for weibo in weibos if weibo[1] in friends)

    user_ = ",".join(segment(user_text))
    friend_ = ",".join(segment(friend_text))

    # Background image: used both as the cloud mask and as the colour source.
    mask = np.array(Image.open('logo.jpg'))
    image_colors = wordcloud.ImageColorGenerator(mask)

    # Render and save both clouds with the same configuration.
    wc = WordCloud(background_color="white",
                   max_words=200,
                   min_font_size=10,
                   max_font_size=35,
                   width=400,
                   font_path="/Users/wu/Downloads/msyh/msyh.ttf",
                   mask=mask,
                   color_func=image_colors)
    wc.generate(user_)
    wc.to_file("./wordCloud/user_.png")

    wc.generate(friend_)
    wc.to_file("./wordCloud/friend_.png")
Exemplo n.º 24
0
    def getKeyWords(self):
        """Count keywords in the weibo texts and write a word-cloud image.

        Concatenates ``weibo['text']`` for every entry of ``self.weibos``,
        segments it with jieba and counts every token whose length is not 1.
        Of the 10 most frequent tokens, those not in a fixed filter list are
        appended to ``self.keywors`` as ``(word, count)`` pairs.  When at
        least one token was counted, a word cloud of all counts is written
        to ``./derived/kw_<user_id>.png``.
        """
        # NOTE(review): ``keywors`` looks like a typo of ``keywords``; the
        # attribute is created outside this method, so the name is kept.
        text = "".join(weibo['text'] for weibo in self.weibos)

        wcdict = {}
        for word in jieba.cut(text):
            if len(word) == 1:
                continue
            wcdict[word] = wcdict.get(word, 0) + 1
        # Most frequent first; ties keep first-seen order (stable sort).
        wcls = sorted(wcdict.items(), key=lambda x: x[1], reverse=True)

        # Common filler words that should not be reported as keywords.
        xx = {
            '他们', '没有', '自己', '一个', '什么', '这样', '知道', '我们', '这个', '这些', '不过',
            '已经', '要是', '觉得', '那样', '而且', "微博", "转发", "通过", "现在", "有人", "时候"
        }

        for pair in wcls[:10]:
            if pair[0] not in xx:
                self.keywors.append(pair)

        # Build the word cloud; the image is both the mask and colour source.
        mask = np.array(Image.open('cc.jpg'))
        image_colors = wordcloud.ImageColorGenerator(mask)

        # The cloud is generated straight from the frequency table, so the
        # only thing to check is that at least one token was counted (no
        # throw-away repeated-word string needs to be built for that).
        if wcls:
            wc = WordCloud(background_color="white",
                           max_words=300,
                           min_font_size=15,
                           repeat=False,
                           max_font_size=50,
                           width=400,
                           font_path="/Users/wu/Downloads/msyh/msyh.ttf",
                           mask=mask,
                           color_func=image_colors)
            wc.generate_from_frequencies(dict(wcls))
            file_path = "./derived/kw_" + self.user_id + ".png"
            wc.to_file(file_path)
Exemplo n.º 25
0
    def parse(self, response):
        """Show an image-coloured word cloud built from the page's paragraphs.

        Collects the text of every ``p`` node under ``.detail_text``, joins
        it into one string, passes it to ``sta`` and counts the returned
        items.  The counts are drawn inside the shape given by ``py.png``
        (located next to this module) and coloured from the same image.

        :param response: object exposing ``.css()`` selectors.
        """
        paragraphs = [
            fragment
            for quote in response.css('.detail_text')
            for fragment in quote.css('p::text').extract()
        ]
        content = ''.join(paragraphs)

        # presumably ``sta`` tokenises/filters the text — verify against its
        # definition; only its return value being countable is relied on here.
        words = sta(content)
        frequencies = dict(Counter(words).most_common())

        mask = np.array(Image.open(path.join(path.dirname(__file__),
                                             "py.png")))
        # Per-pixel conversion of the mask via ``self.transform_format``
        # into a 2-D int32 array of the same height and width.
        transformed_mask = np.ndarray((mask.shape[0], mask.shape[1]), np.int32)
        for row_index, row in enumerate(mask):
            transformed_mask[row_index] = [
                self.transform_format(pixel) for pixel in row
            ]

        cloud = wordcloud.WordCloud(
            background_color="#000",
            width=1000,
            height=800,
            mask=transformed_mask,
            collocations=False).generate_from_frequencies(frequencies)

        image_colors = wordcloud.ImageColorGenerator(mask)

        plt.figure()
        # plot words
        plt.imshow(cloud.recolor(color_func=image_colors),
                   interpolation="bilinear")
        # remove axes
        plt.axis("off")
        # show the result
        plt.show()
Exemplo n.º 26
0
def word_cloud(wordsCnt, i):
    """Render, display and save a word cloud coloured from ``wordcloud.jpg``.

    :param wordsCnt: mapping of word -> frequency passed to
        ``generate_from_frequencies``.
    :param i: value appended to the output file name; the image is written
        to ``WordCloud<i>.png``.
    """
    mask = np.array(Image.open('wordcloud.jpg'))  # image defining the cloud shape
    wc = wordcloud.WordCloud(
        font_path='C:/Windows/Fonts/simhei.ttf',  # font file to draw with
        mask=mask,  # shape mask
        max_words=200,  # maximum number of words shown
        max_font_size=100  # largest font size
    )
    wc.generate_from_frequencies(wordsCnt)  # build the cloud from the counts
    image_colors = wordcloud.ImageColorGenerator(mask)  # colours from the image
    wc.recolor(color_func=image_colors)  # apply the image colours
    plt.imshow(wc)  # draw the cloud
    plt.axis('off')  # hide the axes
    plt.show()  # display the figure
    # The file name needs an extension: the image is saved through Pillow,
    # which picks the output format from it and raises ValueError without one.
    wc.to_file('WordCloud' + str(i) + '.png')
def draw_word_cloud(word, graph):
    """Draw a word cloud, save it to ``word9.png`` and display it.

    :param word: text handed to ``WordCloud.generate``.
    :param graph: image array used both as the cloud mask and as the source
        of the word colours.
    """
    font_path = 'C:\\Windows\\Fonts\\Deng.ttf'  # font file to draw with
    wc = wordcloud.WordCloud(font_path=font_path,
                             background_color='white',
                             max_words=200,
                             mask=graph,
                             scale=1.5,
                             random_state=10)  # word-cloud configuration
    wc.generate(word)
    image_color = wordcloud.ImageColorGenerator(graph)
    # Recolour before saving so the file on disk matches what is displayed;
    # saving first would write the cloud with its default colours.
    wc.recolor(color_func=image_color)
    wc.to_file('word9.png')  # save the generated image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
Exemplo n.º 28
0
    def get_word_cloud(self,
                       chinese_slice=False,
                       stopwords=None,
                       image_out_name=None):
        """Render the chat content as a word cloud, show it and save it.

        The cloud is shaped and coloured by ``heart.png`` and drawn with the
        font at ``self.font_path``.

        :param chinese_slice: whether to segment the text with jieba first;
            when true the output file name is also prefixed with ``zh-``.
        :param stopwords: a set of words to exclude, passed to ``WordCloud``.
        :param image_out_name: output file name; defaults to
            ``word-heart.png``.
        :return: None
        """
        font_path = self.font_path
        if image_out_name is None:
            image_out_name = 'word-heart.png'
        if chinese_slice:
            text = " ".join(jieba.lcut(",".join(self.chat_content)))
            # Use ``+`` rather than calling ``str.__add__`` directly: the
            # dunder returns NotImplemented for a non-str name instead of
            # raising, which would only surface later when saving.
            image_out_name = 'zh-' + image_out_name
        else:
            text = " ".join(self.chat_content)
        mk = imageio.imread("heart.png")

        # Build and configure the cloud; the original author notes that
        # ``scale`` is set to improve sharpness.
        w = wordcloud.WordCloud(width=1000,
                                height=700,
                                background_color='white',
                                font_path=font_path,
                                mask=mk,
                                scale=2,
                                stopwords=stopwords,
                                contour_width=1,
                                contour_color='red')
        # Feed the text into the cloud.
        w.generate(text)
        # Colour the words from the background image and display the result.
        image_colors = wordcloud.ImageColorGenerator(mk)
        plt.imshow(w.recolor(color_func=image_colors))

        # Hide the axes.
        plt.axis("off")
        plt.show()

        # Export the cloud image to the current directory.
        w.to_file(image_out_name)
Exemplo n.º 29
0
 def word_cloud_generator(self, font_name="ShangShouXiuYuanTi.ttf"):
     """Render ``self.text`` as a word cloud image and return its path.

     The background image named ``self.img_name`` is read from
     ``static/backgrounds`` and used as the mask; when
     ``self.color_adaption`` is truthy the words are also coloured from it.
     The result is written under ``static/resume/images`` with the same
     file name.

     :param font_name: font file name, appended to ``font_base_path``.
     :return: path of the written image.
     """
     source_path = BASE_DIR + '/static/backgrounds/' + self.img_name
     target_path = BASE_DIR + '/static/resume/images/' + self.img_name
     shape = imread(source_path, flatten=False)

     # Only pass a colour function when image-based colouring is requested,
     # so the other case keeps the library default.
     colouring = {}
     if self.color_adaption:
         colouring = {'color_func': wc.ImageColorGenerator(shape)}

     rendered = WordCloud(font_path=font_base_path + font_name,
                          background_color=self.background_color,
                          mask=shape,
                          **colouring).generate(self.text)
     imsave(target_path, rendered)
     return target_path
    def plot(self):
        """Render ``self.word_counts`` as a word cloud, save it and show it.

        The cloud is shaped by ``self.mask`` and coloured from the same
        image, written to ``self.output_pic_path`` and then displayed with
        matplotlib.
        """
        # NOTE(review): scale=128 is very large — confirm it is intended
        # (the original comment says larger values give a sharper image).
        settings = dict(
            background_color='white',  # canvas colour
            font_path='/System/Library/Fonts/Hiragino Sans GB.ttc',  # font file
            mask=self.mask,  # shape mask
            max_words=200,  # maximum number of words shown
            max_font_size=80,  # largest font size
            scale=128,  # rendering scale
        )
        cloud = wordcloud.WordCloud(**settings)

        # Lay out the words from the frequency table.
        cloud.generate_from_frequencies(self.word_counts)
        # Take the colour scheme from the mask image and apply it.
        palette = wordcloud.ImageColorGenerator(self.mask)
        cloud.recolor(color_func=palette)
        # Write the image to disk before displaying it.
        cloud.to_file(self.output_pic_path)

        plt.imshow(cloud)
        plt.axis('off')  # hide the axes
        plt.show()