Example #1
def make_worldcloud(file_path):  # expects a UTF-8 encoded file
    text_from_file_with_apath = open(file_path, 'r', encoding='UTF-8').read()
    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
    wl_space_split = " ".join(wordlist_after_jieba)
    print(wl_space_split)
    backgroud_Image = plt.imread('D:/下载/123.jpg')  # image path
    print('Image loaded')
    '''Configure the word-cloud style'''
    stopwords = STOPWORDS.copy()
    stopwords.add("如果")  # blocked words; more can be added
    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',  # background color
        mask=backgroud_Image,
        font_path='D:/下载/simsun.ttc',  # font file
        stopwords=stopwords,  # the set must be passed in for the blocked words to apply
        max_font_size=400,
        random_state=50,
    )
    wc.generate_from_text(wl_space_split)  # load the text
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # word colors are taken from the background image
    plt.imshow(wc)  # display the word cloud
    plt.axis('off')  # hide the x and y axes
    plt.show()
    # directory this module lives in
    d = path.dirname(__file__)
    # os.path.join(): joins multiple path components
    #print(d)
    wc.to_file(path.join(d, "h11.jpg"))  # output image name
    print('Word cloud generated')
Example #2
def parse_comment():
    comments = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        try:
            for line in lines:
                comment = line.split(',')[2]
                if comment:
                    comments.append(comment)
        except Exception as e:
            print(e)
    comment_after_split = jieba.cut(str(comments), cut_all=False)
    words = ' '.join(comment_after_split)  # join with spaces so WordCloud sees word boundaries
    # filter out useless stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add('电影')
    stopwords.add('一部')
    stopwords.add('一个')
    stopwords.add('没有')
    stopwords.add('什么')
    stopwords.add('有点')
    stopwords.add('感觉')
    stopwords.add('毒液')
    stopwords.add('就是')
    stopwords.add('觉得')
    bg_image = plt.imread('venmo1.jpg')
    wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, font_path='STKAITI.TTF',
                   stopwords=stopwords, max_font_size=400, random_state=50)
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
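A side note on the long runs of stopwords.add() calls throughout these examples: STOPWORDS.copy() returns a plain Python set, so set.update collapses them into a single call. A minimal sketch (the word list is just an illustration):

from wordcloud import STOPWORDS

stopwords = STOPWORDS.copy()
# set.update accepts any iterable, replacing repeated .add() calls
stopwords.update({'电影', '一部', '一个', '没有', '什么', '有点', '感觉', '就是', '觉得'})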
Example #3
    def create_word_cloud_b64(word_list):
        random.shuffle(word_list)
        words = " ".join(word_list)

        stopwords = STOPWORDS.copy()
        stopwords.add("http")
        stopwords.add("https")
        stopwords.add("localstorage")
        stopwords.add("com_0")
        stopwords.add("new")
        stopwords.add("http_www")
        stopwords.add("sourceid")

        try:
            wordcloud = WordCloud(stopwords=stopwords,
                                  max_words=150,
                                  background_color="white",
                                  height=700,
                                  width=1200,
                                  margin=0,
                                  regexp=r"\w[\w'@\.]+").generate(words)

            plt.axis("off")
            plt.imshow(
                wordcloud)  # this must be here, or the wc image is blank
            wordcloud_bytes = io.BytesIO()  # io.BytesIO replaces Python 2's cStringIO
            plt.savefig(wordcloud_bytes, bbox_inches='tight', format='jpg')
            wordcloud_bytes.seek(0)
            wordcloud_b64 = base64.b64encode(wordcloud_bytes.read())
            return wordcloud_b64

        except Exception as e:
            logging.error("Problem making word cloud: {}".format(e))

        return False
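Assuming the function is callable at module level (the indentation above suggests it may be a method), a hypothetical usage sketch; the data-URI embedding is an illustration, not part of the original:

word_list = ['python', 'wordcloud', 'base64', 'matplotlib'] * 50
b64 = create_word_cloud_b64(word_list)
if b64:
    # base64.b64encode returns bytes; decode before embedding in HTML
    img_tag = '<img src="data:image/jpeg;base64,{}">'.format(b64.decode('ascii'))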
Example #5
def generate_word_cloud(data):
    # segment the text
    signatures = jieba.cut(str(data))
    words = ' '.join(signatures)
    print('Signatures:', words)

    # set blocked words
    stopwords = STOPWORDS.copy()
    stopwords.add("span")
    stopwords.add("class")
    stopwords.add("emoji")
    stopwords.add("emoji2764")

    # load the background image
    bg_img = plt.imread('love.jpg')

    # configure the word cloud
    wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_img, stopwords=stopwords,
                   max_font_size=400, random_state=50, font_path='STKAITI.TTF')
    # feed the segmented text into the cloud
    wc.generate_from_text(words)
    # draw the image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file('词云图.jpg')
Example #6
def wordCl(strs):
    stopwords = STOPWORDS.copy()
    #     stopwords.add('感觉')
    #     stopwords.add('第一集')
    #     stopwords.add('已经')
    #     stopwords.add('为什么')
    #     stopwords.add('啊啊啊')
    #     stopwords.add('不要')
    #     stopwords.add('没有')
    #     stopwords.add('?')
    #     stopwords.add('觉得')
    #     stopwords.add('时候')
    #     stopwords.add('开始')
    #     stopwords.add('一下')
    #     stopwords.add('自己')
    #     stopwords.add('就是')
    #     stopwords.add('还有')
    #     stopwords.add('但是')
    #     stopwords.add('怎么')
    #     stopwords.add('不要')
    word_list = [" ".join(jieba.cut(strs))]
    new_text = ' '.join(word_list)
    imagename = path.join(path.dirname(__file__), BACKGROUNDIMG)  # background image path
    coloring = imread(imagename)  # read the background image
    #fontname=path.join(path.dirname(__file__), "msyh.ttf")  # Microsoft YaHei font
    wordcloud = WordCloud(stopwords=stopwords,
                          min_font_size=10,
                          mask=coloring,
                          font_path="msyh.ttf",
                          scale=24,
                          background_color='white').generate(new_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Example #7
    def create_wordcloud(self, text):
        """Generate the word cloud."""
        # prepare the mask image
        maskPic = np.array(Image.open(IMAGE_PATH))

        # stopwords
        stopwords = STOPWORDS.copy()
        words = ["其次", "不錯看", "其实"]
        stopwords.update(words)  # add all blocked words at once

        wordcloud = WordCloud(
            font_path=TTF_PATH,  # font path (required for Chinese text)
            width=500,
            height=400,
            stopwords=stopwords,
            max_font_size=100,
            random_state=30,
            min_font_size=10,
            background_color="white").generate(text.replace("\n", ""))

        # recolor the words
        img_colors = ImageColorGenerator(maskPic)
        # word colors are taken from the background image
        wordcloud.recolor(color_func=img_colors)
        # render and save the image
        img = wordcloud.to_image()
        img.save(SAVE_IMAGE_PATH)
        print("Word cloud image saved")
Example #8
def main(save_files=False, db_filename='../output/database.sqlite'):
    conn = sqlite3.connect(db_filename)
    c = conn.cursor()

    # Retrieve papers
    c.execute('''SELECT *
                 FROM Papers''')

    paper_content = c.fetchall()
    conn.close()

    titles = ''

    for pc in paper_content:
        titles += pc[1]

    # A Marvin Minsky mask
    mask = np.array(Image.open("../files/minsky_mask.png"))

    wc = WordCloud(background_color="white", max_words=2000, mask=mask, stopwords=STOPWORDS.copy())
    # Generate word cloud
    wc.generate(titles)
    
    if save_files:
        # Store to file
        wc.to_file("../files/title_cloud.png")
    
    # Show word cloud
    plt.imshow(wc)
    plt.axis("off")
    # Show mask
#    plt.figure()
#    plt.imshow(mask, cmap=plt.cm.gray)
#    plt.axis("off")
    plt.show()
Example #9
def generate_wordcloud():
    comments = []
    with open('data/comments.txt', 'r', encoding='utf-8') as f:
        rows = f.readlines()
        try:
            for row in rows:
                comment = row.split(',')[2]
                if comment != '':
                    comments.append(comment)
        except Exception as e:
            print(e)
    comment_after_split = jieba.cut(str(comments), cut_all=False)
    words = ' '.join(comment_after_split)
    stopwords = STOPWORDS.copy()
    stopwords.add('电影')
    stopwords.add('一部')
    stopwords.add('一个')
    stopwords.add('没有')
    stopwords.add('什么')
    stopwords.add('有点')
    stopwords.add('感觉')
    stopwords.add('毒液')
    stopwords.add('就是')
    stopwords.add('觉得')
    bg_image = plt.imread('bg.jpg')
    wc = WordCloud(background_color='lightblue',
                   mask=bg_image,
                   font_path='STKAITI.TTF',
                   stopwords=stopwords,
                   max_font_size=400,
                   random_state=50)
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
Example #10
def hope():
    d = path.dirname(__file__)

    # read the mask image
    # taken from
    # http://www.stencilry.org/stencils/movies/star%20wars/storm-trooper.gif
    mask = imread(path.join(d, "stormtrooper_mask.png"))
    text = open(path.join(d, 'input.txt')).read()

    # preprocessing the text a little bit
    #text = text.replace("HAN", "Han")
    #text = text.replace("LUKE'S", "Luke")

    # adding movie script specific stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add("int")
    stopwords.add("ext")

    wc = WordCloud(max_words=1000,
                   mask=mask,
                   stopwords=stopwords,
                   margin=10,
                   random_state=1).generate(text)
    # store default colored image
    default_colors = wc.to_array()
    plt.title("Custom colors")
    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3))
    wc.to_file("a_new_hope.png")
    plt.axis("off")
    plt.figure()
    plt.title("Default colors")
    plt.imshow(default_colors)
    plt.axis("off")
    plt.show()
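grey_color_func is not defined in this snippet; a minimal sketch consistent with the official wordcloud "a_new_hope" example this code resembles:

import random

def grey_color_func(word, font_size, position, orientation,
                    random_state=None, **kwargs):
    # zero saturation with random lightness gives shades of grey
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)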
Example #11
def WorldCloud_pic(text_path, pic_path, font_path):
    text = open(text_path, 'r', encoding='UTF-8').read()
    word_list = jieba.cut(text, cut_all=False)
    wl_space_split = " ".join(word_list)
    print(wl_space_split)
    backgroud_Image = plt.imread(pic_path)
    print('Image loaded!')
    stopwords = STOPWORDS.copy()  # start from the stopword list bundled with wordcloud
    stopwords.add("哈哈")  # more blocked words can be added
    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',  # background color
        mask=backgroud_Image,  # background image (mask)
        font_path=font_path,  # Chinese font; required for Chinese text, otherwise boxes appear instead of characters
        max_words=600,  # maximum number of words displayed
        stopwords=stopwords,  # stopwords
        max_font_size=400,  # maximum font size
        random_state=50,  # number of random states, i.e. color schemes
    )
    wc.generate_from_text(wl_space_split)  # load the text
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # word colors are taken from the background image
    plt.imshow(wc)  # display the word cloud
    plt.axis('off')  # hide the x and y axes
    plt.show()
    d = path.dirname(__file__)  # directory this module lives in
    wc.to_file(path.join(d, "词云.jpg"))
    print('Word cloud generated!')
Example #12
 def SetStopWords(self):
     """
     Stopword settings
     """
     stopwords = STOPWORDS.copy()
     stopwords.add("电影")
     return stopwords
Example #13
def get_word_cloud(comments):
    comments_after_split = jieba.cut(str(comments), cut_all=False)
    words = ' '.join(comments_after_split)
    # print(words)

    stopwords = STOPWORDS.copy()
    stopwords.add('哪吒')
    stopwords.add('电影')
    stopwords.add('我命')
    stopwords.add('不由')

    bg_img = plt.imread('circle.png')
    wc = WordCloud(width=1024,
                   height=768,
                   background_color='white',
                   mask=bg_img,
                   stopwords=stopwords,
                   max_font_size=200,
                   random_state=50,
                   font_path='STKAITI.TTF')
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file('词云图.jpg')
Example #14
 def wcplot(text, n):
     data_dir = 'data2'
     pwd = os.getcwd() 
     #file_names = os.listdir(os.path.join(pwd, data_dir))
     file_ = img_dict[n]
     file_dir = os.path.join(pwd, data_dir, file_)
     mask = imread(file_dir, mode= 'L')
 
     stopwords = STOPWORDS.copy()
     stopwords.add("int")
     stopwords.add("ext")
     
     wc = WordCloud(background_color="white", max_words=200,mask=mask,stopwords=stopwords,
                    random_state=3,font_path=font_path).generate(text)
     
     wc.recolor(color_func=color_func, random_state=3)
     plt.figure(figsize=(10,10),facecolor='k')
     plt.imshow(wc)
     #title = ("top words for topic %d") % n
     #plt.title(title)
     pic_name = ('{}.jpg') .format(input_movie.replace('/','_'))
 
     wc.to_file(pic_name)
     plt.axis("off")
     
     plt.show()
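This snippet leans on module-level names (img_dict, font_path, color_func, input_movie) that are not shown; a hypothetical context, for illustration only:

import random

img_dict = {0: 'topic0.png', 1: 'topic1.png'}  # hypothetical: topic number -> mask file in data2/
font_path = 'msyh.ttf'                         # hypothetical: any installed TrueType font
input_movie = 'movies/example'                 # hypothetical: used for the output file name

def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # hypothetical grey palette
    return 'hsl(0, 0%%, %d%%)' % random.randint(40, 100)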
Example #15
def word_cloud(csv_file, stopwords_path, pic_path):
    pic_name = csv_file[:-4] + "_词云图.png"
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    content = d['job_desc'].values
    comment_after_split = jieba.cut(str(content), cut_all=False)
    wl_space_split = " ".join(comment_after_split)
    background_image = plt.imread(pic_path)
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for i in f.readlines():
            stopwords.add(i.strip('\n'))

    wc = WordCloud(width=1024,
                   height=768,
                   background_color='white',
                   mask=background_image,
                   font_path="simhei.ttf",
                   stopwords=stopwords,
                   max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(background_image)
    wc.recolor(color_func=img_colors)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)
Example #16
def make_cloud(words, image, size=10, filename='figures/cloud.png', max_words=200, horizontal=0.8):

    # Remove URLs, 'RT' text, screen names, etc
    my_stopwords = ['RT', 'amp', 'lt']
    words_no_urls = ' '.join([word for word in words.split()
                              if word not in my_stopwords])

    # Add stopwords, if needed
    stopwords = STOPWORDS.copy()
    stopwords.add("RT")
    stopwords.add('amp')
    stopwords.add('lt')

    # Load up a logo as a mask & color image
    logo = imread(image)

    # Generate colors
    image_colors = ImageColorGenerator(logo)

    # Generate plot
    wc = WordCloud(stopwords=stopwords, mask=logo, color_func=image_colors, scale=0.8,
                   max_words=max_words, background_color='white', random_state=42, prefer_horizontal=horizontal)

    wc.generate(words_no_urls)

    plt.figure(figsize=(size, size))
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(filename)
Example #17
def word_cloud(csv_file, stopwords_path, pic_path):
    pic_name = csv_file+"_词云图.png"
    csv_file = os.path.join(os.path.abspath(os.curdir), csv_file + ".csv")
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    content = []
    for i in d['content']:
        try:
            i = translate(i)
        except AttributeError as e:
            continue
        else:
            content.append(i)
    comment_after_split = jieba.cut(str(content), cut_all=False)
    wl_space_split = " ".join(comment_after_split)
    backgroud_Image = plt.imread(pic_path)
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for i in f.readlines():
            stopwords.add(i.strip('\n'))

    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image, font_path=r"C:\simhei.ttf",
                   stopwords=stopwords, max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)
    plt.imshow(wc)
    plt.axis('off')  
    plt.show() 
    wc.to_file(pic_name)
Example #18
def signatures_cloud():
    signatures = []
    with open('wechatfriends.txt', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
        for i in rows:
            signature = i.split(',')[5]
            if signature != '':
                signatures.append(signature)
    split = jieba.cut(str(signatures), cut_all=False)
    words = ' '.join(split)
    stopwords = STOPWORDS.copy()
    stopwords.add('span')
    stopwords.add('class')
    stopwords.add('emoji')
    stopwords.add('emoji1f334')
    stopwords.add('emoji1f388')
    stopwords.add('emoji1f33a')
    stopwords.add('emoji1f33c')
    stopwords.add('emoji1f633')
    bg_image = plt.imread('moon.jpeg')
    wc = WordCloud(width=1000,
                   height=1000,
                   background_color='white',
                   mask=bg_image,
                   font_path='simhei.ttf',
                   stopwords=stopwords,
                   max_font_size=400,
                   random_state=50)
    wc.generate_from_text(words)
    # plt.imshow(wc)
    plt.axis('off')
    wc.to_file('个性签名云图.jpg')
Example #19
def gen_tag_cloud(frequencies, filename):
    """生成标签云"""
    d = path.dirname(__file__)
    mask = np.array(Image.open(path.join(d, "view.jpg")))
    stopwords = STOPWORDS.copy()

    wc = WordCloud(background_color="white",
                   max_words=2000,
                   mask=mask,
                   stopwords=stopwords,
                   margin=10,
                   random_state=42,
                   font_path="msyh.ttf",
                   width=1280,
                   height=1024).fit_words(frequencies)
    image_colors = ImageColorGenerator(mask)

    plt.imshow(wc)
    plt.axis("off")
    plt.figure()
    wc.to_file(filename + "_tag_cloud_default.png")

    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    plt.figure()
    wc.to_file(filename + "_tag_cloud_colored.png")
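fit_words expects a mapping of word to frequency; a collections.Counter built from pre-segmented text works directly. A small usage sketch (file names are placeholders):

from collections import Counter

tokens = open('segmented.txt', encoding='utf-8').read().split()
frequencies = Counter(tokens)  # word -> count mapping, accepted by fit_words
gen_tag_cloud(frequencies, 'demo')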
Example #20
def generate_wordcloud():
    comments = []
    with open('maoyan.csv', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
        for row in rows:
            comment = row.split(':')[0]
            if comment != '':
                comments.append(comment)

    comment_after_split = jieba.cut(str(comments), cut_all=False)
    words = ' '.join(comment_after_split)  # join with spaces so WordCloud sees word boundaries
    print(words)

    stopwords = STOPWORDS.copy()
    stopwords.add('电影')
    stopwords.add('一出')
    stopwords.add('好戏')
    stopwords.add('有点')

    bg_image = plt.imread('123.jpg')
    wc = WordCloud(width=1024,
                   height=768,
                   background_color='white',
                   mask=bg_image,
                   stopwords=stopwords,
                   max_font_size=400,
                   random_state=50,
                   font_path='STKAITI.TTF')

    wc.generate_from_text(words)
    wc.to_file('output/词云图.jpg')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
Example #21
def make_worldcloud(file_path):
    text_from_file_with_apath = open(file_path, 'r', encoding='UTF-8').read()
    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
    wl_space_split = " ".join(wordlist_after_jieba)
    print(wl_space_split)
    backgroud_Image = plt.imread('./douban.jpg')
    print('Image loaded!')
    '''Configure the word-cloud style'''
    stopwords = STOPWORDS.copy()
    stopwords.add("哈哈")
    stopwords.add("还是")  # more blocked words can be added
    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',  # background color
        mask=backgroud_Image,  # background image (mask)
        font_path=r'E:\simsun.ttf',  # Chinese font; required for Chinese text, otherwise boxes appear instead of characters
        max_words=300,  # maximum number of words displayed
        stopwords=stopwords,  # stopwords
        max_font_size=400,  # maximum font size
        random_state=50,  # number of random states, i.e. color schemes
    )
    wc.generate_from_text(wl_space_split)  # load the text
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # word colors are taken from the background image
    plt.imshow(wc)  # display the word cloud
    plt.axis('off')  # hide the x and y axes
    plt.show()
    # directory this module lives in
    d = path.dirname(__file__)
    # os.path.join(): joins multiple path components
    wc.to_file(path.join(d, "h11.jpg"))
    print('Word cloud generated!')
Example #22
def ciyun(filepath):
    comment = []
    with open(filepath, 'r') as f:
        rows = f.readlines()
        for row in rows:
            if len(row.split(';')) == 5:
                comment.append(row.split(';')[4].replace('\n', ''))

    comment2 = json.dumps(comment, ensure_ascii=False)  # re-encode so Chinese displays correctly
    print("comment2", comment2)
    comment_after_split = jieba.cut(str(comment2), cut_all=False)

    # inspect the segmentation result
    wl_space_split = " ".join(comment_after_split)
    print("wl_space_split", wl_space_split)
    # everything up to here runs without errors

    # load the background image
    backgroud_Image = plt.imread('1.jpg')  # read the image data
    stopwords = STOPWORDS.copy()
    # more blocked words can be added
    stopwords.add("剧情")
    stopwords.add("一部")
    stopwords.add("一个")
    stopwords.add("没有")
    stopwords.add("什么")
    stopwords.add("有点")
    stopwords.add("这部")
    stopwords.add("这个")
    stopwords.add("不是")
    stopwords.add("真的")
    stopwords.add("感觉")
    stopwords.add("觉得")
    stopwords.add("还是")
    stopwords.add("女主")
    stopwords.add("皇后")
    stopwords.add("贵妃")
    stopwords.add("于妈")
    stopwords.add("就是")
    stopwords.add("可以")

    # configure the word cloud
    # parameters: font, background color, maximum word size, mask image for the shape

    wc = WordCloud(background_color='white',
                   mask=backgroud_Image,
                   font_path='DroidSansFallback.ttf',
                   stopwords=stopwords,
                   max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)

    # save the result locally
    if filepath == 'yanxi.txt':
        wc.to_file('yanxi_wordcloud.jpg')
Example #23
def ciyun(data):
    comment = jieba.cut(str(data['comments']), cut_all=False)  # segment the text
    wl_space_split = " ".join(comment)
    backgroud_Image = plt.imread('shenteng.jpg')  # background image
    stopwords = STOPWORDS.copy()
    # more blocked words can be added by hand, or a full stopword list can be downloaded
    stopwords.add("电影")
    stopwords.add("一部")
    stopwords.add("一个")
    stopwords.add("没有")
    stopwords.add("什么")
    stopwords.add("有点")
    stopwords.add("这部")
    stopwords.add("这个")
    stopwords.add("不是")
    stopwords.add("真的")
    stopwords.add("感觉")
    stopwords.add("觉得")
    stopwords.add("还是")
    stopwords.add("特别")
    stopwords.add("非常")
    stopwords.add("可以")
    stopwords.add("因为")
    stopwords.add("为了")
    stopwords.add("比较")
    #print(stopwords)
    # configure the word cloud
    # parameters: font / background color / maximum word size, mask image for the shape
    wc = WordCloud(width=1024,
                   height=768,
                   background_color='white',
                   mask=backgroud_Image,
                   font_path=r'C:\Windows\Fonts\simhei.ttf',
                   stopwords=stopwords,
                   max_font_size=400,
                   random_state=50)
    print(wl_space_split)
    wc.generate_from_text(str(wl_space_split))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(r'shenteng_wordcloud.jpg')
Example #24
File: mine.py  Project: kshabahang/fbminer
def makeWordCloud(text):
	#preprocess
	stopwords = STOPWORDS.copy()
#        text.replace("State","")
#        text.replace("year","")
#        text.replace("Congress","")
#        text.replace("will","")
	wC = WordCloud(max_words=2000, stopwords=stopwords, margin=5, random_state=1, width = 1600, height = 800).generate(text)
	plt.imshow(wC)
	plt.show()
Example #25
    def cloud(self):
        # segment the text
        comments = self.getComments()
        print(str(comments))
        comment_after_split = jieba.cut(str(comments),
                                        cut_all=False)  # precise (non-full) mode, cut_all=False
        words = " ".join(comment_after_split)  # join with spaces
        print(type(words))
        print(''.join(words))
        # print(words)

        # set blocked words
        stopwords = STOPWORDS.copy()
        stopwords.add("电影")
        stopwords.add("一部")
        stopwords.add("一个")
        stopwords.add("没有")
        stopwords.add("什么")
        stopwords.add("有点")
        stopwords.add("这部")
        stopwords.add("这个")
        stopwords.add("不是")
        stopwords.add("真的")
        stopwords.add("感觉")
        stopwords.add("觉得")
        stopwords.add("还是")
        stopwords.add("但是")
        stopwords.add("就是")
        stopwords.add("他们")
        stopwords.add("可能")
        stopwords.add("应该")
        stopwords.add("怎么")
        stopwords.add("大家")

        # load the background image
        bg_image = plt.imread('g.jpg')
        # font = '/System/Library/Assets/com_apple_MobileAsset_Font5/6bb29eea6a5b99f3100a5e3f862e6457103557de.asset/AssetData/Hannotate.ttc'
        font = '/System/Library/Assets/com_apple_MobileAsset_Font5/4cecce0dd640f147de4d0e4155a97d3cdf47971e.asset/AssetData/Xingkai.ttc'
        # word-cloud parameters: canvas width/height, background color, mask shape, font, stopwords, maximum font size
        wc = WordCloud(width=1024,
                       height=768,
                       background_color='white',
                       mask=bg_image,
                       font_path=font,
                       stopwords=stopwords,
                       max_font_size=400,
                       random_state=50)
        # feed the segmented text into the cloud
        wc.generate_from_text(words)
        plt.imshow(wc)
        plt.axis('off')  # hide the axes
        plt.show()
        # save the result locally
        wc.to_file('myecharts/词云图.jpg')
Example #26
def draw_cloud(comments):
    data = comments['content']

    comment_data = []
    print("由于数据量比较大,分词这里稍微有点慢,请耐心等候")
    for item in data:
        if pd.isnull(item) == False:
            comment_data.append(item)

    comment_after_split = jieba.cut(str(comment_data), cut_all=False)
    words = ' '.join(comment_after_split)

    # custom stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add('复仇者联盟')
    stopwords.add('复联')
    stopwords.add('联盟')
    stopwords.add('复仇')
    stopwords.add('电影')
    stopwords.add('一部')
    stopwords.add('一个')
    stopwords.add('没有')
    stopwords.add('什么')
    stopwords.add('有点')
    stopwords.add('感觉')
    stopwords.add('就是')
    stopwords.add('觉得')
    stopwords.add('但是')
    stopwords.add('自己')
    stopwords.add('我们')
    stopwords.add('真的')
    stopwords.add('可以')
    stopwords.add('非常')
    stopwords.add('还是')
    stopwords.add('还有')
    stopwords.add('这部')

    # set the font path according to your own machine, e.g. 'simfang.ttf'
    fortpath = './SourceHanSansCN-Normal-2.otf'
    wc = WordCloud(width=1000,
                   height=700,
                   background_color='#000000',
                   font_path=fortpath,
                   scale=5,
                   stopwords=stopwords,
                   max_font_size=200)
    wc.generate_from_text(words)

    plt.figure(figsize=(10, 8))
    plt.imshow(wc)
    plt.axis('off')
    plt.savefig('./WordCloud.png')
    plt.show()
Example #27
def ciyun(filepath):
    comment = []
    with open(filepath, 'r') as f:
        rows = f.readlines()
        for row in rows:
            if len(row.split(',')) == 5:
                comment.append(row.split(',')[4].replace('\n', ''))

    comment2 = json.dumps(comment, ensure_ascii=False)  # re-encode so Chinese displays correctly
    print("comment2", comment2)
    comment_after_split = jieba.cut(str(comment2), cut_all=False)

    wl_space_split = " ".join(comment_after_split)
    print("wl_space_split", wl_space_split)
    # load the background image
    backgroud_Image = plt.imread('1.jpg')
    stopwords = STOPWORDS.copy()
    # more blocked words can be added
    stopwords.add("电影")
    stopwords.add("一部")
    stopwords.add("一个")
    stopwords.add("没有")
    stopwords.add("什么")
    stopwords.add("有点")
    stopwords.add("这部")
    stopwords.add("这个")
    stopwords.add("不是")
    stopwords.add("真的")
    stopwords.add("感觉")
    stopwords.add("觉得")
    stopwords.add("还是")

    # configure the word cloud
    # parameters: font, background color, maximum word size, mask image for the shape
    wc = WordCloud(width=1024,
                   height=768,
                   background_color='white',
                   mask=backgroud_Image,
                   font_path='DroidSansFallbackFull.ttf',
                   stopwords=stopwords,
                   max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)
    #plt.imshow(wc)
    #plt.axis('off')  # hide the axes
    #plt.show()
    # save the result locally
    if filepath == 'xie_zheng.txt':
        wc.to_file('xie_zheng_ciyun.jpg')
    elif filepath == 'yaoshen.txt':
        wc.to_file('yaoshen_ciyun.jpg')
Example #28
def plotTwiiterWordCloud():
	args = sys.argv
	tracefile = open(args[2], 'r')
	nLines = sum(1 for line in tracefile)
	tracefile.seek(0)

	dictTerms = dict()
	blacklist = STOPWORDS.copy()
	blacklist.add('rt')
	punctuation = set(string.punctuation)
	punctuation.remove('@')
	punctuation.remove('&')
	# punctuation.remove('#')
	for line in tqdm(tracefile, total=nLines):
		try:
			linesplited = line.split(', ')
			tweet = linesplited[6].lower()
			for p in punctuation:
				tweet = tweet.replace(p, '')
			terms = tweet.split(' ')
			for t in terms:
				if (len(t) > 1) and 'http' not in t and (t not in blacklist):
					try:
						dictTerms[t] += 1
					except KeyError:
						dictTerms[t] = 1
		except IndexError:
			print('IndexError')
	for t in blacklist:
		try:
			del dictTerms[t]
		except KeyError:
			continue
	popularTerms = sorted(dictTerms.keys(), key=lambda w:dictTerms[w], reverse=True)
	popularTerms = [p for p in popularTerms if (dictTerms[p]) > 1]
	print(len(popularTerms))
	text = list()
	terms = ''
	for p in popularTerms:
		text.append((p, dictTerms[p]))
		for i in range(dictTerms[p]):
			terms += ' ' + p
	# print terms
	maskfile = 'csgo-icon'
	mask = imread(maskfile + '.jpg') # mask=mask
	wc = WordCloud(mask=mask, background_color='white', width=1280, height=720).generate(terms) # max_words=10000
	default_colors = wc.to_array()
	plt.figure()
	plt.imshow(default_colors)
	plt.axis('off')
	plt.savefig(maskfile + '-wordcloud.png', dpi=500, bbox_inches='tight', pad_inches=0) # bbox_inches='tight'
	plt.show()
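The manual dictTerms try/except bookkeeping above is what collections.Counter provides out of the box; an equivalent sketch with toy data:

from collections import Counter

blacklist = {'rt', 'the'}
words = ['ace', 'rt', 'ace', 'clutch', 'ace', 'smoke', 'smoke']
dictTerms = Counter(w for w in words
                    if len(w) > 1 and 'http' not in w and w not in blacklist)
popularTerms = [w for w, n in dictTerms.most_common() if n > 1]  # already sorted by count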
Example #29
def makeCloud(text, imgFile, words):
    """
    Makes a word cloud and stores it in a jpeg file
    """
    excludewords = STOPWORDS.copy()
    
    for word in words:
        excludewords.add(word)
    
    wordcloud = WordCloud(max_words=NUM_OF_WORDS, width=WIDTH, height=HEIGHT, stopwords=excludewords).generate(text)
    image = wordcloud.to_image()
    image.show()
    image.save(imgFile + '.jpeg')      
Example #30
    def get_stop_words(self):
        stopwords = STOPWORDS.copy()
        # add blocked words
        stopwords.add(self.movie.movie_name)

        with open('stopword.txt', 'r', encoding='utf_8_sig', newline='') as f:
            for line in f.readlines():
                if '\r\n' in line:
                    stopwords.add(line[:len(line) - 2:])
                else:
                    stopwords.add(line)
        # print(stopwords)
        return stopwords
Example #31
def paint_word_cloud():
    comments = []
    with open('./dataSource.txt', 'r', encoding='utf-8') as f:
        rows = f.readlines()
        try:
            for row in rows:
                comment = row.split(':')[2]
                if comment != '':
                    comments.append(comment)
            print(comments)
        except Exception as e:
            print(e)
    comment_after_split = jieba.cut(str(comments), cut_all=False)
    words = ' '.join(comment_after_split)
    print(words)
    # filter out useless stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add('电影')
    stopwords.add('一部')
    stopwords.add('一个')
    stopwords.add('没有')
    stopwords.add('什么')
    stopwords.add('有点')
    stopwords.add('感觉')
    stopwords.add('海王')
    stopwords.add('就是')
    stopwords.add('觉得')
    stopwords.add('但是')
    bg_image = plt.imread('./cloudBack.jpg')
    print('load image success')
    print(words)
    wc = WordCloud(
        # canvas width and height; not effective when mask is set
        #width=1024, height=768,
        # background color
        background_color='white',
        # word-cloud shape
        mask=bg_image,
        # font path; required for Chinese text, otherwise Chinese becomes boxes
        font_path='/Users/lichuang.lc/Desktop/python/out/STZHONGS.TTF',
        # stopwords
        stopwords=stopwords,
        # maximum font size; defaults to the image height if unset
        max_font_size=400,
        # number of color schemes
        random_state=50)
    wc.generate_from_text(words)
    wc.to_file('./man.jpg')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
Example #32
def writeFreq(text, outFile, words):
    """
    Writes frequencies of words into the specified file
    """

    excludewords = STOPWORDS.copy()
    
    for word in words:
        excludewords.add(word)
    
    wordcloud = WordCloud(max_words=NUM_OF_WORDS, stopwords=excludewords)
    freqs = wordcloud.process_text(text)  # dict mapping word -> frequency

    for word, count in freqs.items():
        outFile.write(word + ',' + str(count) + '\n')
Example #33
def main():
    d = os.path.dirname(__file__)
    DOC_NAME = "cvpr2015papers.txt"
    text = open(os.path.join(d, DOC_NAME)).read()

    # adding computer vision specific stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add("image")

    wc = WordCloud(max_words=300, stopwords=stopwords, width=800, height=400)
    wc.generate(text)
    wc.to_file(os.path.join(d, "cvpr2015wordcloud.png"))

    plt.imshow(wc)
    plt.axis("off")
    plt.show()
Example #35
def generate_word_cloud(text, mask_filename):
    d = path.dirname(__file__)  # directory containing this script
    mask = imread(path.join(d, mask_filename))

    # adding movie script specific stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add("info")
    stopwords.add("meetbot")
    stopwords.add("supybot")

    wc = WordCloud(max_words=1000, mask=mask, stopwords=stopwords, margin=10,
                random_state=1).generate(text)

    fd, tmpfilename = tempfile.mkstemp('-wordcloud.png')
    os.close(fd)  # release the descriptor returned by mkstemp; wc.to_file reopens the path
    wc.to_file(tmpfilename)
    return tmpfilename
Example #36
    def get_wordcloud(self, filmname):
        movieid = MovieHelper().select(1, {"name": filmname})
        if movieid is None:
            return False
        datalist = CommentHelper().select(conditions={"movieid": movieid})
        commentlist = []
        for data in datalist:
            if data[1] != "" or data[1] != ",":
                commentlist.append(data[1])

        comments_after_split = jieba.cut(str(commentlist), cut_all=False)
        wordlist = " ".join(comments_after_split)  # join with spaces so WordCloud sees word boundaries

        stopwords = STOPWORDS.copy()
        stopwords.add("电影")
        stopwords.add("一部")
        stopwords.add("一个")
        stopwords.add("没有")
        stopwords.add("什么")
        stopwords.add("有点")
        stopwords.add("这部")
        stopwords.add("这个")
        stopwords.add("不是")
        stopwords.add("真的")
        stopwords.add("感觉")
        stopwords.add("觉得")
        stopwords.add("还是")
        stopwords.add("但是")
        stopwords.add("就是")

        bg_image = np.array(Image.open('bg.jpg'))
        # word-cloud parameters: canvas width/height, background color, mask shape, font, stopwords, maximum font size
        wc = WordCloud(width=2048,
                       height=768,
                       background_color='white',
                       mask=bg_image,
                       font_path='STKAITI.TTF',
                       stopwords=stopwords,
                       max_font_size=800,
                       random_state=50)

        wc.generate_from_text(wordlist)
        plt.imshow(wc)
        plt.axis("off")
        plt.show()

        wc.to_file("词云.jpg")
Example #37
def show_fen_ci_qianmin():
    # jieba: a Python word-segmentation library with solid Chinese support
    import jieba
    # Matplotlib: a Python 2D plotting library for charts, histograms, scatter plots, etc.
    import matplotlib.pyplot as plt
    # wordcloud: a Python library for generating word-cloud images
    # if installation fails, see https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
    # python -m pip install wordcloud-1.5.0-cp37-cp37m-win32.whl
    from wordcloud import WordCloud, STOPWORDS

    # collect all personal signatures
    signatures = []
    with open(friends_data, mode='r', encoding='utf-8') as f:
        rows = f.readlines()
        for row in rows:
            signature = row.split(',')[5]
            if signature != '':
                signatures.append(signature)

    # segment the text
    split = jieba.cut(str(signatures), cut_all=False)  # False = precise mode, True = full mode
    words = ' '.join(split)  # join with spaces
    # print(words)

    # blocked words: strip emoji markup, special symbols, etc. from the signatures
    stopwords = STOPWORDS.copy()
    stopwords.add('span')
    stopwords.add('class')
    stopwords.add('emoji')
    stopwords.add('emoji1f334')
    stopwords.add('emoji1f388')
    stopwords.add('emoji1f33a')
    stopwords.add('emoji1f33c')
    stopwords.add('emoji1f633')

    # load the background image
    bg_image = plt.imread(current_dir+'/010-wechat-bg.jpg')

    # word-cloud parameters: canvas width/height, background color, mask shape, font, stopwords, maximum font size
    wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, font_path='STKAITI.TTF',
                stopwords=stopwords, max_font_size=400, random_state=50)
    # feed the segmented text into the cloud
    wc.generate_from_text(words)
    plt.imshow(wc)  # draw the image
    plt.axis('off')  # hide the axes
    # save the result locally
    wc.to_file(current_dir+'/个性签名词云图.jpg')
Example #38
def to_wordcloud(text):
	# background_image=plt.imread('./xxx.jpg')
	stopwords=STOPWORDS.copy()
	for i in ['电影','电影院','影片','IMAX']:
	    stopwords.add(i)
	# mask=background_image,
	wc = WordCloud(width=1024,height=768,background_color='white',font_path = 'simhei.ttf',stopwords=stopwords,max_font_size=400,random_state=50)
	wc.generate_from_text(text)

	# img_colors= ImageColorGenerator(background_image)
	# wc.recolor(color_func=img_colors)

	wc.to_file('maoyan_'+code+'.jpg')

	plt.imshow(wc)
	plt.axis('off')
	plt.show()
Example #39
def mk_wordcloud():
    text_content = open('test.txt', 'r', encoding = 'utf-8').read()
    wordlist_cut_by_jieba = jieba.cut(text_content, cut_all=False)
    wordlist_space = ' '.join(wordlist_cut_by_jieba)
    #print(wordlist_space)
    background_image = plt.imread('xpj.jpg')
    print('Image loaded')
    # blocked words
    stopwords = STOPWORDS.copy()
    stopwords.add("还是")
    stopwords.add("但是")
    stopwords.add("不是")
    stopwords.add("就是")
    stopwords.add("没有")
    stopwords.add("知道")
    stopwords.add("因为")
    stopwords.add("看到")
    stopwords.add("还有")
    stopwords.add("觉得")
    stopwords.add("有点")
    stopwords.add("这么")
    stopwords.add("其实")
    stopwords.add("一个")
    stopwords.add("为什么")
    stopwords.add("开始")
    stopwords.add("不要")
    stopwords.add("本来")
    stopwords.add("虽然")
    stopwords.add("出来")
    wc = WordCloud(
        width = 750,
        height = 1335,
        background_color = 'white',
        mask = background_image,
        font_path = r'C:\Windows\Fonts\simsun.ttc',
        max_words = 400,  # maximum number of words
        stopwords = stopwords,
        max_font_size=400,  # maximum font size
        random_state = 50,  # number of random states, i.e. color schemes
    )
    wc.generate_from_text(wordlist_space)
    img_color = ImageColorGenerator(background_image)  # background colors
    wc.recolor(color_func=img_color)  # word colors are taken from the background image
    plt.imshow(wc)  # display the word cloud
    plt.axis('off')  # hide the axis labels
    plt.show()
Example #40
def generate_wordcloud(text):
    def my_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        """
        To change colors change the range for random ints below:
        Hue values are between 0 and 360
        Follows rainbow: 
        Red Orange Yellow Green Blue Indigo Violet
         0   50  100  150  200  250  300   360
        """
        hue_lower = 0
        hue_upper = 150

        saturation = 500

        light_lower = 80
        light_upper = 120

        return "hsl(%d, %d%%, %d%%)" % (
            random.randint(hue_lower, hue_upper),
            saturation,
            random.randint(light_lower, light_upper),
        )

    stopwords = STOPWORDS.copy()
    stopwords.add("us")
    stopwords.add("one")
    stopwords.add("will")
    stopwords.add("u")

    rand_num = random.randint(1, 100)

    wc = WordCloud(
        max_words=100, stopwords=stopwords, margin=10, random_state=rand_num, width=2000, height=1200
    ).generate(text)

    fig = plt.figure(figsize=(32, 20), dpi=100)
    plt.imshow(wc.recolor(color_func=my_color_func, random_state=1))

    # Save image
    outfilename = "tmp.png"
    wc.to_file(outfilename)
    plt.axis("off")

    plt.show()
Example #41
def makeWC(theText, mask_image, mw):
    SW = STOPWORDS.copy()
    mywords = ['and', 'the', 'to', 'by', 'in', 'of', 'up',
           'Facebook', 'Twitter', 'Pinterest', 'Flickr',
           'Google', 'Instagram', 'login', 'Login', 'Log',
           'website', 'Website', 'Contact', 'contact',
           'twitter', 'Branding', 'Tweet', 'pic', 'location',
           'Details'
           ] + list(bad_words())
    SW.update(mywords)  # add all custom words at once
    wordcloud = WordCloud(
                relative_scaling=0, 
                prefer_horizontal=random.uniform(0.5, 1), 
                stopwords=SW,
                background_color='black',
                max_words=mw, 
                mask = mask_image
                ).generate(theText)
    return wordcloud
Example #42
def main():

    data = {}
    data['questions'] = []
    data['stakeholders'] = []
    data['wordclouds'] = {}


    stopwords = STOPWORDS.copy()
    sphere_stopwords = {'1a':['live', 'born', 'year', 'yrs', 'since'], \
    '1b':['live', 'born', 'year', 'yrs', 'moved', 'since'], \
    '2a':['time'], \
    '2b':[], \
    '3':[], \
    '4a':['bangalore', 'advantage', 'advantages'], \
    '4b':['bangalore', 'challenge', 'challenges'], \
    '4c':['work'], \
    '4d':['dependency', 'dependencies', 'external'], \
    '4e':['area', 'bangalore'], \
    '4f':[], \
    '5b':['food', 'air'], \
    '6a':['end', 'user', 'enduser'], \
    '6b':['month', 'income', 'per', 'household'], \
    '6c':['interact', 'interaction','end', 'user', 'enduser'], \
    '6d':['design', 'end', 'user', 'enduser'], \
    '7a':['quality', 'control', 'challenge'], \
    '7b':['end', 'user', 'access', 'challenge'], \
    '7c':[], \
    '7d':[], \
    '8a':['tool', 'resource', 'resource'], \
    '8b':['average', 'age', 'team'], \
    '8c':[], \
    '8d':[], \
    '8e':[], \
    '9a':[], \
    '9b':[], \
    '10a':[], \
    '10b':[], \
    '10c':[], \
    '10d':[], \
    '10e':[], \
    '10f':[], \
    '11a':[], \
    '11b':[], \
    '11c':[], \
    '12a':[], \
    '12b':[], \
    '12c':[], \
    '13a':[], \
    '13b':[], \
    '13c':[], \
    '13d':[], \
    '13e':[], \
    '13f':[], \
    '13g':[], \
    '13h':[], \
    '13i':[], \
    '14a':[], \
    '14b':[], \
    }

    #Filter out standalone words 2 letters or shorter
    shortword = re.compile(r'\W*\b\w{1,2}\b')

    with open('alldata.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            stakeholder = row['Code']
            data['stakeholders'].append(stakeholder)
            data[stakeholder] = {}
            data[stakeholder]['alltext'] = ''
            for key in row:
                if key != 'Code':
                    question = key
                    if question not in data['questions']:
                        data['questions'].append(question)
                        data[question] = ''
                    raw_response = shortword.sub('', row[key].lower().translate(str.maketrans('', '', string.punctuation)))
                    stemmed_response = ' '.join([stem(word) for word in raw_response.split()])
                    data[stakeholder][question] = row[key]
                    data[stakeholder]['alltext'] += stemmed_response
                    data[stakeholder]['alltext'] += ' '
                    data[question] += stemmed_response
                    data[question] += ' '

    #Generate word clouds:
    for question in sorted(data['questions']):
        if question != '5a':
            #Number of words per question
            # print(question, ':', len(data[question].split()))
            try:
                s = stopwords.union(set(sphere_stopwords[question]))
                data['wordclouds'][question] = WordCloud(stopwords=s).generate(data[question])
            except Exception:
                print(question)

    for stakeholder in data['stakeholders']:
        try:
            data['wordclouds'][stakeholder] = WordCloud(stopwords=stopwords).generate(data[stakeholder]['alltext'])
        except Exception:
            print(stakeholder)

    pickle.dump(data, open('alldata_v2.pickle', 'wb'))
Example #43
cursor = links.find({},{"body":1})


test=""

for document in cursor :
    test=test+document['body']

with codecs.open("text_mining/my_stopwords.txt", "r", encoding="utf-8") as f:
    read_data = f.readlines()

stopwords = STOPWORDS.copy()

for data in read_data:
    stopwords.add(data.strip())  # strip the trailing newline before adding


mask_choko = np.array(Image.open("text_mining/chokomag.png"))


wordcloud = WordCloud( stopwords=stopwords,background_color="black", max_words=10000,mask=mask_choko).generate(test)
Example #44
def main():

    data = pickle.load( open('alldata.pickle', 'rb') )
    data['wordclouds'] = {}

    stopwords = STOPWORDS.copy()
    sphere_stopwords = { 'common': ['sam', 'mayu', 'mani'],
    '1a':['live', 'born', 'year', 'years', 'yrs', 'since', 'bangalore'], \
    '1b':['live', 'born', 'year', 'yrs', 'moved', 'since', 'bangalore'], \
    '2a':['balance', 'time'], \
    '2b':['inspiration'], \
    '3':['lifestyle', ], \
    '4a':['bangalore', 'advantage', 'advantages'], \
    '4b':['bangalore', 'challenge', 'challenges'], \
    '4c':['work'], \
    '4d':['dependency', 'dependencies', 'external'], \
    '4e':['area', 'bangalore'], \
    '4f':['measures'], \
    '5b':['food', 'air'], \
    '6a':['end', 'user', 'enduser'], \
    '6b':['month', 'income', 'per', 'household'], \
    '6c':['interact', 'interaction','end', 'user', 'enduser'], \
    '6d':['design', 'end', 'user', 'enduser'], \
    '7a':['quality', 'control', 'challenge'], \
    '7b':['end', 'user', 'access', 'challenge'], \
    '7c':[], \
    '7d':[], \
    '8a':['tool', 'tools', 'resource', 'resources'], \
    '8b':['average', 'age', 'years', 'team', 'people'], \
    '8c':['fund', 'funding', 'funded', 'money'], \
    '8d':['tech', 'technology'], \
    '8e':['office', 'location', 'work', 'space'], \
    '9a':['skill', 'skills'], \
    '9b':['training'], \
    '10a':['active', 'internal', 'collaboration', 'collaborate'], \
    '10b':['active', 'external', 'collaboration', 'collaborate'], \
    '10c':['lead', 'leads', 'learning', 'collaboration', 'collaborate'], \
    '10d':['part', 'formal', 'collaboration', 'collaborate', 'platform'], \
    '10e':['culture', 'open', 'share', 'sharing', 'sector'], \
    '10f':['share', 'shares', 'shared'], \
    '11a':['partner', 'partners', 'partnership', 'partnerships'], \
    '11b':['criteria', 'partner', 'partners', 'partnership', 'partnerships'], \
    '11c':['partner', 'partners', 'partnership', 'partnerships', 'sector'], \
    '12a':['monitoring', 'evaluation', 'method', 'methods', 'impact'], \
    '12b':['goal', 'next', 'year', 'years'], \
    '12c':['impact', 'studies', 'data', 'shared'], \
    '13a':['entrepreneur'], \
    '13b':['start', 'starting', 'startup'], \
    '13c':['entrepreneur', 'entrepreneurs', 'interact', 'interaction'], \
    '13d':['entrepreneur', 'entrepreneurs', 'role', 'local', 'needs'], \
    '13e':['advantage', 'advantages', 'local', 'entrepreneur', 'entrepreneurs'], \
    '13f':['barrier', 'barriers', 'entry', 'local', 'entrepreneur', 'entrepreneurs'], \
    '13g':['challenge', 'challenges', 'local', 'entrepreneur', 'entrepreneurs'], \
    '13h':['entrepreneur', 'entrepreneurs', 'fail'], \
    '13i':['resource', 'resources', 'need', 'needed', 'strengthen', 'local', 'entrepreneur', 'entrepreneurs'], \
    '14a':['recommend', 'stakeholder', 'stakeholders'], \
    '14b':['map', 'visual'], \
    }

    #Filter out standalone words 2 letters or shorter
    shortword = re.compile(r'\W*\b\w{1,2}\b')

    questions = []
    text = {}

    for stakeholder in data:
        text[stakeholder] = ''
        for question in data[stakeholder]:
            if question not in questions:
                questions.append(question)
                text[question] = ''
            response = shortword.sub('', ' '.join(data[stakeholder][question]).lower().translate(str.maketrans('', '', string.punctuation)))
            text[stakeholder] += response + ' '
            text[question] += response + ' '

    #Generate word clouds:
    for question in sorted(questions):
        if question != '5a':
            try:
                s = stopwords.union(set(sphere_stopwords[question]+sphere_stopwords['common']))
                wordcloud = WordCloud(stopwords=s, width=1600, height=800, background_color='white').generate(text[question])
                wordcloud.to_file('clouds/'+question+'-'+str(len(text[question].split()))+'words.png')
            except Exception:
                print(question)

    for stakeholder in data:
        try:
            s = stopwords.union(set(sphere_stopwords['common']))
            wordcloud = WordCloud(stopwords=s, width=1600, height=800, background_color='white').generate(text[stakeholder])
            wordcloud.to_file('clouds/'+stakeholder+'-'+str(len(text[stakeholder].split()))+'words.png')
        except Exception:
            print(stakeholder)