Exemplo n.º 1
0
def build_wordcloud(msg):
    """Render a word cloud from chat messages and return the image path.

    Each entry of ``msg`` is split on ``DELIMITER`` and only its last field
    is kept. The kept fields are comma-joined, segmented with ``jieba``, and
    every token found in ``stopwords`` is blanked out before rendering.

    Returns the path of the PNG that was written.
    """
    output_path = './output/chat/message_cloud.png'
    joined = ','.join(entry.split(DELIMITER)[-1] for entry in msg)
    # Stop words are replaced by empty strings (not dropped), so the text
    # handed to the renderer keeps one slot per token.
    tokens = ['' if token in stopwords else token for token in jieba.cut(joined)]
    cloud = WC(
        # A font with Chinese glyphs is required, otherwise the text is garbled.
        font_path='./resource/msyh.ttf',
        background_color='white',
        width=1000,
        height=800,
    )
    rendered = cloud.generate(' '.join(tokens))
    rendered.to_file(output_path)
    return output_path
Exemplo n.º 2
0
 def make_wc(self):
     """Build a word cloud from a fixed text file, display it and save it.

     The text is read from ``/home/suifeng/文档/text.txt``, segmented with
     ``jieba``, rendered, shown with matplotlib and finally written to
     ``/home/suifeng/图片/python词云图.png``.
     """
     # Explicit encoding so decoding does not depend on the process locale.
     with open('/home/suifeng/文档/text.txt', 'r', encoding='utf-8') as f:
         text = f.read()
     # Tokens must be separated by spaces: joining them with '' would glue
     # the segmented words back together and undo jieba's work.
     text_split = ' '.join(jieba.cut(text))
     wc = WC(
         background_color='black',  # background colour
         max_words=100,  # maximum number of words drawn
         max_font_size=80,  # largest font size used
         random_state=35,  # fixed seed so the layout is reproducible
         # A font file is mandatory; it can be copied over from a Windows
         # installation.
         font_path='MSYH.TTC'
     ).generate(text_split)
     plt.imshow(wc)
     plt.axis('off')
     plt.show()
     wc.to_file('/home/suifeng/图片/python词云图.png')
Exemplo n.º 3
0
def wc():
    """Render a word cloud from the novel's raw text and save it as a JPEG.

    The text is passed to the renderer as read, without word segmentation.
    """
    # Load the novel.
    source_path = 'C:/Users/26015/Desktop/金瓶梅/金瓶梅.txt'
    with open(source_path, 'r', encoding='utf-8') as handle:
        novel_text = handle.read()

    # Build the cloud.
    cloud = WC(font_path='C:/Windows/Fonts/simkai.ttf', width=1400, height=700)
    rendered = cloud.generate(novel_text)

    # Write the picture to disk.
    rendered.to_file("金瓶梅.jpg")
Exemplo n.º 4
0
    def __init__(self, movie, stop_words, srt_folder):
        """Generate the word cloud of a movie from its subtitle lines.

        :param movie: movie object; its ``original_title`` and
            ``release_date`` attributes are used to build the title.
        :param stop_words: iterable of words excluded from the cloud.
        :param srt_folder: forwarded to ``Subtitle.get_from_movie``
            (presumably the folder holding the subtitle files).
        """
        self.movie = movie
        self.stop_words = stop_words
        self.subtitle = Subtitle.get_from_movie(movie, srt_folder)
        lines = [line.content for line in self.subtitle.get_lines()]
        self.words = tokenize_text(' '.join(lines))
        self.wordcloud_title = f"{self.movie.original_title} ({self.movie.release_date})"
        self.filename = os.path.join(CONFIG['PNG_FOLDER'],
                                     f"{self.wordcloud_title}.png")
        self.cloud = WC(background_color="white",
                        max_words=200,
                        stopwords=set(self.stop_words),
                        width=1280,
                        height=720,
                        # Related to issue_5: duplicated words in word cloud.
                        # With collocations disabled we avoid repeated words.
                        collocations=False)

        self.cloud.generate(self.words)
        # makedirs(exist_ok=True) avoids the check-then-create race and also
        # creates missing parent folders.
        os.makedirs(CONFIG['PNG_FOLDER'], exist_ok=True)
Exemplo n.º 5
0
    def topic_wordcloud(self, figname: str = "wordcloud"):
        """
        Draw the smoothed vocabulary as a word cloud and save the figure.

        The previous day's vocabulary is merged into ``self._prev_voc.voc``
        (in place), ``self._voc`` is replaced by the result of
        ``self.laplace_smoothing`` against that reference, and the outcome
        is plotted and written to ``figname``.
        """
        from wordcloud import WordCloud as WC
        import matplotlib.pyplot as plt

        reference = self._prev_voc
        day_before = self._voc.previous_day()
        # Mutates the reference vocabulary held by the instance.
        reference.voc.update(day_before)
        self._voc = self.laplace_smoothing(self._voc, reference)

        # Render the frequencies and store the figure.
        cloud = WC().generate_from_frequencies(self._voc)
        plt.imshow(cloud)
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(figname)
Exemplo n.º 6
0
    def __init__(self):
        """Set up the processing stages and the masked word-cloud renderer."""
        self._min_word_len = 1
        # Stages applied to the whole text (remove_len_less_than(3) is
        # currently disabled).
        self._text_processors = [replace_abbr_not(), sentence_tokenize()]
        # Stages applied to each sentence.
        self._sentence_processors = [word_tokenize(), remove_stop_words()]
        # Stages applied to word lists.
        self._words_processors = [word_lemmatizing(), correct_spelling()]
        self._word_list = []

        # NOTE(review): cv2.imread presumably yields None when the file cannot
        # be read, in which case no mask is applied - confirm.
        mask_image = cv2.imread('word_cloud.mask0.png')
        self._wc = WC(
            max_words=100,
            font_path=None,
            background_color='white',
            mask=mask_image,
        )
Exemplo n.º 7
0
def word_cloud_1(image, df1, number_clusters, selected_adjectives):
    """Show one masked word cloud per cluster.

    For every cluster id in ``range(number_clusters)`` the ``adjectives``
    lists of the rows whose ``own_cluster`` equals that id are flattened,
    the words whose lower-cased form is in ``selected_adjectives`` are kept,
    and the result is rendered inside ``image`` and displayed.
    """
    for cluster_id in range(number_clusters):
        cluster_adjectives = df1[df1['own_cluster'] == cluster_id].adjectives
        kept = [
            adjective
            for row in cluster_adjectives
            for adjective in row
            if adjective.lower() in selected_adjectives
        ]
        # Each kept word is prefixed with a single space.
        text = "".join(" " + adjective for adjective in kept)

        cloud = WC(
            mask=image,
            background_color="white",
            contour_width=3,
            contour_color="black",
        ).generate(text)

        plt.imshow(cloud, interpolation='bilinear')
        plt.suptitle("Most numerous words in cluster".upper())
        plt.title(str(cluster_id))
        plt.axis("off")
        plt.show()
Exemplo n.º 8
0
def wc2():
    """Render a word cloud from the segmented, stop-word-filtered novel.

    The novel is segmented with ``jieba``; tokens listed in
    ``stopwords.txt`` and tab characters are discarded before rendering.
    """
    # Load the novel.
    novel_path = 'C:/Users/26015/Desktop/金瓶梅/金瓶梅.txt'
    with open(novel_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    segments = jieba.cut(raw_text.strip())

    # Load the stop words.
    stopwords = stopwordslist('stopwords.txt')

    # Every kept token is followed by one space.
    outstr = ''.join(
        segment + " "
        for segment in segments
        if not (segment in stopwords or segment == '\t')
    )

    # Build the cloud.
    cloud = WC(font_path='C:/Windows/Fonts/simkai.ttf', width=1400, height=700)
    rendered = cloud.generate(outstr)
    # Write the picture to disk.
    rendered.to_file("金瓶梅2.jpg")
Exemplo n.º 9
0
    def weibo_cloud(self):
        """
        Draw the Weibo word cloud on the embedded figure.

        Any exception raised along the way is caught and printed.
        :return: None
        """
        try:
            # Refresh the Weibo data (per the original note), then read the
            # data file it is expected to have produced.
            data_count.main()
            with open("./data/weibo/data-quting.txt", "r",
                      encoding="utf-8") as source:
                segmented_text = source.read()

            self.fig.clf()  # wipe the previous drawing
            axes = self.fig.add_subplot(111)
            # Font able to render Chinese glyphs.
            chinese_font = r'C:\Windows\Fonts\simfang.ttf'
            # Build the cloud from the already segmented text.
            cloud = WC(
                collocations=False,
                font_path=chinese_font,
                width=3000,
                height=2000,
                background_color="white",
            ).generate(segmented_text)

            # Show the cloud without axis ticks or labels.
            axes.imshow(cloud)
            axes.axis("off")
            self.canvas.draw()  # TODO: drawing starts here

        except Exception as e:
            print(e)
Exemplo n.º 10
0
# Notebook-style cell: the bare expression displays the token counter.
token.counter
## SHOW - number of times each bigram and word appear

# Keep only the entries whose key contains "~" (str.count is used as a
# truthiness test); per the sample output below these are the bigrams.
bigrams = {k: v for k, v in token.counter.items() if k.count("~")}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),
#  ('to~the', 7339),
#  ('on~the', 4883),
#  ('and~the', 4843)]
bigrams
## SHOW the selection

# Word-cloud of the bigram frequencies, saved as a 300 dpi PNG
wc = WC().generate_from_frequencies(bigrams)
plt.imshow(wc)
plt.axis("off")
plt.tight_layout()
## SHOW
plt.savefig("books_wordcloud.png", dpi=300)

## SKIP
# Histogram: hist maps a frequency to the list of bigrams seen that often
hist = defaultdict(list)
# The comprehension is used only for its side effect of filling hist.
_ = [hist[v].append(k) for k, v in bigrams.items()]
# Frequencies with no bigram fall back to a one-element list, so the plotted
# value is log(1) = 0 instead of log(0).
plt.plot(np.log([len(hist.get(i, [0])) for i in range(1, 2000)]))
plt.grid()
plt.xlabel("Frequency")
plt.ylabel("Number of bigrams (log-scale)")
plt.tight_layout()
Exemplo n.º 11
0
# Plot the measured points (n, v) against the power-law curve k * n**beta
# and save the figure; the output name suggests this is the Heaps' law plot.
plt.plot(n, v, '.')
plt.plot(n, k * n**beta)
plt.legend(['Measured', 'Predicted'])
plt.grid()
plt.xlabel('Number of tokens')
plt.ylabel('Vocabulary Size')
plt.tight_layout()
plt.savefig('heaps_law2.png', dpi=300)

# Activities
# Vocabulary for 2022-01-10, lang='Es', country='MX'
date = dict(year=2022, month=1, day=10)
voc = Vocabulary(date, lang='Es', country='MX')
# Keep only the entries whose key has no "~" (presumably the single words,
# as opposed to "~"-joined bigrams - confirm against Vocabulary).
words = {k: v for k, v in voc.voc.items() if not k.count('~')}

# Word cloud of the retained frequencies
wc = WC().generate_from_frequencies(words)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('wordcloud_mx.png', dpi=300)

## Zipf's Law - $$f=\frac{c}{r}$$

# Country codes for which a vocabulary of the same date is built
countries = [
    'MX', 'CO', 'ES', 'AR', 'PE', 'VE', 'CL', 'EC', 'GT', 'CU', 'BO', 'DO',
    'HN', 'PY', 'SV', 'NI', 'CR', 'PA', 'UY'
]
# One Vocabulary per country, built in parallel (Parallel/delayed are
# presumably joblib's, where n_jobs=-1 means "use every core" - confirm).
vocs = Parallel(n_jobs=-1)(
    delayed(Vocabulary)(date, lang='Es', country=country)
    for country in tqdm(countries))
words = [{k: v
Exemplo n.º 12
0
    words.insert(0, '<s>')
    words.append('</s>')
    _ = [(a, b) for a, b in zip(words, words[1:])]
    bigrams.update(_)

# First and second members of every observed bigram
words = {first for first, second in bigrams}
words2 = {second for first, second in bigrams}

# prev[a]: total count of the bigrams whose first word is a
prev = dict()
for (a, b), v in bigrams.items():
    prev[a] = prev.get(a, 0) + v

# Word cloud of how often each word opens a bigram
wc = WC().generate_from_frequencies(prev)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()

# P[a][b]: share of the bigrams starting with a that continue with b
P = defaultdict(Counter)
for (a, b), v in bigrams.items():
    # Index P directly instead of binding P[a] to a module-level name
    # `next`, which would shadow the builtin for the rest of the script.
    P[a][b] = v / prev[a]

# Word cloud of the words that follow the start-of-sentence marker
wc = WC().generate_from_frequencies(P['<s>'])
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('wordcloud_prob_start.png', dpi=300)
Exemplo n.º 13
0
#! -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import ImageColorGenerator, WordCloud as WC

# Read the source text; the context manager closes the file.
with open('files/python_en.txt') as f:
    text = f.read()

# scipy.misc.imread no longer exists in current SciPy releases, so the image
# is loaded with Pillow and converted to an array. It serves both as the
# mask and as the colour source.
back_coloring = np.array(Image.open('python_logo.png'))

wc = WC(background_color='white', mask=back_coloring, random_state=42, margin=2)
wc.generate(text)

# Recolour the words from the colours of the image.
image_colors = ImageColorGenerator(back_coloring)
wc.recolor(color_func=image_colors)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

wc.to_file('image_colored_wordcloud.png')

Exemplo n.º 14
0
#! -*- coding: utf-8 -*-

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud as WC

# Read the source text; the context manager closes the file instead of
# leaving the handle to the garbage collector.
with open('files/python_en.txt') as f:
    text = f.read()
# The logo image, as an array, is the mask the cloud is drawn into.
alice_mask = np.array(Image.open('logo.png'))

wc = WC(background_color='white',
        mask=alice_mask,
        contour_width=2,
        contour_color='steelblue')
wc.generate(text)

# Display the cloud, then store it as a PNG.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
wc.to_file('masked_wordcloud.png')
Exemplo n.º 15
0
    X = tm.transform(_)
    m = LogisticRegression(multi_class='multinomial').fit(X, y[tr])
    # m = LinearSVC().fit(X, y[tr])
    _ = [D[x][0] for x in val]
    hy[val] = m.predict(tm.transform(_))

# Confidence interval of the predictions hy against the labels y; the bare
# expression displays it (notebook style).
ci = bootstrap_confidence_interval(y, hy)
ci

# Load the SemEval 2017 English training tweets and fit a text model on them.
# NOTE(review): token_list=[-1] presumably selects word unigrams - confirm
# against the TextModel documentation.
D = list(tweet_iterator('../../../datasets/semeval/semeval2017_En_train.json'))
tm = TextModel(token_list=[-1]).fit(D)

# Invert word2id so the weights, keyed by id, can be re-keyed by word.
id2word = {v:k for k, v in tm.model.word2id.items()}
_ = {id2word[k]:v for k, v in tm.model.wordWeight.items()}

# Word cloud of the model's word weights (the file name suggests IDF values)
wc = WC().generate_from_frequencies(_)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_idf.png', dpi=300)

# Word cloud of the raw token counts over all tweets; the comprehension is
# used only for its side effect of updating cnt.
cnt = Counter()
_ = [cnt.update(tm.tokenize(x)) for x in D]
wc = WC().generate_from_frequencies(cnt)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_tf.png', dpi=300)


# Load previously stored results from the compressed file.
perf = load_model('dataset/performance.gz')
Exemplo n.º 16
0
#! -*- coding: utf-8 -*-

import jieba
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import ImageColorGenerator, WordCloud as WC

# Read the Chinese source text. The encoding is explicit so decoding does not
# depend on the process locale, and the context manager closes the file.
with open('files/python_zh.txt', encoding='utf-8') as f:
    text = f.read()
# Segment the text and separate the tokens with spaces for the renderer.
cn_text = ' '.join(jieba.cut(text))

# A font with Chinese glyphs is needed to draw the words.
font_path = '/usr/share/fonts/adobe-source-han-sans-cn/SourceHanSansCN-Normal.otf'
# scipy.misc.imread no longer exists in current SciPy releases, so the image
# is loaded with Pillow and converted to an array. It serves both as the
# mask and as the colour source.
back_coloring = np.array(Image.open('images/python_logo.png'))

wc = WC(font_path=font_path,
        background_color='white',
        mask=back_coloring,
        random_state=42,
        margin=2)
wc.generate(cn_text)

# Recolour the words from the colours of the image.
image_colors = ImageColorGenerator(back_coloring)
wc.recolor(color_func=image_colors)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

wc.to_file('images/wordcloud_cn.png')
Exemplo n.º 17
0
    okt = Okt()  # 형태소 분석 객체 생성
    kwds = okt.pos(x, norm=True, stem=True)
    kwds_filtered = [x for x, y in kwds if y in ['Noun', 'Adjective', 'Verb']]
    return kwds_filtered


# Extract the keyword list of every review and export the enriched table.
df_rev.loc[:, 'kwd'] = df_rev.loc[:, 'text'].map(get_kwds)
df_rev.to_excel(writer, sheet_name='reviews')

# For both selected topics and both models: count the keywords, export the
# counts to their own sheet and place a word-cloud image next to them.
for topic in [topic_diff_max, topic_diff_min]:
    for model in ['A', 'B']:
        df_filtered = df_rev[(df_rev['model'] == model)
                             & (df_rev['topic'] == topic)]
        # Summing a column of lists concatenates them into one list.
        list_kwd = df_filtered['kwd'].sum()
        dict_kwd_cnt = Counter(list_kwd)
        df_kwd_cnt = df({
            'kwd': list(dict_kwd_cnt.keys()),
            'cnt': list(dict_kwd_cnt.values())
        })
        df_kwd_cnt = df_kwd_cnt.sort_values(by='cnt', ascending=False)
        sheet_name = f'kwd_cnt_{model}_{topic}'
        df_kwd_cnt.to_excel(writer, sheet_name=sheet_name, index=False)
        wc = WC(max_words=40,
                font_path='./font/SeoulNamsanM.ttf',
                background_color='white')
        wc.generate_from_frequencies(dict_kwd_cnt)
        rst_png_file_path = f'./sample/kwd_cnt_{model}_{topic}.png'
        wc.to_file(rst_png_file_path)
        writer.sheets[sheet_name].insert_image(3, 4, rst_png_file_path)

# ExcelWriter.save() is no longer available in pandas 2.x; close() writes
# the workbook on old and new versions alike.
writer.close()
Exemplo n.º 18
0
    _ = " & ".join(map(lambda x: "{:0.4f}".format(x), w))
    print(r"{} \\".format(_))

# Row and column sums of W, and their outer product (the table W would be
# if it were exactly the product of its marginals).
R_m = W.sum(axis=1)
C_m = W.sum(axis=0)
ind = np.dot(np.atleast_2d(R_m).T, np.atleast_2d(C_m))

# Print each row of the difference as "a & b & ... \\" with four decimals
# (the separators match a LaTeX table row).
for w in (W - ind):
    _ = " & ".join(map(lambda x: "{:0.4f}".format(x), w))
    print(r"{} \\".format(_))

# Example of the [bigrams](#tab:bivariate-distribution)

# Map every "a~b" bigram to the index pair of its two words, then look up
# the corresponding co_occurrence entry; `_` is reused as a scratch name.
_ = [(bigram, [index[x] for x in bigram.split("~")]) for bigram in bigrams]
_ = {key: co_occurrence[i, j] for key, (i, j) in _}
wc = WC().generate_from_frequencies(_)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('wordcloud_us.png', dpi=300)

# Row sums of the co-occurrence matrix
M = co_occurrence.sum(axis=1)


def get_diff(key):
    """Return the co-occurrence of a bigram minus the product of its row sums.

    ``key`` is an ``"a~b"`` string whose two parts are looked up in
    ``index``. When both parts map to the same index only the negated
    product ``M[a] * M[b]`` is returned.
    """
    a, b = (index[part] for part in key.split('~'))
    if a != b:
        return co_occurrence[a, b] - M[a] * M[b]
    return -M[a] * M[b]

# Stem every token of every tweet with NLTK's Porter stemmer.
from nltk.stem.porter import *
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: [stemmer.stem(token) for token in tokens])

# Glue each token list back into one space-separated string.
combined['clean_tweet'] = tokenized_tweet.apply(lambda tokens: ' '.join(tokens))

# All cleaned tweets as a single text.
words = ' '.join(combined['clean_tweet'])

# Word cloud over every tweet.
# NOTE(review): max_font_size is 21 here but 110 for the cloud below -
# confirm the difference is intended.
from wordcloud import WordCloud as WC
cloud = WC(
    width=800,
    height=500,
    random_state=21,
    max_font_size=21,
).generate(words)

plt.figure(figsize=(10, 7))
plt.imshow(cloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Word cloud restricted to the tweets labelled 0.
normal_words = ' '.join(combined['clean_tweet'][combined['label'] == 0])
normal_wordcloud = WC(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(normal_words)
plt.figure(figsize=(10, 7))
plt.imshow(normal_wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
def review_analysis(src_file_path,
                    rst_file_path,
                    font_file_path='./font/SeoulNamsanM.ttf'):
    """
    Process purchase-review data and write the results to an Excel workbook.

    The raw text is cleaned and parsed into (model, topic, score, text)
    records. Mean scores per model and topic and their A-B difference are
    exported with charts. Keyword counts and word clouds are exported for the
    topics where A does best and worst relative to B.

    :param src_file_path: path of the raw purchase-review data
    :param rst_file_path: path where the resulting workbook is stored
    :param font_file_path: path of the font used for the word clouds
    :raises ValueError: if no review line matches the expected format
    """

    from collections import Counter
    from itertools import chain
    import re
    from statistics import mean

    from konlpy.tag import Okt
    import pandas as pd  # imported here so the function does not rely on a module-level `pd`
    from pandas import DataFrame as df
    from wordcloud import WordCloud as WC

    with open(src_file_path, 'rt', encoding='utf-8', errors='ignore') as f:
        rev = f.read()

    ptrn_html_code = re.compile(r'&[a-zA-Z]+?;')  # HTML entity pattern
    ptrn_html_tag = re.compile(r'</?[a-zA-Z]+?>')  # HTML tag pattern
    ptrn_all_clean = re.compile(r'[^0-9a-zA-Z가-힣.,:/\s]')  # any other unwanted character
    for cleaner in [ptrn_html_code, ptrn_html_tag, ptrn_all_clean]:
        rev = cleaner.sub('', rev)

    re_phrase = r'^([AB]):([a-z]+?)관련/점수:([1-5])점/내용:([0-9a-zA-Z가-힣.,:\s]+?)$'
    ptrn_phrase = re.compile(re_phrase, re.M)
    rev_values = ptrn_phrase.findall(rev)
    if not rev_values:
        # The unpacking below would fail with an opaque ValueError; raise the
        # same exception type with a message that names the cause.
        raise ValueError(
            f'no review line in {src_file_path!r} matches the expected format')
    models, topics, scores, texts = zip(*rev_values)
    df_rev = df({
        'model': models,
        'topic': topics,
        'score': scores,
        'text': texts
    })

    # The pattern only captures a single digit 1-5, so the conversion cannot
    # fail and the deprecated errors='ignore' is not needed.
    df_rev['score'] = pd.to_numeric(df_rev['score'], downcast='integer')
    df_rev_mean = df_rev.pivot_table(values='score',
                                     index='model',
                                     columns='topic',
                                     aggfunc=mean)
    df_rev_diff = df_rev_mean.diff(periods=-1)  # row A holds A - B
    topic_diff_max = df_rev_diff.loc['A', :].idxmax()  # topic where A does best relative to B
    topic_diff_min = df_rev_diff.loc['A', :].idxmin()  # topic where A does worst relative to B

    writer = pd.ExcelWriter(rst_file_path, engine='xlsxwriter')  # Excel workbook object
    df_rev_mean.to_excel(writer, sheet_name='score_mean')
    df_rev_diff.to_excel(writer, sheet_name='score_diff')

    # Topics occupy sheet columns 1..n_topics (column 0 is the model index),
    # so the chart ranges follow the data instead of assuming five topics.
    n_topics = df_rev_mean.shape[1]

    m_chart = writer.book.add_chart({'type': 'radar'})  # radar chart of the mean score per topic
    m_cat = ['score_mean', 0, 1, 0, n_topics]
    m_val_a = ['score_mean', 1, 1, 1, n_topics]
    m_val_b = ['score_mean', 2, 1, 2, n_topics]
    m_chart.add_series({'name': 'A', 'categories': m_cat, 'values': m_val_a})
    m_chart.add_series({'name': 'B', 'categories': m_cat, 'values': m_val_b})
    m_chart.set_title({'name': '토픽별 점수 평균'})
    writer.sheets['score_mean'].insert_chart(3, 0, m_chart)

    d_chart = writer.book.add_chart({'type': 'column'})  # column chart of the mean score difference per topic
    d_cat = ['score_diff', 0, 1, 0, n_topics]
    d_val = ['score_diff', 1, 1, 1, n_topics]
    d_chart.add_series({'name': 'A-B', 'categories': d_cat, 'values': d_val})
    d_chart.set_title({'name': '토픽별 점수 평균 차이 (A-B)'})
    writer.sheets['score_diff'].insert_chart(2, 0, d_chart)

    okt = Okt()  # morphological analyser, built once and shared by every row

    def get_kwds(x):
        """Return the nouns, adjectives and verbs of ``x`` as tagged by Okt."""
        kwds = okt.pos(x, norm=True, stem=True)
        return [
            word for word, tag in kwds if tag in ('Noun', 'Adjective', 'Verb')
        ]

    df_rev['kwd'] = df_rev['text'].map(get_kwds)
    df_rev.to_excel(writer, sheet_name='reviews')

    for topic in [topic_diff_max, topic_diff_min]:
        for model in ['A', 'B']:
            df_filtered = df_rev[(df_rev['model'] == model)
                                 & (df_rev['topic'] == topic)]
            # chain.from_iterable flattens the keyword lists in linear time
            # and yields an empty Counter when no review is selected
            # (Series.sum() would return 0 there and break Counter).
            dict_kwd_cnt = Counter(chain.from_iterable(df_filtered['kwd']))
            df_kwd_cnt = df({
                'kwd': list(dict_kwd_cnt.keys()),
                'cnt': list(dict_kwd_cnt.values())
            })
            df_kwd_cnt = df_kwd_cnt.sort_values(by='cnt', ascending=False)
            sheet_name = f'kwd_cnt_{model}_{topic}'
            df_kwd_cnt.to_excel(writer, sheet_name=sheet_name, index=False)
            if not dict_kwd_cnt:
                # A word cloud needs at least one word; keep the (empty)
                # sheet and skip the image.
                continue
            wc = WC(max_words=40,
                    font_path=font_file_path,
                    background_color='white')
            wc.generate_from_frequencies(dict_kwd_cnt)
            rst_png_file_path = f'./sample/kwd_cnt_{model}_{topic}.png'
            wc.to_file(rst_png_file_path)
            writer.sheets[sheet_name].insert_image(3, 4, rst_png_file_path)

    # ExcelWriter.save() is no longer available in pandas 2.x; close() writes
    # the workbook on old and new versions alike.
    writer.close()
Exemplo n.º 21
0
# Notebook-style cell: the bare expression displays the token counter.
token.counter
## SHOW - number of times each bigram and word appear

# Keep only the entries whose key contains "~" (str.count is used as a
# truthiness test); per the sample output below these are the bigrams.
bigrams = {k: v for k, v in token.counter.items() if k.count("~")}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),
#  ('to~the', 7339),
#  ('on~the', 4883),
#  ('and~the', 4843)]
bigrams
## SHOW the selection

# Word-cloud of the bigram frequencies, saved as a 300 dpi PNG
wc = WC().generate_from_frequencies(bigrams)
plt.imshow(wc)
plt.axis("off")
plt.tight_layout()
## SHOW
plt.savefig("books_wordcloud.png", dpi=300)

## SKIP
# Histogram: hist maps a frequency to the list of bigrams seen that often
hist = defaultdict(list)
# The comprehension is used only for its side effect of filling hist.
_ = [hist[v].append(k) for k, v in bigrams.items()]
# Frequencies with no bigram fall back to a one-element list, so the plotted
# value is log(1) = 0 instead of log(0).
plt.plot(np.log([len(hist.get(i, [0])) for i in range(1, 2000)]))
plt.grid()
plt.xlabel("Frequency")
plt.ylabel("Number of bigrams (log-scale)")
plt.tight_layout()