Example #1
def SearchSimilarWords(word):

    # Check whether the queried word exists in WordNet
    cur = conn.execute(f"select wordid from word where lemma='{word}'")
    word_id = 99999999  # sentinel value meaning "not found yet"
    for row in cur:
        word_id = row[0]

    # Check whether the word was found in WordNet
    if word_id == 99999999:
        print(f"「{word}」は、Wordnetに存在しない単語です。")
        return
    else:
        print(f"【「{word}」の類似語はね、以下ですよ】\n")

    # Find the synsets (concepts) that contain the input word
    cur = conn.execute(f"select synset from sense where wordid='{word_id}'")
    synsets = []
    for row in cur:
        synsets.append(row[0])

    # Look up the words in each synset and print them
    no = 1
    l_empty = []
    for synset in synsets:
        cur1 = conn.execute(f"select name from synset where synset='{synset}'")
        for row1 in cur1:
            print("%sつめの概念 : %s" % (no, row1[0]))
        cur2 = conn.execute(
            "select def from synset_def where (synset='%s' and lang='jpn')" %
            synset)
        sub_no = 1
        for row2 in cur2:
            print("意味%s : %s" % (sub_no, row2[0]))
            # add to the word-cloud source list
            l_empty.append(row2[0])
            sub_no += 1
        cur3 = conn.execute(
            f"select wordid from sense where (synset='{synset}' and wordid!={word_id})"
        )
        sub_no = 1
        for row3 in cur3:
            target_word_id = row3[0]
            cur3_1 = conn.execute(
                f"select lemma from word where wordid={target_word_id}")
            for row3_1 in cur3_1:
                print("類義語%s : %s" % (sub_no, row3_1[0]))
                # add to the word-cloud source list
                l_empty.append(row3_1[0])
                sub_no += 1
        print("\n")
        no += 1

    list_str = ",".join(l_empty)
    wordcloud = WordCloud(background_color="white",
                          font_path="./TakaoPGothic.ttf",
                          width=800,
                          height=600).generate(list_str)

    wordcloud.to_file("./wordcloud_sample.png")
    answer = random.choice(l_empty)
    l_phrase = []
    l_phrase.append(f"私はね、{word} とはですね、言ってみればもはや {answer} だと思うんですよ。")
    l_phrase.append(f"{word} ってことはですよ、{answer} とも考えられるということですよ。")
    l_phrase.append(f"{word} って、もう {answer} ですよね。")
    phrase = random.choice(l_phrase)
    print(phrase)
Example #2

def chinese_jieba(text):
    wordlist_jieba = jieba.cut(text)
    space_wordlist = " ".join(wordlist_jieba)
    return space_wordlist


# read the csv file
df = pd.read_csv('douban_movie.csv')
comment_list = df['comment'].values.tolist()
score_list = df['score'].values.tolist()
text = ""
stopwords = [
    line.strip() for line in open('stop.txt', encoding='UTF-8').readlines()
]
for jj in range(len(comment_list)):
    text = text + chinese_jieba(comment_list[jj])
# count word frequencies once, after all comments are concatenated
word_counts = collections.Counter(text.split())
print(text)
mask_pic = imread('movie.jpg')
wordcloud = WordCloud(
    font_path="C:/Windows/Fonts/simfang.ttf",  #设置字体
    mask=mask_pic,  #设置背景图片
    background_color="white",  #设置背景颜色
    max_font_size=150,  # 设置字体最大值
    max_words=2000,  # 设置最大显示的字数
    stopwords=stopwords  #设置停用词,停用词则不再词云图中表示
).generate(text)  #根据文本生成词云
imge = wordcloud.to_image()
wordcloud.to_file('key.png')
Example #3
File: st_app.py  Project: nadcharin/mlops
    plt.title("Tag distribution", fontsize=20)
    plt.xlabel("Tag", fontsize=16)
    ax.set_xticklabels(tags, rotation=90, fontsize=14)
    plt.ylabel("Number of projects", fontsize=16)
    plt.show()
    st.pyplot(plt)

    # Plot a word cloud for the top tags
    plt.figure(figsize=(20, 8))
    tag = st.selectbox("Choose a tag", tags, index=0)
    subset = df[df.tags.apply(lambda tags: tag in tags)]
    text = subset.text.values
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color="black",
        collocations=False,
        width=500,
        height=300,
    ).generate(" ".join(text))
    plt.axis("off")
    plt.imshow(cloud)
    st.pyplot(plt)

    # Preprocessing
    st.write("---")
    st.subheader("Preprocessing")
    filters = st.text_input("filters", "[!\"'#$%&()*+,-./:;<=>?@\\[]^_`{|}~]")
    lower = st.checkbox("lower", True)
    stem = st.checkbox("stem", False)
    text = st.text_input("Input text", "Conditional generation using Variational Autoencoders.")
    preprocessed_text = data.preprocess(text=text, lower=lower, stem=stem, filters=filters)
Example #4
#print(dir(tweetblob))

#Filter Words
wordsToFilter = [
    "about", "https", "in", "the", "thing", "will", "could", tweetSearch
]
filteredDictionary = dict()

for word in tweetblob.words:
    #skip tiny words
    if len(word) < 2:
        continue
    #skip words with random characters or numbers
    if not word.isalpha():
        continue
    #skip words in our filter
    if word.lower() in wordsToFilter:
        continue
    #don't want lower case words smaller than 5 letters
    if len(word) < 5 and word.upper() != word:
        continue

    #Try lower case only, try with upper case!
    filteredDictionary[word.lower()] = tweetblob.word_counts[word.lower()]

#Create the word cloud
wordcloud = WordCloud().generate_from_frequencies(filteredDictionary)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Example #5
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import jieba

with open('Data.txt', 'r', encoding='utf-8') as f:
    text = ' '.join(jieba.cut(f.read(), cut_all=False))

wc = WordCloud(
    background_color='black',
    font_path='C:\\Windows\\Fonts\\msyh.ttc',
    width=1080,
    height=2340,
).generate(text)

imageFile = wc.to_image()
imageFile.save("image.png")
imageFile.show()
Example #6
cursor = db.cursor()
#cursor.execute("insert into nlpt_feedback values(user_input_feedback,(1 if ans="pos" else -1))")
cursor.execute("select feedback from nlpt_feedback where lab=1")
pos = cursor.fetchall()
pos = [x[0] for x in pos]

cursor.execute("select feedback from nlpt_feedback where lab=-1")
neg = cursor.fetchall()
neg = [x[0] for x in neg]
db.close()

postext = " ".join(pos)
negtext = " ".join(neg)
stopwords = set(STOPWORDS)

wc = WordCloud(background_color="white", random_state=42)
wc.generate(postext)
plt.figure()
plt.axis('off')
plt.imshow(wc, interpolation="bilinear")
plt.savefig('images/pos.png', bbox_inches='tight')

wc.generate(negtext)
plt.imshow(wc, interpolation="bilinear")
plt.savefig('images/neg.png', bbox_inches='tight')

db = MySQLdb.connect("127.0.0.1", "root", "spiderman", "feedback")
cursor = db.cursor()
cursor.execute("select feedback from nlpt_feedback where lab=1")
posdata = cursor.fetchall()
cursor.execute("select feedback from nlpt_feedback where lab=-1")
Example #7
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from wordcloud import WordCloud

textwc = ""
with open('hive.txt', encoding='utf-8') as f:
    textwc = ''.join(f.readlines())

wordcloud = WordCloud(
    font_path="/usr/share/fonts/truetype/andika/Andika-R.ttf",
    width=4000,
    height=2000,
    mask=None,
    color_func=None,
    max_words=300,
    min_font_size=12,
    stopwords=None,
    background_color="gray",
    max_font_size=300,
    colormap="gist_heat",
    contour_width=0,
    contour_color="white")

wordcloud.generate(textwc)

wordcloud.to_file('hive_mod.png')
from nltk.corpus import stopwords
stop = stopwords.words('english')


reviews['Title'] = reviews['Title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
reviews['Content'] = reviews['Content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


## Tokenization:





## Word Cloud:
wc = WordCloud(background_color="white", max_words=2000)
wc.generate(' '.join(reviews['Content']))


import matplotlib.pyplot as plt
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
#plt.figure(figsize=(4, 3))
#plt.axis("off")
plt.show()





Example #9
"""
Generating a square wordcloud from the US constitution using default arguments.
"""

import os

from os import path
from wordcloud import WordCloud

# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text.
text = open(path.join(d, 'constitution.txt')).read()

# Generate a word cloud image
wordcloud = WordCloud().generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# lower max_font_size
"""
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
Example #10
def _perform_wordcloud_visualization(self, condition):
    if condition:
        print("Please wait plotting wordcloud...")
        wc = WordCloud(width=1000, height=400).generate(self.text)
        plt.axis("off")
        plt.imshow(wc)
Example #11
def calling(file):
    # 1. Open the crawled file for reading
    file = open("movie.txt", 'r', encoding='utf-8')
    lines = file.readlines()

    # 2. Store all of the comments in the list `reaction`
    reaction = []
    for line in lines:
        reaction.append(line)
    file.close()

    okt = Okt()

    # 4. Run part-of-speech tagging on each sentence
    sentences_tag = []
    for sentence in reaction:
        morph = okt.pos(sentence, norm=True, stem=True)
        sentences_tag.append(morph)

    # 5. Collect only the adjectives into a list
    adj_list = []
    finallist = []
    for sentence1 in sentences_tag:
        polarlist = ['None1', 'None2']
        for word, tag in sentence1:
            if tag in ['Adjective'] and ("이다" not in word) and (
                    "아니다" not in word) and ("있다" not in word) and (
                        "없다" not in word) and ("많다" not in word) and (
                            "같다" not in word) and ("그렇다" not in word) and (
                                "이렇다" not in word) and ("어떻다" not in word):
                adj_list.append(word)
                with open('KnuSentiLex-master/data/SentiWord_info.json',
                          encoding='utf-8-sig',
                          mode='r') as f:
                    data = json.load(f)
                result = ['None3', 'None4']
                for i in range(0, len(data)):
                    if data[i]['word'] == word:
                        result.pop()
                        result.pop()
                        result.append(data[i]['word_root'])
                        result.append(data[i]['polarity'])

                r_word = result[0]
                s_word = result[1]

                polarlist.pop()
                polarlist.pop()
                polarlist.append(r_word)
                polarlist.append(s_word)

        polar_word1 = polarlist[0]
        polar_word2 = polarlist[1]

        finallist.append(polar_word2)

    print("-2점 : ", finallist.count('-2'))
    print("-1점 : ", finallist.count('-1'))
    print("0점 : ", finallist.count('0'))
    print("1점 : ", finallist.count('1'))
    print("2점 : ", finallist.count('2'))
    # print("총 데이터 개수 : ", len(finallist))

    a = finallist.count('-2')
    b = finallist.count('-1')
    c = finallist.count('0')
    d = finallist.count('1')
    e = finallist.count('2')

    # total count, excluding data that was not sentiment-scored
    all = a + b + c + d + e

    a2 = a * 2
    b2 = b * 4
    c2 = c * 6
    d2 = d * 8
    e2 = e * 10

    all2 = a2 + b2 + c2 + d2 + e2

    # star rating (out of 5)
    starnum = round((all2 / all), 2)

    print("별점 : ", starnum)

    # print("별점(소수점둘째자리까지) : ", "%0.2f" % starnum)

    # 6. Count the frequency of the selected words & sort by frequency
    counts = Counter(adj_list)
    tags = counts.most_common(200)

    # 7. Build the word cloud
    wordcloud = WordCloud(font_path='c:/Windows/Fonts/malgun.ttf',
                          background_color='white',
                          width=1200,
                          height=800).generate_from_frequencies(dict(tags))

    fig = plt.figure()
    plt.axis('off')
    plt.imshow(wordcloud)
    # plt.show()
    fig.savefig('./static/images/wordcloud_image.png')

    return starnum
Example #12
File: targeter.py  Project: jufx/tools

def grey_color_func(word,
                    font_size,
                    position,
                    orientation,
                    random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % 27  #% random.randint(60, 100)


# This function takes in your text and your mask and generates a wordcloud.
#def generate_wordcloud(words, mask):
word_cloud = WordCloud(width=512,
                       height=512,
                       background_color='white',
                       stopwords=set(STOPWORDS),
                       collocations=False).generate(words)
plt.figure(figsize=(5.12, 5.12), facecolor='white', edgecolor='blue')  # 512x512 px at the default 100 dpi
plt.imshow(word_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
#plt.show()
plt.imshow(word_cloud.recolor(color_func=grey_color_func, random_state=3),
           interpolation="bilinear")
plt.axis("off")
plt.figure()
#plt.title("Default colors")
default_colors = word_cloud.to_array()
plt.imshow(default_colors, interpolation="bilinear")
plt.axis("off")
    stopwords.add("t")
    stopwords.add("co")
    stopwords.add("https")
    stopwords.add("will")
    stopwords.add("people")
    stopwords.add("amp")
    stopwords.add("time")
    stopwords.add("got")
    stopwords.add("now")
    stopwords.add("got")
    stopwords.add("say")
    stopwords.add("getting")
    stopwords.add("day")
    stopwords.add("today")
    stopwords.add("COVID")
    stopwords.add("vaccine")
    stopwords.add("COVID19")
    stopwords.add("CovidVaccine")
    wordCloud = WordCloud(
        background_color='white',
        max_words=500,
        stopwords=stopwords
    )
    text = df["text"].to_csv()
    wordCloud.generate(text)
    print("First batch:")
    plt.figure(figsize=(18, 18))
    plt.imshow(wordCloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
import sqlite3
conn = sqlite3.connect('data.db')
user = {}
for i in conn.execute("select mid,name from user order by id").fetchall():
    user[i[0]] = i[1]
wordlist = []
for i in conn.execute("select following from relation order by id").fetchall():
    if i[0] in user:
        wordlist.append(user[i[0]])
wl_space_split = " ".join(wordlist)
mask_png = imread("fate.jpeg")
my_wordcloud = WordCloud(
    font_path=r"C:\Windows\Fonts\simhei.ttf",  # 词云自带的字体不支持中文,在windows环境下使用黑体中文
    background_color="white",  # 背景颜色
    max_words=500,  # 词云显示的最大词数
    max_font_size=100,  # 字体最大值
    random_state=42,
    mask=mask_png,
    width=1000,
    height=860,
    margin=2,
).generate(wl_space_split)
image_colors = ImageColorGenerator(mask_png)
plt.figure()
plt.imshow(my_wordcloud.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
plt.imshow(mask_png, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
my_wordcloud.to_file("wordcloud.png")
Example #15
emotion_filename = ""
emotion = pd.read_csv(emotion_filename)

#Emotion wordcloud formation
shame = []
disgust = []
joy = []
sadness = []
fear = []
guilt = []
anger = []

default_color = 'grey'

wc = WordCloud(width=1600, height=800, collocations=False, relative_scaling=0,\
               max_font_size = 100,background_color = 'white', \
               ).generate(' '.join(emotion.hashtag).lower())

#Categorization of hashtags
for item in emotion.itertuples():
    if item.anger != 0:
        anger.append(item.hashtag)
    if item.shame != 0:
        shame.append(item.hashtag)
    if item.disgust != 0:
        disgust.append(item.hashtag)
    if item.joy != 0:
        joy.append(item.hashtag)
    if item.sadness != 0:
        sadness.append(item.hashtag)
    if item.fear != 0:
from wordcloud import WordCloud, STOPWORDS

comment_word = ""

stopword = set(STOPWORDS)

file = open("word.txt", "r+")

text = file.read().replace("\n", " ")

wc = WordCloud(stopwords=stopword,
               width=792,
               height=507,
               min_font_size=10,
               background_color="Black")

# generate word cloud
wc.generate(text)

# store to file
wc.to_file("wordcloud.png")
print("Successfull")
Example #17
counts = Counter(t1_tokenized)
print("Number of distinct words "+str(len(counts)))
print("Number of tokens "+str(len(t1_tokenized)))
print("Number of characters "+str(len(rawt1)))
print(t1_tokenized)
fdist = FreqDist(t1_tokenized)
fdist.plot(30, cumulative=False)
plt.show()


# In[2]:


word_cloud_dict=Counter(t1_tokenized)
wordcloud = WordCloud(width = 1000, height = 1000,
                      background_color = 'white',
                      stopwords = None).generate_from_frequencies(word_cloud_dict)
plt.figure(figsize = (8,8) , facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()


# In[3]:


stop_words = set(stopwords.words("english"))
filtered_text = []
for w in t1_tokenized:
    if w not in stop_words:
Example #18
from wordcloud import WordCloud
import matplotlib.pyplot as plt

#main-1
#open the text file
text = open('J:/文档e盘/深度学习/03 词云/03 词云/constitution.txt').read()
#create the WordCloud object
wc = WordCloud().generate(text)

#display the word cloud
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

#save to a file
wc.to_file('wordcloud.png')

#main-2
#the example above is in English; now for Chinese text
text = open('J:/文档e盘/深度学习/03 词云/03 词云/xyj.txt', encoding='UTF-8').read()

wc = WordCloud(font_path='Hiragino.ttf',
               width=800,
               height=600,
               mode='RGBA',
               background_color=None).generate(text)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
        this_topic = this_hotel.loc[idx]
        sc = this_topic['Score'].mean()
        print("\nAverage Sentiment for" ,h, "Topic", c, ":", sc)
        
#Word Clouds
def shades_of_grey(word, font_size, position, orientation, random_state=None, \
                   **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60,1000)
#Word cloud for all words in the reviews
st = set(STOPWORDS)
st.add("hotel")
st.add("room")
st.add("quot")
st.add("one")
st.add("casino")
wc = WordCloud(stopwords=st,width=600, height=400)
s = ""
for i in range(len(comments)):
    s += comments[i]
wc.generate(s)
# Display the word cloud.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.figure()
plt.show()

#From the sentiment words of all words
corpus_sentiment = {}
n_sw = 0
for i in range(n_reviews):
    # Iterate over the terms with nonzero scores
Example #20
def word_cloud(book_name):
    # !pip install wordcloud

    import nltk
    from konlpy.corpus import kobill
    from konlpy.tag import Twitter
    t = Twitter()
    from wordcloud import WordCloud

    import matplotlib.pyplot as plt
    import pandas as pd  # needed for pd.read_csv below
    import platform
    import io
    import base64
    img = io.BytesIO()

    # per-OS matplotlib Korean font handling
    path = "static/AppleGothic.ttf"  # for Windows users, setting this path correctly is important
    from matplotlib import font_manager, rc
    if platform.system() == 'Darwin':
        rc('font', family='AppleGothic')
    elif platform.system() == 'Windows':
        font_name = font_manager.FontProperties(fname=path).get_name()
        rc('font', family=font_name)
    else:
        print('Unknown system... sorry~~~~')

    # start building the word cloud

    files_ko = kobill.fileids()
    books_all = pd.read_csv('static/books_all.csv')

    book_name = book_name  # received as an input argument

    files_ko = kobill.fileids()

    doc_ko = books_all[books_all['name'] == book_name].iloc[0].text
    tokens_ko = t.nouns(doc_ko)

    with open('static/project_stopwords.txt', 'r', encoding='utf-8') as f:
        stop_words = f.read().split(' ')

    ko = nltk.Text(tokens_ko)
    ko = [each_word for each_word in ko if each_word not in stop_words]
    ko = nltk.Text(ko)

    data = ko.vocab().most_common(150)

    # for win : font_path='c:/Windows/Fonts/malgun.ttf'
    wordcloud = WordCloud(
        font_path='static/AppleGothic.ttf',
        relative_scaling=0.2,
        background_color='white',
    ).generate_from_frequencies(dict(data))

    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(img, format='png')
    img.seek(0)

    return base64.b64encode(img.getvalue()).decode()
Example #21
Satisfiedemp = dataemp.loc[dataemp["label"] == 1]
UnSatisfiedemp = dataemp.loc[dataemp["label"] == 0]

# In[34]:

Satisfiedemp.shape

# In[67]:

ignorewords = Satisfiedemp["pros"].isin(
    ['Amazon', 'company', 'work', 'place', 'employee', 'team', 'time'])
satwords = Satisfiedemp.loc[~(ignorewords), "pros"]

# In[68]:

wordcloud = WordCloud().generate(' '.join(satwords))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# In[71]:

ignorewords = UnSatisfiedemp["cons"].isin([
    'management',
    'manager',
    'employee',
    'Amazon',
    'customer',
    'team',
    'time',
    'job',
Example #22
def find_speaker(name):
    speaker_list = []
    for x in transcript:
        namelen = len(name)
        if x[0:namelen + 1] == (' ' + name) or x[0:namelen] == name:
            speaker_list.append(x)

    speaker_list = ' '.join(speaker_list).split()
    #print(speaker_list)

    stripped_speech = []
    for i in speaker_list:
        if i not in stopwords:
            stripped_speech.append(i)

    speaker_count = {}
    for word in stripped_speech:
        if word in speaker_count.keys():
            speaker_count[word] = speaker_count[word] + 1
        else:
            speaker_count[word] = 1

    d = collections.Counter(speaker_count)
    tot = len(speaker_list)

    top_30 = d.most_common(30)

    print(name + " spoke a total number of " + str(tot) + " words.")
    for word, count in d.most_common(30):
        print(word, ": ", count)

        #PLOT BAR CHART (one bar per word)
        plt.bar(word, count)
    # label and show the chart once, after all bars are added
    plt.title('Word Count')
    plt.xlabel('Word')
    plt.ylabel('Count')
    plt.show()

    #WORD CLOUD

    picture = name + ".jpg"

    char_mask = np.array(Image.open(picture))

    # Create a word cloud image
    wc = WordCloud(background_color="white",
                   max_words=1000,
                   mask=char_mask,
                   stopwords=stopwords,
                   contour_width=3,
                   contour_color='lightgrey')

    # Generate a wordcloud
    text = ' '.join(stripped_speech)
    wc.generate(text)
    print(wc)

    # show
    plt.figure(figsize=[30, 20])
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
Example #23
        f1_score(test[category], prediction, average='weighted')))
    print('Test precision is {}'.format(
        precision_score(test[category], prediction, average='macro')))

    print(confusion_matrix(test[category], prediction))
    print("\n")

######
from wordcloud import WordCloud, STOPWORDS
plt.figure(figsize=(40, 25))
# clean
subset = review_random_set[review_random_set.flavour == 1]
text = subset.combined.values
cloud = WordCloud(stopwords=STOPWORDS,
                  background_color='black',
                  collocations=False,
                  width=2500,
                  height=1800).generate(" ".join(text))
plt.axis('off')
plt.title("Flavour", fontsize=40)
plt.imshow(cloud)

review_random_set = pd.read_csv('binary_reviews.csv')
review_random_set = review_random_set.drop(columns=['Unnamed: 0'])
# %%
##### figuring out the sentiment part and representation
copy = review_random_set.copy()
copy['sentiment'] = 1
copy.iloc[500:, 18] = -1

for row in range(0, len(copy)):
Example #24
                    orientation,
                    random_state=None,
                    **kwargs):
    return


words = open('cloud.txt')
word_count = []
for line in words.readlines():
    word = line.strip().split(':')
    word_count.append((word[0], int(word[1])))  # word[0] is already str in Python 3

bg_mask = np.array(Image.open('bg.png'))

wc = WordCloud(
    font_path='./font/msyh.ttc',  # set the font
    background_color="white",  # background color
    max_words=2000,  # maximum number of words shown
    mask=bg_mask,
    max_font_size=90,  # maximum font size
    random_state=41,
    scale=3)

wc.fit_words(dict(word_count))  # fit_words expects a dict of word -> frequency
image_colors = ImageColorGenerator(bg_mask)

plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis('off')
plt.show()
wc.to_file('test.png')
Example #25
    font_name = font_manager.FontProperties(
        fname="c:/Windows/fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
else:
    print('unknown...')

# data = ko_con_text.vocab().most_common(500)
# data=ko
tmp_data = dict(data)

# plt.figure(figsize=(16,8))
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show()

korea_coloring = np.array(Image.open("bb.jpg"))
image_colors = ImageColorGenerator(korea_coloring)
wordcloud = WordCloud(
    font_path='c:\\windows\\fonts\\NanumGothic.ttf',
    relative_scaling=0.1,
    mask=korea_coloring,
    background_color='white',
    min_font_size=4,
    max_font_size=40,
).generate_from_frequencies(tmp_data)
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud.recolor(color_func=image_colors),
           interpolation="bilinear")
plt.axis("off")
plt.show()
Example #26
def main():
    """Tweet Classifier App with Streamlit """

    # Creates a main title and subheader on your page -
    # these are static across all pages

    # Creating sidebar with selection box
    options = [
        "Prediction", "Purpose of the App", "Exploratory Data Analysis",
        "About Global Warming", "Machine Learning Models",
        "Natural Language Processing"
    ]
    selection = st.sidebar.selectbox("Choose Option", options)

    if selection == "Exploratory Data Analysis":
        df_senti1 = raw[raw['sentiment'] == 1]
        tweet_senti1 = " ".join(review for review in df_senti1.message)

        #create word cloud in eda

        st.image(img3,
                 width=600,
                 caption="Visualising the climate change threat")

        st.title("Insight From The Data")
        st.subheader(
            "A Representation Of The Most Common Words In Each Sentiment Class"
        )
        sent_groups = st.radio('Sentiment Views:', (
            'Positive, those who believe climate change is a threat',
            'Negative sentiment, opposing the belief that climate change is a threat',
            'Neutral, an impartial stance on climate change',
            'News Report, topical news reported on climate change'))
        if sent_groups == (
                'Positive, those who believe climate change is a threat'):
            df_senti1 = clean[clean['sentiment'] == 1]
            tweet_senti1 = " ".join(review
                                    for review in df_senti1.clean_stp_words)
            # Create and generate a word cloud image:
            wordcloud_1 = WordCloud(
                max_font_size=50, max_words=100,
                background_color="white").generate(tweet_senti1)
            plt.imshow(wordcloud_1, interpolation='bilinear')
            #plt.set_title('Tweets under Pro Class 1',fontsize=50)
            plt.axis('off')
            plt.show()
            st.pyplot()
            if st.checkbox('Interpretation of Diagram, Sentiment 1'):
             """Common words of interest in pro-sentiment include `To fight`,`to tackle`, `belive in` and `fight climate`. It appears that 
					tweets in this category are providing solutions to fight climate change. Many of the sentiments reflected are related to 
					on Trumps commentary. In the pro sentiment class we find that people do not agree with Trump.')"""
        if sent_groups == 'News Report, topical news reported on climate change':
            df_senti_2 = clean[clean['sentiment'] == 2]
            tweet_senti_2 = " ".join(review
                                     for review in df_senti_2.clean_stp_words)
            # Create and generate a word cloud image:
            wordcloud_2 = WordCloud(
                max_font_size=50, max_words=100,
                background_color="white").generate(tweet_senti_2)
            plt.imshow(wordcloud_2, interpolation='bilinear')
            #plt.set_title('Tweets under Pro Class 1',fontsize=50)
            plt.axis('off')
            plt.show()
            st.pyplot()
            if st.checkbox('Interpretation of Diagram, Sentiment 2') :
             """Common words news tweets are `Trump, global warming, via`,`Scientists`,`researchers`,`ÈPA` and `report`.
				 	This could reveal the sentiment that humans are the cause of climate change because they burn fossil fuels. 
				 	News reports can be highly influential on overall sentiment as many rely of the media to validate their beliefs. 
				 	It is evident that the word Trump is  most common. According to research in the news, the momentum for these 
				 	sentiments comes from the commentary that president Trump has made about climate change."""

        if sent_groups == "Neutral, an impartial stance on climate change":
            df_senti_0 = clean[clean['sentiment'] == 0]
            tweet_senti_0 = " ".join(review
                                     for review in df_senti_0.clean_stp_words)
            #Create and generate a word cloud image:
            wordcloud_0 = WordCloud(
                max_font_size=50, max_words=100,
                background_color="white").generate(tweet_senti_0)
            plt.imshow(wordcloud_0, interpolation='bilinear')
            #plt.set_title('Tweets under Pro Class 1',fontsize=50)
            plt.axis('off')
            plt.axis("off")
            plt.show()
            st.pyplot()
            if st.checkbox('Interpretation of Diagram, Sentiment 0'):
             """The sentiments in class 0 represents people that are neutral towards climate change. The reason could be that they are
					not aware of climate change, or do not have enough information, this can be seen by words such as `interviewer`,
					`Trump`, `think`. Common words in neutral tweets include `care about`,`think`,`maybe`. This could indicate
					uncerainty toward climate change validity or an apathetic inclination.Interestingly, the appearance of the word
				`	ignore` tells us that these tweeters find the matter confusing.')"""

        st.subheader(
            "**Observe the frequency of the 20 most common words in each class**"
        )
        Pro = clean[clean['sentiment'] == 1]
        Anti = clean[clean['sentiment'] == -1]
        Neutral = clean[clean['sentiment'] == 0]
        News = clean[clean['sentiment'] == 2]

        common = st.selectbox('Select Sentiment Type',
                              ('Positive', 'Negative', 'Neutral', 'News'))
        if common == 'Positive':
            Pro['temp_list'] = Pro['clean_stp_words'].apply(
                lambda x: str(x).split())
            top = Counter(
                [item for sublist in Pro['temp_list'] for item in sublist])
            temp_positive = pd.DataFrame(top.most_common(20))
            temp_positive.columns = ['Common_words', 'count']
            temp_positive = temp_positive.style.background_gradient(
                cmap='Greens_r')
            st.write(temp_positive, width=200)
        if common == 'Negative':
            Anti['temp_list'] = Anti['clean_stp_words'].apply(
                lambda x: str(x).split())
            top = Counter(
                [item for sublist in Anti['temp_list'] for item in sublist])
            temp_neg = pd.DataFrame(top.most_common(20))
            temp_neg.columns = ['Common_words', 'count']
            temp_neg = temp_neg.style.background_gradient(cmap='Greens_r')
            st.write(temp_neg, width=200)

        if common == 'News':
            News['temp_list'] = News['clean_stp_words'].apply(
                lambda x: str(x).split())
            top = Counter(
                [item for sublist in News['temp_list'] for item in sublist])
            temp_news = pd.DataFrame(top.most_common(20))
            temp_news.columns = ['Common_words', 'count']
            temp_news = temp_news.style.background_gradient(cmap='Greens_r')
            st.write(temp_news, width=200)

        if common == 'Neutral':
            Neutral['temp_list'] = Neutral['clean_stp_words'].apply(
                lambda x: str(x).split())
            top = Counter(
                [item for sublist in Neutral['temp_list'] for item in sublist])
            temp_net = pd.DataFrame(top.most_common(20))
            temp_net.columns = ['Common_words', 'count']
            temp_net = temp_net.style.background_gradient(cmap='Greens_r')
            st.write(temp_net, width=200)

        st.subheader("**A Closer Look At The Data Distribution**")
        temp = raw.groupby(
            'sentiment').count()['message'].reset_index().sort_values(
                by='message', ascending=False)
        temp['percentage'] = round(
            (temp['message'] / temp['message'].sum()) * 100, 0)
        labels1 = temp['sentiment']
        labels = ["Sentiment  %s" % i for i in temp['sentiment']]
        sizes = temp['percentage']
        fig1, ax1 = plt.subplots(figsize=(6, 6))
        fig1.subplots_adjust(0.3, 0, 1, 1)

        theme = plt.get_cmap('Greens_r')
        ax1.set_prop_cycle(
            "color", [theme(1. * i / len(sizes)) for i in range(len(sizes))])
        _, _ = ax1.pie(sizes, startangle=90, labels=labels1, radius=1800)

        ax1.axis('equal')
        total = sum(sizes)
        plt.legend(loc='upper left',
                   labels=[
                       '%s, %1.1f%%' % (l, (float(s) / total) * 100)
                       for l, s in zip(labels, sizes)
                   ],
                   prop={'size': 7},
                   bbox_to_anchor=(0.0, 1),
                   bbox_transform=fig1.transFigure)

        plt.show()  # Equal aspect ratio ensures that pie is drawn as a circle.
        st.pyplot()  #c, use_container_width=True)

        if st.checkbox('Interpretation of Pie Chart'):
         """More than half of the tweets analysed reflect a belief in climate change. 
					Although it is not an overwhelming majority figure, believers are in the majority.
					As science begins to offer clearer evidence it is likely that many neutral tweeters 
					could sway their beliefs. Less than ten percent of the sample population do not believe 
					in climate change. If the sample is a good representation of the population than the
					market for evironmentally friendly or environmentally conscious goods and services could
					be a desireable product to fairly large sector of the population')"""




    if selection == "Purpose of the App":

        st.header(
            "**The Impact Of Climate Change Sentiment And Maximising Profit**")
        img2 = Image.open("Images/gw.jpeg.jpg")
        st.image(img2,
                 width=400,
                 caption="Visualising the climate change threat")
        """This app will reveal the overall sentiment toward climate change by analysing recent
			tweets (post made on the social media application Twitter).By understanding how potential consumers 
			view climate change, companies can make informed decisions on product development and marketing. This app
			 will answer the question: Do people see climate change as a real threat?"""

        st.subheader(
            "A brief Look At The Raw Data (Database of tweets analysed)")

        if st.checkbox('Show raw data'):  # data is hidden if box is unchecked
            st.write(raw[["sentiment",
                          "message"]])  # will write the df to the page
            data = pd.DataFrame(raw, columns=['sentiment', 'message'])
            st.write(data.plot(kind='hist', color='green'))
            st.pyplot()
            data = {
                'Sentiment Type': ['-1', '0', '1', '2'],
                'Sentiment Meaning': [
                    'Negative sentiment, opposing the belief that climate change is a threat',
                    'Neutral, an impartial stance on climate change',
                    'Positive, supporting the belief that climate change poses a threat',
                    'News Report, topical news reported on climate change'
                ]
            }
            sentiment = pd.DataFrame(
                data, columns=['Sentiment Type', 'Sentiment Meaning'])
            sentiment = sentiment.set_index('Sentiment Type')
            st.write(sentiment, width=800)

            st.subheader("**Interpretation Of Sentiment Distribution**")
            """In the database ,most of the tweets indicate that alot of people believe climate change is a real threat and is man-man."""
            """Media coverage on climate change concerns substantiates the belief that climate change is a real threat.There are tweets 
				in the database that indicate that there are people who are nuetral on the subject of the subject
			    of Global warming ,however ,they are vastly outnumbered"""

    if selection == "Machine Learning Models":

        st.header("**Logistic Regression**")
        """The Logistic regression algorithm builds a regression model to predict
			the probability that a given data entry belongs to the category numbered as “1”.
			Logistic regression becomes a classification technique only when a decision
			threshold is brought into the picture. The setting of the threshold value is a very 
			is dependent on the classification problem itself.
			Logistic regression models the data using the sigmoid function.
			It squeezes the range of output values to exist only between 0 and 1.
			For binary classification ,the output value of a logistic regre.The threshold 
			value is usually set to 0.5 and determine if an observation will belong to class 0 or 1."""

        logistic_regression = Image.open("Images/logistic_regression.jpg")
        st.image(logistic_regression,
                 caption="sigmoid function for logistic regression ",
                 use_column_width=True)
        """For multiclass classification problems ,
			logistic regression models are combined into what is known as the one-vs-rest approach (or OvR).
			In the OvR case, a separate logistic regression model is trained for each label that the response
			variable takes on."""
        st.subheader("Pros and cons of Logistic Regression")
        """ - easy to implement and very efficient to train"""
        """ - Can overfit when data is unbalanced and Doesn't handle large number of categorical variables well."""
        logistic_reg_perf = Image.open('Images/logistic_reg_perfomance.jpg')
        st.image(logistic_reg_perf, use_column_width=True)
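        # Illustrative sketch only (not part of the original app): how the
        # sigmoid / 0.5-threshold idea described above looks with scikit-learn.
        # `X_demo` and `y_demo` are made-up arrays introduced just for this sketch.
        from sklearn.linear_model import LogisticRegression
        import numpy as np
        X_demo = np.array([[0.1], [0.4], [0.6], [0.9]])
        y_demo = np.array([0, 0, 1, 1])
        demo_lr = LogisticRegression().fit(X_demo, y_demo)
        demo_probs = demo_lr.predict_proba([[0.5]])[:, 1]  # sigmoid output in (0, 1)
        demo_labels = (demo_probs >= 0.5).astype(int)      # 0.5 decision threshold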

        st.header("**Random Forest tree**")
        """The building blocks of the random first model are Decision trees.Simple put ,the decision tree is a flowchart
			 of questions leading to a prediction.
			Random forest is a technique used in modeling predictions and behavior analysis and is built on decision trees.
			It contains many decision trees that represent a distinct instance of the classification of data input into the random forest. 
			The random forest technique takes consideration of the instances individually, taking the one with the majority of votes as 
			the selected prediction."""
        """Each decision tree in the forest considers a random subset of features when forming questions and only has access
				to a random set of the training data points.This increases diversity in the forest leading to more robust overall predictions and the name
				 ‘random forest.’ When it comes time to make a prediction, the random forest takes an average of all the individual decision tree estimates
				"""
        """Each tree in the classifications takes input from samples in the initial dataset.This is followed by a random selection of Features 
			(or indipendent variables) , which are used in growing the tree at each node. Every tree in the forest is pruned until 
			 the end of the exercise when the prediction is reached decisively. 
			Thus ,the random forest enables any classifiers with weak correlations to create a strong classifier"""
        decisiontree = Image.open("Images/random_forest.png")
        st.image(decisiontree,
                 caption="Random Forest tree process to predict a label ",
                 width=None)

        st.subheader("Pros and cons of the random forrest")
        """ - Can handle missing values well. Missing values are substituted by the variable appearing the most in a particular node."""
        """ - Provides the some of the highest accuracy of available classification methods"""
        """ - Some drawbacks is that the random forst classifyer method is that it requires a lot of computational reasources
			 time consuming ,and less intuitive compared to other algorithms"""
        random_for_perf = Image.open("Images/random_forest_perf.jpg")
        st.image(random_for_perf, use_column_width=True)
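        # Illustrative sketch only (not part of the original app): a random forest
        # is an ensemble of decision trees whose majority vote gives the prediction.
        # `X_rf_demo` and `y_rf_demo` are made-up arrays introduced just for this sketch.
        from sklearn.ensemble import RandomForestClassifier
        import numpy as np
        X_rf_demo = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
        y_rf_demo = np.array([0, 0, 1, 1])
        demo_rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_rf_demo, y_rf_demo)
        demo_rf.predict([[1, 0]])  # majority vote across the 10 trees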

        st.header("Support Vector Machine")
        """A Support Vector Machine (SVM) is a supervised machine learning algorithm that can be employed for both 
			classification and regression purposes.SVMs are based on the idea of finding a hyperplane that best divides 
			a dataset into two classes"""
        """Support vectors are the data points nearest to the hyperplane, the points of a data set that, if removed, would alter 
			the position of the dividing hyperplane. Because of this, they can be considered the critical elements of a data set."""
        """Simply put ,a hyperplane is a line that linearly separates and classifies a set of data."""
        """The further from the hyperplane a data point lies, the higher the probability that it has been 
			classified correctly. Ideally ,we require a data point to be as far away as possible , while still being on 
			the correct side of the hyperplane .Whenever new testing data is added ,the side of the hyperplane is 
			lands on decides the class it is assigned to.
			"""
        svm = Image.open("Images/support_vector1.jpg")
        st.image(svm,
                 caption="Hyperplane deviding data points",
                 use_column_width=True)
        st.subheader("Pros and Cons of Support Vector Machines")
        """- it is very accurate and works well on smaller cleaner datasets"""
        """ - It can be more efficient because it uses a subset of training points"""
        """ - Less effective on noisier datasets with overlapping classes , 
			training time with SVMs can be high ,thus not suitable for larger datasets"""
        svm_perf = Image.open("Images/support_vector_perfomance.jpg")
        st.image(svm_perf, use_column_width=True)
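        # Illustrative sketch only (not part of the original app): a linear SVM
        # fits a separating hyperplane; the support vectors are the points nearest
        # to it. `X_svm_demo` and `y_svm_demo` are made-up arrays for this sketch.
        from sklearn.svm import SVC
        import numpy as np
        X_svm_demo = np.array([[-2.0, -1.0], [-1.0, -2.0], [1.0, 2.0], [2.0, 1.0]])
        y_svm_demo = np.array([0, 0, 1, 1])
        demo_svm = SVC(kernel="linear").fit(X_svm_demo, y_svm_demo)
        demo_svm.support_vectors_  # the critical points that define the hyperplane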
        st.header("For more information on algorithm implimentation")
        "**Logistic regression**"
        " https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html"
        "**Random Forest **"

        " https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html"
        "**Support Vector Machines** "
        " https://scikit-learn.org/stable/modules/svm.html"

#Natural Language Processing page; it slowed down my computer because of the English library
    #if selection == 'Natural Language Processing':
    #st.info("Natural Language Processing")
    #tweet_text = st.text_area("Enter Text","Type Here")
    #nlp_task = ["Tokenization","NER","Lemmatization","POS Tags"]
    #task_choice = st.selectbox("Choose NLP Task",nlp_task)
    #if st.button("Analyze"):
    #st.info("Original Text {}".format(tweet_text))

    #docx = nlp(tweet_text)
    #if task_choice == 'Tokenization':
    #result = [ token.text for token in docx ]

    #elif task_choice == 'Lemmatization':
    #result = ["'Token':{},'Lemma':{}".format(token.text,token.lemma_) for token in docx]
    #elif task_choice == 'NER':
    #result = [(entity.text,entity.label_)for entity in docx.ents]
    #elif task_choice == 'POS Tags':
    #st.json(result)

    #if st.button("Tabulize"):
    #docx = nlp(tweet_text)
    #c_tokens = [ token.text for token in docx ]
    #c_lemma = [token.lemma_ for token in docx]
    #c_pos = [word.tag_ for word in docx]

    #new_df = pd.DataFrame(zip(c_tokens,c_lemma,c_pos),columns=['Tokens','Lemma','POS'])
    #st.dataframe(new_df)

    #if st.checkbox("Wordcloud"):

    #wordcloud =  WordCloud(max_font_size=30, max_words=100, background_color="orange").generate(tweet_text)
    #plt.imshow(wordcloud,interpolation='bilinear')
    #plt.axis("off")
    #st.pyplot()

    # Building out the "Information" page

    if selection == "About Global Warming":
        st.info("General Information")
        """ # Global Warming in 5 minutes """
        st.header("Natural Climate change")
        """ - Throughout its long history, Earth has warmed and cooled time and again.
		Climate has changed when the planet received more or less sunlight due to subtle shifts
		 in its orbit, as the atmosphere or surface changed, or when the Sun’s energy varied .This was
		 all without any help from humanity"""
        """ - Earth’s temperature begins with the Sun. Roughly 30% of incoming sunlight is 
		reflected back into space by bright surfaces like clouds and ice. The rest is absorbed by
		 the land and ocean, and the atmosphere. 
		 The absorbed solar energy heats our planet and makes it habitable."""
        """ - As the rocks, the air, and the seas get warmer, they radiate “heat” energy which 
		 travels into the atmosphere ,where it is absorbed by water vapor and long-lived greenhouse 
		 gases """
        """ - Greenhouse gases are those gases in the atmosphere that have an influence on the earth's energy balance. 
		The best known greenhouse gases, carbon dioxide (CO₂), methane and
		 nitrous oxide, can be found naturally in low concentrations in the atmosphere.
		"""
        """ - After absorbing the heat energy ,these greenhouse gases will radiate energy in all directions. Some of this energy is 
		 radiated back towards the Earth ,further warming atmosphere and surfaces - This is the natural greenhouse"""
        """ - Some natural forces that contribute to climate change include volcanic eruptions, which
		pump out clouds of dust and ash, which block out some sunlight. Volcanic debris also includes sulfur dioxide,
		 combines with water vapor and dust in the atmosphere to form sulfate aerosols, which reflect sunlight away
		  from the Earth’s leading to a cooling effect."""
        """ - Earth orbital changes - Shifts and wobbles in the Earth’s orbit can trigger changes in climate such as 
		the beginning and end of ice ages"""
        """ - Also natural is Solar variations. Although the Sun’s energy output appears constant
		from an everyday point of view, small changes over an extended period of time can lead to climate changes.
		 Some scientists suspect that a portion of the warming in the first half of the 20th century was due to an
		  increase in the output of solar energy"""
        """- Scientists constantly measure these natural effects, but none can account for the observed trend since 1970.
		 Scientists can only account for recent global warming by including the effects of human greenhouse gas emissions."""

        image = Image.open("Images/global temperature.jpg")
        st.image(image,
                 caption="Global temperature graph(Image: Global Warming Art)",
                 use_column_width=True)
        st.subheader("Some notable events in The Global Temperature timeline")
        """ Between 1850-1890 , the Mean global temperature was roughly 13.7°C.This is the time period of the First Industrial Revolution. 
			Coal, railroads, and land clearing speed up greenhouse gas emission, while 
			better agriculture and sanitation speed up population growth."""
        """Between 1870-1910 was the Second Industrial Revolution. Fertilizers and other chemicals, 
		electricity, and public health further accelerate population growth."""
        """ Around 1940 ,massive output of aerosols from industries and power plants 
		contributed to the global cooling trend from 1940-1970."""
        """ two major volcanic eruptions, El Chichon in 1982 and Pinatubo in 1991, pumped sulfur dioxide gas high into the atmosphere.
		 The gas was converted into tiny particles that lingered for more than a year, reflecting sunlight and shading Earth’s surface
		 causing cooling for two to three years."""
        """The 10 warmest years on record have all occurred since 1998, and 9 of the 10 have occurred since 2005."""
        """Models predict that Earth will warm between 2 and 6 degrees Celsius in the next century. When global warming has
		 happened at various times in the past two million years, it has taken the planet about 5,000 years to warm 5 degrees.
		 The predicted rate of warming for the next century is at least 20 times faster."""
        """- Factuations climate is natural but scientists say temperatures are now rising faster 
		than at many other times."""
        """ - Humans have been artificially raising the concentration of greenhouse gases in the atmosphere ,causing the enhanced
		Greenhouse effect """
        """ - Global warming is the unusually rapid increase in Earth’s average surface temperature over the past 
		century primarily due to the greenhouse gases released as people burn fossil fuels. """
        """ - According to IPCC in its 5th 2013 fifth assessment report ,there is 
		between a 95% and 100% probability that more than half of modern day warming was due to humans."""
        """ - Recent US fourth national climate assessment found that between 93% to 123% of observed 
			1951-2010 warming was due to human activities"""
        """ - Human activities like burning fossil fuels leading to higher carbon dioxide concentrations,
			farming and forestry — including land use change via agriculture and livestock
			cement manufacture
			aerosols — chlorofluorocarbons (CFCs) have been linked to Global warming"""
        """ - Greenhouse gases from these activities collect in the atmosphere and absorb sunlight and 
		solar radiation that have bounced off the earth’s surface. Normally, this radiation would escape 
		 into space—but these pollutants, which can last for
		 years to centuries in the atmosphere, trap the heat and cause the planet
		  to get hotter. That's what's known as the greenhouse effect """
        """- There are Natural external causes such as increases or decreases in volcanic activity or solar radiation.
		 For example, every 11 years or so, the Sun’s magnetic field flips ,this can cause small 
		 fluctuations in global temperature, up to about 0.2 degrees. On longer time scales – tens to hundreds
		  of millions of years – geological processes can drive changes in the climate, due to shifting
		   continents and mountain building"""
        """ # Evidence of Global Warming 📈 """
        """ - Across the globe, average sea level increased by 3.6mm per year between 2005 and 2015 """
        """ - According to the World Meteorological Organization (WMO),The world is about one degree Celsius warmer 
		than before widespread industrialisation"""
        """ - Data from NASA's Gravity Recovery and Climate Experiment show 
		The Greenland and Antarctic ice sheets have decreased in mass"""

        st.subheader("Suggested Readings :earth_africa: ")
        st.markdown("https://www.bbc.com/news/science-environment-24021772")
        st.markdown("https://climate.nasa.gov/evidence/")
        st.markdown(
            "https://earthobservatory.nasa.gov/features/GlobalWarming/page2.php"
        )
        st.markdown(
            "https://www.carbonbrief.org/analysis-why-scientists-think-100-of-global-warming-is-due-to-humans"
        )
        # read in word file

        st.subheader("Refereces")
        """1.https://www.newscientist.com/article/dn11639-climate-myths-the-cooling-after-1940-shows-co2-does-not-cause-warming/"""

        st.subheader("Climate change tweet classification")

    # Building out the predication page
    if selection == "Prediction":
        st.info("Prediction with ML Models")
        # Creating a text box for user input
        tweet_text = st.text_area("Enter Text to Classify ", "Type Here")
        #tweet_text=[tweet_text]
        #tweet_text = st.text_area("Enter Text","Type Here")
        all_ml_models = [
            "Logistic Regression", "Support Vector Machine",
            "Random Forest Tree"
        ]
        model_choice = st.selectbox("Choose ML Model", all_ml_models)
        prediction_labels = {
            "Neutral : This text neither supports nor refutes the belief of man-made Climate change":
            0,
            "Pro : This text shows belief in man-man climate change":
            1,
            "news : This text is links to factual news about climate change":
            2,
            "Anti : This text shows lack of belief in man-made climate change":
            -1
        }
        if st.button("Classify"):
            if model_choice == "Logistic Regression":
                predictor = joblib.load(
                    open(os.path.join("resources/saved_model_for_App.pkl"),
                         "rb"))
                prediction = predictor.predict([tweet_text])
            elif model_choice == "Support Vector Machine":
                predictor = joblib.load(
                    open(os.path.join("resources/saved_model_for_App.pkl"),
                         "rb"))
                prediction = predictor.predict([tweet_text])
                # st.write(prediction)
            elif model_choice == "Random Forest Tree":
                predictor = joblib.load(
                    open(os.path.join("resources/saved_model_for_App.pkl"),
                         "rb"))
                prediction = predictor.predict([tweet_text])

            #Results displayed on screen after User has clicked the classify button
            final_result = get_keys(prediction, prediction_labels)
            st.success("{}".format(final_result))
Example #27
def generateWordCloud(cursor):
    stopwords = set(STOPWORDS)
    stopwords.update([
        "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4",
        "ab", "able", "about", "above", "abst", "ac", "accordance",
        "according", "accordingly", "across", "act", "actually", "ad", "added",
        "adj", "ae", "af", "affected", "affecting", "affects", "after",
        "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj",
        "al", "all", "allow", "allows", "almost", "alone", "along", "already",
        "also", "although", "always", "am", "among", "amongst", "amoungst",
        "amount", "an", "and", "announce", "another", "any", "anybody",
        "anyhow", "anymore", "anyone", "anything", "anyway", "anyways",
        "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate",
        "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't",
        "arise", "around", "as", "a's", "aside", "ask", "asking", "associated",
        "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax",
        "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before",
        "beforehand", "begin", "beginning", "beginnings", "begins", "behind",
        "being", "believe", "below", "beside", "besides", "best", "better",
        "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn",
        "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu",
        "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can",
        "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce",
        "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj",
        "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes",
        "con", "concerning", "consequently", "consider", "considering",
        "contain", "containing", "contains", "corresponding", "could",
        "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry",
        "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d",
        "d2", "da", "date", "dc", "dd", "de", "definitely", "describe",
        "described", "despite", "detail", "df", "di", "did", "didn", "didn't",
        "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't",
        "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds",
        "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each",
        "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty",
        "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty",
        "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er",
        "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even",
        "ever", "every", "everybody", "everyone", "everything", "everywhere",
        "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far",
        "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find",
        "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed",
        "following", "follows", "for", "former", "formerly", "forth", "forty",
        "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full",
        "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets",
        "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go",
        "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs",
        "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly",
        "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having",
        "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here",
        "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers",
        "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his",
        "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however",
        "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3",
        "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie",
        "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm",
        "immediate", "immediately", "importance", "important", "in",
        "inasmuch", "inc", "indeed", "index", "indicate", "indicated",
        "indicates", "information", "inner", "insofar", "instead", "interest",
        "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn",
        "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv",
        "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just",
        "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known",
        "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later",
        "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest",
        "let", "lets", "let's", "lf", "like", "liked", "likely", "line",
        "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks",
        "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly",
        "make", "makes", "many", "may", "maybe", "me", "mean", "means",
        "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't",
        "mill", "million", "mine", "miss", "ml", "mn", "mo", "more",
        "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu",
        "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2",
        "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly",
        "necessarily", "necessary", "need", "needn", "needn't", "needs",
        "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine",
        "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none",
        "nonetheless", "noone", "nor", "normally", "nos", "not", "noted",
        "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o",
        "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off",
        "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om",
        "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op",
        "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou",
        "ought", "our", "ours", "ourselves", "out", "outside", "over",
        "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3",
        "page", "pagecount", "pages", "par", "part", "particular",
        "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps",
        "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm",
        "pn", "po", "poorly", "possible", "possibly", "potentially", "pp",
        "pq", "pr", "predominantly", "present", "presumably", "previously",
        "primarily", "probably", "promptly", "proud", "provides", "ps", "pt",
        "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv",
        "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily",
        "really", "reasonably", "recent", "recently", "ref", "refs",
        "regarding", "regardless", "regards", "related", "relatively",
        "research", "research-articl", "respectively", "resulted", "resulting",
        "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro",
        "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa",
        "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se",
        "sec", "second", "secondly", "section", "see", "seeing", "seem",
        "seemed", "seeming", "seems", "seen", "self", "selves", "sensible",
        "sent", "serious", "seriously", "seven", "several", "sf", "shall",
        "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's",
        "should", "shouldn", "shouldn't", "should've", "show", "showed",
        "shown", "showns", "shows", "si", "side", "significant",
        "significantly", "similar", "similarly", "since", "sincere", "six",
        "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody",
        "somehow", "someone", "somethan", "something", "sometime", "sometimes",
        "somewhat", "somewhere", "soon", "sorry", "sp", "specifically",
        "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still",
        "stop", "strongly", "sub", "substantially", "successfully", "such",
        "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t",
        "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te",
        "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx",
        "that", "that'll", "thats", "that's", "that've", "the", "their",
        "theirs", "them", "themselves", "then", "thence", "there",
        "thereafter", "thereby", "thered", "therefore", "therein", "there'll",
        "thereof", "therere", "theres", "there's", "thereto", "thereupon",
        "there've", "these", "they", "theyd", "they'd", "they'll", "theyre",
        "they're", "they've", "thickv", "thin", "think", "third", "this",
        "thorough", "thoroughly", "those", "thou", "though", "thoughh",
        "thousand", "three", "throug", "through", "throughout", "thru", "thus",
        "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too",
        "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries",
        "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty",
        "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un",
        "under", "unfortunately", "unless", "unlike", "unlikely", "until",
        "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful",
        "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va",
        "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo",
        "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want",
        "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd",
        "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren",
        "werent", "weren't", "we've", "what", "whatever", "what'll", "whats",
        "what's", "when", "whence", "whenever", "when's", "where",
        "whereafter", "whereas", "whereby", "wherein", "wheres", "where's",
        "whereupon", "wherever", "whether", "which", "while", "whim",
        "whither", "who", "whod", "whoever", "whole", "who'll", "whom",
        "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely",
        "will", "willing", "wish", "with", "within", "without", "wo", "won",
        "wonder", "wont", "won't", "words", "world", "would", "wouldn",
        "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj",
        "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes",
        "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre",
        "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys",
        "yt", "z", "zero", "zi", "zz"
    ])
    # Build a single text string from the most recent articles
    results = SelectRecentArticles(cursor)
    text = ""
    for result in results:
        text += result[0] + " "
    # Generate the word cloud from the combined text
    wordcloud = WordCloud(stopwords=stopwords,
                          background_color="white").generate(text)

    # set figure size
    plt.figure(figsize=(40, 30))
    # Display image
    plt.imshow(wordcloud, interpolation='bilinear')
    # no axis details
    plt.axis("off")
    plt.savefig("../cloud.png")


# generateWordCloud(cursor)
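The commented-out call above is the only hint at how the function is driven. A minimal caller sketch, assuming a SQLite database file (the path articles.db is a placeholder) and that SelectRecentArticles is the project's own query helper:

import sqlite3

# Hypothetical driver: the database path and connection handling below are
# assumptions for illustration, not part of the original project.
conn = sqlite3.connect("articles.db")
cursor = conn.cursor()
generateWordCloud(cursor)
conn.close()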
示例#28
0
async def group_word(context):
    imported_1 = False
    if len(context.parameter) >= 1:
        imported_1 = True
    if not imported:
        try:
            await context.edit("支持库 `jieba` 未安装...\n正在尝试自动安装...")
            await execute(f'{executable} -m pip install jieba')
            await sleep(10)
            result = await execute(f'{executable} -m pip show jieba')
            if len(result) > 0:
                await context.edit('支持库 `jieba` 安装成功...\n正在尝试自动重启...')
                await context.client.disconnect()
            else:
                await context.edit(
                    f"自动安装失败..请尝试手动安装 `{executable} -m pip install jieba` 随后,请重启 PagerMaid-Modify 。"
                )
                return
        except:
            return
    if not imported_ and imported_1:
        try:
            await context.edit("支持库 `paddlepaddle-tiny` 未安装...\n正在尝试自动安装...")
            await execute(f'{executable} -m pip install paddlepaddle-tiny')
            await sleep(10)
            result = await execute(
                f'{executable} -m pip show paddlepaddle-tiny')
            if len(result) > 0 and 'WARNING' not in result:
                await context.edit(
                    '支持库 `paddlepaddle-tiny` 安装成功...\n正在尝试自动重启...')
                await context.client.disconnect()
            else:
                await context.edit(
                    f"自动安装失败,可能是系统不支持..\nAI 分词不可用,切换到基础分词。\n"
                    f"您可以尝试手动安装 `{executable} -m pip install paddlepaddle-tiny` 。"
                )
                await sleep(4)
        except:
            return
    try:
        await context.edit('正在生成中。。。')
    except:
        return
    if not exists("plugins/groupword"):
        makedirs("plugins/groupword")
    if not exists("plugins/groupword/wqy-microhei.ttc"):
        await context.edit('正在拉取中文字体文件。。。(等待时间请评估你的服务器)')
        r = get(
            'https://cdn.jsdelivr.net/gh/anthonyfok/fonts-wqy-microhei/wqy-microhei.ttc'
        )
        with open("plugins/groupword/wqy-microhei.ttc", "wb") as code:
            code.write(r.content)
    words = defaultdict(int)
    count = 0
    try:
        if imported_ and imported_1:
            try:
                jieba.enable_paddle()
            except:
                imported_1 = False
        async for msg in context.client.iter_messages(context.chat, limit=500):
            if msg.id == context.id:
                continue
            if msg.text and not msg.text.startswith(
                    ('/', '-')) and '//' not in msg.text:
                try:
                    if imported_ and imported_1:
                        for word in jieba.cut(msg.text.translate(punctuation),
                                              use_paddle=True):
                            word = word.lower()
                            words[word] += 1
                    else:
                        for word in jieba.cut(msg.text.translate(punctuation)):
                            word = word.lower()
                            words[word] += 1
                    count += 1
                except:
                    pass
    except:
        if count == 0:
            try:
                await context.edit('您已被 TG 官方限制。')
                return
            except:
                return
    try:
        image = WordCloud(
            font_path="plugins/groupword/wqy-microhei.ttc",
            width=800,
            height=400).generate_from_frequencies(words).to_image()
        stream = BytesIO()
        image.save(stream, 'PNG')
    except:
        await context.edit('词云生成失败。')
        return
    try:
        await context.client.send_message(context.chat,
                                          f'对最近的 {count} 条消息进行了分析。',
                                          file=stream.getvalue())
        await context.delete()
    except:
        return
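Stripped of the Telegram plumbing, the core of this plugin is frequency counting over segmented messages. A standalone sketch, assuming jieba is installed and using a made-up messages list in place of the chat history fetched via Telethon:

from collections import defaultdict
from wordcloud import WordCloud
import jieba

# 'messages' stands in for the chat history pulled by the plugin above.
messages = ["今天天气不错", "明天一起去看电影吧", "今天的会议改到下午"]
words = defaultdict(int)
for text in messages:
    for word in jieba.cut(text):
        words[word.lower()] += 1

# Render from the frequency dict, as the plugin does, and save to disk;
# the font path matches the file the plugin downloads, the output name is a placeholder.
image = WordCloud(font_path="plugins/groupword/wqy-microhei.ttc",
                  width=800,
                  height=400).generate_from_frequencies(words)
image.to_file("groupword_sample.png")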
示例#29
0
cut_text = " ".join(jieba.cut(x))  # segment the raw text with jieba
result = jieba.analyse.textrank(cut_text, topK=1000, withWeight=True)  # extract weighted keywords
keywords = dict()

# Build a {keyword: weight} dict from the TextRank results
for i in result:
    keywords[i[0]] = i[1]

d = path.dirname(__file__)  # directory containing this file
color_mask = imread("/hwj/dorahacks/1.jpg")  # read the background/mask image
cloud = WordCloud(
    # set a CJK font, otherwise Chinese characters render as garbled boxes
    font_path="/hwj/dorahacks/STFANGSO.ttf",
    # font_path=path.join(d,'simsun.ttc'),
    width=200,
    height=200,
    # background color
    background_color='white',
    # mask image that defines the cloud's shape
    mask=color_mask,
    # maximum number of words
    max_words=2000,
    # maximum font size
    max_font_size=40)
word_cloud = cloud.generate(cut_text)  # generate the word cloud
word_cloud.to_file("/hwj/dorahacks/user_img.jpg")  # save the image
# display the word cloud image
plt.imshow(word_cloud)
plt.axis('off')
plt.show()

comment_text = x
# jieba segmentation produces a space-separated string; without segmentation a correct Chinese word cloud cannot be generated directly
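Note that x (the raw comment text) is never defined inside this excerpt. A minimal sketch of how it might be prepared before the first line of this example runs, assuming the reviews live in a local UTF-8 text file (the name comments.txt is made up for illustration):

# Hypothetical preparation of x; the file name is a placeholder.
with open("comments.txt", encoding="utf-8") as f:
    x = f.read()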
示例#30
0
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Required for rendering Korean text in matplotlib
from matplotlib import font_manager, rc
import re

font_path = "c:/Windows/Fonts/malgun.ttf"
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False

text = open('ab.txt', 'r', encoding='utf-8').read()
# print(type(text))
## Remove English letters and digits (keep Korean characters only)
text = re.compile('[가-힣]+').findall(text)
# print(type(text))
text = ' '.join(text)  # join the list into a single string with spaces between items
# print(type(text))

wordcloud = WordCloud(font_path=font_path).generate(text)

plt.imshow(wordcloud)
plt.axis("off")
plt.show()
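This example only displays the figure. Mirroring the earlier snippets, the cloud could also be written to disk and tuned with the usual options; the output file name below is a placeholder:

# Optional variant: white background, capped word count, saved to a file.
wordcloud = WordCloud(font_path=font_path,
                      background_color="white",
                      max_words=200).generate(text)
wordcloud.to_file("korean_cloud.png")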