import random
import sqlite3

from wordcloud import WordCloud

# Assumes an open connection to the Japanese WordNet SQLite database
# (the wnjpn.db filename is an assumption; the original defined conn elsewhere).
conn = sqlite3.connect("wnjpn.db")


def SearchSimilarWords(word):
    # Check whether the queried word exists in WordNet.
    cur = conn.execute(f"select wordid from word where lemma='{word}'")
    word_id = 99999999  # sentinel for "not found"
    for row in cur:
        word_id = row[0]

    # Bail out if the word is not in WordNet.
    if word_id == 99999999:
        print(f"「{word}」は、Wordnetに存在しない単語です。")
        return
    else:
        print(f"【「{word}」の類似語はね、以下ですよ】\n")

    # Find every synset (concept) that contains the input word.
    cur = conn.execute(f"select synset from sense where wordid='{word_id}'")
    synsets = []
    for row in cur:
        synsets.append(row[0])

    # For each concept, print its name, its Japanese definitions, and its synonyms.
    no = 1
    l_empty = []
    for synset in synsets:
        cur1 = conn.execute(f"select name from synset where synset='{synset}'")
        for row1 in cur1:
            print("%sつめの概念 : %s" % (no, row1[0]))
        cur2 = conn.execute(
            "select def from synset_def where (synset='%s' and lang='jpn')" % synset)
        sub_no = 1
        for row2 in cur2:
            print("意味%s : %s" % (sub_no, row2[0]))
            l_empty.append(row2[0])  # collect for the word cloud
            sub_no += 1
        cur3 = conn.execute(
            f"select wordid from sense where (synset='{synset}' and wordid!={word_id})"
        )
        sub_no = 1
        for row3 in cur3:
            target_word_id = row3[0]
            cur3_1 = conn.execute(
                f"select lemma from word where wordid={target_word_id}")
            for row3_1 in cur3_1:
                print("類義語%s : %s" % (sub_no, row3_1[0]))
                l_empty.append(row3_1[0])  # collect for the word cloud
                sub_no += 1
        print("\n")
        no += 1

    # Render the collected definitions and synonyms as a word cloud.
    list_str = ",".join(l_empty)
    wordcloud = WordCloud(background_color="white",
                          font_path="./TakaoPGothic.ttf",
                          width=800,
                          height=600).generate(list_str)
    wordcloud.to_file("./wordcloud_sample.png")

    # Build a playful sentence from a randomly chosen synonym.
    answer = random.choice(l_empty)
    l_phrase = [
        f"私はね、{word} とはですね、言ってみればもはや {answer} だと思うんですよ。",
        f"{word} ってことはですよ、{answer} とも考えられるということですよ。",
        f"{word} って、もう {answer} ですよね。",
    ]
    phrase = random.choice(l_phrase)
    print(phrase)
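# A minimal usage sketch for the function above, assuming the Japanese WordNet
# database is available as wired up at the top; the query word is just an example.
if __name__ == "__main__":
    SearchSimilarWords("犬")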
import collections

import jieba
import pandas as pd
from matplotlib.pyplot import imread
from wordcloud import WordCloud


def chinese_jieba(text):
    wordlist_jieba = jieba.cut(text)
    space_wordlist = " ".join(wordlist_jieba)
    return space_wordlist


# Read the CSV file.
df = pd.read_csv('douban_movie.csv')
comment_list = df['comment'].values.tolist()
score_list = df['score'].values.tolist()
text = ""
stopwords = [
    line.strip() for line in open('stop.txt', encoding='UTF-8').readlines()
]
for jj in range(len(comment_list)):
    text = text + chinese_jieba(comment_list[jj])
# Count word frequencies (split on the spaces jieba inserted).
word_counts = collections.Counter(text.split())
print(text)
mask_pic = imread('movie.jpg')
wordcloud = WordCloud(
    font_path="C:/Windows/Fonts/simfang.ttf",  # set the font
    mask=mask_pic,  # background image used as the mask
    background_color="white",  # background color
    max_font_size=150,  # largest font size
    max_words=2000,  # maximum number of words shown
    stopwords=stopwords  # stopwords are excluded from the cloud
).generate(text)  # build the word cloud from the text
image = wordcloud.to_image()
wordcloud.to_file('key.png')
plt.title("Tag distribution", fontsize=20) plt.xlabel("Tag", fontsize=16) ax.set_xticklabels(tags, rotation=90, fontsize=14) plt.ylabel("Number of projects", fontsize=16) plt.show() st.pyplot(plt) # Plot word clouds top top tags plt.figure(figsize=(20, 8)) tag = st.selectbox("Choose a tag", tags, index=0) subset = df[df.tags.apply(lambda tags: tag in tags)] text = subset.text.values cloud = WordCloud( stopwords=STOPWORDS, background_color="black", collocations=False, width=500, height=300, ).generate(" ".join(text)) plt.axis("off") plt.imshow(cloud) st.pyplot(plt) # Preprocessing st.write("---") st.subheader("Preprocessing") filters = st.text_input("filters", "[!\"'#$%&()*+,-./:;<=>?@\\[]^_`{|}~]") lower = st.checkbox("lower", True) stem = st.checkbox("stem", False) text = st.text_input("Input text", "Conditional generation using Variational Autoencoders.") preprocessed_text = data.preprocess(text=text, lower=lower, stem=stem, filters=filters)
#print(dir(tweetblob))

# Filter words
wordsToFilter = [
    "about", "https", "in", "the", "thing", "will", "could", tweetSearch
]
filteredDictionary = dict()
for word in tweetblob.words:
    # skip tiny words
    if len(word) < 2:
        continue
    # skip words with random characters or numbers
    if not word.isalpha():
        continue
    # skip words in our filter
    if word.lower() in wordsToFilter:
        continue
    # skip lower-case words shorter than 5 letters unless fully upper-case
    if len(word) < 5 and word.upper() != word:
        continue
    # Try lower case only, try with upper case!
    filteredDictionary[word.lower()] = tweetblob.word_counts[word.lower()]

# Create the word cloud
wordcloud = WordCloud().generate_from_frequencies(filteredDictionary)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
import numpy as np
import jieba
from PIL import Image
from wordcloud import WordCloud

with open('Data.txt', 'r', encoding='utf-8') as f:
    text = ' '.join(jieba.cut(f.read(), cut_all=False))

wc = WordCloud(
    background_color='black',
    font_path='C:\\Windows\\Fonts\\msyh.ttc',
    width=1080,
    height=2340,
).generate(text)
imageFile = wc.to_image()
imageFile.save("image.png")
imageFile.show()
cursor = db.cursor()
#cursor.execute("insert into nlpt_feedback values(user_input_feedback,(1 if ans="pos" else -1))")
cursor.execute("select feedback from nlpt_feedback where lab=1")
pos = cursor.fetchall()
pos = [x[0] for x in pos]
cursor.execute("select feedback from nlpt_feedback where lab=-1")
neg = cursor.fetchall()
neg = [x[0] for x in neg]
db.close()

postext = " ".join(pos)
negtext = " ".join(neg)
stopwords = set(STOPWORDS)

wc = WordCloud(background_color="white", random_state=42)
wc.generate(postext)
plt.figure()
plt.axis('off')
plt.imshow(wc, interpolation="bilinear")
plt.savefig('images/pos.png', bbox_inches='tight')

wc.generate(negtext)
plt.imshow(wc, interpolation="bilinear")
plt.savefig('images/neg.png', bbox_inches='tight')

db = MySQLdb.connect("127.0.0.1", "root", "spiderman", "feedback")
cursor = db.cursor()
cursor.execute("select feedback from nlpt_feedback where lab=1")
posdata = cursor.fetchall()
cursor.execute("select feedback from nlpt_feedback where lab=-1")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from wordcloud import WordCloud

textwc = ""
with open('hive.txt', encoding='utf-8') as f:
    textwc = ''.join(f.readlines())

wordcloud = WordCloud(
    font_path="/usr/share/fonts/truetype/andika/Andika-R.ttf",
    width=4000,
    height=2000,
    mask=None,
    color_func=None,
    max_words=300,
    min_font_size=12,
    stopwords=None,
    background_color="gray",
    max_font_size=300,
    colormap="gist_heat",
    contour_width=0,
    contour_color="white")
wordcloud.generate(textwc)
wordcloud.to_file('hive_mod.png')
from nltk.corpus import stopwords
from wordcloud import WordCloud

stop = stopwords.words('english')
reviews['Title'] = reviews['Title'].apply(
    lambda x: " ".join(w for w in x.split() if w not in stop))
reviews['Content'] = reviews['Content'].apply(
    lambda x: " ".join(w for w in x.split() if w not in stop))

## Tokenization:

## Word Cloud:
wc = WordCloud(background_color="white", max_words=2000)
wc.generate(' '.join(reviews['Content']))

import matplotlib.pyplot as plt
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
#plt.figure(figsize=(4, 3))
#plt.axis("off")
plt.show()
"""
Generating a square wordcloud from the US constitution using default arguments.
"""
import os
from os import path

from wordcloud import WordCloud

# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text.
text = open(path.join(d, 'constitution.txt')).read()

# Generate a word cloud image
wordcloud = WordCloud().generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
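# Usage note (an addition, not part of the example above): the cloud can also
# be written straight to an image file; the output filename is arbitrary.
wordcloud.to_file(path.join(d, "constitution_cloud.png"))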
def _perform_wordcloud_visualization(self, condition):
    if condition:
        print("Please wait, plotting wordcloud...")
        wc = WordCloud(width=1000, height=400).generate(self.text)
        plt.axis("off")
        plt.imshow(wc)
def calling(file):
    # 1. Open the crawled file read-only (note: the `file` argument is
    #    immediately overwritten here).
    file = open("movie.txt", 'r', encoding='utf-8')
    lines = file.readlines()

    # 2. Store all comments in the variable `reaction`.
    reaction = []
    for line in lines:
        reaction.append(line)
    file.close()

    okt = Okt()

    # 4. Split each sentence into tagged morphemes.
    sentences_tag = []
    for sentence in reaction:
        morph = okt.pos(sentence, norm=True, stem=True)
        sentences_tag.append(morph)

    # 5. Keep only adjectives (excluding a few generic ones) and look up
    #    their polarity in the KNU sentiment lexicon.
    adj_list = []
    finallist = []
    for sentence1 in sentences_tag:
        polarlist = ['None1', 'None2']
        for word, tag in sentence1:
            if tag in ['Adjective'] and ("이다" not in word) and (
                    "아니다" not in word) and ("있다" not in word) and (
                        "없다" not in word) and ("많다" not in word) and (
                            "같다" not in word) and ("그렇다" not in word) and (
                                "이렇다" not in word) and ("어떻다" not in word):
                adj_list.append(word)
                with open('KnuSentiLex-master/data/SentiWord_info.json',
                          encoding='utf-8-sig',
                          mode='r') as f:
                    data = json.load(f)
                result = ['None3', 'None4']
                for i in range(0, len(data)):
                    if data[i]['word'] == word:
                        result.pop()
                        result.pop()
                        result.append(data[i]['word_root'])
                        result.append(data[i]['polarity'])
                r_word = result[0]
                s_word = result[1]
                polarlist.pop()
                polarlist.pop()
                polarlist.append(r_word)
                polarlist.append(s_word)
        polar_word1 = polarlist[0]
        polar_word2 = polarlist[1]
        finallist.append(polar_word2)

    print("-2점 : ", finallist.count('-2'))
    print("-1점 : ", finallist.count('-1'))
    print("0점 : ", finallist.count('0'))
    print("1점 : ", finallist.count('1'))
    print("2점 : ", finallist.count('2'))
    # print("총 데이터 개수 : ", len(finallist))
    a = finallist.count('-2')
    b = finallist.count('-1')
    c = finallist.count('0')
    d = finallist.count('1')
    e = finallist.count('2')
    # Total count, excluding comments that received no sentiment score.
    all = a + b + c + d + e
    # Map polarity -2..2 to weights 2/4/6/8/10 and take the average.
    a2 = a * 2
    b2 = b * 4
    c2 = c * 6
    d2 = d * 8
    e2 = e * 10
    all2 = a2 + b2 + c2 + d2 + e2
    # Star rating.
    starnum = round((all2 / all), 2)
    print("별점 : ", starnum)
    # print("별점(소수점둘째자리까지) : ", "%0.2f" % starnum)

    # 6. Count the frequency of the selected adjectives and keep the top 200.
    counts = Counter(adj_list)
    tags = counts.most_common(200)

    # 7. Build the word cloud.
    wordcloud = WordCloud(font_path='c:/Windows/Fonts/malgun.ttf',
                          background_color='white',
                          width=1200,
                          height=800).generate_from_frequencies(dict(tags))
    fig = plt.figure()
    plt.axis('off')
    plt.imshow(wordcloud)
    # plt.show()
    fig.savefig('./static/images/wordcloud_image.png')
    return starnum
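# Hypothetical usage of the function above: the argument is ignored (the
# function always reads movie.txt), so any placeholder works.
rating = calling("movie.txt")
print("predicted star rating:", rating)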
def grey_color_func(word,
                    font_size,
                    position,
                    orientation,
                    random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % 27  # % random.randint(60, 100)


# This function takes in your text and your mask and generates a wordcloud.
#def generate_wordcloud(words, mask):
word_cloud = WordCloud(width=512,
                       height=512,
                       background_color='white',
                       stopwords=set(STOPWORDS),
                       collocations=False).generate(words)
# figsize is in inches, so keep it small.
plt.figure(figsize=(8, 8), facecolor='white', edgecolor='blue')
plt.imshow(word_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
#plt.show()

# recolor() expects a callable, not an HSL string, so pass the color
# function defined above.
plt.imshow(word_cloud.recolor(color_func=grey_color_func, random_state=3),
           interpolation="bilinear")
plt.axis("off")

plt.figure()
#plt.title("Default colors")
default_colors = word_cloud.to_array()
plt.imshow(default_colors, interpolation="bilinear")
plt.axis("off")
stopwords.add("t") stopwords.add("co") stopwords.add("https") stopwords.add("will") stopwords.add("people") stopwords.add("amp") stopwords.add("time") stopwords.add("got") stopwords.add("now") stopwords.add("got") stopwords.add("say") stopwords.add("getting") stopwords.add("day") stopwords.add("today") stopwords.add("COVID") stopwords.add("vaccine") stopwords.add("COVID19") stopwords.add("CovidVaccine") wordCloud = WordCloud( background_color='white', max_words=500, stopwords=stopwords ) text = df["text"].to_csv() wordCloud.generate(text) print("First batch:") plt.figure(figsize=(18, 18)) plt.imshow(text, interpolation='bilinear') plt.axis('off') plt.show()
import sqlite3

import matplotlib.pyplot as plt
from matplotlib.pyplot import imread
from wordcloud import WordCloud, ImageColorGenerator

conn = sqlite3.connect('data.db')
user = {}
for i in conn.execute("select mid,name from user order by id").fetchall():
    user[i[0]] = i[1]
wordlist = []
for i in conn.execute("select following from relation order by id").fetchall():
    if i[0] in user:
        wordlist.append(user[i[0]])
wl_space_split = " ".join(wordlist)

mask_png = imread("fate.jpeg")
my_wordcloud = WordCloud(
    font_path=r"C:\Windows\Fonts\simhei.ttf",  # the bundled font has no CJK glyphs; use SimHei on Windows
    background_color="white",  # background color
    max_words=500,  # maximum number of words shown
    max_font_size=100,  # largest font size
    random_state=42,
    mask=mask_png,
    width=1000,
    height=860,
    margin=2,
).generate(wl_space_split)

image_colors = ImageColorGenerator(mask_png)
plt.figure()
plt.imshow(my_wordcloud.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
plt.imshow(mask_png, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
my_wordcloud.to_file("wordcloud.png")
emotion_filename = "" emotion = pd.read_csv(emotion_filename) #Emotion wordcloud formation shame = [] disgust = [] joy = [] sadness = [] fear = [] guilt = [] anger = [] default_color = 'grey' wc = WordCloud(width=1600, height=800, collocations=False, relative_scaling=0,\ max_font_size = 100,background_color = 'white', \ ).generate(' '.join(emotion.hashtag).lower()) #Categorization of hashtags for item in emotion.itertuples(): if item.anger != 0: anger.append(item.hashtag) if item.shame != 0: shame.append(item.hashtag) if item.disgust != 0: disgust.append(item.hashtag) if item.joy != 0: joy.append(item.hashtag) if item.sadness != 0: sadness.append(item.hashtag) if item.fear != 0:
from wordcloud import WordCloud, STOPWORDS

comment_word = ""
stopword = set(STOPWORDS)

with open("word.txt", "r+") as file:
    text = file.read().replace("\n", " ")

wc = WordCloud(stopwords=stopword,
               width=792,
               height=507,
               min_font_size=10,
               background_color="Black")

# generate word cloud
wc.generate(text)

# store to file
wc.to_file("wordcloud.png")
print("Successful")
counts = Counter(t1_tokenized)
print("Number of distinct words: " + str(len(counts)))
print("Number of tokens: " + str(len(t1_tokenized)))
print("Number of characters: " + str(len(rawt1)))
print(t1_tokenized)

fdist = FreqDist(t1_tokenized)
fdist.plot(30, cumulative=False)
plt.show()

# In[2]:

word_cloud_dict = Counter(t1_tokenized)
wordcloud = WordCloud(width=1000,
                      height=1000,
                      background_color='white',
                      stopwords=None).generate_from_frequencies(word_cloud_dict)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# In[3]:

stop_words = set(stopwords.words("english"))
filtered_text = []
for w in t1_tokenized:
    if w not in stop_words:
        filtered_text.append(w)
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# main-1
# Open the text file.
text = open('J:/文档e盘/深度学习/03 词云/03 词云/constitution.txt').read()
# Build the word cloud object.
wc = WordCloud().generate(text)
# Display the cloud.
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
# Save to file.
wc.to_file('wordcloud.png')

# main-2
# The example above is English; below is the Chinese case.
text = open('J:/文档e盘/深度学习/03 词云/03 词云/xyj.txt', encoding='UTF-8').read()
wc = WordCloud(font_path='Hiragino.ttf',
               width=800,
               height=600,
               mode='RGBA',
               background_color=None).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
this_topic = this_hotel.loc[idx]
sc = this_topic['Score'].mean()
print("\nAverage Sentiment for", h, "Topic", c, ":", sc)


# Word Clouds
def shades_of_grey(word, font_size, position, orientation, random_state=None,
                   **kwargs):
    # HSL lightness must be in 0-100.
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)


# Word cloud for all words in the reviews
st = set(STOPWORDS)
st.add("hotel")
st.add("room")
st.add("quot")
st.add("one")
st.add("casino")
wc = WordCloud(stopwords=st, width=600, height=400)
s = ""
for i in range(len(comments)):
    s += comments[i]
wc.generate(s)

# Display the word cloud.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.figure()
plt.show()

# From the sentiment words of all words
corpus_sentiment = {}
n_sw = 0
for i in range(n_reviews):
    # Iterate over the terms with nonzero scores
def word_cloud(book_name):
    # !pip install wordcloud
    import nltk
    from konlpy.corpus import kobill
    from konlpy.tag import Twitter
    t = Twitter()
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    import pandas as pd
    import platform
    import io
    import base64
    img = io.BytesIO()

    # Per-OS Korean font handling for matplotlib.
    path = "static/AppleGothic.ttf"  # on Windows, setting this path matters
    from matplotlib import font_manager, rc
    if platform.system() == 'Darwin':
        rc('font', family='AppleGothic')
    elif platform.system() == 'Windows':
        font_name = font_manager.FontProperties(fname=path).get_name()
        rc('font', family=font_name)
    else:
        print('Unknown system... sorry~~~~')

    # Build the word cloud.
    files_ko = kobill.fileids()
    books_all = pd.read_csv('static/books_all.csv')
    doc_ko = books_all[books_all['name'] == book_name].iloc[0].text
    tokens_ko = t.nouns(doc_ko)
    with open('static/project_stopwords.txt', 'r', encoding='utf-8') as f:
        stop_words = f.read().split(' ')
    ko = nltk.Text(tokens_ko)
    ko = [each_word for each_word in ko if each_word not in stop_words]
    ko = nltk.Text(ko)
    data = ko.vocab().most_common(150)

    # for Windows: font_path='c:/Windows/Fonts/malgun.ttf'
    wordcloud = WordCloud(
        font_path='static/AppleGothic.ttf',
        relative_scaling=0.2,
        background_color='white',
    ).generate_from_frequencies(dict(data))
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(img, format='png')
    img.seek(0)
    return base64.b64encode(img.getvalue()).decode()
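# A small usage sketch, assuming a title present in static/books_all.csv:
# the function returns the PNG as a base64 string, which can be embedded
# directly in an HTML <img> tag (e.g. in a Flask template).
img_b64 = word_cloud("어린왕자")  # hypothetical book title
html_snippet = f'<img src="data:image/png;base64,{img_b64}"/>'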
Satisfiedemp = dataemp.loc[dataemp["label"] == 1]
UnSatisfiedemp = dataemp.loc[dataemp["label"] == 0]

# In[34]:

Satisfiedemp.shape

# In[67]:

# Note: isin() drops rows whose whole "pros" text equals one of these words,
# not individual words inside each review.
ignorewords = Satisfiedemp["pros"].isin(
    ['Amazon', 'company', 'work', 'place', 'employee', 'team', 'time'])
satwords = Satisfiedemp.loc[~(ignorewords), "pros"]

# In[68]:

wordcloud = WordCloud().generate(' '.join(satwords))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# In[71]:

ignorewords = UnSatisfiedemp["cons"].isin([
    'management', 'manager', 'employee', 'Amazon', 'customer', 'team', 'time',
    'job',
def find_speaker(name):
    speaker_list = []
    for x in transcript:
        namelen = len(name)
        if x[0:namelen + 1] == (' ' + name) or x[0:namelen] == name:
            speaker_list.append(x)
    speaker_list = ' '.join(speaker_list).split()
    #print(speaker_list)

    stripped_speech = []
    for i in speaker_list:
        if i not in stopwords:
            stripped_speech.append(i)

    speaker_count = {}
    for word in stripped_speech:
        if word in speaker_count.keys():
            speaker_count[word] = speaker_count[word] + 1
        else:
            speaker_count[word] = 1
    d = collections.Counter(speaker_count)
    tot = len(speaker_list)
    top_30 = d.most_common(30)
    print(name + " spoke a total number of " + str(tot) + " words.")
    for word, count in top_30:
        print(word, ": ", count)

    # PLOT BAR CHART of the 30 most common words
    plt.bar([w for w, c in top_30], [c for w, c in top_30])
    plt.title('Word Count')
    plt.xlabel('Word')
    plt.ylabel('Count')
    plt.show()

    # WORD CLOUD
    picture = name + ".jpg"
    char_mask = np.array(Image.open(picture))

    # Create a word cloud image
    wc = WordCloud(background_color="white",
                   max_words=1000,
                   mask=char_mask,
                   stopwords=stopwords,
                   contour_width=3,
                   contour_color='lightgrey')

    # Generate a wordcloud
    text = ' '.join(stripped_speech)
    wc.generate(text)
    print(wc)

    # show
    plt.figure(figsize=[30, 20])
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
print('Test f1 score is {}'.format(
    f1_score(test[category], prediction, average='weighted')))
print('Test precision is {}'.format(
    precision_score(test[category], prediction, average='macro')))
print(confusion_matrix(test[category], prediction))
print("\n")

######
from wordcloud import WordCloud, STOPWORDS

plt.figure(figsize=(40, 25))
# clean
subset = review_random_set[review_random_set.flavour == 1]
text = subset.combined.values
cloud = WordCloud(stopwords=STOPWORDS,
                  background_color='black',
                  collocations=False,
                  width=2500,
                  height=1800).generate(" ".join(text))
plt.axis('off')
plt.title("Flavour", fontsize=40)
plt.imshow(cloud)

review_random_set = pd.read_csv('binary_reviews.csv')
review_random_set = review_random_set.drop(columns=['Unnamed: 0'])

# %%
##### figuring out the sentiment part and representation
copy = review_random_set.copy()
copy['sentiment'] = 1
copy.iloc[500:, 18] = -1
for row in range(0, len(copy)):
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator


def color_func(word, font_size, position, orientation,
               random_state=None, **kwargs):
    # Unused color-function stub (name is a placeholder; the original head was
    # cut off). The cloud below is recolored from the mask image instead.
    return


words = open('cloud.txt')
word_count = []
for line in words.readlines():
    word = line.strip().split(':')
    word_count.append((word[0], int(word[1])))
words.close()

bg_mask = np.array(Image.open('bg.png'))
wc = WordCloud(
    font_path='./font/msyh.ttc',  # set a CJK font, otherwise Chinese renders as boxes
    background_color="white",  # background color
    max_words=2000,  # maximum number of words shown
    mask=bg_mask,
    max_font_size=90,  # largest font size
    random_state=41,
    scale=3)
wc.fit_words(dict(word_count))
image_colors = ImageColorGenerator(bg_mask)
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis('off')
plt.show()
wc.to_file('test.png')
    font_name = font_manager.FontProperties(
        fname="c:/Windows/fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
else:
    print('unknown...')

# data = ko_con_text.vocab().most_common(500)
# data = ko
tmp_data = dict(data)

# plt.figure(figsize=(16,8))
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show()

korea_coloring = np.array(Image.open("bb.jpg"))
image_colors = ImageColorGenerator(korea_coloring)
wordcloud = WordCloud(
    font_path='c:\\windows\\fonts\\NanumGothic.ttf',
    relative_scaling=0.1,
    mask=korea_coloring,
    background_color='white',
    min_font_size=4,
    max_font_size=40,
).generate_from_frequencies(tmp_data)
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud.recolor(color_func=image_colors),
           interpolation="bilinear")
plt.axis("off")
plt.show()
def main(): """Tweet Classifier App with Streamlit """ # Creates a main title and subheader on your page - # these are static across all pages # Creating sidebar with selection box options = [ "Prediction", "Purpose of the App", "Exploratory Data Analysis", "About Global Warming", "Machine Learning Models", "Natural Language Processing" ] selection = st.sidebar.selectbox("Choose Option", options) if selection == "Exploratory Data Analysis": df_senti1 = raw[raw['sentiment'] == 1] tweet_senti1 = " ".join(review for review in df_senti1.message) #create word cloud in eda st.image(img3, width=600, caption="Visualising the climate change threat") st.title("Insight From The Data") st.subheader( "A Representation Of The Most Common Words In Each Sentiment Class" ) sent_groups = st.radio('Sentiment Views:', ( 'Positive, those who believe climate change is a threat', 'Negative sentiment, opposing the belief that climate change is a threat', 'Neutral, an impartial stance on climate change', 'News Report, topical news reported on climate change')) if sent_groups == ( 'Positive, those who believe climate change is a threat'): df_senti1 = clean[clean['sentiment'] == 1] tweet_senti1 = " ".join(review for review in df_senti1.clean_stp_words) # Create and generate a word cloud image: wordcloud_1 = WordCloud( max_font_size=50, max_words=100, background_color="white").generate(tweet_senti1) plt.imshow(wordcloud_1, interpolation='bilinear') #plt.set_title('Tweets under Pro Class 1',fontsize=50) plt.axis('off') plt.show() st.pyplot() if st.checkbox('Interpretation of Diagram, Sentiment 1'): """Common words of interest in pro-sentiment include `To fight`,`to tackle`, `belive in` and `fight climate`. It appears that tweets in this category are providing solutions to fight climate change. Many of the sentiments reflected are related to on Trumps commentary. In the pro sentiment class we find that people do not agree with Trump.')""" if sent_groups == 'News Report, topical news reported on climate change': df_senti_2 = clean[clean['sentiment'] == 2] tweet_senti_2 = " ".join(review for review in df_senti_2.clean_stp_words) # Create and generate a word cloud image: wordcloud_2 = WordCloud( max_font_size=50, max_words=100, background_color="white").generate(tweet_senti_2) plt.imshow(wordcloud_2, interpolation='bilinear') #plt.set_title('Tweets under Pro Class 1',fontsize=50) plt.axis('off') plt.show() st.pyplot() if st.checkbox('Interpretation of Diagram, Sentiment 2') : """Common words news tweets are `Trump, global warming, via`,`Scientists`,`researchers`,`ÈPA` and `report`. This could reveal the sentiment that humans are the cause of climate change because they burn fossil fuels. News reports can be highly influential on overall sentiment as many rely of the media to validate their beliefs. It is evident that the word Trump is most common. 
According to research in the news, the momentum for these sentiments comes from the commentary that president Trump has made about climate change.""" if sent_groups == "Neutral, an impartial stance on climate change": df_senti_0 = clean[clean['sentiment'] == 0] tweet_senti_0 = " ".join(review for review in df_senti_0.clean_stp_words) #Create and generate a word cloud image: wordcloud_0 = WordCloud( max_font_size=50, max_words=100, background_color="white").generate(tweet_senti_0) plt.imshow(wordcloud_0, interpolation='bilinear') #plt.set_title('Tweets under Pro Class 1',fontsize=50) plt.axis('off') plt.axis("off") plt.show() st.pyplot() if st.checkbox('Interpretation of Diagram, Sentiment 0'): """The sentiments in class 0 represents people that are neutral towards climate change. The reason could be that they are not aware of climate change, or do not have enough information, this can be seen by words such as `interviewer`, `Trump`, `think`. Common words in neutral tweets include `care about`,`think`,`maybe`. This could indicate uncerainty toward climate change validity or an apathetic inclination.Interestingly, the appearance of the word ` ignore` tells us that these tweeters find the matter confusing.')""" st.subheader( "**Observe the frequency of the 20 most common words in each class**" ) Pro = clean[clean['sentiment'] == 1] Anti = clean[clean['sentiment'] == -1] Neutral = clean[clean['sentiment'] == 0] News = clean[clean['sentiment'] == 2] common = st.selectbox('Select Sentiment Type', ('Positive', 'Negative', 'Neutral', 'News')) if common == 'Positive': Pro['temp_list'] = Pro['clean_stp_words'].apply( lambda x: str(x).split()) top = Counter( [item for sublist in Pro['temp_list'] for item in sublist]) temp_positive = pd.DataFrame(top.most_common(20)) temp_positive.columns = ['Common_words', 'count'] temp_positive = temp_positive.style.background_gradient( cmap='Greens_r') st.write(temp_positive, width=200) if common == 'Negative': Anti['temp_list'] = Anti['clean_stp_words'].apply( lambda x: str(x).split()) top = Counter( [item for sublist in Anti['temp_list'] for item in sublist]) temp_neg = pd.DataFrame(top.most_common(20)) temp_neg.columns = ['Common_words', 'count'] temp_neg = temp_neg.style.background_gradient(cmap='Greens_r') st.write(temp_neg, width=200) if common == 'News': News['temp_list'] = News['clean_stp_words'].apply( lambda x: str(x).split()) top = Counter( [item for sublist in News['temp_list'] for item in sublist]) temp_news = pd.DataFrame(top.most_common(20)) temp_news.columns = ['Common_words', 'count'] temp_news = temp_news.style.background_gradient(cmap='Greens_r') st.write(temp_news, width=200) if common == 'Neutral': Neutral['temp_list'] = Neutral['clean_stp_words'].apply( lambda x: str(x).split()) top = Counter( [item for sublist in Neutral['temp_list'] for item in sublist]) temp_net = pd.DataFrame(top.most_common(20)) temp_net.columns = ['Common_words', 'count'] temp_net = temp_net.style.background_gradient(cmap='Greens_r') st.write(temp_net, width=200) st.subheader("**A Closer Look At The Data Distribution**") temp = raw.groupby( 'sentiment').count()['message'].reset_index().sort_values( by='message', ascending=False) temp['percentage'] = round( (temp['message'] / temp['message'].sum()) * 100, 0) labels1 = temp['sentiment'] labels = ["Sentiment %s" % i for i in temp['sentiment']] sizes = temp['percentage'] fig1, ax1 = plt.subplots(figsize=(6, 6)) fig1.subplots_adjust(0.3, 0, 1, 1) theme = plt.get_cmap('Greens_r') ax1.set_prop_cycle( "color", [theme(1. 
        _, _ = ax1.pie(sizes, startangle=90, labels=labels1, radius=1800)
        ax1.axis('equal')
        total = sum(sizes)
        plt.legend(loc='upper left',
                   labels=[
                       '%s, %1.1f%%' % (l, (float(s) / total) * 100)
                       for l, s in zip(labels, sizes)
                   ],
                   prop={'size': 7},
                   bbox_to_anchor=(0.0, 1),
                   bbox_transform=fig1.transFigure)
        plt.show()  # Equal aspect ratio ensures that pie is drawn as a circle.
        st.pyplot()  #c, use_container_width=True)
        if st.checkbox('Interpretation of Pie Chart'):
            """More than half of the tweets analysed reflect a belief in
            climate change. Although it is not an overwhelming majority,
            believers are in the majority, and as science offers clearer
            evidence many neutral tweeters could sway their beliefs. Less than
            ten percent of the sample population do not believe in climate
            change. If the sample is a good representation of the population,
            then the market for environmentally friendly or environmentally
            conscious goods and services could be desirable to a fairly large
            sector of the population."""

    if selection == "Purpose of the App":
        st.header(
            "**The Impact Of Climate Change Sentiment And Maximising Profit**")
        img2 = Image.open("Images/gw.jpeg.jpg")
        st.image(img2, width=400, caption="Visualising the climate change threat")
        """This app reveals the overall sentiment toward climate change by
        analysing recent tweets (posts made on the social media application
        Twitter). By understanding how potential consumers view climate
        change, companies can make informed decisions on product development
        and marketing. This app answers the question: do people see climate
        change as a real threat?"""
        st.subheader(
            "A Brief Look At The Raw Data (database of tweets analysed)")
        if st.checkbox('Show raw data'):  # data is hidden if box is unchecked
            st.write(raw[["sentiment", "message"]])  # write the df to the page
        data = pd.DataFrame(raw, columns=['sentiment', 'message'])
        st.write(data.plot(kind='hist', color='green'))
        st.pyplot()
        data = {
            'Sentiment Type': ['-1', '0', '1', '2'],
            'Sentiment Meaning': [
                'Negative sentiment, opposing the belief that climate change is a threat',
                'Neutral, an impartial stance on climate change',
                'Positive, supporting the belief that climate change poses a threat',
                'News Report, topical news reported on climate change'
            ]
        }
        sentiment = pd.DataFrame(data,
                                 columns=['Sentiment Type', 'Sentiment Meaning'])
        sentiment = sentiment.set_index('Sentiment Type')
        st.write(sentiment, width=800)
        st.subheader("**Interpretation Of Sentiment Distribution**")
        """Most of the tweets in the database indicate that a lot of people
        believe climate change is a real threat and is man-made."""
        """Media coverage of climate change concerns substantiates the belief
        that climate change is a real threat. Some tweets in the database show
        that there are people who are neutral on the subject of global
        warming; however, they are vastly outnumbered."""

    if selection == "Machine Learning Models":
        st.header("**Logistic Regression**")
        """The logistic regression algorithm builds a regression model to
        predict the probability that a given data entry belongs to the
        category numbered "1". Logistic regression becomes a classification
        technique only when a decision threshold is brought into the picture;
        the setting of the threshold value depends on the classification
        problem itself. Logistic regression models the data using the sigmoid
        function, which squeezes the range of output values to lie between 0
        and 1. For binary classification, the threshold is usually set to 0.5
        and determines whether an observation belongs to class 0 or 1."""
        logistic_regression = Image.open("Images/logistic_regression.jpg")
        st.image(logistic_regression,
                 caption="Sigmoid function for logistic regression",
                 use_column_width=True)
        """For multiclass classification problems, logistic regression models
        are combined in what is known as the one-vs-rest (OvR) approach. In
        the OvR case, a separate logistic regression model is trained for each
        label that the response variable takes on."""
        st.subheader("Pros and cons of Logistic Regression")
        """ - Easy to implement and very efficient to train."""
        """ - Can overfit when data is unbalanced, and doesn't handle a large
        number of categorical variables well."""
        logistic_reg_perf = Image.open('Images/logistic_reg_perfomance.jpg')
        st.image(logistic_reg_perf, use_column_width=True)

        st.header("**Random Forest**")
        """The building blocks of the random forest model are decision trees.
        Simply put, a decision tree is a flowchart of questions leading to a
        prediction. Random forest is a technique used in modelling predictions
        and behaviour analysis that is built on decision trees: it contains
        many decision trees, each representing a distinct instance of the
        classification of the data input into the random forest. The random
        forest technique considers the instances individually and takes the
        prediction with the majority of votes."""
        """Each decision tree in the forest considers a random subset of
        features when forming questions and only has access to a random set of
        the training data points. This increases diversity in the forest,
        leading to more robust overall predictions, hence the name 'random
        forest'. When it comes time to make a prediction, the random forest
        takes an average of all the individual decision tree estimates."""
        """Each tree in the classification takes input from samples in the
        initial dataset. This is followed by a random selection of features
        (or independent variables), which are used in growing the tree at each
        node. Every tree in the forest is pruned until the end of the
        exercise, when the prediction is reached decisively. Thus the random
        forest enables any classifiers with weak correlations to create a
        strong classifier."""
        decisiontree = Image.open("Images/random_forest.png")
        st.image(decisiontree,
                 caption="Random forest process to predict a label",
                 width=None)
        st.subheader("Pros and cons of the random forest")
        """ - Handles missing values well: missing values are substituted by
        the variable appearing most often in a particular node."""
        """ - Provides some of the highest accuracy among available
        classification methods."""
        """ - Drawbacks are that the random forest classifier requires a lot
        of computational resources, is time-consuming, and is less intuitive
        than other algorithms."""
        random_for_perf = Image.open("Images/random_forest_perf.jpg")
        st.image(random_for_perf, use_column_width=True)

        st.header("Support Vector Machine")
        """A Support Vector Machine (SVM) is a supervised machine learning
        algorithm that can be employed for both classification and regression
        purposes. SVMs are based on the idea of finding a hyperplane that best
        divides a dataset into two classes."""
        """Support vectors are the data points nearest to the hyperplane: the
        points of a data set that, if removed, would alter the position of the
        dividing hyperplane. Because of this, they can be considered the
        critical elements of a data set."""
        """Simply put, a hyperplane is a line that linearly separates and
        classifies a set of data."""
        """The further from the hyperplane a data point lies, the higher the
        probability that it has been classified correctly. Ideally, we want a
        data point to be as far away from the hyperplane as possible, while
        still being on the correct side of it. Whenever new testing data is
        added, the side of the hyperplane it lands on decides the class it is
        assigned to."""
        svm = Image.open("Images/support_vector1.jpg")
        st.image(svm,
                 caption="Hyperplane dividing data points",
                 use_column_width=True)
        st.subheader("Pros and Cons of Support Vector Machines")
        """ - Very accurate and works well on smaller, cleaner datasets."""
        """ - Can be more efficient because it uses a subset of the training
        points."""
        """ - Less effective on noisier datasets with overlapping classes;
        training time can be high, so SVMs are not suitable for larger
        datasets."""
        svm_perf = Image.open("Images/support_vector_perfomance.jpg")
        st.image(svm_perf, use_column_width=True)
        st.header("For more information on algorithm implementation")
        "**Logistic Regression**"
        " https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html"
        "**Random Forest**"
        " https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html"
        "**Support Vector Machines**"
        " https://scikit-learn.org/stable/modules/svm.html"

    # Natural Language Processing page; disabled because the English library
    # slowed down my computer.
    #if selection == 'Natural Language Processing':
    #st.info("Natural Language Processing")
    #tweet_text = st.text_area("Enter Text","Type Here")
    #nlp_task = ["Tokenization","NER","Lemmatization","POS Tags"]
    #task_choice = st.selectbox("Choose NLP Task",nlp_task)
    #if st.button("Analyze"):
    #st.info("Original Text {}".format(tweet_text))
    #docx = nlp(tweet_text)
    #if task_choice == 'Tokenization':
    #result = [ token.text for token in docx ]
    #elif task_choice == 'Lemmatization':
    #result = ["'Token':{},'Lemma':{}".format(token.text,token.lemma_) for token in docx]
    #elif task_choice == 'NER':
    #result = [(entity.text,entity.label_)for entity in docx.ents]
    #elif task_choice == 'POS Tags':
    #st.json(result)
    #if st.button("Tabulize"):
    #docx = nlp(tweet_text)
    #c_tokens = [ token.text for token in docx ]
    #c_lemma = [token.lemma_ for token in docx]
    #c_pos = [word.tag_ for word in docx]
    #new_df = pd.DataFrame(zip(c_tokens,c_lemma,c_pos),columns=['Tokens','Lemma','POS'])
    #st.dataframe(new_df)
    #if st.checkbox("Wordcloud"):
    #wordcloud = WordCloud(max_font_size=30, max_words=100, background_color="orange").generate(tweet_text)
    #plt.imshow(wordcloud,interpolation='bilinear')
    #plt.axis("off")
    #st.pyplot()
    # Building out the "Information" page
    if selection == "About Global Warming":
        st.info("General Information")
        """ # Global Warming in 5 Minutes """
        st.header("Natural Climate Change")
        """ - Throughout its long history, Earth has warmed and cooled time
        and again. Climate has changed when the planet received more or less
        sunlight due to subtle shifts in its orbit, as the atmosphere or
        surface changed, or when the Sun's energy varied. This was all without
        any help from humanity."""
        """ - Earth's temperature begins with the Sun. Roughly 30% of incoming
        sunlight is reflected back into space by bright surfaces like clouds
        and ice. The rest is absorbed by the land, the ocean, and the
        atmosphere. The absorbed solar energy heats our planet and makes it
        habitable."""
        """ - As the rocks, the air, and the seas get warmer, they radiate
        heat energy, which travels into the atmosphere, where it is absorbed
        by water vapour and long-lived greenhouse gases."""
        """ - Greenhouse gases are those gases in the atmosphere that
        influence the Earth's energy balance. The best-known greenhouse gases,
        carbon dioxide (CO₂), methane, and nitrous oxide, can be found
        naturally in low concentrations in the atmosphere."""
        """ - After absorbing heat energy, these greenhouse gases radiate
        energy in all directions. Some of this energy is radiated back towards
        the Earth, further warming the atmosphere and surfaces. This is the
        natural greenhouse effect."""
        """ - Some natural forces that contribute to climate change include
        volcanic eruptions, which pump out clouds of dust and ash that block
        some sunlight. Volcanic debris also includes sulfur dioxide, which
        combines with water vapour and dust in the atmosphere to form sulfate
        aerosols that reflect sunlight away from the Earth, leading to a
        cooling effect."""
        """ - Earth orbital changes: shifts and wobbles in the Earth's orbit
        can trigger changes in climate, such as the beginning and end of ice
        ages."""
        """ - Solar variation is also natural. Although the Sun's energy
        output appears constant from an everyday point of view, small changes
        over an extended period of time can lead to climate changes. Some
        scientists suspect that a portion of the warming in the first half of
        the 20th century was due to an increase in the output of solar
        energy."""
        """ - Scientists constantly measure these natural effects, but none
        can account for the observed trend since 1970. Scientists can only
        account for recent global warming by including the effects of human
        greenhouse gas emissions."""
        image = Image.open("Images/global temperature.jpg")
        st.image(image,
                 caption="Global temperature graph (Image: Global Warming Art)",
                 use_column_width=True)
        st.subheader("Some notable events in the global temperature timeline")
        """Between 1850 and 1890, the mean global temperature was roughly
        13.7°C. This is the period of the First Industrial Revolution. Coal,
        railroads, and land clearing sped up greenhouse gas emissions, while
        better agriculture and sanitation sped up population growth."""
        """Between 1870 and 1910 came the Second Industrial Revolution.
        Fertilizers and other chemicals, electricity, and public health
        further accelerated population growth."""
        """Around 1940, massive output of aerosols from industries and power
        plants contributed to the global cooling trend from 1940 to 1970."""
        """Two major volcanic eruptions, El Chichón in 1982 and Pinatubo in
        1991, pumped sulfur dioxide gas high into the atmosphere. The gas was
        converted into tiny particles that lingered for more than a year,
        reflecting sunlight and shading Earth's surface, causing cooling for
        two to three years."""
        """The 10 warmest years on record have all occurred since 1998, and 9
        of the 10 have occurred since 2005."""
        """Models predict that Earth will warm between 2 and 6 degrees Celsius
        in the next century. When global warming has happened at various times
        in the past two million years, it has taken the planet about 5,000
        years to warm 5 degrees. The predicted rate of warming for the next
        century is at least 20 times faster."""
        """ - Fluctuation in climate is natural, but scientists say
        temperatures are now rising faster than at many other times."""
        """ - Humans have been artificially raising the concentration of
        greenhouse gases in the atmosphere, causing the enhanced greenhouse
        effect."""
        """ - Global warming is the unusually rapid increase in Earth's
        average surface temperature over the past century, primarily due to
        the greenhouse gases released as people burn fossil fuels."""
        """ - According to the IPCC in its 2013 fifth assessment report, there
        is between a 95% and 100% probability that more than half of modern
        warming was due to humans."""
        """ - The recent US fourth national climate assessment found that
        between 93% and 123% of observed 1951-2010 warming was due to human
        activities."""
        """ - Human activities have been linked to global warming: burning
        fossil fuels, which leads to higher carbon dioxide concentrations;
        farming and forestry, including land-use change via agriculture and
        livestock; cement manufacture; and aerosols such as
        chlorofluorocarbons (CFCs)."""
        """ - Greenhouse gases from these activities collect in the atmosphere
        and absorb sunlight and solar radiation that have bounced off the
        Earth's surface. Normally this radiation would escape into space, but
        these pollutants, which can last for years to centuries in the
        atmosphere, trap the heat and cause the planet to get hotter. That is
        what is known as the greenhouse effect."""
        """ - There are natural external causes, such as increases or
        decreases in volcanic activity or solar radiation. For example, every
        11 years or so the Sun's magnetic field flips, which can cause small
        fluctuations in global temperature of up to about 0.2 degrees. On
        longer time scales - tens to hundreds of millions of years -
        geological processes can drive changes in the climate, due to shifting
        continents and mountain building."""
        """ # Evidence of Global Warming 📈 """
        """ - Across the globe, average sea level increased by 3.6 mm per year
        between 2005 and 2015."""
        """ - According to the World Meteorological Organization (WMO), the
        world is about one degree Celsius warmer than before widespread
        industrialisation."""
        """ - Data from NASA's Gravity Recovery and Climate Experiment show
        that the Greenland and Antarctic ice sheets have decreased in mass."""
        st.subheader("Suggested Readings :earth_africa: ")
        st.markdown("https://www.bbc.com/news/science-environment-24021772")
        st.markdown("https://climate.nasa.gov/evidence/")
        st.markdown(
            "https://earthobservatory.nasa.gov/features/GlobalWarming/page2.php")
        st.markdown(
            "https://www.carbonbrief.org/analysis-why-scientists-think-100-of-global-warming-is-due-to-humans")
        # read in word file
        st.subheader("References")
        """1. https://www.newscientist.com/article/dn11639-climate-myths-the-cooling-after-1940-shows-co2-does-not-cause-warming/"""
        st.subheader("Climate change tweet classification")

    # Building out the prediction page
    if selection == "Prediction":
        st.info("Prediction with ML Models")
        # Creating a text box for user input
        tweet_text = st.text_area("Enter Text to Classify ", "Type Here")
        #tweet_text = [tweet_text]
        #tweet_text = st.text_area("Enter Text","Type Here")
        all_ml_models = [
            "Logistic Regression", "Support Vector Machine",
            "Random Forest Tree"
        ]
        model_choice = st.selectbox("Choose ML Model", all_ml_models)
        prediction_labels = {
            "Neutral : This text neither supports nor refutes the belief of man-made climate change": 0,
            "Pro : This text shows belief in man-made climate change": 1,
            "News : This text links to factual news about climate change": 2,
            "Anti : This text shows lack of belief in man-made climate change": -1
        }
        if st.button("Classify"):
            # Note: all three choices currently load the same saved model file.
            if model_choice == "Logistic Regression":
                predictor = joblib.load(
                    open(os.path.join("resources/saved_model_for_App.pkl"),
                         "rb"))
                prediction = predictor.predict([tweet_text])
            elif model_choice == "Support Vector Machine":
                predictor = joblib.load(
                    open(os.path.join("resources/saved_model_for_App.pkl"),
                         "rb"))
                prediction = predictor.predict([tweet_text])
                # st.write(prediction)
            elif model_choice == "Random Forest Tree":
                predictor = joblib.load(
                    open(os.path.join("resources/saved_model_for_App.pkl"),
                         "rb"))
                prediction = predictor.predict([tweet_text])
            # Results displayed on screen after the user clicks Classify.
            final_result = get_keys(prediction, prediction_labels)
            st.success("{}".format(final_result))
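# A minimal sketch (not from the app) of how the saved model loaded by the
# prediction page might be produced: a one-vs-rest logistic regression, as
# described in the "Machine Learning Models" section, trained on a TF-IDF bag
# of words and dumped with joblib. The toy tweets and labels are illustrative
# only, and the output path mirrors the one the app reads.
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

toy_tweets = [
    "climate change is a hoax", "we must act on climate change now",
    "is it getting warmer?", "new study links emissions to warming"
]
toy_labels = [-1, 1, 0, 2]  # same label scheme as prediction_labels above
model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    # One binary logistic regression is fitted per class (one-vs-rest).
    ("clf", LogisticRegression(multi_class="ovr", max_iter=1000)),
])
model.fit(toy_tweets, toy_labels)
joblib.dump(model, "resources/saved_model_for_App.pkl")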
def generateWordCloud(cursor):
    stopwords = set(STOPWORDS)
    stopwords.update([
        "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst",
        "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce",
        "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described",
        "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es",
        "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full",
        "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's",
        "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully",
"how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", 
"since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz" ]) #Generate text results = SelectRecentArticles(cursor) String = "" for result in results: String += result[0] String += " " # generate word cloud wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(String) # set figure size plt.figure(figsize=(40, 30)) # Display image plt.imshow(wordcloud, interpolation='bilinear') # no axis details plt.axis("off") plt.savefig("../cloud.png") # generateWordCloud(cursor)
async def group_word(context):
    imported_1 = False
    if len(context.parameter) >= 1:
        imported_1 = True
    if not imported:
        try:
            await context.edit("支持库 `jieba` 未安装...\n正在尝试自动安装...")
            await execute(f'{executable} -m pip install jieba')
            await sleep(10)
            result = await execute(f'{executable} -m pip show jieba')
            if len(result) > 0:
                await context.edit('支持库 `jieba` 安装成功...\n正在尝试自动重启...')
                await context.client.disconnect()
            else:
                await context.edit(
                    f"自动安装失败..请尝试手动安装 `{executable} -m pip install jieba` 随后,请重启 PagerMaid-Modify 。"
                )
                return
        except:
            return
    if not imported_ and imported_1:
        try:
            await context.edit("支持库 `paddlepaddle-tiny` 未安装...\n正在尝试自动安装...")
            await execute(f'{executable} -m pip install paddlepaddle-tiny')
            await sleep(10)
            result = await execute(
                f'{executable} -m pip show paddlepaddle-tiny')
            if len(result) > 0 and not 'WARNING' in result:
                await context.edit(
                    '支持库 `paddlepaddle-tiny` 安装成功...\n正在尝试自动重启...')
                await context.client.disconnect()
            else:
                await context.edit(
                    f"自动安装失败,可能是系统不支持..\nAI 分词不可用,切换到基础分词。\n"
                    f"您可以尝试手动安装 `{executable} -m pip install paddlepaddle-tiny` 。"
                )
                await sleep(4)
        except:
            return
    try:
        await context.edit('正在生成中。。。')
    except:
        return
    if not exists("plugins/groupword"):
        makedirs("plugins/groupword")
    if not exists("plugins/groupword/wqy-microhei.ttc"):
        await context.edit('正在拉取中文字体文件。。。(等待时间请评估你的服务器)')
        r = get(
            'https://cdn.jsdelivr.net/gh/anthonyfok/fonts-wqy-microhei/wqy-microhei.ttc'
        )
        with open("plugins/groupword/wqy-microhei.ttc", "wb") as code:
            code.write(r.content)
    words = defaultdict(int)
    count = 0
    try:
        if imported_ and imported_1:
            try:
                jieba.enable_paddle()
            except:
                imported_1 = False
        async for msg in context.client.iter_messages(context.chat, limit=500):
            if msg.id == context.id:
                continue
            if msg.text and not msg.text.startswith(
                    '/') and not msg.text.startswith(
                        '-') and not '//' in msg.text:
                try:
                    if imported_ and imported_1:
                        for word in jieba.cut(msg.text.translate(punctuation),
                                              use_paddle=True):
                            word = word.lower()
                            words[word] += 1
                    else:
                        for word in jieba.cut(msg.text.translate(punctuation)):
                            word = word.lower()
                            words[word] += 1
                    count += 1
                except:
                    pass
    except:
        if count == 0:
            try:
                await context.edit('您已被 TG 官方限制。')
                return
            except:
                return
    try:
        image = WordCloud(
            font_path="plugins/groupword/wqy-microhei.ttc",
            width=800,
            height=400).generate_from_frequencies(words).to_image()
        stream = BytesIO()
        image.save(stream, 'PNG')
    except:
        await context.edit('词云生成失败。')
        return
    try:
        await context.client.send_message(context.chat,
                                          f'对最近的 {count} 条消息进行了分析。',
                                          file=stream.getvalue())
        await context.delete()
    except:
        return
cut_text = " ".join(jieba.cut(x)) result = jieba.analyse.textrank(cut_text, topK=1000, withWeight=True) keywords = dict() for i in result: keywords[i[0]] = i[1] d = path.dirname(__file__) # 当前文件文件夹所在目录 color_mask = imread("/hwj/dorahacks/1.jpg") # 读取背景图片 cloud = WordCloud( #设置字体,不指定就会出现乱码 font_path="/hwj/dorahacks/STFANGSO.ttf", # font_path=path.join(d,'simsun.ttc'), width=200, height=200, #设置背景色 background_color='white', #词云形状 mask=color_mask, #允许最大词汇 max_words=2000, #最大号字体 max_font_size=40) word_cloud = cloud.generate(cut_text) # 产生词云 word_cloud.to_file("/hwj/dorahacks/user_img.jpg") #保存图片 # 显示词云图片 plt.imshow(word_cloud) plt.axis('off') plt.show() comment_text = x # 结巴分词,生成字符串,如果不通过分词,无法直接生成正确的中文词云
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Required for Korean text in matplotlib plots.
from matplotlib import font_manager, rc
import re

font_path = "c:/Windows/Fonts/malgun.ttf"
font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False

text = open('ab.txt', 'r', encoding='utf-8').read()
# print(type(text))

# Strip Latin letters and digits (keep Korean only).
text = re.compile('[가-힣]+').findall(text)
# print(type(text))
text = ' '.join(text)  # join the list into a string with spaces between items
# print(type(text))

wordcloud = WordCloud(font_path=font_path).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()