class YaseeStopWords():
    """Mutable collection of stop words seeded from the default word lists.

    The instance keeps its own ``set`` so that adding words never mutates
    ``DEFAULT_STOPWORDS``.  It supports ``in`` tests and iteration.
    """

    # Union of the two default lists defined elsewhere in this module.
    DEFAULT_STOPWORDS = STOPWORDS.union(UCIWC_DEFAULTSTOPWORDS)

    def __init__(self, stopwords: frozenset = None, replace: bool = False):
        """Build the working set.

        :param stopwords: optional extra words; ``None`` means defaults only.
        :param replace: when true, ``stopwords`` replaces the defaults
            instead of being added to them.
        """
        if stopwords is None:
            self.stopwords = set(YaseeStopWords.DEFAULT_STOPWORDS)
        elif replace:
            self.stopwords = set(stopwords)
        else:
            self.stopwords = set(YaseeStopWords.DEFAULT_STOPWORDS)
            self.stopwords.update(stopwords)

    def getStopwords(self) -> frozenset:
        """Return an immutable snapshot of the current stop words."""
        return frozenset(self.stopwords)

    def addStopwords(self, item: "str | Iterable[str] | None" = None):
        """Add one word (a string) or every word of an iterable.

        ``None`` is accepted and ignored.
        """
        if item is None:
            return
        # isinstance (not an exact type check) so that str subclasses are
        # treated as one word instead of being split into characters.
        if isinstance(item, str):
            self.stopwords.add(item)
        else:
            self.stopwords.update(item)

    def __contains__(self, item):
        return item in self.stopwords

    def __iter__(self):
        return iter(self.stopwords)
def plotWordCloud(results_list):
    """Show side-by-side word clouds of the actions and keywords in results.

    :param results_list: iterable of mappings, each with an ``"actions"``
        and a ``"keywords"`` sequence of strings.
    """
    # Join per result first, then across results; empty entries are skipped.
    actions_str = " ".join(
        ' '.join(result["actions"])
        for result in results_list if len(result["actions"]) > 0)
    keywords_str = " ".join(
        ' '.join(result["keywords"])
        for result in results_list if len(result["keywords"]) > 0)

    # Generic verbs that would otherwise dominate the "actions" cloud.
    # "cause" and "place" are separate entries: without the comma Python
    # concatenates the adjacent literals into the single word "causeplace".
    my_stopwords = {
        "try", "keep", "use", "want", "need", "know", "give", "help", "tell",
        "might", "cant", "say", "cause", "place",
    }

    wordcloud_actions = WordCloud(
        stopwords=STOPWORDS.union(my_stopwords)).generate(actions_str)
    wordcloud_keywords = WordCloud().generate(keywords_str)

    fig, axs = plt.subplots(1, 2, figsize=(20, 10))
    for axis, cloud, title in ((axs[0], wordcloud_actions, "Actions"),
                               (axs[1], wordcloud_keywords, "Keywords")):
        axis.imshow(cloud)
        axis.set_title(title, fontsize=20)
        axis.axis("off")
    plt.show()
def word_cloud(data, my_stopwords, background_image, use_col='comment',
               max_fontsize=70, save_path='wordcloud.png',
               font_path='MSYH.TTF', max_words=55):
    """Render a mask-shaped word cloud of ``data[use_col]`` and save it.

    :param data: mapping/DataFrame-like object indexed by ``use_col``.
    :param my_stopwords: iterable of extra stop words.
    :param background_image: path of the image used both as shape mask and
        as colour source.
    :param use_col: key of the text column.
    :param max_fontsize: largest font size used in the cloud.
    :param save_path: where the rendered figure is written.
    :param font_path: font file; the default is a font able to render
        Chinese text.
    :param max_words: maximum number of words drawn.
    :return: None; the figure is written to ``save_path``.
    """
    background_image = plt.imread(background_image)
    wc = WordCloud(
        background_color='white',
        # The mask controls the shape of the cloud; a sharp image works best.
        mask=background_image,
        max_words=max_words,
        collocations=False,
        stopwords=STOPWORDS.union(set(my_stopwords)),
        # Without a suitable font, Chinese characters cannot be displayed.
        font_path=font_path,
        # Adjusted automatically to the image size, so the same value looks
        # different on different masks.
        max_font_size=max_fontsize,
        # Larger is slower; using a bigger, sharper mask image is usually
        # quicker than raising the scale.
        scale=20,
    )
    text = ','.join(map(str, data[use_col]))
    wc.generate(text)

    # Recolour the words with the colours of the mask image.
    image_colors = ImageColorGenerator(background_image)
    wc.recolor(color_func=image_colors)

    plt.imshow(wc)
    plt.axis('off')
    plt.savefig(save_path)
def main():
    """Parse the command line and draw the word cloud it describes.

    Rebinds the module-level ``debug`` and, when a stop-word file is given,
    the module-level ``STOPWORDS`` (extended with one word per line).
    """
    global debug
    global STOPWORDS

    options, args = cmdparameter(sys.argv)
    file = options.filein
    mask_pic = options.mask_pic or None
    stopwords = options.stopwords
    maxwords = options.maxwords
    font = options.font
    max_font_size = options.max_font_size
    output = options.output or file + '.png'
    verbose = options.verbose
    debug = options.debug

    if stopwords:
        # Context manager so the stop-word file is closed after reading.
        with open(stopwords) as stopword_file:
            extra_words = {line.strip() for line in stopword_file}
        STOPWORDS = STOPWORDS.union(extra_words)
        print(STOPWORDS)

    draw_wordcloud(txt=file,
                   output=output,
                   font=font,
                   max_font_size=max_font_size,
                   max_words=maxwords,
                   mask_pic=mask_pic,
                   stopwords=STOPWORDS)
def get_stopwords() -> Set[str]:
    """Return the combined stop words from sklearn, wordcloud and the config.

    Words containing an apostrophe are additionally added as their
    apostrophe-separated fragments so that both wordcloud and sklearn's
    CountVectorizer filter them.

    :return: a new mutable set; the source collections are not modified.
    """
    # Start from copies so neither library constant is mutated and the
    # result is a plain ``set`` whatever the source types are.
    stopwords: Set[str] = set(text.ENGLISH_STOP_WORDS)
    stopwords.update(WCSTOPWORDS)

    # Custom stop words from config and/or bot commands.  Surrounding
    # whitespace ("a, b") and empty entries (empty config value) are dropped.
    user_stopwords = (sw.strip() for sw in CONFIG['stopwords'].split(','))
    stopwords.update(sw for sw in user_stopwords if sw)

    # Collected into a list first because a set must not change size while
    # it is being iterated.
    fragments = [
        part
        for sw in stopwords if '\'' in sw
        for part in sw.split('\'') if part
    ]
    stopwords.update(fragments)
    return stopwords
def config_stopwords(self, more_stopwords=None):
    """Extend the default stop words with ``more_stopwords``.

    Stores ``STOPWORDS`` united with ``more_stopwords`` on
    ``self.STOPWORDS``.  When ``more_stopwords`` is ``None`` nothing is
    assigned.
    """
    if more_stopwords is None:
        return
    self.STOPWORDS = STOPWORDS.union(more_stopwords)
def title_wordcloud(dataFrame):
    """Draw, save and show a Spotify-logo-shaped word cloud of track names.

    :param dataFrame: pandas DataFrame with a ``track_name`` column.
    :return: None; the figure is saved as ``project3_wordcloud.png``.
    """
    from wordcloud import WordCloud, STOPWORDS
    from PIL import Image

    # Missing or non-string titles would make str.join raise TypeError.
    titles = dataFrame['track_name'].dropna().astype(str)
    text = " ".join(titles)

    # Release markers that say nothing about the title itself.
    stopwords = STOPWORDS.union(
        ["feat", "Remix", "Edit", "Radio", "Version", "Mix", "Remastered"])

    # Load the mask and close the image file right away.
    with Image.open("spotify-logo.jpg") as logo:
        spotify_mask = np.array(logo)

    wordcloud = WordCloud(width=2880, height=1800, background_color="white",
                          stopwords=stopwords,
                          mask=spotify_mask).generate(text)

    # Open a plot of the generated image.
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig("project3_wordcloud.png")
    plt.show()
def plot_word2(text):
    """Render a word cloud of ``text`` into the Streamlit page."""
    combined_stopwords = STOPWORDS.union(set(stwlist))
    cloud = WordCloud(
        stopwords=combined_stopwords,
        max_words=200,
        max_font_size=120,
        font_path="simsun.ttf",
        random_state=0,
    )
    cloud.generate(text)

    # Draw on an explicit figure so it can be handed to Streamlit.
    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)
def get_polarity_and_wordcloud(tweets, NoofTweets):
    """Classify tweet sentiment and show a word cloud of all the tweets.

    :param tweets: iterable of objects with ``text`` and ``created_at``.
    :param NoofTweets: denominator passed to ``convert_percentage``.
    :return: tuple ``(Positive, Negative, Neutral)`` as produced by
        ``convert_percentage``.
    """
    # Mentions, non-alphanumeric characters, URLs, a leading "RT" and the
    # " co " fragment are replaced by a space.
    noise_pattern = (
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^RT | co | ")

    positive = 0
    negative = 0
    neutral = 0
    cleaned_tweets = []

    print()
    print('THE TWEETS ARE:')
    print()
    for tweet in tweets:
        # split/join collapses the whitespace runs left by the substitution.
        tweet_tokenisation = ' '.join(
            re.sub(noise_pattern, " ", str(tweet.text)).split())
        cleaned_tweets.append(tweet_tokenisation)
        print(tweet.created_at)
        print(tweet_tokenisation)

        # Sentiment is computed once per tweet, on the raw text.
        score = TextBlob(tweet.text).sentiment.polarity
        if score == 0:
            neutral += 1
        elif score < 0:
            negative += 1
        elif score > 0:
            positive += 1

    Positive = convert_percentage(positive, NoofTweets)
    Negative = convert_percentage(negative, NoofTweets)
    Neutral = convert_percentage(neutral, NoofTweets)

    print()
    print("WORDCLOUD:")
    print()
    # Adding stopwords as a part of text preprocessing.
    more_stopwords = {'oh', 'will', 'hey', 'yet', 'RT'}
    STOPWORDS_MOD = STOPWORDS.union(more_stopwords)

    # The cloud covers every tweet, not only the last one of the loop.
    corpus = ' '.join(cleaned_tweets)
    if corpus:
        cloud = WordCloud(width=1800,
                          height=1400,
                          background_color='black',
                          stopwords=STOPWORDS_MOD).generate(corpus)
        plt.imshow(cloud)
        plt.axis('off')
        plt.tight_layout(pad=0)
        plt.show()
    else:
        print("No text available for the word cloud.")

    return Positive, Negative, Neutral
def title_wordcloud(dataFrame):
    """Draw, save and show a Spotify-logo-shaped word cloud of track names.

    :param dataFrame: pandas DataFrame with a ``track_name`` column.
    :return: None; the figure is saved as ``project3_wordcloud.png``.
    """
    from wordcloud import WordCloud, STOPWORDS
    from PIL import Image

    # Missing or non-string titles would make str.join raise TypeError.
    titles = dataFrame['track_name'].dropna().astype(str)
    text = " ".join(titles)

    # Release markers that say nothing about the title itself.
    stopwords = STOPWORDS.union(
        ["feat", "Remix", "Edit", "Radio", "Version", "Mix", "Remastered"])

    # Load the mask and close the image file right away.
    with Image.open("spotify-logo.jpg") as logo:
        spotify_mask = np.array(logo)

    wordcloud = WordCloud(width=2880,
                          height=1800,
                          background_color="white",
                          stopwords=stopwords,
                          mask=spotify_mask).generate(text)

    # Open a plot of the generated image.
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig("project3_wordcloud.png")
    plt.show()
def main():
    """Parse the command line and draw the word cloud it describes.

    Rebinds the module-level ``debug`` and, when a stop-word file is given,
    the module-level ``STOPWORDS`` (extended with one word per line).
    Diagnostics go to standard error.
    """
    global debug
    global STOPWORDS

    options, args = cmdparameter(sys.argv)
    file = options.filein
    mask_pic = options.mask_pic or None
    stopwords = options.stopwords
    maxwords = options.maxwords
    font = options.font
    max_font_size = options.max_font_size
    output = options.output or file + '.png'
    verbose = options.verbose
    debug = options.debug

    if stopwords:
        # Context manager so the stop-word file is closed after reading.
        with open(stopwords) as stopword_file:
            extra_words = set(line.strip() for line in stopword_file)
        STOPWORDS = STOPWORDS.union(extra_words)
        # sys.stderr.write behaves the same on Python 2 and 3, whereas
        # "print >> sys.stderr" raises TypeError on Python 3.
        sys.stderr.write("%s\n" % (STOPWORDS,))

    draw_wordcloud(txt=file,
                   output=output,
                   font=font,
                   max_font_size=max_font_size,
                   max_words=maxwords,
                   mask_pic=mask_pic,
                   stopwords=STOPWORDS)

    if verbose:
        sys.stderr.write(
            "--Successful %s\n" % strftime(timeformat, localtime()))
def plot_cloud(text):
    """Render a word cloud of ``text`` and return it as PNG bytes.

    :param text: source text for the cloud.
    :return: the encoded PNG image as ``bytes``.
    """
    stopwords = STOPWORDS.union(common_words)
    wordcloud = WordCloud(background_color="white",
                          width=2400,
                          height=1200,
                          stopwords=stopwords,
                          max_words=300).generate(text)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    fig = plt.gcf()
    try:
        fig.set_size_inches(18.5, 10.5)
        canvas = FigureCanvas(fig)
        png_output = BytesIO()
        canvas.print_png(png_output)
        return png_output.getvalue()
    finally:
        # Without this every call leaves its figure registered in pyplot:
        # memory grows and the next call draws on top of the old image.
        plt.close(fig)
def make_wordcloud(text,
                   outfile,
                   custom_sw=None,
                   reduction=None,
                   figure_size=(20, 10),
                   display=False):
    """Generate a word cloud of ``text`` and save it to ``outfile``.

    :param text: source text.
    :param outfile: path of the image to write; missing parent directories
        are created.
    :param custom_sw: optional iterable of stop words added to the defaults.
    :param reduction: optional argument forwarded to ``replace_strings`` to
        rewrite ``text`` before the cloud is built.
    :param figure_size: matplotlib figure size in inches.
    :param display: also show the figure when true.
    """
    # Ensure the output directory exists.  A bare file name has no
    # directory part, and os.makedirs('') would raise.
    directory = os.path.dirname(outfile)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    # Join default and custom stop words; fall back to the defaults so that
    # ``sw`` is always bound.
    sw = STOPWORDS if custom_sw is None else STOPWORDS.union(custom_sw)

    if reduction is not None:
        text = replace_strings(text, reduction)

    wordcloud = WordCloud(max_font_size=60,
                          stopwords=sw,
                          background_color='black').generate(text)
    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.savefig(outfile, bbox_inches='tight')
    if display:
        plt.show()
from pathlib import Path
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from clean import CleanText

# Load the tweets and keep only the text column.
nike_tweets = pd.read_csv(
    Path(__file__).absolute().parent.joinpath(
        '../dataset/5000-justdoit-tweets-dataset/justdoit_tweets_2018_09_07_2.csv'
    ))
nike_tweets = nike_tweets[['tweet_full_text']]

# list.append returns None, so the list is built by concatenation instead.
exclude = stopwords.words('english') + ['https']
cleaner = CleanText()
words_to_exclude = {'https'}

# One long string holding every tweet, separated by single spaces.
tweet_string = nike_tweets.tweet_full_text.str.cat(sep=' ')
whitelist = ["n't", "not", "no"]
print(tweet_string)
print(stopwords.words('english'))

wc = WordCloud(width=1600,
               height=800,
               max_font_size=200,
               ranks_only="frequency",
               stopwords=STOPWORDS.union(words_to_exclude),
               collocations=False).generate(tweet_string)
plt.figure(figsize=(12, 10))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
# Twitter API docs:
# https://dev.twitter.com/docs/api/1/get/search
query = twitter.search.tweets(q="modi", count=5000)

# Report how long the query took.
elapsed_seconds = query["search_metadata"]["completed_in"]
print("Search complete (%.3f seconds)" % (elapsed_seconds))

# Build one corpus string from the text of every returned tweet.
status_list = []
for result in query['statuses']:
    status_list.append(result['text'])
corpus = ' '.join(status_list)

# The image provides both the mask shape and the word colours.
img = Image.open("modi.jpg")
modi_coloring = np.array(img)
image_colors = ImageColorGenerator(modi_coloring)

# Twitter/URL noise words on top of the default stop words.
STOPWORDS = STOPWORDS.union({
    "http", "https", "t", "co", "rt", "since", "towards", "now", "ok",
    "okay", "tag", "amp"
})

wc = WordCloud(
    font_path='cabin-sketch.bold.ttf',
    background_color="white",
    max_words=2000,
    mask=modi_coloring,
    color_func=image_colors,
    stopwords=STOPWORDS,
)
wc.generate(corpus)
wc.to_file("wc_color.png")
import pandas as pd
import io
import os

texts = {}
df = pd.read_csv("train_set.csv", sep="\t")
categories = ['Business', 'Politics', 'Film', 'Football', 'Technology']

# Frequent but uninformative words, added to the default stop words.
my_stop_words = [
    'will', 'one', 'two', 'four', 'new', 'now', 'day', 'year', 'month',
    'week', 'ago', 'late', 'little', 'many', 'said', 'last', 'time', 'first',
    'second', 'make', 'say', 'saying', 'may', 'maybe', 'long', 'short', 'use',
    'says', 'old', 'made', 'today', 'back', 'face', 'believe', 'around',
    'become', 'th', 'high'
]
stop_words = STOPWORDS.union(my_stop_words)

if not os.path.exists("Images"):
    os.makedirs("Images")

for i in categories:
    # ``.ix`` has been removed from pandas; ``.loc`` performs the same
    # boolean row selection.
    texts[i] = df.loc[df['Category'] == i]['Content']
    texts[i] = texts[i].to_string(header=False)
    texts[i] = texts[i].replace('\n', ' ').replace('\r', '')
    wordcloud = WordCloud(max_font_size=50,
                          min_font_size=2,
                          max_words=500,
                          stopwords=stop_words,
                          background_color="white",
                          relative_scaling=.4).generate(texts[i])
#
# "Learning Python and Data Science by Following Along" (Saengneung
# Publishing, 2020)
# 9.7 Word clouds that show information at a glance, p. 232
#
from wordcloud import WordCloud, STOPWORDS

# Build a word cloud with the stop words excluded.
extra_stop_words = {'one', 'using', 'first', 'two', 'make', 'use'}
s_words = STOPWORDS.union(extra_stop_words)
wordcloud = WordCloud(
    width=2000,
    height=1500,
    stopwords=s_words,
).generate(text)
fontsize=14) axes.set_xlabel("User ", fontsize=14) fig.tight_layout() plt.savefig("data/tweet_frequency_user_wise.jpg") ## 3. Get follower count for users dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d %H:%M:%S') user_data = pd.read_csv( './data/tweet_users.csv', sep="\t" ) #, parse_dates=['created_at'], date_parser=dateparse, dtype={'hashtags':str}) #print("Read the data. Its columns are:\n " , re.sub("[ ]+",":", str(user_data.dtypes).replace("\n", ",\t") ) ) fig, axes = plt.subplots(figsize=(15, 5)) user_data.set_index('name')[['followers_count']].plot(ax=axes, kind='bar') ##plt.setp(axes[0].get_xticklabels(), visible=False) plt.title("Visualizing Tweet frequency: Year wise, and User-wise", fontsize=16) axes.set_xlabel("Twitter user name", fontsize=16) fig.tight_layout() plt.savefig("data/followers.jpg") ##4 . A word cloud to see the important words at play here from wordcloud import WordCloud, STOPWORDS text = "\n".join([ft for ft in data.fulltext]) wordcloud = WordCloud(relative_scaling=1.0, stopwords=STOPWORDS.union(["https", "co", "rt"])).generate(text) fig, axes = plt.subplots(figsize=(15, 6)) plt.imshow(wordcloud) plt.axis("off"), plt.savefig("data/wordcloud.jpg") #plt.show()\n",
# Append each record's content (column 3) to the running text of its
# category (column 4).
row_count = A.shape[0]
for i in range(row_count):
    category = A[i][4]
    if category == "Politics":
        text_p += " " + A[i][3]
    elif category == "Film":
        text_f += " " + A[i][3]
    elif category == "Football":
        text_ft += " " + A[i][3]
    elif category == "Technology":
        text_t += " " + A[i][3]
    elif category == "Business":
        text_b += " " + A[i][3]

# wordcloud defaults + custom words + ENGLISH_STOP_WORDS.
my_additional_stop_words = STOPWORDS.union([
    'people', 'said', 'did', 'say', 'says', 'year', 'day', 'just', 'good',
    'come', 'make', 'going', 'having', 'like', 'need', 'given', 'got'
])
stopwords = ENGLISH_STOP_WORDS.union(my_additional_stop_words)

# One cloud per category, in the original order.
for category_text, category_title in (
        (text_p, "Politics"),
        (text_f, "Films"),
        (text_ft, "Football"),
        (text_t, "Technology"),
        (text_b, "Business"),
):
    create_WordCloud(category_text, stopwords, category_title)
event_ = PSTAT.event_ # Construct corpus: to lower case, strip numeric corpus = {} events = [26, 27] #[16, 83] for event in events: docs = keydev['events'].find( {'keydeveventtypeid': {'$eq': event}}, {'_id': 0}) corpus[event] = [re.sub(r'\b\w*[\d]\w*\b', ' ', " ".join( d[k] for k in ['headline', 'situation'])).lower() for d in docs] DataFrame({'description': [event_[event] for event in corpus.keys()], 'count': [len(lines) for lines in corpus.values()]}, index=corpus.keys()) # Tokenize, and remove stopwords stop_words = STOPWORDS.union(['co', 'ltd', 'mr', 'mrs', 'inc', 'llc']) for event, lines in corpus.items(): corpus[event] = [[w for w in re.findall(r"\w\w+", line) if w not in stop_words] for line in lines] # Split shuffled into labelled training and test sets train_data = [] test_data = [] split_frac = 0.9 for label, (event, lines) in enumerate(corpus.items()): np.random.shuffle(lines) n = int(split_frac * len(lines)) # split point of train and test sets train_data.extend([(label, corpus[event][p]) for p in range(n)]) test_data.extend([(label, corpus[event][p]) for p in range(n, len(lines))]) N = len(train_data) print('train/test:', N, [np.mean([label for label,_ in subset])
from wordcloud import WordCloud, STOPWORDS
import wikipediaapi
import matplotlib.pyplot as plt

wiki = wikipediaapi.Wikipedia('en')
page = wiki.page('UNESCO')

# union() returns a new set and treats every argument as an iterable, so
# the words are passed as one collection and the result is kept.  Passing
# bare strings would add their individual characters, and discarding the
# result would leave the stop words unchanged.
stopwords = STOPWORDS.union(
    {'work', 'literature', 'call', 'October', 'State', 'de', 'General'})

wordcloud = WordCloud(font_path='font/NanumGothic.ttf',
                      stopwords=stopwords,
                      width=2000,
                      height=2000).generate(page.summary)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.show()
import scipy tweet_file = open(username + "_tweets_file.bin",'rb') tweets = pickle.load(tweet_file) def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs): return "hsl(0, 0%%, %d%%)" % random.randint(60, 100) words = ' ' for tweet in tweets: words += tweet.text stopwords = {'https',"co","RT"} wordcloud = WordCloud( stopwords=STOPWORDS.union(stopwords), background_color='black', max_words=500, width=7000, height=7000 ).generate(words) plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3)) plt.axis('off') plt.savefig('./tweetcloud2.png', dpi=300) plt.show() """Top Hashtags""" hashtags_dict = {} for tweet in tweets:
def index():
    """Handle the hashtag search form and render a word cloud of the tweets.

    GET requests, invalid hashtags, a non-numeric tweet count and searches
    that yield no usable words all render ``index.html``.  Otherwise the
    100 most common words of the fetched tweets are drawn into
    ``static/temporary_files/fig100.png`` and ``search.html`` is rendered.
    """
    if request.method != 'POST':
        return render_template("index.html")

    hashtag_name = request.form['hashtag']
    number = request.form['number']
    splitted_hashtags = [ht.strip() for ht in re.split(", ", hashtag_name)]
    if not check_if_hashtags_are_valid(splitted_hashtags):
        return render_template("index.html")

    # The count comes from user input; a non-numeric value must not turn
    # into a server error.
    try:
        tweet_limit = int(number)
    except ValueError:
        return render_template("index.html")

    results = list(
        tweepy.Cursor(api.search, q=splitted_hashtags,
                      lang="en").items(tweet_limit))
    data_set = tweets_df(results)

    # Strip links and drop duplicate tweets, then use THAT cleaned text.
    # (Joining the untouched column would keep links and duplicates.)
    data_set['text'] = [
        ' '.join(word for word in tweet_text.split()
                 if not word.startswith('https:'))
        for tweet_text in data_set['text']
    ]
    data_set.drop_duplicates('text', inplace=True)
    data_set.reset_index(drop=True, inplace=True)
    text_Combined = " ".join(data_set['text'].values.astype(str))

    more_stopwords = {
        'https', 'RT', 'rt', 'CO', '@', 'el', 't', '&', 'covid', 'covid 19',
        hashtag_name, hashtag_name[1:], '#covid19', 'tco', 'covid19', 'amp',
        '@drericding'
    }
    # Words are lower-cased before the membership test, so the stop words
    # must be lower-cased too (e.g. 'RT', 'CO', a capitalised hashtag).
    stopwords = {word.lower() for word in STOPWORDS.union(more_stopwords)}

    # Remove the punctuation characters in one pass per word.
    strip_punctuation = str.maketrans("", "", ".,:\"!“‘*")
    word_counter = collections.Counter()
    for raw_word in text_Combined.lower().split():
        word = raw_word.translate(strip_punctuation)
        if word not in stopwords:
            word_counter[word] += 1

    # Text made of the 100 most common words; split/join drops empty words.
    top_words = [word for word, _count in word_counter.most_common(100)]
    covid = " ".join(" ".join(top_words).split())
    if not covid:
        # WordCloud.generate raises on empty text (no tweets found).
        return render_template("index.html")

    # Create a Word Cloud
    wc = WordCloud(background_color="White",
                   stopwords=STOPWORDS.union(more_stopwords),
                   width=600,
                   height=300,
                   relative_scaling=0,
                   max_words=50)
    wc.generate(covid)
    wc.to_file('static/temporary_files/fig100.png')
    full_filename = os.path.join(app.config['UPLOAD_FOLDER'], 'fig100.png')
    return render_template("search.html", image=full_filename)
def load_file2list(filename):
    """Return the first space-separated token of every line of ``filename``.

    Blank lines yield an empty string.

    :param filename: path of a text file.
    :return: list with one entry per line.
    """
    with open(filename, 'r') as fn:
        return [line.strip('\n').split(' ')[0] for line in fn]


if __name__ == '__main__':
    # Context manager so the text file is closed after reading.
    with open(args.text) as text_file:
        text = text_file.read()
    font_path = args.font_path
    jieba_result = jieba_processing_txt(text)

    # Default stop words plus the words listed in the stop-word file.
    mask_list = load_file2list(args.stop_words)
    mask_word = STOPWORDS.union(mask_list)

    wc = WordCloud(font_path=font_path,
                   background_color="white",
                   max_words=400,
                   max_font_size=400,
                   width=2000,
                   height=1000,
                   stopwords=mask_word)
    wc.generate(jieba_result)
    wc.to_file(args.output_image)
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs): return "hsl(0, 0%%, %d%%)" % random.randint(60, 100) words = ' ' for tweet in tweets: words += tweet.text stopwords = {'https', "co", "RT"} wordcloud = WordCloud(stopwords=STOPWORDS.union(stopwords), background_color='black', max_words=500, width=7000, height=7000).generate(words) plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3)) plt.axis('off') plt.savefig('./tweetcloud2.png', dpi=300) plt.show() """Top Hashtags""" hashtags_dict = {} for tweet in tweets: hashtags = tweet.entities.get('hashtags') for hashtag in hashtags:
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt
import re

filename = "/Users/chunmeiGao/Documents/Dataincubator/emailsubject.txt"

# Read the whole text; the context manager closes the file afterwards.
with open(filename) as subject_file:
    text = subject_file.read()
# The call form prints the same on Python 2 and Python 3.
print(text)

# Drop reply/forward markers and the company name, in the original order.
for marker in ('Re:', 'RE:', 'FW:', 'Fwd:', 'Enron'):
    text = re.sub(marker, '', text)

more_stopwords = {'X', 'Re', 'Fwd', 'ENRON', 'NA', 'FW'}
STOPWORDS = STOPWORDS.union(more_stopwords)

# Generate a word cloud image
wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)

# Display the generated image the matplotlib way:
plt.imshow(wordcloud)
plt.axis("off")

# take relative word frequencies into account, lower max_font_size
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
def plotly_wordcloud(text):
    """Build a Plotly figure that draws ``text`` as a word cloud.

    The layout is computed by ``WordCloud`` and each word is re-drawn as a
    text marker of a Plotly scatter trace.

    :return: the ``go.Figure``.
    """
    palette = ["#000000", "#111111", "#101010", "#121212", "#212121",
               "#222222"]
    cmap = LinearSegmentedColormap.from_list("mycmap", palette)
    wc = WordCloud(
        stopwords=set(STOPWORDS.union(set(stwlist))),
        max_words=300,
        max_font_size=120,
        colormap=cmap,
        random_state=0,
    )
    wc.generate(text)

    # Each layout entry is ((word, freq), fontsize, position, orientation,
    # color); only the fields used below are extracted.
    entries = [
        (word, freq, position, color)
        for (word, freq), _fontsize, position, _orientation, color
        in wc.layout_
    ]
    word_list = [entry[0] for entry in entries]
    freq_list = [entry[1] for entry in entries]
    color_list = [entry[3] for entry in entries]

    # get the positions
    x = [entry[2][0] for entry in entries]
    y = [entry[2][1] for entry in entries]

    # Font sizes derived from the relative occurrence frequencies.
    new_freq_list = [freq * 150 + 8 for freq in freq_list]

    hover_labels = [
        '{0}{1}'.format(w, f) for w, f in zip(word_list, freq_list)
    ]
    trace = go.Scatter(x=x,
                       y=y,
                       textfont=dict(size=new_freq_list, color=color_list),
                       hoverinfo='text',
                       hovertext=hover_labels,
                       mode='text',
                       text=word_list)

    hidden_axis = {
        'showgrid': False,
        'showticklabels': False,
        'zeroline': False
    }
    layout = go.Layout({
        'xaxis': dict(hidden_axis),
        'yaxis': dict(hidden_axis)
    })

    fig = go.Figure(data=[trace], layout=layout)
    fig.update_layout(plot_bgcolor='#D3DFE2')
    return fig
def stop_words_configs(self):
    """Store the default stop words plus the event-specific extras.

    The result is assigned to ``self.STOPWORDS``.
    """
    self.STOPWORDS = STOPWORDS.union({'innojam', 'video', 'cebit2014'})
import numpy as np
import pandas as pd
import collections
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

other_stopwords_to_remove = ['abracadabra', 'etc']
STOPWORDS = STOPWORDS.union(set(other_stopwords_to_remove))
stopwords = set(STOPWORDS)

data = pd.read_csv("file.csv")

# Keep the comments of the 'DDT' rows only, and join their actual values.
# str() of a Series is its truncated display form (index, "...", dtype
# footer), not the text.
comments = data.loc[data['Name'] == 'DDT', 'comments'].dropna().astype(str)
text = " ".join(comments)

wordcloud = WordCloud(width=800,
                      height=800,
                      background_color='white',
                      max_words=2000,
                      stopwords=stopwords,
                      min_font_size=10).generate(text)

# Arguments of WordCloud
# ['self', 'font_path', 'width', 'height', 'margin', 'ranks_only',
#  'prefer_horizontal', 'mask', 'scale', 'color_func', 'max_words',
#  'min_font_size', 'stopwords', 'random_state', 'background_color',
#  'max_font_size', 'font_step', 'mode', 'relative_scaling', 'regexp',
#  'collocations', 'colormap', 'normalize_plurals']
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
def get_stopwords():
    """Return the stop words used for Spanish text.

    The result is the union of ``STOPWORDS``, the hand-maintained list
    below and ``stopwords.words('spanish')``.
    """
    # Hand-maintained additions.  Repeated entries are harmless because
    # the words end up in a set.
    custom_words = [
        "a", "actualmente", "adelante", "además", "afirmó", "agregó", "ahí",
        "ahora", "cc", "this", "pa", "a", "b", "c", "d", "e", "f", "g", "h",
        "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
        "w", "x", "y", "z", "al", "algo", "algún", "algún", "alguna",
        "algunas", "alguno", "algunos", "alrededor", "ambos", "ampleamos",
        "añadió", "ante", "anterior", "antes", "apenas", "aproximadamente",
        "aquel", "aquellas", "aquellos", "aqui", "aquí", "arriba", "aseguró",
        "así", "atras", "aún", "aunque", "ayer", "bajo", "bastante", "bien",
        "buen", "buena", "buenas", "bueno", "buenos", "cada", "casi", "cerca",
        "cierta", "ciertas", "cierto", "ciertos", "cinco", "comentó", "como",
        "cómo", "con", "conocer", "conseguimos", "conseguir", "considera",
        "consideró", "consigo", "consigue", "consiguen", "consigues",
        "contra", "cosas", "creo", "cual", "cuales", "cualquier", "cuando",
        "cuanto", "cuatro", "cuenta", "da", "dado", "dan", "dar", "de",
        "debe", "deben", "debido", "decir", "dejó", "del", "demás", "dentro",
        "desde", "después", "dice", "dicen", "dicho", "dieron", "diferente",
        "diferentes", "dijeron", "dijo", "dio", "donde", "dos", "durante",
        "e", "ejemplo", "el", "de", "la", "el", "porfas", "t", "p", "d",
        "est", "él", "ella", "ellas", "ello", "ellos", "embargo", "empleais",
        "emplean", "emplear", "empleas", "empleo", "en", "encima",
        "encuentra", "entonces", "entre", "era", "eramos", "eran", "eras",
        "eres", "es", "esa", "esas", "ese", "eso", "esos", "esta", "ésta",
        "está", "estaba", "estaban", "estado", "estais", "estamos", "estan",
        "están", "estar", "estará", "estas", "éstas", "este", "éste", "esto",
        "estos", "éstos", "estoy", "estuvo", "ex", "existe", "existen",
        "explicó", "expresó", "fin", "fue", "fuera", "fueron", "fui",
        "fuimos", "gracias", "gran", "grandes", "gueno", "ha", "haber",
        "había", "habían", "habrá", "hace", "haceis", "hacemos", "hacen",
        "hacer", "hacerlo", "haces", "hacia", "haciendo",
        "hago", "han", "hasta", "hay", "haya", "he", "hecho", "hemos",
        "hicieron", "hizo", "hoy", "hubo", "igual", "incluso", "indicó",
        "informó", "intenta", "intentais", "intentamos", "intentan",
        "intentar", "intentas", "intento", "ir", "junto", "la", "lado",
        "largo", "las", "le", "les", "llegó", "lleva", "llevar", "lo", "los",
        "luego", "lugar", "manera", "manifestó", "más", "mayor", "me",
        "mediante", "mejor", "mencionó", "menos", "mi", "mientras", "mio",
        "misma", "mismas", "mismo", "mismos", "modo", "momento", "mucha",
        "muchas", "mucho", "muchos", "muy", "nada", "nadie", "ni", "ningún",
        "ninguna", "ningunas", "ninguno", "ningunos", "no", "nos",
        "nosotras", "nosotros", "nuestra", "nuestras", "nuestro", "nuestros",
        "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "otra",
        "otras", "otro", "otros", "para", "parece", "parte", "partir",
        "pasada", "pasado", "pero", "pesar", "poca", "pocas", "poco", "pocos",
        "podeis", "podemos", "poder", "podrá", "podrán", "podria", "podría",
        "podriais", "podriamos", "podrian", "podrían", "podrias", "poner",
        "por", "porque", "por qué", "posible", "primer", "primera", "primero",
        "primeros", "principalmente", "propia", "propias", "propio",
        "propios", "próximo", "próximos", "pudo", "pueda", "puede", "pueden",
        "puedo", "pues", "que", "qué", "quedó", "queremos", "quien", "quién",
        "quienes", "quiere", "realizado", "realizar", "realizó", "respecto",
        "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "se", "sea",
        "sean", "según", "segunda", "segundo", "seis", "señaló", "ser",
        "será", "serán", "sería", "si", "sí", "sido", "siempre", "siendo",
        "siete", "sigue", "siguiente", "sin", "sino", "sobre", "sois", "sola",
        "solamente", "solas", "solo", "sólo", "solos", "somos", "son", "soy",
        "su", "sus", "tal", "también", "tampoco", "tan", "tanto", "tardes",
        "tarde", "tendrá", "tendrán", "teneis", "tenemos", "tener", "tenga",
        "tengo", "tenía", "tenido", "tercera", "tiempo", "tiene", "tienen",
        "toda", "todas", "todavía", "todo", "todos", "total",
        "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar",
        "trabajas", "trabajo", "tras", "trata", "través", "tres", "tuvo",
        "tuyo", "tu", "te", "pq", "mas", "qie", "us", "has", "ti", "ahi",
        "mis", "tus", "do", "X", "Ven", "mo", "Don", "dia", "PT", "sua", "q",
        "x", "i", "última", "últimas", "ultimo", "último", "últimos", "un",
        "una", "unas", "uno", "unos", "usa", "usais", "usamos", "usan",
        "usar", "usas", "uso", "usted", "va", "vais", "valor", "vamos", "van",
        "varias", "varios", "vaya", "veces", "ver", "verdad", "verdadera",
        "verdadero", "vez", "vosotras", "n", "s", "of", "c", "the", "m", "qu",
        "to", "as", "is", "asi", "via", "sera", "tambien", "vosotros", "voy",
        "y", "ya", "yo",
    ]
    with_custom = STOPWORDS.union(set(custom_words))
    spanish_words = set(stopwords.words('spanish'))
    return with_custom.union(spanish_words)
# NOTE(review): the next three statements look like the tail of a loop over
# ``sentenca`` whose header lies outside this view - confirm their nesting.
pessoa = sentenca[0]
frase = "".join(sentenca[1:])
falas.append(dict(pessoa=pessoa, frase=frase))

# All sentences spoken by the 'Declarante', as one lower-cased string.
declarante = [
    fala['frase'].decode('utf-8') for fala in falas
    if fala['pessoa'] == 'Declarante'
]
declarante = "".join(declarante).lower()
# Fuse the expression so that it is counted as a single word.
declarante = declarante.replace(u"não sei", u"nãosei")

# Portuguese function words to leave out of the cloud.  u'quem' and 'fui'
# are separate entries: without the comma the adjacent literals are
# concatenated into the single word 'quemfui'.
swords = ['que', 'eu', u'não', 'da', 'de', 'por', 'ele', u'você', u'está',
          'tem', 'um', 'uma', 'se', 'foi', u'lá', 'pra', 'para', 'vai',
          u'já', 'na', 'era', 'em', u'aí', 'minha', u'nós', 'os', 'as', 'ou',
          'essa', 'isso', 'como', 'aqui', 'pois', u'só', 'quando', u'então',
          'muito', 'porque', 'acho', 'nem', 'mais', 'meu', 'ser', 'estou',
          'vou', 'coisa', 'tenho', 'tinha', 'ter', u'quem', 'fui', 'mas',
          u'são', 'muita', 'mim', 'tudo', 'toda', 'todo', 'deve', 'falar',
          'eles', 'das']
STOPWORDS = STOPWORDS.union(swords)

wordcloud = WordCloud(width=800, height=400,
                      stopwords=STOPWORDS).generate(declarante)
wordcloud.to_file("wordcloud.png")
# So now that we know more about lenders' locations, let's analyze the
# textual freeform column *loan_because* and construct a wordcloud to get an
# insight into their motives for funding projects on Kiva.

# In[24]:

import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS
import imageio

# Displaying this wordcloud as a heart seems just about right :)
heart_mask = imageio.imread('../input/poverty-indicators/heart_msk.jpg')

mpl.rcParams['figure.figsize'] = (12.0, 8.0)
mpl.rcParams['font.size'] = 10

# URL fragments and filler words on top of the default stop words.
more_stopwords = {
    'org', 'default', 'aspx', 'stratfordrec', 'nhttp', 'Hi', 'also', 'now',
    'much', 'username'
}
STOPWORDS = STOPWORDS.union(more_stopwords)

# Keep only the lenders who gave a reason and join all reasons into one text.
has_reason = ~pd.isnull(lenders['loan_because'])
lenders_reason = lenders[has_reason][['loan_because']]
lenders_reason_string = " ".join(lenders_reason.loan_because.values)

wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      width=3200,
                      height=2000,
                      mask=heart_mask)
wordcloud = wordcloud.generate(lenders_reason_string)

plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('./reason_wordcloud.png', dpi=900)
def _invalid_handle_response(request):
    """Render the login page with the 'invalid handle' message."""
    return render(request, 'tweets/login.html',
                  {'message': 'Enter a valid Twitter handle'})


def _is_ascii_alpha(word):
    """Return True when every character of ``word`` is an ASCII letter."""
    return all('a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in word)


def _top_words(words, limit=30):
    """Return up to ``limit`` distinct words from ``words``, most frequent first."""
    from collections import Counter
    return [word for word, _ in Counter(words).most_common(limit)]


def _cooccurrence_counts(token_rows, vocabulary):
    """Build a symmetric matrix counting vocabulary pairs sharing a row.

    ``token_rows`` is an iterable of token lists; tokens are lowercased before
    being looked up in ``vocabulary``. For every pair of positions in a row
    that both hold vocabulary words, ``counts[a][b]`` and ``counts[b][a]`` are
    incremented (so a word repeated within one row adds 2 to its diagonal).
    """
    from itertools import combinations
    position = {word: idx for idx, word in enumerate(vocabulary)}
    size = len(vocabulary)
    counts = [[0] * size for _ in range(size)]
    for tokens in token_rows:
        lowered = (token.lower() for token in tokens)
        hits = [position[token] for token in lowered if token in position]
        for first, second in combinations(hits, 2):
            counts[first][second] += 1
            counts[second][first] += 1
    return counts


def info(request):
    """Build all statistics artefacts for a Twitter handle and render them.

    For a GET request carrying a non-empty ``screen_name`` this view fetches
    the user's timeline and writes the following files under
    ``tweets/static/tweets/``:

    * ``graph.png``      -- number of tweets per weekday,
    * ``wordcloud.png``  -- word cloud of the tweet text,
    * ``nodeFile.csv`` / ``edgeFile.csv`` -- co-occurrence graph of the 30
      most frequent ASCII-alphabetic words,
    * ``network.json``   -- followers / friends network around the user.

    It then renders ``tweets/info.html`` with ``user`` and ``day_list``
    (tweet counts indexed Sunday=0 .. Saturday=6). Any other request, an
    empty handle, or a failed Twitter lookup renders ``tweets/login.html``
    with an error message.

    Side effect: the module-level ``STOPWORDS`` is rebound to include the
    entries of ``stopwords.txt``.
    """
    global STOPWORDS

    if request.method != 'GET' or 'screen_name' not in request.GET:
        return _invalid_handle_response(request)
    scn = request.GET['screen_name']
    if scn == "":
        return _invalid_handle_response(request)

    # print("start", datetime.now())
    STAT_PATH = os.path.join(settings.BASE_DIR, 'tweets/static/tweets/')
    handle = tweepy.OAuthHandler(settings.CONSUMER_KEY, settings.CONSUMER_SECRET)
    handle.set_access_token(settings.ACCESS_TOKEN, settings.ACCESS_TOKEN_SECRET)
    api = tweepy.API(handle)
    try:
        user = api.get_user(screen_name=scn)
    except tweepy.TweepError:
        return _invalid_handle_response(request)

    # ---- Tweets vs Weekday graph ----
    try:
        timeline = api.user_timeline(screen_name=scn, count=3200, include_rts=True)
        mid = timeline[-1].id - 1
        # Page backwards through the timeline until a page comes back empty.
        while True:
            tl = api.user_timeline(screen_name=scn, count=3200,
                                   include_rts=True, max_id=mid)
            if not tl:
                break
            timeline += tl
            mid = tl[-1].id - 1
    except (tweepy.TweepError, TypeError, IndexError):
        # Fix: `except A or B` only ever caught A; a tuple catches both.
        # IndexError covers an empty timeline (`timeline[-1]`), which would
        # otherwise surface as an unhandled server error.
        return _invalid_handle_response(request)

    y = [0] * 7
    for tw in timeline:
        # "%w" is the weekday as a decimal number, Sunday == 0.
        y[int(tw.created_at.strftime("%w"))] += 1
    x = [0, 1, 2, 3, 4, 5, 6]
    xpoints = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
    plt.xticks(x, xpoints)
    plt.plot(x, y, 'b-')
    plt.xlabel('Days of week')
    plt.ylabel('No. of tweets')
    path_graph = STAT_PATH + 'graph.png'
    if os.path.isfile(path_graph):
        os.remove(path_graph)
    plt.savefig(path_graph, dpi=300, bbox_inches='tight')
    plt.clf()

    # ---- Tag-cloud ----
    # Extra stop words, one per line; `with` guarantees the handle is closed.
    with open(STAT_PATH + 'stopwords.txt', 'r') as stop_file:
        more_stops = [line.rstrip('\n') for line in stop_file]
    STOPWORDS = STOPWORDS.union(more_stops)

    # One token list per tweet; reused below for the co-occurrence matrix.
    matrix = [tw.text.split() for tw in timeline]
    kept_words = []
    for tokens in matrix:
        for w in tokens:
            # Drop retweet markers, links, mentions, hashtags and stop words.
            if (w != 'RT'
                    and not w.startswith(('http', '@', '#'))
                    and w.lower() not in STOPWORDS):
                kept_words.append(w.lower())
    long_tweet_stripd = " ".join(kept_words)
    un_words = long_tweet_stripd.split()

    mask = imread(STAT_PATH + 'twitter_mask.png')
    wcloud = WordCloud(max_words=50, background_color='white',
                       stopwords=STOPWORDS, mask=mask).generate(long_tweet_stripd)
    # print(long_tweet_stripd)
    path_wordcloud = STAT_PATH + 'wordcloud.png'
    if os.path.isfile(path_wordcloud):
        os.remove(path_wordcloud)
    plt.imshow(wcloud)
    plt.gca().invert_yaxis()
    plt.axis('off')
    plt.savefig(path_wordcloud, dpi=600, bbox_inches='tight')
    plt.clf()
    plt.close()

    # ---- Word co-occurrences matrix ----
    all_words_use = [
        w.lower() for w in un_words
        if _is_ascii_alpha(w) and w.lower() not in STOPWORDS
    ]
    # Fix: the previous hand-rolled selection removed entries from the count
    # list without removing the matching word, so after the first pick the
    # two lists were misaligned and the wrong words were selected.
    most_words_use = _top_words(all_words_use, 30)
    count = _cooccurrence_counts(matrix, most_words_use)

    with open(STAT_PATH + 'nodeFile.csv', 'w', newline='') as nodeFile:
        nodeW = csv.writer(nodeFile, quotechar='|', quoting=csv.QUOTE_MINIMAL)
        nodeW.writerow(['id'])
        for w in most_words_use:
            nodeW.writerow([w])

    with open(STAT_PATH + 'edgeFile.csv', 'w', newline='') as edgeFile:
        edgeW = csv.writer(edgeFile, quotechar='|', quoting=csv.QUOTE_MINIMAL)
        edgeW.writerow(['source', 'target', 'weight'])
        # Each unordered pair (including i == j) is written in both directions.
        for i in range(len(count)):
            for j in range(i, len(count)):
                edgeW.writerow([most_words_use[i], most_words_use[j], count[i][j]])
                edgeW.writerow([most_words_use[j], most_words_use[i], count[i][j]])

    print("3 tasks completed", datetime.now())

    # ---- Network graph ----
    obj = {'nodes': [], 'links': []}
    obj['nodes'].append({'name': scn, 'group': 1})

    follower_ids = api.followers_ids(screen_name=scn, count=100)
    follower_users = []
    if len(follower_ids) > 0:
        follower_users = api.lookup_users(user_ids=follower_ids)
    i = 1  # node no., user is node0
    for u in follower_users:
        obj['nodes'].append({'name': u.screen_name, 'group': 2})  # group2 for followers
        obj['links'].append({'source': i, 'target': 0, 'weight': 1})
        i += 1

    following_ids = api.friends_ids(screen_name=scn, count=100)
    following_users = []
    if len(following_ids) > 0:
        following_users = api.lookup_users(user_ids=following_ids)
    for u in following_users:
        obj['nodes'].append({'name': u.screen_name, 'group': 3})  # group3 for friends/following
        obj['links'].append({'source': 0, 'target': i, 'weight': 1})
        i += 1

    # Exceeds rate-limit, max 15 requests per 15-min interval
    # and 100 api requests per hour
    # Will have to decrease follower and friend count
    # all_users = follower_users + following_users
    # for i in range(len(all_users)):
    #     frs = api.followers_ids(screen_name=all_users[i].screen_name)
    #     fwn = api.friends_ids(screen_name=all_users[i].screen_name)
    #     for j in range(i+1, len(all_users)):
    #         if all_users[j].id in frs:
    #             obj['links'].append({'source':j+1, 'target':i+1, 'weight':1})
    #         if all_users[j].id in fwn:
    #             obj['links'].append({'source':i+1, 'target':j+1, 'weight':1})

    with open(STAT_PATH + 'network.json', 'w') as jsonFile:
        json.dump(obj, jsonFile, indent=4)

    return render(request, 'tweets/info.html', {'user': user, 'day_list': y})