def wordcloud(datafile):
    # Remove stop words, the most common words in a language
    vectorizer = CountVectorizer(stop_words='english')
    for word in vectorizer.get_stop_words():
        STOPWORDS.add(word)
    STOPWORDS.add("said")
    pony_mask = np.array(Image.open("../pinkyB.jpg"))
    wc = WordCloud(background_color="black", max_words=2000, mask=pony_mask,
                   stopwords=STOPWORDS)
    # Init dictionary with the five categories
    categoriesSet = set(datafile["Category"])
    categoriesDict = dict.fromkeys(categoriesSet, "")
    # Conditional selection
    # business = datafile.loc[datafile["Category"] == "Business"]
    # print(business["Content"].size)
    # Fill each category entry with the content of its rows
    for index, row in datafile.iterrows():
        categoriesDict[row["Category"]] += str(row["Content"])
    for category, text in categoriesDict.items():
        wc.generate(text)
        image = wc.to_image()
        image.save("../wordcloud/wordcloud_" + category + ".jpg")
    return
def wordCloud(text_array, name, keyword=""):
    import base64
    new_text_arr = []
    if keyword != "":
        keyword = keyword.split(" ")[1]
        for text in text_array:
            if keyword in text:
                new_text_arr.append(text)
        text_array = new_text_arr
    cloud_text = ""
    for text in text_array:
        cloud_text += text + " "
    m_stopwords = ['police', 'traffic', 'sir']
    for word in m_stopwords:
        STOPWORDS.add(word)
    image_mask = os.path.join(BASE_DIR, 'static/tool/img/nebula.png')
    coloring = imread(image_mask)
    wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", mask=coloring,
                          ranks_only=True, max_words=50).generate(cloud_text)
    filename = os.path.join(BASE_DIR, 'static/tool/img/' + name + '.png')
    image_colors = ImageColorGenerator(coloring)
    wordcloud.recolor(color_func=image_colors)
    wordcloud.to_file(filename)
    data_uri = base64.b64encode(open(filename, 'rb').read()).decode('ascii')
    img_tag = '<img src="data:image/png;base64,{0}" style="height:400px;">'.format(data_uri)
    layout = wordcloud.layout_
    words_colours = {}
    count = 1
    for lo in layout:
        entry = {}
        entry['word'] = lo[0][0]
        # the last element of each layout entry is an 'rgb(r, g, b)' string
        color = lo[-1]
        color = color[4:-1]
        color_num = [int(x) for x in color.split(',')]
        entry['color'] = '#%02x%02x%02x' % tuple(color_num)
        # print(color_num)
        words_colours[count] = entry
        count += 1
    # print(words_colours)
    list_html = ""
    cap = min(51, len(words_colours))
    for i in range(1, cap):
        list_html += ('<li class="list-group-item"><a class="cloud-key-' + name +
                      '" href="#" style="color:' + words_colours[i]['color'] + '">')
        list_html += "#" + str(i) + " " + words_colours[i]['word'] + '</a></li>'
    return (img_tag, list_html)
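# Usage sketch (not from the original source): a hypothetical Django view that
# embeds the (img_tag, list_html) pair returned by wordCloud() above in a
# template. The template name and its {{ cloud_img }} / {{ cloud_list }}
# placeholders are assumptions, not part of the original project.
from django.shortcuts import render

def cloud_view(request):
    texts = ["traffic jam reported on main street", "police cleared the road"]
    img_tag, list_html = wordCloud(texts, "demo")
    return render(request, "tool/cloud.html",
                  {"cloud_img": img_tag, "cloud_list": list_html})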
def generateWordCloud(text, stop): d = path.dirname(outputdir) for w in stop: STOPWORDS.add(w) # Generate the wordcloud without the stop words wordcloud = WordCloud(stopwords=STOPWORDS).generate(text) # Draw the positioned words to a PNG file. wordcloud.to_file(path.join(d, 'diabetes-wordcloud.png'))
def cloudplot(person):
    person = re.sub(r'\+', ' ', person)
    text = GetTextRange(Emails, person)
    text = rmBoring(rmNonAlpha(text)).decode('ascii', 'ignore')
    plt.clf()
    d = path.dirname(path.abspath(__file__))
    hilcolor = np.array(Image.open(path.join(d, "static/img/hillarylogo.jpg")))
    # STOPWORDS.add() returns None, so it must not be passed directly as the
    # stopwords argument; add the word first, then pass the set itself.
    STOPWORDS.add("said")
    wc = WordCloud(background_color="white", max_words=150, mask=hilcolor,
                   stopwords=STOPWORDS, max_font_size=80, random_state=42,
                   relative_scaling=0.5)
    wc.generate(text)
    image_colors = ImageColorGenerator(hilcolor)
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    fig = plt.gcf()
    img = StringIO.StringIO()
    fig.savefig(img)
    img.seek(0)
    return send_file(img, mimetype='image/png')
def create_wordcloud(posts):
    wordcloud_str = ' '.join(post['message'] for post in posts)  # join all posts together
    aces_mask = imread("aces.png")  # add aces mask
    # Don't include the word "will" in the wordcloud (not an interesting word and it
    # took up a large chunk of the cloud). STOPWORDS.add() returns None, so add the
    # word first and pass the set itself.
    STOPWORDS.add("will")
    wc = WordCloud(background_color="BLACK", mask=aces_mask, stopwords=STOPWORDS)
    wc.generate(wordcloud_str)
    plt.axis("off")
    plt.imshow(wc)
    plt.show()
    wc.to_file("aces_wordcloud.png")
def generate_wc(content):
    path = r'fzzqhj.TTF'
    bg_pic = imread('mo.png')  # read the background image file
    image_colors = ImageColorGenerator(bg_pic)  # generate colour values from the background image
    # add the stop word first; STOPWORDS.add() returns None
    STOPWORDS.add("said")
    wc = WordCloud(font_path=path, background_color="white", mask=bg_pic,
                   stopwords=STOPWORDS, max_font_size=40,
                   color_func=image_colors, random_state=42)
    wc = wc.generate(content)
    wc.to_file(c.outputs_pictures_path + 'result.jpg')
def make_word_cloud(data):
    text = ''
    for d in data:
        text = text + d[0] + ' '
    # Generate a word cloud image; add the stop word first because
    # STOPWORDS.add() returns None.
    STOPWORDS.add('watson')
    wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)
    # Display the generated image the matplotlib way:
    import matplotlib.pyplot as plt
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def mainProcess(usernames):
    print("Processing " + str(len(usernames) - 1) + " usernames")
    words4 = ""
    loginFacebook(driver)
    timeread = time.time()
    time0 = time.clock()
    for username in usernames:
        if len(username) != 0:
            username = username.strip()
            time1 = time.clock()
            count, words3 = produce3(username)
            module.Database.edit2(username, count, conn)
            time2 = time.clock()
            words4 = words4 + " " + words3
    time3 = time.clock()
    timeread = time.time() - timeread
    print("TOTAL TIME")
    print(time3 - time0)
    print(timeread)
    more_stopwords = ["ja", "aga", "kui", "siis", "tongue", "nii", "ka", "et", "see",
                      "ma", "oma", "oli", "emoticon", "ei", "ning", "seda", "või",
                      "smile", "grin", "Kas", "kes", "veel"]
    for more in more_stopwords:
        STOPWORDS.add(more)
    utf = ["Translation", "nüüd", "või", "ära", "Kas"]
    for u in utf:
        words4 = words4.replace(u, "")
    wordcloud = WordCloud(stopwords=STOPWORDS).generate(words4)
    image = wordcloud.to_image()
    image.save("words.png", "PNG")
    driver.close()
    driver.quit()
    conn.commit()
    conn.close()
    print("Done")
def create_cloud(word, img, out_path):
    # Read the whole text.
    # text = open(word_path).read()
    text = word.read().decode('utf-8')
    # read the mask image
    # taken from
    # http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
    alice_mask = np.array(Image.open(img))
    # add the stop word first; STOPWORDS.add() returns None
    STOPWORDS.add("said")
    wc = WordCloud(font_path='华文黑体.ttf', background_color="white", max_words=2000,
                   mask=alice_mask, stopwords=STOPWORDS, width=1000, height=2300,
                   ranks_only=True, mode='RGBA')
    # generate word cloud
    wc.generate(text)
    # store to file
    wc.to_file(out_path)
def WordCloudTopic(items, imagePath=None):
    # Generate a word cloud image
    if imagePath:
        alice_coloring = np.array(Image.open(imagePath))
        # add the stop word first; STOPWORDS.add() returns None
        STOPWORDS.add("said")
        wc = WordCloud(background_color="white", max_words=200, mask=alice_coloring,
                       stopwords=STOPWORDS, max_font_size=300)
        # generate word cloud from word frequencies
        wc.generate_from_frequencies(items)
        image_colors = ImageColorGenerator(alice_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
    else:
        wc = WordCloud(background_color="white", max_words=300, max_font_size=40,
                       random_state=42)
        wordcloud = wc.generate_from_frequencies(items)
        plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def generate_wc(text="Hello World"):
    # if int(time.time()*10)%10 in [0]:
    d = path.dirname(__file__)
    # read the mask image
    alice_coloring = np.array(Image.open(path.join(d, '..', 'static', 'images', "heart.png")))
    # add the stop word first; STOPWORDS.add() returns None
    STOPWORDS.add("said")
    wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
                   stopwords=STOPWORDS, max_font_size=40, random_state=42)
    # generate word cloud
    wc.generate(text)
    # save the word cloud image
    filename = "wordcloud.png"
    wc.to_file(path.join(d, '..', 'static', 'images', filename))
    del wc
    return filename
def main():
    parser = argparse.ArgumentParser(description='Generate word cloud')
    parser.add_argument('artist', help='Artist to be searched')
    args = parser.parse_args()
    artist = string_to_url(args.artist)
    # artist = "Gaslight Anthem"
    api_url = ("http://lyrics.wikia.com/api.php?func=getArtist&artist=%s&fmt=realjson"
               % (artist,))
    data = json.load(urllib2.urlopen(api_url))
    art_data = data['albums']
    songs_by_album = [album['songs'] for album in art_data]
    songs = sum(songs_by_album, [])
    lyrics = ""
    for song in songs:
        song = song.strip(bad_chars)
        lyrics += get_lyrics(string_to_url(song), artist)
    # add the stop word first; STOPWORDS.add() returns None
    STOPWORDS.add("said")
    wc = WordCloud(background_color="white", max_words=2000, stopwords=STOPWORDS)
    wc.generate(lyrics)
    wc.to_file("%s.png" % (artist,))
def wordcloud(wordSource): #writes origional catagory list to text file d = os.path.dirname(__file__) file = open("catagory.txt", 'w') for item in wordSource: file.write("%s\n" % item) thefile = open(os.path.join(d, "catagory.txt")).read() #adds words to exclude list STOPWORDS.add("chronic") STOPWORDS.add("disease") STOPWORDS.add("obstructive") STOPWORDS.add("status") # generate word cloud wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", width = 650, height = 250).generate_from_text(thefile) #re-colers and saves wordcloud as png wordcloud.recolor(color_func=grey_color_func, random_state=3) wordcloud.to_file("wordcloud.png")
def cloud_word_with_mask(file_name):
    text = open(file_name).read()
    # read the mask / color image
    # amazon_coloring = imread('amazon-logo_grey.png')
    # add the stop word first; STOPWORDS.add() returns None
    STOPWORDS.add("said")
    wc = WordCloud(background_color="white", max_words=200,
                   # mask=amazon_coloring,
                   stopwords=STOPWORDS, max_font_size=200, random_state=42,
                   width=1800, height=1000)
    # generate word cloud
    wc.generate(text)
    # create coloring from image
    # image_colors = ImageColorGenerator(amazon_coloring)
    # recolor wordcloud and show
    # we could also give color_func=image_colors directly in the constructor
    # plt.imshow(wc.recolor(color_func=image_colors))
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    # plt.show()
    plt.savefig(file_name.split('.')[0] + '.png')
def generateWordcloud(wordlist, outfile, title, nwords=100):
    """
    :param wordlist: words in a list
    :param outfile: name of the output file to which to store the figure
    :param title: title of the figure
    :param nwords: maximum number of words to plot
    :return: None
    """
    # generate word cloud; add the stop word first, since STOPWORDS.add() returns None
    STOPWORDS.add("looking")
    wc = WordCloudSMN(background_color="white", max_words=nwords, width=800, height=400,
                      stopwords=STOPWORDS, max_font_size=80, random_state=42)
    wc.generate_SMN(wordlist)
    # generate the figure
    plt.figure(figsize=(16, 16))
    plt.title(title)
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(outfile)
    plt.close()
def wordcloudOf(messages): s = pd.DataFrame(messages) filteredS = s[s.content.str.contains("sent a photo") == False] text = filteredS['content'].str.cat(sep='\n') STOPWORDS.add('ok') STOPWORDS.add('Yea') STOPWORDS.add('Ye') STOPWORDS.add('Yes') STOPWORDS.add('Good') STOPWORDS.add('will') STOPWORDS.add('Oh') wordcloud = WordCloud(width=1000, height=1000, max_font_size=400, stopwords=STOPWORDS).generate(text) plt.imshow(wordcloud, interpolation='bilinear') plt.axis("off")
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

# Read the whole text.
df = pd.read_csv("train_set.csv", sep="\t")
my_category = df['Category']
my_content = df['Content']

# read the mask image
# taken from
# http://rtyuiope.deviantart.com/art/Code-Geass-Wallpaper-374008098
zero_mask = numpy.array(Image.open("zero.png"))

STOPWORDS.add("said")
wc = WordCloud(background_color="red", max_words=2000, mask=zero_mask,
               stopwords=STOPWORDS)

# generate word cloud from the "Film" articles
text = ""
for b in range(len(my_category.index)):
    if my_category[b] == "Film":
        text += my_content[b]
wc.generate(text)

# store to file
wc.to_file("film_cloud.png")
unwanted_characters = re.compile('[^A-Za-z ]+') try: for cat in temp_dict: number = temp_dict[cat] desc_string = ' '.join(descriptions[cat]) descriptions[cat] = ' '.join([ w.lower() for w in re.sub(unwanted_characters, ' ', desc_string).split() if len(w) > 3 ]) wc = WordCloud(width=1000, height=800, background_color="white", colormap='jet') wc.generate(descriptions[cat]) wc.to_file(r'Y:\le\FEEDBACK\image1\%s_%d.jpg' % (cat, number)) except: pass # add stopwords nf_stopwords = [ 'order', 'refund', 'ship', 'part', 'ebay', 'item', 'seller', 'week', 'ordered', 'weeks', 'will', 'still', 'canceled', 'days', 'never' ] for w in nf_stopwords: STOPWORDS.add(w) wordlist = wc.words_ sorted_by_value = sorted(wordlist.items(), key=lambda kv: kv[1]) sorted_by_value.reverse()
# with open('command.txt','r') as fd: # for i in fd.readlines(): # line=i.strip('\n') # # text+=' '.join(jieba.cut(line)) comment_text = open('lrc_folk_full.txt', 'r').read() comment_text = re.sub(r'\D+:\D+', '', comment_text) comment_text = re.sub(r'\D+ : \D+', '', comment_text) comment_text = re.sub(r'\[\w+\]', '', comment_text) #comment_text=re.sub(r'[a-zA-Z]','',comment_text)#过滤英文 comment_text = re.sub(r'作\w : \D+', '', comment_text) #comment_text=re.sub(r'弦乐 : \D+','',comment_text) text = ''.join(jieba.cut(comment_text)) background = plt.imread('IMG_3674.JPG') #加载背景图片 STOPWORDS.add('原曲') STOPWORDS.add('作曲') STOPWORDS.add('作词') STOPWORDS.add('词曲') STOPWORDS.add('编曲') STOPWORDS.add('九九Lrc歌词网') STOPWORDS.add('制作人') STOPWORDS.add('九九Lrc') STOPWORDS.add('99Lrc') STOPWORDS.add('混音') STOPWORDS.add('吉他') STOPWORDS.add('九九歌词网') STOPWORDS.add('录音') STOPWORDS.add('后期') STOPWORDS.add('和声') STOPWORDS.add('演唱')
def create_wordcloud(df): complaints_text = list(df["Consumer complaint narrative"].dropna().values) # join all documents in corpus text = " ".join(list(complaints_text)) print("Complaints received") print(len(complaints_text)) d = getcwd() mask = np.array(Image.open(path.join(d, "thumbs-down.png"))) STOPWORDS.add("XXXX") STOPWORDS.add("XX") STOPWORDS.add("xx") STOPWORDS.add("xxxx") # TODO exclude name of all banks here STOPWORDS.add("wells") STOPWORDS.add("fargo") wc = WordCloud( background_color="white", stopwords=STOPWORDS, max_words=1000, mask=mask, max_font_size=90, random_state=42, contour_width=1, contour_color="#119DFF", ) wc.generate(text) # create wordcloud shape from image fig = plt.figure(figsize=[8, 8]) ax = plt.imshow(wc.recolor(), interpolation="bilinear") plt.axis("off") out_url = fig_to_uri(fig, bbox_inches="tight") return out_url
# wordcloud usage
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# read whole text
text = open(path.join(d, 'alice.txt')).read()

# read the mask image
alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))

# STOPWORDS.add() returns None, so add the word first and pass the set itself
STOPWORDS.add("said")
wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS)
wc.generate(text)

# store to file
wc.to_file(path.join(d, 'alice.png'))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
""" In order to make the graphs more useful we decided to prevent some words from being included """ ADDITIONAL_STOPWORDS = [ "XXXX", "XX", "xx", "xxxx", "n't", "Trans Union", "BOA", "Citi", "account", ] for stopword in ADDITIONAL_STOPWORDS: STOPWORDS.add(stopword) """ Proudly written for Plotly by Vildly in 2019. [email protected] The aim with this dashboard is to demonstrate how Plotly's Dash framework can be used for NLP based data analysis. The dataset is open and contains consumer complaints from US banks ranging from 2013 to 2017. Users can select to run the dashboard with the whole dataset (which can be slow to run) or a smaller subset which then is evenly and consistently sampled accordingly. Once a data sample has been selected the user can select a bank to look into by using the dropdown or by clicking one of the bars on the right with the top 10 banks listed by number of filed complaints. Naturally bigger banks tend to end up in this top 10 since we do not adjust for number of customers.
from random import shuffle import copy import numpy as np import pandas as pd import seaborn as sns # In[2]: ## Pre-processing tokenizer = RegexpTokenizer(r'\w+') #Tokenizer stemmer = SnowballStemmer('english') #Snowball Stemmer stops = set(stopwords.words('english')) #Stopwords for i in stops: STOPWORDS.add(i) # In[3]: cwd = os.getcwd() #Current Working Directory folders_path = os.path.join(cwd, r"Dataset\20_newsgroups") folders = os.listdir(folders_path) #List of folders # In[4]: count_to_file = {} #Dictionary that maps file no to file path file_to_count = {} #Dictionary that maps file path to file no count = 0
def Contacts_greater_than_5(filename): df = pd.read_csv(os.path.join('csvs',filename)) os.remove(os.path.join('csvs', filename)) df['Date'] = pd.to_datetime(df['Date']) df['Date'] = df['Date'].dt.strftime('%d/%m/%Y') fig = plt.GridSpec(13,4,wspace=0.4,hspace=0.5) plt.figure(figsize=(16, 50)) # title ax1 = plt.subplot(fig[0, :]) ax1.text(0.2, 0.4, 'CHAT ANALYSIS', weight='bold', color='#470070', fontsize="60") #sb.despine(left=True, bottom=True, ax=ax1) plt.xticks([], []) plt.yticks([], []) # 1st Row---------------------------- ax2 = plt.subplot(fig[1, 0]) msgs = df.shape[0] ax2.text(0.5, 0.4, msgs, horizontalalignment='center', color='#9f21de', fontsize="30") ax2.text(0.5, 0.1, 'Total Messages', horizontalalignment='center', color='#8f8da6', fontsize="20") sb.despine(ax=ax2, left=True) plt.xticks([], []) plt.yticks([], []) ax3 = plt.subplot(fig[1, 1]) members = np.unique(df['Contacts']).shape[0] ax3.text(0.5, 0.4, members, horizontalalignment='center', color='#9f21de', fontsize="30") ax3.text(0.5, 0.1, 'Members', horizontalalignment='center', color='#8f8da6', fontsize="20") sb.despine(ax=ax3, left=True) plt.xticks([], []) plt.yticks([], []) ax4 = plt.subplot(fig[1, 2]) sDate = df['Date'][0] ax4.text(0.5, 0.4, sDate, horizontalalignment='center', color='#9f21de', fontsize="30") ax4.text(0.5, 0.1, 'Start Date', horizontalalignment='center', color='#8f8da6', fontsize="20") sb.despine(ax=ax4, left=True) plt.xticks([], []) plt.yticks([], []) ax5 = plt.subplot(fig[1, 3]) eDate = df['Date'][df.shape[0]-1] ax5.text(0.5, 0.4, eDate, horizontalalignment='center', color='#9f21de', fontsize="30") ax5.text(0.5, 0.1, 'End Date', horizontalalignment='center', color='#8f8da6', fontsize="20") sb.despine(ax=ax5, left=True) plt.xticks([], []) plt.yticks([], []) # 2nd Row----------------------------- ax6 = plt.subplot(fig[2, 0]) i = 0 for msg in df['Messages']: i += (len(str(msg).split(' '))) avgMsg = str(i/df.shape[0]) ax6.text(0.5, 0.4, avgMsg[:4]+' words', horizontalalignment='center', color='#9f21de', fontsize="30") ax6.text(0.5, 0.1, 'Average msg length', horizontalalignment='center', color='#8f8da6', fontsize="20") sb.despine(ax=ax6, left=True) plt.xticks([], []) plt.yticks([], []) ax7 = plt.subplot(fig[2, 1]) length = 0 name = "" for msg in df['Messages']: if(length < len(str(msg).split(' '))): length = len(str(msg).split(' ')) name = df[df['Messages'] == msg]['Contacts'].values[0] ax7.text(0.5, 0.4, str(length)+' words', horizontalalignment='center', color='#9f21de', fontsize="30") ax7.text(0.5, 0.1, 'Maximum msg length', horizontalalignment='center', color='#8f8da6', fontsize="20") sb.despine(ax=ax7, left=True) plt.xticks([], []) plt.yticks([], []) ax8 = plt.subplot(fig[2, 2]) week = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: 'Saturday', 6: 'Sunday'} busy_day = week[Counter(pd.to_datetime( df['Date']).dt.weekday).most_common(1)[0][0]] ax8.text(0.5, 0.4, busy_day, horizontalalignment='center', color='#9f21de', fontsize="30") ax8.text(0.5, 0.1, 'Most Busy WeekDay', horizontalalignment='center', color='#8f8da6', fontsize="20") sb.despine(ax=ax8, left=True) plt.xticks([], []) plt.yticks([], []) ax9 = plt.subplot(fig[2, 3]) month = {1: "January", 2: "February", 3: "March", 4: "April", 5: "May", 6: "June", 7: "July", 8: "August", 9: "September", 10: "October", 11: "November", 12: "December"} busy_month = month[Counter(pd.to_datetime( df['Date']).dt.month).most_common(1)[0][0]] ax9.text(0.5, 0.4, busy_month, horizontalalignment='center', color='#9f21de', fontsize="30") 
ax9.text(0.5, 0.1, ' Most Busy Month ', horizontalalignment='center', color='#8f8da6', fontsize="20") sb.despine(ax=ax9, left=True) plt.xticks([], []) plt.yticks([], []) # 3rd Row----------------------------- ax10 = plt.subplot(fig[3, :]) ax10.set_facecolor('#9f21de') ax10.text(0.5, 0.4, name, weight='bold', horizontalalignment='center', color='white', fontsize="30") ax10.text(0.5, 0.1, 'Maximum Length Message Send By', horizontalalignment='center', color='#e9ddf0', fontsize="20") sb.despine(ax=ax10, left=True) plt.xticks([], []) plt.yticks([], []) # pie chart--------------------------- pie_plot = plt.subplot(fig[4:6, :2]) i=1 df['Shift'] = pd.Series() for t in df['Time'] : if(str(t).endswith('am')): df['Shift'].loc[i] = 'am' else : df['Shift'].loc[i] = 'pm' i+=1 recipe = list( df.groupby('Shift').count()['Time'].index ) data = list(df.groupby('Shift').count()['Time'].values) lable = list([str(recipe[0] + '\n'+str(data[0])+' msgs') ,str(recipe[1] + '\n'+str(data[1])+' msgs')]) pie_plot.pie(data, textprops=dict( fontsize=18, color="black"), wedgeprops=dict(width=0.45), startangle=20 ,labels=lable) pie_plot.set_title("Messages in respective Meridian", fontsize=20) sb.despine(ax=pie_plot, left=True, bottom=True) # top active bar chart---------------- top_active = plt.subplot(fig[4:8, 2:]) sorted_active = df.groupby('Contacts').count()['Time'].sort_values() if(df.groupby('Contacts').count().shape[0] > 10): sb.barplot(sorted_active[-10:].values, sorted_active[-10:].index, ax=top_active, palette='spring' ) j = -10 for i, v in enumerate(sorted_active.values[-10:]): top_active.text( 0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20) j += 1 else: sb.barplot(sorted_active.values, sorted_active.index, ax=top_active, palette='spring' ) j = -1*len(sorted_active.values) for i, v in enumerate(sorted_active.values): top_active.text( 0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20) j += 1 top_active.set_title("Most Active Memebers", fontsize=20) top_active.set_yticks([], []) top_active.set_ylabel("") sb.despine(ax=top_active, left=True) # least active data------------------ least_active = plt.subplot(fig[6:8, :2]) sorted_active = df.groupby('Contacts').count()['Time'].sort_values() if(df.groupby('Contacts').count().shape[0] > 5): sb.barplot(sorted_active[:5].values, sorted_active[:5].index, ax=least_active, palette='spring' ) j = 0 for i, v in enumerate(sorted_active.values[:5]): least_active.text( 0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20) j += 1 else: sb.barplot(sorted_active.values, sorted_active.index, ax=least_active, palette='spring' ) j = 0 for i, v in enumerate(sorted_active.values): least_active.text( 0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20) j += 1 least_active.set_title("Least Active Memebers", fontsize=20) least_active.set_yticks([], []) least_active.set_ylabel("") sb.despine(ax=least_active, left=True) # weekday wise msgs------------------ week_plot = plt.subplot(fig[8:10, :]) weekday = Counter(pd.to_datetime(df['Date']).dt.weekday) od = collections.OrderedDict(sorted(weekday.items())) values = [] for value in od.values(): values.append(value) keys = [] for key in od.keys(): keys.append(key) week = ["Monday", 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday', 'Sunday'] x = [] for k in keys: x.append(week[k]) sb.barplot(x, values, palette='plasma', ax=week_plot) week_plot.set_xticklabels(x, fontsize=16) week_plot.set_title("WeekDay-wise Messages", fontsize=20) sb.despine(ax=week_plot) # 
WordCloud--------------------------- word_Cloud = plt.subplot(fig[10:, :]) new_stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",'nan','media','omitted','media omitted' 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '.', ',', '/', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '+', '-' ] for stop in new_stop: STOPWORDS.add(stop) i = 0 comment_words = ' ' stopwords = set(STOPWORDS) # iterate through the csv file for val in df['Messages']: # typecaste each val to string val = str(val) if "media omitted" in val: i += 1 # split the value tokens = val.split() # Converts each token into lowercase for i in range(len(tokens)): tokens[i] = tokens[i].lower() for words in tokens: comment_words = comment_words + words + ' ' wordcloud = WordCloud(width=1400, height=800, background_color='white', stopwords=stopwords, min_font_size=15, max_font_size=100, colormap='plasma').generate(comment_words) word_Cloud.set_title("WORD CLOUD", fontsize=40) word_Cloud.imshow(wordcloud) word_Cloud.axis("off") plt.savefig(os.path.join('static/images/dashboard',filename+'.png'), bbox_inches='tight') return
def tag_and_lem(element):
    sent = pos_tag(word_tokenize(element))
    return ' '.join([
        lemmer.lemmatize(sent[k][0], convert_tag(sent[k][1][0]))
        for k in range(len(sent))
    ])


data.loc[:, 'tweet'] = data['tweet'].apply(lambda x: tag_and_lem(x))
data.loc[:, 'hashtags'] = data['hashtags'].apply(
    lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# In[6]:

from wordcloud import WordCloud, STOPWORDS

# add the stop word first and keep the set; STOPWORDS.add() returns None
STOPWORDS.add('amp')
stopwords = STOPWORDS

all_words = ' '.join(data.tweet.values)
hatred_words = ' '.join(data[data.label == 1].tweet.values)

plt.figure(figsize=(16, 8))
cloud1 = WordCloud(width=400, height=400, background_color='white',
                   stopwords=stopwords).generate(all_words)
plt.subplot(121)
plt.imshow(cloud1, interpolation="bilinear")
plt.axis("off")
plt.title('All tweets', size=20)
import os, urllib2, unirest, PIL
from PIL import Image
from os import path
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
textPros = open(path.join(d, 'generated data/pros_full.txt')).read()
textCons = open(path.join(d, 'generated data/cons_full.txt')).read()

# read the mask image
# taken from http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
google_mask = np.array(Image.open(path.join(d, "google-logo.jpg")))

print(STOPWORDS)

# add the stray HTML token to the stop words first; STOPWORDS.add() returns None
STOPWORDS.add("</p>")
wc = WordCloud(background_color="white", max_words=30000, mask=None,
               stopwords=STOPWORDS)

# generate word clouds and store them to files
wc.generate(textPros)
wc.to_file(path.join(d, "google-logo.jpg"))
wc.generate(textCons)
wc.to_file(path.join(d, "google-logo1.jpg"))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(google_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
if (l == '<EOF>'): break else: s=l[53:] words +=s[:s.find('\t')]+' ' no_urls_no_tags = " ".join([word for word in words.split() if 'http' not in word and not word.startswith('@') and word != 'RT' ]) for c in string.punctuation: no_urls_no_tags= no_urls_no_tags.replace(c,"") STOPWORDS.add('amp') STOPWORDS.add('want') STOPWORDS.add('new') STOPWORDS.add('via') STOPWORDS.add('man') STOPWORDS.add('will') STOPWORDS.add('here') STOPWORDS.add('Heres') STOPWORDS.add('Here') wordcloud = WordCloud( font_path='C:/Tweets/cabin-sketch-v1.02/CabinSketch-Regular.ttf', stopwords=STOPWORDS, background_color='black', width=1800, height=1400
def wordcloud(): for i in [ 'https', 't', 'm', 'co', 'rt', 's', 're', 'go', 'use', 'y', 'feel', 'name', 'll', 'another', 'via', 'da', 'said', 'user', 'u', 'say', 'got', 'see', 'know', 'im', 'lol', 'try', 'look', 'want', 'never', 'even', 'need', 'still', 'amp', 'us', 'really', 'one', 'real', 'will', 'time', 'day', 'alway', 'Van', 'looks', 'word', 'back', 'yo', 'ya', 'done', 'win', 'new', 'man', 'think', 'give', 'life', 'make', 'ain', 'Happy', 'don', 'let', 'tell', 'good', 'stop', 'call', 'people', 'now', 'card', 'bout', 'going', 'every', 'come', 'Full', "ain't", 'right', 'Oh', '0h', 'year', 'bad', 'gonna', 'called', 'wanna', 'put', 'today', ]: STOPWORDS.add(i) data = sd.get_all() x = data['Tweet'] x = ' '.join(x) x.lower() # Define a function to plot word cloud def plot_cloud(wordcloud): # Set figure size plt.figure(figsize=(40, 30)) # Display image plt.imshow(wordcloud) # No axis details plt.axis("off") # Generate word cloud wordcloud = WordCloud(width=3000, height=2000, max_words=30000, random_state=1, background_color='white', colormap='Dark2_r', collocations=False, stopwords=STOPWORDS).generate(x) # Plot plot_cloud(wordcloud)
if __name__ == '__main__': d = path.dirname(__file__) # Read the whole text. text = open(path.join(d, __fileNamePath)).read() # read the mask / color image # taken from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010 alice_coloring = imread(path.join(d, __imagePath)) wc = WordCloud(font_path=__ttfPath, background_color="black", max_words=2000, mask=alice_coloring, stopwords=STOPWORDS.add("said"), max_font_size=100, random_state=42) # generate word cloud wc.generate(text) # create coloring from image image_colors = ImageColorGenerator(alice_coloring) # show plt.imshow(wc) plt.axis("off") plt.figure() # recolor wordcloud and show # we could also give color_func=image_colors directly in the constructor plt.imshow(wc.recolor(color_func=image_colors))
from wordcloud import WordCloud, STOPWORDS import config # test wordcloud on 1% threshold # lowercase debate titles and concatenate to giant text string text = config.concat_df1[2].apply(lambda x: x.lower()) text = text.str.cat(sep=' ') STOPWORDS.add('question') STOPWORDS.add('bill') STOPWORDS.add('second') STOPWORDS.add('reading') # create the wordcloud wordcloud = WordCloud().generate(text) # generate the image plt.imshow(wordcloud, interpolation='bilinear') plt.axis('off') plt.savefig('../images/wordcloud_{}.jpg'.format(1)) plt.show() # save wordclouds for each threshold 5-25% concat_tsvs = [ config.concat_df5, config.concat_df10, config.concat_df15, config.concat_df20, config.concat_df25 ] percent = 5 for tsv in concat_tsvs:
from env import * # holds all the secrets import praw from ray import Ray from flappy_answers import answers import json import random import os import requests import re import datetime from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator STOPWORDS.add("game") STOPWORDS.add("deleted") STOPWORDS.add("f**k") STOPWORDS.add("f*****g") STOPWORDS.add("localray") STOPWORDS.add("https") STOPWORDS.add("reddit") STOPWORDS.add("create") STOPWORDS.add("wordcloud") STOPWORDS.add("commets") STOPWORDS.add("imgur") # set up a praw instance to use as a listener # let's listen to all comments on r/tampabayrays and highlight those that have the word cash in them #works ray = Ray() def create_wordcloud(url): print("in create_wordlcoud")
#cloud.py from wordcloud import WordCloud, ImageColorGenerator, random_color_func, STOPWORDS import matplotlib.pyplot as plt from os import path d = path.dirname(__file__) #mask mask = plt.imread(path.join(d, "source/dufu.jpg")) print("图片打开成功") #word STOPWORDS.add("杜甫") wc = WordCloud(font_path="/System/Library/Fonts/STHeiti Medium.ttc", mask=mask, width=1000, height=1000, background_color="black", max_font_size=62, min_font_size=5, stopwords=STOPWORDS) print("WordCloud创建成功") #generate import jieba with open("source/《杜甫诗》全集.txt", encoding='gb18030') as f: text = f.read() text = " ".join(jieba.lcut(text)) dict = wc.process_text(text) # print(dict) wc.generate_from_frequencies(dict) # wc.generate(text)
from os import path
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'sozler3.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "mask3.png")))

# add the stop word first; STOPWORDS.add() returns None
STOPWORDS.add("yorulunca")
wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS)

# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "mask_output3.png"))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
from scipy.misc import imread
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pylab as plt

back_color = imread("./dragon.jpg")
font = r"C:\Windows\Fonts\STXINGKA.TTF"

# add the blocked word first; STOPWORDS.add() returns None
STOPWORDS.add("其他")
wc = WordCloud(
    background_color="white",
    max_words=500,
    mask=back_color,       # mask image; when set, width and height are ignored
    max_font_size=80,
    stopwords=STOPWORDS,   # blocked words
    font_path=font,        # font path, avoids garbled CJK glyphs
    random_state=42,       # return a PIL colour for each word
    prefer_horizontal=10)  # ratio of horizontal to vertical word placement

text = open("./dragon.txt", "r", encoding="utf-8").read()
wc.generate(text)

# generate colour values from the background image
image_colors = ImageColorGenerator(back_color)

plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file("test01.png")

plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.show()
wc.to_file("test02.png")
import re import jieba from scipy.misc import imread # 这是一个处理图像的函数 from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator import matplotlib.pyplot as plt #选择背景图片,颜色最好对比分明,不然生成的词图,轮廓不明显 back_color = imread('chenli.jpg') # 解析该图片 # WordCloud各含义参数请点击 wordcloud参数 wc = WordCloud( background_color='white', # 背景颜色 max_words=1000, # 最大词数 mask=back_color, # 以该参数值作图绘制词云,这个参数不为空时,width和height会被忽略 max_font_size=100, # 显示字体的最大值 stopwords=STOPWORDS.add(' '), # 使用内置的屏蔽词,再添加'苟利国' font_path= "C:/Windows/Fonts/msyhbd.ttc", # 显示中文,从属性里复制字体名称,不能直接看windows显示的字体名 random_state=42, # 为每个词返回一个PIL颜色 # width=1000, # 图片的宽 # height=860 #图片的长 ) # 添加自己的词库分词,比如添加'陈粒啊'到jieba词库后,当你处理的文本中含有“陈粒啊”这个词, # 就会直接将'陈粒啊'当作一个词,而不会得到'陈粒'或'粒啊'这样的词 jieba.add_word('陈粒啊') # 打开词源的文本文件,加read以字符串的形式 txt = open('all_outputs.txt', 'r', encoding='UTF-8').read() # 去除文本中的英文,特殊符号等,只保留中文 txt = re.sub(
# Read the whole text. with open(args.words_file, 'r', encoding='utf-8') as wfile: text = wfile.read() # read the mask / color image # taken from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010 cloud_coloring = imread(args.source_image_file) if args.mask_image_file is not None: cloud_mask = imread(args.mask_image_file) else: cloud_mask = cloud_coloring wc = WordCloud(background_color=args.background_color, max_words=2000, mask=cloud_mask, font_path=args.font, stopwords=STOPWORDS.add("said"), mode="RGBA", max_font_size=args.max_font_size, random_state=42) # generate word cloud wc.generate(text) # create coloring from image image_colors = ImageColorGenerator(cloud_coloring) if args.output_image_file is not None: wc.recolor(color_func=image_colors).to_file(args.output_image_file) else: # show #plt.imshow(wc) #plt.axis("off") #plt.figure() # recolor wordcloud and show
# # This file genereates a Chinese word cloud in the shape you wish. from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator import matplotlib.pyplot as plt import jieba import numpy as np from PIL import Image # open the file with "read" attribute with open('comment.txt', 'r') as f: f_text = f.read() #read file res = jieba.cut(f_text) #split chinese characters using jieba package res_text = ' '.join(res) background_img = plt.imread( 'J.jpeg') #read image that you wish to input in the word cloud j_coloring = np.array( Image.open("j2.png")) #handle the image you just read STOPWORDS.add('via') #add stop words #generate the word cloud wc = WordCloud(background_color="white", mask=j_coloring, stopwords=STOPWORDS, font_path='SourceHanSans-Bold.ttf').generate(res_text) image_colors = ImageColorGenerator(j_coloring) #show the image plt.imshow(wc) plt.axis('off') plt.show()
article_url_pmc=[s for s in url_list if 'article' in s] if article_url_pmc: a=pmc_scr(article_url_pmc) else: a=abst(url) #文字列を.txtに変換 f = open('text.txt', 'w') f.write(a) f.close() #.txtからワードクラウドを作成 from wordcloud import WordCloud from wordcloud import STOPWORDS with open('text.txt', 'r') as f: text = f.read() STOPWORDS.add('meta') STOPWORDS.add('content') STOPWORDS.add('name') STOPWORDS.add('description') STOPWORDS.add('meta') STOPWORDS.add('pubmed') STOPWORDS.add('scholar') STOPWORDS.add('google') STOPWORDS.add('pmc') STOPWORDS.add('study') wc = WordCloud( width=480, height=320, background_color="white", prefer_horizontal=1.0, min_word_length=3,
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

STOPWORDS.add('bu')
STOPWORDS.add('mi')
STOPWORDS.add('bir')

text = open('sozler.txt', 'r').read()
foto = np.array(Image.open('barisabi.png'))

wc = WordCloud(background_color='white', collocations=False, mask=foto,
               width=1000, height=1000, stopwords=STOPWORDS)
wc.generate(text)

plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
wc.to_file('barisabimiz.png')
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

with open(r"D:\isaac\Programmierung\Python_uebungen\Teil_05_Alice_in_wonderland.txt",
          "r") as f:
    text = f.read()

wordcloud = WordCloud(width=1920, height=1200)
STOPWORDS.add("said")
STOPWORDS.add("illustration")
wordcloud.generate(text)

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
def plotly_wordcloud(data_frame): """A wonderful function that returns figure data for three equally wonderful plots: wordcloud, frequency histogram and treemap""" complaints_text = list(data_frame[0].dropna().values) ## join all documents in corpus text = " ".join(list(complaints_text)) STOPWORDS.add("movie") STOPWORDS.add("film") word_cloud = WordCloud(stopwords=set(STOPWORDS), max_words=100, max_font_size=90) word_cloud.generate(text) word_list = [] freq_list = [] fontsize_list = [] position_list = [] orientation_list = [] color_list = [] for (word, freq), fontsize, position, orientation, color in word_cloud.layout_: word_list.append(word) freq_list.append(freq) fontsize_list.append(fontsize) position_list.append(position) orientation_list.append(orientation) color_list.append(color) # get the positions x_arr = [] y_arr = [] for i in position_list: x_arr.append(i[0]) y_arr.append(i[1]) # get the relative occurence frequencies new_freq_list = [] for i in freq_list: new_freq_list.append(i * 80) trace = go.Scatter( x=x_arr, y=y_arr, textfont=dict(size=new_freq_list, color=color_list), hoverinfo="text", textposition="top center", hovertext=[ "{0} - {1}".format(w, f) for w, f in zip(word_list, freq_list) ], mode="text", text=word_list, ) layout = go.Layout({ "xaxis": { "showgrid": False, "showticklabels": False, "zeroline": False, "automargin": True, "range": [-100, 250], }, "yaxis": { "showgrid": False, "showticklabels": False, "zeroline": False, "automargin": True, "range": [-100, 450], }, "margin": dict(t=20, b=20, l=10, r=10, pad=4), "hovermode": "closest", }) wordcloud_figure_data = {"data": [trace], "layout": layout} word_list_top = word_list[:25] word_list_top.reverse() freq_list_top = freq_list[:25] freq_list_top.reverse() frequency_figure_data = { "data": [{ "y": word_list_top, "x": freq_list_top, "type": "bar", "name": "", "orientation": "h", }], "layout": { "height": "550", "margin": dict(t=20, b=20, l=100, r=20, pad=4) }, } treemap_trace = go.Treemap(labels=word_list_top, parents=[""] * len(word_list_top), values=freq_list_top) treemap_layout = go.Layout({"margin": dict(t=10, b=10, l=5, r=5, pad=4)}) treemap_figure = {"data": [treemap_trace], "layout": treemap_layout} return wordcloud_figure_data, frequency_figure_data, treemap_figure
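# Usage sketch (not from the original source): a minimal, hypothetical Dash app
# wiring the three figures returned by plotly_wordcloud() above into a layout.
# The sample DataFrame contents and the Dash 2.x import style are assumptions.
import pandas as pd
from dash import Dash, dcc, html

sample_df = pd.DataFrame({0: ["great movie", "terrible film", "great acting"]})
wordcloud_fig, frequency_fig, treemap_fig = plotly_wordcloud(sample_df)

app = Dash(__name__)
app.layout = html.Div([
    dcc.Graph(figure=wordcloud_fig),   # scatter-of-words "cloud"
    dcc.Graph(figure=frequency_fig),   # horizontal frequency bar chart
    dcc.Graph(figure=treemap_fig),     # treemap of the top words
])

if __name__ == "__main__":
    app.run_server(debug=True)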
import numpy as np from PIL import Image import matplotlib.pyplot as plt from wordcloud import WordCloud, STOPWORDS insults = open('./trumpInsults.txt','r').read() trump_mask = np.array(Image.open('trump2.png')) bannedWords = ['said','new','will','york','many','the','total','never','united','states','failing','totally','news','bad','failed','people','senator', 'party','one','state','always','absolutely','governor','make','read','anything','always','good','thing','really','job','lost','show','group', 'nothing','story','television','political','time','cruz','talk','zero','organization', 'guy','even','deal','false','history','looking', 'reporting','look','country','poll','say','ratings','vote','money','former','president','press','republican','reporter','politician','magazine', 'much','debate','debates','times','campaign','presidential','fox','clinton','hillary','bush','credibility','candidate','know','columnist','immigration', 'another','ad','lied','chief','ted','record','newspaper','another','paid','journal','way','trump','got','life', 'last','dead','street','great','clue','jeb'] for word in bannedWords: STOPWORDS.add(word) wc = WordCloud(background_color="white", max_words=1500, mask=trump_mask, stopwords=STOPWORDS) wc.generate(insults) wc.to_file('trumpInsultWC.png')
def mostCommonWordsBar(messages): s = pd.DataFrame(messages) filteredS = s[s.content.str.contains("sent a photo") == False] words = pd.Series(' '.join( filteredS['content']).lower().split()).value_counts() wordsdf = pd.DataFrame({"count": words.values}, index=words.index) #print(wordsdf) STOPWORDS.add('ok') STOPWORDS.add('yea') STOPWORDS.add('ye') STOPWORDS.add('yes') STOPWORDS.add('good') STOPWORDS.add('will') STOPWORDS.add('oh') filteredWords = wordsdf[wordsdf.index.str.lower().isin(STOPWORDS) == False] filteredWords[:50].plot(kind="bar")
from os import path
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'alice.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))

# add the stop word first; STOPWORDS.add() returns None
STOPWORDS.add("said")
wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS)

# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "alice.png"))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
'CLINTON: ', 'HOLT: ', 'WALLACE: ', '[crosstalk]', 'COOPER: ', 'RADDATZ: ', 'QUESTION: ' ]: if people in t: idx = t.find(people) if idx < firstIdx: firstIdx = idx firstPersonAfterTrump = people trumpSaid = t.split(firstPersonAfterTrump)[0] trumpRamble = trumpRamble + trumpSaid print(trumpRamble) dir = path.dirname(__file__) if "__file__" in locals() else os.getcwd() trump_pic = np.array(Image.open(path.join(dir, 'trump.png'))) STOPWORDS.add('re') wc = WordCloud(background_color="white", max_words=4000, mask=trump_pic, stopwords=STOPWORDS, max_font_size=150) wc.generate(trumpRamble) image_colors = ImageColorGenerator(trump_pic) plt.imshow(wc.recolor(color_func=image_colors), interpolation='bilinear') plt.axis('off') plt.show()
def TweetWordCloud(inputfile, lang, outputimage):
    # INPUT:
    #   inputfile: A csv (or other format) file with a collection of tweet text and language information.
    #   lang: The target language of the word cloud.
    #         (Use the twitter codes, such as 'en' for English, 'fr' for French etc.)
    # OUTPUT:
    #   outputimage: A png file with the word cloud image.
    # EXAMPLE CALL:
    #   TweetWordCloud('DavidBowieTributes.csv', 'en', 'davidbowietributes.png')
    # os.chdir('/home/kaushi/Desktop/Python_programming/Twitter/')
    colnames = ['text', 'language']
    tweetdf = pd.read_csv(inputfile, header=0, names=colnames)
    tweetdf['text'] = tweetdf['text'].astype(str)
    tweetdf2 = tweetdf[tweetdf['language'] == lang]  # Only select the English tweets, for example.
    tweetdf2.reset_index(drop=True)  # Reset indices, otherwise further manipulations will encounter issues.

    # Construct the word cloud.
    words = ' '.join(tweetdf2['text'])
    # NOTES FOR IMPROVEMENT: consider how to remove emoticons,
    # unicode characters, selective punctuation etc.
    wordfilter = " ".join([word for word in words.split()
                           if 'http' not in word           # Take out urls
                           and not word.startswith('@')    # Take out twitter handles.
                           and word != 'RT'                # Take out retweet tags.
                           and word != 'None'              # Take out place holders and null values.
                           and word != 'nan'])

    # Load the twitter mask (or any other mask of preference.)
    twitter_mask = imread('twitter_mask.png', flatten=True)
    # add the stop word first; STOPWORDS.add() returns None
    STOPWORDS.add("will")
    wordcloud = WordCloud(font_path='/home/kaushi/customfonts/actionis.ttf',
                          stopwords=STOPWORDS, background_color='black',
                          width=3000, height=3000, min_font_size=8,
                          relative_scaling=0.3, mask=twitter_mask).generate(wordfilter)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(outputimage, dpi=1000)
    plt.show()
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'summary.txt')).read()

# add the stop word first; STOPWORDS.add() returns None
STOPWORDS.add("said")
wc = WordCloud(background_color="black", width=1280, height=720,
               prefer_horizontal=0.8, font_path='Aller_Rg.ttf',
               max_words=50, stopwords=STOPWORDS)

# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "word_cloud.png"))

# show
plt.imshow(wc)
plt.axis("off")
plt.show()
continue for word in words: # print word review_words_per_bin[i].append(word) if beer_key not in beer_key_word_bags: beer_key_word_bags[beer_key] = [] beer_key_word_bags[beer_key].append(word) else: beer_key_word_bags[beer_key].append(word) print "words reviwing bin", i, ":", len(review_words_per_bin[i]), "reviews:", len(review_corpus_per_bin[i]) for key, val in beer_key_word_bags.iteritems(): print key, len(val) STOPWORDS.add("malt") STOPWORDS.add("taste") STOPWORDS.add("flavor") STOPWORDS.add("carbonation") STOPWORDS.add("had") STOPWORDS.add("hop") STOPWORDS.add("head") STOPWORDS.add("good") STOPWORDS.add("nice") STOPWORDS.add("light") STOPWORDS.add("dark") STOPWORDS.add("hops") STOPWORDS.add("white") # Create global beer-clouds for i in range(0, 3):
if __name__ == "__main__":
    d = path.dirname(__file__)

    # read in text
    # text = open(path.join(d, 'top_words.txt')).read()
    file_name = './data/top_words.txt'
    with open(file_name) as f:
        text = f.readlines()

    # read the mask image
    word_mask = np.array(Image.open(path.join(d, "./figures/circle_mask2.png")))

    # construct wordcloud; add the stop word first, since STOPWORDS.add() returns None
    STOPWORDS.add("and")
    wc = WordCloud(background_color="white", max_words=100, mask=word_mask,
                   stopwords=STOPWORDS)

    print("generating word cloud ...")
    for topic_idx in range(len(text)):
        # generate word cloud for this topic
        wc.generate(text[topic_idx])
        # store to file
        wc.to_file(path.join(d, "./figures/topic" + str(topic_idx) + ".png"))

    # generate plots
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
def loaddata(Text,mods): #read the preprocessed data from pickle file df = pd.read_pickle("corpus.pkl") STOPWORDS.add("rt") STOPWORDS.add("s") STOPWORDS.add("u") STOPWORDS.add("amp") STOPWORDS.add("th") STOPWORDS.add("will") STOPWORDS.add("t") STOPWORDS.add("m") STOPWORDS.add("today") #split the data into train and test set from sklearn.model_selection import train_test_split train, test = train_test_split(df, test_size=0.3, train_size=0.7, random_state=14) #performing stemming lt = LancasterStemmer() def token(text): txt = nltk.word_tokenize(text.lower()) return [lt.stem(word) for word in txt] #document term matrix using Tfidf vectorizer tfv = TfidfVectorizer(tokenizer=token,stop_words=STOPWORDS,analyzer=u'word', min_df=4) X_train_tfv = tfv.fit_transform(train['clean_tweet']) X_test_tfv = tfv.transform(test['clean_tweet']) X_train_tfv = pd.DataFrame(X_train_tfv.toarray(), columns=tfv.get_feature_names()) X_test_tfv = pd.DataFrame(X_test_tfv.toarray(), columns=tfv.get_feature_names()) if(mods=="MNB"): st.success("Performing MNB Classification") #build the model nb = MultinomialNB() # Train the model nb.fit(X_train_tfv, train['Party_log']) #transform the entered text into document term matrix vec_text = tfv.transform(Text).toarray() #predicting the value for newly entered tweet result = nb.predict(vec_text) #if result is 1 then democrat else republican else: st.success("Performing Logistic Regression") #build the model lr = LogisticRegression() # Train the model lr.fit(X_train_tfv, train['Party_log']) #transform the entered text into document term matrix vec_text = tfv.transform(Text).toarray() #predicting the value for newly entered tweet result = lr.predict(vec_text) #if result is 1 then democrat else republican if result == 1: return "demo" elif result == 0: return "rep"
#!/usr/bin/env python3 ''' make text find _build/text/ -name '*.txt' | xargs cat > _build/words.txt ''' from os import path from wordcloud import WordCloud, STOPWORDS d = path.dirname(__file__) STOPWORDS.add('will') STOPWORDS.add('example') # Read the whole text. text = open(path.join(d, '_build/words.txt'), encoding='utf-8').read() wordcloud = WordCloud(width=1920, height=1080, max_words=200).generate(text) wordcloud.to_file('word-cloud.png')
def get_coherence(topic): try: cp = palmetto.get_coherence(topic, coherence_type="cp") ca = palmetto.get_coherence(topic, coherence_type="ca") return cp+ca except: return -1 if __name__ == "__main__": vis_dir = './visualization/' if not os.path.exists(vis_dir): os.makedirs(vis_dir) for word in stop_words: STOPWORDS.add(word) if args.model == 'btm': doc_pt = args.fname dwid_pt = './temp/doc_wids.txt' voca_pt = './temp/voca.txt' model_dir = './temp/model/' if not os.path.exists(model_dir): os.makedirs(model_dir) indexFile(doc_pt, dwid_pt) write_w2id(voca_pt) vocab_size = len(w2id) # encode documents and build vocab alpha = 50 / args.K
import json
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from scipy.misc import imread

name = 'reyarch'
with open('users/{0}/first_words.json'.format(name), 'r') as fs:
    first_words = json.load(fs)
with open('users/{0}/relation_dict.json'.format(name), 'r') as fs:
    relation_dict = json.load(fs)

all_words = sorted(relation_dict, key=lambda x: len(relation_dict[x]))
text = open('all_words.txt', 'r').read()
pepe_mask = imread('pepe.jpeg')

# add the stop word first; STOPWORDS.add() returns None
STOPWORDS.add("heart")
wc = WordCloud(background_color="white", max_words=2000, mask=pepe_mask,
               stopwords=STOPWORDS)
wc.generate(text)
wc.to_file('pepecloud.png')

plt.imshow(wc)
plt.axis("off")
plt.show()
remove = [
    "interviewer",
    "interviewee",
    "shapiro",
    "inaudible",
    "heather",
    "castingwords",
    "par",
    "line",
    "silence",
    "course",
    "coursera",
    "courses",
    "lot",
    "like",
]

STOPWORDS.add("said")
STOPWORDS.add("course")
STOPWORDS.add("courses")
STOPWORDS.add("coursera")
STOPWORDS.add("really")
STOPWORDS.add("one")

text = " ".join(filter(lambda x: x.lower() not in remove, text.split()))

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg

# In[22]:

alice_mask = np.array(Image.open(path.join(d, "stormtrooper_mask.png")))
# coding: utf-8 import jieba from scipy.misc import imread # 这是一个处理图像的函数 from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator import matplotlib.pyplot as plt back_color = imread('F:\WordCloud-master\WordCloud-master\o_003.jpg') # 解析该图片 wc = WordCloud(background_color='white', # 背景颜色 max_words=1000, # 最大词数 mask=back_color, # 以该参数值作图绘制词云,这个参数不为空时,width和height会被忽略 max_font_size=100, # 显示字体的最大值 stopwords=STOPWORDS.add('我国'), # 使用内置的屏蔽词,再添加'苟利国',需要增加其他屏蔽词时, #加入set,STOPWORDS.add(('我国','国家','祖国')) font_path="F:\WordCloud-master\WordCloud-master\static\simheittf\simhei.ttf", # 解决显示口字型乱码问题, #可进入C:/Windows/Fonts/目录更换字体 random_state=42, # 为每个词返回一个PIL颜色 # width=1000, # 图片的宽 # height=860 #图片的长) # WordCloud各含义参数请点击 wordcloud参数 # 添加自己的词库分词,比如添加'金三胖'到jieba词库后,当你处理的文本中含有金三胖这个词, # 就会直接将'金三胖'当作一个词,而不会得到'金三'或'三胖'这样的词 jieba.add_word('金三胖') # 打开词源的文本文件 # text = open('F:\WordCloud-master\WordCloud-master\cnword.txt',encoding='utf-8').read() with open('F:\WordCloud-master\WordCloud-master\cnword.txt','r',encoding='UTF-8') as f: text = f.read() f.close() # 该函数的作用就是把屏蔽词去掉,使用这个函数就不用在WordCloud参数中添加stopwords参数了 # 把你需要屏蔽的词全部放入一个stopwords文本文件里即可 def stop_words(texts): words_list = [] word_generator = jieba.cut(texts, cut_all=False) # 返回的是一个迭代器
# load config file config = SafeConfigParser() script_dir = path.dirname(__file__) config_file = path.join(script_dir, 'config/settings.cfg') config.read(config_file) # tell script where to put the JSON files returned logfile = config.get('files','logfile') listfile = config.get('files','listfile') outfolder = config.get('files','outfolder') # get usernames users = get_users(listfile) # add stop words STOPWORDS.add('https') # create a word cloud for each user for user in users: # get image masks for different users # from http://masterkoyo.deviantart.com/art/Template-Donald-Trump-35925789 # from https://openclipart.org/detail/211473/jeb-bush-outlines # from http://www.spstencils.com/shop/politics/hilary-clinton-stencil/ image_mask = None try: image_mask = imread(path.join(script_dir, ".".join([user,'jpg']))) print user except IOError: print 'Cannot open file '+ user + '.jpg under directory ' + script_dir
for i in range(len(tweets_data)):
    if tweets['lang'].loc[i] == 'en':
        text += tweets['text'].loc[i]

str1 = ""
text2 = TextBlob(text)
for word, pos in text2.tags:
    if pos == 'JJ' and word.isalpha():
        check = word.spellcheck()
        if [x[1] for x in check] == [1.0]:
            str1 += word
            str1 += " "

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

alice_mask = np.array(Image.open("cloud.png"))
# add the stop words first; STOPWORDS.add() returns None
STOPWORDS.add("rt")
STOPWORDS.add("https")
wc = WordCloud(background_color="white", max_words=100, mask=alice_mask,
               stopwords=STOPWORDS, relative_scaling=0.5)

# generate word cloud
wc.generate(str1)

# show
plt.imshow(wc)
plt.axis("off")
plt.show()
#!/usr/bin/env python3 # -*- coding: utf-8 -*- #Author:Winston.Wang import jieba from scipy.misc import imread # 这是一个处理图像的函数,读取图像 from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator #词云生成库 import matplotlib.pyplot as plt #绘制库 back_color = imread('C:/Users/wzx/Desktop/word.jpg') # 解析该图片 # 使用内置的屏蔽词,再添加'损害' STOPWORDS.add('《共·惨党宣言》') #设置字体 font = 'C:/Windows/Fonts/simhei.ttf' wc = WordCloud( background_color='white', # 背景颜色 max_words=1000, # 最大词数 mask=back_color, # 以该参数值作图绘制词云,这个参数不为空时,width和height会被忽略 max_font_size=80, # 显示字体的最大值 stopwords=STOPWORDS, font_path=font, # 解决显示口字型乱码问题,可进入C:/Windows/Fonts/目录更换字体 random_state=42, # 为每个词返回一个PIL颜色 # width=1000, # 图片的宽 # height=860 #图片的长 ) # WordCloud各含义参数请点击 wordcloud参数 # 添加自己的词库分词,比如添加'中国改革开放'到jieba词库后,当你处理的文本中含有中国改革开放这个词不会拆, jieba.add_word('中国改革开放') # 打开词源的文本文件 with open('cnword.txt', 'r', encoding="utf-8") as f:
from os import path
from scipy.misc import imread
# import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'hot_key.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = imread(path.join(d, "alice_mask.png"))

# add the stop word first; STOPWORDS.add() returns None
STOPWORDS.add("Qq")
wc = WordCloud(font_path="simhei.ttf", background_color="white", max_words=2000,
               mask=alice_mask, stopwords=STOPWORDS)

# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "alice_Chinese.png"))

# show
# plt.imshow(wc)
# plt.axis("off")
# plt.figure()
# plt.imshow(alice_mask, cmap=plt.cm.gray)
# plt.axis("off")
# plt.show()
text = text.replace("冯世杰说", "冯世杰") text = text.replace("叶晓明说", "叶晓明") # 下面两种都可以读取图片 因为图片本质上就是一个二维数组 # mask = np.array(Image.open(os.path.join(d,'timg.jpg'))) mask = imread(os.path.join(d, 'timg.jpg')) # 设置背景图片 # 生成我们的一个word云印象 # max_font_size=40 设置最大字体是40 # random_state=2 配色方案 # mask=mask 图片关联 # stopwords 屏蔽某些词 wc = WordCloud(font_path=font, max_words=200, mask=mask, stopwords=STOPWORDS.add("强奸"), background_color='green') wc.generate(text) # generate()根据我们的文本生成词云 image_colors = ImageColorGenerator(mask) # 从背景图片生成颜色值 plt.imshow(wc.recolor(color_func=image_colors)) # 显示我们生成图片 根据背景颜色设置词云文字颜色 # plt.imshow() # 显示我们生成图片 plt.axis("off") plt.show() # 生成可视化图片 wc.to_file("text2.png") # """ # wordcloud的所有参数 # # font_path : string //字体路径,需要展现什么字体就把该字体路径+后缀名写上,如:font_path = '黑体.ttf' # width : int (default=400) //输出的画布宽度,默认为400像素 # height : int (default=200) //输出的画布高度,默认为200像素