def wrd_cld(toks):
    """Visualize word frequency from a list of tokens.

    Tailored to the corpus of Kaggle's toxic comment challenge:
    'article', 'page', and 'wikipedia' were added to the stop words
    iteratively because they appear in most comment strings.

    toks -- tokens produced by tokenization
    """
    import string
    import matplotlib.pyplot as plt
    from nltk.corpus import stopwords
    from wordcloud import WordCloud

    # Start from the English stop words, plus all punctuation.
    stopwords_list = stopwords.words('english')
    stopwords_list += list(string.punctuation)
    # Ad hoc additions: tokenizer artifacts, contraction fragments, and
    # strings that do not appear to contribute to the corpus.
    stopwords_list += ["''", "``", "'s", "\\n\\n", '...', 'i\\', '\\n', '•',
                       'i', 'the', "'m", "'ve", "don\\'t", "'re", "\\n\\ni",
                       'it\\', "'ll", 'you\\', "'d", "n't", '’',
                       'article', 'page', 'wikipedia']

    # Avoid naming the instance `wordcloud`, which shadows the module.
    cloud = WordCloud(stopwords=stopwords_list, collocations=False)
    cloud.generate(','.join(toks))
    plt.figure(figsize=(12, 12), facecolor=None)
    plt.imshow(cloud)
    plt.axis('off')
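# A minimal usage sketch for wrd_cld; `comments` is a hypothetical list of
# raw comment strings, and the nltk 'punkt' and 'stopwords' resources are
# assumed to be downloadable.
import nltk
from nltk import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
comments = ["This article needs work.", "Great page, thanks!"]
tokens = [tok.lower() for text in comments for tok in word_tokenize(text)]
wrd_cld(tokens)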
def draw_cloud(dataframe, column):
    """Generate a word cloud image from one text column of a DataFrame."""
    from wordcloud import WordCloud

    # Join the different processed strings in the column together.
    long_string = ','.join(list(dataframe[column]))
    # Create a WordCloud object.
    wordcloud = WordCloud(background_color="white", max_words=5000,
                          contour_width=6, contour_color='steelblue')
    # Generate the word cloud and return it as a PIL image.
    wordcloud.generate(long_string)
    return wordcloud.to_image()
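# Hypothetical usage sketch: `papers` is assumed to be a pandas DataFrame
# with a preprocessed text column named 'title_processed'.
import pandas as pd

papers = pd.DataFrame({'title_processed': ['deep learning', 'topic models']})
img = draw_cloud(papers, 'title_processed')
img.show()  # a PIL Image; could also be saved with img.save('cloud.png')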
def get_wordcloud(image, font, sw, word, result):
    """Draw a masked word cloud recolored from the mask image and save it."""
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, ImageColorGenerator

    wordcloud = WordCloud(scale=15, font_path=font, mask=image,
                          stopwords=sw, background_color='white',
                          max_words=80000, max_font_size=10, random_state=42)
    wordcloud.generate(word)
    # Recolor the words with the colors of the mask image. (The original
    # called plt.imshow a second time, overwriting the recolored cloud.)
    img_colors = ImageColorGenerator(image)
    plt.imshow(wordcloud.recolor(color_func=img_colors))
    plt.axis('off')
    plt.show()
    wordcloud.to_file(result)
    print('Task Done!')
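# Hypothetical usage sketch for get_wordcloud; 'logo.png' is an illustrative
# RGB image used as both mask and color source, and font=None falls back to
# the font bundled with the wordcloud package.
import numpy as np
from PIL import Image
from wordcloud import STOPWORDS

mask = np.array(Image.open('logo.png'))
get_wordcloud(mask, None, set(STOPWORDS),
              'some long text to visualize as a cloud', 'masked_cloud.png')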
def data_visualisation(data):
    """Word cloud of the 'genres' column ('|'-separated genre names)."""
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    # Split each genre string on '|' and join all genres into one string.
    # (str(i) on each row would stringify the whole list, brackets and all.)
    genre = data['genres'].str.split('|')
    str2 = ','.join(','.join(g) for g in genre.dropna())

    # max_words is a WordCloud parameter; setting plt.max_words does nothing.
    wordcloud = WordCloud(max_words=200)
    wordcloud.generate(str2)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
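# Hypothetical usage sketch with a MovieLens-style movies table, where the
# 'genres' column holds '|'-separated genre names.
import pandas as pd

movies = pd.DataFrame({'genres': ['Action|Comedy', 'Drama', 'Comedy|Romance']})
data_visualisation(movies)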
def generate_wordcloud(logo_image, text, stopwords_list, portal, today, path):
    """Draw a word cloud masked and colored by a logo image, then save it."""
    import numpy as np
    import matplotlib.pyplot as plt
    from PIL import Image
    from wordcloud import WordCloud, ImageColorGenerator

    custom_mask = np.array(Image.open(logo_image))
    wordcloud = WordCloud(stopwords=stopwords_list, background_color="white",
                          mask=custom_mask)
    wordcloud.generate(text)
    # Recolor the words using the colors of the mask image.
    image_colors = ImageColorGenerator(custom_mask)
    wordcloud.recolor(color_func=image_colors)
    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.savefig(path + "/wordclouds/wordcloud_" + portal + "_" + today + ".png")
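# Hypothetical usage sketch for generate_wordcloud; the file name, portal
# name, and text are illustrative, and the path + "/wordclouds/" directory
# is assumed to exist before saving.
from datetime import date
from wordcloud import STOPWORDS

generate_wordcloud('logo.png', 'long scraped article text goes here',
                   set(STOPWORDS), 'example_portal',
                   date.today().isoformat(), '.')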
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def generateWordCloud(corpus: str, cmap: str) -> WordCloud:
    """Return a WordCloud object generated from the corpus and color map
    parameter."""
    wordcloud = WordCloud(background_color='black', width=800, height=400,
                          colormap=cmap, max_words=180, contour_width=3,
                          max_font_size=80, contour_color='steelblue',
                          random_state=0)
    wordcloud.generate(corpus)
    # Open the figure before imshow; the original called plt.figure() after
    # imshow, which created an extra blank figure.
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    return wordcloud
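# Hypothetical usage sketch; any matplotlib colormap name (e.g. 'viridis')
# works for cmap, and the output file name is illustrative.
cloud = generateWordCloud('topic detection on large document sets', 'viridis')
cloud.to_file('corpus_cloud.png')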
# Visualisation is key to understanding whether we are still on the right track!
# In addition, it allows us to verify whether we need additional preprocessing
# before further analyzing the text data.

# Import the word cloud library and matplotlib.
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Join the different processed titles together.
long_string = " ".join(papers['title_processed'])

# Create a WordCloud object and generate a word cloud.
# (Importing as `from wordcloud import WordCloud` avoids the original's
# `wordcloud = wordcloud.WordCloud()`, which shadowed the module.)
wordcloud = WordCloud()
wordcloud.generate(long_string)

# Visualize the word cloud.
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# ## 6. Prepare the text for LDA analysis
# The main text analysis method that we will use is latent Dirichlet
# allocation (LDA). LDA can perform topic detection on large document sets,
# determining what the main 'topics' are in a large unlabeled set of texts.
# A 'topic' is a collection of words that tend to co-occur often. The
# hypothesis is that LDA might be able to clarify what the different topics
# in the research titles are; these topics can then be used as a starting
# point for further analysis.
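# A minimal LDA sketch, assuming scikit-learn is available. CountVectorizer
# and LatentDirichletAllocation are real sklearn classes, but the parameter
# values and the reuse of papers['title_processed'] here are illustrative only.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Build a bag-of-words matrix from the processed titles.
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(papers['title_processed'])

# Fit an LDA model with a hypothetical choice of 10 topics.
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(count_data)

# Print the ten highest-weighted words for each topic.
words = count_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top = [words[i] for i in topic.argsort()[:-11:-1]]
    print(f"Topic {topic_idx}: {' '.join(top)}")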
import tweepy
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from Credentialstwitter import *

auth = tweepy.OAuthHandler(ConsumerKey, ConsumerSecret)
api = tweepy.API(auth)

# Collect the user's most recent tweets (excluding retweets) into one string.
text = " "
tweets = api.user_timeline(screen_name="maiconkusterkkk", count=1000,
                           include_rts=False, tweet_mode="extended")
for tweet in tweets:
    # print(tweet.full_text)
    text = text + " " + tweet.full_text

# Extend the default stop word set with URL fragments plus common Portuguese
# and German function words (duplicates in the original list removed).
STOPWORDS.update([
    "http", "https", "co",
    # Portuguese
    "da", "de", "em", "na", "se", "às", "como", "que", "para", "os", "dos",
    "das", "assim", "quais", "feira", "um", "uma", "mais", "ao", "por",
    "pelo", "pela", "nosso", "nossa", "é",
    # German
    "zu", "die", "der", "dem", "und", "auf", "ein", "nicht", "von", "wie",
    "wird", "daß", "dass", "mit", "für", "Sie", "sie", "er", "noch", "vor",
    "ist", "bei", "wenn", "sich", "den", "hat", "des", "diese", "diesen",
    "dieses", "dieser", "über", "eine", "einer", "einen", "eines", "auch",
    "es", "werden", "im", "als", "uns", "sehr", "aber", "einem", "zur",
    "nun", "mehr", "zum", "durch", "sind", "kann", "man", "aus", "nur",
    "haben", "will",
])

# WordCloud defaults to the module-level STOPWORDS set, so the update above
# takes effect when the cloud is generated.
wordcloud = WordCloud(width=1920, height=1200)
wordcloud.generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
import re
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Read the source text (a GB2312-encoded file). Do not name the file handle
# `str`, which shadows the builtin.
file_ad = r"439-黄帝内经太素[204].txt"
with open(file_ad, encoding='gb2312') as f:
    data = f.read()

# Extract the bracketed annotations, i.e. the text between [ and ].
pattern = re.compile(r'(?<=\[)[^\[.]+?(?=\])')
search = pattern.findall(data)
# print(search)

# Segment each match with jieba and collect the words; generate() expects a
# string, not the generator returned by jieba.cut, and the original only ever
# clouded the last match.
words = []
for group in search:
    seg_list = list(jieba.cut(group, cut_all=False))
    print(",".join(seg_list))
    words.extend(seg_list)

if words:
    # wordcloud = WordCloud(max_font_size=40, relative_scaling=.5)
    wordcloud = WordCloud(font_path=u'./static/simheittf/simhei.ttf',
                          background_color="black", margin=5,
                          width=1800, height=800)
    wordcloud.generate(" ".join(words))
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()