def wrd_cld(toks):
    '''Visualize word frequency from a list of tokens as a word cloud.

    Tailored to the corpus of the Kaggle toxic-comments challenge:
    'article', 'page' and 'wikipedia' were added to the stop words
    iteratively because they appear in most comment strings.

    toks - tokens produced by tokenization (iterable of strings)
    '''
    import string
    from nltk.corpus import stopwords
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    # Standard English stop words plus single punctuation characters.
    stopwords_list = stopwords.words('english')
    stopwords_list += list(string.punctuation)

    # Ad-hoc additions: tokenizer artifacts (contraction pieces, escaped
    # newlines, bullets) and corpus-specific boilerplate words.
    # (The original tuple listed 'i\\' twice; the duplicate was removed.)
    stopwords_list += ("''", "``", "'s", "\\n\\n", '...', 'i\\', '\\n',
                       '•', "i", 'the', "'m", "'ve", "don\\'t",
                       "'re", "\\n\\ni", "it\\", "'ll", 'you\\', "'d", "n't",
                       '’', 'article', 'page', 'wikipedia')

    # Use a distinct variable name so the `wordcloud` module is not shadowed.
    cloud = WordCloud(stopwords=stopwords_list, collocations=False)
    cloud.generate(','.join(toks))
    plt.figure(figsize=(12, 12), facecolor=None)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()  # actually render the figure (missing in the original)
def draw_cloud(dataframe, column):
    """Build a word-cloud image from the text in one dataframe column.

    dataframe - pandas DataFrame holding the processed text
    column    - name of the column whose values are joined into the cloud
    """
    # Concatenate every processed title into one comma-separated string.
    corpus = ','.join(list(dataframe[column]))

    # Configure the cloud, generate it from the corpus, and return a PIL image.
    cloud = WordCloud(background_color="white",
                      max_words=5000,
                      contour_width=6,
                      contour_color='steelblue')
    cloud.generate(corpus)
    return cloud.to_image()
# Example #3
def get_wordcloud(image, font, sw, word, result):
    """Render a masked word cloud, recolored from the mask image, and save it.

    image  - numpy array of the mask image; also drives the recoloring
    font   - path to a .ttf font file
    sw     - stop words to exclude
    word   - text the cloud is generated from
    result - output file path for the rendered image
    """
    wordcloud = WordCloud(scale=15, font_path=font, mask=image, stopwords=sw,
                          background_color='white', max_words=80000,
                          max_font_size=10, random_state=42)
    wordcloud.generate(word)

    # Recolor the cloud to match the colors of the mask image.
    # BUGFIX: the original followed this with a second plt.imshow(wordcloud),
    # which overwrote the recolored rendering with the default palette.
    img_colors = ImageColorGenerator(image)
    plt.imshow(wordcloud.recolor(color_func=img_colors))
    plt.axis('off')
    plt.show()
    wordcloud.to_file(result)
    print('Task Done!')
def data_visualisation(data):
    """Show a word cloud of the movie genres found in *data*.

    data - pandas DataFrame with a 'genres' column of '|'-separated strings
    """
    # Split each genre string into a list, e.g. 'Action|Drama' -> ['Action', 'Drama'].
    genre = data['genres'].str.split('|')
    corpus = ','.join(str(i) for i in genre)
    print(corpus)

    # BUGFIX: the original set `plt.max_words = 200`, which only attaches an
    # unused attribute to the pyplot module. max_words belongs on the
    # WordCloud constructor (200 is also its default, so output is unchanged).
    cloud = WordCloud(max_words=200)
    cloud.generate(corpus)

    plt.figure()
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
# Example #5
def generate_wordcloud(logo_image, text, stopwords_list, portal, today, path):
    """Draw a masked, mask-colored word cloud and save it as a PNG.

    logo_image     - path to the logo image used as the cloud mask
    text           - corpus the cloud is generated from
    stopwords_list - words to exclude
    portal, today  - strings embedded in the output file name
    path           - directory containing the 'wordclouds' folder
    """
    mask = np.array(Image.open(logo_image))

    cloud = WordCloud(stopwords=stopwords_list,
                      background_color="white",
                      mask=mask)
    cloud.generate(text)

    # Take the word colors from the logo image itself.
    cloud.recolor(color_func=ImageColorGenerator(mask))

    plt.figure(figsize=(10, 10))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    out_file = path + "/wordclouds/wordcloud_" + portal + "_" + today + ".png"
    plt.savefig(out_file)
# Example #6
def generateWordCloud(corpus: str, cmap: str) -> WordCloud:
    """
    Return a Word Cloud object generated from the corpus and color map parameter.

    corpus - text the cloud is built from
    cmap   - matplotlib colormap name used to color the words
    """
    cloud = WordCloud(background_color='black',
                      width=800,
                      height=400,
                      colormap=cmap,
                      max_words=180,
                      contour_width=3,
                      max_font_size=80,
                      contour_color='steelblue',
                      random_state=0)

    cloud.generate(corpus)

    # BUGFIX: open a fresh figure *before* drawing. The original called
    # plt.figure() after imshow, drawing onto whatever figure was current
    # and leaving a blank extra figure behind.
    plt.figure()
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")

    return cloud
# Example #7
# Visualisation is key to understanding whether we are still on the right track! In addition,
# it allows us to verify whether we need additional preprocessing before further analyzing the text data.

# Import the wordcloud library
import wordcloud

# Join the different processed titles together.
long_string = " ".join(papers['title_processed'])
# print (long_string)

# Create a WordCloud object.
# BUGFIX: use a distinct variable name — the original rebound the name
# `wordcloud` to the instance, shadowing the module it was imported from.
cloud = wordcloud.WordCloud()

# Generate a word cloud
cloud.generate(long_string)

# Visualize the word cloud
cloud.to_image()
import matplotlib.pyplot as plt

plt.figure()
plt.imshow(cloud, interpolation="bilinear")
plt.show()

# ## 6.  Prepare the text for LDA analysis
# The main text analysis method that we will use is latent Dirichlet allocation (LDA).
# LDA is able to perform topic detection on large document sets, determining what the main 'topics' are in a large unlabeled set of texts.
# A 'topic' is a collection of words that tend to co-occur often.
# The hypothesis is that LDA might be able to clarify what the different topics in the research titles are.
# These topics can then be used as a starting point for further analysis.
# Example #8
import tweepy
import wordcloud
from Credentialstwitter import *
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Authenticate against the Twitter API with the imported credentials.
auth = tweepy.OAuthHandler(ConsumerKey, ConsumerSecret)
api = tweepy.API(auth)

# Fetch the user's recent tweets (full text, retweets excluded).
tweets = api.user_timeline(screen_name="maiconkusterkkk",
                           count=1000,
                           include_rts=False,
                           tweet_mode="extended")

# Join the tweet texts once instead of quadratic `text = text + ...`
# string concatenation in a loop.
text = " " + " ".join(tweet.full_text for tweet in tweets)

wordcloud = WordCloud(width=1920, height=1200)
# Portuguese and German filler words (plus URL fragments) to ignore; updating
# the shared STOPWORDS set before generate() is how WordCloud picks them up
# when no stopwords argument is given.
STOPWORDS.update(["hppt", "https", "co", "da","de","em","na","se","às","como","que", "para", "os", "dos", "das", "assim", "quais","feira","um", "uma", "mais", "ao", "por","pelo","pela",\
    "como", "nosso", "nossa", "zu", "das", "zu","die","der","dem","und","auf","ein","nicht","von","wie","wird", "daß", "dass","mit","für", "Sie","sie","er","noch","vor","ist", "bei",\
    "wenn", "sich", "den", "hat", "des", "diese", "diesen", "dieses", "dieser", "über", "eine", "einer", "einen", "eines", "auch", "es", "werden", "auch", "im", "als", "uns", "sehr",\
    "aber", "einem", "zur", "nun", "mehr", "zum", "durch", "sind", "kann", "man", "aus", "nur", "haben", "will", "é" ])
wordcloud.generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# Example #9
import matplotlib.pyplot as plt

# Read the whole source text (a GB2312-encoded classical Chinese text).
# BUGFIX: the original bound the open file to the name `str`, shadowing the
# builtin; open it directly in the `with` statement instead.
file_ad = r"439-黄帝内经太素[204].txt"
with open(file_ad, encoding='gb2312') as f:
    data = f.read()

# Extract every bracketed annotation: text between '[' and ']'.
pattern = re.compile(r'(?<=\[)[^\[.]+?(?=\])')
search = pattern.findall(data)

# Segment each annotation with jieba and collect all words.
# BUGFIX: the original passed the already-exhausted generator of only the
# *last* group to WordCloud.generate(), which expects a string — collect
# every group's segments and join them instead.
segments = []
for group in search:
    words = list(jieba.cut(group, cut_all=False))
    print(",".join(words))
    segments.extend(words)

# wordcloud = WordCloud(max_font_size=40, relative_scaling=.5)
wordcloud = WordCloud(font_path=u'./static/simheittf/simhei.ttf',
                      background_color="black",
                      margin=5,
                      width=1800,
                      height=800)

wordcloud = wordcloud.generate(" ".join(segments))

plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()