Example #1
def wordcloud(datafile):

    #remove stop words, the most common words in a language
    vectorizer=CountVectorizer(stop_words='english')

    for word in vectorizer.get_stop_words():
        STOPWORDS.add(word)
    STOPWORDS.add("said")

    pony_mask = np.array(Image.open("../pinkyB.jpg"))
    wc = WordCloud(background_color="black", max_words=2000, mask=pony_mask, stopwords=STOPWORDS)

    #init dictionary with the five categories
    categoriesSet = set(datafile["Category"])
    categoriesDict = dict.fromkeys(categoriesSet,"")

    #Conditional Selection
    # business = datafile.ix[datafile["Category"]=="Business"]
    # print business["Content"].size

    #fill the dict with data from the csv
    for index, row in datafile.iterrows():
        categoriesDict[row["Category"]] += str(row["Content"])

    for category, text in categoriesDict.iteritems():
        wc.generate(text)
        image = wc.to_image()
        image.save("../wordcloud/wordcloud_" + category + ".jpg")
    return
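Note: Example #1 instantiates a CountVectorizer only to reach scikit-learn's built-in English stop word list. If that is all that is needed, the list can be imported directly; a minimal sketch, assuming the same merge into wordcloud's STOPWORDS is wanted:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from wordcloud import STOPWORDS

# Merge scikit-learn's English stop words into wordcloud's stopword set.
STOPWORDS.update(ENGLISH_STOP_WORDS)
STOPWORDS.add("said")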
Example #2
def wordCloud(text_array,name,keyword=""):
	new_text_arr=[]
	if keyword != "":
		keyword=keyword.split(" ")[1]
	for text in text_array:
		if keyword in text:
			new_text_arr.append(text)

	text_array=new_text_arr

	cloud_text=""
	for text in text_array:
		cloud_text+=text+" "

	m_stopwords=['police','traffic','sir']

	for word in m_stopwords:
		STOPWORDS.add(word)

	image_mask = os.path.join(BASE_DIR, 'static/tool/img/nebula.png')
	coloring = imread(image_mask)
	
	wordcloud = WordCloud(stopwords=STOPWORDS,background_color="white",mask=coloring,ranks_only=True,max_words=50).generate(cloud_text)
	filename=os.path.join(BASE_DIR, 'static/tool/img/'+name+'.png')

	image_colors = ImageColorGenerator(coloring)
	wordcloud.recolor(color_func=image_colors)
	wordcloud.to_file(filename)
	data_uri = open(filename, 'rb').read().encode('base64').replace('\n', '')

	img_tag = '<img src="data:image/png;base64,{0}" style="height:400px;">'.format(data_uri)
	
	layout=wordcloud.layout_
	words_colours={}
	count=1
	for lo in layout:
		entry={}
		entry['word']=lo[0][0]
		color=lo[-1]
		color=color[4:-1]
		color_split=color.split(',')
		color_num=[int(x) for x in color_split]
		color_hex='#%02x%02x%02x' % tuple(color_num)
		# print color_num
		entry['color']=color_hex
		words_colours[count]=entry
		count+=1

	# print words_colours
	list_html=""
	cap=51
	if cap>len(words_colours):
		cap=len(words_colours)

	for i in range(1,cap):
		list_html+='<li class="list-group-item" ><a class="cloud-key-'+name+'" href="#" style="color:'+words_colours[i]['color']+'">'
		list_html+="#"+str(i)+" "+words_colours[i]['word']+'</a></li>'

	return (img_tag,list_html)
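For reference, each entry in the wordcloud package's layout_ attribute is a tuple ((word, freq), font_size, position, orientation, color); after recolor with an ImageColorGenerator the color is an "rgb(r, g, b)" string, which is what the slicing above unpacks. Tuple unpacking states that intent more directly (a sketch over the same attribute):

for (word, freq), font_size, position, orientation, color in wordcloud.layout_:
    # color looks like "rgb(12, 34, 56)"; convert it to "#0c2238"
    r, g, b = (int(v) for v in color[4:-1].split(','))
    print(word, '#%02x%02x%02x' % (r, g, b))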
Example #3
def generateWordCloud(text, stop):
    d = path.dirname(outputdir)

    for w in stop:
        STOPWORDS.add(w)

    # Generate the wordcloud without the stop words    
    wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)

    # Draw the positioned words to a PNG file.
    wordcloud.to_file(path.join(d, 'diabetes-wordcloud.png'))
Example #4
def cloudplot(person):

    person = re.sub(r'\+', ' ', person)

    text = GetTextRange(Emails, person)
    text = rmBoring(rmNonAlpha(text)).decode('ascii', 'ignore')

    plt.clf()

    d = path.dirname(path.abspath(__file__))

    hilcolor = np.array(Image.open(path.join(d, "static/img/hillarylogo.jpg")))

    STOPWORDS.add("said")
    wc = WordCloud(background_color="white", max_words=150, mask=hilcolor,
               stopwords=STOPWORDS,
               max_font_size=80, random_state=42,
               relative_scaling=0.5)


    wc.generate(text)
    image_colors = ImageColorGenerator(hilcolor)

    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")

    fig = plt.gcf()
    img = StringIO.StringIO()
    fig.savefig(img)
    img.seek(0)

    return send_file(img, mimetype='image/png')
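Note the two-step pattern above: set.add returns None, so writing stopwords=STOPWORDS.add("said") would actually pass stopwords=None, and WordCloud would silently fall back to the global STOPWORDS set (which happens to have been mutated already, so the mistake goes unnoticed). Mutating first and then passing the set keeps the behavior explicit; a minimal sketch:

from wordcloud import WordCloud, STOPWORDS

STOPWORDS.add("said")                 # mutate the set first (set.add returns None)
wc = WordCloud(stopwords=STOPWORDS)   # then pass the set itself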
Example #5
def create_wordcloud(posts):
	wordcloud_str=' '.join(post['message'] for post in posts) #join all posts together
	aces_mask=imread("aces.png") #add aces mask
	#don't include the word "will" in the wordcloud (not an interesting word and took up a large chunk of the wordcloud)
	STOPWORDS.add("will")
	wc=WordCloud(background_color="BLACK", mask=aces_mask, stopwords=STOPWORDS)
	wc.generate(wordcloud_str)
	plt.axis("off")
	plt.imshow(wc)
	plt.show()
	wc.to_file("aces_wordcloud.png")
Example #6
def generate_wc(content):
    path = r'fzzqhj.TTF'
    bg_pic = imread('mo.png')  # read an image file
    image_colors = ImageColorGenerator(bg_pic)  # generate color values from the background image
    STOPWORDS.add("said")
    wc = WordCloud(font_path=path, background_color="white",
                   mask=bg_pic,
                   stopwords=STOPWORDS,
                   max_font_size=40,
                   color_func=image_colors,
                   random_state=42)
    wc = wc.generate(content)
    wc.to_file(c.outputs_pictures_path + 'result.jpg')
Example #7
def make_word_cloud(data):
  text = ''
  for d in data:
    text = text + d[0] + ' '

  # Generate a word cloud image
  STOPWORDS.add('watson')
  wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)

  # Display the generated image:
  # the matplotlib way:
  import matplotlib.pyplot as plt
  plt.imshow(wordcloud)
  plt.axis("off")
  plt.show()
Example #8
def mainProcess(usernames):
	print "Processing "+str(len(usernames)-1)+" usernames"
	words4=""
	loginFacebook(driver)
	timeread=time.time()
	time0=time.clock()

	for username in usernames:
		if len(username) != 0:
			username=username.strip()
			time1=time.clock()
			count, words3 = produce3(username)
			module.Database.edit2(username, count, conn)
			time2=time.clock()
			words4=words4+" "+words3

	time3=time.clock()
	timeread=time.time()-timeread
	print "TOTAL TIME"
	print time3-time0
	print timeread
	more_stopwords =["ja", "aga", "kui", "siis", "tongue", "nii", "ka", "et", "see", "ma","oma","oli", "emoticon", "ei","ning", "seda", "või", "smile", "grin", "Kas", "kes", "veel"]
	for more in more_stopwords:
		STOPWORDS.add(more)
	utf=["Translation", "nüüd", "või", "ära", "Kas"]
	for u in utf:
		words4=words4.replace(u, "")
	wordcloud = WordCloud(stopwords=STOPWORDS).generate(words4)
	image = wordcloud.to_image()
	image.save("words.png","PNG")
	driver.close()
	driver.quit()
	conn.commit()
	conn.close()
	print "Done"
Example #9
def create_cloud(word, img, out_path):

    # Read the whole text.
    # text = open(word_path).read()
    text = word.read().decode('utf-8')
    # read the mask image
    # taken from
    # http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
    alice_mask = np.array(Image.open(img))
    # alice_mask = np.array(img_path)
    STOPWORDS.add("said")
    wc = WordCloud(font_path='华文黑体.ttf', background_color="white", max_words=2000, mask=alice_mask,
                   stopwords=STOPWORDS, width=1000, height=2300, ranks_only=True, mode='RGBA')
    # generate word cloud
    wc.generate(text)
    # wc.generate_from_frequencies([()])
    # store to file
    wc.to_file(out_path)
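The commented-out generate_from_frequencies call above builds the cloud from precomputed word weights instead of raw text; in current wordcloud releases it expects a dict mapping words to weights (a sketch with made-up weights):

# Build the cloud from precomputed weights instead of raw text.
wc.generate_from_frequencies({"alice": 10.0, "rabbit": 6.0, "queen": 3.0})
wc.to_file(out_path)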
Example #10
def WordCloudTopic(items, imagePath=None):
    # Generate a word cloud image
    if imagePath:
        alice_coloring = np.array(Image.open(imagePath))
        STOPWORDS.add("said")
        wc = WordCloud(background_color="white", max_words=200, mask=alice_coloring,
                       stopwords=STOPWORDS,
                       max_font_size=300)
        # generate word cloud
        wc.generate_from_frequencies(items)
        image_colors = ImageColorGenerator(alice_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
    else:
        wc = WordCloud(background_color="white", max_words=300,
                       max_font_size=40, random_state=42)
        wordcloud = wc.generate_from_frequencies(items)
        plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Example #11
def generate_wc(text = "Hello World"):
    #if int(time.time()*10)%10 in [0]:
    d = path.dirname(__file__)


    # read the mask image
    alice_coloring = np.array(Image.open(path.join(d, '..','static','images',"heart.png")))

    STOPWORDS.add("said")
    wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
                   stopwords=STOPWORDS,
                   max_font_size=40, random_state=42)
    # generate word cloud
    wc.generate(text)

    # generate word cloud image and save it 
    filename = "wordcloud.png"
    wc.to_file(path.join(d,'..','static','images',filename))
    del wc

    return filename
Example #12
def main():
    parser = argparse.ArgumentParser(description='Generate word cloud')
    parser.add_argument('artist', help='Artist to be searched')
    parser.add_argument('--sum', action='store_true')  # args.sum is read below
    args = parser.parse_args()
    artist = string_to_url(args.artist)
    #artist = "Gaslight Anthem"

    api_url = "http://lyrics.wikia.com/api.php?func=getArtist&artist=%s&fmt=realjson" % (artist, )
    data = json.load(urllib2.urlopen(api_url))
    art_data = data['albums']

    songs_by_album = [album['songs'] for album in art_data]
    songs = sum(songs_by_album, [])
    lyrics = ""
    for song in songs:
        song = song.strip(bad_chars)
        lyrics += get_lyrics(string_to_url(song), artist)
        STOPWORDS.add("said")
        wc = WordCloud(background_color="white", max_words=2000, stopwords=STOPWORDS)
        if not args.sum:
            wc.generate(lyrics)
            wc.to_file("%s_%s.png" % (artist, song))
Example #13
def wordcloud(wordSource):
    #writes original category list to text file
    d = os.path.dirname(__file__)
    file = open("catagory.txt", 'w')
    for item in wordSource:
        file.write("%s\n" % item)
    thefile = open(os.path.join(d, "catagory.txt")).read()

    #adds words to exclude list
    STOPWORDS.add("chronic")
    STOPWORDS.add("disease")
    STOPWORDS.add("obstructive")
    STOPWORDS.add("status")

    # generate word cloud
    wordcloud = WordCloud(stopwords=STOPWORDS,
        background_color="white",
        width = 650,
        height = 250).generate_from_text(thefile)

    #re-colors and saves the wordcloud as png
    wordcloud.recolor(color_func=grey_color_func, random_state=3)
    wordcloud.to_file("wordcloud.png")
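grey_color_func is defined elsewhere in the source script; wordcloud only requires a callable with this signature. A minimal grey-scale version modeled on the wordcloud documentation (an assumption about what the original looks like):

import random

def grey_color_func(word, font_size, position, orientation,
                    random_state=None, **kwargs):
    # Return a random light-grey HSL color for each word.
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)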
Example #14
def cloud_word_with_mask(file_name):
	text = open(file_name).read()
	# read the mask / color image
	# amazon_coloring = imread('amazon-logo_grey.png')

	STOPWORDS.add("said")
	wc = WordCloud(background_color="white", max_words=200, #mask=amazon_coloring,
	               stopwords=STOPWORDS,
	               max_font_size=200, random_state=42, width=1800, height=1000)
	# generate word cloud
	wc.generate(text)

	# create coloring from image
	# image_colors = ImageColorGenerator(amazon_coloring)

	# recolor wordcloud and show
	# we could also give color_func=image_colors directly in the constructor
	# plt.imshow(wc.recolor(color_func=image_colors))
	plt.figure()
	plt.imshow(wc)
	plt.axis("off")
	# plt.show()
	plt.savefig(file_name.split('.')[0] + '.png')
Example #15
def generateWordcloud(wordlist, outfile, title, nwords=100):
    """

    :param wordlist: words in a list
    :param outfile: name of the output file to which to store the figure
    :param title: title of the figure
    :param nwords: maximum number of words to plot

    :return: None
    """
    # generate word cloud
    STOPWORDS.add("looking")
    wc = WordCloudSMN(background_color="white", max_words=nwords,
                      width=800, height=400,
                      stopwords=STOPWORDS,
                      max_font_size=80, random_state=42)
    wc.generate_SMN(wordlist)

    # generate the figure
    plt.figure(figsize=(16, 16))
    plt.title(title)
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(outfile)
    plt.close()
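WordCloudSMN and generate_SMN are project-specific and not shown here; assuming generate_SMN simply builds the cloud from a list of words, a minimal stand-in could be:

from wordcloud import WordCloud

class WordCloudSMN(WordCloud):
    def generate_SMN(self, wordlist):
        # Join the word list into one string and reuse the stock generator.
        return self.generate(" ".join(wordlist))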
Example #16
def wordcloudOf(messages):
    s = pd.DataFrame(messages)
    filteredS = s[s.content.str.contains("sent a photo") == False]
    text = filteredS['content'].str.cat(sep='\n')
    STOPWORDS.add('ok')
    STOPWORDS.add('Yea')
    STOPWORDS.add('Ye')
    STOPWORDS.add('Yes')
    STOPWORDS.add('Good')
    STOPWORDS.add('will')
    STOPWORDS.add('Oh')
    wordcloud = WordCloud(width=1000,
                          height=1000,
                          max_font_size=400,
                          stopwords=STOPWORDS).generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
Example #17
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

# Read the whole text.
df = pd.read_csv("train_set.csv", sep="\t")
my_category = df['Category']
my_content = df['Content']

# read the mask image
# taken from
# http://rtyuiope.deviantart.com/art/Code-Geass-Wallpaper-374008098
zero_mask = numpy.array(Image.open("zero.png"))
STOPWORDS.add("said")
wc = WordCloud(background_color="red",
               max_words=2000,
               mask=zero_mask,
               stopwords=STOPWORDS)

# generate word cloud
text = ""
for b in range(len(my_category.index)):
    if (my_category[b] == "Film"):
        text += my_content[b]

wc.generate(text)

# store to file
wc.to_file("film_cloud.png")
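The per-row loop that concatenates the Film articles can be written as a single pandas selection (equivalent, assuming the Content column holds strings):

text = " ".join(df.loc[df["Category"] == "Film", "Content"])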
Example #18
unwanted_characters = re.compile('[^A-Za-z ]+')
try:
    for cat in temp_dict:
        number = temp_dict[cat]
        desc_string = ' '.join(descriptions[cat])
        descriptions[cat] = ' '.join([
            w.lower()
            for w in re.sub(unwanted_characters, ' ', desc_string).split()
            if len(w) > 3
        ])
        wc = WordCloud(width=1000,
                       height=800,
                       background_color="white",
                       colormap='jet')
        wc.generate(descriptions[cat])
        wc.to_file(r'Y:\le\FEEDBACK\image1\%s_%d.jpg' % (cat, number))
except:
    pass

# add stopwords
nf_stopwords = [
    'order', 'refund', 'ship', 'part', 'ebay', 'item', 'seller', 'week',
    'ordered', 'weeks', 'will', 'still', 'canceled', 'days', 'never'
]
for w in nf_stopwords:
    STOPWORDS.add(w)

wordlist = wc.words_

sorted_by_value = sorted(wordlist.items(), key=lambda kv: kv[1], reverse=True)
Example #19
# with open('command.txt','r') as fd:
#     for i in fd.readlines():
#         line=i.strip('\n')
#
# text+=' '.join(jieba.cut(line))
comment_text = open('lrc_folk_full.txt', 'r').read()
comment_text = re.sub(r'\D+:\D+', '', comment_text)
comment_text = re.sub(r'\D+ : \D+', '', comment_text)
comment_text = re.sub(r'\[\w+\]', '', comment_text)
#comment_text=re.sub(r'[a-zA-Z]','',comment_text)  # filter out English letters
comment_text = re.sub(r'作\w : \D+', '', comment_text)

#comment_text=re.sub(r'弦乐 : \D+','',comment_text)
text = ''.join(jieba.cut(comment_text))
background = plt.imread('IMG_3674.JPG')  # load the background image
STOPWORDS.add('原曲')
STOPWORDS.add('作曲')
STOPWORDS.add('作词')
STOPWORDS.add('词曲')
STOPWORDS.add('编曲')
STOPWORDS.add('九九Lrc歌词网')
STOPWORDS.add('制作人')
STOPWORDS.add('九九Lrc')
STOPWORDS.add('99Lrc')
STOPWORDS.add('混音')
STOPWORDS.add('吉他')
STOPWORDS.add('九九歌词网')
STOPWORDS.add('录音')
STOPWORDS.add('后期')
STOPWORDS.add('和声')
STOPWORDS.add('演唱')
Example #20
def create_wordcloud(df):
    complaints_text = list(df["Consumer complaint narrative"].dropna().values)

    # join all documents in corpus
    text = " ".join(list(complaints_text))
    print("Complaints received")
    print(len(complaints_text))

    d = getcwd()
    mask = np.array(Image.open(path.join(d, "thumbs-down.png")))

    STOPWORDS.add("XXXX")
    STOPWORDS.add("XX")
    STOPWORDS.add("xx")
    STOPWORDS.add("xxxx")
    # TODO exclude name of all banks here
    STOPWORDS.add("wells")
    STOPWORDS.add("fargo")

    wc = WordCloud(
        background_color="white",
        stopwords=STOPWORDS,
        max_words=1000,
        mask=mask,
        max_font_size=90,
        random_state=42,
        contour_width=1,
        contour_color="#119DFF",
    )
    wc.generate(text)

    # create wordcloud shape from image
    fig = plt.figure(figsize=[8, 8])
    ax = plt.imshow(wc.recolor(), interpolation="bilinear")
    plt.axis("off")
    out_url = fig_to_uri(fig, bbox_inches="tight")
    return out_url
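fig_to_uri is defined elsewhere in this project; a common implementation renders the figure to an in-memory PNG and base64-encodes it as a data URI (a sketch, not the original helper):

import base64
from io import BytesIO

def fig_to_uri(fig, **save_kwargs):
    # Render the figure to an in-memory PNG and wrap it as a data URI.
    buf = BytesIO()
    fig.savefig(buf, format="png", **save_kwargs)
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("ascii")
    return "data:image/png;base64,{}".format(encoded)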
Example #21
# wordcloud usage
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS


d = path.dirname(__file__)

# read whole text
text = open(path.join(d, 'alice.txt')).read()

# read the mask image
alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))

STOPWORDS.add("said")
wc = WordCloud(background_color="white", max_words=2000,
               mask=alice_mask, stopwords=STOPWORDS)
wc.generate(text)

# store to file
wc.to_file(path.join(d, 'alice.png'))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(alice_mask)
plt.axis("off")
plt.show()
Example #22
"""
In order to make the graphs more useful we decided to prevent some words from being included
"""
ADDITIONAL_STOPWORDS = [
    "XXXX",
    "XX",
    "xx",
    "xxxx",
    "n't",
    "Trans Union",
    "BOA",
    "Citi",
    "account",
]
for stopword in ADDITIONAL_STOPWORDS:
    STOPWORDS.add(stopword)
"""
Proudly written for Plotly by Vildly in 2019. [email protected]


The aim with this dashboard is to demonstrate how Plotly's Dash framework
can be used for NLP based data analysis. The dataset is open and contains
consumer complaints from US banks ranging from 2013 to 2017.

Users can select to run the dashboard with the whole dataset (which can be slow to run)
or a smaller subset which then is evenly and consistently sampled accordingly.

Once a data sample has been selected the user can select a bank to look into by
using the dropdown or by clicking one of the bars on the right with the top 10
banks listed by number of filed complaints. Naturally bigger banks tend to end
up in this top 10 since we do not adjust for number of customers.
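One way to sample "evenly and consistently", as described above, is a seeded pandas sample so that repeated runs return the same subset (a sketch, not the dashboard's actual code):

import pandas as pd

def consistent_sample(df, frac):
    # A fixed random_state makes the subset reproducible across runs.
    return df.sample(frac=frac, random_state=42)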
Example #23
from random import shuffle
import copy
import os
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords


# In[2]:

## Pre-processing

tokenizer = RegexpTokenizer(r'\w+') #Tokenizer
stemmer = SnowballStemmer('english') #Snowball Stemmer
stops = set(stopwords.words('english')) #Stopwords
for i in stops:
    STOPWORDS.add(i)


# In[3]:

cwd = os.getcwd() #Current Working Directory
folders_path = os.path.join(cwd, r"Dataset\20_newsgroups")
folders = os.listdir(folders_path) #List of folders


# In[4]:

count_to_file = {} #Dictionary that maps file no to file path
file_to_count = {} #Dictionary that maps file path to file no
count = 0
Example #24
def Contacts_greater_than_5(filename):
    df = pd.read_csv(os.path.join('csvs',filename))
    os.remove(os.path.join('csvs', filename))
    df['Date'] = pd.to_datetime(df['Date'])
    df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
    fig  = plt.GridSpec(13,4,wspace=0.4,hspace=0.5)
    plt.figure(figsize=(16, 50))

    # title
    ax1 = plt.subplot(fig[0, :])
    ax1.text(0.2, 0.4, 'CHAT ANALYSIS', weight='bold',
            color='#470070', fontsize="60")
    #sb.despine(left=True, bottom=True, ax=ax1)
    plt.xticks([], [])
    plt.yticks([], [])


    # 1st Row----------------------------
    ax2 = plt.subplot(fig[1, 0])
    msgs = df.shape[0]
    ax2.text(0.5, 0.4, msgs, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax2.text(0.5, 0.1, 'Total Messages', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax2, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax3 = plt.subplot(fig[1, 1])
    members = np.unique(df['Contacts']).shape[0]
    ax3.text(0.5, 0.4, members, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax3.text(0.5, 0.1, 'Members', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax3, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax4 = plt.subplot(fig[1, 2])
    sDate = df['Date'][0]
    ax4.text(0.5, 0.4, sDate, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax4.text(0.5, 0.1, 'Start Date', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax4, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax5 = plt.subplot(fig[1, 3])
    eDate = df['Date'][df.shape[0]-1]
    ax5.text(0.5, 0.4, eDate, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax5.text(0.5, 0.1, 'End Date', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax5, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    # 2nd Row-----------------------------
    ax6 = plt.subplot(fig[2, 0])
    i = 0
    for msg in df['Messages']:
        i += (len(str(msg).split(' ')))

    avgMsg = str(i/df.shape[0])
    ax6.text(0.5, 0.4, avgMsg[:4]+' words', horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax6.text(0.5, 0.1, 'Average msg length', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax6, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax7 = plt.subplot(fig[2, 1])
    length = 0
    name = ""
    for msg in df['Messages']:
        if(length < len(str(msg).split(' '))):
            length = len(str(msg).split(' '))
            name = df[df['Messages'] == msg]['Contacts'].values[0]

    ax7.text(0.5, 0.4, str(length)+' words', horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax7.text(0.5, 0.1, 'Maximum msg length', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax7, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax8 = plt.subplot(fig[2, 2])
    week = {0: "Monday", 1: "Tuesday", 2: "Wednesday",
            3: "Thursday", 4: "Friday", 5: 'Saturday', 6: 'Sunday'}
    busy_day = week[Counter(pd.to_datetime(
        df['Date']).dt.weekday).most_common(1)[0][0]]
    ax8.text(0.5, 0.4, busy_day, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax8.text(0.5, 0.1, 'Most Busy WeekDay', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax8, left=True)
    plt.xticks([], [])
    plt.yticks([], [])


    ax9 = plt.subplot(fig[2, 3])
    month = {1: "January", 2: "February", 3: "March", 4: "April", 5: "May", 6: "June", 7: "July",
            8: "August", 9: "September", 10: "October", 11: "November", 12: "December"}
    busy_month = month[Counter(pd.to_datetime(
        df['Date']).dt.month).most_common(1)[0][0]]
    ax9.text(0.5, 0.4, busy_month, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax9.text(0.5, 0.1, '    Most Busy Month    ', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax9, left=True)
    plt.xticks([], [])
    plt.yticks([], [])


    # 3rd Row-----------------------------
    ax10 = plt.subplot(fig[3, :])
    ax10.set_facecolor('#9f21de')
    ax10.text(0.5, 0.4, name, weight='bold',
            horizontalalignment='center', color='white', fontsize="30")
    ax10.text(0.5, 0.1, 'Maximum Length Message Send By',
            horizontalalignment='center', color='#e9ddf0', fontsize="20")
    sb.despine(ax=ax10, left=True)
    plt.xticks([], [])
    plt.yticks([], [])


    # pie chart---------------------------
    pie_plot = plt.subplot(fig[4:6, :2])
    i = 0  # start at 0 to match the DataFrame's default RangeIndex
    df['Shift'] = pd.Series(dtype=object)
    for t in df['Time']:
        if str(t).endswith('am'):
            df['Shift'].loc[i] = 'am'
        else:
            df['Shift'].loc[i] = 'pm'
        i += 1

    recipe = list( df.groupby('Shift').count()['Time'].index )
    data = list(df.groupby('Shift').count()['Time'].values)
    lable = list([str(recipe[0] + '\n'+str(data[0])+' msgs') ,str(recipe[1] + '\n'+str(data[1])+' msgs')])

    pie_plot.pie(data, textprops=dict( fontsize=18,
        color="black"), wedgeprops=dict(width=0.45), startangle=20 ,labels=lable)

    pie_plot.set_title("Messages in respective Meridian", fontsize=20)
    sb.despine(ax=pie_plot, left=True, bottom=True)

    # top active bar chart----------------
    top_active = plt.subplot(fig[4:8, 2:])
    sorted_active = df.groupby('Contacts').count()['Time'].sort_values()
    if(df.groupby('Contacts').count().shape[0] > 10):
        sb.barplot(sorted_active[-10:].values,
                sorted_active[-10:].index,
                ax=top_active,
                palette='spring'
                )
        j = -10
        for i, v in enumerate(sorted_active.values[-10:]):
            top_active.text(
                0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20)
            j += 1
    else:
        sb.barplot(sorted_active.values,
                sorted_active.index,
                ax=top_active,
                palette='spring'
                )
        j = -1*len(sorted_active.values)
        for i, v in enumerate(sorted_active.values):
            top_active.text(
                0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20)
            j += 1
    top_active.set_title("Most Active Members", fontsize=20)
    top_active.set_yticks([], [])
    top_active.set_ylabel("")
    sb.despine(ax=top_active, left=True)


    # least active data------------------
    least_active = plt.subplot(fig[6:8, :2])
    sorted_active = df.groupby('Contacts').count()['Time'].sort_values()
    if(df.groupby('Contacts').count().shape[0] > 5):
        sb.barplot(sorted_active[:5].values,
                sorted_active[:5].index,
                ax=least_active,
                palette='spring'
                )
        j = 0
        for i, v in enumerate(sorted_active.values[:5]):
            least_active.text(
                0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20)
            j += 1
    else:
        sb.barplot(sorted_active.values,
                sorted_active.index,
                ax=least_active,
                palette='spring'
                )
        j = 0
        for i, v in enumerate(sorted_active.values):
            least_active.text(
                0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20)
            j += 1
    least_active.set_title("Least Active Members", fontsize=20)
    least_active.set_yticks([], [])
    least_active.set_ylabel("")
    sb.despine(ax=least_active, left=True)


    # weekday wise msgs------------------
    week_plot = plt.subplot(fig[8:10, :])
    weekday = Counter(pd.to_datetime(df['Date']).dt.weekday)
    od = collections.OrderedDict(sorted(weekday.items()))
    values = []
    for value in od.values():
        values.append(value)
    keys = []
    for key in od.keys():
        keys.append(key)
    week = ["Monday", 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday', 'Sunday']

    x = []
    for k in keys:
        x.append(week[k])

    sb.barplot(x, values, palette='plasma', ax=week_plot)
    week_plot.set_xticklabels(x, fontsize=16)
    week_plot.set_title("WeekDay-wise Messages", fontsize=20)
    sb.despine(ax=week_plot)


    # WordCloud---------------------------
    word_Cloud = plt.subplot(fig[10:, :])
    new_stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
                'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it',
                "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
                'that', "that'll",'nan','media','omitted','media omitted',
                'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
                'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
                'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
                'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
                'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
                'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
                'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
                'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
                'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", '1', '2', '3', '4', '5', '6', '7',
                '8', '9', '0', '.', ',', '/', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '+', '-'
                ]

    for stop in new_stop:
        STOPWORDS.add(stop)

    i = 0

    comment_words = ' '
    stopwords = set(STOPWORDS)

    # iterate through the csv file
    for val in df['Messages']:

        # typecaste each val to string
        val = str(val)

        if "media omitted" in val:
            i += 1
        # split the value
        tokens = val.split()

        # Converts each token into lowercase
        for j in range(len(tokens)):  # j, so the media-omitted counter i is not clobbered
            tokens[j] = tokens[j].lower()

        for words in tokens:
            comment_words = comment_words + words + ' '


    wordcloud = WordCloud(width=1400, height=800,
                        background_color='white',
                        stopwords=stopwords,
                        min_font_size=15,
                        max_font_size=100,
                        colormap='plasma').generate(comment_words)

    word_Cloud.set_title("WORD CLOUD", fontsize=40)
    word_Cloud.imshow(wordcloud)
    word_Cloud.axis("off")

    plt.savefig(os.path.join('static/images/dashboard',filename+'.png'), bbox_inches='tight')
    return
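The row-wise am/pm loop in the pie-chart section can be collapsed into one vectorized assignment (equivalent, assuming every 'Time' value ends in 'am' or 'pm'):

df['Shift'] = np.where(df['Time'].astype(str).str.endswith('am'), 'am', 'pm')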
Example #25
def tag_and_lem(element):
    sent = pos_tag(word_tokenize(element))
    return ' '.join([
        lemmer.lemmatize(sent[k][0], convert_tag(sent[k][1][0]))
        for k in range(len(sent))
    ])


data.loc[:, 'tweet'] = data['tweet'].apply(lambda x: tag_and_lem(x))
data.loc[:, 'hashtags'] = data['hashtags'].apply(
    lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# In[6]:

from wordcloud import WordCloud, STOPWORDS
STOPWORDS.add('amp')
stopwords = STOPWORDS

all_words = ' '.join(data.tweet.values)
hatred_words = ' '.join(data[data.label == 1].tweet.values)

plt.figure(figsize=(16, 8))

cloud1 = WordCloud(width=400,
                   height=400,
                   background_color='white',
                   stopwords=stopwords).generate(all_words)
plt.subplot(121)
plt.imshow(cloud1, interpolation="bilinear")
plt.axis("off")
plt.title('All tweets', size=20)
Example #26
import os, urllib2, unirest, PIL
from PIL import Image
from os import path
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
d = path.dirname(__file__)
# Read the whole text.
textPros = open(path.join(d, 'generated data/pros_full.txt')).read()
textCons = open(path.join(d, 'generated data/cons_full.txt')).read()
# read the mask image
# taken from http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
google_mask = np.array(Image.open(path.join(d, "google-logo.jpg")))
print STOPWORDS
STOPWORDS.add("</p>")
wc = WordCloud(background_color="white", max_words=30000, mask=None,
               stopwords=STOPWORDS)
# generate word cloud
wc.generate(textPros)
# store to file
wc.to_file(path.join(d, "google-logo.jpg"))
wc.generate(textCons)
wc.to_file(path.join(d, "google-logo1.jpg"))
# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(google_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
Example #27
    if (l == '<EOF>'):
      break
    else:
      s=l[53:]
      words +=s[:s.find('\t')]+' '

no_urls_no_tags = " ".join([word for word in words.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and word != 'RT'
                            ])
                            
for c in string.punctuation:
  no_urls_no_tags= no_urls_no_tags.replace(c,"")                            
                            
STOPWORDS.add('amp')    
STOPWORDS.add('want')
STOPWORDS.add('new')
STOPWORDS.add('via')
STOPWORDS.add('man')
STOPWORDS.add('will')
STOPWORDS.add('here')
STOPWORDS.add('Heres')
STOPWORDS.add('Here')
                        
wordcloud = WordCloud(
                      font_path='C:/Tweets/cabin-sketch-v1.02/CabinSketch-Regular.ttf',
                      stopwords=STOPWORDS,
                      background_color='black',
                      width=1800,
                      height=1400)
Example #28
def wordcloud():

    for i in [
            'https',
            't',
            'm',
            'co',
            'rt',
            's',
            're',
            'go',
            'use',
            'y',
            'feel',
            'name',
            'll',
            'another',
            'via',
            'da',
            'said',
            'user',
            'u',
            'say',
            'got',
            'see',
            'know',
            'im',
            'lol',
            'try',
            'look',
            'want',
            'never',
            'even',
            'need',
            'still',
            'amp',
            'us',
            'really',
            'one',
            'real',
            'will',
            'time',
            'day',
            'alway',
            'Van',
            'looks',
            'word',
            'back',
            'yo',
            'ya',
            'done',
            'win',
            'new',
            'man',
            'think',
            'give',
            'life',
            'make',
            'ain',
            'Happy',
            'don',
            'let',
            'tell',
            'good',
            'stop',
            'call',
            'people',
            'now',
            'card',
            'bout',
            'going',
            'every',
            'come',
            'Full',
            "ain't",
            'right',
            'Oh',
            '0h',
            'year',
            'bad',
            'gonna',
            'called',
            'wanna',
            'put',
            'today',
    ]:
        STOPWORDS.add(i)

    data = sd.get_all()

    x = data['Tweet']

    x = ' '.join(x)
    x = x.lower()

    # Define a function to plot word cloud
    def plot_cloud(wordcloud):
        # Set figure size
        plt.figure(figsize=(40, 30))
        # Display image
        plt.imshow(wordcloud)
        # No axis details
        plt.axis("off")

    # Generate word cloud
    wordcloud = WordCloud(width=3000,
                          height=2000,
                          max_words=30000,
                          random_state=1,
                          background_color='white',
                          colormap='Dark2_r',
                          collocations=False,
                          stopwords=STOPWORDS).generate(x)
    # Plot
    plot_cloud(wordcloud)
Example #29
if __name__ == '__main__':
    d = path.dirname(__file__)

    # Read the whole text.
    text = open(path.join(d, __fileNamePath)).read()

    # read the mask / color image
    # taken from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
    alice_coloring = imread(path.join(d, __imagePath))

    STOPWORDS.add("said")
    wc = WordCloud(font_path=__ttfPath,
                   background_color="black",
                   max_words=2000,
                   mask=alice_coloring,
                   stopwords=STOPWORDS,
                   max_font_size=100,
                   random_state=42)
    # generate word cloud
    wc.generate(text)

    # create coloring from image
    image_colors = ImageColorGenerator(alice_coloring)

    # show
    plt.imshow(wc)
    plt.axis("off")
    plt.figure()
    # recolor wordcloud and show
    # we could also give color_func=image_colors directly in the constructor
    plt.imshow(wc.recolor(color_func=image_colors))
Example #30
from wordcloud import WordCloud, STOPWORDS
import config

# test wordcloud on 1% threshold
# lowercase debate titles and concatenate to giant text string
text = config.concat_df1[2].apply(lambda x: x.lower())
text = text.str.cat(sep=' ')

STOPWORDS.add('question')
STOPWORDS.add('bill')
STOPWORDS.add('second')
STOPWORDS.add('reading')

# create the wordcloud
wordcloud = WordCloud().generate(text)

# generate the image
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('../images/wordcloud_{}.jpg'.format(1))
plt.show()

# save wordclouds for each threshold 5-25%
concat_tsvs = [
    config.concat_df5, config.concat_df10, config.concat_df15,
    config.concat_df20, config.concat_df25
]

percent = 5
for tsv in concat_tsvs:
Example #31
from env import *  # holds all the secrets
import praw
from ray import Ray
from flappy_answers import answers
import json
import random
import os
import requests
import re
import datetime
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

STOPWORDS.add("game")
STOPWORDS.add("deleted")
STOPWORDS.add("f**k")
STOPWORDS.add("f*****g")
STOPWORDS.add("localray")
STOPWORDS.add("https")
STOPWORDS.add("reddit")
STOPWORDS.add("create")
STOPWORDS.add("wordcloud")
STOPWORDS.add("commets")
STOPWORDS.add("imgur")
# set up a praw instance to use as a listener
# let's listen to all comments on r/tampabayrays and highlight those that have the word cash in them
#works
ray = Ray()


def create_wordcloud(url):
    print("in create_wordcloud")
Example #32
#cloud.py
from wordcloud import WordCloud, ImageColorGenerator, random_color_func, STOPWORDS
import matplotlib.pyplot as plt
from os import path

d = path.dirname(__file__)

#mask
mask = plt.imread(path.join(d, "source/dufu.jpg"))
print("image opened successfully")
#word
STOPWORDS.add("杜甫")
wc = WordCloud(font_path="/System/Library/Fonts/STHeiti Medium.ttc",
               mask=mask,
               width=1000,
               height=1000,
               background_color="black",
               max_font_size=62,
               min_font_size=5,
               stopwords=STOPWORDS)
print("WordCloud created successfully")

#generate
import jieba
with open("source/《杜甫诗》全集.txt", encoding='gb18030') as f:
    text = f.read()
    text = " ".join(jieba.lcut(text))
    freqs = wc.process_text(text)  # renamed to avoid shadowing the dict builtin
    # print(freqs)
    wc.generate_from_frequencies(freqs)
    # wc.generate(text)
Example #33
from os import path

import numpy as np
import matplotlib.pyplot as plt

from PIL import Image
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'sozler3.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "mask3.png")))

STOPWORDS.add("yorulunca")
wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS)
# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "mask_output3.png"))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
Example #34
from scipy.misc import imread
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pylab as plt

back_color = imread("./dragon.jpg")
font = r"C:\Windows\Fonts\STXINGKA.TTF"
STOPWORDS.add("其他")  # blocked word
wc = WordCloud(
    background_color="white",
    max_words=500,
    mask=back_color,  # mask: the region the cloud is drawn into; when set, width and height are ignored
    max_font_size=80,
    stopwords=STOPWORDS,
    font_path=font,  # fixes garbled/boxed glyphs for Chinese text
    random_state=42,  # return a PIL color for each word
    prefer_horizontal=10)  # adjust how many words are laid out horizontally vs. vertically

text = open("./dragon.txt", "r", encoding="utf-8").read()
wc.generate(text)
# generate color values from the background image
image_colors = ImageColorGenerator(back_color)
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file("test01.png")
plt.figure()

plt.imshow(wc.recolor(color_func=image_colors))
plt.show()
plt.axis("off")
wc.to_file("test02.png")
Example #35
import re
import jieba
from scipy.misc import imread  # an image-reading helper
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
# choose a background image with strong color contrast, otherwise the generated cloud's outline is indistinct
back_color = imread('chenli.jpg')  # parse the image

# see the wordcloud documentation for what each parameter means
STOPWORDS.add(' ')  # use the built-in stopwords, with one more entry added
wc = WordCloud(
    background_color='white',  # background color
    max_words=1000,  # maximum number of words
    mask=back_color,  # draw the cloud into this mask; when set, width and height are ignored
    max_font_size=100,  # maximum displayed font size
    stopwords=STOPWORDS,
    font_path="C:/Windows/Fonts/msyhbd.ttc",  # for Chinese text; copy the font file name from its properties, not the name Windows displays
    random_state=42,  # return a PIL color for each word
    # width=1000,  # image width
    # height=860  # image height
)

# Add your own entries to the jieba dictionary: after adding '陈粒啊', any occurrence of
# '陈粒啊' in the text is kept as one token instead of being split into '陈粒' or '粒啊'
jieba.add_word('陈粒啊')

# open the word-source text file; read() returns it as a string
txt = open('all_outputs.txt', 'r', encoding='UTF-8').read()
# strip English letters, special symbols, etc. from the text, keeping only Chinese
txt = re.sub(
Example #36
# Read the whole text.
with open(args.words_file, 'r', encoding='utf-8') as wfile:
	text = wfile.read()

	# read the mask / color image
	# taken from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
	cloud_coloring = imread(args.source_image_file)
	if args.mask_image_file is not None:
		cloud_mask = imread(args.mask_image_file)
	else:
		cloud_mask = cloud_coloring

	STOPWORDS.add("said")
	wc = WordCloud(background_color=args.background_color, max_words=2000,
	               mask=cloud_mask, font_path=args.font,
	               stopwords=STOPWORDS, mode="RGBA",
	               max_font_size=args.max_font_size, random_state=42)
	# generate word cloud
	wc.generate(text)

	# create coloring from image
	image_colors = ImageColorGenerator(cloud_coloring)

	if args.output_image_file is not None:
		wc.recolor(color_func=image_colors).to_file(args.output_image_file)
	else:
		# show
		#plt.imshow(wc)
		#plt.axis("off")
		#plt.figure()
		# recolor wordcloud and show
Example #37
#
# This file generates a Chinese word cloud in the shape you wish.

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import jieba
import numpy as np
from PIL import Image

# open the file with "read" attribute
with open('comment.txt', 'r') as f:
    f_text = f.read()  #read file
    res = jieba.cut(f_text)  #split chinese characters using jieba package
    res_text = ' '.join(res)
    background_img = plt.imread(
        'J.jpeg')  #read image that you wish to input in the word cloud
    j_coloring = np.array(
        Image.open("j2.png"))  #handle the image you just read
    STOPWORDS.add('via')  #add stop words

    #generate the word cloud
    wc = WordCloud(background_color="white",
                   mask=j_coloring,
                   stopwords=STOPWORDS,
                   font_path='SourceHanSans-Bold.ttf').generate(res_text)
    image_colors = ImageColorGenerator(j_coloring)

    #show the image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
Example #38
article_url_pmc=[s for s in url_list if 'article' in s]
if article_url_pmc:
    a=pmc_scr(article_url_pmc)
else:
    a=abst(url)
# write the string out as a .txt file
f = open('text.txt', 'w')
f.write(a)
f.close()

# build a word cloud from the .txt file
from wordcloud import WordCloud
from wordcloud import STOPWORDS
with open('text.txt', 'r') as f:
    text = f.read()
STOPWORDS.add('meta')
STOPWORDS.add('content')
STOPWORDS.add('name')
STOPWORDS.add('description')
STOPWORDS.add('pubmed')
STOPWORDS.add('scholar')
STOPWORDS.add('google')
STOPWORDS.add('pmc')
STOPWORDS.add('study')
wc = WordCloud(
    width=480, 
    height=320, 
    background_color="white", 
    prefer_horizontal=1.0, 
    min_word_length=3,
Example #39
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

STOPWORDS.add('bu')
STOPWORDS.add('mi')
STOPWORDS.add('bir')
text = open('sozler.txt', 'r').read()
foto = np.array(Image.open('barisabi.png'))
wc = WordCloud(background_color='white',
               collocations=False,
               mask=foto,
               width=1000,
               height=1000,
               stopwords=STOPWORDS)
wc.generate(text)

plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

wc.to_file('barisabimiz.png')
Example #40
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

with open(
        r"D:\isaac\Programmierung\Python_uebungen\Teil_05_Alice_in_wonderland.txt",
        "r") as f:
    text = f.read()

wordcloud = WordCloud(width=1920, height=1200)
STOPWORDS.add("said")
STOPWORDS.add("illustration")

wordcloud.generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
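This works even though the stopwords are added after the WordCloud is constructed: when no stopwords argument is passed, WordCloud keeps a reference to the same global STOPWORDS set, so later add() calls are still visible when generate() runs. Passing the set explicitly makes that dependency obvious (a sketch):

STOPWORDS.add("said")
STOPWORDS.add("illustration")
wordcloud = WordCloud(width=1920, height=1200, stopwords=STOPWORDS)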
Example #41
def plotly_wordcloud(data_frame):
    """A wonderful function that returns figure data for three equally
    wonderful plots: wordcloud, frequency histogram and treemap"""
    complaints_text = list(data_frame[0].dropna().values)
    ## join all documents in corpus
    text = " ".join(list(complaints_text))
    STOPWORDS.add("movie")
    STOPWORDS.add("film")

    word_cloud = WordCloud(stopwords=set(STOPWORDS),
                           max_words=100,
                           max_font_size=90)
    word_cloud.generate(text)

    word_list = []
    freq_list = []
    fontsize_list = []
    position_list = []
    orientation_list = []
    color_list = []

    for (word,
         freq), fontsize, position, orientation, color in word_cloud.layout_:
        word_list.append(word)
        freq_list.append(freq)
        fontsize_list.append(fontsize)
        position_list.append(position)
        orientation_list.append(orientation)
        color_list.append(color)

    # get the positions
    x_arr = []
    y_arr = []
    for i in position_list:
        x_arr.append(i[0])
        y_arr.append(i[1])

    # get the relative occurence frequencies
    new_freq_list = []
    for i in freq_list:
        new_freq_list.append(i * 80)

    trace = go.Scatter(
        x=x_arr,
        y=y_arr,
        textfont=dict(size=new_freq_list, color=color_list),
        hoverinfo="text",
        textposition="top center",
        hovertext=[
            "{0} - {1}".format(w, f) for w, f in zip(word_list, freq_list)
        ],
        mode="text",
        text=word_list,
    )

    layout = go.Layout({
        "xaxis": {
            "showgrid": False,
            "showticklabels": False,
            "zeroline": False,
            "automargin": True,
            "range": [-100, 250],
        },
        "yaxis": {
            "showgrid": False,
            "showticklabels": False,
            "zeroline": False,
            "automargin": True,
            "range": [-100, 450],
        },
        "margin": dict(t=20, b=20, l=10, r=10, pad=4),
        "hovermode": "closest",
    })

    wordcloud_figure_data = {"data": [trace], "layout": layout}
    word_list_top = word_list[:25]
    word_list_top.reverse()
    freq_list_top = freq_list[:25]
    freq_list_top.reverse()

    frequency_figure_data = {
        "data": [{
            "y": word_list_top,
            "x": freq_list_top,
            "type": "bar",
            "name": "",
            "orientation": "h",
        }],
        "layout": {
            "height": "550",
            "margin": dict(t=20, b=20, l=100, r=20, pad=4)
        },
    }
    treemap_trace = go.Treemap(labels=word_list_top,
                               parents=[""] * len(word_list_top),
                               values=freq_list_top)
    treemap_layout = go.Layout({"margin": dict(t=10, b=10, l=5, r=5, pad=4)})
    treemap_figure = {"data": [treemap_trace], "layout": treemap_layout}
    return wordcloud_figure_data, frequency_figure_data, treemap_figure
Example #42
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

insults = open('./trumpInsults.txt','r').read()
trump_mask = np.array(Image.open('trump2.png'))

bannedWords = ['said','new','will','york','many','the','total','never','united','states','failing','totally','news','bad','failed','people','senator',
              'party','one','state','always','absolutely','governor','make','read','anything','always','good','thing','really','job','lost','show','group',
              'nothing','story','television','political','time','cruz','talk','zero','organization', 'guy','even','deal','false','history','looking',
              'reporting','look','country','poll','say','ratings','vote','money','former','president','press','republican','reporter','politician','magazine',
              'much','debate','debates','times','campaign','presidential','fox','clinton','hillary','bush','credibility','candidate','know','columnist','immigration',
              'another','ad','lied','chief','ted','record','newspaper','another','paid','journal','way','trump','got','life',
              'last','dead','street','great','clue','jeb']

for word in bannedWords:
    STOPWORDS.add(word)

wc = WordCloud(background_color="white", max_words=1500, mask=trump_mask, stopwords=STOPWORDS)
wc.generate(insults)
wc.to_file('trumpInsultWC.png')
Example #43
def mostCommonWordsBar(messages):
    s = pd.DataFrame(messages)
    filteredS = s[s.content.str.contains("sent a photo") == False]
    words = pd.Series(' '.join(
        filteredS['content']).lower().split()).value_counts()
    wordsdf = pd.DataFrame({"count": words.values}, index=words.index)

    #print(wordsdf)
    STOPWORDS.add('ok')
    STOPWORDS.add('yea')
    STOPWORDS.add('ye')
    STOPWORDS.add('yes')
    STOPWORDS.add('good')
    STOPWORDS.add('will')
    STOPWORDS.add('oh')

    filteredWords = wordsdf[wordsdf.index.str.lower().isin(STOPWORDS) == False]

    filteredWords[:50].plot(kind="bar")
Example #44
from os import path

import numpy as np
import matplotlib.pyplot as plt

from PIL import Image
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'alice.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))

STOPWORDS.add("said")
wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS)
# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "alice.png"))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
Example #45
                'CLINTON: ', 'HOLT: ', 'WALLACE: ', '[crosstalk]', 'COOPER: ',
                'RADDATZ: ', 'QUESTION: '
        ]:
            if people in t:
                idx = t.find(people)
                if idx < firstIdx:
                    firstIdx = idx
                    firstPersonAfterTrump = people
        trumpSaid = t.split(firstPersonAfterTrump)[0]
        trumpRamble = trumpRamble + trumpSaid

print(trumpRamble)

dir = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
trump_pic = np.array(Image.open(path.join(dir, 'trump.png')))

STOPWORDS.add('re')
wc = WordCloud(background_color="white",
               max_words=4000,
               mask=trump_pic,
               stopwords=STOPWORDS,
               max_font_size=150)

wc.generate(trumpRamble)

image_colors = ImageColorGenerator(trump_pic)

plt.imshow(wc.recolor(color_func=image_colors), interpolation='bilinear')
plt.axis('off')
plt.show()
Example #46
def TweetWordCloud(inputfile,lang,outputimage):
    #INPUT:
    #   inputfile: A csv (or other format) file with a collection of tweet text and language information.
    #   lang: The target language of the word cloud.
    #         (Use the twitter codes, such as 'en' for English 'fr' for French etc.)
    #OUTPUT:
    #   outputimage: A png file with the word cloud image.
    #EXAMPLE CALL:
    #   TweetWordCloud('DavidBowieTributes.csv','en', 'davidbowietributes.png')
    #
    os.chdir('/home/kaushi/Desktop/Python_programming/Twitter/')
    colnames = ['text','language']
    tweetdf = pd.read_csv(inputfile,header=0,names=colnames)
    tweetdf['text'] = tweetdf['text'].astype(str)
    
    tweetdf2 = tweetdf[tweetdf['language'] == lang] #Only select the English tweets, for example.
    tweetdf2 = tweetdf2.reset_index(drop=True) #--> Reset indices, otherwise further manipulations will encounter issues.
    
    #Construct the word cloud.
    words = ' '.join(tweetdf2['text'])
    #NOTES FOR IMPROVEMENT:
    #Consider how to remove emoticons,
    #unicode characters, selective punctuation etc.
    wordfilter = " ".join([word for word in words.split()
                           if 'http' not in word #Take out urls
                           and not word.startswith('@') #Take out twitter handles.
                           and word != 'RT' #Take out retweet tags.
                           and word != 'None' #Take out place holders and null values.
                           and word != 'nan'])
    
    #Download the twitter mask (or any other mask of preference.)
    twitter_mask = imread('twitter_mask.png', flatten=True)
    STOPWORDS.add("will")
    wordcloud = WordCloud(font_path='/home/kaushi/customfonts/actionis.ttf', stopwords=STOPWORDS,\
                          background_color='black',width=3000, height=3000,min_font_size=8,\
                          relative_scaling=0.3,mask=twitter_mask).generate(wordfilter)
    
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(outputimage, dpi=1000)
    plt.show()
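For the "notes for improvement" above, one simple approach drops non-ASCII characters (which removes most emoji and emoticon codepoints) and then strips punctuation; a sketch, not the author's method:

import string

def clean_tweet_text(text):
    # Drop non-ASCII characters, then strip ASCII punctuation.
    ascii_only = text.encode('ascii', 'ignore').decode('ascii')
    return ascii_only.translate(str.maketrans('', '', string.punctuation))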
Example #47
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'summary.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg

STOPWORDS.add("said")
wc = WordCloud(background_color="black", width=1280, height=720, prefer_horizontal=0.8, font_path='Aller_Rg.ttf', max_words=50, stopwords=STOPWORDS)

# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "word_cloud.png"))

# show
plt.imshow(wc)
plt.axis("off")
plt.show()
Example #48
            continue
        for word in words:
            # print word
            review_words_per_bin[i].append(word)
            if beer_key not in beer_key_word_bags:
                beer_key_word_bags[beer_key] = []
                beer_key_word_bags[beer_key].append(word)
            else:
                beer_key_word_bags[beer_key].append(word)

    print "words reviwing bin", i, ":", len(review_words_per_bin[i]), "reviews:", len(review_corpus_per_bin[i])

for key, val in beer_key_word_bags.iteritems():
    print key, len(val)

STOPWORDS.add("malt")
STOPWORDS.add("taste")
STOPWORDS.add("flavor")
STOPWORDS.add("carbonation")
STOPWORDS.add("had")
STOPWORDS.add("hop")
STOPWORDS.add("head")
STOPWORDS.add("good")
STOPWORDS.add("nice")
STOPWORDS.add("light")
STOPWORDS.add("dark")
STOPWORDS.add("hops")
STOPWORDS.add("white")

# Create global beer-clouds
for i in range(0, 3):
Example #49
if __name__ == "__main__":

    d = path.dirname(__file__)

    # read in text
    # text = open(path.join(d, 'top_words.txt')).read()
    file_name = './data/top_words.txt'
    with open(file_name) as f:
        text = f.readlines()

    # read the mask image
    word_mask = np.array(Image.open(path.join(d, "./figures/circle_mask2.png")))

    # construct wordcloud
    STOPWORDS.add("and")
    wc = WordCloud(background_color="white", max_words=100, mask=word_mask,
                   stopwords=STOPWORDS)

    print "generating word cloud ..."
    for topic_idx in range(len(text)):
        # generate word cloud
        wc.generate(text[topic_idx])
        # store to file
        wc.to_file(path.join(d, "./figures/topic"+str(topic_idx)+".png"))
    #end

    # generate plots 
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
Example #50
import nltk
import pandas as pd
import streamlit as st
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from wordcloud import STOPWORDS


def loaddata(Text, mods):
    # read the preprocessed data from the pickle file
    df = pd.read_pickle("corpus.pkl")

    # tweet-specific tokens that carry no signal
    STOPWORDS.update(["rt", "s", "u", "amp", "th", "will", "t", "m", "today"])

    # split the data into train and test sets
    train, test = train_test_split(df, test_size=0.3, train_size=0.7, random_state=14)

    # tokenize and stem
    lt = LancasterStemmer()

    def token(text):
        txt = nltk.word_tokenize(text.lower())
        return [lt.stem(word) for word in txt]

    # document-term matrices from a TF-IDF vectorizer
    tfv = TfidfVectorizer(tokenizer=token, stop_words=list(STOPWORDS), analyzer='word', min_df=4)
    X_train_tfv = tfv.fit_transform(train['clean_tweet'])
    X_test_tfv = tfv.transform(test['clean_tweet'])

    X_train_tfv = pd.DataFrame(X_train_tfv.toarray(), columns=tfv.get_feature_names())
    X_test_tfv = pd.DataFrame(X_test_tfv.toarray(), columns=tfv.get_feature_names())

    if mods == "MNB":
        st.success("Performing MNB Classification")
        # build and train the model
        nb = MultinomialNB()
        nb.fit(X_train_tfv, train['Party_log'])

        # transform the entered text into a document-term matrix
        vec_text = tfv.transform(Text).toarray()
        # predict the label for the newly entered tweet
        result = nb.predict(vec_text)
    else:
        st.success("Performing Logistic Regression")
        # build and train the model
        lr = LogisticRegression()
        lr.fit(X_train_tfv, train['Party_log'])

        # transform the entered text into a document-term matrix
        vec_text = tfv.transform(Text).toarray()
        # predict the label for the newly entered tweet
        result = lr.predict(vec_text)

    # 1 means Democrat, 0 means Republican
    if result == 1:
        return "demo"
    elif result == 0:
        return "rep"
Пример #51
0
#!/usr/bin/env python3

'''
make text
find _build/text/ -name '*.txt' | xargs cat > _build/words.txt
'''

from os import path
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

STOPWORDS.add('will')
STOPWORDS.add('example')

# Read the whole text.
text = open(path.join(d, '_build/words.txt'), encoding='utf-8').read()
wordcloud = WordCloud(width=1920, height=1080, max_words=200).generate(text)
wordcloud.to_file('word-cloud.png')
Пример #52
0
def get_coherence(topic):
    try:
        cp = palmetto.get_coherence(topic, coherence_type="cp")
        ca = palmetto.get_coherence(topic, coherence_type="ca")
        return cp+ca
    except Exception:
        return -1
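
The palmetto client used above is created outside this fragment. A minimal
sketch, assuming the palmettopy package (an assumption; only the
get_coherence calls appear here), with placeholder topic words:

# pip install palmettopy
from palmettopy.palmetto import Palmetto

palmetto = Palmetto()  # defaults to the public Palmetto web service

topic = ["beer", "malt", "hop", "brew", "ale"]  # placeholder topic words
print(get_coherence(topic))  # sums the CP and CA measures, or -1 on failure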


if __name__ == "__main__":
    vis_dir = './visualization/'
    if not os.path.exists(vis_dir):
        os.makedirs(vis_dir)
    for word in stop_words:
        STOPWORDS.add(word)

    if args.model == 'btm':
        doc_pt = args.fname
        dwid_pt = './temp/doc_wids.txt'
        voca_pt = './temp/voca.txt'
        model_dir = './temp/model/'
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        indexFile(doc_pt, dwid_pt)
        write_w2id(voca_pt)
        vocab_size = len(w2id)
        # encode documents and build vocab

        alpha = 50 / args.K
Пример #53
0
import json

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from scipy.misc import imread

name = 'reyarch'

with open('users/{0}/first_words.json'.format(name), 'r') as fs:
	first_words = json.load(fs)

with open('users/{0}/relation_dict.json'.format(name), 'r') as fs:
	relation_dict = json.load(fs)

all_words = sorted(relation_dict, key=lambda x: len(relation_dict[x]))

text = open('all_words.txt', 'r').read()
pepe_mask = imread('pepe.jpeg')

wc = WordCloud(background_color="white", max_words=2000, mask=pepe_mask,
               stopwords=STOPWORDS.add("heart"))
wc.generate(text)
wc.to_file('pepecloud.png')
plt.imshow(wc)
plt.axis("off")
plt.show()
Пример #54
0
remove = [
    "interviewer",
    "interviewee" "shapiro",
    "inaudible",
    "heather",
    "castingwords",
    "par",
    "line",
    "silence",
    "course",
    "coursera",
    "courses",
    "lot",
    "like",
]
STOPWORDS.add("said")
STOPWORDS.add("course")
STOPWORDS.add("courses")
STOPWORDS.add("coursera")
STOPWORDS.add("really")
STOPWORDS.add("one")
text = " ".join(filter(lambda x: x.lower() not in remove, text.split()))
# read the mask image
alice_mask = np.array(Image.open(path.join(d, "stormtrooper_mask.png")))
Пример #55
0
# coding: utf-8
import jieba
from scipy.misc import imread  # image-reading helper
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

back_color = imread('F:\WordCloud-master\WordCloud-master\o_003.jpg')  # the mask image

# use the built-in stopword set and add one more word; mutate the set first,
# since set.add() returns None; to block several words at once use
# STOPWORDS.update({'我国', '国家', '祖国'})
STOPWORDS.add('我国')

wc = WordCloud(background_color='white',  # background colour
               max_words=1000,  # maximum number of words
               mask=back_color,  # mask image; when set, width and height are ignored
               max_font_size=100,  # largest font size
               stopwords=STOPWORDS,  # the built-in set plus our additions
               # a CJK font fixes the boxes shown for Chinese glyphs;
               # swap in any font from C:/Windows/Fonts/
               font_path="F:\WordCloud-master\WordCloud-master\static\simheittf\simhei.ttf",
               random_state=42,  # one PIL colour per word
               # width=1000,  # canvas width
               # height=860,  # canvas height
               )
# see the wordcloud docs for the meaning of each parameter

# add your own words to the jieba dictionary: after adding '金三胖' to the
# vocabulary, text containing it is kept as a single token instead of being
# split into '金三' or '三胖'
jieba.add_word('金三胖')

# open the source text file
# text = open('F:\WordCloud-master\WordCloud-master\cnword.txt',encoding='utf-8').read()
with open('F:\WordCloud-master\WordCloud-master\cnword.txt', 'r', encoding='UTF-8') as f:
    text = f.read()

# this function strips the blocked words itself; when it is used, the
# stopwords argument to WordCloud is unnecessary: put every word you want
# blocked in a stopwords text file
def stop_words(texts):
    words_list = []
    word_generator = jieba.cut(texts, cut_all=False)  # returns an iterator
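
The fragment cuts off inside stop_words. A plausible completion under the
design described in the comments; the stopwords.txt file name is an
assumption:

def stop_words(texts):
    # hypothetical stopword file, one blocked word per line
    with open('stopwords.txt', encoding='utf-8') as f:
        blocked = set(f.read().split())
    word_generator = jieba.cut(texts, cut_all=False)  # returns an iterator
    words_list = [w for w in word_generator if w.strip() and w not in blocked]
    return ' '.join(words_list)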
Пример #56
0
# load config file
config = SafeConfigParser()
script_dir = path.dirname(__file__)
config_file = path.join(script_dir, 'config/settings.cfg')
config.read(config_file)

# tell script where to put the JSON files returned
logfile = config.get('files','logfile')
listfile = config.get('files','listfile')
outfolder = config.get('files','outfolder')

# get usernames
users = get_users(listfile)

# add stop words
STOPWORDS.add('https')

# create a word cloud for each user
for user in users:

    # get image masks for different users
    # from http://masterkoyo.deviantart.com/art/Template-Donald-Trump-35925789
    # from https://openclipart.org/detail/211473/jeb-bush-outlines
    # from http://www.spstencils.com/shop/politics/hilary-clinton-stencil/
    image_mask = None
    try:
        image_mask = imread(path.join(script_dir, ".".join([user,'jpg'])))
        print user
    except IOError:
        print 'Cannot open file '+ user + '.jpg under directory ' + script_dir
Пример #57
0
for i in range(len(tweets_data)):
    if tweets['lang'].loc[i] == 'en':
        text += tweets['text'].loc[i]

str1=""
text2=TextBlob(text)
for word, pos in text2.tags:
    if pos=='JJ' and word.isalpha():
       check=word.spellcheck()
       if [x[1] for x in check] == [1.0]:
          str1 += word
          str1 += " "
      
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

alice_mask = np.array(Image.open("cloud.png"))
STOPWORDS.add("rt")


wc = WordCloud(background_color="white", max_words=100, mask=alice_mask,
               stopwords=STOPWORDS.add("https"),relative_scaling=0.5)
# generate word cloud
wc.generate(str1)


# show
plt.imshow(wc)
plt.axis("off")
plt.show()
Пример #58
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#Author:Winston.Wang
import jieba
from scipy.misc import imread  # image-reading helper
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator  # word-cloud generation
import matplotlib.pyplot as plt  # plotting

back_color = imread('C:/Users/wzx/Desktop/word.jpg')  # the mask image
# use the built-in stopword set and add one more entry
STOPWORDS.add('《共·惨党宣言》')
# set the font
font = 'C:/Windows/Fonts/simhei.ttf'
wc = WordCloud(
    background_color='white',  # background colour
    max_words=1000,  # maximum number of words
    mask=back_color,  # mask image; when set, width and height are ignored
    max_font_size=80,  # largest font size
    stopwords=STOPWORDS,
    font_path=font,  # a CJK font (see C:/Windows/Fonts/), otherwise Chinese renders as boxes
    random_state=42,  # one PIL colour per word
    # width=1000,  # canvas width
    # height=860  # canvas height
)
# see the wordcloud docs for the meaning of each parameter

# add your own words to the jieba dictionary: after adding '中国改革开放',
# text containing it is kept as a single token instead of being split
jieba.add_word('中国改革开放')

# open the source text file
with open('cnword.txt', 'r', encoding="utf-8") as f:
Пример #59
0
from os import path

from scipy.misc import imread
#import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'hot_key.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = imread(path.join(d, "alice_mask.png"))

wc = WordCloud(font_path="simhei.ttf", background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS.add("Qq"))
# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "alice_Chinese.png"))

# show
# plt.imshow(wc)
# plt.axis("off")
# plt.figure()
# plt.imshow(alice_mask, cmap=plt.cm.gray)
# plt.axis("off")
# plt.show()
Пример #60
0
text = text.replace("冯世杰说", "冯世杰")
text = text.replace("叶晓明说", "叶晓明")

# 下面两种都可以读取图片 因为图片本质上就是一个二维数组
# mask = np.array(Image.open(os.path.join(d,'timg.jpg')))
mask = imread(os.path.join(d, 'timg.jpg'))  # 设置背景图片

# 生成我们的一个word云印象
# max_font_size=40 设置最大字体是40
# random_state=2 配色方案
# mask=mask 图片关联
# stopwords 屏蔽某些词
wc = WordCloud(font_path=font,
               max_words=200,
               mask=mask,
               stopwords=STOPWORDS.add("强奸"),
               background_color='green')
wc.generate(text)  # generate()根据我们的文本生成词云
image_colors = ImageColorGenerator(mask)  # 从背景图片生成颜色值
plt.imshow(wc.recolor(color_func=image_colors))  # 显示我们生成图片 根据背景颜色设置词云文字颜色
# plt.imshow() # 显示我们生成图片
plt.axis("off")
plt.show()  # 生成可视化图片
wc.to_file("text2.png")

# """
# wordcloud的所有参数
#
# font_path : string //字体路径,需要展现什么字体就把该字体路径+后缀名写上,如:font_path = '黑体.ttf'
# width : int (default=400) //输出的画布宽度,默认为400像素
# height : int (default=200) //输出的画布高度,默认为200像素