Пример #1
0
    def make_cloud(self, text):
        """Draw a word cloud of ``text`` into ``self.out`` and return ``self.out``.

        The words are laid out on a 400 by 400 canvas and drawn with
        ``scale=2``.
        """
        side = 400
        layout = wordcloud.fit_words(
            wordcloud.process_text(text), width=side, height=side)
        wordcloud.draw(layout, self.out, width=side, height=side, scale=2)
        return self.out
Пример #2
0
def analizer(query, num_topics, dictionary, corpus, alpha):
    """Train an LDA model, persist it, and draw the query's top goal as a PNG.

    The dictionary, corpus and trained model are saved under a fixed models
    directory, and the word cloud is written there as ``image.png``.

    Args:
        query: Query forwarded to ``ldam.perQueryGoalProportions``.
        num_topics: Number of topics for the LDA model.
        dictionary: Object used as ``id2word``; also saved to disk.
        corpus: Corpus used for training; also serialized to disk.
        alpha: ``alpha`` value forwarded to ``LdaModel``.

    Returns:
        Tuple ``(models_directory, lda_model)``.
    """
    models_dir = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/QueryAnalyzer/Models/"
    lda_model = models.ldamodel.LdaModel(
        corpus, num_topics=num_topics, id2word=dictionary, alpha=alpha,
        update_every=1, chunksize=50, passes=1)

    # Persist everything needed to reload the model later.
    dictionary.save(models_dir + "tmp_dictionary.dict")
    corpora.MmCorpus.serialize(models_dir + "tmp_corpus.mm", corpus)
    lda_model.save(models_dir + 'tmp_model.lda')

    proportions = ldam.perQueryGoalProportions(query, dictionary, lda_model)
    top_goal = ldam.viewPerQueryGoalProportions(proportions)

    # show_topic entries are indexed as (weight, tag); fit_words gets (tag, weight).
    tagged = [(entry[1], entry[0]) for entry in lda_model.show_topic(top_goal)]

    layout = wordcloud.fit_words(tagged, width=100, height=100)
    wordcloud.draw(layout, path.join(models_dir + 'image.png'), width=100, height=100, scale=2)
    return models_dir, lda_model
Пример #3
0
	def make_cloud(self, text):
		"""Draw a word cloud of ``text`` into ``self.out`` and return ``self.out``.

		The words are laid out on a 400 by 400 canvas and drawn with
		``scale=2``.
		"""
		side = 400
		layout = wordcloud.fit_words(
			wordcloud.process_text(text), width=side, height=side)
		wordcloud.draw(layout, self.out, width=side, height=side, scale=2)
		return self.out
Пример #4
0
 def makeCloud(self, text, font=None):
     """Draw a word cloud of ``text`` to ``self.outFile``.

     When ``font`` is ``None`` a font is picked at random from
     ``self.fonts``. The canvas is ``self.size`` square and is drawn with
     ``self.scale``.
     """
     chosen_font = random.choice(self.fonts) if font is None else font
     words, counts = wordcloud.process_text(text, max_features=2000)
     layout = wordcloud.fit_words(
         words, counts,
         width=self.size, height=self.size, font_path=chosen_font)
     wordcloud.draw(
         layout, self.outFile,
         width=self.size, height=self.size,
         scale=self.scale, font_path=chosen_font)
Пример #5
0
 def makeCloud(self, text, font=None):
     """Draw a word cloud of ``text`` to ``self.outFile``.

     When ``font`` is ``None`` a font is picked at random from
     ``self.fonts``. The canvas is ``self.size`` square and is drawn with
     ``self.scale``.
     """
     chosen_font = random.choice(self.fonts) if font is None else font
     words, counts = wordcloud.process_text(text, max_features=2000)
     layout = wordcloud.fit_words(
         words, counts,
         width=self.size, height=self.size, font_path=chosen_font)
     wordcloud.draw(
         layout, self.outFile,
         width=self.size, height=self.size,
         scale=self.scale, font_path=chosen_font)
Пример #6
0
def draw_goal(lsi, topic):
    """Draw the terms of ``topic`` from ``lsi`` as ``other_image.png``.

    The image is written into a fixed models directory.
    """
    image_path = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/QueryAnalyzer/Models/"

    # show_topic entries are indexed as (weight, tag); fit_words gets (tag, weight).
    tagged = [(entry[1], entry[0]) for entry in lsi.show_topic(topic)]

    layout = wordcloud.fit_words(tagged, width=100, height=100)
    wordcloud.draw(layout, path.join(image_path + 'other_image.png'), width=100, height=100, scale=2)
Пример #7
0
def make_word_cloud(text, filepath):
    """Draw a 400 by 400 word cloud to ``filepath`` and return ``filepath``.

    ``text`` may be a ``str``, in which case it is first run through
    ``wordcloud.process_text`` with ``max_features=20``. Any other value is
    used as-is. Either way it is passed through ``remove_letters`` before
    layout.
    """
    import wordcloud  #@UnresolvedImport

    if isinstance(text, str):
        words = wordcloud.process_text(text, max_features=20)
    else:
        words = text
    words = remove_letters(words)

    side = 400
    layout = wordcloud.fit_words(words, width=side, height=side)
    wordcloud.draw(layout, filepath, width=side, height=side, scale=1)
    return filepath
def words_cloud(args):
    """Export scored words for every fold/epoch of an experiment.

    For each ``fold`` directory under ``args.exp`` and each ``epoch`` entry
    inside it, the epoch is loaded with ``load_compressed`` and scored with
    ``scored_epoch_words``. Every ``[r][t][pn]`` entry of the result is then
    written either as a ``.wordle`` text file (``args.for_wordle`` truthy)
    or as a PNG word cloud.

    Args:
        args: Options object. Attributes read here: ``vocabulary``,
            ``voc_key``, ``exp``, ``out``, ``force_fold``, ``force_epoch``,
            ``force_region``, ``nwords``, ``for_wordle`` and ``font``.
    """
    voc = sio.loadmat(args.vocabulary)[args.voc_key]

    folds = sorted(x for x in os.listdir(args.exp) if "fold" in x)
    if args.force_fold >= 0:
        folds = [x for x in folds if "%d" % args.force_fold in x]

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point args.exp at experiment directories you trust.
    with open(os.sep.join([args.exp, "cmd_opts"]), "rb") as opts_file:
        exp_opts = pickle.load(opts_file)

    out_dir = os.sep.join([
        args.out,
        "u_lambda=%s" % exp_opts.u_lambda,
        "w_lambda=%s" % exp_opts.w_lambda
    ])
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger.info("Number of folds: %d" % len(folds))
    for fold in folds:
        fold_dir = os.sep.join([args.exp, fold])
        epochs = sorted(x for x in os.listdir(fold_dir) if "epoch" in x)
        if args.force_epoch >= 0:
            epochs = [x for x in epochs if "%d" % args.force_epoch in x]

        for epoch_name in epochs:
            ef_out_dir = os.sep.join([out_dir, fold, epoch_name])
            if not os.path.exists(ef_out_dir):
                os.makedirs(ef_out_dir)
            epoch = load_compressed(os.sep.join([fold_dir, epoch_name]))

            region_task_words = scored_epoch_words(epoch, voc, args.nwords,
                                                   args.force_region)

            for r in region_task_words:
                for t in region_task_words[r]:
                    for pn in region_task_words[r][t]:
                        scored = region_task_words[r][t][pn]
                        if args.for_wordle:
                            outname = "%s_r%dt%d.wordle" % (pn, r, t)
                            outfile = os.sep.join([ef_out_dir, outname])
                            # The context manager closes the file even if a
                            # write fails part-way through.
                            with open(outfile, "w") as f:
                                for word, score in scored:
                                    f.write("%s: %2.5f\n" % (word, score))
                        else:
                            outname = "%s_r%dt%d.png" % (pn, r, t)
                            fit_rtw = wordcloud.fit_words(
                                scored, font_path=args.font)
                            wordcloud.draw(fit_rtw,
                                           os.sep.join([ef_out_dir, outname]),
                                           font_path=args.font)
Пример #9
0
def word_cloud():
    """Draw one ``gs_topic_<n>.png`` per line of ``final_topics.txt``.

    Each line is cut after its last ``":"`` and the remainder is split on
    ``" + "`` into ``score*word`` terms. The file is read from
    ``MODELS_DIR`` and is closed even if parsing or drawing raises.
    """
    topics_path = os.path.join(MODELS_DIR, "final_topics.txt")
    with open(topics_path, 'rb') as final_topics:
        for curr_topic, line in enumerate(final_topics):
            # The offset is taken on the raw line but applied to the
            # stripped one, exactly as before.
            line = line.strip()[line.rindex(":") + 2:]
            terms = line.split(" + ")
            scores = [float(x.split("*")[0]) for x in terms]
            words = [x.split("*")[1] for x in terms]
            freqs = list(zip(words, scores))
            # NOTE(review): the fit_words layout step is disabled here, so
            # draw receives raw (word, score) pairs - confirm draw accepts
            # them.
            wordcloud.draw(freqs, "gs_topic_%d.png" % (curr_topic), width=120, height=120)
def word_cloud():
    """Draw one ``gs_topic_<n>.png`` per line of ``final_topics.txt``.

    Each line is cut after its last ``":"`` and the remainder is split on
    ``" + "`` into ``score*word`` terms. The file is read from
    ``MODELS_DIR`` and is closed even if parsing or drawing raises.
    """
    topics_path = os.path.join(MODELS_DIR, "final_topics.txt")
    with open(topics_path, 'rb') as final_topics:
        for curr_topic, line in enumerate(final_topics):
            # The offset is taken on the raw line but applied to the
            # stripped one, exactly as before.
            line = line.strip()[line.rindex(":") + 2:]
            terms = line.split(" + ")
            scores = [float(x.split("*")[0]) for x in terms]
            words = [x.split("*")[1] for x in terms]
            freqs = list(zip(words, scores))
            # NOTE(review): the fit_words layout step is disabled here, so
            # draw receives raw (word, score) pairs - confirm draw accepts
            # them.
            wordcloud.draw(freqs, "gs_topic_%d.png" % (curr_topic), width=120, height=120)
Пример #11
0
def ldaGoalDistribution(goals_distribution, max_goal, image_path, lda_n, name):
    """Print the terms of topic ``max_goal`` and draw them as a word cloud.

    Args:
        goals_distribution: Not used by this function.
        max_goal: Topic index passed to ``lda_n.show_topic``.
        image_path: Prefix that ``name`` is appended to for the output file.
        lda_n: Model exposing ``show_topic``.
        name: File name appended directly to ``image_path``.
    """
    top_terms = lda_n.show_topic(max_goal)
    print(top_terms)

    # show_topic entries are indexed as (weight, tag); fit_words gets (tag, weight).
    tagged = [(term[1], term[0]) for term in top_terms]

    layout = wordcloud.fit_words(tagged, width=100, height=100)
    wordcloud.draw(layout, path.join(image_path + name), width=100, height=100, scale=2)
Пример #12
0
def freq_words(string):
    """Draw ``frequent_words.png`` from the word frequencies of ``string``.

    The text is tokenized, entries found in ``total_stop_words`` are
    dropped, and the ``FreqDist`` items are sliced: the first 100 when at
    least 100 exist, otherwise the first half of them. The image is
    written into directory ``d`` on a 500 by 500 canvas with ``scale=2``.
    """
    print("\n\n\n\t\t\tReading from file")

    tokens = word_tokenize(string)
    kept = [tok for tok in tokens if tok not in total_stop_words]
    text_obj = nltk.Text(kept)

    print("\n\n\n\t\t\tProcessing")

    # NOTE(review): slicing assumes items() returns a list (Python 2 era
    # NLTK) - confirm the ordering it provides.
    pairs = FreqDist(text_obj).items()
    limit = 100 if len(pairs) >= 100 else len(pairs) // 2
    chosen_words = pairs[:limit]

    print("\n\n\n\t\t\tDrawing cloud")

    layout = wordcloud.fit_words(chosen_words, width=500, height=500)
    wordcloud.draw(layout, path.join(d, 'frequent_words.png'),
                   width=500, height=500, scale=2)

    print("\n\n\n\t\t\tWord cloud generated in frequent_words.png file")
Пример #13
0
def drawTags(model, lsi, query, dictionary, image_path, tfidf):
    """Draw the query's top goal from ``lsi`` as ``lsa-image.png``.

    The goal is chosen through ``model.perQueryGoalProportions`` and
    ``model.viewPerQueryGoalProportions``. Its terms are printed and then
    drawn to ``image_path + 'lsa-image.png'``.
    """
    print("Init drawTags")
    proportions = model.perQueryGoalProportions(query, dictionary, tfidf, lsi)
    top_goal = model.viewPerQueryGoalProportions(proportions)
    top_terms = lsi.show_topic(top_goal)
    print(top_terms)

    # show_topic entries are indexed as (weight, tag); fit_words gets (tag, weight).
    tagged = [(term[1], term[0]) for term in top_terms]

    layout = wordcloud.fit_words(tagged, width=100, height=100)
    wordcloud.draw(layout, path.join(image_path + 'lsa-image.png'), width=100, height=100, scale=2)
Пример #14
0
def main():
    """Draw ``../topic_<i>.png`` for up to 100 topics of the saved LDA model."""
    font = "/Library/Fonts/Tahoma.ttf"

    # The dictionary and corpus are loaded but not used afterwards; the
    # loads are kept so the same files are still required.
    dictionary = gensim.corpora.Dictionary.load(
        "dict_abstracts_corpus_1_cleaned.dict")
    corpus = gensim.corpora.MmCorpus("abstracts_corpus_1_cleaned.mm")
    lda = gensim.models.LdaModel.load("abstracts_corpus_1_cleaned.lda")

    for i, topic in enumerate(lda.show_topics(100, formatted=False)):
        # Topic entries are indexed as (weight, word); fit_words gets (word, weight).
        pairs = [(str(l[1]), l[0]) for l in topic]
        layout = wordcloud.fit_words(pairs, font_path=font)
        wordcloud.draw(layout, "../topic_%i.png" % i, font_path=font)
Пример #15
0
def words_cloud(args):
    """Export scored words for every fold/epoch of an experiment.

    For each ``fold`` directory under ``args.exp`` and each ``epoch`` entry
    inside it, the epoch is loaded with ``load_compressed`` and scored with
    ``scored_epoch_words``. Every ``[r][t][pn]`` entry of the result is then
    written either as a ``.wordle`` text file (``args.for_wordle`` truthy)
    or as a PNG word cloud.

    Args:
        args: Options object. Attributes read here: ``vocabulary``,
            ``voc_key``, ``exp``, ``out``, ``force_fold``, ``force_epoch``,
            ``force_region``, ``nwords``, ``for_wordle`` and ``font``.
    """
    voc = sio.loadmat(args.vocabulary)[args.voc_key]

    folds = sorted(x for x in os.listdir(args.exp) if "fold" in x)
    if args.force_fold >= 0:
        folds = [x for x in folds if "%d" % args.force_fold in x]

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point args.exp at experiment directories you trust.
    with open(os.sep.join([args.exp, "cmd_opts"]), "rb") as opts_file:
        exp_opts = pickle.load(opts_file)

    out_dir = os.sep.join([
        args.out,
        "u_lambda=%s" % exp_opts.u_lambda,
        "w_lambda=%s" % exp_opts.w_lambda,
    ])
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger.info("Number of folds: %d" % len(folds))
    for fold in folds:
        fold_dir = os.sep.join([args.exp, fold])
        epochs = sorted(x for x in os.listdir(fold_dir) if "epoch" in x)
        if args.force_epoch >= 0:
            epochs = [x for x in epochs if "%d" % args.force_epoch in x]

        for epoch_name in epochs:
            ef_out_dir = os.sep.join([out_dir, fold, epoch_name])
            if not os.path.exists(ef_out_dir):
                os.makedirs(ef_out_dir)
            epoch = load_compressed(os.sep.join([fold_dir, epoch_name]))

            region_task_words = scored_epoch_words(
                epoch, voc, args.nwords, args.force_region)

            for r in region_task_words:
                for t in region_task_words[r]:
                    for pn in region_task_words[r][t]:
                        scored = region_task_words[r][t][pn]
                        if args.for_wordle:
                            outname = "%s_r%dt%d.wordle" % (pn, r, t)
                            outfile = os.sep.join([ef_out_dir, outname])
                            # The context manager closes the file even if a
                            # write fails part-way through.
                            with open(outfile, "w") as f:
                                for word, score in scored:
                                    f.write("%s: %2.5f\n" % (word, score))
                        else:
                            outname = "%s_r%dt%d.png" % (pn, r, t)
                            fit_rtw = wordcloud.fit_words(
                                scored, font_path=args.font)
                            wordcloud.draw(
                                fit_rtw,
                                os.sep.join([ef_out_dir, outname]),
                                font_path=args.font)
Пример #16
0
def make_cloud(channel, time, myType=None, drawLabels=True, font_path=None):
    """Draw word-cloud PNGs from the logs stored for ``channel`` at ``time``.

    With ``myType`` left as ``None`` two images are written under
    ``images/<channel>/<time>``: ``wordcloud.png`` from the upper-cased
    ``words.log`` and ``emotecloud.png`` from ``emotes.log``. Otherwise a
    single ``<myType>cloud.png`` is drawn from ``<myType>.log``.

    The canvas sizes of the default clouds come from the names ``w_words``,
    ``h_words``, ``w_emotes`` and ``h_emotes``, which this function expects
    to find in the enclosing module.

    Args:
        channel: Channel name; used as a path component.
        time: Session identifier; used as a path component.
        myType: Base name of a single log to render, or ``None``.
        drawLabels: Accepted but not used by this function.
        font_path: Font forwarded to wordcloud for the ``myType`` cloud only.
    """
    log_dir = "logs/" + channel + '/' + time
    image_dir = "images/" + channel + '/' + time

    if myType is None:
        with open(os.path.relpath(log_dir + '/words.log'), 'r') as f:
            words = f.read().upper()

        with open(os.path.relpath(log_dir + '/emotes.log'), 'r') as f:
            # Keep entries longer than 3 characters, minus two fixed words.
            emotes = " ".join(
                entry for entry in f.read().split('\n')
                if len(entry) > 3 and entry not in ('double', 'triple'))

        if not os.path.exists(image_dir):
            os.makedirs(image_dir)

        print("Generating word cloud... Hold on! (This takes a while if there are a lot of messages)")
        scale = 2
        w = wordcloud.process_text(words, max_features=1500)
        elements = wordcloud.fit_words(w, width=w_words/scale, height=h_words/scale)
        wordcloud.draw(elements, os.path.relpath(image_dir + '/wordcloud.png'),
                       width=w_words/scale, height=h_words/scale, scale=scale)
        print("Word cloud created!")

        print("Generating emote cloud...")
        w = wordcloud.process_text(emotes, max_features=1500)
        elements = wordcloud.fit_words(w, width=w_emotes, height=h_emotes)
        wordcloud.draw(elements, os.path.relpath(image_dir + '/emotecloud.png'),
                       width=w_emotes, height=h_emotes)
        print("Emote cloud created!")
    else:
        # Manual runs for a single log; mainly a debugging aid.
        w_custom = 1100
        h_custom = 700
        file_path = os.path.relpath(log_dir + '/' + myType + '.log')

        if not os.path.exists(image_dir):
            os.makedirs(image_dir)

        with open(file_path, 'r') as f:
            data = f.read()
        if myType.lower() == 'authors':
            data = data.upper()

        print("Generating " + myType + " cloud... Hold on!")
        scale = 2
        # Both operands are ints, so floor division keeps the canvas size
        # integral on Python 2 and 3 alike.
        width = w_custom // scale
        height = h_custom // scale
        w = wordcloud.process_text(data, max_features=1000)
        elements = wordcloud.fit_words(w, width=width, height=height,
                                       font_path=font_path)
        wordcloud.draw(elements,
                       os.path.relpath(image_dir + '/' + myType + 'cloud.png'),
                       width=width, height=height, scale=scale,
                       font_path=font_path)
        print(myType + " cloud created!")
# Secrets must not live in source control: the OAuth credentials are read
# from the environment. A missing variable raises KeyError at start-up.
# NOTE(review): assumes `globby` is the `os` module (it is already used as
# `globby.path` below) - confirm against the import that is out of view.
OAUTH_TOKEN = globby.environ["OAUTH_TOKEN"]
OAUTH_TOKEN_SECRET = globby.environ["OAUTH_TOKEN_SECRET"]

twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

# Most recently changed PNG and text file in the working directory.
newest = max(glob.iglob('*.png'), key=globby.path.getctime)
newesttext = max(glob.iglob('*.txt'), key=globby.path.getctime)

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, newesttext)) as text_file:
    text = text_file.read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text)
# Compute the position of the words.
elements = wordcloud.fit_words(words)
# Draw the positioned words to a PNG named after the text file.
newesttext = newesttext[:-4]
trend = "#" + newesttext
newesttext = newesttext + ".png"
wordcloud.draw(elements, path.join(d, newesttext))


#photo = open(str(newest), 'rb')
#twitter.update_status_with_media(status=trend, media=photo)
Пример #18
0
#!/usr/bin/env python2

from os import path
import sys
import wordcloud

# print 'Number of arguments:', len(sys.argv), 'arguments.'
# print 'Argument List:', str(sys.argv)

# Experimenting with random seeds
import random
random.seed(42)

if len(sys.argv) != 5:
    # sys.exit with a message prints it to stderr and exits with status 1,
    # so callers can detect the usage error.
    sys.exit("[ USAGE ]: %s <WordsFile> <OutputFile> <Width> <Height>"
             % sys.argv[0])

d = path.dirname(__file__)
words_file, output_file = sys.argv[1], sys.argv[2]
# Parse the canvas size once, before any file is touched.
width, height = int(sys.argv[3]), int(sys.argv[4])

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, words_file)) as text_file:
    text = text_file.read()

# Separate into a list of (word, frequency).
words = wordcloud.process_text(text, max_features=500)
# Compute the position of the words.
elements = wordcloud.fit_words(words, width=width, height=height)
# Draw the positioned words to a PNG file.
wordcloud.draw(elements, path.join(d, output_file),
               width=width, height=height, scale=2)
import os
import wordcloud

MODELS_DIR = "models"

# Draw one gs_topic_<n>.png per line of final_topics.txt. The context
# manager closes the file even if parsing or drawing raises.
with open(os.path.join(MODELS_DIR, "final_topics.txt"), 'rb') as final_topics:
    for curr_topic, line in enumerate(final_topics):
        # Keep what follows the last ":"; the rest is "score*word" terms
        # joined by " + ".
        line = line.strip()[line.rindex(":") + 2:]
        terms = line.split(" + ")
        scores = [float(x.split("*")[0]) for x in terms]
        words = [x.split("*")[1] for x in terms]
        freqs = list(zip(words, scores))
        elements = wordcloud.fit_words(freqs, width=120, height=120)
        wordcloud.draw(elements, "gs_topic_%d.png" % (curr_topic),
                       width=120, height=120)
Пример #20
0
import os
import wordcloud

MODELS_DIR = "."

# Draw one gs_topic_<n>.png per line of final_topics.txt. The context
# manager closes the file even if parsing or drawing raises.
with open(os.path.join(MODELS_DIR, "final_topics.txt"), 'rb') as final_topics:
    for curr_topic, line in enumerate(final_topics):
        # Keep what follows the last ":"; the rest is "score*word" terms
        # joined by " + ".
        line = line.strip()[line.rindex(":") + 2:]
        terms = line.split(" + ")
        scores = [float(x.split("*")[0]) for x in terms]
        words = [x.split("*")[1] for x in terms]
        freqs = list(zip(words, scores))
        elements = wordcloud.fit_words(freqs, width=120, height=120)
        wordcloud.draw(elements,
                       "gs_topic_%d.png" % (curr_topic),
                       width=120,
                       height=120)
Пример #21
0
import splider_test
import BeautifulSoup
import config
import Regex
import wordcloud


def home_handle(home_url, dic):
    """Parse every accepted blog URL listed on ``home_url`` into ``dic``.

    URLs come from ``BeautifulSoup.home_parse``. Those rejected by
    ``Regex.url_judge`` are skipped; the others are fetched and handed to
    ``BeautifulSoup.parse`` together with ``dic``.
    """
    for url in BeautifulSoup.home_parse(home_url):
        if not Regex.url_judge(url):
            continue
        print("Begin handling:\t", url)
        page_html = splider_test.splider(url)
        BeautifulSoup.parse(page_html, dic)


home_base = 'https://edu.cnblogs.com/campus/buaa/OO2018'
dic = {}
# Pages 1..23; the first page is the bare URL, later ones add ?page=<n>.
for i in range(1, 24):
    home_url = home_base if i == 1 else home_base + "?page=" + str(i)
    home_handle(home_url, dic)

# Entries ordered by their value, largest first.
list_sort = sorted(dic.items(), key=lambda e: e[1], reverse=True)
list_word, list_count = [], []
config.trans(list_word, list_count, list_sort)
wordcloud.draw(list_word, list_count)
Пример #22
0
def genWordCloud(filename):
    """Draw a word cloud PNG for ``filename`` and return a status message.

    The image is written to ``pngPath`` plus the file name with its
    extension replaced by ``.png``.
    """
    side = 500
    counts = countWords(openTxt(filename), 1000)
    layout = wordcloud.fit_words(counts, width=side, height=side)
    target = pngPath + os.path.splitext(filename)[0] + '.png'
    wordcloud.draw(layout, target, width=side, height=side, scale=2)
    return 'Cloud generated for {}'.format(filename)
Пример #23
0
#!/usr/bin/env python2

from os import path
import sys
import wordcloud

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, 'presinaug-addresses.txt')) as text_file:
    text = text_file.read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text, max_features=10000)
# Compute the position of the words.
elements = wordcloud.fit_words(words, width=900, height=1600)
# Draw the positioned words to a PNG file.
wordcloud.draw(elements,
               path.join(d, 'presinaug-wordcloud-1600x900.png'),
               width=900,
               height=1600,
               scale=1)
    for row in fileReader:
        topicKeys[int(row[0])] = row[2].split(' ')

# Main work: one word cloud per source, built from the key words of the
# topics returned by get_top_topics.
for source in sources:
    print("Processing %s..." % source)
    topics, weightSum = get_top_topics([source], docWeights, weightThreshold)

    # Every topic contributes its key words followed by one space.
    text = "".join(" ".join(topicKeys[topic]) + " " for topic in topics)

    words = wordcloud.process_text(text)
    elements = wordcloud.fit_words(words, font_path=fontPath)
    outpath = "/Users/jchan/Desktop/Dropbox/Research/Dissertation/OpenIDEO/Pipeline/Validation/wordclouds/k400t50/%s.png" % source
    wordcloud.draw(elements, outpath, font_path=fontPath)

## read in the concept list
#concepts = {}
#with open(conceptFile, 'rU') as csvfile:
#    filereader = csv.reader(csvfile, delimiter=',', quotechar='|')
#    for row in filereader:
#        concepts[row[0]] = row[1]
#
## grab topic-keys -> hash: key = topic, value = list of words
#topicKeys = {}
#with open(keysFile,'rU') as csvFile:
#    fileReader = csv.reader(csvFile, delimiter='\t', quotechar='|')
#    for row in fileReader:
#        topicKeys[row[0]] = row[2].split(' ')
#        
Пример #25
0
#!/usr/bin/env python2
#coding=utf-8

from os import path
import sys
import wordcloud

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, 'alice.txt')) as text_file:
    text = text_file.read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text, max_features=2000)
# Compute the position of the words.
elements = wordcloud.fit_words(words, width=500, height=500)
# Draw the positioned words to a PNG file.
wordcloud.draw(elements, path.join(d, 'alice.png'), width=500, height=500,
               scale=2)
Пример #26
0
import wordcloud
import os
from gensim import corpora


def make_words(dictionary):
    """Return ``(word, count)`` pairs for every token of ``dictionary``.

    Args:
        dictionary: Object exposing ``token2id`` (word -> token id) and
            ``dfs`` (token id -> count), such as the gensim dictionary this
            script loads.

    Returns:
        A list with one ``(word, dictionary.dfs[token_id])`` tuple per
        entry of ``token2id``.
    """
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    return [(word, dictionary.dfs[token_id])
            for word, token_id in dictionary.token2id.items()]


# Pick the font location for the current platform: POSIX hosts use the
# per-user font directory, everything else the Windows font directory.
if os.name == 'posix':
    wordcloud.FONT_PATH = '/Users/micazook/Library/Fonts/DroidSansMono.ttf'
else:
    wordcloud.FONT_PATH = 'C:/Windows/Fonts/DroidSansMono.ttf'

dictionary = corpora.Dictionary.load('../dictionary.dict')
words = make_words(dictionary)
elements = wordcloud.fit_words(words, width=500, height=500)
wordcloud.draw(elements, '../word_cloud.png', width=500, height=500, scale=2)
Пример #27
0
#!/usr/bin/env python2

from os import path
import sys
import wordcloud

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, 'constitution.txt')) as text_file:
    text = text_file.read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text)
# Compute the position of the words.
elements = wordcloud.fit_words(words)
# Draw the positioned words to a PNG file.
wordcloud.draw(elements, path.join(d, 'constitution.png'))
Пример #28
0
# Count how often each topic id appears across the per-document topics.
for doc_top in topics:
    for ti, _ in doc_top:
        counts[ti] += 1

# 50 terms of the most and of the least frequently seen topic.
words_max = model.show_topic(counts.argmax(), 50)
words_min = model.show_topic(counts.argmin(), 50)

# NOTE(review): the term weights from show_topic are discarded and each
# word is paired with counts[i] / counts[i + 50], i.e. per-topic counts
# indexed by word position - confirm this is the intended weighting.
wlist_max = [j for i, j in words_max]
wf_max = [(wlist_max[i], counts[i]) for i in range(50)]

wlist_min = [j for i, j in words_min]
wf_min = [(wlist_min[i], counts[i + 50]) for i in range(50)]

d = path.dirname(__file__)

elements_max = wordcloud.fit_words(wf_max)
wordcloud.draw(elements_max, path.join(d, 'top50.png'), scale=3)

elements_min = wordcloud.fit_words(wf_min)
wordcloud.draw(elements_min, path.join(d, 'bottom50.png'), scale=3)
Пример #29
0
def wordclouds(x):
    """Draw ``str(x)`` as ``WC.png`` and return an ``Image`` of that file.

    The PNG is written into ``/Users/MrG/Capstone`` with ``scale=5``.
    """
    out_dir = path.dirname("/Users/MrG/Capstone/")
    freqs = wordcloud.process_text(str(x), max_features=500)
    layout = wordcloud.fit_words(freqs)
    wordcloud.draw(layout, path.join(out_dir, "WC.png"), scale=5)
    return Image(filename='/Users/MrG/Capstone/WC.png', height=1000, width=618)
Пример #30
0
def produceWordCloud(inputText, outputPng):
    """Draw up to 400 words of ``inputText`` to ``outputPng``.

    The canvas is 800 wide and 500 high and is drawn with ``scale=2``.
    """
    canvas = dict(width=800, height=500)
    freqs = wordcloud.process_text(inputText, max_features=400)
    layout = wordcloud.fit_words(freqs, **canvas)
    wordcloud.draw(layout, outputPng, scale=2, **canvas)
Пример #31
0
# Cluster
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=False)
km.fit(X)


# Create cluster outputs
output_dict = {'cluster': km.labels_, 'values': dataset}
output_df = pd.DataFrame(output_dict)

# Create text files: one "cluster <i>.txt" per cluster, one value per line.
for i in range(true_k):
    members = output_df[output_df.cluster == i]
    # Cluster size and its share of the whole data set in percent.
    share = round(100 * len(members) / float(len(output_df)), 2)
    print("%s %s" % (len(members), share))

    cluster_text = members['values'].values
    temp = "cluster " + str(i) + ".txt"

    with open(temp, "w") as outfile:
        for j in cluster_text:
            outfile.write("%s\n" % j)

# Create wordclouds
font = '/Library/Fonts/Arial Black.ttf'
for i in range(true_k):
    # The context manager closes the cluster file once it has been read.
    with open('cluster ' + str(i) + '.txt') as infile:
        text = infile.read()
    # Separate into a list of (word, frequency).
    words = wordcloud.process_text(text)
    # Compute the position of the words.
    elements = wordcloud.fit_words(words, font_path=font, width=600, height=300)
    # Draw the positioned words to a PNG file.
    wordcloud.draw(elements, 'cluster ' + str(i) + '.png', font_path=font, width=600, height=300)
Пример #32
0
        # lemmatize based on WordNet 
        txtfeatWords5 = [ lmtzr.lemmatize(word) for word in txtfeatWords4 ]
        
        # stem using snowball stemmer
        txtfeatWords6 = [ stemmer.stem(word) for word in txtfeatWords5 ]
        
        # remove punctuations
        txtfeatWords7 = [ word.encode('utf-8').translate(None,string.punctuation) for word in txtfeatWords6 ]
        
        # remove empty strings
        txtfeatWords8 = [ word for word in txtfeatWords7 if word <> '' ]
        
        txtfeatWordList[i] = ' '.join(txtfeatWords8)
        
        #pprint('Iteration: %d'% i)
        #pprint(txtfeatWordList[i])
   

pprint(txtfeatWordList)

# One line of text per processed entry, for the first num_total entries.
text = '\n'.join(str(txtfeatWordList[i]) for i in range(num_total))

#tags = make_tags(get_tag_counts( '\n'.join([ str(txtfeatWordList[i]) for i in range(num_total) ]) ))
#create_tag_image(tags, 'cloud_large.png', size=(1800, 1200), fontname='Lobster')

# Write the cloud next to this script.
d = os.path.dirname(__file__)
words = wordcloud.process_text(text)
elements = wordcloud.fit_words(words)
wordcloud.draw(elements, os.path.join(d, 'lemmatized_wordle.png'))

Пример #33
0
import wordcloud
import os
from gensim import corpora


def make_words(dictionary):
    """Return ``(word, count)`` pairs for every token of ``dictionary``.

    Args:
        dictionary: Object exposing ``token2id`` (word -> token id) and
            ``dfs`` (token id -> count), such as the gensim dictionary this
            script loads.

    Returns:
        A list with one ``(word, dictionary.dfs[token_id])`` tuple per
        entry of ``token2id``.
    """
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    return [(word, dictionary.dfs[token_id])
            for word, token_id in dictionary.token2id.items()]

# Pick the font location for the current platform: POSIX hosts use the
# per-user font directory, everything else the Windows font directory.
if os.name == 'posix':
    wordcloud.FONT_PATH = '/Users/micazook/Library/Fonts/DroidSansMono.ttf'
else:
    wordcloud.FONT_PATH = 'C:/Windows/Fonts/DroidSansMono.ttf'

dictionary = corpora.Dictionary.load('../dictionary.dict')
words = make_words(dictionary)
elements = wordcloud.fit_words(words, width=500, height=500)
wordcloud.draw(elements, '../word_cloud.png', width=500, height=500, scale=2)
Пример #34
0
#!/usr/bin/env python2

from os import path
import sys
import wordcloud

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, 'constitution.txt')) as text_file:
    text = text_file.read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text)
# Compute the position of the words with the older layout routine.
elements = wordcloud.fit_words_old(words)
# Show the layout, then draw the positioned words to a PNG file.
print(elements)
wordcloud.draw(elements, path.join(d, 'constitution.png'))
Пример #35
0
import os as globby

# Secrets must not live in source control: all four Twitter credentials are
# read from the environment. A missing variable raises KeyError at start-up.
APP_KEY = globby.environ["APP_KEY"]
APP_SECRET = globby.environ["APP_SECRET"]
OAUTH_TOKEN = globby.environ["OAUTH_TOKEN"]
OAUTH_TOKEN_SECRET = globby.environ["OAUTH_TOKEN_SECRET"]

twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

# Most recently changed PNG and text file in the working directory.
newest = max(glob.iglob('*.png'), key=globby.path.getctime)
newesttext = max(glob.iglob('*.txt'), key=globby.path.getctime)

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, newesttext)) as text_file:
    text = text_file.read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text)
# Compute the position of the words.
elements = wordcloud.fit_words(words)
# Draw the positioned words to a PNG named after the text file.
newesttext = newesttext[:-4]
trend = "#" + newesttext
newesttext = newesttext + ".png"
wordcloud.draw(elements, path.join(d, newesttext))

#photo = open(str(newest), 'rb')
#twitter.update_status_with_media(status=trend, media=photo)
Пример #36
0
from os import path
import sys
import wordcloud

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, 'alice.txt')) as text_file:
    text = text_file.read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text, max_features=2000)
# Compute the position of the words.
elements = wordcloud.fit_words(words, width=500, height=500)
# Draw the positioned words to a PNG file.
wordcloud.draw(elements,
               path.join(d, 'alice.png'),
               width=500,
               height=500,
               scale=2)

########NEW FILE########
__FILENAME__ = simple
#!/usr/bin/env python2

from os import path
import sys
import wordcloud

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, 'constitution.txt')) as text_file:
    text = text_file.read()
Пример #37
0
        txtfeatWords5 = [lmtzr.lemmatize(word) for word in txtfeatWords4]

        # stem using snowball stemmer
        txtfeatWords6 = [stemmer.stem(word) for word in txtfeatWords5]

        # remove punctuations
        txtfeatWords7 = [
            word.encode('utf-8').translate(None, string.punctuation)
            for word in txtfeatWords6
        ]

        # remove empty strings
        txtfeatWords8 = [word for word in txtfeatWords7 if word <> '']

        txtfeatWordList[i] = ' '.join(txtfeatWords8)

        #pprint('Iteration: %d'% i)
        #pprint(txtfeatWordList[i])

pprint(txtfeatWordList)

# One line of text per processed entry, for the first num_total entries.
text = '\n'.join(str(txtfeatWordList[i]) for i in range(num_total))

#tags = make_tags(get_tag_counts( '\n'.join([ str(txtfeatWordList[i]) for i in range(num_total) ]) ))
#create_tag_image(tags, 'cloud_large.png', size=(1800, 1200), fontname='Lobster')

# Write the cloud next to this script.
d = os.path.dirname(__file__)
words = wordcloud.process_text(text)
elements = wordcloud.fit_words(words)
wordcloud.draw(elements, os.path.join(d, 'lemmatized_wordle.png'))
Пример #38
0
def generateCloud(text):
    """Draw up to 1000 words of ``text`` as ``wordcloud.png`` beside this file.

    The canvas is 1000 by 1000.
    """
    side = 1000
    base_dir = path.dirname(__file__)
    freqs = wordcloud.process_text(text, max_features=1000)
    layout = wordcloud.fit_words(freqs, width=side, height=side)
    wordcloud.draw(layout, path.join(base_dir, 'wordcloud.png'), width=side, height=side)
Пример #39
0
d = path.dirname(__file__)

# Web pages to pull text from (found by searching for "Deonte Burton draft").
url_list = ["http://www.draftexpress.com/profile/Deonte-Burton-6487/",
            "http://blogs.rgj.com/chrismurray/2014/01/10/nba-scouts-view-nevadas-deonte-burton-as-solid-draft-pick-but-not-a-first-rounder/",
            "http://www.nbadraftroom.com/2014/01/deonte-burton.html",
            "http://www.nbadraftinsider.com/deonte-burton/",
            "http://nbaprospects.blogspot.com/2012/08/scouting-report-deonte-burton-nevada.html",
            "http://rushthecourt.net/2014/01/09/a-college-basketball-resolution-for-2014-get-to-know-nevadas-deonte-burton/",
            "http://mrsportsblog.wordpress.com/2014/03/05/trust-me-on-this-dynamic-deonte-burton-of-nevada-will-be-making-a-living-in-the-nba/",
            "http://www.draftexpress.com/article/NBA-Draft-Prospect-of-the-Week-Deonte-Burton-4392/",
            "http://www.nevadawolfpack.com/sports/m-baskbl/spec-rel/021214aad.html"]

# Collect the summary of every page, each followed by a single space.
summaries = []
for url in url_list:
    content = urllib2.urlopen(url)
    summaries.append(Document(content).summary() + " ")
text = "".join(summaries)

# Separate into a list of (word, frequency).
words = wordcloud.process_text(text)

# Compute the position of the words.
elements = wordcloud.fit_words(words)

# Draw the positioned words to a PNG file.
wordcloud.draw(elements, path.join(d, 'db2.png'))
Пример #40
0
#!/usr/bin/env python2

from os import path
import sys
import wordcloud

d = path.dirname(__file__)

# Read the whole text; the context manager closes the file afterwards.
with open(path.join(d, '4chdata/all.dat')) as text_file:
    text = text_file.read()
# Separate into a list of (word, frequency).
words = wordcloud.process_text(text, max_features=1000)
# Compute the position of the words.
elements = wordcloud.fit_words(words, width=1000, height=1000)
# Draw the positioned words to the PNG named by the first CLI argument.
wordcloud.draw(elements, path.join(d, str(sys.argv[1])), width=1000, height=1000,
               scale=2)