Example #1
def draw(infile, name):
	#Read in input
	#infile = open(file_name, 'r')
	infile.seek(0)
	str_in = infile.read()

	BlackList = open("BlackList", 'r')
	list_words = BlackList.read().split(" ") + [name]
	
	#Processes the input
	words, sums = sum([x for x in str_in.split() if x not in list_words])

	#Actually makes the wordcloud itself
	wordcloud.make_wordcloud(words = words, counts = sums, fname ="out.png",width=1600, height=800)
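Example #1 unpacks words, sums from a call to sum(...), so the project evidently ships its own sum helper (shadowing the builtin) that turns the filtered token list into parallel word/count arrays. A minimal sketch of such a helper, under that assumption (the name and return convention are inferred only from how the snippet uses them):

import numpy as np
from collections import Counter

def sum(tokens):  # hypothetical helper; shadows the builtin, as the snippet's call implies
    freq = Counter(tokens)                  # tally each remaining token
    words = np.array(list(freq.keys()))     # unique words
    counts = np.array(list(freq.values()))  # matching frequencies
    return words, counts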
Example #2
def ctm_word_cloud():
    # topic proportions in the new document; keep only the top two topics
    with open('holdout-phi-sum.dat','r') as fp:
        topicProportion=fp.readlines()
        # the numbers in the list are floats
        topicProportion=map(lambda x: float(x.split()[0]),topicProportion)
        # tP=[[10,0],[90,1],[50,2],..]
        tP=[]
        for i in range(len(topicProportion)):
            item1=list((topicProportion[i],i))
            tP.append(item1)

        # sort in ascending order
        tP.sort()
        # take two topics, tP_new=[[75,8],[90,2]],    [topic 'weight', topic index]
        tP_new=tP[-2:]
        
        sum2=tP_new[0][0]+tP_new[1][0]
        
        for i in range(2):
            tP_new[i][0]=tP_new[i][0]/sum2
        
        # now, tP_new=[[0.4,8],[0.6,2]]

    topics=save_topics('final-log-beta.dat', 'vocab.dat')
    
    # build the CTM word cloud
    wordsList=[]
    countsList=[]
    for (rate,topic) in tP_new:
        if rate>=0.1:
            # round to the nearest integer
            wordsNum=int(round(WORDS*rate))
            # take wordsNum words from this topic and append them to words and counts
            for i in range(wordsNum):
                wordsList.append(topics[topic][i][1])
                countsList.append(topics[topic][i][0])
    
    words=np.array(wordsList)
    counts=np.array(countsList)
    font_path=r'C:\Windows\Fonts\simsun.ttc' # SimSun font
    imageName='ctm'
    wordcloud.make_wordcloud(words,counts,font_path,imageName)
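From the way ctm_word_cloud indexes it, save_topics('final-log-beta.dat', 'vocab.dat') is assumed to return one list per topic of (weight, word) pairs, already sorted from most to least probable; a sketch of that assumed structure, with made-up values:

# topics[k] is the word list for topic k, e.g.
# topics[0] = [(0.031, 'economy'), (0.027, 'market'), (0.019, 'growth')]
# so topics[topic][i][1] is the i-th word and topics[topic][i][0] its weight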
Example #3
def format_and_draw(infile):
	#Read in input
	#infile = open(file_name, 'r')
	infile.seek(0)

	str_in = format(infile.read())

	BlackList = open("BlackList", 'r')
	list_words = BlackList.read().split(" ")
	
	#Processes the input
	words, sums = sum([x for x in str_in.split() if (x not in list_words)])

	#Actually makes the wordcloud itself
	wordcloud.make_wordcloud(words = words, counts = sums, fname ="out.png",width=1600, height=800)

	#Automatically changes your desktop background. Spontaneously stopped working, set file manually as background instead.
	#call("gsettings set org.gnome.desktop.background picture-uri file://./out.png".split())

	pass
Example #4
def display_wordcloud(words, counts):
    with tempfile.NamedTemporaryFile(suffix='.png') as tmp:
        temp_filename = tmp.name
        counts = wordcloud.make_wordcloud(words, counts, temp_filename, font_path='/Library/Fonts/Georgia.ttf',width=800, height=800, ranks_only=False)
        image_file = cbook.get_sample_data(temp_filename)
        image = plt.imread(image_file)

        fig, ax = plt.subplots()
        im = ax.imshow(image)
        plt.axis('off')
        plt.show()
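display_wordcloud relies on imports the snippet does not show; assuming the standard module names, it needs something like:

import tempfile
import wordcloud
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook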
Example #5
    def process(self, document):
        fdist = filter_stopwords(document['freqdist'], document['language'])
        words = numpy.array([t[0] for t in fdist])
        counts = numpy.array([t[1] for t in fdist])
        wordcloud_img = make_wordcloud(words, counts)
        fd = StringIO()
        wordcloud_img.save(fd, format="PNG")
        fd.seek(0)
        result = {'wordcloud': base64.b64encode(fd.read())}
        fd.close()

        return result
Example #6
    def process(self, document):
        fdist = filter_stopwords(document['freqdist'], document['language'])
        words = numpy.array([t[0] for t in fdist])
        counts = numpy.array([t[1] for t in fdist])
        wordcloud_img = make_wordcloud(words, counts)
        fd = StringIO()
        wordcloud_img.save(fd, format="PNG")
        fd.seek(0)
        result = {'wordcloud': base64.b64encode(fd.read())}
        fd.close()

        return result
Example #7
def make_clouds(auth_vec_fname, crawl_fname, out_dir="img", max_words=200):
    arch = np.load(auth_vec_fname)
    labels = arch["author_labels"]
    vectorizer = arch["vectorizer"][()]
    vectors = arch["vectors"][()]
    
    # get author's ascii name (as used for IDI photo)
    tree = etree.parse(crawl_fname)
    authors = [unicode(e) 
               for e in tree.xpath("//name/text()")]
    ascii_name= [splitext(basename(e))[0]
                 for e in tree.xpath("//img/text()")]
    toascii = dict(zip(authors, ascii_name))
    
    vocab = np.array(vectorizer.get_feature_names())

    
    for author, vec in zip(labels, vectors):
        counts = vec.toarray().ravel()  
        
        if counts.sum() > max_words:
            mask_inds = counts.argsort()[-max_words:]
            counts = counts[mask_inds]
            words = vocab[mask_inds]
        else:
            log.error(u"vector for author {} has not enough words ({})".format(
                author, int(counts.sum())))
            continue
        
        out_fname = join(out_dir, toascii[author] + ".png")
        log.debug(u"writing " + out_fname)
        
        make_wordcloud(words, counts, out_fname,
        #font_path="/usr/local/texlive/2012/texmf-dist/fonts/truetype/public/droid/DroidSansMono.ttf",
        font_path="/usr/local/texlive/2012/texmf-dist/fonts/truetype/public/opensans/OpenSans-Regular.ttf",
        width=1000, height=600, show_img=False)
Example #8
def make_wordcloud_rawtext(text, savename, width=400, height=200):
    import os
    import sys
    from sklearn.feature_extraction.text import CountVectorizer

    sources = [savename]

    cv = CountVectorizer(min_df=1, charset_error="ignore",
                         stop_words="english", max_features=200)
    counts = cv.fit_transform([text]).toarray().ravel()
    words = np.array(cv.get_feature_names())
    # throw away some words, normalize
    words = words[counts > 1]
    counts = counts[counts > 1]
    output_filename = (os.path.splitext(os.path.basename(sources[0]))[0]
                       + ".bmp")
    counts = wordcloud.make_wordcloud(words, counts, output_filename, width=width, height=height)
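A hedged usage sketch for make_wordcloud_rawtext: the function only uses savename to derive the output file's stem, so the call below (sample text and filename are made up) would write my_article.bmp to the current directory, assuming numpy is imported as np and the wordcloud module is available at module level as the function expects.

sample_text = ("word clouds scale each word by how often it appears, "
               "so repeated words words words dominate the picture")
make_wordcloud_rawtext(sample_text, "my_article.txt", width=400, height=200)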
Example #9
def display_wordcloud(words, counts):
    with tempfile.NamedTemporaryFile(suffix='.png') as tmp:
        temp_filename = tmp.name
        counts = wordcloud.make_wordcloud(
            words,
            counts,
            temp_filename,
            font_path='/Library/Fonts/Georgia.ttf',
            width=800,
            height=800,
            ranks_only=False)
        image_file = cbook.get_sample_data(temp_filename)
        image = plt.imread(image_file)

        fig, ax = plt.subplots()
        im = ax.imshow(image)
        plt.axis('off')
        plt.show()
Example #10
def generate_image(profile, field):
    fonts_to_use = [
        "JosefinSansStd-Light.ttf", "Neucha.ttf", "Molengo-Regular.ttf",
        "ReenieBeanie.ttf", "Lobster.ttf"
    ]
    assert len(fonts_to_use) == len(profile.field_dicts)
    fonts_dir = "/home/roman/Dropbox/django_practice/mysite/mysite/tag_cloud/fonts"
    # field_to_fonts = {}
    # for key, font in zip(profile.field_dicts.keys(), fonts_to_use):
    #     field_to_fonts[key] = font
    random_int = np.random.randint(len(fonts_to_use))
    # font_path = os.path.join(fonts_dir, field_to_fonts[field])
    font_path = os.path.join(fonts_dir, fonts_to_use[random_int])
    max_len = max(profile.words[field].apply(lambda x: len(x)))
    try:
        img = wordcloud.make_wordcloud(
            np.array(profile.words[field], dtype="S%d" % max_len),
            np.array(profile.scores[field]),
            font_path=font_path,
            width=640,
            height=480,
        )
    except Exception, e:
        print e
Example #11
def generate_image(profile, field):
    fonts_to_use = ["JosefinSansStd-Light.ttf", 
                    "Neucha.ttf", 
                    "Molengo-Regular.ttf", 
                    "ReenieBeanie.ttf", 
                    "Lobster.ttf"]
    assert len(fonts_to_use) == len(profile.field_dicts)
    fonts_dir = "/home/roman/Dropbox/django_practice/mysite/mysite/tag_cloud/fonts"
    # field_to_fonts = {}
    # for key, font in zip(profile.field_dicts.keys(), fonts_to_use):
    #     field_to_fonts[key] = font
    random_int = np.random.randint(len(fonts_to_use))
    # font_path = os.path.join(fonts_dir, field_to_fonts[field])
    font_path = os.path.join(fonts_dir, fonts_to_use[random_int])
    max_len = max(profile.words[field].apply(lambda x: len(x)))
    try: img = wordcloud.make_wordcloud(
                np.array(profile.words[field], dtype="S%d" % max_len), 
                np.array(profile.scores[field]), 
                font_path=font_path,
                width = 640,
                height = 480,
                )
    except Exception, e:
        print e
Example #12
    def commRanking(self,numTopComms, prevTimeslots,xLablNum):
        import itertools, tfidf 
        # from pymongo import MongoClient
        from pytagcloud.lang.stopwords import StopWords
        # from nltk.corpus import stopwords
        from wordcloud import  make_wordcloud
        from PIL import Image

        '''Detect the evolving communities'''
        uniCommIdsEvol=self.uniCommIdsEvol
        timeslots=self.timeslots

        tempcommRanking = {}
        #structure: tempcommRanking={Id:[persistence,stability,commCentrality,degreeness]}
        commRanking,fluctuation,lifetime = {},{},0
        for Id in self.uniCommIds:
            uniqueTimeSlLen = len(set(uniCommIdsEvol[Id][0]))
            timeSlLen=len(uniCommIdsEvol[Id][0])
            tempcommRanking[Id] = []
            tempcommRanking[Id].append(uniqueTimeSlLen / timeslots)#persistence
            tempcommRanking[Id].append((sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1) + 1) / (timeslots + 1))#stability
            tempcommRanking[Id].append(product([x+1 for x in uniCommIdsEvol[Id][1]]) / uniqueTimeSlLen)#commCentrality
            # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][4]) / timeslots)#Degreeness
            # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][5])/timeSlLen)#degree centrality
            # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][6])/timeSlLen)#betweenness centrality
            # '''Checking Theseus Ship'''
            # theseus=1+len(list(set(uniCommIdsEvol[Id][3][0]) & set(uniCommIdsEvol[Id][3][-1]))) / len(set(np.append(uniCommIdsEvol[Id][3][0], uniCommIdsEvol[Id][3][-1])))
            # tempcommRanking[Id].append(theseus)
            commRanking[Id] = np.prod(tempcommRanking[Id])

            #Construct average jaccardian between timeslots for each dyn comm
            if timeSlLen not in fluctuation:
                fluctuation[timeSlLen]=[(sum(uniCommIdsEvol[Id][7])/(timeSlLen-1))] #[1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)]
            else:
                fluctuation[timeSlLen].append((sum(uniCommIdsEvol[Id][7])/(timeSlLen-1)))#1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1))
            lifetime=max(lifetime,timeSlLen)

        '''All the communities ranked in order of importance'''
        rankedCommunities = sorted(commRanking, key=commRanking.get, reverse=True)
        if numTopComms>len(rankedCommunities):
            numTopComms=len(rankedCommunities)

        '''Jaccardian for lifespans which appear only once are discarded (outliers)'''
        flux=[]
        for lifeT in range(lifetime+1):
            if lifeT in fluctuation and len(fluctuation[lifeT])>1:
                flux.append(sum(fluctuation[lifeT])/len(fluctuation[lifeT]))
            else:
                flux.append(0)

        '''Constructing community size heatmap data'''
        commSizeHeatData = np.zeros([numTopComms, timeslots])
        for rCIdx, comms in enumerate(rankedCommunities[0:numTopComms]):
            for sizeIdx, timesteps in enumerate(uniCommIdsEvol[comms][0]):
                if commSizeHeatData[rCIdx, timesteps] != 0:
                    commSizeHeatData[rCIdx, timesteps] = max(np.log(uniCommIdsEvol[comms][2][sizeIdx]),commSizeHeatData[rCIdx, timesteps])
                else:
                    commSizeHeatData[rCIdx, timesteps] = np.log(uniCommIdsEvol[comms][2][sizeIdx])
        normedHeatdata = commSizeHeatData/commSizeHeatData.max()

        '''Writing ranked communities to json files + MongoDB'''
        dataset_name=self.dataset_path.split('/')
        dataset_name=dataset_name[-1]
        #Mongo--------------------
        # client = MongoClient()
        # db = client[dataset_name]
        # dyccos=db.dyccos
        #-------------------------
        rankedCommunitiesFinal = {}
        twitterDataFile = open(self.dataset_path + '/data/nonadaptive/results/rankedCommunities.json', "w")#, encoding="utf-8-sig")
        jsondata = dict()
        jsondata["ranked_communities"] = []

        '''Create corpus and stopwords'''
        # stop = stopwords.words('english')
        stop = []
        # grstopwords=pickle.load(open("./greek_stopwords.pck", 'rb'))
        # stop.extend(grstopwords)
        definiteStop = ['gt','amp','rt','via']
        stop.extend(definiteStop)
        if not os.path.exists(self.dataset_path + "/data/nonadaptive/tmp/datasetCorpus.pck"):
            idf = self.corpusExtraction(rankedCommunities[:numTopComms])
        else:
            idf = pickle.load(open(self.dataset_path + "/data/nonadaptive/tmp/datasetCorpus.pck", 'rb'))
            print('loaded corpus from file')
        #-------------------------
        regex1 = re.compile("(?:\@|#|https?\://)\S+",re.UNICODE)
        regex2 = re.compile("\w+'?\w",re.UNICODE)

        width,height = 400,200
        blank_image = Image.new("RGB", (timeslots*width, (numTopComms*2+2)*height),(255,255,255)) #make blank for collage
        for tmptime in range(timeslots):
            timeimage = make_wordcloud([self.timeLimit[tmptime],'the date'],[10,2], width=width, height=height)
            blank_image.paste(timeimage, (tmptime*width,height))

        for rank, rcomms in enumerate(rankedCommunities[:numTopComms]):
            tmslUsrs, tmpTags, tmptweetids, commTwText, tmpUrls, topic, tmpkeywrds = [], [], [], [], [], [], []
            strRank = '{0}'.format(str(rank).zfill(2))
            rankedCommunitiesFinal[strRank] = [rcomms]
            rankedCommunitiesFinal[strRank].append(commRanking[rcomms])
            rankedCommunitiesFinal[strRank].append(uniCommIdsEvol[rcomms][3])
            timeSlotApp = [self.timeLimit[x] for x in uniCommIdsEvol[rcomms][0]]

            '''make and save wordclouds'''
            if not os.path.exists(self.dataset_path + "/data/nonadaptive/results/wordclouds/"+self.fileTitle+'/'+str(rank)):
                os.makedirs(self.dataset_path + "/data/nonadaptive/results/wordclouds/"+self.fileTitle+'/'+str(rank))

            for tmsl, users in enumerate(uniCommIdsEvol[rcomms][3]):
                uscentr, tmptweetText = [], []
                for us in users:
                    uscentr.append([us, self.userPgRnkBag[uniCommIdsEvol[rcomms][0][tmsl]][us]])
                    # uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                    if us in self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                        tmpTags.extend(self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                    if us in self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                        tmpUrls.append(self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                    if us in self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                        tmptweetids.extend(self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                    if us in self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                        tmptweetText.extend(self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                tmslUsrs.append({str(uniCommIdsEvol[rcomms][0][tmsl]): uscentr})
                tmptweetText = [i.replace("\n", "").replace('\t',' ') for i in tmptweetText]
                seen = set()
                seen_add = seen.add
                tmptweetText2 = [x for x in tmptweetText if x not in seen and not seen_add(x)]
                commTwText.append({timeSlotApp[tmsl]: tmptweetText2})
                #topic extraction
                topicList = " ".join(tmptweetText2)
                topicList = topicList.lower()
                topicList = regex1.sub('', topicList)
                topicList = regex2.findall(topicList)
                s = StopWords()
                s.load_language(s.guess(topicList))
                topicList = collections.Counter(topicList)
                tmpkeys = topicList.keys()
                if len(topicList)>5:
                    for i in list(tmpkeys):
                            if not i or i in stop or i.startswith(('htt','(@','t.co')) or len(i)<=2 or s.is_stop_word(i):
                                del topicList[i]
                else:
                    for i in list(tmpkeys):
                        if i in definiteStop or not i:
                            del topicList[i]

                timeSlLen=len(uniCommIdsEvol[Id][0])
                tmpTopic=tfidf.comm_tfidf(topicList,idf,10)
                topic.append({timeSlotApp[tmsl]: tmpTopic})
                # tmpTopic = [x[0] for x in tmpTopic]
                '''wordcloud image'''
                popkeys = [x[0] for x in tmpTopic]
                popvals = [x[1] for x in tmpTopic]
                if len(popvals)<2:
                    try:
                        if popvals[0]<1:
                            popvals[0]=1
                    except:
                        pass
                '''Create intermediate image'''
                position = (rank+1)*2
                backgroundcolor = int((1-(normedHeatdata[rank,uniCommIdsEvol[rcomms][0][tmsl]]))*255)
                locimage = make_wordcloud(popkeys,popvals, width=width, height=height,backgroundweight=backgroundcolor)#, fname=self.dataset_path + '/data/nonadaptive/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'/'+timeSlotApp[tmsl]+'.pdf'
                blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl]*width,position*height))
                popusers = [x[0] for x in uscentr[:10]]
                popcentr = [x[1]*100 for x in uscentr[:10]]
                locimage = make_wordcloud(popusers,popcentr, width=width, height=height,backgroundweight=backgroundcolor)#, fname=self.dataset_path + '/data/nonadaptive/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'/'+timeSlotApp[tmsl]+'usrs.pdf'
                blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl]*width,(position+1)*height))
                # tmpkeywrds.extend(tmpTopic)

            if tmpTags:
                popTags = [x.lower() for x in list(itertools.chain.from_iterable(tmpTags))]
                popTags = collections.Counter(popTags)
                popTags = popTags.most_common(10)
            else:
                popTags=[]
            if tmpUrls:
                if tmpUrls[0]:
                    tmpUrls=[x.lower() for x in list(itertools.chain.from_iterable(tmpUrls)) if x]
                    popUrls = collections.Counter(tmpUrls)
                    popUrls = popUrls.most_common(10)
                else:
                    popUrls=[]
            else:
                    popUrls=[]
            commTweetIds = list(set(tmptweetids))
            # popKeywords = collections.Counter(tmpkeywrds)
            # popKeywords = popKeywords.most_common(10)
            # popkeys = [x[0] for x in popKeywords]
            # popvals = [x[1] for x in popKeywords]
            # make_wordcloud(popkeys,popvals,self.dataset_path + '/data/nonadaptive/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'.pdf')
            dycco={'community label': rcomms, 'rank': rank, 'timeslot appearance': timeSlotApp,# 'text': commTwText,
                 'persistence:': tempcommRanking[rcomms][0],'total score':commRanking[rcomms],'topic': topic,
                 'stability': tempcommRanking[rcomms][1],'community centrality': tempcommRanking[rcomms][2],
                 'community size per slot': uniCommIdsEvol[rcomms][2], 'users:centrality per timeslot': tmslUsrs,
                 'popTags': popTags, 'popUrls': popUrls}
            jsondycco=dycco.copy()
            # dyccos.insert(dycco)
            jsondata["ranked_communities"].append(jsondycco)
        twitterDataFile.write(json.dumps(jsondata, sort_keys=True))#,ensure_ascii=False).replace("\u200f",""))
        twitterDataFile.close()

        for tmptime in range(timeslots):
            timeimage = make_wordcloud([self.timeLimit[tmptime],'the date'],[10,2])
            blank_image.paste(timeimage, (tmptime*width,(position+2)*height))
        imsize=blank_image.size
        blank_image = blank_image.resize((round(imsize[0]/2),round(imsize[1]/2)),Image.ANTIALIAS)
        blank_image.save(self.dataset_path + "/data/results/wordclouds/"+self.fileTitle+'_collage.pdf', quality=50)

        makefigures(commSizeHeatData,flux,self.fileTitle,self.day_month,commRanking,numTopComms,timeslots,uniCommIdsEvol,rankedCommunities,self.commPerTmslt,self.uniCommIds,prevTimeslots,self.dataset_path,self.xLablNum)
        return rankedCommunitiesFinal
Example #13
    for k in mycounts.iterkeys():
        words.append(k)
        counts.append(mycounts[k])
    words = np.array(words)
    counts = np.array(counts)
    # throw away some words, normalize
    words = words[counts > 1]
    counts = counts[counts > 1]
    subjects = {}
    for w in words:
        subjects[w] = mysubjectind[most_common(subs[w])]

    output_filename = "%s/cloud.png" % datetime.date.today()
    print output_filename
    os.system("mkdir %s" % datetime.date.today())
    counts = make_wordcloud(words, counts, output_filename)

    os.system("rm  thisweekarxiv.png ")
    os.system("ln -s %s thisweekarxiv.png " % output_filename)

    _t = time.time()
    wordlists = {}
    i = 0
    for row in content:
        i += 1
        user = row[0]
        if (i % 100 == 0):
            sys.stdout.write('\r%s' % user)
            sys.stdout.flush()
        wordlists[user] = {}
        wordlists[user]['words'] = set([s for s in row[2:] if s != ''])
Example #14
import sys
import numpy as np
sys.path.insert(0, './word_cloud-master/')
import wordcloud

if len(sys.argv)<3:
    sys.exit("Usage: python get_wordcloud.py wordlist.txt number")

wordlist_name = sys.argv[1]
cluster_number = sys.argv[2]
wordlist_file = open(wordlist_name,'r')

words = []
dists = []
# the distances are to the centroid
for line in wordlist_file:
    word = line.split()[0]
    cluster = line.split()[1]
    dist = float(line.split()[2])
    if cluster==cluster_number:
        words.append(word)
        dists.append(dist)

max_dist = max(dists)*1.05
weights = [max_dist - dist for dist in dists]

words_array = np.array(words)
weights_array = np.array(weights)

wordcloud.make_wordcloud(words_array,weights_array,wordlist_name+'.'+cluster_number+'.png')
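Example #14 expects each line of the wordlist file to carry a word, its cluster id, and its distance to the centroid, separated by whitespace; an illustrative input and invocation (all values made up) could look like:

# wordlist.txt
#   apple    3  0.12
#   banana   3  0.34
#   carrot   1  0.05
#
# render the cloud for cluster 3 (writes wordlist.txt.3.png):
#   python get_wordcloud.py wordlist.txt 3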
Example #15
    for k in mycounts.iterkeys():
        words.append(k)
        counts.append(mycounts[k])
    words=np.array(words)
    counts=np.array(counts)
    # throw away some words, normalize
    words = words[counts > 1]
    counts = counts[counts > 1]
    subjects={}
    for w in words:
        subjects[w]=mysubjectind[most_common(subs[w])]

    output_filename = "%s/cloud.png"%datetime.date.today()
    print output_filename
    os.system("mkdir %s"%datetime.date.today())
    counts = make_wordcloud(words, counts, output_filename)

    os.system("rm  thisweekarxiv.png ")
    os.system("ln -s %s thisweekarxiv.png "%output_filename)
    

    _t = time.time()
    wordlists={}
    i=0
    for row in content:
        i+=1
        user = row[0]
        if (i%100==0):
            sys.stdout.write('\r%s' % user)
            sys.stdout.flush()
        wordlists[user] = {}
Example #16
def batch_vs_online(articleName):
    # load the dictionary and the corpus
    diction = gensim.corpora.Dictionary.load('people.dict')
    mm = gensim.corpora.MmCorpus('people_tfidf.mm')
    # read in the new article
    with open(articleName,'r') as fp:
        content=fp.read()
        list1=fenci(content) # stopwords have already been removed
    
    doc_bow=diction.doc2bow(list1) # term-frequency vector, i.e. the bag-of-words representation of the new article

    # build batch_lda and online_lda
    # alpha and eta use the default parameters
    batch_lda=gensim.models.ldamodel.LdaModel(corpus=mm, id2word=diction, \
    num_topics=TOPICS, update_every=0, passes=20)
    
    """
    batch_lda=gensim.models.ldamodel.LdaModel(corpus=mm, id2word=diction, \
    num_topics=TOPICS, update_every=0, passes=80,alpha=50.0/TOPICS)
    """
   
    
    # with more topics, use a larger chunksize; alpha has to be set manually
    online_lda=gensim.models.ldamodel.LdaModel(corpus=mm, id2word=diction, \
    num_topics=TOPICS, update_every=1, chunksize=5, passes=1)
    
    """
    online_lda=gensim.models.ldamodel.LdaModel(corpus=mm, id2word=diction, \
    num_topics=TOPICS, update_every=1, chunksize=40, passes=80,alpha=50.0/TOPICS)
    """
    
    # batchTopics=[[],[],[]]
    # within each topic, the words are already sorted by probability
    # list1=lda.show_topic(0)
    # list1=[(0.0017132658909227052, '\xb7\xa8\xd4\xba'), (0.0016304890553909524, '\xd4\xbd\xc0\xb4\xd4\xbd')]
    for k in range(TOPICS):
        
        batchTopics.append(batch_lda.show_topic(k))
        onlineTopics.append(online_lda.show_topic(k))
    
    
    # first draw the word cloud for batch_lda
    # topic proportions within one document, returned as a list
    
    doc_batch_lda = batch_lda[doc_bow]
    
    print doc_batch_lda
    
    
    
    tP_batch=[]
    for yuanzu in doc_batch_lda:
        tP_batch.append(list(yuanzu))
    
    for i in range(len(tP_batch)):
        tmp=tP_batch[i][0]
        tP_batch[i][0]=tP_batch[i][1]
        tP_batch[i][1]=tmp
    
    # sort in ascending order
    tP_batch.sort()
    # take 3 topics   [topic 'weight', topic index]
    tP_batch_new=tP_batch[-3:]
    #print tP_batch
    # normalize so the weights sum to 1
    sum0=0
    for i in range(len(tP_batch_new)):
        sum0+=tP_batch_new[i][0]
    
    for i in range(len(tP_batch_new)):
        tP_batch_new[i][0]=tP_batch_new[i][0]/sum0
    # now, tP_new=[[0.1,8],[0.3,2],[0.6,1]]
    
    
    
    # lists used to draw the word cloud
    batchWordsList=[]
    batchWordsCount=[]
    
    for (rate,topic) in tP_batch_new:
        if rate>=0.1:
            # round to the nearest integer
            wordsNum=int(round(WORDS*rate))
            # take wordsNum words from this topic and append them to words and counts
            for i in range(wordsNum):
                batchWordsList.append(batchTopics[topic][i][1])
                batchWordsCount.append(batchTopics[topic][i][0])

    words=np.array(batchWordsList)
    counts=np.array(batchWordsCount)

    font_path=r'C:\Windows\Fonts\simsun.ttc' # SimSun font
    imageName='batch_lda_'+articleName
    wordcloud.make_wordcloud(words,counts,font_path,imageName)
    
    
    # word cloud for online_lda
    doc_online_lda = online_lda[doc_bow]
    print doc_online_lda
    
    
    tP_online=[]
    for yuanzu in doc_online_lda:
        tP_online.append(list(yuanzu))
    
    for i in range(len(tP_online)):
        tmp=tP_online[i][0]
        tP_online[i][0]=tP_online[i][1]
        tP_online[i][1]=tmp
    
    # sort in ascending order
    tP_online.sort()
    # take 3 topics   [topic 'weight', topic index]
    tP_online_new=tP_online[-3:]
    
    # normalize so the weights sum to 1
    sum0=0
    for i in range(len(tP_online_new)):
        sum0+=tP_online_new[i][0]
    
    for i in range(len(tP_online_new)):
        tP_online_new[i][0]=tP_online_new[i][0]/sum0
    # now, tP_new=[[0.1,8],[0.3,2],[0.6,1]]
                                
            
    onlineWordsList=[]
    onlineWordsCount=[]
    
    for (rate,topic) in tP_online_new:
        if rate>=0.1:
            # round to the nearest integer
            wordsNum=int(round(WORDS*rate))
            # take wordsNum words from this topic and append them to words and counts
            for i in range(wordsNum):
                onlineWordsList.append(onlineTopics[topic][i][1])
                onlineWordsCount.append(onlineTopics[topic][i][0])

    words=np.array(onlineWordsList)
    counts=np.array(onlineWordsCount)
    font_path=r'C:\Windows\Fonts\simsun.ttc' # SimSun font
    imageName='online_lda_'+articleName
    wordcloud.make_wordcloud(words,counts,font_path,imageName)
Example #17
            table.append(r)
    line_count += 1
ifile.close()
print "Matrix shape: %d rows, %d columns" % (r_count, num_columns)
print "Header: " + str(header)

# CHECK THAT ALL ROWS HAVE SAME NUMBER OF COLUMNS
if not all(map(lambda x: x == num_columns,map(len,table))):
    sys.exit("ERROR: Not all rows have the same number of columns. Aborting.")

# TRANSPOSE
table_t = map(list,zip(*table))

# GENERATE WORDCLOUDS
# columns
words = np.asarray(table_t[0])
for j in xrange(1,num_columns): 
    print "Processing column cloud #%d (%s)..." % (j,header[j])
    counts = np.asarray(table_t[j])
    fname = output_dir_cols + header[j] + "_wordcloud" + ext
    wordcloud.make_wordcloud(words, counts, fname, font_path, width, height)
# rows
words = np.asarray(header[1:])
for i in xrange(1,r_count): 
    row_label = table[i][0]
    row_data = table[i][1:]
    print "Processing row cloud #%d (%s)..." % (i,row_label)
    counts = np.asarray(row_data)
    fname = output_dir_rows + row_label + "_wordcloud" + ext
    wordcloud.make_wordcloud(words, counts, fname, font_path, width, height)
Example #18
    def commRanking(self,numTopComms, prevTimeslots,xLablNum):
        import itertools, tfidf 
        # from pymongo import MongoClient
        from pytagcloud.lang.stopwords import StopWords
        # from nltk.corpus import stopwords
        from wordcloud import  make_wordcloud
        from PIL import Image

        '''Detect the evolving communities'''
        uniCommIdsEvol=self.uniCommIdsEvol
        timeslots=self.timeslots

        tempcommRanking = {}
        #structure: tempcommRanking={Id:[persistence,stability,commCentrality,degreeness]}
        commRanking,fluctuation,lifetime = {},{},0
        for Id in self.uniCommIds:
            uniqueTimeSlLen = len(set(uniCommIdsEvol[Id][0]))
            timeSlLen=len(uniCommIdsEvol[Id][0])
            tempcommRanking[Id] = []
            tempcommRanking[Id].append(uniqueTimeSlLen / timeslots)#persistence
            tempcommRanking[Id].append((sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1) + 1) / (timeslots + 1))#stability
            tempcommRanking[Id].append(product([x+1 for x in uniCommIdsEvol[Id][1]]) / uniqueTimeSlLen)#commCentrality
            # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][4]) / timeslots)#Degreeness
            # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][5])/timeSlLen)#degree centrality
            # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][6])/timeSlLen)#betweenness centrality
            # '''Checking Theseus Ship'''
            # theseus=1+len(list(set(uniCommIdsEvol[Id][3][0]) & set(uniCommIdsEvol[Id][3][-1]))) / len(set(np.append(uniCommIdsEvol[Id][3][0], uniCommIdsEvol[Id][3][-1])))
            # tempcommRanking[Id].append(theseus)
            commRanking[Id] = np.prod(tempcommRanking[Id])

            #Construct average jaccardian between timeslots for each dyn comm
            if timeSlLen not in fluctuation:
                fluctuation[timeSlLen]=[(sum(uniCommIdsEvol[Id][7])/(timeSlLen-1))] #[1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)]
            else:
                fluctuation[timeSlLen].append((sum(uniCommIdsEvol[Id][7])/(timeSlLen-1)))#1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1))
            lifetime=max(lifetime,timeSlLen)

        '''All the communities ranked in order of importance'''
        rankedCommunities = sorted(commRanking, key=commRanking.get, reverse=True)
        if numTopComms>len(rankedCommunities):
            numTopComms=len(rankedCommunities)

        '''Jaccardian for lifespans which appear only once are discarded (outliers)'''
        flux=[]
        for lifeT in range(lifetime+1):
            if lifeT in fluctuation and len(fluctuation[lifeT])>1:
                flux.append(sum(fluctuation[lifeT])/len(fluctuation[lifeT]))
            else:
                flux.append(0)

        '''Constructing community size heatmap data'''
        commSizeHeatData = np.zeros([numTopComms, timeslots])
        for rCIdx, comms in enumerate(rankedCommunities[0:numTopComms]):
            for sizeIdx, timesteps in enumerate(uniCommIdsEvol[comms][0]):
                if commSizeHeatData[rCIdx, timesteps] != 0:
                    commSizeHeatData[rCIdx, timesteps] = max(np.log(uniCommIdsEvol[comms][2][sizeIdx]),commSizeHeatData[rCIdx, timesteps])
                else:
                    commSizeHeatData[rCIdx, timesteps] = np.log(uniCommIdsEvol[comms][2][sizeIdx])
        normedHeatdata = commSizeHeatData/commSizeHeatData.max()

        '''Writing ranked communities to json files + MongoDB'''
        dataset_name=self.dataset_path.split('/')
        dataset_name=dataset_name[-1]
        #Mongo--------------------
        # client = MongoClient()
        # db = client[dataset_name]
        # dyccos=db.dyccos
        #-------------------------
        rankedCommunitiesFinal = {}
        twitterDataFile = open(self.dataset_path + '/data/results/rankedCommunities.json', "w")#, encoding="utf-8-sig")
        jsondata = dict()
        jsondata["ranked_communities"] = []

        '''Create corpus and stopwords'''
        # stop = stopwords.words('english')
        stop = []
        # grstopwords=pickle.load(open("./greek_stopwords.pck", 'rb'))
        # stop.extend(grstopwords)
        definiteStop = ['gt','amp','rt','via']
        stop.extend(definiteStop)
        if not os.path.exists(self.dataset_path + "/data/tmp/datasetCorpus.pck"):
            idf = self.corpusExtraction(rankedCommunities[:numTopComms])
        else:
            idf = pickle.load(open(self.dataset_path + "/data/tmp/datasetCorpus.pck", 'rb'))
            print('loaded corpus from file')
        #-------------------------
        regex1 = re.compile("(?:\@|#|https?\://)\S+",re.UNICODE)
        regex2 = re.compile("\w+'?\w",re.UNICODE)

        width,height = 400,200
        blank_image = Image.new("RGB", (timeslots*width, (numTopComms*2+2)*height),(255,255,255)) #make blank for collage
        for tmptime in range(timeslots):
            timeimage = make_wordcloud([self.timeLimit[tmptime],'the date'],[10,2], width=width, height=height)
            blank_image.paste(timeimage, (tmptime*width,height))

        for rank, rcomms in enumerate(rankedCommunities[:numTopComms]):
            tmslUsrs, tmpTags, tmptweetids, commTwText, tmpUrls, topic, tmpkeywrds = [], [], [], [], [], [], []
            strRank = '{0}'.format(str(rank).zfill(2))
            rankedCommunitiesFinal[strRank] = [rcomms]
            rankedCommunitiesFinal[strRank].append(commRanking[rcomms])
            rankedCommunitiesFinal[strRank].append(uniCommIdsEvol[rcomms][3])
            timeSlotApp = [self.timeLimit[x] for x in uniCommIdsEvol[rcomms][0]]

            '''make and save wordclouds'''
            if not os.path.exists(self.dataset_path + "/data/results/wordclouds/"+self.fileTitle+'/'+str(rank)):
                os.makedirs(self.dataset_path + "/data/results/wordclouds/"+self.fileTitle+'/'+str(rank))

            for tmsl, users in enumerate(uniCommIdsEvol[rcomms][3]):
                uscentr, tmptweetText = [], []
                for us in users:
                    uscentr.append([us, self.userPgRnkBag[uniCommIdsEvol[rcomms][0][tmsl]][us]])
                    # uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                    if us in self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                        tmpTags.extend(self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                    if us in self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                        tmpUrls.append(self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                    if us in self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                        tmptweetids.extend(self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                    if us in self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                        tmptweetText.extend(self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                tmslUsrs.append({str(uniCommIdsEvol[rcomms][0][tmsl]): uscentr})
                tmptweetText = [i.replace("\n", "").replace('\t',' ') for i in tmptweetText]
                seen = set()
                seen_add = seen.add
                tmptweetText2 = [x for x in tmptweetText if x not in seen and not seen_add(x)]
                commTwText.append({timeSlotApp[tmsl]: tmptweetText2})
                #topic extraction
                topicList = " ".join(tmptweetText2)
                topicList = topicList.lower()
                topicList = regex1.sub('', topicList)
                topicList = regex2.findall(topicList)
                s = StopWords()
                s.load_language(s.guess(topicList))
                topicList = collections.Counter(topicList)
                tmpkeys = topicList.keys()
                if len(topicList)>5:
                    for i in list(tmpkeys):
                            if not i or i in stop or i.startswith(('htt','(@','t.co')) or len(i)<=2 or s.is_stop_word(i):
                                del topicList[i]
                else:
                    for i in list(tmpkeys):
                        if i in definiteStop or not i:
                            del topicList[i]

                timeSlLen=len(uniCommIdsEvol[Id][0])
                tmpTopic=tfidf.comm_tfidf(topicList,idf,10)
                topic.append({timeSlotApp[tmsl]: tmpTopic})
                # tmpTopic = [x[0] for x in tmpTopic]
                '''wordcloud image'''
                popkeys = [x[0] for x in tmpTopic]
                popvals = [x[1] for x in tmpTopic]
                if len(popvals)<2:
                    try:
                        if popvals[0]<1:
                            popvals[0]=1
                    except:
                        pass
                '''Create intermediate image'''
                position = (rank+1)*2
                backgroundcolor = int((1-(normedHeatdata[rank,uniCommIdsEvol[rcomms][0][tmsl]]))*255)
                locimage = make_wordcloud(popkeys,popvals, width=width, height=height,backgroundweight=backgroundcolor)#, fname=self.dataset_path + '/data/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'/'+timeSlotApp[tmsl]+'.pdf'
                blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl]*width,position*height))
                popusers = [x[0] for x in uscentr[:10]]
                popcentr = [x[1]*100 for x in uscentr[:10]]
                locimage = make_wordcloud(popusers,popcentr, width=width, height=height,backgroundweight=backgroundcolor)#, fname=self.dataset_path + '/data/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'/'+timeSlotApp[tmsl]+'usrs.pdf'
                blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl]*width,(position+1)*height))
                # tmpkeywrds.extend(tmpTopic)

            if tmpTags:
                popTags = [x.lower() for x in list(itertools.chain.from_iterable(tmpTags))]
                popTags = collections.Counter(popTags)
                popTags = popTags.most_common(10)
            else:
                popTags=[]
            if tmpUrls:
                if tmpUrls[0]:
                    tmpUrls=[x.lower() for x in list(itertools.chain.from_iterable(tmpUrls)) if x]
                    popUrls = collections.Counter(tmpUrls)
                    popUrls = popUrls.most_common(10)
                else:
                    popUrls=[]
            else:
                    popUrls=[]
            commTweetIds = list(set(tmptweetids))
            # popKeywords = collections.Counter(tmpkeywrds)
            # popKeywords = popKeywords.most_common(10)
            # popkeys = [x[0] for x in popKeywords]
            # popvals = [x[1] for x in popKeywords]
            # make_wordcloud(popkeys,popvals,self.dataset_path + '/data/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'.pdf')
            dycco={'community label': rcomms, 'rank': rank, 'timeslot appearance': timeSlotApp,# 'text': commTwText,
                 'persistence:': tempcommRanking[rcomms][0],'total score':commRanking[rcomms],'topic': topic,
                 'stability': tempcommRanking[rcomms][1],'community centrality': tempcommRanking[rcomms][2],
                 'community size per slot': uniCommIdsEvol[rcomms][2], 'users:centrality per timeslot': tmslUsrs,
                 'popTags': popTags, 'popUrls': popUrls}
            jsondycco=dycco.copy()
            # dyccos.insert(dycco)
            jsondata["ranked_communities"].append(jsondycco)
        twitterDataFile.write(json.dumps(jsondata, sort_keys=True))#,ensure_ascii=False).replace("\u200f",""))
        twitterDataFile.close()

        for tmptime in range(timeslots):
            timeimage = make_wordcloud([self.timeLimit[tmptime],'the date'],[10,2])
            blank_image.paste(timeimage, (tmptime*width,(position+2)*height))
        imsize=blank_image.size
        blank_image = blank_image.resize((round(imsize[0]/2),round(imsize[1]/2)),Image.ANTIALIAS)
        blank_image.save(self.dataset_path + "/data/results/wordclouds/"+self.fileTitle+'_collage.pdf', quality=50)

        makefigures(commSizeHeatData,flux,self.fileTitle,self.day_month,commRanking,numTopComms,timeslots,uniCommIdsEvol,rankedCommunities,self.commPerTmslt,self.uniCommIds,prevTimeslots,self.dataset_path,self.xLablNum)
        return rankedCommunitiesFinal
Example #19
        if y > 0:
            partial_integral += integral[x:, y - 1][:, np.newaxis]

        integral[x:, y:] = partial_integral

    # redraw in color
    img = Image.new("RGB", (width, height), (backgroundweight,backgroundweight,backgroundweight))
    draw = ImageDraw.Draw(img)
    everything = zip(words, font_sizes, positions, orientations)
    for word, font_size, position, orientation in everything:
        font = ImageFont.truetype(font_path, font_size)
        # transpose font optionally
        transposed_font = ImageFont.TransposedFont(font, orientation=orientation)
        draw.setfont(transposed_font)
        draw.text((position[1], position[0]), word, #fill = "red")
                   fill="hsl(%d" % random.randint(0, 50) + ", 80%, 50%)")
    #img.show()
    try:
        img.save(fname)
    except:
        pass
    return img


if __name__ == "__main__":

    x=['qqqqq','wwww','eeee','rrrr','ddddd','hhnhhhh']
    co=[1,2,3,4,5,6]
    from wordcloud import  make_wordcloud
    make_wordcloud(x,co,'wordy.jpg')