def PrintClust(self):
        """Draw the hierarchical clustering of ``data.txt`` as text on a canvas.

        Calls ``self.DataMatrix()``, creates a canvas with horizontal and
        vertical scrollbars on ``self.root``, then clusters the rows of
        ``data.txt`` with the distance function chosen through ``self.var``
        ("1" selects ``clusters.pearson``, "2" selects ``clusters.tanimoto``).
        The rendered tree is stored in ``self.printclust`` and drawn on
        ``self.maincanvas``. Any other selection shows an error dialog and
        calls ``self.DataMatrix()`` again.
        """
        self.DataMatrix()

        # Canvas that holds the textual cluster tree.
        self.maincanvas = Canvas(self.root, bg='#FFFFFF', width=100, height=320)
        self.maincanvas.grid(row=8, rowspan=8, column=1, columnspan=16,
                             pady=10, sticky="WE")

        self.xscrollbar = Scrollbar(self.root, orient=HORIZONTAL,
                                    command=self.maincanvas.xview)
        self.xscrollbar.grid(row=15, column=1, columnspan=16, sticky="WSE")

        self.yscrollbar = Scrollbar(self.root, orient=VERTICAL,
                                    command=self.maincanvas.yview)
        self.yscrollbar.grid(column=17, row=8, rowspan=8, sticky="WNS")

        # The canvas reports its view through the scrollbars' ``set`` method;
        # handing over the widget itself does not give Tk a usable callback.
        self.maincanvas.config(xscrollcommand=self.xscrollbar.set,
                               yscrollcommand=self.yscrollbar.set)

        courses, words, data = clusters.readfile("data.txt")

        # Map the selected radio value to its distance function.
        distance_by_choice = {"1": clusters.pearson, "2": clusters.tanimoto}
        distance = distance_by_choice.get(str(self.var.get()))
        if distance is None:
            tkMessageBox.showerror("ERROR", "Please select distance method.")
            self.DataMatrix()
            return

        clust = clusters.hcluster(data, distance=distance)
        self.printclust = clusters.clust2str(clust, labels=courses)
        self.maincanvas.create_text(350, 10, font="Helvetica 10",
                                    text=self.printclust, anchor="nw")
        # Without a scroll region the scrollbars have nothing to scroll over.
        self.maincanvas.config(scrollregion=self.maincanvas.bbox("all"))
 def clustering(self, canvas):  # Clusters District or Parties
     """Cluster ``data.txt`` and show the resulting dendrogram on ``canvas``.

     The dendrogram is written to ``cl.jpg`` and then placed on the canvas
     at (20, 20), anchored at its top-left corner.
     """
     labels, _rows, matrix = clusters.readfile('data.txt')
     tree = clusters.hcluster(matrix, distance=clusters.sim_distance)
     clusters.drawdendrogram(tree, labels, jpeg='cl.jpg')

     photo = ImageTk.PhotoImage(Image.open("cl.jpg"))
     canvas.create_image(20, 20, anchor=NW, image=photo)
     # Keep a reference on the canvas (presumably so the image is not
     # garbage-collected while it is displayed).
     canvas.image = photo
示例#3
0
def do_stemmed():
    """Run the clustering pipeline on the stemmed blog data.

    Generates the stemmed blog file, then writes an ASCII dendrogram, a
    JPEG dendrogram, a k-means report for k = 5, 10 and 20, and a 2-D
    scaled-down picture, all under ``datafiles/``.
    """
    generate_blogfile_stem()
    names, _terms, matrix = clusters.readfile('datafiles/blogtop500_stemmed.txt')

    # Hierarchical clustering, rendered as text and as an image.
    tree = clusters.hcluster(matrix)
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as ascii_out:
        clusters.printclust2file(tree, ascii_out, labels=names)
    clusters.drawdendrogram(tree, names, jpeg='datafiles/blogtop500stemmed_deno.jpg')

    # One shared report file for every k.
    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as report:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            report.write("K=%d\n" % k)
            report.write("Iterations\n")
            groups = clusters.kcluster_toFile(matrix, k=k, out=report)
            report.write("Centroid Values\n-------------------------\n")
            number = 0
            for members in groups:
                number += 1
                print("Centroid #%d" % number)
                report.write("Centroid #%d\n" % number)
                member_names = []
                # Each member is an index into the blog-name list.
                for member in (names[i] for i in members):
                    print(member)
                    member_names.append(member)
                report.write("%s\n" % ', '.join(member_names))
            report.write("=================================\n")
            print("-------")

    with open("datafiles/dimensionReductionStemmed.txt", "w+") as reduction_out:
        coords = clusters.scaledown_logiter(matrix, out=reduction_out)
    clusters.draw2d(coords, names, jpg='datafiles/blogtop500stemmed_clust2d.jpg')
def kmeans(x):
    """Return the k-means clusters of ``job_projects`` as lists of job names.

    ``x`` is the number of clusters. The result has ``x`` entries; entry
    ``i`` holds the job names of the rows assigned to cluster ``i``.
    """
    jobnames, _projects, data = clusters.readfile('job_projects')
    # kcluster is unpacked as a pair here; only its second item is used.
    _unused, matches = clusters.kcluster(data, k=x)
    return [[jobnames[row] for row in matches[cluster]]
            for cluster in range(x)]
示例#5
0
def main():
    """Run k-means on ``blogdata.txt`` for k = 5, 10 and 20.

    Announces each k on stdout before clustering. The clustering result
    itself is not used here.
    """
    blognames, words, data = clusters.readfile('blogdata.txt')
    for k in (5, 10, 20):
        # Single-argument print() behaves the same on Python 2 and 3.
        print("K value is %d" % k)
        clusters.kcluster(data, k=k)
示例#6
0
def main():
    """Run k-means on ``blogdata.txt`` for k = 5, 10 and 20.

    Announces each k on stdout before clustering. The clustering result
    itself is not used here.
    """
    blognames, words, data = clusters.readfile('blogdata.txt')
    for k in (5, 10, 20):
        # Single-argument print() behaves the same on Python 2 and 3.
        print("K value is %d" % k)
        clusters.kcluster(data, k=k)
def createDendrogram():
    """Cluster ``blogdata.txt`` and save the dendrogram as JPEG and as text.

    The image goes to ``Dendrogram.jpg``. The ASCII rendering is captured
    into ``ASCII.txt`` by temporarily pointing ``sys.stdout`` at that file
    while ``clusters.printclust`` runs.
    """
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    cluster = clusters.hcluster(data)
    clusters.drawdendrogram(cluster, blogs, jpeg='Dendrogram.jpg')

    saved_stdout = sys.stdout
    with open("ASCII.txt", 'w') as f:
        sys.stdout = f
        try:
            clusters.printclust(cluster, labels=blogs)
        finally:
            # Always restore stdout so later prints do not hit a closed file;
            # stderr is left alone so errors stay visible.
            sys.stdout = saved_stdout
 def cluster_parties(self):
     """Cluster the rows of ``matrix.txt`` and show the dendrogram.

     Switches ``self.state`` to "party", packs the analysis frame, clears
     the canvas, then draws the dendrogram to ``parties.jpg`` and inserts
     that image through ``self.insert_image``.
     """
     self.state = "party"
     self.analysis_frame.pack(side=TOP, fill=BOTH)
     self.canvas.delete("all")  # drop everything drawn so far

     matrix_contents = clusters.readfile("matrix.txt")
     self.party_list, self.district_list, self.data = matrix_contents

     tree = clusters.hcluster(self.data, distance=clusters.sim_distance)
     output_jpeg = 'parties.jpg'
     clusters.drawdendrogram(tree, self.party_list, jpeg=output_jpeg)
     self.insert_image(output_jpeg)
示例#9
0
def main():
    """Print an ASCII dendrogram of the blogs listed in ``blogdata.txt``."""
    # readfile yields the row labels, the column labels and the numeric rows.
    names, _terms, rows = clusters.readfile('blogdata.txt')
    # hcluster builds the tree; printclust walks it and prints it as text.
    clusters.printclust(clusters.hcluster(rows), labels=names)
示例#10
0
def createJPegDendogram():
	"""Draw the hierarchical clustering of the TF-IDF blog vectors as a JPEG.

	Reads ``blogVectorTFIDFVersion.txt``, clusters its rows and writes the
	dendrogram to ``blogclustTFIDFVersion.jpg``.
	"""
	blognames,words,data=clusters.readfile('blogVectorTFIDFVersion.txt')
	clust=clusters.hcluster(data)
	clusters.drawdendrogram(clust,blognames,jpeg='blogclustTFIDFVersion.jpg')
示例#11
0
def createKMeansClusters(kValue):
	"""Print the non-empty k-means clusters of ``blogVector.txt``.

	Does nothing unless ``kValue`` is greater than zero. Non-empty clusters
	are numbered consecutively from 0; empty clusters are skipped and do
	not consume a number.
	"""
	if not kValue > 0:
		return

	blognames, words, data = clusters.readfile('blogVector.txt')
	kclust = clusters.kcluster(data, k=kValue)

	count = 0
	for cluster in kclust:
		if len(cluster) > 0:
			# Single-argument print() gives the same text on Python 2 and 3.
			print('cluster %s' % count)
			for instance in cluster:
				print('... %s' % blognames[instance])
			count += 1
 def cluster_district(self):
     """Cluster the columns of ``matrix.txt`` and show the dendrogram.

     Switches ``self.state`` to "district", packs the analysis frame,
     clears the canvas, then clusters the rotated matrix, draws the
     dendrogram to ``districts.jpg`` and inserts that image through
     ``self.insert_image``.
     """
     self.state = "district"
     self.analysis_frame.pack(side=TOP, fill=BOTH)
     self.canvas.delete("all")  # drop everything drawn so far

     matrix_contents = clusters.readfile("matrix.txt")
     self.party_list, self.district_list, self.data = matrix_contents

     # Districts are clustered on the rotated matrix.
     rotated = clusters.rotatematrix(self.data)
     tree = clusters.hcluster(rotated, distance=clusters.sim_distance)
     output_jpeg = 'districts.jpg'
     clusters.drawdendrogram(tree, self.district_list, jpeg=output_jpeg)
     self.insert_image(output_jpeg)
示例#13
0
def kmeans():
    """Run k-means for k = 5, 10, 20 and write one report file per k.

    Each ``Outputs/kclust_<k>.txt`` starts with the iteration count, then
    lists every cluster as a bracketed, comma-separated run of blog names.
    The raw clusters and their count are also printed to stdout.
    """
    karr = [5, 10, 20]
    blogs, colnames, data = clusters.readfile('Outputs/blogdata.txt')
    for k in karr:
        kclust, itercount = clusters.kcluster(data, k=k)
        print(kclust)
        print(len(kclust))
        # The with-block closes the report even if a write fails.
        with open("Outputs/kclust_%d.txt" % k, 'w') as f:
            f.write("Iteration count: %d \n" % itercount)
            for cluster in kclust:
                f.write("****************************\n")
                names = "".join(blogs[blogid] + ", " for blogid in cluster)
                f.write("[" + names + "]\n")
示例#14
0
 def get_clusture(self, param):
     """
     Draw and show the dendrogram for the requested axis of the data file.

     param - str -> "Country" clusters the rows and labels them with the
     country names; "Criterias" clusters the rotated matrix and labels it
     with the column names.

     The image is written to 'clustured2.jpg' (stored in self.jpg_names)
     and displayed through self.show_image().

     Raises ValueError for any other value of param.
     """
     # Reject an unknown axis up front instead of failing later on an
     # unbound local variable.
     if param not in ("Country", "Criterias"):
         raise ValueError(
             "param must be 'Country' or 'Criterias', got %r" % (param,))

     country_names, records, records_data = clusters.readfile(
         self.writed_names)
     if param == "Country":
         clust = clusters.hcluster(records_data)
         label = country_names
     else:
         rotated = clusters.rotatematrix(records_data)
         clust = clusters.hcluster(rotated)
         label = records
     self.jpg_names = 'clustured2.jpg'
     clusters.drawdendrogram(clust, labels=label, jpeg=self.jpg_names)
     self.show_image()
示例#15
0
def getKmeans():
    """Print the blog names in every k-means cluster for k = 5, 10 and 20.

    For each k a "K value is <k>" line is printed, followed by one line
    per cluster holding the list of blog names assigned to it.
    """
    blognames, words, data = clusters.readfile("blogdata.txt")
    for k in (5, 10, 20):
        # Single-argument print() gives the same text on Python 2 and 3.
        print("K value is %d" % k)
        kclust = clusters.kcluster(data, k=k)
        for centroid in range(k):
            print("\t\t" + str([blognames[r] for r in kclust[centroid]]))
示例#16
0
def getKmeans():
	"""Print the blog names in every k-means cluster for k = 5, 10 and 20.

	For each k a "K value is <k>" line is printed, followed by one line
	per cluster holding the list of blog names assigned to it.
	"""
	blognames, words, data = clusters.readfile('blogdata.txt')
	for k in (5, 10, 20):
		# Single-argument print() gives the same text on Python 2 and 3.
		print("K value is %d" % k)
		kclust = clusters.kcluster(data, k=k)
		for centroid in range(k):
			print("\t\t" + str([blognames[r] for r in kclust[centroid]]))
def kMean():
    """Run k-means for k = 5, 10, 20 and write one numbered report per k.

    Each ``kclust_<k>.txt`` starts with the iteration count, then lists
    every cluster (numbered from 1) with its blogs (numbered from 1 within
    the cluster). The raw clusters and their count are printed to stdout.
    """
    kMeanValues = [5, 10, 20]
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    for k in kMeanValues:
        kclust, itercount = clusters.kcluster(data, k=k)
        print(kclust)
        print(len(kclust))
        # The with-block closes the report even if a write fails.
        with open("kclust_%d.txt" % k, 'w') as f:
            f.write("Total Number Of Iterations: %d \n" % itercount)
            for clusterCount, cluster in enumerate(kclust, 1):
                f.write("---\n")
                f.write("Cluster %d \n" % clusterCount)
                for position, blogid in enumerate(cluster, 1):
                    f.write(str(position) + ".\t" + blogs[blogid] + "\n")
                f.write("\n")
    def Clustering(self):
        """Cluster the journalist matrix and list the result in ``All_Results``.

        Shows the error message and stops when ``data_dict1`` is empty.
        Otherwise rebuilds the matrix file through ``self.Make_Matrix()`` and
        reads it back. Radio value 0 lists the hierarchical clustering line
        by line; radio value 1 runs k-means with the k from ``self.Valueof_k``
        and lists the clusters, largest first, with the names of their rows.

        Errors are reported on stderr rather than raised, so the caller is
        never interrupted.
        """
        try:
            # Return value unused; the call is kept for any side effects.
            self.fetcher_Journalist_with_Titles()

            if len(data_dict1) == 0:
                self.Error_Message()
                return

            self.Make_Matrix()
            Journlist, word, freq = clusters.readfile('Matrix')
            if self.Radio_Values3.get() == 0:
                H_clustering = hcluster(freq)
                self.All_Results.delete(0, END)
                # Render the tree once instead of once per output line.
                rendered = clust2str(H_clustering, labels=Journlist)
                for line in rendered.split('\n'):
                    self.All_Results.insert(END, line)
            elif self.Radio_Values3.get() == 1:
                K_Vlaue = self.Valueof_k.get()
                Cluster_Value = kcluster(freq, k=int(K_Vlaue))
                by_size = sorted(((len(members), members)
                                  for members in Cluster_Value),
                                 reverse=True)
                self.All_Results.delete(0, END)
                for number, (_size, members) in enumerate(by_size, 1):
                    # Members are row indices of the matrix, so they map to
                    # the row labels read from the same file.
                    new_str = "".join(
                        str(Journlist[k]) + "  " for k in members)
                    self.All_Results.insert(
                        END,
                        "Cluster %d:{" % number + new_str + "}" + "\n")
        except Exception:
            # Keep the best-effort behaviour, but leave a trace of the failure.
            import traceback
            traceback.print_exc()
示例#19
0
def main():
    """Cluster the users of ``1000_terms.csv`` for three k values.

    The file's lines are handed to ``clusters.readfile``; the three
    clusterings from ``get_clusts`` are mapped to user names, printed, and
    written out through ``output_clusters`` with sizes 5, 10 and 20.
    Returns 0.
    """
    with open('1000_terms.csv') as handle:
        lines = handle.readlines()

    users, words, data = clusters.readfile(lines)
    kclust5, kclust10, kclust20 = get_clusts(data)

    named = [get_users_clust(kclust, users)
             for kclust in (kclust5, kclust10, kclust20)]

    for groups in named:
        print(groups)

    for groups, size in zip(named, (5, 10, 20)):
        output_clusters(groups, size)

    return 0
示例#20
0
 def clustering_button(self):
     """Cluster ``Will_be_Cluestered.txt`` and list the result in the listbox.

     Shows the error message and stops when ``database`` is empty. The
     clustering type is looked up in ``values_of_clustering`` by the radio
     value. "Hierarcial" lists the rendered tree line by line; "K-Means"
     runs k-means with the k from ``self.Value_of_k`` and lists the
     clusters, largest first, with the names of their rows.
     """
     if len(database) == 0:
         self.Error_Message_Function()
         return
     prof_names, words, data = clusters.readfile("Will_be_Cluestered.txt")
     # Determining the type of clustering with the dictionary.
     type_of_clustering = values_of_clustering[int(
         self.Radio_Values3.get())]
     if type_of_clustering == "Hierarcial":
         clust = clusters.hcluster(data)
         self.All_Results_Part.delete(0, END)
         # Render once; the final split element is dropped, as the original
         # did (it noted the last line is an empty string).
         rendered = clusters.clust2str(clust, labels=prof_names)
         for line in rendered.split('\n')[:-1]:
             self.All_Results_Part.insert(END, line)
     elif type_of_clustering == "K-Means":
         # k comes from the entry widget.
         clust = clusters.kcluster(data, k=int(self.Value_of_k.get()))
         clusters_by_size = sorted(((len(members), members)
                                    for members in clust),
                                   reverse=True)
         self.All_Results_Part.delete(0, END)
         for number, (_size, members) in enumerate(clusters_by_size, 1):
             # Members are row indices of the data file, so they map to the
             # row labels read from that same file.
             new_str = "".join(str(prof_names[k]) + "  " for k in members)
             self.All_Results_Part.insert(
                 END, "Cluster %d:{" % number + new_str + "}" + "\n")
示例#21
0
def do_non_stem():
    """Run the clustering pipeline on the unstemmed blog data.

    Generates the blog file, then writes an ASCII dendrogram, a JPEG
    dendrogram, a k-means report for k = 5, 10 and 20, and a 2-D
    scaled-down picture, all under ``datafiles/``.
    """
    generate_blogfile()
    labels, _vocabulary, vectors = clusters.readfile('datafiles/blogtop500.txt')

    # Hierarchical clustering: text rendering first, then the JPEG one.
    dendrogram = clusters.hcluster(vectors)
    with open("datafiles/blogtop500_asciideno.txt", "w+") as ascii_file:
        clusters.printclust2file(dendrogram, ascii_file, labels=labels)
    clusters.drawdendrogram(dendrogram, labels, jpeg='datafiles/blogtop500_deno.jpg')

    # K-means for each k, logged into one shared file.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as log:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            log.write("K=%d\n" % k)
            log.write("Iterations\n")
            partition = clusters.kcluster_toFile(vectors, k=k, out=log)
            log.write("Centroid Values\n-------------------------\n")
            for position, row_ids in enumerate(partition, start=1):
                header = "Centroid #%d" % position
                print(header)
                log.write(header + "\n")
                picked = []
                for row_id in row_ids:
                    picked.append(labels[row_id])
                    print(picked[-1])
                log.write("%s\n" % ', '.join(picked))
            log.write("=================================\n")
            print("-------")

    # Dimensionality reduction, then the 2-D picture of similar blogs.
    with open("datafiles/dimensionReductionNonStemmed.txt", "w+") as reduction_log:
        projected = clusters.scaledown_logiter(vectors, out=reduction_log)
    clusters.draw2d(projected, labels, jpg='datafiles/blogtop500_clust2d.jpg')
示例#22
0
#!/usr/bin/env python
import clusters
import question2b_conference_truth
import question2b_policy_truth
import question2b_race_truth
import question2b_review_truth
import question2b_story_truth
from svm import *
import svmutil

# Word-count matrix for the blog entries, loaded once at import time.
datafile = '../data/blog_entries_word_data.tsv';
# NOTE(review): not referenced in the visible code; the loop below
# hard-codes 49 as its last index instead.
training = 50

blognames,words,data=clusters.readfile(datafile)


def calculate_conference():
    """Build an SVM problem from the first 50 rows and their truth labels.

    NOTE(review): this excerpt appears truncated: ``correct``, ``prob``
    and ``param`` are set up but never used in the visible code.
    """
    correct = 0
    answers = []
    input = []
    count = 0
    # Pair each data row with the truth label at the same position,
    # stopping after index 49.
    for d in data:
        answers.append(question2b_conference_truth.truth[count])
        input.append(d)
        if count == 49:
            break
        count += 1

    prob = svmutil.svm_problem(answers, input)
    param = svmutil.svm_parameter('-t 2 -c 4')
    param.cross_validation = 1
示例#23
0
#Shawn Jones

#!/usr/local/bin/python

# all code here stolen shamelessly from 

# "Programming Collective Intelligence, Chapter 3"

import sys

sys.path.insert(0, '../libs')

import clusters

# Load the blog/word matrix and build its hierarchical clustering.
blognames,words,data=clusters.readfile('blogdata1V2.txt')

clust = clusters.hcluster(data)

# print ASCII dendrogram

clusters.printclust(clust, labels=blognames)

# save JPEG dendrogram

clusters.drawdendrogram(clust, blognames, jpeg='blogclust.jpg')

示例#24
0
    # Take the average of the top k results
    for i in range(k):
        idx=dlist[i][1]
        avg+=data[idx]['result']
    avg=avg/k
    return avg

def exclude(data, idx):
    """Return a new list with every item of ``data`` except the one at ``idx``.

    ``data`` is not modified. An ``idx`` that matches no position
    (negative or past the end) yields a plain copy.
    """
    return [item for position, item in enumerate(data) if position != idx]

if __name__ == '__main__':
    # Script entry point. NOTE(review): this excerpt appears truncated
    # after the trailing comments of the k loop.
    blognames, terms, data = clusters.readfile('blog_term_matrix.csv')
    
    # One record per blog: its vector as 'input' and its row index as 'result'.
    rows = []
    for i in range(len(data)):
        # NOTE(review): int_d is computed but never used; 'input' is built
        # from the unconverted row. Confirm which was intended.
        int_d = [int(c) for c in data[i]]
        rows.append({
            'input' : tuple(data[i]), 
            'result' : i
        })
    
    for k in [1,2,5,10,20]: 
        # Heading for this k, underlined with '=' of the same length.
        print('Using k=%s' % k)
        print('=' * len('Using k=%s' % k))
        
        # Nearest blog for 99th blog : F-Measure
        # Exclude 99th row from rows
示例#25
0
#### Calling generatefeedvector (book p. 54) to generate the blogdata file failed. Is it because the URLs in feedlist cannot be opened?
### downloadzebodata also failed to generate zebo.txt. sigh
import clusters

# Step 1: load the blog/word matrix.
blognames,words,data = clusters.readfile('blogdatadown.txt')#1
#clust = clusters.hcluster(data)
#print (clust)  # sure enough, the value printed from inside the function is different each time too.
#print(blognames)

#clusters.printclust(clust, labels = blognames)#2

#clusters.drawdendrogram(clust, blognames, jpeg = 'blogclust.jpg')#3

# Step 4: rotate the matrix so the words are clustered instead of the
# blogs, then draw the word dendrogram.
rdata = clusters.rotatematrix(data)#4
wordclust = clusters.hcluster(rdata)
clusters.drawdendrogram(wordclust, labels = words, jpeg = 'wordclust.jpg')
'''
kclust = clusters.kcluster(data, k = 4)#5
print ([blognames[r] for r in kclust[0]])
print ([blognames[r] for r in kclust[1]])

import urllib.request#6
from bs4 import BeautifulSoup
c = urllib.request.urlopen('https://en.wikipedia.org/wiki/Jon_Snow')
soup =  BeautifulSoup(c.read(),"lxml")#这里非常有趣! 感觉有空需要看下这个源代码库呀。
links = soup('a')#所以我还是不懂beautiful soup 的用法呀。
print(links[10])
print(links[10]['href'])
#这一段是教BS的。
示例#26
0
文件: run.py 项目: wz125/courses
def countword():
  """Read ``blogdata1.txt`` and build its hierarchical clustering.

  The resulting tree is not returned or stored.
  """
  labels, terms, rows = clusters.readfile('blogdata1.txt')
  tree = clusters.hcluster(rows)
示例#27
0
文件: run.py 项目: wz125/courses
def prefer():
  """Cluster ``zebo.txt`` with the ``clusters.tanamoto`` distance and draw it.

  The ``clusters`` module is reloaded first; the dendrogram is drawn with
  ``drawdendrogram``'s default output target.
  """
  reload(clusters)
  labels, _people, matrix = clusters.readfile('zebo.txt')
  tree = clusters.hcluster(matrix, distance=clusters.tanamoto)
  clusters.drawdendrogram(tree, labels)
示例#28
0
文件: run.py 项目: wz125/courses
def ColumnClustering():
  """Cluster the columns (words) of ``blogdata1.txt`` into ``wordclust.jpg``.

  The ``clusters`` module is reloaded first; the matrix is rotated so the
  clustering runs over its columns.
  """
  reload(clusters)
  _blogs, column_labels, matrix = clusters.readfile('blogdata1.txt')
  rotated = clusters.rotatematrix(matrix)
  tree = clusters.hcluster(rotated)
  clusters.drawdendrogram(tree, labels=column_labels, jpeg='wordclust.jpg')
示例#29
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import clusters

# Load the feed matrix, scale it down to coordinates and draw the result
# into 2d.jpg, labelled with the blog names.
blognames,words,data = clusters.readfile( './../data/feed_list.csv' )

coords = clusters.scaledown(data)

clusters.draw2d(coords, blognames, jpeg="2d.jpg")
示例#30
0
文件: ascii.py 项目: vnwala/cs595-f14
import clusters
# Load the blog vectors and build the hierarchical clustering.
# NOTE(review): clust is only used by the commented-out calls below.
blognames, words, data = clusters.readfile('blogVectorResult.txt')
clust = clusters.hcluster(data)
#clusters.printclust(clust,labels=blognames)
reload(clusters)
#clusters.drawdendrogram(clust,blognames,jpeg='blogclust.jpg')
#kclust=clusters.kcluster(data,k=20)
# Scale the vectors down to coordinates and draw them into blogs2d.jpg.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='blogs2d.jpg')
示例#31
0
import clusters


# Load the 500-blog matrix and build its hierarchical clustering.
blognames,words,data=clusters.readfile('blogList500-matrix.txt')
clust = clusters.hcluster(data)

# print ASCII dendrogram
clusters.printclust(clust, labels=blognames)

# save JPEG dendrogram
clusters.drawdendrogram(clust, blognames, jpeg='blogclust-q5.jpg')
def multidim():
    """Scale ``job_projects`` down to coordinates and draw ``job_multidim.jpg``."""
    labels, _projects, matrix = clusters.readfile('job_projects')
    clusters.draw2d(clusters.scaledown(matrix), labels, jpeg='job_multidim.jpg')
示例#33
0
def createMDS():
    """Scale ``blogdata.txt`` down to coordinates and draw ``blogs2d.jpg``."""
    labels, _terms, matrix = clusters.readfile('blogdata.txt')
    positions = clusters.scaledown(matrix)
    clusters.draw2d(positions, labels, jpeg='blogs2d.jpg')
示例#34
0
#!/usr/local/bin/python

# all code here stolen shamelessly from 
# "Programming Collective Intelligence, Chapter 3"

import sys

sys.path.insert(0, '../libs')

import clusters

# Load the 500-blog matrix, scale it down to coordinates and draw the
# result into blogs2d.jpg, labelled with the blog names.
blognames,words,data=clusters.readfile('../q1/blogdata500.txt')

coords = clusters.scaledown(data)

clusters.draw2d(coords, blognames, jpeg='blogs2d.jpg')
def draw_dendogram():
    """Cluster ``job_projects`` hierarchically and draw ``jobclust.jpg``."""
    labels, _projects, matrix = clusters.readfile('job_projects')
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, labels, jpeg='jobclust.jpg')
示例#36
0
import clusters
import numpredict


def findNearestNeighbour(i, data, k):
    """Print the blog names returned by ``knnestimate`` for row ``i``.

    ``data[i]`` is the query row. Item ``[1]`` of every returned entry is
    used as an index into the module-level ``blogs`` list.
    """
    for neighbour in numpredict.knnestimate(data, data[i], k):
        print(blogs[neighbour[1]])


# Load the matrix, then list the nearest neighbours of two named blogs
# for several values of k.
blogs, text, data = clusters.readfile("blogDataForknn.txt")
for name in "F-Measure", "Web Science and Digital Libraries Research Group":
    for k in 1, 2, 5, 10, 20:
        print("Blog Name", name)
        print("For K", k)
        # blogs.index raises ValueError if the name is not in the file.
        findNearestNeighbour(blogs.index(name), data, k)
        print("\n\n")
示例#37
0
文件: run.py 项目: wz125/courses
def kmean():
  """Split ``blogdata.txt`` into two k-means clusters and return their names.

  The ``clusters`` module is reloaded first. Returns a list with two
  entries, one per cluster, each a list of the row names assigned to it
  (previously these lists were built and thrown away).
  """
  reload(clusters)
  rownames,words,data=clusters.readfile('blogdata.txt')
  kclust=clusters.kcluster(data,k=2)
  return [[rownames[r] for r in kclust[i]] for i in range(2)]
示例#38
0
import codecs
import clusters, random
import shelve
# Load the matrix and cache its three parts in the "melone_data" shelf.
# NOTE(review): no sh.close() / sh.sync() is visible in this excerpt —
# confirm the shelf is closed later in the file.
sh = shelve.open("melone_data")
standnames, words, data = clusters.readfile("jojodata.txt")
datasize = len(data)  # number of data rows; used as the index range below
sh["standnames"] = standnames
sh["words"] = words
sh["data"] = data

tmpvec = []

def geneticoptimize(costf,popsize=50,\
     mutprob=0.3,elite=0.3,maxiter=100):
    # NOTE(review): this excerpt ends after the two nested helpers; the
    # main loop that would use costf, popsize, mutprob, elite and maxiter
    # is not visible here.
    def mutate(vec):
        # Replace the first j entries of vec (1 <= j <= len(vec) - 1) with
        # j distinct indices from range(datasize) that vec does not
        # already contain; the tail vec[j:] is kept.
        j = random.randint(1, len(vec) - 1)
        except_vec = [
            y for y in filter(lambda x: x not in vec,
                              [x for x in range(datasize)])
        ]
        new_dna = random.sample(except_vec, j)

        return new_dna + vec[j:]

    def crossover(r1, r2):
        # Splice the head of r1 onto the tail of r2 at a random cut point.
        i = random.randint(1, datasize - 2)
        result = r1[0:i] + r2[i:]
        # NOTE(review): the loop body recomputes the same value with the
        # same i, so if the first splice does not contain exactly 5
        # distinct entries this loop never ends. The cut point was
        # probably meant to be redrawn inside the loop — confirm.
        while len(set(result)) != 5:
            result = r1[0:i] + r2[i:]
        return result
示例#39
0
文件: run.py 项目: wz125/courses
def prefer2d():
  """Scale ``blogdata.txt`` down to coordinates and draw ``blogs2d.jpg``.

  The ``clusters`` module is reloaded first.
  """
  reload(clusters)
  labels, _terms, matrix = clusters.readfile('blogdata.txt')
  positions = clusters.scaledown(matrix)
  clusters.draw2d(positions, labels, jpeg='blogs2d.jpg')
示例#40
0
def printwords(list, data, words):
    """Print the five largest column sums of a cluster and their words.

    list  -- row indices into ``data`` that make up the cluster
    data  -- rows of numeric values; every row has one value per word
    words -- column labels, in the same order as the columns of ``data``

    The selected rows are summed column by column. The five largest sums
    are printed in ascending order, followed by a heading and the words
    belonging to those columns, in the same order.
    """
    # zeros/add are presumably numpy's, imported elsewhere in the file.
    vecsum = zeros(len(data[0]))
    for row in list:
        vecsum = add(data[row], vecsum)

    # Column indices sorted by their sum; the last five are the largest.
    topwrds = sorted(range(len(vecsum)), key=lambda x: vecsum[x])[-5:]
    # Single-argument print() gives the same text on Python 2 and 3.
    for r in topwrds:
        print(vecsum[r])
    print("The top words of this cluster are: \n")
    for r in topwrds:
        print(words[r])

	
# Cluster the matrix into five groups and report on the first two.
moviename, words, data = clusters.readfile('res/blogdata2.txt')
# Single-argument print() gives the same text on Python 2 and 3.
print('Processing......')
kclust = clusters.kcluster(data, k=5)

for number in (1, 2):
    members = kclust[number - 1]
    print("\t\t******* CLUSTER %d *******" % number)
    printwords(members, data, words)
    print('\n')
    print([moviename[r] for r in members])
    print('\n\n\n')
示例#41
0
# Valentina Neblitt-Jones
# CS 595 Introduction to Web Science
# Fall 2013
# Assignment 9 Question 2

import sys

sys.path.insert(0, '/Users/vneblitt/Documents/cs595-f13/assignment09/library')

import clusters

# Create clusters
# Load the blog matrix (absolute, machine-specific path) and cluster it.
blognames,words,data=clusters.readfile('/Users/vneblitt/Documents/cs595-f13/assignment09/q01/blogdata1.txt')
clust=clusters.hcluster(data)

# Create ASCII dendrogram
clusters.printclust(clust,labels=blognames)

# Create Nicer dendrogram with PIL
clusters.drawdendrogram(clust,blognames,jpeg='blogclust.jpg')
示例#42
0
文件: ascii.py 项目: vnwala/cs595-f14
import clusters
# Load the blog vectors and build the hierarchical clustering.
# NOTE(review): clust is only used by the commented-out calls below.
blognames,words,data=clusters.readfile('blogVectorResult.txt')
clust=clusters.hcluster(data)
#clusters.printclust(clust,labels=blognames)
reload(clusters)
#clusters.drawdendrogram(clust,blognames,jpeg='blogclust.jpg')
#kclust=clusters.kcluster(data,k=20)
# Scale the vectors down to coordinates and draw them into blogs2d.jpg.
coords=clusters.scaledown(data)
clusters.draw2d(coords,blognames,jpeg='blogs2d.jpg')
import clusters

# Build and print an upper-triangular similarity table for the documents.
docs, words, data = clusters.readfile('titles_vectors.txt')
distance_func = clusters.euclidean

# Header row: an empty corner cell followed by the padded document labels.
padded_docs = ["{:<5}".format(d) for d in docs]
similarity_matrix = [["     "] + padded_docs]

# One row per document: its label, then blank cells to be filled in below.
row_count = len(data)
blank_cell = ' ' * 5
for row in range(row_count):
    similarity_matrix.append([docs[row]] + [blank_cell] * row_count)

print("Euclidean distance - for documents")
# Only pairs with i < j are filled; the score is 1 / (1 + distance).
for i in range(row_count - 1):
    for j in range(i + 1, row_count):
        dist = distance_func(data[i], data[j])
        score = 1.0 / (1.0 + dist)
        similarity_matrix[i + 1][j + 1] = "{:<5}".format(
            "{:.2f}".format(score))

clusters.print_2d_array(similarity_matrix)
示例#44
0
        rows.append(row)
    except:
        skip -= 1

size = size + skip
print(skip, size)
#print(rows)

# Write the matrix file: a header line of words, then one line per row
# with its title and the value of every word (0 when the row lacks it).
# The with-block closes the file even if a row is malformed.
with open("newsdata.txt", "w") as f:
    f.write("|||".join(word_list) + "\n")
    for i in range(size):
        row = rows[i]

        last_row = [row["TITLE"]]
        for word in word_list:
            last_row.append(str(row[word]) if word in row else str(0))
        f.write("|||".join(last_row) + "\n")

# what's data :data.append([float(x) for x in p[1:]]) p:row
blognames, words, data = clusters.readfile("newsdata.txt")
clust = clusters.hcluster(data)

clusters.printclust(clust, labels=blognames)
示例#45
0
#!/usr/local/bin/python

import clusters

# Load the blog matrix, scale it down to coordinates and draw the result
# into blogs.jpg, labelled with the blog names.
blog, words, data = clusters.readfile('blogdata.txt')

coordinates = clusters.scaledown(data)

clusters.draw2d(coordinates, blog, jpeg='blogs.jpg')
示例#46
0
#!/usr/local/bin/python

# all code here stolen shamelessly from 
# "Programming Collective Intelligence, Chapter 3"

import sys
import argparse 

sys.path.insert(0, '../libs')

import clusters

# Load the blog/term matrix, scale it down to coordinates and draw the
# result into ../producedFiles/2dBlogSpace.jpg.
blognames,words,data=clusters.readfile('../producedFiles/blogtermmatrix.txt')

coords = clusters.scaledown(data)

clusters.draw2d(coords, blognames, jpeg='../producedFiles/2dBlogSpace.jpg')
示例#47
0
文件: do.py 项目: yanak/clustering
#!/usr/bin/env python
# -*- coding: utf-8 -*-


import clusters

# Load the matrix and build its hierarchical clustering.
blognames,words,data=clusters.readfile( './../data/banpaku_utf8.csv' )
clust=clusters.hcluster(data)

# Show the result in the console (text UI)
#clusters.printclust( clust, labels=blognames)

# Show the result as an image
reload(clusters)
clusters.drawdendrogram(clust, blognames, jpeg="banpaku_reg.jpg")
示例#48
0
import clusters

# Split the blogs into 20 k-means clusters and print the blog names of
# each cluster, one list per line.
blognames,words,data=clusters.readfile('blogsdata.txt')
kclust=clusters.kcluster(data,k=20)
# Iterate over the clusters instead of hard-coding the indices 0..19.
for cluster in kclust:
    print([blognames[r] for r in cluster])
示例#49
0
    distancelist.sort()

    return distancelist


def euclidean(v1, v2):
    """Return the Euclidean distance between ``v1`` and ``v2``.

    Only the first ``len(v1)`` positions are compared, so ``v2`` must be
    at least as long as ``v1``.
    """
    # TODO: flagged for reimplementation by the original author.
    total = 0.0
    for position, left in enumerate(v1):
        total += (left - v2[position])**2
    return math.sqrt(total)


#getBlogs()
#main()

# Load the matrix and dump its three parts for inspection.
blognames, words, data = clusters.readfile('similarblogdata.txt')
print(blognames)
print(words)
print(data)
# Sanity check: report every pair of consecutive rows whose lengths differ.
for i in range(len(data[1:])):
    if len(data[i + 1]) != len(data[i]):
        print(blognames[i + 1])
        print(len(data[i + 1]))
        print(blognames[i])
        print(len(data[i]))
# Hierarchical clustering, shown as text and saved as sblogclust.jpg.
clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='sblogclust.jpg')

# K-means with five clusters, printed by a helper defined elsewhere.
kclust = clusters.kcluster(data, k=5)
printkclustValues(kclust)
示例#50
0
文件: run.py 项目: wz125/courses
def drawingtheDendrogram():
  """Cluster ``blogdata1.txt`` hierarchically and draw ``blogclust.jpg``.

  The ``clusters`` module is reloaded after clustering and before drawing.
  """
  labels, _terms, matrix = clusters.readfile('blogdata1.txt')
  tree = clusters.hcluster(matrix)
  reload(clusters)
  clusters.drawdendrogram(tree, labels, jpeg='blogclust.jpg')
示例#51
0
import clusters
# Load the blog matrix, cluster it hierarchically and save the dendrogram
# as blogcluster.jpg.
blognames, words, data = clusters.readfile('blogdataascii.txt')
clust = clusters.hcluster(data)
clusters.drawdendrogram(clust, blognames, jpeg='blogcluster.jpg')
示例#52
0
import clusters

def printCentroid(name, kcluster, n):
    """Print the members of the first ``n`` clusters and append them to a log.

    name     -- labels, indexed by the row numbers stored in the clusters
    kcluster -- sequence of clusters, each a collection of row numbers
    n        -- number of clusters to report; also names the log file
                ``kcluster<n>.txt``

    Every cluster produces a "Centroid  <i> :" line followed by the list
    of its member labels, written both to stdout and to the log file.
    """
    # Open the log once and let the with-block close it, instead of
    # opening a fresh, never-closed handle for every line.
    with open("kcluster" + str(n) + ".txt", 'a+') as log:
        for x in range(n):
            heading = ("Centroid ", str(x + 1), ":")
            members = [name[r] for r in kcluster[x]]
            print(*heading)
            print(members)
            print(*heading, file=log)
            print(members, file=log)


# Run k-means for three cluster counts and report each result.
name, word, data = clusters.readfile('blogdata1 (copy).txt')

for cluster_count in (5, 10, 20):
    kcluster = clusters.kcluster(data, k=cluster_count)
    printCentroid(name, kcluster, cluster_count)