Exemplo n.º 1
0
def do_stemmed():
    """Run the clustering pipeline on the stemmed top-500 blog data.

    Calls ``generate_blogfile_stem()``, reads the resulting matrix, builds a
    hierarchical clustering (written as an ASCII dendrogram and passed to
    ``drawdendrogram``), logs k-means runs for k = 5, 10 and 20, and finally
    logs the scale-down step and hands the result to ``draw2d``.  Every file
    is written under ``datafiles/``.
    """
    generate_blogfile_stem()
    blognames, words, data = clusters.readfile('datafiles/blogtop500_stemmed.txt')

    # Hierarchical clustering: text dendrogram first, then the image.
    tree = clusters.hcluster(data)
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as ascii_out:
        clusters.printclust2file(tree, ascii_out, labels=blognames)
    clusters.drawdendrogram(tree, blognames, jpeg='datafiles/blogtop500stemmed_deno.jpg')

    # K-means per cluster count; progress goes to stdout and to the log file.
    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as log:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            log.write("K=%d\n" % k)
            log.write("Iterations\n")
            groups = clusters.kcluster_toFile(data, k=k, out=log)
            log.write("Centroid Values\n-------------------------\n")
            for number, group in enumerate(groups, start=1):
                print("Centroid #%d" % number)
                log.write("Centroid #%d\n" % number)
                members = []
                for member_idx in group:
                    name = blognames[member_idx]
                    print(name)
                    members.append(name)
                log.write("%s\n" % ', '.join(members))
            log.write("=================================\n")
            print("-------")

    # Dimensionality reduction, logged to its own file, then the 2-D drawing.
    with open("datafiles/dimensionReductionStemmed.txt", "w+") as reduction_log:
        coords = clusters.scaledown_logiter(data, out=reduction_log)
    clusters.draw2d(coords, blognames, jpg='datafiles/blogtop500stemmed_clust2d.jpg')
Exemplo n.º 2
0
def do_non_stem():
    """Run the clustering pipeline on the un-stemmed top-500 blog data.

    Calls ``generate_blogfile()``, reads the matrix back, produces the ASCII
    and JPEG dendrograms, logs k-means for k = 5, 10 and 20, and logs the
    scale-down step before passing its result to ``draw2d``.  Every file is
    written under ``datafiles/``.
    """
    # Build the input file, then load it.
    generate_blogfile()
    blognames, words, data = clusters.readfile('datafiles/blogtop500.txt')

    # Hierarchical clustering, emitted as text and as an image.
    hierarchy = clusters.hcluster(data)
    with open("datafiles/blogtop500_asciideno.txt", "w+") as text_out:
        clusters.printclust2file(hierarchy, text_out, labels=blognames)
    clusters.drawdendrogram(hierarchy, blognames, jpeg='datafiles/blogtop500_deno.jpg')

    # K-means runs; each run is echoed to stdout and appended to the log.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as kmeans_log:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            kmeans_log.write("K=%d\n" % k)
            kmeans_log.write("Iterations\n")
            assignments = clusters.kcluster_toFile(data, k=k, out=kmeans_log)
            kmeans_log.write("Centroid Values\n-------------------------\n")
            # One section per centroid listing the blogs assigned to it.
            for position, row_indices in enumerate(assignments, start=1):
                print("Centroid #%d" % position)
                kmeans_log.write("Centroid #%d\n" % position)
                assigned = []
                for row in row_indices:
                    blog = blognames[row]
                    print(blog)
                    assigned.append(blog)
                kmeans_log.write("%s\n" % ', '.join(assigned))
            kmeans_log.write("=================================\n")
            print("-------")

    # Dimensionality reduction with its iteration log, then the 2-D drawing.
    with open("datafiles/dimensionReductionNonStemmed.txt","w+") as scale_log:
        projected = clusters.scaledown_logiter(data, out=scale_log)
    clusters.draw2d(projected, blognames, jpg='datafiles/blogtop500_clust2d.jpg')
Exemplo n.º 3
0
    for i in range(len(v1)):
        d += (v1[i] - v2[i])**2
    return math.sqrt(d)


#getBlogs()
#main()

# Load the similar-blog matrix and dump all three parts for inspection.
blognames, words, data = clusters.readfile('similarblogdata.txt')
print(blognames)
print(words)
print(data)

# Report every pair of neighbouring rows whose lengths differ.
for i in range(len(data) - 1):
    following, current = data[i + 1], data[i]
    if len(following) != len(current):
        print(blognames[i + 1])
        print(len(following))
        print(blognames[i])
        print(len(current))

# Hierarchical clustering: printed to the console and drawn to a JPEG.
clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='sblogclust.jpg')

# K-means for each cluster count, printing the result of every run.
for k in (5, 10, 20):
    kclust = clusters.kcluster(data, k=k)
    printkclustValues(kclust)

# Scale the data down and draw the resulting coordinates.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='sblogs2d.jpg')
Exemplo n.º 4
0
Arquivo: run.py Projeto: wz125/courses
def prefer2d():
    """Reload ``clusters``, scale ``blogdata.txt`` down and draw it."""
    reload(clusters)
    names, _terms, matrix = clusters.readfile('blogdata.txt')
    points = clusters.scaledown(matrix)
    clusters.draw2d(points, names, jpeg='blogs2d.jpg')
Exemplo n.º 5
0
#!/usr/local/bin/python

# all code here stolen shamelessly from 
# "Programming Collective Intelligence, Chapter 3"

import sys
import argparse 

sys.path.insert(0, '../libs')

import clusters

# Read the blog/term matrix produced earlier.
blognames, words, data = clusters.readfile(
    '../producedFiles/blogtermmatrix.txt'
)

# Scale the data down and draw the coordinates next to the input file.
coords = clusters.scaledown(data)
clusters.draw2d(
    coords,
    blognames,
    jpeg='../producedFiles/2dBlogSpace.jpg',
)
def multidim():
    """Scale the ``job_projects`` data down and draw it to ``job_multidim.jpg``."""
    row_labels, _columns, matrix = clusters.readfile('job_projects')
    layout = clusters.scaledown(matrix)
    clusters.draw2d(layout, row_labels, jpeg='job_multidim.jpg')
Exemplo n.º 7
0
def main():
    """Read ``blogdata.txt``, scale it down and draw it to ``blogs2d.jpg``."""
    labels, _vocabulary, rows = clusters.readfile('blogdata.txt')
    positions = clusters.scaledown(rows)
    clusters.draw2d(positions, labels, jpeg='blogs2d.jpg')
Exemplo n.º 8
0
def mds():
    """Scale ``blogdata.txt`` down, draw it to ``mds.jpg`` and print the iteration count.

    This variant of ``clusters.scaledown`` is unpacked into two values: the
    coordinates and an iteration count.
    """
    names, _terms, matrix = clusters.readfile('blogdata.txt')
    layout, iterations_used = clusters.scaledown(matrix)
    clusters.draw2d(layout, labels=names, jpeg='mds.jpg')
    print('Iteration count: %d' % iterations_used)
Exemplo n.º 9
0
#!/usr/bin/python

import clusters
blognames, words, data = clusters.readfile('blogdata1.txt')
clust = clusters.hcluster(data)

# Question 2: hierarchical clustering, printed and drawn as a dendrogram.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg')

# Question 3: k-means for three cluster counts.
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the print statement is a syntax error on Python 3.
print("K = 5")
kclust5 = clusters.kcluster(data, k=5)
print("\nK = 10")
kclust10 = clusters.kcluster(data, k=10)
print("\nK = 20")
kclust20 = clusters.kcluster(data, k=20)

# Question 4: scale the data down and draw the coordinates.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='MDS.jpg')
                if frac>0.1 and frac<0.5:
                    wordlist.append(w)
    except:
        print 'Failed to parse feed %s' % feedurl

# Create a text file containing the matrix of all word counts from all blogs:
# a header row ('Blog' followed by every word in wordlist), then one
# tab-separated row per blog with 0 for words the blog does not contain.
#
# The `with` block guarantees the file is flushed and closed even if a write
# fails; open() replaces the Python-2-only file() builtin.
# NOTE(review): writing blog.encode('utf8') to a text-mode file assumes
# Python 2 str semantics -- confirm before running this under Python 3.
with open('blogdata.txt', 'w') as out:
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word)
    out.write('\n')
    for blog, wc in wordcounts.items():
        print(blog)
        out.write(blog.encode('utf8'))
        for word in wordlist:
            if word in wc:
                out.write('\t%d' % wc[word])
            else:
                out.write('\t0')
        out.write('\n')


"""
  Using a smaller feedlist, the original generated blogdata was 32M.
  
  ************************
  Printing out the cluster
  ************************
  import clusters as clusters
  blogentries, words, data = clusters.readfile('blogdata5.txt')
  coords = clusters.scaledown(data)
  clusters.draw2d(coords, blogentries, jpeg='blog_entries.jpg')
"
Exemplo n.º 11
0
#!/usr/local/bin/python

import clusters

# Read the blog matrix, scale it down and draw the coordinates.
blog, words, data = clusters.readfile('blogdata.txt')
coordinates = clusters.scaledown(data)
clusters.draw2d(
    coordinates,
    blog,
    jpeg='blogs.jpg',
)
Exemplo n.º 12
0
import clusters

# Load blogdata.txt, then scale down and draw to blogsMDS.jpg.
blog, words, data = clusters.readfile('blogdata.txt')

coords = clusters.scaledown(data)
clusters.draw2d(coords, blog, jpeg='blogsMDS.jpg')

Exemplo n.º 13
0
#!/usr/local/bin/python

import clusters

# Load the blog matrix; `daata` holds the scaled-down result handed to draw2d.
blognames, words, data = clusters.readfile('blogdata.txt')
daata = clusters.scaledown(data)
clusters.draw2d(
    daata,
    blognames,
    jpeg='MDS.jpg',
)
Exemplo n.º 14
0
#!/usr/bin/env python

import clusters

# Input matrix for this run.
datafile = "../data/word_data.tsv"

blognames, words, data = clusters.readfile(datafile)

# This variant of scaledown is unpacked as (iterations, coords).
iterations, coords = clusters.scaledown(data)

clusters.draw2d(coords, blognames, jpeg="../question4.jpg")

# Single-argument print(...) behaves the same on Python 2 and Python 3; the
# original print statement is a syntax error on Python 3.
print("iterations: {}".format(iterations))
Exemplo n.º 15
0
import clusters

# Read every line of the term file up front; the list of lines (not a path)
# is what gets handed to clusters.readfile below.
with open('1000_terms.csv') as f:
    file = list(f)

users, words, data = clusters.readfile(file)

# Scale the data down and draw the coordinates.
coords = clusters.scaledown(data)
clusters.draw2d(
    coords,
    users,
    jpeg='q5_data/twitter_user_MDS.jpg',
)
Exemplo n.º 16
0
import clusters

# Load the copied blog matrix, scale it down and draw it to mds.jpg.
name, word, data = clusters.readfile(
    'blogdata1 (copy).txt'
)
cluster = clusters.scaledown(data)
clusters.draw2d(
    cluster,
    name,
    jpeg='mds.jpg',
)
Exemplo n.º 17
0
import clusters
import Image
# Cluster the data hierarchically, draw the dendrogram and the scaled-down
# plot, then open the dendrogram image and wait for the user.
# Single-argument print(...) is used throughout because it prints the same
# text on Python 2 and Python 3.
moviename, words, data = clusters.readfile('res/blogdata2.txt')
print('Processing......')
clust = clusters.hcluster(data)
print('Output image is generating...')
clusters.drawdendrogram(clust, moviename, jpeg='output/finaloutput.jpg')
print("Scaling down...")
coords = clusters.scaledown(data)
clusters.draw2d(coords, moviename, jpeg='output/finaloutput2d.jpg')
image = Image.open('output/finaloutput.jpg')
image.show()

# Python 2's input() evaluates whatever is typed, so just pressing Enter
# raises an error; raw_input() returns the text untouched.  On Python 3
# raw_input no longer exists and input() already has that behaviour.
try:
    read_line = raw_input
except NameError:
    read_line = input
x = read_line("Press any key to quit....")
Exemplo n.º 18
0
## Main driver added
if __name__ == "__main__":
    # sys is needed for the stdout redirection below; importing it here keeps
    # the driver self-contained.
    import sys
    import clusters

    # Hierarchical clustering
    blognames, words, data = clusters.readfile('blogdata.txt')
    clust = clusters.hcluster(data)

    # ASCII dendrogram: stdout is pointed at the output file around the
    # printclust call.  try/finally restores stdout and `with` closes the
    # file even if printclust raises.
    with open('C:/Python27/myFiles/Assignment 9/ASCII-Dendrogram.txt', 'w') as out:
        orig_stdout = sys.stdout
        sys.stdout = out
        try:
            clusters.printclust(clust, labels=blognames)
        finally:
            sys.stdout = orig_stdout

    # JPEG dendrogram
    clusters.drawdendrogram(clust, blognames, jpeg='blogclust.jpg')
    print("Dendrodrams complete.")

    # K-means clustering for three cluster counts.
    print("K=5")
    kclust = clusters.kcluster(data, k=5)
    print("\n")
    print("K=10")
    kclust = clusters.kcluster(data, k=10)
    print("\n")
    print("K=20")
    kclust = clusters.kcluster(data, k=20)

    # Multidimensional scaling (the stray LaTeX "\end{verbatim}" that followed
    # this call in the original was a syntax error and has been removed).
    coords = clusters.scaledown(data)
    clusters.draw2d(coords, blognames, jpeg='blogs2d.jpg')
Exemplo n.º 19
0
import clusters

blognames, words, data = clusters.readfile("blogdata.txt")

# hierarchy clustering
# clust=clusters.hcluster(data)
##clusters.printclust(clust, labels=blognames)
# clusters.drawdendrogram(clust, blognames, jpeg='blogclust.jpg')

# column clustering
# rdata = clusters.rotatematrix(data)
# clust=clusters.hcluster(rdata)
# clusters.drawdendrogram(clust, words, jpeg='wordclust.jpg')

# k-means clustering
# kclust=clusters.kcluster(data, k=10)
# print [blognames[r] for r in kclust[0]]

# zebo.txt
# wants, people, data=clusters.readfile('zebo.txt')
# clust = clusters.hcluster(data, distance = clusters.tanimoto)
# clusters.drawdendrogram(clust, wants, jpeg='zebo_wants_clust.jpg')

# mds on the zebo data; `data` from blogdata.txt above is replaced here.
wants, people, data = clusters.readfile("zebo.txt")
# NOTE(review): `wants` is passed as scaledown's second positional argument;
# confirm that this clusters.scaledown variant expects labels there.
loc = clusters.scaledown(data, wants)
clusters.draw2d(loc, wants)

# Single-argument print(...) works on both Python 2 and Python 3.
print("hello world")
Exemplo n.º 20
0
import clusters

# Read the blog matrix, then scale it down and draw the result (MDS).
blognames, words, data = clusters.readfile(
    'blogdata.txt'
)
coordinates = clusters.scaledown(data)
clusters.draw2d(
    coordinates,
    blognames,
    jpeg='blogs2d.jpg',
)
Exemplo n.º 21
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import clusters

# Load the feed matrix, scale it down and draw the coordinates to 2d.jpg.
blognames, words, data = clusters.readfile('./../data/feed_list.csv')
coords = clusters.scaledown(data)
clusters.draw2d(
    coords,
    blognames,
    jpeg="2d.jpg",
)
# The triple-quoted string below is disabled code kept as a bare string
# literal; it is evaluated as a no-op expression and never executed.
'''
clust = cl.hcluster(data)
cl.printclust(clust,labels=blognames)
cl.drawdendrogram(clust,blognames,jpeg='blogclust.jpg')

rdata = cl.rotatematrix(data)
wordclust = cl.hcluster(rdata)
cl.printclust(wordclust,labels=words)
cl.drawdendrogram(wordclust,words,jpeg='wordclust.jpg')

k = 4
kclust = cl.kcluster(data,k=k)
l = [[blognames[r] for r in kclust[i]] for i in range(k)]
for ll in l:
    print len(ll),ll


kclust = cl.kcluster_np(data,k=k)
l = [[blognames[r] for r in kclust[i]] for i in range(k)]
for ll in l:
    print len(ll),ll


wants,people,data = cl.readfile('zebo')
clust = cl.hcluster(data,distance=cl.tanimoto)
cl.drawdendrogram(clust,wants)
'''

# Scale the data down and draw the coordinates to blogs2d.jpg.
# NOTE(review): `cl`, `data` and `blognames` are not defined in the visible
# lines -- presumably `cl` is the clusters module imported under an alias
# earlier in the original file; confirm.
coords = cl.scaledown(data)
cl.draw2d(coords, blognames, jpeg='blogs2d.jpg')
Exemplo n.º 23
0
def createMDS():
    """Scale ``blogdata.txt`` down and draw the coordinates to ``blogs2d.jpg``."""
    blog_labels, _word_columns, counts = clusters.readfile('blogdata.txt')
    clusters.draw2d(
        clusters.scaledown(counts),
        blog_labels,
        jpeg='blogs2d.jpg',
    )
Exemplo n.º 24
0
# -*- coding: utf-8 -*- 

import clusters

if __name__ == '__main__':
    # Load the blog/term matrix, scale it down and draw the coordinates.
    blognames, terms, data = clusters.readfile(
        'blog_term_matrix.csv'
    )
    coords = clusters.scaledown(data)
    clusters.draw2d(
        coords,
        blognames,
        jpeg='mds_blog_2d.jpg',
    )
Exemplo n.º 25
0
#!/usr/bin/env python

import clusters

# Input matrix for this run (the stray trailing semicolon has been dropped).
datafile = '../data/word_data_tfidf.tsv'

blognames, words, data = clusters.readfile(datafile)

# This variant of scaledown is unpacked as (iterations, coords).
iterations, coords = clusters.scaledown(data)

clusters.draw2d(coords, blognames, jpeg='../question5c.jpg')

# Single-argument print(...) behaves the same on Python 2 and Python 3; the
# original print statement is a syntax error on Python 3.
print("iterations: {}".format(iterations))
Exemplo n.º 26
0
import clusters
blogname, words, data = clusters.readfile('blogdata2.txt')

# Scale the rows down and draw them.
coords = clusters.scaledown(data)
clusters.draw2d(
    coords,
    blogname,
    jpeg='blog2d.jpg',
)

# Rotate the matrix and cluster it hierarchically, labelling the dendrogram
# with the words.
rdata = clusters.rotatematrix(data)
wordclust = clusters.hcluster(rdata)
clusters.drawdendrogram(
    wordclust,
    labels=words,
    jpeg='wordclust.jpg',
)
Exemplo n.º 27
0
import clusters    


# Load the blog matrix; this scaledown variant is unpacked into the
# coordinates and an iteration count.
blognames, words, data = clusters.readfile(
    'blogdata.txt'
)
coords, itercount = clusters.scaledown(data)
clusters.draw2d(
    coords,
    labels=blognames,
    jpeg='mds.jpg',
)
print('Iteration count: %d' % itercount)
Exemplo n.º 28
0
def createMDS():
    """Scale ``blogVector.txt`` down, draw it and print the iteration count.

    This variant of ``clusters.scaledown`` is unpacked into the coordinates
    and an iteration count; the coordinates are drawn to ``blogs2d.jpg``.
    """
    blognames, words, data = clusters.readfile('blogVector.txt')
    coords, iterationCount = clusters.scaledown(data)
    clusters.draw2d(coords, blognames, jpeg='blogs2d.jpg')

    # Formatting into one string keeps the "iterationCount <n>" output of the
    # original two-item print statement while also being valid on Python 3.
    print('iterationCount %s' % (iterationCount,))
Exemplo n.º 29
0
#!/usr/local/bin/python

import clusters

# Read blogdata.txt, scale it down and draw the coordinates to blogs.jpg.
blog, words, data = clusters.readfile(
    'blogdata.txt'
)
coordinates = clusters.scaledown(data)
clusters.draw2d(
    coordinates,
    blog,
    jpeg='blogs.jpg',
)
Exemplo n.º 30
0
#!/usr/bin/python

import clusters
blognames, words, data = clusters.readfile('blogdata1.txt')
clust = clusters.hcluster(data)

# Question 2: hierarchical clustering, printed and drawn as a dendrogram.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg')

# Question 3: k-means for three cluster counts.
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the print statement is a syntax error on Python 3.
print("K = 5")
kclust5 = clusters.kcluster(data, k=5)
print("\nK = 10")
kclust10 = clusters.kcluster(data, k=10)
print("\nK = 20")
kclust20 = clusters.kcluster(data, k=20)

# Question 4: scale the data down and draw the coordinates.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='MDS.jpg')

Exemplo n.º 31
0
#!/usr/local/bin/python

# all code here stolen shamelessly from 
# "Programming Collective Intelligence, Chapter 3"

import sys

sys.path.insert(0, '../libs')

import clusters

# Read the 500-blog matrix from the q1 directory.
blognames, words, data = clusters.readfile(
    '../q1/blogdata500.txt'
)

# Scale the data down and draw the coordinates to blogs2d.jpg.
coords = clusters.scaledown(data)
clusters.draw2d(
    coords,
    blognames,
    jpeg='blogs2d.jpg',
)
Exemplo n.º 32
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import clusters

# Load the feed matrix, scale it down and draw the coordinates to 2d.jpg.
blognames, words, data = clusters.readfile(
    './../data/feed_list.csv'
)
coords = clusters.scaledown(data)
clusters.draw2d(
    coords,
    blognames,
    jpeg="2d.jpg",
)