def kmeans_driver(threshold, k, init): # set up the job args mr_job = MRKmeans( args=['topUsers_Apr-Jul_2014_1000-words.txt', '--file=centroids.txt']) # initialize the centroids centroid_points = [] #k = 4 if init == 'A': centroid_points = startCentroidsA(k) print "(A) K=4 uniform random centroid-distributions over the 1000 words (generate 1000 random numbers and normalize the vectors)\n" elif init == 'B' or init == 'C': centroid_points = startCentroidsBC(k) print "(C) K=4 perturbation-centroids, randomly perturbed from the aggregated (user-wide) distribution\n" else: centroid_points = startCentroidsD(k) print "(D) K=4 \"trained\" centroids, determined by the sums across the classes\n" # write centroids to the expected file with open('centroids.txt', 'w+') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) f.close() # update centroids iteratively i = 0 code_clusters = [{}] * k while (1): # save previous centoids to check convergency centroid_points_old = centroid_points[:] print "iteration" + str(i) + ":" with mr_job.make_runner() as runner: runner.run() # stream_output: get access of the output for line in runner.stream_output(): key, values = mr_job.parse_output_line(line) #print key, values centroid = values[0] codes = values[1] centroid_points[key] = centroid code_clusters[key] = codes # Update the centroids for the next iteration with open('centroids.txt', 'w') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) print "\n" i = i + 1 if (stop_criterion(centroid_points_old, centroid_points, threshold)): break print "\nTotal iterations:", i max_vals = [] total_vals = [] print('\n%s\t%s\t\t%s\t\t%s\t\t%s\t\t%s') % ('cluster', 'human', 'cyborg', 'robot', 'spammer', 'total') print '=============================================================================' for idx, cluster in enumerate(code_clusters): zero_val = one_val = two_val = three_val = 0 total = float(sum(cluster.values())) if '0' in cluster.keys(): 
zero_val = cluster['0'] if '1' in cluster.keys(): one_val = cluster['1'] if '2' in cluster.keys(): two_val = cluster['2'] if '3' in cluster.keys(): three_val = cluster['3'] print('%d\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d') % ( idx, zero_val, (zero_val / total * 100), one_val, (one_val / total * 100), two_val, (two_val / total * 100), three_val, (three_val / total * 100), total) #purity = sum of the max points for each cluster divided by sum of total points in each cluster max_vals.append(max(cluster.values())) total_vals.append(sum(cluster.values())) purity = float(sum(max_vals)) / (sum(total_vals)) print "purity = %.2f%%" % (100 * purity)
#!/usr/bin/env python #START STUDENT CODE45_RUNNER import numpy as np import sys from Kmeans import MRKmeans, stop_criterion # set the randomizer seed so results are the same each time. np.random.seed(0) # define mrjob runner mr_job = MRKmeans( args=["topUsers_Apr-Jul_2014_1000-words.txt", '--file=Centroids.txt']) centroid_points = [] k = 4 class_codes = { '0.0': 'Human', '1.0': 'Cyborg', '2.0': 'Robot', '3.0': 'Spammer' } def startCentroidsBC(k): import re counter = 0 for line in open( "topUsers_Apr-Jul_2014_1000-words_summaries.txt").readlines(): if counter == 1: data = re.split(",", line) globalAggregate = [
from numpy import random
import numpy as np
from Kmeans import MRKmeans, stop_criterion
import sys
from custom_func import calc_purity

mr_job = MRKmeans(args=['topUsers_Apr-Jul_2014_1000-words.txt'])
random.seed(0)

# number of features (length of the word-count vectors)
n = 1000

# get centroid type and number of clusters from user
# NOTE(review): original formatting lost; it is unclear whether the
# cen_type assignment was inside or outside this `if` -- confirm against
# the original notebook
if len(sys.argv) > 2:
    k = int(sys.argv[2])
    cen_type = sys.argv[1]

# Generate initial centroids
centroid_points = []
# based on the centroid type generate centroids
if (cen_type == 'Uniform'):
    # k x n uniform randoms, each row normalized to sum to 1
    rand_int = random.uniform(size=[k, n])
    total = np.sum(rand_int, axis=1)
    centroid_points = (rand_int.T / total).T
    with open('Centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n'
                     for i in centroid_points)
    f.close()  # NOTE(review): redundant -- the `with` block already closed f
elif (cen_type == 'Perturbation'):
    # second line of the summaries file: the aggregate word distribution
    # NOTE(review): fragment truncated here; the 'Perturbation' branch
    # continues beyond the visible text
    data = [s.split('\n')[0].split(',') for s in open(
        "topUsers_Apr-Jul_2014_1000-words_summaries.txt").readlines()][1]
import numpy as np import sys from Kmeans import MRKmeans, stop_criterion # initialize variables SOURCE = "topUsers_Apr-Jul_2014_1000-words.txt" SUMMARY = "topUsers_Apr-Jul_2014_1000-words_summaries.txt" CENTROIDS = "/tmp/centroids" THRESHOLD = 0.001 # set the randomizer seed so results are the same each time. np.random.seed(0) # define mrjob runner mr_job = MRKmeans(args=[SOURCE]) # validate driver inputs - K and distribution type if len(sys.argv) != 3: print "Invalid number of arguments. Pass k (cluster size) and centroid distribution type (uniform, perturbed, normal)" sys.exit(1) k = sys.argv[1] try: k = int(k) except: raise TypeError("Invalid k. k must be an integer") distr_type = sys.argv[2] if distr_type not in ['uniform', 'perturbed', 'trained']: print "Invalid centroid distribution type. Type should be uniform, perturbed or trained." sys.exit(1)
# --- tail of an MRKmeans reducer, truncated at the top of this fragment ---
# NOTE(review): the enclosing method header and loop structure are not
# visible; the indentation below is a reconstruction -- confirm against
# the original Kmeans.py
        centroids[idx][1] = centroids[idx][1] + y
        # divide the accumulated sums by the cluster size to get the mean
        centroids[idx][0] = centroids[idx][0]/num[idx]
        centroids[idx][1] = centroids[idx][1]/num[idx]
        # emit (cluster index, new 2-D centroid)
        yield idx, (centroids[idx][0], centroids[idx][1])


if __name__ == '__main__':
    MRKmeans.run()


## Driver ##
# NOTE(review): the lines below are IPython notebook magics and driver
# code pasted after the job module -- not valid in a plain .py file
%reload_ext autoreload
%autoreload 2
from numpy import random
from Kmeans import MRKmeans, stop_criterion

mr_job = MRKmeans(args=['Kmeandata.csv', '--file=Centroids.txt'])

# Generate initial centroids: k random 2-D points in [-3, 3] x [-3, 3]
centroid_points = []
k = 3
for i in range(k):
    centroid_points.append([random.uniform(-3, 3), random.uniform(-3, 3)])
with open('Centroids.txt', 'w+') as f:
    f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)

# Initiate the W0, W1
# Update centroids iteratively
i = 0
while (1):
    # save previous centroids to check convergence
    # NOTE(review): fragment truncated here -- the loop body continues
    # beyond the visible text
# New Centroids = initial centroids # # While(1): # + Cacluate new centroids # + stop if new centroids close to old centroids # + Updates centroids # In[11]: #get_ipython().magic(u'reload_ext autoreload') #get_ipython().magic(u'autoreload 2') %reload_ext autoreload %autoreload 2 from numpy import random from Kmeans import MRKmeans, stop_criterion mr_job = MRKmeans(args=['Kmeandata.csv', '--file=Centroids.txt']) # training data, initial centriods coded below #Geneate initial centroids centroid_points = [] k = 3 for i in range(k): random.seed(8888) centroid_points.append([random.uniform(-3,3),random.uniform(-3,3)]) with open('Centroids.txt', 'w') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) # Update centroids iteratively i = 0 while(1): # save previous centoids to check convergency centroid_points_old = centroid_points[:] # store the current version of the centroids