# The leading `return centroids` closes a function that begins above this
# span; everything after it is driver code.
#
# Driver fragment: seeds Centroids.txt with the centroids returned by
# startCentroidsBC(k) (one comma-separated line per centroid), then loops:
#   1. snapshot the current centroids into centroid_points_old,
#   2. run the MR job through mr_job.make_runner(),
#   3. rebuild centroid_points and clusters from every parsed output record,
#      where each record's value unpacks into (centroid, codes),
#   4. rewrite Centroids.txt with the new centroids.
# NOTE(review): no loop exit (convergence test / break) is visible in this
# span -- confirm that it follows.
# NOTE(review): the f.close() after the first `with` block is redundant; the
# context manager has already closed the file.
return centroids # write initial centroids to file centroid_points = startCentroidsBC(k) with open('Centroids.txt', 'w+') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) f.close() # Update centroids iteratively i = 0 while (1): # save previous centoids to check convergency centroid_points_old = centroid_points[:] print "iteration" + str(i) + ":" with mr_job.make_runner() as runner: runner.run() centroid_points = [] clusters = {} # stream_output: get access of the output for line in runner.stream_output(): key, value = mr_job.parse_output_line(line) centroid, codes = value centroid_points.append(centroid) clusters[key] = codes # Update the centroids for the next iteration with open('Centroids.txt', 'w') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)
def _write_centroids(path, centroids):
    """Overwrite *path* with one comma-separated line per centroid."""
    with open(path, 'w') as handle:
        handle.writelines(
            ','.join(str(coord) for coord in centroid) + '\n'
            for centroid in centroids)


def _print_purity_report(code_clusters):
    """Print the per-cluster class breakdown followed by the overall purity.

    Each entry of *code_clusters* is a dict keyed by the class codes
    '0'..'3' (printed as human, cyborg, robot, spammer) holding counts.
    A cluster with no counts is reported as all zeros instead of raising.
    """
    class_codes = ('0', '1', '2', '3')
    row_format = ('%d\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)'
                  '\t%d (%.2f%%)\t%d')

    print('\n%s\t%s\t\t%s\t\t%s\t\t%s\t\t%s' % (
        'cluster', 'human', 'cyborg', 'robot', 'spammer', 'total'))
    print('=============================================================================')

    max_vals = []
    total_vals = []
    for idx, cluster in enumerate(code_clusters):
        cluster_total = sum(cluster.values())
        total = float(cluster_total)

        row = [idx]
        for code in class_codes:
            count = cluster.get(code, 0)
            # An empty cluster has total == 0; report 0% rather than divide.
            share = count / total * 100 if total else 0.0
            row.extend((count, share))
        row.append(total)
        print(row_format % tuple(row))

        # purity = sum of the largest class count per cluster divided by the
        # sum of all counts; an empty cluster contributes nothing to either.
        max_vals.append(max(cluster.values()) if cluster else 0)
        total_vals.append(cluster_total)

    grand_total = sum(total_vals)
    purity = float(sum(max_vals)) / grand_total if grand_total else 0.0
    print("purity = %.2f%%" % (100 * purity))


def kmeans_driver(threshold, k, init):
    """Run the MRKmeans job until convergence, then print a purity report.

    Args:
        threshold: forwarded to stop_criterion() together with the previous
            and current centroids to decide when to stop iterating.
        k: number of centroids requested from the initialiser.
        init: initialisation scheme. 'A' uses startCentroidsA, 'B' and 'C'
            both use startCentroidsBC (and share the "(C)" banner), and any
            other value uses startCentroidsD.

    Side effects:
        Rewrites 'centroids.txt' before the first run and after every
        iteration (the job is configured with '--file=centroids.txt'), and
        prints progress, the iteration count and the purity table.
    """
    # set up the job args
    mr_job = MRKmeans(
        args=['topUsers_Apr-Jul_2014_1000-words.txt', '--file=centroids.txt'])

    # initialize the centroids; the banners report the k actually requested
    if init == 'A':
        centroid_points = startCentroidsA(k)
        print("(A) K=%s uniform random centroid-distributions over the 1000 words (generate 1000 random numbers and normalize the vectors)\n" % k)
    elif init in ('B', 'C'):
        centroid_points = startCentroidsBC(k)
        print("(C) K=%s perturbation-centroids, randomly perturbed from the aggregated (user-wide) distribution\n" % k)
    else:
        centroid_points = startCentroidsD(k)
        print("(D) K=%s \"trained\" centroids, determined by the sums across the classes\n" % k)

    # write centroids to the file the job expects
    _write_centroids('centroids.txt', centroid_points)

    # One independent dict per cluster: [{}] * k would alias a single dict.
    code_clusters = [{} for _ in range(k)]

    # update centroids iteratively
    iterations = 0
    while True:
        # A shallow copy is enough for the convergence check because the loop
        # below replaces list elements instead of mutating them in place.
        previous_centroids = centroid_points[:]
        print("iteration" + str(iterations) + ":")
        with mr_job.make_runner() as runner:
            runner.run()
            # stream_output: get access of the output
            for line in runner.stream_output():
                key, values = mr_job.parse_output_line(line)
                # values[0] is the updated centroid, values[1] its class
                # counts; key is used as the index into both lists.
                centroid_points[key] = values[0]
                code_clusters[key] = values[1]

        # publish the centroids for the next iteration
        _write_centroids('centroids.txt', centroid_points)
        print("\n")

        iterations += 1
        if stop_criterion(previous_centroids, centroid_points, threshold):
            break

    print("\nTotal iterations: %d" % iterations)
    _print_purity_report(code_clusters)
# Fragment: divides every value in cluster[3:] by the total held in
# cluster[2] and appends the resulting feature vector to centroid_points;
# writes all centroids to Centroids.txt (one comma-separated line each),
# prints cen_type, then iterates the MR job. Each pass snapshots the current
# centroids, rebuilds centroid_points and cluster_dist from the job output
# (each record's value unpacks into (centroid, codes)), and stops once
# stop_criterion(old, new, 0.001) returns true.
# NOTE(review): the first statements presumably sit inside a loop over input
# rows whose header is above this span -- confirm against the full file.
# NOTE(review): unlike the other driver loops, Centroids.txt is not rewritten
# inside the visible loop -- confirm whether the job reads updated centroids
# some other way.
# NOTE(review): the f.close() after the `with` block is redundant.
total = int(cluster[2])#get the total count of words feature = map(lambda x:((1.0 * float(x)) / total),cluster[3:]) #normalise centroid_points.append(feature) with open('Centroids.txt', 'w+') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) f.close() print 'Centroid Type: %s' %cen_type # Update centroids iteratively i = 0 while(1): # save previous centoids to check convergency centroid_points_old = centroid_points[:] print "iteration"+str(i)+":" with mr_job.make_runner() as runner: centroid_points = [] cluster_dist ={} runner.run() # stream_output: get access of the output for line in runner.stream_output(): key,value = mr_job.parse_output_line(line) centroid, codes = value centroid_points.append(centroid) cluster_dist[key]=codes i = i + 1 #check if we have convergence if(stop_criterion(centroid_points_old,centroid_points,0.001)): break