def kmeans(data, numExamples, numClusters):
    """Run k-means on ``data`` and return the final mean squared error.

    Prototypes are seeded with ``numClusters`` distinct, randomly chosen
    examples.  Each pass assigns every example to its closest prototype
    (by ``utils.squareDistance``), measures the error, and then moves every
    prototype to the responsibility-weighted mean of its examples.

    Args:
        data: indexable collection of examples; only the first
            ``numExamples`` entries are used.
        numExamples: number of examples to cluster.
        numClusters: number of prototypes.

    Returns:
        The error of the last pass: the sum over examples of the squared
        distance to the assigned prototype, divided by ``numExamples``.

    Raises:
        ValueError: if ``numClusters`` exceeds ``numExamples`` (raised by
            ``random.sample``; the previous retry loop never terminated).

    NOTE(review): ``makeR``, ``listadd`` and ``listmult`` are defined
    outside this block; ``makeR(n, i)`` is assumed to build a length-``n``
    responsibility vector selecting cluster ``i`` -- confirm.
    """
    # Use the first example so a data set with a single example still works.
    dimension = len(data[0])

    # Seed the prototypes with distinct examples.
    seeds = random.sample(range(numExamples), numClusters)
    prototypes = [data[seed] for seed in seeds]

    errors = []
    counter = 0
    responsibilities = [None] * numExamples

    while True:
        # Assign every example to its closest prototype.
        for i in range(numExamples):
            distances = [utils.squareDistance(data[i], proto)
                         for proto in prototypes]
            responsibilities[i] = makeR(numClusters,
                                        distances.index(min(distances)))

        # Error of the current assignment.
        error = 0
        for j in range(numExamples):
            for k in range(numClusters):
                # float() keeps the division exact for integer inputs.
                error += (responsibilities[j][k]
                          * utils.squareDistance(data[j], prototypes[k])
                          / float(numExamples))

        # After more than 10 passes, stop as soon as the error is no better
        # than the one recorded 9 passes earlier.
        if counter > 10 and error >= errors[counter - 9]:
            return error

        errors.append(error)
        counter += 1

        # Move each prototype to the weighted mean of its examples.
        for cluster in range(numClusters):
            topsum = [0] * dimension
            bottomsum = 0
            for m in range(numExamples):
                myrespon = responsibilities[m][cluster]
                topsum = listadd(topsum, listmult(data[m], myrespon))
                bottomsum += myrespon
            if bottomsum == 0:
                # No example belongs to this cluster: report it and keep the
                # previous prototype instead of dividing by zero.
                print("Cluster number %s is obsolete" % (cluster,))
                continue
            prototypes[cluster] = listmult(topsum, 1.0 / bottomsum)
def kmeans(data, k):
    """Cluster ``data`` into ``k`` groups and print the MSE and cluster means.

    Prototypes start as ``k`` distinct random members of ``data``.  Each
    pass assigns every example to its closest prototype (by
    ``utils.squareDistance``) and moves each prototype to the mean of its
    examples.  Iteration stops once no coordinate of any prototype moved by
    0.01 or more in a pass.

    Args:
        data: sequence of equal-length numeric vectors.
        k: number of clusters.

    Returns:
        None.  The mean squared error and the final prototypes are printed.

    Raises:
        ValueError: if ``k`` is larger than ``len(data)`` (from
            ``random.sample``).
    """
    # Stop once every prototype coordinate moved by less than this amount.
    threshold = 0.01

    prototypes = random.sample(data, k)
    # assignments[i] is the index of the prototype closest to data[i].
    assignments = [None] * len(data)

    while True:
        # Update responsibilities.
        for i, example in enumerate(data):
            distances = [utils.squareDistance(example, proto)
                         for proto in prototypes]
            assignments[i] = distances.index(min(distances))

        # Update prototypes, tracking the largest move of ANY prototype
        # (previously only the last prototype's shift was kept).
        largest_shift = 0.0
        for p, old_center in enumerate(prototypes):
            members = [example
                       for example, owner in zip(data, assignments)
                       if owner == p]
            if not members:
                # Empty cluster: keep its prototype rather than divide by 0.
                continue
            sums = [0.0] * len(old_center)
            for example in members:
                sums = [a + b for a, b in zip(sums, example)]
            new_center = [total / len(members) for total in sums]
            shifts = [math.fabs(a - b)
                      for a, b in zip(old_center, new_center)]
            largest_shift = max([largest_shift] + shifts)
            prototypes[p] = new_center

        if largest_shift < threshold:
            break

    sq_err = 0.0
    for example, owner in zip(data, assignments):
        sq_err += utils.squareDistance(example, prototypes[owner])
    count = len(data)
    # Report 0.0 for an empty data set instead of dividing by zero.
    mse = sq_err / count if count else 0.0

    print("\nMSE: {}".format(mse))
    print("\n***CLUSTER MEANS***\n")
    for p in range(len(prototypes)):
        print("CLUSTER {}: {}\n".format(p + 1, prototypes[p]))
def mean_squared_by_cluster(clusters):
    """Return the summed squared distance to cluster means, per cluster.

    For every cluster the mean is obtained from ``mean_of_cluster`` and the
    ``utils.squareDistance`` of each member to that mean is accumulated.
    The grand total is divided by the number of clusters.

    Args:
        clusters: sequence of clusters, each an iterable of points.

    Returns:
        The total squared distance divided by ``len(clusters)``, or ``0.0``
        when ``clusters`` is empty.
    """
    if not clusters:
        return 0.0
    tot = 0
    for clust in clusters:
        clust_mean = mean_of_cluster(clust)
        for x in clust:
            tot += utils.squareDistance(x, clust_mean)
    # The original divided by len(cluster), an undefined name (NameError).
    return 1.0 / len(clusters) * tot
def MSEforDirection(directions, pCentroids, pMeasure, distrib, dataset):
    """
        Estimate one MSE value per entry of ``directions`` from a shared
        set of random points.

        A centroid table is built once per direction with ``core.centroids``.
        Then ``pMeasure`` points are drawn with ``utils.f``; each point is
        located in every direction with ``utils.findRegion`` and, when that
        region has an entry in the direction's table, its squared distance
        to the stored centroid is accumulated. Points whose region has no
        entry are ignored for that direction.

        Returns an array holding, for each direction, the accumulated
        squared distance divided by the length of the last drawn point and
        by the number of points that were counted for that direction.
    """
    nDirections = len(directions)
    directionsMSE = np.zeros(nDirections)
    numberOfPointsUsed = np.zeros(nDirections)

    # one table of (region, centroid) rows per direction
    regionsWithCentroids = [core.centroids(candidate, pCentroids, distrib)
                            for candidate in directions]

    for _ in range(pMeasure):
        # draw one random point, shared by all directions
        x = utils.f(len(directions[0][0]) - 1, distrib, dataset)
        for i, candidate in enumerate(directions):
            r = utils.findRegion(x, candidate)
            table = regionsWithCentroids[i]
            for j in range(len(table)):
                if np.all(table[j, 0] == r):
                    # first matching row wins
                    directionsMSE[i] += utils.squareDistance(x, table[j, 1])
                    numberOfPointsUsed[i] += 1.
                    break

    directionsMSE /= float(len(x))
    directionsMSE /= numberOfPointsUsed
    return directionsMSE
def MSE(hyperplanes, pCentroids, pMeasure, distrib, dataset):
    """
        Estimate the MSE for the regions delimited by ``hyperplanes``.

        ``pCentroids`` is handed to ``core.centroids``, which returns the
        table of (region, centroid) rows; ``pMeasure`` is the number of
        points drawn with ``utils.f`` to measure the error. A drawn point
        only contributes when its region (from ``utils.findRegion``) has a
        row in that table.

        Returns the accumulated squared distance divided by the length of
        the last drawn point and by the number of contributing points.
    """
    regionsWithCentroids = core.centroids(hyperplanes, pCentroids, distrib)
    error = 0.
    numberOfPointsUsed = 0

    for _ in range(pMeasure):
        x = utils.f(len(hyperplanes[0]) - 1, distrib, dataset)
        r = utils.findRegion(x, hyperplanes)
        for j in range(len(regionsWithCentroids)):
            if np.all(regionsWithCentroids[j, 0] == r):
                # first matching row wins
                error += utils.squareDistance(x, regionsWithCentroids[j, 1])
                numberOfPointsUsed += 1
                break

    error /= float(len(x))
    error /= float(numberOfPointsUsed)
    return error
def k_means(xs, numExamples, numClusters):
    """Run k-means on ``xs`` and print the mean squared error of each pass.

    The means start as ``numClusters`` distinct random members of ``xs``.
    Every example is first assigned to its closest mean.  Each pass then
    re-estimates the means from the current memberships and re-assigns the
    examples; the loop ends when a pass moves no example to another cluster.

    The previous version re-assigned the examples *before* re-estimating the
    means, i.e. against the very means used for the initial assignment, so
    no example could ever change cluster and the loop always stopped after a
    single mean update.

    Args:
        xs: sequence of examples.
        numExamples: number of examples of ``xs`` to cluster.
        numClusters: number of clusters.

    Returns:
        None; progress is reported on stdout only.

    NOTE(review): ``zero_list``, ``avg``, ``checkMembership`` and
    ``mean_squared`` are defined outside this block.  ``avg`` can now be
    called for a cluster that lost all its members -- confirm it copes.
    """
    # Initialise the means as randomly chosen members of the input.
    means = random.sample(xs, numClusters)

    def closest_mean(n):
        # Index of the mean minimising the squared distance to example n.
        return utils.argmin_index(
            means, lambda y: utils.squareDistance(xs[n], y))

    # Indicator matrix: rlist[n][k] is 1 if example n is in cluster k, else 0.
    rlist = []
    for n in range(numExamples):
        row = zero_list(numClusters)
        row[closest_mean(n)] = 1
        rlist.append(row)

    converged = False
    while not converged:
        # Re-estimate the means first so the re-assignment below is made
        # against updated means.
        for k in range(numClusters):
            means[k] = avg(rlist, xs, numExamples, k)

        converged = True  # cleared again if any example changes cluster
        for n in range(numExamples):
            k_old = numpy.argmax(rlist[n])
            k = closest_mean(n)
            rlist[n] = zero_list(numClusters)
            rlist[n][k] = 1
            # As long as one example changes cluster we have not converged.
            if k_old != k:
                converged = False

        # Sanity check on the indicator matrix.
        assert checkMembership(rlist)

        print("---------")
        # Two spaces reproduce the original Python 2 ``print a, b`` output.
        print("mean_squared is  %s" % (mean_squared(means, rlist, xs),))
def kMeans(data, numClusters):
    """Cluster ``data`` with k-means.

    Prototypes are seeded with randomly chosen examples (repeats are
    possible).  Each pass assigns every example to its closest prototype (by
    ``utils.squareDistance``) and then recentres every prototype on the mean
    of its examples.  Iteration stops after a pass in which no example is
    assigned to a cluster it was not already in.

    Args:
        data: sequence of equal-length numeric vectors.
        numClusters: number of prototypes.

    Returns:
        A tuple ``(muList, ave_msd)``.  ``muList`` holds one prototype per
        cluster, or the string ``'Empty'`` for a cluster that ended with no
        examples.  ``ave_msd`` is the average, over the examples, of the
        squared distance to the closest prototype as measured at the start
        of the last pass (``0.0`` for empty ``data``).

    Raises:
        ValueError: if clusters are requested but ``data`` is empty.
    """
    numExamples = len(data)
    if numClusters > 0 and numExamples == 0:
        raise ValueError("kMeans needs at least one example to seed clusters")

    # random.randint includes BOTH end points, so the upper bound has to be
    # len(data) - 1; the previous bound could index one past the end.
    muList = [data[random.randint(0, numExamples - 1)]
              for _ in range(numClusters)]
    r_vectors = [[0] * numClusters for _ in range(numExamples)]

    ave_msd = 0.0
    somethingChanged = True
    while somethingChanged:
        somethingChanged = False
        total = 0.0
        for i, example in enumerate(data):
            # Give emptied clusters an infinite distance rather than skipping
            # them, so positions in ``distances`` stay aligned with cluster
            # indices.
            distances = [float('inf') if isinstance(muVec, str)
                         else utils.squareDistance(example, muVec)
                         for muVec in muList]
            minDist = min(distances)
            total += minDist
            k = distances.index(minDist)
            if r_vectors[i][k] != 1:
                # The example has been reassigned to a different cluster.
                somethingChanged = True
                r_vectors[i] = [1 if j == k else 0
                                for j in range(numClusters)]
        ave_msd = total / numExamples if numExamples else 0.0

        # Recentre the prototype vectors.
        for k in range(numClusters):
            members = [data[i] for i in range(numExamples)
                       if r_vectors[i][k] == 1]
            if members:
                count = float(len(members))
                muList[k] = [sum(column) / count for column in zip(*members)]
            else:
                muList[k] = 'Empty'
    return muList, ave_msd
def MSE(regions, germs, pMeasure, distrib, dataset):
    '''
        Approximate the mean squared error from ``pMeasure`` random points
        drawn with ``utils.f``.

        Each point is located with ``findRegion`` and its squared distance
        to the germ of that region is accumulated. The total is divided by
        the dimension (``len(regions[0, 0]) - 1``) and by ``pMeasure``.
    '''
    nDimensions = len(regions[0, 0]) - 1
    squaredTotal = 0.
    for _ in range(pMeasure):
        point = utils.f(nDimensions, distrib, dataset)
        regionIndex = findRegion(point, regions)
        squaredTotal += utils.squareDistance(point, germs[regionIndex])
    squaredTotal /= float(nDimensions)
    squaredTotal /= float(pMeasure)
    return squaredTotal
def mean_squared(means, rlist, xs):
    """Return the membership-weighted squared distance averaged over ``xs``.

    Every (example, mean) pair contributes ``rlist[n][k]`` times the
    ``utils.squareDistance`` between ``xs[n]`` and ``means[k]``; the sum is
    divided by ``len(xs)``.
    """
    weighted_total = 0
    for n, example in enumerate(xs):
        for k, center in enumerate(means):
            weighted_total += rlist[n][k] * utils.squareDistance(example, center)
    return weighted_total / len(xs)
def Distance(xs, ys):
    """Return the square root of ``utils.squareDistance(xs, ys)``."""
    squared = utils.squareDistance(xs, ys)
    return math.sqrt(squared)