def xy_proj0(t, east, west, c, data, z, x, y, count, tz):
    """Project a test row and every row of data[z] onto the east-west axis.

    A row at distance a from the east pivot and b from the west pivot is
    placed with the law of cosines: x = (a^2 + c^2 - b^2) / (2c) is its
    position along the east-west line and y = sqrt(a^2 - x^2) is its offset
    from that line.

    t     -- index of the test row inside data[tz].
    east  -- index in data[z] of the east pivot row.
    west  -- index in data[z] of the west pivot row.
    c     -- distance between the two pivot rows.
    data  -- tables of rows, indexed by table key.
    z     -- key of the table whose rows are projected.
    x, y  -- lists indexed like data[z]; filled in place with each row's
             coordinates.
    count -- unused; kept so existing callers do not break.
    tz    -- key of the table holding the test row.

    returns -- the xy object built from t and the test row's coordinates,
               after every projected row has been passed to its keep().

    dist, indep, nump, row and xy are not defined in this function; they are
    looked up in the enclosing module.
    """
    some = 0.000001  # keeps the denominator non-zero when c is 0
    rows = data[z]
    east_row = rows[east]
    west_row = rows[west]

    def project(a, b):
        # Law-of-cosines placement of a point a from east and b from west.
        px = (a ** 2 + c ** 2 - b ** 2) / (2 * c + some)
        return px, (a ** 2 - px ** 2) ** 0.5

    test_row = data[tz][t]
    ta = dist(test_row, east_row, data, z, indep, nump)
    tb = dist(test_row, west_row, data, z, indep, nump)
    tx, ty = project(ta, tb)
    xyobj = xy(t, tx, ty)

    # enumerate() supplies each row's own position. Looking it up with
    # rows.index(d) cost a full scan per row and returned the first equal
    # row, so a row equal to an earlier one never had its x/y slot filled.
    for ind, d in enumerate(rows):
        a = dist(d, east_row, data, z, indep, nump)
        b = dist(d, west_row, data, z, indep, nump)
        x[ind], y[ind] = project(a, b)
        r = row(d)
        r.x = x[ind]
        r.y = y[ind]
        xyobj.keep(r)
    return xyobj
def xycalc(z):
    """Return one row object with projected x/y coordinates per row of data[z].

    Two pivot rows are chosen: "east" is the row furthest() reports for a
    starting index obtained from anyi(), and "west" is the row furthest()
    reports for east. With c the distance between the pivots, a row at
    distance a from east and b from west is placed with the law of cosines:
    x = (a^2 + c^2 - b^2) / (2c) and y = sqrt(a^2 - x^2).

    z -- key of the table in the module-level data to project.

    returns -- list of xy.row objects, one per row of data[z] in table
               order, each with its x and y attributes set.

    data, anyi, furthest, dist, indep, nump and xy are not defined in this
    function; they are looked up in the enclosing module.
    """
    some = 0.00001  # keeps the denominator non-zero when c is 0
    table = data[z]

    # Pick any row as the starting point; anyi() may hand back
    # len(table), which is pulled back onto the last valid index.
    start = anyi(table)
    if start == len(table):
        start -= 1

    # Find the two pivots.
    east = furthest(start, data, z)
    inde = table.index(east)
    west = furthest(inde, data, z)
    indw = table.index(west)
    c = dist(table[inde], table[indw], z, indep, nump)

    rows = []
    for d in table:
        a = dist(d, table[inde], z, indep, nump)
        b = dist(d, table[indw], z, indep, nump)
        # The law of cosines divides by 2*c. The earlier 2**c raised two to
        # the power c, which scaled x by an unrelated exponential.
        px = (a**2 + c**2 - b**2) / (2 * c + some)
        py = (a**2 - px**2)**0.5
        r = xy.row(d)
        r.x = px
        r.y = py
        rows.append(r)
    return rows
def xycalc(z):
    """Return one row object with projected x/y coordinates per row of data[z].

    Two pivot rows are chosen: "east" is the row furthest() reports for a
    starting index obtained from anyi(), and "west" is the row furthest()
    reports for east. With c the distance between the pivots, a row at
    distance a from east and b from west is placed with the law of cosines:
    x = (a^2 + c^2 - b^2) / (2c) and y = sqrt(a^2 - x^2).

    z -- key of the table in the module-level data to project.

    returns -- list of xy.row objects, one per row of data[z] in table
               order, each with its x and y attributes set.

    data, anyi, furthest, dist, indep, nump and xy are not defined in this
    function; they are looked up in the enclosing module.
    """
    some = 0.00001  # keeps the denominator non-zero when c is 0
    table = data[z]

    # Pick any row as the starting point; anyi() may hand back
    # len(table), which is pulled back onto the last valid index.
    start = anyi(table)
    if start == len(table):
        start -= 1

    # Find the two pivots.
    east = furthest(start, data, z)
    inde = table.index(east)
    west = furthest(inde, data, z)
    indw = table.index(west)
    c = dist(table[inde], table[indw], z, indep, nump)

    rows = []
    for d in table:
        a = dist(d, table[inde], z, indep, nump)
        b = dist(d, table[indw], z, indep, nump)
        # The law of cosines divides by 2*c. The earlier 2**c raised two to
        # the power c, which scaled x by an unrelated exponential.
        px = (a**2 + c**2 - b**2) / (2 * c + some)
        py = (a**2 - px**2)**0.5
        r = xy.row(d)
        r.x = px
        r.y = py
        rows.append(r)
    return rows
def project0(east, west, data, z, x, y, count):
    """Fill x and y in place with each data[z] row's projected coordinates.

    With c the distance between the east and west pivot rows, a row at
    distance a from east and b from west is placed with the law of cosines:
    x = (a^2 + c^2 - b^2) / (2c) and y = sqrt(a^2 - x^2).

    If a row turns out to be more than 5% farther than c from east, the
    projection restarts with that row as the new west pivot; if it is more
    than 5% farther than c from west, it restarts with that row as the new
    east pivot. A "+" is printed once per (re)start.

    east, west -- indices in data[z] of the current pivot rows.
    data       -- tables of rows, indexed by table key.
    z          -- key of the table to project.
    x, y       -- lists indexed like data[z]; written in place.
    count      -- unused here; only passed along on restart.

    returns -- None.

    dist, indep and nump are not defined in this function; they are looked
    up in the enclosing module.
    """
    print("+")
    bigger = 1.05  # tolerated overshoot before a row replaces a pivot
    some = 0.000001  # keeps the denominator non-zero when c is 0
    rows = data[z]
    east_row = rows[east]
    west_row = rows[west]
    c = dist(east_row, west_row, data, z, indep, nump)

    # enumerate() supplies each row's own position. Looking it up with
    # rows.index(d) cost a full scan per row and returned the first equal
    # row, so a row equal to an earlier one never had its x/y slot filled.
    for ind, d in enumerate(rows):
        a = dist(d, east_row, data, z, indep, nump)
        b = dist(d, west_row, data, z, indep, nump)
        if a > c * bigger:
            return project0(east, ind, data, z, x, y, count)
        if b > c * bigger:
            return project0(ind, west, data, z, x, y, count)
        x[ind] = (a**2 + c**2 - b**2) / (2 * c + some)
        y[ind] = (a**2 - x[ind]**2)**0.5
def extend_path(path, ps, lookahead=0):
    """Move the point of ps that best extends path onto the matching end of path.

    Every point of ps is tried at both ends of path. With lookahead == 0 a
    candidate is scored by dist() between the point and that end of path.
    With lookahead > 0 it is scored by tour_length() of the tour that
    nn_tour() builds from a copy of path extended by the point and a copy of
    ps without it, called with lookahead - 1. The lowest score wins (the
    first one found on ties); that point is inserted at the chosen end of
    path and deleted from ps.

    path -- list of points, modified in place; must not be empty.
    ps -- list of candidate points, modified in place. When it is empty
          there is nothing to add and both lists are left untouched.
    lookahead -- number of lookahead levels; 0 scores by distance only.

    returns -- None.
    """
    def update_path(end, cur_path, p):
        "Add p to the appropriate end of cur_path."
        # Works from its own arguments; it used to read i, j and ps from the
        # enclosing scope and only worked because callers passed those same
        # values.
        if end == 0:
            cur_path.insert(0, p)
        else:
            cur_path.append(p)

    if not ps:
        # No candidates: previously this fell through to unpacking None.
        return

    min_dist = None
    min_indices = None
    for i, p in enumerate(ps):
        for j in (0, -1):
            if lookahead > 0:
                tmp_path = list(path)
                update_path(j, tmp_path, p)
                tmp_ps = list(ps)
                del tmp_ps[i]
                tour = nn_tour(tmp_path, tmp_ps, lookahead=lookahead - 1)
                d = tour_length(tour)
            else:
                d = dist(p, path[j])
            if min_dist is None or d < min_dist:
                min_dist = d
                min_indices = (i, j)

    i, j = min_indices
    update_path(j, path, ps[i])
    del ps[i]
def xy_proj(z, data, t, tz, check):
    """Project data[z] and test row t, then reduce to the leaf nearest the test row.

    Picks an east and a west pivot row with anyi() and furthest(), hands the
    projection to xy_proj0(), numbers the tiles the resulting object yields
    from tiles(20, 4, 0, 999999), builds a leaf table from them with
    leaftab(), and asks nearleaf() for the close leaf.

    z     -- key of the table in data to project.
    data  -- tables of rows, indexed by table key.
    t     -- index of the test row inside data[tz].
    tz    -- key of the table holding the test row.
    check -- diagnostics are printed only when check == True.

    returns -- the result of out_reduced(leaves, close).

    xy_d and count are not defined in this function; like the helper
    functions, indep and nump, they are looked up in the enclosing module.
    """
    table = data[z]
    size = len(table)

    # anyi() may hand back len(table); pull it back onto the last index.
    start = anyi(table)
    if start == size:
        start -= 1

    x = [0] * size
    y = [0] * size

    east = furthest(start, data, z)
    inde = table.index(east)
    west = furthest(inde, data, z)
    indw = table.index(west)
    c = dist(table[inde], table[indw], data, z, indep, nump)

    xyobj = xy_proj0(t, inde, indw, c, data, z, x, y, count, tz)
    leaves = dict(enumerate(xyobj.tiles(20, 4, 0, 999999)))

    # Kept as an equality test on purpose: a merely truthy value such as 2
    # or "yes" must not switch the diagnostics on.
    verbose = check == True

    if verbose:
        # One pre-joined string per print so the output is the same whether
        # print is the Python 2 statement or the Python 3 function.
        print(" ".join(["nearest d", str(xy_d)]))
        print(" ".join(["test row:", str(xyobj.trow.x), str(xyobj.trow.y)]))

    ltab = leaftab(leaves)
    if verbose:
        printltab(ltab)

    close = nearleaf(ltab, xyobj)
    if verbose:
        checkie(leaves, ltab, close, data, tz, t)

    return out_reduced(leaves, close)
def nearestn(zlst, near, e):
    """Trim each table named in zlst[1:] to the rows nearest expected1(z, e).

    For every table key z after the first, each row's distance to
    expected1(z, e) is measured with dist() (the original comment calls this
    point the centroid). The rows are ranked by that distance and the first
    floor(near * len(table) / 100) + 1 of them are kept, or all of them if
    there are fewer. The table is then emptied with removeData(z) and the
    kept rows are put back with addRow(), nearest first.

    zlst -- list of table keys. NOTE(review): zlst[0] is skipped, as in the
            earlier code; confirm that this is intended.
    near -- percentage of each table's rows to keep.
    e    -- passed through to expected1().

    returns -- zlst, the same list object.

    data, dist, expected1, indep, nump, removeData and addRow are not
    defined in this function; they are looked up in the enclosing module.
    """
    for z in zlst[1:]:
        table = data[z]
        l = len(table)
        # NOTE(review): expected1(z, e) looks loop-invariant; it is left
        # inside the loop because its purity cannot be confirmed from here.
        dists = [dist(expected1(z, e), rec, z, indep, nump) for rec in table]

        # Rank row positions rather than distance values. Mapping a sorted
        # distance back with dists.index(d) returned the first row for every
        # tied distance, so that row was kept several times and the other
        # tied rows were dropped.
        order = sorted(range(l), key=lambda pos: dists[pos])

        limit = (near * l) / 100
        temp_data = []
        for k, pos in enumerate(order):
            if k > limit:
                break
            temp_data.append(table[pos])

        # Replace the table's contents with the kept rows.
        removeData(z)
        for r in temp_data:
            addRow(r, z)
    return zlst
def project0(east, west, data, z, x, y, count):
    """Fill x and y in place with each data[z] row's projected coordinates.

    With c the distance between the east and west pivot rows, a row at
    distance a from east and b from west is placed with the law of cosines:
    x = (a^2 + c^2 - b^2) / (2c) and y = sqrt(a^2 - x^2).

    If a row turns out to be more than 5% farther than c from east, the
    projection restarts with that row as the new west pivot; if it is more
    than 5% farther than c from west, it restarts with that row as the new
    east pivot. A "+" is printed once per (re)start.

    east, west -- indices in data[z] of the current pivot rows.
    data       -- tables of rows, indexed by table key.
    z          -- key of the table to project.
    x, y       -- lists indexed like data[z]; written in place.
    count      -- unused here; only passed along on restart.

    returns -- None.

    dist, indep and nump are not defined in this function; they are looked
    up in the enclosing module.
    """
    print("+")
    bigger = 1.05  # tolerated overshoot before a row replaces a pivot
    some = 0.000001  # keeps the denominator non-zero when c is 0
    rows = data[z]
    east_row = rows[east]
    west_row = rows[west]
    c = dist(east_row, west_row, data, z, indep, nump)

    # enumerate() supplies each row's own position. Looking it up with
    # rows.index(d) cost a full scan per row and returned the first equal
    # row, so a row equal to an earlier one never had its x/y slot filled.
    for ind, d in enumerate(rows):
        a = dist(d, east_row, data, z, indep, nump)
        b = dist(d, west_row, data, z, indep, nump)
        if a > c * bigger:
            return project0(east, ind, data, z, x, y, count)
        if b > c * bigger:
            return project0(ind, west, data, z, x, y, count)
        x[ind] = (a**2 + c**2 - b**2) / (2 * c + some)
        y[ind] = (a**2 - x[ind]**2)**0.5
def nearestn(zlst, near, e):
    """Trim each table named in zlst[1:] to the rows nearest expected1(z, e).

    For every table key z after the first, each row's distance to
    expected1(z, e) is measured with dist() (the original comment calls this
    point the centroid). The rows are ranked by that distance and the first
    floor(near * len(table) / 100) + 1 of them are kept, or all of them if
    there are fewer. The table is then emptied with removeData(z) and the
    kept rows are put back with addRow(), nearest first.

    zlst -- list of table keys. NOTE(review): zlst[0] is skipped, as in the
            earlier code; confirm that this is intended.
    near -- percentage of each table's rows to keep.
    e    -- passed through to expected1().

    returns -- zlst, the same list object.

    data, dist, expected1, indep, nump, removeData and addRow are not
    defined in this function; they are looked up in the enclosing module.
    """
    for z in zlst[1:]:
        table = data[z]
        l = len(table)
        # NOTE(review): expected1(z, e) looks loop-invariant; it is left
        # inside the loop because its purity cannot be confirmed from here.
        dists = [dist(expected1(z, e), rec, z, indep, nump) for rec in table]

        # Rank row positions rather than distance values. Mapping a sorted
        # distance back with dists.index(d) returned the first row for every
        # tied distance, so that row was kept several times and the other
        # tied rows were dropped.
        order = sorted(range(l), key=lambda pos: dists[pos])

        limit = (near * l) / 100
        temp_data = []
        for k, pos in enumerate(order):
            if k > limit:
                break
            temp_data.append(table[pos])

        # Replace the table's contents with the kept rows.
        removeData(z)
        for r in temp_data:
            addRow(r, z)
    return zlst
def computeWSS(centroids, clusters, dist=dist.euclidiandist):
    '''Return the within-cluster sum of squares (WSS).

    Every point's distance to the centroid of its own cluster is squared,
    and the squares are added up over all clusters.

    centroids -- list of records; centroids[i] is the centre of clusters[i].
    clusters -- list of lists of indexes into the module-level data, one
                inner list per cluster.
    dist -- distance function called as dist(centroid, record). Defaults to
            dist.euclidiandist.

    returns -- the WSS as a float (0.0 when there are no points).
    '''
    within = 0.0
    for cluster_no, members in enumerate(clusters):
        for point_index in members:
            gap = dist(centroids[cluster_no], data[point_index])
            # Accumulate term by term so the floating-point result does not
            # depend on how a library routine would order the additions.
            within += math.pow(gap, 2)
    return within
def computeBSS(centroids, clusters, dist=dist.euclidiandist):
    '''Return the between-cluster sum of squares (BSS).

    The centroid of the whole module-level data set is computed as the
    column-wise mean of its records. Each cluster then contributes its size
    times the squared distance between that overall centroid and the
    cluster's own centroid.

    centroids -- list of records; centroids[i] is the centre of clusters[i].
    clusters -- list of lists of indexes into the module-level data, one
                inner list per cluster.
    dist -- distance function called as dist(overall_centroid, centroid).
            Defaults to dist.euclidiandist.

    returns -- the BSS as a float.
    '''
    # Column-wise mean of every record; the first record fixes the width.
    overall_centroid = []
    for column in range(len(data[0])):
        column_total = 0.0
        for record in data:
            column_total += record[column]
        overall_centroid.append(column_total / len(data))

    # Weight each cluster's squared offset by the number of points in it.
    between = 0.0
    for cluster_no, members in enumerate(clusters):
        offset = dist(overall_centroid, centroids[cluster_no])
        between += len(members) * math.pow(offset, 2)
    return between
def neighbors(t, data, z, lst):
    """Record the distance from t to every row of data[z] and rank the records.

    For each row d of data[z] a dict {'x': distance, 'd': d} is appended to
    lst, where the distance is dist(t, d, data, z, indep[z], nump[z]).

    t    -- the row to measure from.
    data -- tables of rows, indexed by table key.
    z    -- key of the table whose rows are measured.
    lst  -- list the records are appended to, modified in place. Records
            already in it take part in the ranking as well.

    returns -- a new list holding every record of lst ordered by ascending
               'x'; lst itself keeps its insertion order.

    dist, indep and nump are not defined in this function; they are looked
    up in the enclosing module.
    """
    # The row's position is not needed. The removed data[z].index(d) call
    # scanned the table once per row for a value that was never used.
    for d in data[z]:
        lst.append({'x': dist(t, d, data, z, indep[z], nump[z]), 'd': d})
    return sorted(lst, key=lambda rec: rec['x'])
def k_means(data, k, dist):
    '''Cluster the records of data into k groups with the k-means algorithm.

    The first k records are copied to serve as the starting centroids. Each
    pass assigns every record to its closest centroid (the lowest-numbered
    centroid wins a tie) and, if any cluster's membership changed, replaces
    each centroid with the column-wise mean of its members. A cluster left
    without members gets 0 in every column. The loop ends on the first pass
    that changes no membership.

    data -- 2-d list of numeric records; it is not modified.
    k -- number of clusters. If k exceeds the number of records an error
         line is printed and the process exits through sys.exit(0).
    dist -- distance function called as dist(centroid, record).

    returns -- (centroids, clusters): the list of k centroid records and a
               list of k lists of record indexes, each in ascending order.
    '''
    if k > len(data):
        print("Error: k should be less than the number of records")
        sys.exit(0)

    # Copies, so that updating a centroid never writes into data.
    centroids = [data[seed][:] for seed in range(k)]
    clusters = [[] for _ in range(k)]

    while True:
        # Assignment: file every record index under its closest centroid.
        new_clusters = [[] for _ in range(k)]
        for point_index in range(len(data)):
            ranked = sorted(
                (dist(centroids[c], data[point_index]), c)
                for c in range(len(centroids))
            )
            new_clusters[ranked[0][1]].append(point_index)

        # Indexes were appended in ascending order, so each membership list
        # is already sorted and the two generations compare directly.
        if new_clusters == clusters:
            return centroids, clusters

        # Update: move each centroid to the mean of its new members.
        clusters = new_clusters
        for q, members in enumerate(clusters):
            for w in range(len(data[0])):
                total = 0.0
                for index in members:
                    total += data[index][w]
                centroids[q][w] = total / len(members) if members else 0
def distedAll(csvfile, z):
    """Print the distance between every ordered pair of rows of data[z].

    Rows are paired in table order, each row with every row including
    itself, and one dist() result is printed per line.

    csvfile -- unused.
    z -- key of the table in the module-level data.

    returns -- None.

    data, dist, indep and nump are not defined in this function; they are
    looked up in the enclosing module.
    """
    rows = data[z]
    for left in rows:
        for right in rows:
            print(dist(left, right, data, z, indep, nump))
def distedAll(csvfile, z):
    """Print the distance between every ordered pair of rows of data[z].

    Rows are paired in table order, each row with every row including
    itself, and one dist() result is printed per line.

    csvfile -- unused.
    z -- key of the table in the module-level data.

    returns -- None.

    data, dist, indep and nump are not defined in this function; they are
    looked up in the enclosing module.
    """
    rows = data[z]
    for left in rows:
        for right in rows:
            print(dist(left, right, data, z, indep, nump))