#NOTE: imports reconstructed for this excerpt; loadMatrix, KMeansConfig and
#KernelLocation are assumed to be defined elsewhere in this module, and a
#CUDA context is assumed to be initialised (e.g. via pycuda.autoinit).
import time
import random

import numpy
from numpy import array, dot, sqrt, amax, amin, corrcoef
from numpy.linalg import eig
import pycuda.driver as drv
from pycuda.compiler import SourceModule


def EigenEmbedding(dataTable, finalDims=3):
    """Embed dataTable by scaling its eigenvectors by the square roots of the
    magnitudes of the corresponding eigenvalues, keeping finalDims columns."""
    t0 = time.time()
    e = eig(loadMatrix(dataTable))
    #scale each (real) eigenvector by sqrt(|eigenvalue|); note that eig()
    #does not sort the eigenpairs
    e = [(e[1].real.T[i] * sqrt(abs(e[0].real[i]))).tolist()
         for i in xrange(len(e[0]))]
    #e.reverse()
    #transpose so each row is one point in the first finalDims components
    e = [list(l) for l in zip(*e[:finalDims])]
    print time.time() - t0, " seconds to compute eigenvalue embedding."
    return e
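#A minimal usage sketch for EigenEmbedding. Hedged: "kernel_matrix.csv" is a
#hypothetical file name, and loadMatrix is assumed to return a square numpy
#matrix (e.g. a kernel or graph-distance matrix), as elsewhere in this module.
def _eigen_embedding_demo():
    #embed into 2 dimensions and inspect the first point
    coords = EigenEmbedding("kernel_matrix.csv", finalDims=2)
    print "first embedded point:", coords[0]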
def KMeans(dataTable, k, epsilon=0.00001, srcDims=1000000000000000, iters=20, normData=False):
    """
    Get the best out of iters tries of k means, terminating a try when delta k < epsilon.
    """
    #load up the configuration
    kmOptions = KMeansConfig(dataTable, k, epsilon, srcDims)

    #load and format the table for use.
    data = loadMatrix(dataTable)[:, :kmOptions['sourceDims']]

    #check if we should normalise the data (this is really quick and dirty,
    #replace it with something better)
    if normData:
        dmax = amax(data)
        dmin = amin(data)
        data = (data - dmin) / (dmax - dmin + 0.00000001)

    #make our starting point solutions from the dataset
    solutions = [array(random.sample(data, k)) for i in xrange(iters)]

    #chunk solutions if necessary
    for i in xrange(len(solutions)):
        sol = []
        while len(solutions[i]) > kmOptions['chunkSize']:
            sol.append(solutions[i][:kmOptions['chunkSize']])
            solutions[i] = solutions[i][kmOptions['chunkSize']:]
        sol.append(solutions[i])
        solutions[i] = sol

    #create our chunked problem data
    dataChunks = []
    while len(data) > kmOptions['chunkSize']:
        dataChunks.append(data[:kmOptions['chunkSize']])
        data = data[kmOptions['chunkSize']:]
    dataChunks.append(data)
    #normalisation constant: the number of chunks, counting the final partial
    #chunk as a fraction
    kNorm = (len(dataChunks) - 1) + len(dataChunks[-1]) / float(len(dataChunks[0]))

    #create the CUDA kernels
    program = SourceModule(open(KernelLocation + "KMEANS_LABEL.nvcc").read())
    prg = program.get_function("KMEANS_LABEL")
    program = SourceModule(open(KernelLocation + "KMEANS_UPDATE.nvcc").read())
    prg2 = program.get_function("KMEANS_UPDATE")
    t0 = time.time()

    #store the resultant performance of each solution here
    results = []
    finalSols = []

    #make GPU allocations and support variables
    total = 0.
    bigDist = 10000000000000000.
    dists = [numpy.zeros(kmOptions['chunkSize']).astype(numpy.float32) + bigDist
             for i in xrange(len(dataChunks))]  #intermediate per-chunk distances
    labels = [numpy.zeros(kmOptions['chunkSize']).astype(numpy.uint32)
              for i in xrange(len(dataChunks))]  #intermediate per-chunk labels
    data_gpu = drv.mem_alloc(dataChunks[0].nbytes)
    k_gpu = drv.mem_alloc(solutions[0][0].nbytes)
    labels_gpu = drv.mem_alloc(labels[0].nbytes)
    dists_gpu = drv.mem_alloc(dists[0].nbytes)

    #calculate KMeans
    for sol in solutions:
        t0 = time.time()
        for iteration in xrange(10000):  #was also named i, shadowing the chunk index below
            #Step 1: find all the closest labels
            for j in xrange(len(dataChunks)):
                #reset the distances so stale minima from the previous
                #iteration cannot block relabelling
                dists[j][:] = bigDist
            for i in xrange(len(sol)):
                #copy in the current chunk of cluster centres
                drv.memcpy_htod(k_gpu, sol[i])
                for j in xrange(len(dataChunks)):
                    drv.memcpy_htod(data_gpu, dataChunks[j])
                    drv.memcpy_htod(labels_gpu, labels[j])
                    drv.memcpy_htod(dists_gpu, dists[j])
                    prg(k_gpu, data_gpu, kmOptions["dimensions"], labels_gpu, dists_gpu,
                        kmOptions['k'], kmOptions['dataSize'], kmOptions['chunkSize'],
                        numpy.int64(i * kmOptions['chunkSize']),  #k offset
                        numpy.int64(j * kmOptions['chunkSize']),  #data offset
                        kmOptions['maxThreads'],
                        block=kmOptions['block'], grid=kmOptions['grid'])
                    #copy the running minima back per data chunk (the original
                    #read labels[i] after this loop, which only kept the last
                    #chunk) so later centre chunks compete against the best
                    #distances found so far
                    drv.memcpy_dtoh(labels[j], labels_gpu)
                    drv.memcpy_dtoh(dists[j], dists_gpu)

            #Step 2: find the new averages
            old_sol = [s.copy() for s in sol]
            for i in xrange(len(sol)):
                #load up a blank set of k matrices
                drv.memcpy_htod(k_gpu, sol[i] * 0.)
                for j in xrange(len(dataChunks)):
                    drv.memcpy_htod(data_gpu, dataChunks[j])
                    drv.memcpy_htod(labels_gpu, labels[j])
                    prg2(k_gpu, data_gpu, kmOptions["dimensions"], labels_gpu,
                         kmOptions['k'], kmOptions['dataSize'], kmOptions['chunkSize'],
                         numpy.int64(i * kmOptions['chunkSize']),  #label offset
                         numpy.int64(j * kmOptions['chunkSize']),  #data offset
                         kmOptions['maxThreads'],
                         block=kmOptions['block'], grid=kmOptions['grid'])
                drv.memcpy_dtoh(sol[i], k_gpu)
                sol[i] /= kNorm  #final normalisation

            #Step 3: stop once the total centre movement drops below epsilon
            total = 0.
            for j in xrange(len(sol)):
                tmp = sol[j] - old_sol[j]
                tmp = tmp * tmp
                total += sum([sum(t**0.5) for t in tmp])  #element-wise |delta|
            if total / kmOptions['dataSize'] < kmOptions['eps']:
                break
        print "solution done in ", time.time() - t0
        results.append((total, len(results)))  #(final update size, solution index)
        finalSols.append(numpy.concatenate(sol)[:kmOptions['dataSize']])
    #return the solution whose final update was smallest
    results.sort()
    return finalSols[results[0][1]]
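#A minimal usage sketch for KMeans. Hedged: "points.csv" is a hypothetical
#file name; KMeansConfig, loadMatrix and an initialised CUDA context (e.g.
#via pycuda.autoinit) are assumed to exist as in the rest of this module.
def _kmeans_demo():
    #cluster into 8 groups, keeping the best of 20 random restarts
    centroids = KMeans("points.csv", k=8, epsilon=1e-5, iters=20, normData=True)
    print "centroid array shape:", centroids.shape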
#command-line handling from the residual-variance script; optlist, distfile,
#infile, minDims and maxDims come from the option parsing earlier in that
#script, outside this excerpt. The body of the first loop was garbled in the
#source; setting the nonmetric flag is reconstructed from the help text below.
nonmetric = False
for o in optlist:
    if o[0].strip('-') == 'nonmetric':
        nonmetric = True
for o in optlist:
    if o[0].strip('-') == 'help' or o[0].strip('-') == 'h':  #was o[1] == 'h'; getopt puts the flag name in o[0]
        print "The following commands are available:"
        print "\t--if=inputfile\tDefaults to embedding.csv"
        print "\t--of=outputfile\tDefaults to embedding.ps"
        print "\t--k=k_nearest_neighbours\tDefaults to 12"
        print "\t--outdims=embedding_dimensions\tDefaults to 3"
        print "\t--indims=input_dimensions\tDefaults to all in the input file"
        print "\t--nonmetric\tEnables non-metric MDS embeddings"

result = []
graph_distances = array(loadMatrix(distfile)).flatten()
for dim in xrange(minDims, maxDims):
    embedding = loadMatrix(infile)
    embedding_distances = []
    for i in xrange(len(embedding)):
        ei = embedding[i][:dim]
        #reuse the symmetric entries already computed for rows above this one
        for j in xrange(i):
            embedding_distances.append(embedding_distances[i + j * len(embedding)])
        for j in xrange(i, len(embedding)):
            e = ei - embedding[j][:dim]
            embedding_distances.append(dot(e, e))
    #residual variance: 1 - r^2 between the embedded and graph distances
    embedding_distances = corrcoef(sqrt(array(embedding_distances)), graph_distances)
    residual = (1 - embedding_distances * embedding_distances)[0][1]
    result.append(residual)  #collect one residual per candidate dimension
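#A self-contained sketch of the residual-variance measure computed above:
#1 - r^2, where r is the Pearson correlation between the embedding's pairwise
#distances and the reference (graph) distances. Toy data only; every name
#here is local to the example.
def _residual_variance_demo():
    graph_d = array([1.0, 2.0, 3.0, 4.0, 5.0])  #reference distances
    embed_d = array([1.1, 1.9, 3.2, 3.9, 5.1])  #distances in the embedding
    r = corrcoef(embed_d, graph_d)[0][1]        #Pearson correlation
    print "residual variance:", 1.0 - r * r     #near 0 => good embedding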