def test_device_boruvka_usa_cal(self):
    """Compare the GPU Boruvka MST with SciPy's MST for the graph at
    ``path_usa_cal``.

    The result of ``boruvka_minho_gpu`` must report the same number of edges
    and the same total weight as ``minimum_spanning_tree``.
    """
    csr = load_sparse_csr(path_usa_cal)
    targets, costs = csr.indices, csr.data
    row_start = csr.indptr[:-1]
    degree = np.empty_like(row_start)
    outdegree_from_firstedge(row_start, degree, targets.size)

    clock = Timer()
    clock.tic()
    gpu_edges, gpu_count = boruvka_minho_gpu(targets, costs, row_start, degree)
    clock.tac()

    # Trim the returned edge array to the reported edge count.
    if gpu_count < gpu_edges.size:
        gpu_edges = gpu_edges[:gpu_count]

    # Reference MST from the scipy library, built from a fresh load of the
    # same file.
    reference = minimum_spanning_tree(load_sparse_csr(path_usa_cal))

    self.assertEqual(gpu_count, reference.size,
                     'MST number of edges mismatch')
    self.assertEqual(costs[gpu_edges].sum(), reference.sum(),
                     'MST total weight mismatch')
def test_seq_gpu(self): print "HOST VS DEVICE" same_sol = list() same_cost = list() for r in range(20): sp_mat = load_sparse_csr(path_4elt) dest = sp_mat.indices weight = sp_mat.data firstedge = sp_mat.indptr[:-1] # last element is the total number outdegree = np.empty_like(firstedge) outdegree_from_firstedge(firstedge, outdegree, dest.size) n_edges = dest.size n_vertices = firstedge.size t1, t2 = Timer(), Timer() t1.tic() mst1, n_edges1 = boruvka_minho_seq(dest, weight, firstedge, outdegree) t1.tac() if n_edges1 < mst1.size: mst1 = mst1[:n_edges1] mst1.sort() assert_msg = '4elt dataset MST not fully connected in sequential' self.assertEqual(mst1.size, n_vertices-1, assert_msg) t2.tic() mst2, n_edges2 = boruvka_minho_gpu(dest, weight, firstedge, outdegree, MAX_TPB=256) t2.tac() if n_edges2 < mst2.size: mst2 = mst2[:n_edges2] mst2.sort() assert_msg = '4elt dataset MST not fully connected in gpu' self.assertEqual(mst2.size, n_vertices-1, assert_msg) # how many edges are common to both solutions # same_sol.append(np.in1d(mst1, mst2).sum()) # check MST cost cost1 = weight[mst1].sum() cost2 = weight[mst2].sum() self.assertEqual(cost1, cost2, 'MSTs have diferent costs')
def mst_cal(): sp_cal = load_csr_graph(home + "QCThesis/datasets/graphs/USA-road-d.CAL.csr") dest, weight, firstEdge, outDegree = get_boruvka_format(sp_cal) del sp_cal print "# edges: ", dest.size print "# vertices: ", firstEdge.size print "size of graph (MB): ", (dest.size + weight.size + firstEdge.size + outDegree.size) * 4.0 / 1024 / 1024 times_cpu = list() times_gpu = list() equal_mst = list() equal_cost = list() t1, t2 = Timer(), Timer() for r in range(10): print "cpu round ", r t1.tic() mst1, n_edges1 = boruvka_minho_seq(dest, weight, firstEdge, outDegree) t1.tac() print "finished in ", t1.elapsed if n_edges1 < mst1.size: mst1 = mst1[:n_edges1] print "gpu round ", r t2.tic() mst2, n_edges2 = boruvka_minho_gpu(dest, weight, firstEdge, outDegree, MAX_TPB=512) t2.tac() print "finished in ", t2.elapsed print "" if n_edges2 < mst2.size: mst2 = mst2[:n_edges2] equal_mst.append(np.in1d(mst1,mst2).all()) equal_cost.append(weight[mst1].sum() == weight[mst2].sum()) if r > 0: times_cpu.append(t1.elapsed) times_gpu.append(t2.elapsed) print equal_mst print equal_cost print "average time cpu: ", np.mean(times_cpu) print "average time gpu: ", np.mean(times_gpu)
def device_boruvka(): print "CUDA BORUVKA" dest, weight, firstEdge, outDegree = load_graph("4elt") t1 = Timer() t1.tic() mst, n_edges = boruvka_minho_gpu(dest, weight, firstEdge, outDegree) t1.tac() if n_edges < mst.size: mst = mst[:n_edges] print "time elapsed: ", t1.elapsed mst.sort() print mst print n_edges
def host_vs_device(): print "HOST VS DEVICE" same_sol = list() same_cost = list() for r in range(20): dest, weight, firstEdge, outDegree = load_graph("4elt") t1, t2 = Timer(), Timer() t1.tic() mst1, n_edges1 = boruvka_minho_seq(dest, weight, firstEdge, outDegree) t1.tac() if n_edges1 < mst1.size: mst1 = mst1[:n_edges1] mst1.sort() t2.tic() mst2, n_edges2 = boruvka_minho_gpu(dest, weight, firstEdge, outDegree, MAX_TPB=256) t2.tac() if n_edges2 < mst2.size: mst2 = mst2[:n_edges2] mst2.sort() same_sol.append(np.in1d(mst1,mst2).sum()) same_cost.append(weight[mst1].sum() == weight[mst2].sum()) #same_sol.append((mst1==mst2).all()) print "no. edges: ", weight.size print "no. nodes: ", firstEdge.size print "Same solution: ", same_sol print "Same cost:", np.all(same_cost) print "Solution CPU cost: ", weight[mst1].sum() print "Solution GPU cost: ", weight[mst2].sum() print "Host time elapsed: ", t1.elapsed print "Device time elapsed: ", t2.elapsed
def analyze_graph_from_h5(filename, verbose=False): def v_print(vstr): if verbose: print vstr csr_mat = load_h5_to_csr(filename) dest, weight, firstEdge, outDegree = get_boruvka_format(csr_mat) del csr_mat n_e = dest.size n_v = firstEdge.size mem = (dest.size*dest.itemsize + weight.size*weight.itemsize + firstEdge.size*firstEdge.itemsize + outDegree.size*outDegree.itemsize)/ (1024.0**2) print "# edges: ", n_e print "# vertices: ", n_v print "size of graph (MB): ", mem times_cpu = list() times_gpu = list() equal_mst = list() equal_cost = list() mst_costs = {'cpu':list(), 'gpu':list()} t1, t2 = Timer(), Timer() for r in range(10): v_print('------ Round {} -------'.format(r)) t1.reset() t1.tic() mst1, n_edges1 = boruvka_minho_seq(dest, weight, firstEdge, outDegree) t1.tac() v_print('CPU finished in {} s'.format(t1.elapsed)) if n_edges1 < mst1.size: mst1 = mst1[:n_edges1] t2.reset() t2.tic() mst2, n_edges2 = boruvka_minho_gpu(dest, weight, firstEdge, outDegree, MAX_TPB=512) t2.tac() v_print('GPU finished in {} s'.format(t2.elapsed)) if n_edges2 < mst2.size: mst2 = mst2[:n_edges2] mst_costs['cpu'].append(weight[mst1].sum()) mst_costs['gpu'].append(weight[mst2].sum()) equal_mst.append(np.in1d(mst1,mst2).all()) equal_cost.append(weight[mst1].sum() == weight[mst2].sum()) if r > 0: times_cpu.append(t1.elapsed) times_gpu.append(t2.elapsed) max_cost = max((max(mst_costs['cpu']), max(mst_costs['gpu']))) cost_error = map(lambda x: abs(x[0]-x[1]), zip(*mst_costs.values())) cost_error = map(lambda x: x/max_cost, cost_error) error_threshold = 1e-5 cpu_str = '' for t in times_cpu: cpu_str += str(t) + ',' gpu_str = '' for t in times_gpu: gpu_str += str(t) + ',' cpu_costs = '' for c in mst_costs['cpu']: cpu_costs += str(c) + ',' gpu_costs = '' for c in mst_costs['gpu']: gpu_costs += str(c) + ',' print 'dataset: {}'.format(os.path.basename(filename)) print 'CPU times,{},{},{},{}'.format(n_e,n_v,mem,cpu_str[:-1]) print 'GPU times,{},{},{},{}'.format(n_e,n_v,mem,gpu_str[:-1]) print 'CPU 
costs,{},{},{},{}'.format(n_e,n_v,mem,cpu_costs[:-1]) print 'GPU costs,{},{},{},{}'.format(n_e,n_v,mem,gpu_costs[:-1]) print '' print 'All equal MSTs: {}'.format(np.all(np.array(equal_mst) == equal_mst[0])) print 'All equal costs: {}'.format(np.all(equal_cost)) print 'All cost errors <= {}: {}'.format(error_threshold, np.all(map(lambda x:x<error_threshold, cost_error))) print 'Max normalized error: {}'.format(max(cost_error)) speedup = np.array(times_cpu) / np.array(times_gpu) print 'Times(s)\tMean\tStd\tMax\tMin' print 'CPU \t{:.5F}\t{:.5F}\t{:.5F}\t{:.5F}'.format(np.mean(times_cpu), np.std(times_cpu), np.max(times_cpu), np.min(times_cpu)) print 'GPU \t{:.5F}\t{:.5F}\t{:.5F}\t{:.5F}'.format(np.mean(times_gpu), np.std(times_gpu), np.max(times_gpu), np.min(times_gpu)) print 'SpeedUp \t{:.5F}\t{:.5F}\t{:.5F}\t{:.5F}'.format(np.mean(speedup), np.std(speedup), np.max(speedup), np.min(speedup)) print 'Error \t{:.5F}\t{:.5F}\t{:.5F}\t{:.5F}'.format(np.mean(cost_error), np.std(cost_error), np.max(cost_error), np.min(cost_error))
def mst_cluster_coassoc(): t1,t2 = Timer(), Timer() #foldername = "/home/courses/aac2015/diogoaos/QCThesis/datasets/gaussmix1e4/" foldername = home + "QCThesis/datasets/gaussmix1e4/" print "Loading datasets" t1.tic() # dest = np.genfromtxt(foldername + "prot_dest.csr", dtype = np.int32, delimiter=",") # weight = np.genfromtxt(foldername + "prot_weight.csr", dtype = np.float32, delimiter=",") # fe = np.genfromtxt(foldername + "prot_fe.csr", dtype = np.int32, delimiter=",") dest = np.genfromtxt(foldername + "full_dest.csr", dtype = np.int32, delimiter=",") weight = np.genfromtxt(foldername + "full_weight.csr", dtype = np.float32, delimiter=",") fe = np.genfromtxt(foldername + "full_fe.csr", dtype = np.int32, delimiter=",") t1.tac() print "loading elapsed time : ", t1.elapsed fe = fe[:-1] od = np.empty_like(fe) outdegree_from_firstedge(fe, od, dest.size) # fix weights to dissimilarity weight = 100 - weight print "# edges : ", dest.size print "# vertices : ", fe.size print "edges/vertices ratio : ", dest.size * 1.0 / fe.size t1.tic() mst, n_edges = boruvka_minho_seq(dest, weight, fe, od) t1.tac() print "seq: time elapsed : ", t1.elapsed print "seq: mst size :", mst.size print "seq: n_edges : ", n_edges if n_edges < mst.size: mst = mst[:n_edges] mst.sort() ev1,ev2 = cuda.event(), cuda.event() ev1.record() d_dest = cuda.to_device(dest) d_weight = cuda.to_device(weight) d_fe = cuda.to_device(fe) d_od = cuda.to_device(od) ev2.record() send_graph_time = cuda.event_elapsed_time(ev1,ev2) t2.tic() mst2, n_edges2 = boruvka_minho_gpu(d_dest, d_weight, d_fe, d_od, MAX_TPB=512, returnDevAry = True) t2.tac() ev1.record() mst2 = mst2.copy_to_host() n_edges2 = n_edges2.getitem(0) ev2.record() recv_mst_time = cuda.event_elapsed_time(ev1,ev2) print "gpu: send graph time : ", send_graph_time print "gpu: time elapsed : ", t2.elapsed print "gpu: rcv mst time : ", recv_mst_time print "gpu: mst size :", mst2.size print "seq: n_edges : ", n_edges2 if n_edges2 < mst2.size: mst2 = 
mst2[:n_edges2] mst2.sort() if n_edges == n_edges2: mst_is_equal = (mst == mst2).all() else: mst_is_equal = False print "mst gpu == seq : ", mst_is_equal
def check_colors(): print "CHECK COLORS SEQ & CUDA" #dest, weight, firstEdge, outDegree = load_graph("4elt") sp_cal = load_csr_graph(home + "QCThesis/datasets/graphs/USA-road-d.CAL.csr") dest, weight, firstEdge, outDegree = get_boruvka_format(sp_cal) del sp_cal print "# edges: ", dest.size print "# vertices: ", firstEdge.size print "size of graph (MB): ", (dest.size + weight.size + firstEdge.size + outDegree.size) * 4.0 / 1024 / 1024 print "# vertices: ", firstEdge.size print "# edges: ", dest.size print "seq: Computing MST" t1 = Timer() t1.tic() mst, n_edges = boruvka_minho_seq(dest, weight, firstEdge, outDegree) t1.tac() print "seq: time elapsed: ", t1.elapsed print "seq: mst size :", mst.size print "seq: n_edges: ", n_edges if n_edges < mst.size: mst = mst[:n_edges] mst.sort() print "gpu: Computing MST" t1.tic() mst2, n_edges2 = boruvka_minho_gpu(dest, weight, firstEdge, outDegree, MAX_TPB=256) t1.tac() print "gpu: time elapsed: ", t1.elapsed print "gpu: mst size :", mst2.size print "seq: n_edges: ", n_edges2 if n_edges2 < mst2.size: mst2 = mst2[:n_edges2] mst2.sort() print "mst gpu == seq: ", (mst == mst2).all() # make two cuts mst = mst[:-2] print "seq: Generating MST graph" nod = np.zeros(outDegree.size, dtype = outDegree.dtype) nfe = np.empty(firstEdge.size, dtype = firstEdge.dtype) ndest = np.empty(mst.size * 2, dtype = dest.dtype) nweight = np.empty(mst.size * 2, dtype = weight.dtype) t1.tic() get_new_graph(dest, weight, firstEdge, outDegree, mst, nod, nfe, ndest, nweight) t1.tac() print "seq: time elapsed: ", t1.elapsed print "seq: Computing labels" t1.tic() colors = getLabels_seq(ndest, nweight, nfe, nod) t1.tac() print "seq: time elapsed: ", t1.elapsed print "seq: # colors: ", np.unique(colors).size print "gpu: Computing labels" t1.tic() colors2 = getLabels_gpu(ndest, nweight, nfe, nod, MAX_TPB=256) t1.tac() print "gpu: time elapsed: ", t1.elapsed print "gpu: # colors: ", np.unique(colors2).size print "colors gpu == seq: ", (colors == colors2).all()
def sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight=None,
                        MAX_TPB=256, stream=None):
    """
    Label the vertices of a graph by cutting its MST with a lifetime
    criterion, working on the GPU.

    The MST is computed with boruvka_minho_gpu, its edge weights are
    gathered and argsorted, and lifetimes are computed from them with
    compute_lifetimes_CUDA. If the largest lifetime is at least the
    threshold (disconnect_weight minus the last MST weight), the edges from
    that point onward are discarded. A graph is then built from the
    remaining MST edges and labeled with connected_comps_gpu.

    Input are device arrays.

    Inputs:
        dest, weight, fe, od : device arrays describing the graph, in the
                               form accepted by boruvka_minho_gpu
        disconnect_weight    : weight between unconnected vertices;
                               defaults to weight.max()
        MAX_TPB              : number of threads per block
        stream               : CUDA stream to use; a new stream is created
                               when None

    Returns:
        The labels returned by connected_comps_gpu.

    TODO:
        - argmax is from cuBlas and only works with 32/64 floats. Make this
          work with any type.
    """
    if disconnect_weight is None:
        disconnect_weight = weight.max()

    # Always work on a concrete stream, whether supplied or created here.
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    mst, n_edges = boruvka_minho_gpu(dest, weight, fe, od,
                                     MAX_TPB=MAX_TPB, stream=myStream,
                                     returnDevAry=True)

    # Allocate array for the mst weights.
    h_n_edges = int(n_edges.getitem(0, stream=myStream))  # edges to keep in MST
    mst_weights = cuda.device_array(h_n_edges, dtype=weight.dtype)

    # Get array with only the considered weights in the MST
    # and remove those edges in the MST edge list
    mstGrid = compute_cuda_grid_dim(h_n_edges, MAX_TPB)
    d_weight = cuda.to_device(weight, stream=myStream)
    getWeightsOfEdges_gpu[mstGrid, MAX_TPB, myStream](mst, n_edges, d_weight,
                                                      mst_weights)

    # Sort the MST weights. There are no repeated edges at this
    # point since the output MST is like a directed graph.
    sorter = RadixSort(maxcount=mst_weights.size, dtype=mst_weights.dtype,
                       stream=myStream)
    sortedWeightArgs = sorter.argsort(mst_weights)

    # Allocate array for the lifetimes.
    lifetimes = cuda.device_array(mst_weights.size - 1,
                                  dtype=mst_weights.dtype)
    compute_lifetimes_CUDA[mstGrid, MAX_TPB, myStream](mst_weights, lifetimes)

    # Use the same stream as the rest of the pipeline; the raw `stream`
    # argument may be None.
    maxer = Blas(myStream)
    arg_max_lt = maxer.amax(lifetimes)
    max_lt = lifetimes.getitem(arg_max_lt)

    # this is the lifetime between edges with no connection and the weakest link
    lt_threshold = disconnect_weight - mst_weights.getitem(mst_weights.size - 1)

    # if the maximum lifetime is higher or equal than the lifetime threshold
    # cut the tree
    if max_lt >= lt_threshold:
        # from arg_max_lt onward all edges are discarded
        n_discarded = lifetimes.size - arg_max_lt + 1

        # remove edges
        removeGrid = compute_cuda_grid_dim(n_discarded, MAX_TPB)
        # NOTE(review): this call used the undefined names `edgeList` and
        # `sortedArgs`, which raised NameError whenever the tree was cut.
        # They are taken here to mean the MST edge list and the argsort
        # result computed above; confirm against the removeEdges signature.
        removeEdges[removeGrid, MAX_TPB](mst, sortedWeightArgs, n_discarded)

        # compute new amount of edges and update it
        new_n_edges = h_n_edges - n_discarded
        cuda.to_device(np.array([new_n_edges], dtype=n_edges.dtype),
                       to=n_edges,
                       stream=myStream)

    # Build the graph of the (possibly reduced) MST edge set and label it.
    ngraph = getGraphFromEdges_gpu(dest, weight, fe, od, edges=mst,
                                   n_edges=n_edges, MAX_TPB=MAX_TPB,
                                   stream=myStream)

    ndest, nweight, nfe, nod = ngraph

    labels = connected_comps_gpu(ndest, nweight, nfe, nod,
                                 MAX_TPB=512, stream=myStream)

    # Drop the references to the intermediate device arrays.
    del ndest, nweight, nfe, nod, lifetimes

    return labels