def run_pagerank_job(path_to_file):
    """Runs PageRank on the specified graph using graphlab's API.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)

    Returns
    -------
    runtime : String type
        The total runtime of the job. The timed span covers loading the
        graph and running PageRank (capped at 20 iterations); printing the
        summary is not timed.
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    pr = gl.pagerank.create(graph, max_iterations=20)
    finished = time.time()

    print(pr.summary())
    return "Total runtime: {} seconds".format(finished - started)
def run_ndegree_neigh_job(path_to_file, source_vertex, degree):
    """Finds the nth degree neighborhood on the specified graph using
    graphlab's API.

    The neighborhood is taken to be every vertex whose shortest-path
    distance from the source equals ``degree``.
    NOTE(review): the original docstring stated that this case ignores
    edge direction - confirm against gl.shortest_path semantics.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)
    source_vertex : Long type
        The id of the source vertex
    degree : int type
        The degree of neighbors

    Returns
    -------
    runtime : String type
        The total runtime of the job (includes printing the result)
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    sssp = gl.shortest_path.create(graph, source_vid=source_vertex)
    # Distances are compared as floats, hence the float() conversion.
    sp_sframe = sssp['distance'].filter_by(float(degree), 'distance')
    print("Neighorhood length: {}\nNeighbors:\n{}".format(
        sp_sframe.num_rows(), sp_sframe))
    finished = time.time()
    return "Total runtime: {} seconds".format(finished - started)
def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path):
    """
    Internal util that rebuilds a GLC object from the archive referenced by
    a persistent ID in the pickle file.

    Parameters
    ----------
    type_tag : The name of the glc class as saved in the GLC pickler.

    gl_archive_abs_path: An absolute path to the GLC archive where the
                          object was saved.

    Returns
    ----------
    The GLC object.

    Raises
    ----------
    _pickle.UnpicklingError when type_tag is not one of "SFrame", "SGraph",
    "SArray" or "Model".
    """
    # Each supported tag returns as soon as it matches; anything that falls
    # through is an unsupported object type.
    if type_tag == "SFrame":
        return _gl.SFrame(gl_archive_abs_path)
    if type_tag == "SGraph":
        return _gl.load_graph(gl_archive_abs_path)
    if type_tag == "SArray":
        return _gl.SArray(gl_archive_abs_path)
    if type_tag == "Model":
        return _gl.load_model(gl_archive_abs_path)
    raise _pickle.UnpicklingError(
        "GraphLab pickling Error: Unspported object."
        " Only SFrames, SGraphs, SArrays, and Models are supported.")
def run_conn_comp_job(path_to_file):
    """Finds the connected components on the specified graph using
    graphlab's API.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)

    Returns
    -------
    runtime : String type
        The total runtime of the job. The timed span covers loading the
        graph and computing the components; printing the summary is not
        timed.
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    conn_comp = gl.connected_components.create(graph)
    finished = time.time()

    print(conn_comp.summary())
    return "Total runtime: {} seconds".format(finished - started)
def run_triangle_counting_job(path_to_file):
    """Calculates the number of triangles on the specified graph using
    graphlab's API.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)

    Returns
    -------
    runtime : String type
        The total runtime of the job. The timed span covers loading the
        graph and counting triangles; printing the summary is not timed.
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    tri = gl.triangle_counting.create(graph)
    finished = time.time()

    print(tri.summary())
    return "Total runtime: {} seconds".format(finished - started)
def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path):
    """
    Internal util to load the GLC object that a persistent ID in the pickle
    file points at.

    Parameters
    ----------
    type_tag : The name of the glc class as saved in the GLC pickler.

    gl_archive_abs_path: An absolute path to the GLC archive where the
                          object was saved.

    Returns
    ----------
    The GLC object.

    Raises
    ----------
    _pickle.UnpicklingError when type_tag names an unsupported class.
    """
    # Pick the loader for the tag first, then invoke it once at the end.
    if type_tag == "SFrame":
        loader = _gl.SFrame
    elif type_tag == "SGraph":
        loader = _gl.load_graph
    elif type_tag == "SArray":
        loader = _gl.SArray
    elif type_tag == "Model":
        loader = _gl.load_model
    else:
        raise _pickle.UnpicklingError(
            "GraphLab pickling Error: Unspported object."
            " Only SFrames, SGraphs, SArrays, and Models are supported.")
    return loader(gl_archive_abs_path)
def run_ndegree_neigh_job(path_to_file, source_vertex, degree):
    """Finds the nth degree neighborhood on the specified graph using
    graphlab's API.

    The neighborhood itself is computed by the ``nth_neighborhood`` helper
    defined elsewhere; its result must support ``len()``.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)
    source_vertex : Long type
        The id of the source vertex
    degree : int type
        The degree of neighbors

    Returns
    -------
    runtime : String type
        The total runtime of the job (includes printing the result)
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    # Bind the helper's result to the name that is printed below; the
    # previous body stored it as `result` and then printed an undefined
    # `neighbors`.
    neighbors = nth_neighborhood(graph, source_vertex, degree)
    print("Neighorhood length: {}\nNeighbors:\n{}".format(
        len(neighbors), neighbors))
    finished = time.time()
    return "Total runtime: {} seconds".format(finished - started)
def run_sssp_job(path_to_file, source_vertex, target_vertex):
    """Finds the shortest path from a given vertex to a target vertex on
    the specified graph using graphlab's API.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)
    source_vertex : Long type
        The id of the source vertex
    target_vertex : Long type
        The id of the target vertex

    Returns
    -------
    runtime : String type
        The total runtime of the job (includes printing the path)
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    sssp = gl.shortest_path.create(graph, source_vid=source_vertex)
    result = sssp.get_path(vid=target_vertex)
    print(result)
    finished = time.time()
    return "Total runtime: {} seconds".format(finished - started)
def test_exception(self):
    """Every read, write, save and load against a path under /root must
    raise IOError.

    NOTE(review): presumably relies on the test user having no access to
    /root - confirm for the CI environment.
    """
    failing_calls = [
        lambda: glconnect.get_unity().__read__("/root/tmp"),
        lambda: glconnect.get_unity().__write__("/root/tmp", '.....'),
        # The read/write pair is exercised twice, as in the original test.
        lambda: glconnect.get_unity().__read__("/root/tmp"),
        lambda: glconnect.get_unity().__write__("/root/tmp", '.....'),
        lambda: self.graph.save("/root/tmp.graph"),
        lambda: self.sframe.save("/root/tmp.frame_idx"),
        lambda: self.model.save("/root/tmp.model"),
        lambda: graphlab.load_graph("/root/tmp.graph"),
        lambda: graphlab.load_sframe("/root/tmp.frame_idx"),
        lambda: graphlab.load_model("/root/tmp.model"),
    ]
    for failing_call in failing_calls:
        self.assertRaises(IOError, failing_call)
def run_reachability_job(path_to_file, source_vertex, target_vertex,
                         max_depth):
    """Determines whether a target vertex is reachable from a source
    vertex on the specified graph using graphlab's API.

    Runs a breadth-first search along outgoing edges, one hop per
    iteration, for at most ``max_depth`` hops.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)
    source_vertex : Long type
        The id of the source vertex
    target_vertex : Long type
        The id of the target vertex
    max_depth : int type
        The maximum number of hops to explore

    Returns
    -------
    runtime : String type
        The total runtime of the job (printing the verdict is not timed)
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')

    frontier = set([source_vertex])  # Start from source vertex - BFS
    visited = set(frontier)
    is_reachable = False
    # Stop early once the frontier is exhausted: nothing new can be reached.
    while max_depth > 0 and frontier:
        reached = set()
        for vertex in frontier:
            outgoing_edges = graph.get_edges(src_ids=[vertex])
            reached.update(list(outgoing_edges["__dst_id"]))
        # Test the raw hop result so that a cycle back to the source still
        # counts as reaching it, exactly as before.
        if target_vertex in reached:
            is_reachable = True
            break
        # Only expand vertices not seen on an earlier hop; re-expanding
        # them cannot discover a shorter route to the target.
        frontier = reached - visited
        visited |= frontier
        max_depth -= 1
    finished = time.time()

    if is_reachable:
        print("Vertex {} is reachable from vertex {}".format(
            target_vertex, source_vertex))
    else:
        print("Vertex {} cannot be reached from vertex {}".format(
            target_vertex, source_vertex))
    return "Total runtime: {} seconds".format(finished - started)
def test_exception(self):
    """Check the errors raised for the "remote://" protocol.

    The read/write helper must raise ValueError for a bare temp-file path
    and for a "local://" URL; every unity read/write, object save and
    object load against "remote:///root/..." must raise IOError.
    """
    value_error_calls = [
        lambda: self._test_read_write_helper(self.tempfile, 'hello world'),
        lambda: self._test_read_write_helper(
            "local://" + self.tempfile + ".csv.gz", 'hello,world,woof'),
    ]
    for failing_call in value_error_calls:
        self.assertRaises(ValueError, failing_call)

    io_error_calls = [
        # The read is exercised twice, as in the original test.
        lambda: glconnect.get_unity().__read__("remote:///root/tmp"),
        lambda: glconnect.get_unity().__read__("remote:///root/tmp"),
        lambda: glconnect.get_unity().__write__("remote:///root/tmp", '.....'),
        lambda: self.graph.save("remote:///root/tmp.graph"),
        lambda: self.sframe.save("remote:///root/tmp.frame_idx"),
        lambda: self.model.save("remote:///root/tmp.model"),
        lambda: graphlab.load_graph("remote:///root/tmp.graph"),
        lambda: graphlab.load_sframe("remote:///root/tmp.frame_idx"),
        lambda: graphlab.load_model("remote:///root/tmp.model"),
    ]
    for failing_call in io_error_calls:
        self.assertRaises(IOError, failing_call)
def test_exception(self):
    """Every read, write, save and load against a bad hdfs:// URL must
    raise IOError.

    When ``self.has_hdfs`` is false nothing is asserted; an info message
    is logged instead.
    """
    bad_url = "hdfs:///root/"
    if not self.has_hdfs:
        logging.getLogger(__name__).info("No hdfs avaiable. Test pass.")
        return

    failing_calls = [
        lambda: glconnect.get_unity().__read__("hdfs:///"),
        lambda: glconnect.get_unity().__read__("hdfs:///tmp"),
        lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile),
        lambda: glconnect.get_unity().__write__(bad_url + "/tmp",
                                                "somerandomcontent"),
        lambda: self.graph.save(bad_url + "x.graph"),
        lambda: self.sframe.save(bad_url + "x.frame_idx"),
        lambda: self.model.save(bad_url + "x.model"),
        lambda: graphlab.load_graph(bad_url + "mygraph"),
        lambda: graphlab.load_sframe(bad_url + "x.frame_idx"),
        lambda: graphlab.load_model(bad_url + "x.model"),
    ]
    for failing_call in failing_calls:
        self.assertRaises(IOError, failing_call)
def run_reachability_job(path_to_file, source_vertex, target_vertex):
    """Determines whether a target vertex is reachable from a source
    vertex on the specified graph using graphlab's API.

    Runs single-source shortest paths from the source and inspects the
    distance recorded for the target vertex.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)
    source_vertex : Long type
        The id of the source vertex
    target_vertex : Long type
        The id of the target vertex

    Returns
    -------
    runtime : String type
        The total runtime of the job (printing the verdict is not timed)

    Raises
    ------
    IndexError
        If the distance table has no row for ``target_vertex``.
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    sssp = gl.shortest_path.create(graph, source_vid=source_vertex)
    sp_sframe = sssp['distance'].filter_by(target_vertex, '__id')
    distance = list(sp_sframe['distance'])[0]
    # Presumably unreachable vertices carry a huge sentinel distance
    # (>= 1e30) - confirm against the shortest_path documentation.
    is_reachable = distance < 1e+30
    finished = time.time()

    if is_reachable:
        print("Vertex {} is reachable from vertex {} - Distance: {}".format(
            target_vertex, source_vertex, int(distance)))
    else:
        print("Vertex {} cannot be reached from vertex {} - Distance: {}".format(
            target_vertex, source_vertex, int(distance)))
    return "Total runtime: {} seconds".format(finished - started)
def test_exception(self):
    """Every read, write, save and load against a bad s3:// URL must raise
    IOError.

    When ``self.has_s3`` is false nothing is asserted; an info message is
    logged instead.
    """
    if not self.has_s3:
        logging.getLogger(__name__).info("No s3 bucket avaiable. Test pass.")
        return

    bad_bucket = "i_am_a_bad_bucket"
    prefix = "s3://" + bad_bucket
    failing_calls = [
        lambda: glconnect.get_unity().__read__("s3:///"),
        lambda: glconnect.get_unity().__read__(
            "s3://" + self.standard_bucket + "/somerandomfile"),
        lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile"),
        lambda: glconnect.get_unity().__write__(
            "s3://" + "/somerandomfile", "somerandomcontent"),
        lambda: glconnect.get_unity().__write__(
            "s3://" + self.standard_bucket + "I'amABadUrl/",
            "somerandomcontent"),
        lambda: self.graph.save(prefix + "/x.graph"),
        lambda: self.sframe.save(prefix + "/x.frame_idx"),
        lambda: self.model.save(prefix + "/x.model"),
        lambda: graphlab.load_graph(prefix + "/x.graph"),
        lambda: graphlab.load_sframe(prefix + "/x.frame_idx"),
        lambda: graphlab.load_model(prefix + "/x.model"),
    ]
    for failing_call in failing_calls:
        self.assertRaises(IOError, failing_call)
def run_ndegree_neigh_job(path_to_file, source_vertex, degree):
    """Finds the nth degree neighborhood on the specified graph using
    graphlab's API.

    Walks outgoing edges breadth-first from the source; the result is the
    set of vertices produced by the ``degree``-th hop.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)
    source_vertex : Long type
        The id of the source vertex
    degree : int type
        The degree of neighbors; a value <= 0 yields an empty neighborhood

    Returns
    -------
    runtime : String type
        The total runtime of the job (includes printing the result)
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')

    frontier = set([source_vertex])  # Start from source vertex - BFS
    # Bound up front so that degree <= 0 prints an empty neighborhood
    # instead of raising NameError.
    nth_neighbors = set()
    hops_left = degree
    # A plain countdown replaces the former `degree is 1` check, which
    # compared an int by identity rather than by value.
    while hops_left > 0:
        nth_neighbors = set()
        for vertex in frontier:
            outgoing_edges = graph.get_edges(src_ids=[vertex])
            nth_neighbors.update(list(outgoing_edges["__dst_id"]))
        frontier = nth_neighbors
        hops_left -= 1

    print("Neighorhood length: {}\nNeighbors:\n{}".format(
        len(nth_neighbors), nth_neighbors))
    finished = time.time()
    return "Total runtime: {} seconds".format(finished - started)
def _test_save_load_object_helper(testcase, obj, url):
    """
    Helper function to test save and load a server side object to a given url.

    The object is saved under ``url`` plus a type-specific suffix
    (".graph", ".model" or ".frame_idx"), loaded back, and compared with
    the original through ``testcase`` assertions. Matching files in the
    temp directory are removed afterwards.

    Parameters
    ----------
    testcase : the unittest.TestCase whose assert* methods are used.

    obj : a graphlab SGraph, Model or SFrame.

    url : destination prefix, optionally of the form "<protocol>://<path>".

    Raises
    ----------
    TypeError if obj is none of the supported types.
    """
    def cleanup(url):
        """
        Remove the saved file from temp directory.
        """
        splits = url.split("://")
        if len(splits) > 1:
            protocol = splits[0]
            path = splits[1]
        else:
            protocol = None
            path = url
        # Compare by value. The previous `is "local"` / `is "remote"`
        # tests compared object identity, which a string produced by
        # split() is not guaranteed to share with a literal, so cleanup
        # was silently skipped for those protocols.
        if not protocol or protocol in ("local", "remote"):
            tempdir = tempfile.gettempdir()
            # NOTE(review): `path` is used as an unescaped regex and is
            # matched against bare file names from the temp dir - confirm
            # this matches what the save calls actually create.
            pattern = path + ".*"
            for f in os.listdir(tempdir):
                if re.search(pattern, f):
                    os.remove(os.path.join(tempdir, f))

    if isinstance(obj, graphlab.SGraph):
        obj.save(url + ".graph")
        newobj = graphlab.load_graph(url + ".graph")
        testcase.assertItemsEqual(obj.get_fields(), newobj.get_fields())
        testcase.assertDictEqual(obj.summary(), newobj.summary())
    elif isinstance(obj, graphlab.Model):
        obj.save(url + ".model")
        newobj = graphlab.load_model(url + ".model")
        testcase.assertItemsEqual(obj.list_fields(), newobj.list_fields())
        testcase.assertEqual(type(obj), type(newobj))
    elif isinstance(obj, graphlab.SFrame):
        obj.save(url + ".frame_idx")
        newobj = graphlab.load_sframe(url + ".frame_idx")
        testcase.assertEqual(obj.shape, newobj.shape)
        testcase.assertEqual(obj.column_names(), newobj.column_names())
        testcase.assertEqual(obj.column_types(), newobj.column_types())
        # Compare full contents, not just the default head() preview.
        assert_frame_equal(obj.head(obj.num_rows()).to_dataframe(),
                           newobj.head(newobj.num_rows()).to_dataframe())
    else:
        raise TypeError
    cleanup(url)
def run_reachability_job(path_to_file, source_vertex, target_vertex,
                         max_depth):
    """Determines whether a target vertex is reachable from a source
    vertex on the specified graph using graphlab's API.

    The search itself is delegated to the ``is_reachable`` helper defined
    elsewhere.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)
    source_vertex : Long type
        The id of the source vertex
    target_vertex : Long type
        The id of the target vertex
    max_depth : int type
        The maximum recursion depth

    Returns
    -------
    runtime : String type
        The total runtime of the job (printing the verdict is not timed)
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    result = is_reachable(graph, source_vertex, target_vertex, max_depth)
    finished = time.time()

    # The target is the vertex being reached and the source is where the
    # search starts; the previous messages had the two ids swapped.
    if result:
        print("Vertex {} is reachable from vertex {}".format(
            target_vertex, source_vertex))
    else:
        print("Vertex {} cannot be reached from vertex {}".format(
            target_vertex, source_vertex))
    return "Total runtime: {} seconds".format(finished - started)
def run_ndegree_neigh_job(path_to_file, source_vertex, degree):
    """Finds the nth degree neighborhood on the specified graph using
    graphlab's API. This case ignores direction.

    NOTE(review): the result is every vertex id in the subgraph returned
    by ``get_neighborhood(radius=degree)``; that presumably includes the
    source itself and vertices closer than ``degree`` hops - confirm that
    this is the intended definition of the neighborhood.

    Parameters
    ----------
    path_to_file : String type
        The path leading to the edge list file (loaded in 'snap' format)
    source_vertex : Long type
        The id of the source vertex
    degree : int type
        The degree of neighbors (used as the neighborhood radius)

    Returns
    -------
    runtime : String type
        The total runtime of the job (includes printing the result)
    """
    started = time.time()
    # Use the caller-supplied path; the previous body read an undefined
    # name `path` and ignored this parameter.
    graph = gl.load_graph(path_to_file, 'snap')
    subgraph = graph.get_neighborhood(ids=[source_vertex], radius=degree,
                                      full_subgraph=False)
    nth_neighbors = set(subgraph.get_vertices()["__id"])
    print("Neighorhood length: {}\nNeighbors:\n{}".format(
        len(nth_neighbors), nth_neighbors))
    finished = time.time()
    return "Total runtime: {} seconds".format(finished - started)
#!/usr/bin/env python
# Times GraphLab Create's PageRank on a SNAP edge list.
# Only the pagerank.create call is timed; loading the graph is excluded.
import graphlab
import sys
import time

# Four positional arguments are required after the script name.
if len(sys.argv) < 5:
    print("Usage: %s <edge list filename> <epsilon> <damping factor> <max iterations>" % sys.argv[0])
    sys.exit(1)

filename = sys.argv[1]
print(filename)

# from https://dato.com/products/create/docs/generated/graphlab.pagerank.create.html
epsilon = float(sys.argv[2])
damping = float(sys.argv[3])
maxIterations = int(sys.argv[4])

g = graphlab.load_graph(filename, format='snap')

# NOTE(review): the "damping factor" argument is passed straight through
# as reset_probability - confirm callers supply the value they intend.
start = time.time()
pr = graphlab.pagerank.create(
    g,
    reset_probability=damping,
    max_iterations=maxIterations,
    _distributed=False,
    threshold=epsilon,
)
end = time.time()

pr_out = pr['pagerank']  # SFrame
# Copy the scores back onto the input graph's vertices.
g.vertices['pagerank'] = pr['graph'].vertices['pagerank']
print(pr_out)
print('time: ' + str(end - start))
del g.vertices['total_weight'] # initialize vertex field g.vertices['prev_pagerank'] = 1.0 it = 0 total_l1_delta = len(g.vertices) start = time.time() while(total_l1_delta > threshold and it < max_iterations): g.vertices['pagerank'] = 0.0 g = g.triple_apply(pagerank_update_fn, ['pagerank']) g.vertices['pagerank'] = g.vertices['pagerank'] * (1 - reset_prob) \ + reset_prob g.vertices['l1_delta'] = (g.vertices['pagerank'] - \ g.vertices['prev_pagerank']).apply(lambda x: abs(x)) total_l1_delta = g.vertices['l1_delta'].sum() g.vertices['prev_pagerank'] = g.vertices['pagerank'] print 'Iteration %d: total pagerank changed in L1 = %f' % (it,\ total_l1_delta) it = it + 1 print 'Triple apply pagerank finished in: %f secs' % (time.time() - start) del g.vertices['prev_pagerank'] return g # Load graph g = gl.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', 'snap') g.edges['weight'] = 1.0 # Run triple apply sssp pagerank_graph = pagerank_triple_apply(g) print pagerank_graph
# Single-source shortest paths (from vertex 1) on the SNAP com-Youtube
# graph using GraphLab Create.
# NOTE(review): the banner below says 100,000 nodes, but SNAP lists
# com-Youtube with 1,134,890 nodes - confirm which figure is intended.
import graphlab

print("-----Nodes: 100,000-----")
g = graphlab.load_graph(
    'http://snap.stanford.edu/data/bigdata/communities/com-youtube.ungraph.txt.gz',
    format='snap')
sp = graphlab.shortest_path.create(g, source_vid=1)
sp_sframe = sp['distance']

# The bare `sp_sframe` expression statements that used to sit around this
# banner displayed nothing outside an interactive session, so they were
# dropped; print_rows does the actual output.
print("---------------------")
sp_sframe.print_rows(100, 3)
print("----above are the first 100 computed vertices-----")
# PageRank on the 1000-node dataset (../dataset/pr_1000.txt) using
# GraphLab Create.
import graphlab

print("-----1000 node Dataset-----")
g = graphlab.load_graph("../dataset/pr_1000.txt", format="snap")
pr = graphlab.pagerank.create(g)
pr_out = pr["pagerank"]

# The bare `pr_out` expression statement that used to follow this banner
# displayed nothing outside an interactive session, so it was dropped;
# print_rows does the actual output.
print("---------------------")
pr_out.print_rows(100, 3)
print("----above are the first 100 computed vertices-----")
def setUp(self):
    """Load the p2p-Gnutella04 SNAP edge list from the dataset server into
    ``self.graph`` before each test."""
    self.graph = gl.load_graph(dataset_server + "p2p-Gnutella04.txt.gz",
                               format='snap')
#coding:utf-8 __author__ = 'zlj' import sys reload(sys) sys.setdefaultencoding('utf8') import graphlab as gl gl.load_graph()
# Per-lambda RMSE histories.
# Example usage:
# rmse_train[0.1] = [r1, r2, r3, ...]
# where 0.1 is the regularization parameter (lambda),
# r1 is the RMSE on the training data after 1 pass over the data,
# r2 is the RMSE on the training data after 2 passes over the data,
# etc
rmse_train = {}
# Same thing, but for validation data
rmse_val = {}
# Same thing, but for test data
rmse_test = {}

lambs = [0, 0.001, 0.01, 0.1, 1]

# You should not have to edit any of the code below, except to plot figures.
for l in lambs:
    g = graphlab.load_graph('data/training_graph.sgraph')
    n, m = M.shape
    # Both factors start as all-ones, each one row/column larger than M.
    L = np.ones((n + 1, k))
    R = np.ones((k, m + 1))
    lambda_u = lambda_v = l
    rmse_train[l] = []
    rmse_val[l] = []
    rmse_test[l] = []

    # Get the initial rmse, before we do anything.
    # Only the nonzero entries of each matrix take part in the error.
    train_idx = M.nonzero()
    train_sq_err = (M[train_idx] - L.dot(R)[train_idx])**2
    rmse = np.sqrt(sum(train_sq_err) / len(M[train_idx]))
    rmse_train[l].append(rmse)

    val_idx = val.nonzero()
    val_sq_err = (val[val_idx] - L.dot(R)[val_idx])**2
    rmse = np.sqrt(sum(val_sq_err) / len(val[val_idx]))
    rmse_val[l].append(rmse)
# PageRank on the SNAP com-Youtube graph (about 1 million nodes) using
# GraphLab Create.
import graphlab

print("-----Youtube Dataset-----")
print("-----Nodes: 1134890 Edges: 2987624-----")
g = graphlab.load_graph(
    'http://snap.stanford.edu/data/bigdata/communities/com-youtube.ungraph.txt.gz',
    format='snap')
pr = graphlab.pagerank.create(g)
pr_out = pr['pagerank']

# The bare `pr_out` expression statement that used to follow this banner
# displayed nothing outside an interactive session, so it was dropped;
# print_rows does the actual output.
print("---------------------")
pr_out.print_rows(100, 3)
print("----above are the first 100 computed vertices-----")
print 'Iteration %d: total pagerank changed in L1 = %f' % (it, \ total_l1_delta) it = it + 1 print 'Triple apply pagerank finished in: %f secs' % (time.time() - start) del g.vertices['prev_pagerank'] return g if __name__ == '__main__': parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument("--threshold", type=float, nargs='?', const=True, default=1e-3, help="threshold") parser.add_argument("--max_iteration", type=float, nargs='?', const=True, default=20, help="max iterations") args = parser.parse_args() threshold = args.threshold max_iteration = args.max_iteration print "Start pagerank with threshold=%s, max_iteration=%s" % (str(threshold), str(max_iteration)) g = gl.load_graph('https://snap.stanford.edu/data/web-Google.txt.gz', 'snap') g.edges['weight'] = 1.0 pagerank_graph = pagerank_triple_apply(g, threshold=threshold, max_iterations=max_iteration) output_file = './result_%s_%s.txt' % (str(threshold), str(max_iteration)) with open(output_file, 'w') as f: sorted = pagerank_graph.vertices.sort('pagerank', ascending=False) sorted.print_rows(100, output_file=f)
# Single-source shortest paths (from vertex 1) on the 1000-node dataset
# using GraphLab Create.
import graphlab

print("-----Nodes: 1000-----")
g = graphlab.load_graph('../dataset/pr_1000.txt', format='snap')
sp = graphlab.shortest_path.create(g, source_vid=1)
sp_sframe = sp['distance']

# The bare `sp_sframe` expression statements that used to sit around this
# banner displayed nothing outside an interactive session, so they were
# dropped; print_rows does the actual output.
print("---------------------")
sp_sframe.print_rows(100, 3)
print("----above are the first 100 computed vertices-----")
# Single-source shortest paths (from vertex 1) on the SNAP email-Enron
# graph (about 36000 nodes) using GraphLab Create.
import graphlab

print("-----Nodes: 36692 Edges: 367662-----")
g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
                        format='snap')
sp = graphlab.shortest_path.create(g, source_vid=1)
sp_sframe = sp['distance']

# The bare `sp_sframe` expression statements that used to sit around this
# banner displayed nothing outside an interactive session, so they were
# dropped; print_rows does the actual output.
print("---------------------")
sp_sframe.print_rows(100, 3)
print("----above are the first 100 computed vertices-----")
#!/usr/bin/env python
# Benchmark script: runs GraphLab Create's PageRank on a SNAP edge list
# and prints the scores plus the time spent inside pagerank.create.
import graphlab
import sys
import time

# The script name plus four positional arguments are required.
if len(sys.argv) < 5:
    usage = "Usage: %s <edge list filename> <epsilon> <damping factor> <max iterations>" % sys.argv[0]
    print(usage)
    sys.exit(1)

filename = sys.argv[1]
print(filename)

# from https://dato.com/products/create/docs/generated/graphlab.pagerank.create.html
epsilon = float(sys.argv[2])
damping = float(sys.argv[3])
maxIterations = int(sys.argv[4])

g = graphlab.load_graph(filename, format='snap')

# Graph loading above is deliberately outside the timed region.
# NOTE(review): the "damping factor" argument is forwarded unchanged as
# reset_probability - confirm callers supply the value they intend.
start = time.time()
pr = graphlab.pagerank.create(g,
                              reset_probability=damping,
                              max_iterations=maxIterations,
                              _distributed=False,
                              threshold=epsilon)
end = time.time()

pr_out = pr['pagerank']  # SFrame
# Attach the computed scores to the input graph's vertices as well.
g.vertices['pagerank'] = pr['graph'].vertices['pagerank']
print(pr_out)
print('time: ' + str(end - start))
# Single-source shortest paths (from vertex 1) on the 1000-node dataset
# using GraphLab Create.
import graphlab

print("-----Nodes: 1000-----")
g = graphlab.load_graph('../dataset/pr_1000.txt', format='snap')
sp = graphlab.shortest_path.create(g, source_vid=1)
sp_sframe = sp['distance']

# The bare `sp_sframe` expression statements that used to sit around this
# banner displayed nothing outside an interactive session, so they were
# dropped; print_rows does the actual output.
print("---------------------")
sp_sframe.print_rows(100, 3)
print("----above are the first 100 computed vertices-----")
# Single-source shortest paths (from vertex 1) on the SNAP email-Enron
# graph (about 36000 nodes) using GraphLab Create.
import graphlab

print("-----Nodes: 36692 Edges: 367662-----")
g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
                        format='snap')
sp = graphlab.shortest_path.create(g, source_vid=1)
sp_sframe = sp['distance']

# The bare `sp_sframe` expression statements that used to sit around this
# banner displayed nothing outside an interactive session, so they were
# dropped; print_rows does the actual output.
print("---------------------")
sp_sframe.print_rows(100, 3)
print("----above are the first 100 computed vertices-----")