def PageRank(url='/home/gengl/Datasets/PageRank/BerkStan/edge.txt',
             reset_probability=0.2, threshold=0.0001, max_iterations=1000):
    """Build a graph from an edge-list file and run PageRank on it.

    Every argument defaults to the value that used to be hard-coded, so a
    bare ``PageRank()`` call performs exactly the original BerkStan run.

    Args:
        url: Location of the edge file. It is read as a headerless,
            tab-delimited file with two integer columns.
        reset_probability: Forwarded unchanged to ``pagerank.create``.
        threshold: Forwarded unchanged to ``pagerank.create``.
        max_iterations: Forwarded unchanged to ``pagerank.create``.

    Returns:
        None. The function only calls ``summary()`` on the resulting model.
    """
    edges = SFrame.read_csv(url, delimiter='\t', header=False,
                            column_type_hints=[int, int])
    # The two headerless columns are addressed as X1 (source) and
    # X2 (destination).
    graph = SGraph().add_edges(edges, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph,
                               reset_probability=reset_probability,
                               threshold=threshold,
                               max_iterations=max_iterations,
                               _distributed=True)
    pr_model.summary()
def PageRank(url='/clueweb/PageRank/clueweb_20M/edge_pair.txt',
             reset_probability=0.2, threshold=1e-12, max_iterations=42):
    """Build a graph from an edge-list file and run PageRank on it.

    Every argument defaults to the value that used to be hard-coded, so a
    bare ``PageRank()`` call performs exactly the original clueweb_20M run.

    Args:
        url: Location of the edge file. It is read as a headerless,
            tab-delimited file with two integer columns.
        reset_probability: Forwarded unchanged to ``pagerank.create``.
        threshold: Forwarded unchanged to ``pagerank.create``.
        max_iterations: Forwarded unchanged to ``pagerank.create``.

    Returns:
        None. The function only calls ``summary()`` on the resulting model.
    """
    edges = SFrame.read_csv(url, delimiter='\t', header=False,
                            column_type_hints=[int, int])
    # The two headerless columns are addressed as X1 (source) and
    # X2 (destination).
    graph = SGraph().add_edges(edges, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph,
                               reset_probability=reset_probability,
                               threshold=threshold,
                               max_iterations=max_iterations,
                               _distributed=True)
    pr_model.summary()
def getToHighlight(G):
    """Collect the ids of vertices whose PageRank value is above 0.2.

    Runs ``pagerank.create`` on ``G``, reads the ``'pagerank'`` entry of the
    result and gathers every ``'__id'`` whose ``'pagerank'`` value is
    greater than 0.2. Each selected score is also printed.

    Args:
        G: Graph handed to ``pagerank.create`` (the printed notice says
            SGraph only).

    Returns:
        A set of vertex ids, or None when GraphLab is not installed or
        PageRank could not be run. If a failure happens while iterating,
        the ids gathered so far are returned.
    """
    if not graphlab_is_installed:
        print("GraphLab is not installed!")
        # Nothing below can work without GraphLab, so stop here instead of
        # falling through into a call that is bound to fail.
        return None

    print("Highlights nodes, for SGraph only!")
    important = None
    try:
        pr_out = pagerank.create(G)['pagerank']  # SFrame
        important = set()
        for vertex_id, score in zip(pr_out['__id'], pr_out['pagerank']):
            if score > 0.2:
                important.add(vertex_id)
                # NOTE(review): the original indentation was lost; the score
                # print is assumed to belong to the threshold branch.
                print(score)
    except Exception as err:
        # Highlighting is best-effort: report the problem rather than
        # crash, but no longer swallow KeyboardInterrupt/SystemExit.
        print("PageRank highlighting failed: %s" % err)
    return important
def pagerank(self):
    """Run PageRank on ``self._graph`` and return the model's 'pagerank' entry."""
    # NOTE(review): assumes this is a method inside a class, so the
    # `pagerank` used below resolves to the module-level name rather than
    # to this function itself; confirm against the enclosing scope.
    model = pagerank.create(self._graph)
    return model['pagerank']
# Runs triangle counting, PageRank and connected components on the pokec
# edge list stored in HDFS, after creating a Hadoop cluster handle.
# PageRank and connected components are timed and the elapsed time printed.
import datetime

import graphlab as gl  # `gl` is used below but was never imported here
from graphlab import SFrame, SGraph
from graphlab import connected_components
from graphlab import pagerank
from graphlab import triangle_counting

# Create cluster
c = gl.deploy.hadoop_cluster.create(
    name='test-cluster',
    dato_dist_path='hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/dato/tmp',
    hadoop_conf_dir='/usr/local/hadoop/etc/hadoop',
    num_containers=3)
print(c)

# Load the headerless, tab-delimited edge list.
url = 'hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/data/pokec.txt'
data = SFrame.read_csv(url, delimiter='\t', header=False)
# NOTE(review): edges are added with X2 as the source and X1 as the
# destination, i.e. the reverse of the column order; confirm this is
# intended.
g = SGraph().add_edges(data, src_field='X2', dst_field='X1')

# Triangle counting
tc = triangle_counting.create(g)
tc_out = tc['triangle_count']

# PageRank: the timestamps used to be computed and thrown away, so
# nothing was reported when this ran as a script. Capture them and print
# the elapsed time instead.
pr_start = datetime.datetime.now()
pr = pagerank.create(g, threshold=0.001)
print("PageRank elapsed: %s" % (datetime.datetime.now() - pr_start))

# Connected Components, timed the same way.
cc_start = datetime.datetime.now()
cc = connected_components.create(g)
print("Connected components elapsed: %s" % (datetime.datetime.now() - cc_start))