def PageRank(url='/home/gengl/Datasets/PageRank/BerkStan/edge.txt',
             reset_probability=0.2,
             threshold=0.0001,
             max_iterations=1000):
    """Run GraphLab PageRank over an edge-list file and print the summary.

    Called with no arguments this reproduces the original hard-coded
    BerkStan run; every setting can be overridden by keyword.

    Args:
        url: Location of a tab-separated, header-less edge list. Each row
            is parsed as two integers: source id, then destination id.
        reset_probability: Forwarded unchanged to ``pagerank.create``.
        threshold: Forwarded unchanged to ``pagerank.create``.
        max_iterations: Forwarded unchanged to ``pagerank.create``.

    Returns:
        None. The model summary is emitted by ``summary()``.
    """
    edges = SFrame.read_csv(url,
                            delimiter='\t',
                            header=False,
                            column_type_hints=[int, int])
    # The header-less columns are addressed as X1 (source) and X2 (target).
    graph = SGraph().add_edges(edges, src_field='X1', dst_field='X2')
    model = pagerank.create(graph,
                            reset_probability=reset_probability,
                            threshold=threshold,
                            max_iterations=max_iterations,
                            _distributed=True)
    model.summary()
# Example #2
# 0
def PageRank(url='/clueweb/PageRank/clueweb_20M/edge_pair.txt',
             reset_probability=0.2,
             threshold=0.000000000001,
             max_iterations=42):
    """Compute PageRank for an edge-pair file with GraphLab and print a summary.

    The defaults are the values that used to be hard-coded (the clueweb_20M
    edge list), so existing ``PageRank()`` calls behave exactly as before.

    Args:
        url: Path of the edge file: tab-separated, no header row, two
            integer columns (source vertex id, destination vertex id).
        reset_probability: Passed straight through to ``pagerank.create``.
        threshold: Passed straight through to ``pagerank.create``.
        max_iterations: Passed straight through to ``pagerank.create``.

    Returns:
        None. ``summary()`` is called on the resulting model.
    """
    edge_frame = SFrame.read_csv(url,
                                 delimiter='\t',
                                 header=False,
                                 column_type_hints=[int, int])
    # Without a header the two columns are referenced as X1 and X2.
    graph = SGraph().add_edges(edge_frame, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph,
                               reset_probability=reset_probability,
                               threshold=threshold,
                               max_iterations=max_iterations,
                               _distributed=True)
    pr_model.summary()
def getToHighlight(G, threshold=0.2):
    """Return the ids of vertices whose PageRank score exceeds ``threshold``.

    This is a best-effort helper: it never raises for ordinary errors, it
    reports them on stdout instead.

    Args:
        G: Graph handed to ``pagerank.create``; only an SGraph is supported.
        threshold: Score a vertex must strictly exceed to be highlighted.
            Defaults to the previously hard-coded 0.2.

    Returns:
        A set of vertex ids, or None when GraphLab is not installed or
        PageRank failed before any score was read. If an error occurs
        while the scores are being scanned, the ids collected so far are
        returned.
    """
    if not graphlab_is_installed:
        print("GraphLab is not installed!")
        # Nothing below can work without GraphLab, so stop here.
        return None

    important = None
    try:
        print("Highlights nodes, for SGraph only!")
        pr = pagerank.create(G)
        pr_out = pr['pagerank']     # SFrame

        important = set()
        for node_id, score in zip(pr_out['__id'], pr_out['pagerank']):
            if score > threshold:
                important.add(node_id)
            print(score)
    except Exception as err:
        # Stay best-effort, but say what went wrong rather than hiding it;
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Could not compute nodes to highlight: %s" % (err,))
    return important
# Example #4
# 0
 def pagerank(self):
     """Run ``pagerank.create`` on ``self._graph``.

     Returns the value stored under the model's ``'pagerank'`` key.
     """
     model = pagerank.create(self._graph)
     return model['pagerank']
# Example #5
# 0
import datetime

import graphlab as gl
from graphlab import SFrame, SGraph
from graphlab import connected_components, pagerank, triangle_counting

# Create cluster
c = gl.deploy.hadoop_cluster.create(
    name='test-cluster',
    dato_dist_path='hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/dato/tmp',
    hadoop_conf_dir='/usr/local/hadoop/etc/hadoop',
    num_containers=3)
print(c)

# Load the tab-separated, header-less edge list and build the graph.
url = 'hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/data/pokec.txt'
data = SFrame.read_csv(url, delimiter='\t', header=False)
# Edges run from column X2 (source) to column X1 (destination).
g = SGraph().add_edges(data, src_field='X2', dst_field='X1')


# triangle counting
tc = triangle_counting.create(g)
tc_out = tc['triangle_count']


# pagerank
# Keep the start time so the elapsed time can actually be reported; a bare
# datetime.now() expression is discarded when this runs as a script.
pr_start = datetime.datetime.now()
pr = pagerank.create(g, threshold=0.001)
print("pagerank took %s" % (datetime.datetime.now() - pr_start))


# Connected Components
cc_start = datetime.datetime.now()
cc = connected_components.create(g)
print("connected_components took %s" % (datetime.datetime.now() - cc_start))