#!/usr/bin/env python
# -*- coding: utf-8 -*-

from pyspark import SparkContext
from pyspark.sql import SQLContext

from graphframes import *
from graphframes.examples import Graphs

# Start a local Spark context for the 'barcos' application and wrap it in an
# SQLContext, which is what the graphframes example factory is given below.
sc = SparkContext(master='local', appName='barcos')
sq = SQLContext(sc)

# Load the "friends" example graph.
friends = Graphs(sq).friends()

# Print the complete vertex and edge DataFrames.
friends.vertices.show()
friends.edges.show()

# Narrow the graph down: keep only edges whose relationship is 'friend' and
# only vertices with age > 30, then combine both into a new GraphFrame.
only_friends = friends.edges.filter("relationship = 'friend'")
over30 = friends.vertices.filter("age > 30")
friends_over_30 = GraphFrame(over30, only_friends)

# Print the triplets view of the narrowed graph.
friends_over_30.triplets.show()

# Example #2
0
# Load the vertex and edge tables from CSV. The first row of each file is
# treated as the header and column types are inferred.
# NOTE(review): `spark` is not assigned anywhere above this line in the
# visible file -- presumably the session object provided by the pyspark
# shell; confirm before running this as a standalone script.
v = spark.read.csv(
    '/Users/pc/spark-2.1.0-bin-hadoop2.7/vertices.csv',
    header=True,
    inferSchema=True,
)
e = spark.read.csv(
    '/Users/pc/spark-2.1.0-bin-hadoop2.7/edges.csv',
    header=True,
    inferSchema=True,
)

# Build the graph from the two DataFrames.
g = GraphFrame(v, e)

# Optional inspection calls, left disabled:
# g.vertices.show()
# g.edges.show()
# g.inDegrees.show()
# g.outDegrees.show()

# Run PageRank for 20 iterations with a reset probability of 0.01, then
# print the vertices from highest to lowest rank.
results = g.pageRank(maxIter=20, resetProbability=0.01)
(
    results.vertices
    .select("id", "pagerank")
    .orderBy("pagerank", ascending=False)
    .show()
)

# --- Connected components ---

from graphframes.examples import Graphs

# NOTE(review): `sqlContext` is not assigned anywhere above this line in the
# visible file -- presumably the object provided by the pyspark shell; confirm.
g = Graphs(sqlContext).friends()

# A checkpoint directory is set on the context before the algorithm runs.
# NOTE(review): 'User/pc/checkpoint' is a relative path, whereas the CSV
# paths used earlier in this file start with '/Users/pc/' -- confirm that a
# relative directory is really what is wanted here.
sc.setCheckpointDir('User/pc/checkpoint')

# Compute the components and print each vertex id with its component id,
# ordered by component.
result = g.connectedComponents()
(
    result
    .select("id", "component")
    .orderBy("component")
    .show()
)
"""
+---+---------+
| id|component|
+---+---------+
|  a|        0|
|  d|        0|
|  e|        0|
|  f|        0|
|  b|        0|
|  c|        0|
+---+---------+
All of the vertices listed above belong to a single component, whose id is 0.
"""
# Vertex DataFrame: one row per person, with columns (id, name, age).
v = sqlContext.createDataFrame(
    [
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
        ("d", "David", 29),
        ("e", "Esther", 32),
        ("f", "Fanny", 36),
        ("g", "Gabby", 60),
    ],
    ["id", "name", "age"],
)

# Edge DataFrame: one row per directed edge, with columns
# (src, dst, relationship).
e = sqlContext.createDataFrame(
    [
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
        ("f", "c", "follow"),
        ("e", "f", "follow"),
        ("e", "d", "friend"),
        ("d", "a", "friend"),
        ("a", "e", "friend"),
    ],
    ["src", "dst", "relationship"],
)

# Assemble a GraphFrame from the two DataFrames.
g = GraphFrame(v, e)

# Rebind g to the packaged example graph; this replaces the graph that was
# just built by hand.
g = Graphs(sqlContext).friends()

# Print the vertex DataFrame. Expected output:
g.vertices.show()
# +--+-------+---+
# |id|   name|age|
# +--+-------+---+
# | a|  Alice| 34|
# | b|    Bob| 36|
# | c|Charlie| 30|
# | d|  David| 29|
# | e| Esther| 32|
# | f|  Fanny| 36|
# | g|  Gabby| 60|
# +--+-------+---+
# Example #4
0
import pyspark
from pyspark.sql import SparkSession

# Reuse the active SparkContext if one exists, otherwise create one.
sc = pyspark.SparkContext.getOrCreate()

# Register the graphframes jar with the context.
# NOTE(review): the graphframes imports below come after this call,
# presumably so that they can be resolved from the jar -- keep this order.
sc.addPyFile("/home/dhuy237/.ivy2/jars/graphframes_graphframes-0.8.1-spark3.0-s_2.12.jar")

from graphframes import GraphFrame
from graphframes.examples import Graphs

# Set a checkpoint directory on the context before running the algorithm.
sc.setCheckpointDir("/tmp/graphframes_cps")

# Load the packaged example graph.
# NOTE(review): `sqlContext` is not assigned in this snippet -- presumably
# supplied by the interactive environment; confirm.
g = Graphs(sqlContext).friends()

# Compute the components and print each vertex id with its component id,
# ordered by component.
result = g.connectedComponents()
(
    result
    .select("id", "component")
    .orderBy("component")
    .show()
)
# Example #5
0
from pyspark import SparkConf, SparkContext, SQLContext
from graphframes import GraphFrame
from graphframes.examples import Graphs

# Spark configuration: local master, application name "My app", and 1g of
# executor memory.
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.executor.memory", "1g")

# Create the contexts from that configuration.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Load the packaged example graph and print its vertices and edges.
g = Graphs(sqlContext).friends()

g.vertices.show()
g.edges.show()

# Other algorithms that can be tried on the same graph, left disabled:

#g.bfs("name = 'Charlie'", "age > 30").show()

#result = g.stronglyConnectedComponents(maxIter=10)
#result.select("id", "component").orderBy("component").show()

#result = g.labelPropagation(maxIter=5)
#result.select("id", "label").show()

# Shortest paths with vertices "a" and "d" as landmarks; print the
# "distances" column next to each vertex id.
results = g.shortestPaths(landmarks=["a", "d"])
results.select("id", "distances").show()
from graphframes.examples import Graphs
# Load the packaged example graph.
g = Graphs(sqlContext).friends()

# Variant 1: run PageRank until it converges to within the tolerance "tol".
results = g.pageRank(tol=0.01, resetProbability=0.15)

# Print the rank of every vertex and the final weight of every edge.
# The printed pagerank values may be truncated (for example, the E-notation
# part can be cut off); on Spark 1.5+ show(truncate=False) avoids this.
results.vertices.select("id", "pagerank").show()
results.edges.select("src", "dst", "weight").show()

# Variant 2: run PageRank for a fixed number of iterations.
results2 = g.pageRank(maxIter=10, resetProbability=0.15)

# Variant 3: PageRank personalized for vertex "a".
results3 = g.pageRank(maxIter=10, resetProbability=0.15, sourceId="a")