#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Load the bundled GraphFrames "friends" example graph and derive a subgraph
# restricted to people over 30 connected by 'friend' relationships.
from pyspark import SparkContext
from pyspark.sql import SQLContext
from graphframes import *
from graphframes.examples import Graphs

spark_ctx = SparkContext('local', 'barcos')
sql_ctx = SQLContext(spark_ctx)

# The example graph: a small social network with 'friend'/'follow' edges.
social = Graphs(sql_ctx).friends()
social.vertices.show()
social.edges.show()

# Subgraph construction: filter vertices and edges independently, then
# recombine them into a new GraphFrame.
adults = social.vertices.filter("age > 30")
friendship_edges = social.edges.filter("relationship = 'friend'")
adult_friendships = GraphFrame(adults, friendship_edges)

# triplets shows each edge together with its source and destination vertices.
friends_over_30 = adult_friendships
friends_over_30.triplets.show()
# PageRank over a CSV-backed graph, then connected components on the example
# graph.
# NOTE(review): this snippet assumes a pyspark shell where `spark`, `sc`, and
# `sqlContext` are predefined -- confirm before running it as a standalone
# script.
v = spark.read.csv('/Users/pc/spark-2.1.0-bin-hadoop2.7/vertices.csv',
                   header=True, inferSchema=True)
e = spark.read.csv('/Users/pc/spark-2.1.0-bin-hadoop2.7/edges.csv',
                   header=True, inferSchema=True)
g = GraphFrame(v, e)
# g.vertices.show()
# g.edges.show()
# g.inDegrees.show()
# g.outDegrees.show()

# Fixed-iteration PageRank; resetProbability is the random-teleport chance.
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").orderBy("pagerank", ascending=False).show()

### connected component ###
from graphframes.examples import Graphs
g = Graphs(sqlContext).friends()
# BUG FIX: the original passed the relative path 'User/pc/checkpoint'
# (missing the leading slash and the 's' in 'Users'); use an absolute path
# consistent with the CSV paths above. connectedComponents() requires a
# checkpoint directory to be set.
sc.setCheckpointDir('/Users/pc/checkpoint')
result = g.connectedComponents()
result.select("id", "component").orderBy("component").show()
# BUG FIX: the expected-output block below was an unterminated triple-quoted
# string (syntax error); it is now a comment.
# +---+---------+
# | id|component|
# +---+---------+
# |  a|        0|
# |  d|        0|
# |  e|        0|
# |  f|        0|
# |  b|        0|
# |  c|        0|
# +---+---------+
# All vertices share component id 0, i.e. the graph is a single connected
# component.
# Build the GraphFrames example social network by hand, then load the bundled
# copy of the same graph.
# NOTE(review): assumes a pyspark shell where `sqlContext`, `GraphFrame`, and
# `Graphs` are already in scope -- confirm before running standalone.

# Vertex DataFrame: one row per person (id, name, age).
people = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36),
    ("g", "Gabby", 60),
]
v = sqlContext.createDataFrame(people, ["id", "name", "age"])

# Edge DataFrame: directed src -> dst relationships.
relations = [
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend"),
]
e = sqlContext.createDataFrame(relations, ["src", "dst", "relationship"])

# Create a GraphFrame from the two DataFrames.
g = GraphFrame(v, e)
# NOTE(review): the next line immediately replaces the hand-built graph with
# the bundled example graph (same data) -- the assignment above is shadowed.
g = Graphs(sqlContext).friends()  # Get example graph

# Display the vertex DataFrame.
g.vertices.show()
# Expected output:
# +--+-------+---+
# |id|   name|age|
# +--+-------+---+
# | a|  Alice| 34|
# | b|    Bob| 36|
# | c|Charlie| 30|
# | d|  David| 29|
# | e| Esther| 32|
# | f|  Fanny| 36|
# | g|  Gabby| 60|
# +--+-------+---+
# Connected components on the GraphFrames example graph, with the GraphFrames
# jar attached at runtime.
import pyspark
from pyspark.sql import SparkSession

# Reuse a live SparkContext if one exists (e.g. inside a notebook).
sc = pyspark.SparkContext.getOrCreate()
# Ship the GraphFrames Python code bundled inside the jar to the executors.
sc.addPyFile(
    "/home/dhuy237/.ivy2/jars/graphframes_graphframes-0.8.1-spark3.0-s_2.12.jar"
)

from graphframes.examples import Graphs
from graphframes import GraphFrame

# connectedComponents() requires a checkpoint directory to be configured.
sc.setCheckpointDir("/tmp/graphframes_cps")

# BUG FIX: the original referenced an undefined name `sqlContext` (only the
# SparkSession class was imported, never instantiated). Build a SparkSession
# from the existing context; Graphs accepts it in place of a SQLContext since
# both expose createDataFrame.
spark = SparkSession.builder.getOrCreate()
g = Graphs(spark).friends()  # Get example graph
result = g.connectedComponents()
result.select("id", "component").orderBy("component").show()
# Shortest-path distances from every vertex to the landmark vertices "a" and
# "d" on the GraphFrames example graph.
from pyspark import SparkConf, SparkContext, SQLContext
from graphframes import GraphFrame
from graphframes.examples import Graphs

config = (
    SparkConf()
    .setMaster("local")
    .setAppName("My app")
    .set("spark.executor.memory", "1g")
)
sc = SparkContext(conf=config)
sqlContext = SQLContext(sc)

# The small bundled social-network example graph.
g = Graphs(sqlContext).friends()
g.vertices.show()
g.edges.show()

# Other algorithms that can be tried on the same graph:
#g.bfs("name = 'Charlie'", "age > 30").show()
#result = g.stronglyConnectedComponents(maxIter=10)
#result.select("id", "component").orderBy("component").show()
#result = g.labelPropagation(maxIter=5)
#result.select("id", "label").show()

# For each vertex, a map from landmark id to hop distance.
results = g.shortestPaths(landmarks=["a", "d"])
results.select("id", "distances").show()
# Three PageRank variants on the GraphFrames example graph.
# NOTE(review): assumes a pyspark shell where `sqlContext` is predefined --
# confirm before running standalone.
from graphframes.examples import Graphs

g = Graphs(sqlContext).friends()  # Get example graph

# Variant 1: run PageRank until convergence to tolerance "tol".
results = g.pageRank(resetProbability=0.15, tol=0.01)
# Display resulting pageranks and final edge weights.
# Note that the displayed pagerank may be truncated, e.g., missing the E
# notation. In Spark 1.5+, you can use show(truncate=False) to avoid
# truncation.
results.vertices.select("id", "pagerank").show()
results.edges.select("src", "dst", "weight").show()

# Variant 2: run PageRank for a fixed number of iterations.
results2 = g.pageRank(resetProbability=0.15, maxIter=10)

# Variant 3: PageRank personalized for vertex "a" (teleports return to "a").
results3 = g.pageRank(resetProbability=0.15, maxIter=10, sourceId="a")