def test_gf(self):
    vertices = spark.createDataFrame([
        ('1', 'Carter', 'Derrick', 50),
        ('2', 'May', 'Derrick', 26),
        ('3', 'Mills', 'Jeff', 80),
        ('4', 'Hood', 'Robert', 65),
        ('5', 'Banks', 'Mike', 93),
        ('98', 'Berg', 'Tim', 28),
        ('99', 'Page', 'Allan', 16)
    ], ['id', 'name', 'firstname', 'age'])
    edges = spark.createDataFrame([
        ('1', '2', 'friend'), ('2', '1', 'friend'),
        ('3', '1', 'friend'), ('1', '3', 'friend'),
        ('2', '3', 'follows'), ('3', '4', 'friend'),
        ('4', '3', 'friend'), ('5', '3', 'friend'),
        ('3', '5', 'friend'), ('4', '5', 'follows'),
        ('98', '99', 'friend'), ('99', '98', 'friend')
    ], ['src', 'dst', 'type'])
    g = GraphFrame(vertices, edges)
    g.connectedComponents().show()
def main():
    # create a Spark session
    spark = SparkSession.builder.appName("keepindoors graphx connectedComponents()").getOrCreate()
    # get a mongo client
    cli = mongo.__get__()

    # vertices: ["id", "url", "title", "datetime"]
    localVertices = []
    cursor = mongo.getCollection(cli, "keepindoors", "docs").find()
    for r in cursor:
        # use "docno" as the vertex id; the raw "_id" ObjectId is not passed on,
        # since it would throw an error in createDataFrame
        r["id"] = r["docno"]
        localVertices.append((r["docno"], r["url"], r["title"],
                              str(r["_id"].generation_time + timedelta(hours=8))))

    # edges
    cursor = mongo.getCollection(cli, "keepindoors", "distances").find()
    localEdges = []
    for r in cursor:
        localEdges.append((r["docno1"], r["docno2"], r["distance"]))

    v = spark.createDataFrame(localVertices, ["id", "url", "title", "datetime"])
    e = spark.createDataFrame(localEdges, ["src", "dst", "distance"])
    g = GraphFrame(v, e)

    # connectedComponents() requires a checkpoint directory;
    # get the SparkContext from the SparkSession
    spark.sparkContext.setCheckpointDir("/tmp/spark/checkpoint")
    result = g.connectedComponents()

    # order by component, then by datetime descending
    result = result.orderBy(["component", "datetime"], ascending=[1, 0]).collect()

    # group rows into a dict keyed by component
    component_dict = {}
    for row in result:
        record = row.asDict()
        if record["component"] not in component_dict:
            component_dict[record["component"]] = []
        component_dict[record["component"]].append(record)

    # clear the mongo collection "components"
    mongo.deleteAll(cli, "keepindoors", "components")

    # save component_dict into mongo, one document per component
    index = 1
    for key, item in component_dict.items():
        links = []
        titles = []
        title = "empty title"
        update_time = "1970-01-01 00:00:00+00:00"
        for doc in item:
            titles.append(doc["title"])
            links.append(doc["url"])
            if doc["datetime"] > update_time:
                update_time = doc["datetime"]
                title = doc["title"]
        mongo.insertDoc({"no": index, "component": key, "title": title, "size": len(item),
                         "links": links, "titles": titles, "update_time": update_time,
                         "docs": item}, cli, "keepindoors", "components")
        index += 1
def get_connected_components(vertices_path, edges_path, checkpoint_dir, num_reads):
    # Read vertices and edges files
    df_vertices = build_vertices(vertices_path)
    df_edges = build_edges(edges_path, num_reads)

    # Build the graph
    spark = SparkSession.builder.appName("build_graph").getOrCreate()
    vertices = spark.createDataFrame(df_vertices)
    edges = spark.createDataFrame(df_edges)
    g = GraphFrame(vertices, edges)

    # Display the graph
    g.vertices.show()
    g.edges.show()

    # Connected components require a checkpoint directory;
    # get the SparkContext via spark.sparkContext
    spark.sparkContext.setCheckpointDir(dirName=checkpoint_dir)
    result = g.connectedComponents()

    # Group vertex ids by component id
    dictionary = {}
    sorted_result = result.select("id", "component").orderBy("component", ascending=False)
    for row in sorted_result.collect():
        if row[1] in dictionary:
            dictionary[row[1]].append(row[0])
        else:
            dictionary[row[1]] = [row[0]]

    GL = []
    for _, value in dictionary.items():
        GL.append(value)
    return GL, spark, g
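# --- usage sketch (not part of the original file) ---
# The paths, checkpoint directory, and num_reads value below are hypothetical
# placeholders; build_vertices()/build_edges() are assumed to be defined as above.
if __name__ == "__main__":
    components, spark, graph = get_connected_components(
        "data/vertices.csv", "data/edges.csv", "/tmp/spark_checkpoint", num_reads=2)
    for group in components:
        print(group)  # list of vertex ids belonging to one connected component
    spark.stop()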
import os
# the trailing "pyspark-shell" token is required when setting PYSPARK_SUBMIT_ARGS
# from a script rather than launching via spark-submit
os.environ['PYSPARK_SUBMIT_ARGS'] = \
    '--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 pyspark-shell'

import sys
from functools import reduce
from pyspark.sql.functions import col, lit, when
import pyspark
from pyspark.sql import SQLContext
from graphframes.examples import Graphs
from graphframes import GraphFrame
import config

sc = pyspark.SparkContext()
sc.setCheckpointDir('/tmp')
sqlContext = SQLContext(sc)

inputFile = sys.argv[1]

# g = Graphs(sqlContext).friends()  # Get example graph

df = sqlContext.read.format("csv").option("delimiter", config.delimiter).load(inputFile)
# Rename columns to something decent.
df = df.withColumnRenamed("_c0", "src")\
       .withColumnRenamed("_c1", "dst")\
       .withColumnRenamed("_c2", "weight")
df.show(5)

# Use every id that appears as src or dst as a vertex
aggcodes = df.select("src", "dst").rdd.flatMap(lambda x: x).distinct()
vertices = aggcodes.map(lambda x: (x, x)).toDF(["id", "name"])
edges = df.select("src", "dst")

graph = GraphFrame(vertices, edges)
result = graph.connectedComponents()
result.select("id", "component").orderBy("component").show()
# COMMAND ----------

stationGraph.bfs(fromExpr="id = 'Townsend at 7th'",
                 toExpr="id = 'Spear at Folsom'", maxPathLength=2).show(10)

# COMMAND ----------

# connectedComponents() requires a checkpoint directory
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")

# COMMAND ----------

# work on a 10% sample of the trip edges to keep the job small
minGraph = GraphFrame(stationVertices, tripEdges.sample(False, 0.1))
cc = minGraph.connectedComponents()

# COMMAND ----------

cc.where("component != 0").show()

# COMMAND ----------

scc = minGraph.stronglyConnectedComponents(maxIter=3)

# COMMAND ----------
# of each vertex and returns a graph with each
# vertex assigned a component ID.
# NOTE: With GraphFrames 0.3.0 and later releases,
# the default connected-components algorithm requires
# setting a Spark checkpoint directory. Users can
# revert to the old algorithm using
# connectedComponents.setAlgorithm("graphx").

#=====================================
# setting a Spark "checkpoint" directory
#=====================================
# What is checkpointing? Checkpointing is the process
# of truncating an RDD lineage graph and saving it to a
# reliable distributed (HDFS) or local file system.
#
# You call SparkContext.setCheckpointDir(directory: String)
# to set the checkpoint directory - the directory where RDDs
# are checkpointed.
#
spark.sparkContext.setCheckpointDir("/tmp/spark_check_point_dir")

#==========================================
# apply the connectedComponents() algorithm
#==========================================
#
connected_components = graph.connectedComponents()
connected_components.select("id", "component").orderBy("component").show()

# done!
spark.stop()
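# --- standalone sketch (not part of the script above) ---
# The setAlgorithm("graphx") call mentioned in the note is the Scala builder API.
# In PySpark, GraphFrames 0.3+ exposes the same switch as a keyword argument on
# connectedComponents(); a minimal sketch, assuming such a release is installed:
cc_graphx = graph.connectedComponents(algorithm="graphx")
cc_graphx.select("id", "component").orderBy("component").show()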
# filename = '/home/user/leaflet-spark/atom_position_frame_1.npz.npy'
coord_matrix = np.load(filename)
coord_matrix_broadcast = sc.broadcast(coord_matrix)
matrix_size = len(coord_matrix)

dist_Matrix = sc.parallelize(coord_matrix)
dist_Matrix = dist_Matrix.zipWithIndex()  # key-value pairs
edge_list = dist_Matrix.flatMap(find_edges)
edge_list = edge_list.filter(lambda x: x[0] != -1)  # filter out the -1 values

sqlContext = SQLContext(sc)
Edges = Row('src', 'dst')
edge = edge_list.map(lambda x: Edges(*x))
e = sqlContext.createDataFrame(edge)
# e.take(10)
v = sqlContext.createDataFrame(sc.parallelize(range(matrix_size)).map(lambda i: Row(id=i + 1)))
# v.show()

# create the graph
g = GraphFrame(v, e)
# g.vertices.show()
# g.edges.show()
total_time = time() - start_time

cc = g.connectedComponents()
cc.select("id", "component").orderBy("component").show()
print('Total time to create the Graphframe: %i sec' % total_time)
print('Time to calculate the connected components: %i sec' % (time() - total_time - start_time))
chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")
# chain4.show()
# g.find("(c)-[m]->()").show()

# Query on sequence, with state (cnt)
# (a) Define method for updating state given the next element of the motif.
sumFriends = \
    lambda cnt, relationship: F.when(relationship == "friend", cnt + 1).otherwise(cnt)
# (b) Use sequence operation to apply method to sequence of elements in motif.
#     In this case, the elements are the 3 edges.
condition = \
    reduce(lambda cnt, e: sumFriends(cnt, F.col(e).relationship), ["ab", "bc", "cd"], F.lit(0))
# (c) Apply filter to DataFrame.
chainWith2Friends2 = chain4.where(condition >= 2)
# chainWith2Friends2.show()

result = g.connectedComponents()
# explanation of what the result means
# result.show()

# difference between a strongly connected graph and a connected graph
'''
Connected is usually associated with undirected graphs (two way edges):
there is a path between every two nodes.
Strongly connected is usually associated with directed graphs (one way edges):
there is a route between every two nodes.
Complete graphs are undirected graphs where there is an edge between
every pair of nodes.
'''
result = g.stronglyConnectedComponents(maxIter=10)
# result.orderBy("component").show()

# community detection via label propagation; essentially a clustering algorithm
result = g.labelPropagation(maxIter=5)
# result.show()
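# --- standalone sketch to make the connected vs. strongly connected distinction
# concrete (the toy data and the `spark` session are assumptions, not part of the
# snippet above) ---
from graphframes import GraphFrame

# toy directed graph: a -> b -> c, plus c -> b, so only {b, c} lie on a cycle
v = spark.createDataFrame([("a",), ("b",), ("c",)], ["id"])
e = spark.createDataFrame([("a", "b"), ("b", "c"), ("c", "b")], ["src", "dst"])
toy = GraphFrame(v, e)

spark.sparkContext.setCheckpointDir("/tmp/spark_checkpoint")
# edges are treated as undirected, so a, b, and c all land in one component
toy.connectedComponents().show()
# direction matters here: {b, c} form one SCC, while a is its own SCC
toy.stronglyConnectedComponents(maxIter=10).show()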