from pyspark.sql import functions as F
from pyspark.sql.functions import lit, greatest
from graphframes import GraphFrame
from graphframes.lib import AggregateMessages as AM


def algorithm2(i, g):
    # Iteratively update the vertex "value" column with aggregateMessages
    # until no edge remains between unlabelled (value == -1) vertices.
    while True:
        # every unlabelled source vertex sends its id to its destination
        aggregates = g.aggregateMessages(
            F.collect_set(AM.msg).alias("agg"),
            sendToDst=F.when(AM.src["value"] == -1, AM.src["id"]))
        new_vertices = (g.vertices
                        .join(aggregates, on="id", how="left_outer")
                        .withColumn("newValue",
                                    getid_maximum_udf2("id", "agg", lit(i), "value"))
                        .drop("agg")
                        .withColumn("max_by_rows", greatest("value", "newValue"))
                        .drop("value", "newValue")
                        .withColumnRenamed("max_by_rows", "value"))
        # cache through AggregateMessages to break the lineage between iterations
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        # stop when the subgraph of unlabelled vertices has no edges left
        if (g.filterVertices("value == -1")
                .dropIsolatedVertices().edges.count() == 0):
            final_df = g.vertices
            final_df = final_df.withColumn(
                "value",
                F.when(final_df["value"] == -1, i).otherwise(final_df["value"]))
            break
    return final_df
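# --- Hedged sketch, not from the original source ---
# algorithm2 relies on a UDF named getid_maximum_udf2 that is not shown in this
# excerpt. A minimal, assumed implementation is sketched below: a still
# unlabelled vertex (value == -1) adopts the current iteration number i when its
# id is larger than every id received from its unlabelled neighbours, otherwise
# it stays at -1. The behaviour of the original UDF may differ.
from pyspark.sql import functions as F, types


def getid_maximum2(id, agg, i, value):
    if value != -1:       # already labelled: keep the existing value
        return value
    if agg is None:       # no message received in this round
        return -1
    if all(int(m) < int(id) for m in agg):
        return i          # local maximum among unlabelled neighbours
    return -1


getid_maximum_udf2 = F.udf(getid_maximum2, types.IntegerType())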
import gc
import graphframes


def do_triangles(conf: Conf, g: graphframes.GraphFrame, s: Stepper,
                 vertices_count: int) -> None:
    """
    Pattern for batch-oriented iteration:
    - we split the graph into batches using the filterVertices mechanism
    - we keep track of the total count of triangles and the partial count per batch
    - in case of error:
      * we double the number of batches and the batch number
      * we restart the iteration at this point with a smaller subgraph
    """
    full_set = vertices_count
    batches = conf.batches_for_triangles
    total_triangles = conf.count_at_restart
    batch = conf.batch_at_restart
    subset = int(full_set / batches)

    while batch < batches:
        st = Stepper()
        count = 0
        try:
            print("try batches=", batches, "subset=", subset, "at batch=", batch)
            gc.collect()
            # g1 = g.filterVertices("int(cell/{}) == {}".format(subset, batch))
            g1 = g.filterVertices("int(id/{}) == {}".format(subset, batch))
            triangles = g1.triangleCount()
            st.show_step("partial triangleCount")
            gc.collect()
            # triangleCount() adds a "count" column holding the number of
            # triangles passing through each vertex
            count = triangles.agg({"count": "sum"}).toPandas()["sum(count)"][0]
            st.show_step("partial triangleCount sum")
            total_triangles += count
            print("batch=", batch,
                  "vertices=", g1.vertices.count(),
                  "edges=", g1.edges.count(),
                  "total=", total_triangles,
                  "partial=", count)
        except Exception:
            print("memory error")
            batches *= 2
            batch *= 2
            subset = int(full_set / batches)
            print("restarting with batches=", batches,
                  "subset=", subset, "at batch=", batch)
            if subset >= 1:
                continue
            break
        batch += 1

    s.show_step("triangleCount")
    print("total=", total_triangles)
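# --- Hedged sketch, assumptions rather than the original helpers ---
# Conf and Stepper are project-specific helpers that are not defined in this
# excerpt. The minimal stand-ins below only illustrate the attributes and
# methods that do_triangles uses; the names, defaults and timing behaviour are
# assumptions.
import time


class Conf:
    batches_for_triangles = 8   # assumed initial number of batches
    count_at_restart = 0        # triangles already counted before a restart
    batch_at_restart = 0        # batch index to resume from


class Stepper:
    """Tiny timer that prints the elapsed time of each step."""
    def __init__(self):
        self.t0 = time.time()

    def show_step(self, label):
        now = time.time()
        print("{}: {:.1f} s".format(label, now - self.t0))
        self.t0 = now


# Assumed usage, with g an existing GraphFrame whose vertex ids are dense
# integers (so that "int(id/subset) == batch" selects one batch of vertices):
# do_triangles(Conf(), g, Stepper(), g.vertices.count())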
# Find the youngest user's age in the graph.
g.vertices.groupBy().min("age").show()

# Count the number of "follow" relationships in the graph.
numFollows = g.edges.filter("relationship = 'follow'").count()

# Motif finding: chains a -> b -> c with distinct endpoints,
# then pairs of vertices connected by edges in both directions.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(c)").filter("a.id != c.id")
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()

# More complex queries on the motif result
motifs.filter("b.age > 30").show()

print("\ngenerate subgraph --- ")
g1 = (g.filterVertices("age > 30")
       .filterEdges("relationship = 'friend'")
       .dropIsolatedVertices())
g1.vertices.show()
g1.edges.show()

# Breadth-first search (BFS)
print("\n BFS")
paths = g.bfs(
    "name = 'Esther'",
    "age < 32",
    edgeFilter="relationship != 'friend'",
    maxPathLength=3,
)
paths.show()

# In-degree and out-degree metrics
print("\n Degree--------------")
inDeg = g.inDegrees
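# --- Hedged sketch, illustrative data rather than the original dataset ---
# The queries above assume a GraphFrame g whose vertices carry "id", "name" and
# "age" columns and whose edges carry "src", "dst" and "relationship" columns,
# as in the small friends/follows example of the GraphFrames documentation.
# One way such a graph could be built, given an existing SparkSession "spark":
from graphframes import GraphFrame

v = spark.createDataFrame([
    ("a", "Alice", 34), ("b", "Bob", 36), ("c", "Charlie", 30),
    ("d", "David", 29), ("e", "Esther", 32), ("f", "Fanny", 36),
], ["id", "name", "age"])

e = spark.createDataFrame([
    ("a", "b", "friend"), ("b", "c", "follow"), ("c", "b", "follow"),
    ("f", "c", "follow"), ("e", "f", "follow"), ("e", "d", "friend"),
    ("d", "a", "friend"),
], ["src", "dst", "relationship"])

g = GraphFrame(v, e)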
# body of the check_max UDF (the function header below is reconstructed from
# the fragment; the argument order is an assumption)
def check_max(id, msg_ids, superstep):
    tmp = []
    for i in range(len(msg_ids)):
        tmp.append(int(msg_ids[i]))
    if max(tmp) < int(id):
        return superstep
    else:
        return -1


check_max_udf = F.udf(check_max, types.IntegerType())

superstep = 1
while True:
    sub_g = g.filterVertices("color == -1")
    # only one vertex left uncolored
    if sub_g.vertices.count() == 1:
        # color the remaining vertex
        res = sub_g.vertices.withColumn("newColor", lit(superstep)).drop("color")
        new_vertices = g.vertices.join(res, on="id", how="left_outer") \
            .withColumnRenamed("color", "oldColor") \
            .withColumn("color", get_max_udf(F.col("oldColor"), F.col("newColor"))) \
            .drop("oldColor").drop("newColor")
        print("------------ Graph Coloring Solution ------------")
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
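# --- Hedged sketch, not from the original source ---
# The fragment above stops before the end of the coloring loop and does not show
# the definition of get_max_udf. A minimal assumed version is sketched below: it
# merges the old and the newly proposed color by taking the larger of the two,
# treating missing values as -1 (uncolored).
from pyspark.sql import functions as F, types


def get_max(old_color, new_color):
    old_color = -1 if old_color is None else old_color
    new_color = -1 if new_color is None else new_color
    return max(old_color, new_color)


get_max_udf = F.udf(get_max, types.IntegerType())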