def algorithm2(i, g):
    """Iteratively assign a value to every vertex whose value is still -1.

    Each pass:

    1. Every edge whose source vertex has ``value == -1`` sends the source
       id to its destination; the ids received by a vertex are collected
       into a set.
    2. ``getid_maximum_udf2`` is evaluated per vertex on its id, the
       collected ids, the current counter ``i`` and its current value.
    3. The vertex value becomes the greatest of the old value and the
       UDF result.

    The loop stops once the subgraph of vertices still at -1 has no edges
    left (after dropping isolated vertices). Any vertex still at -1 then
    receives the current counter value.

    Args:
        i: Counter passed to the UDF as a literal; incremented every pass.
        g: Graph exposing ``vertices``, ``edges``, ``aggregateMessages``
            and ``filterVertices`` (presumably a GraphFrame whose vertices
            carry ``id`` and ``value`` columns).

    Returns:
        The vertices DataFrame of the final graph, with every remaining
        -1 in ``value`` replaced by the final counter.
    """
    while True:
        # Message phase: unassigned sources announce their id downstream.
        collected_ids = F.collect_set(AM.msg).alias("agg")
        announce_id = F.when(AM.src['value'] == -1, AM.src["id"])
        aggregates = g.aggregateMessages(collected_ids, sendToDst=announce_id)

        # Update phase, one DataFrame transformation per statement.
        joined = g.vertices.join(aggregates, on="id", how="left_outer")
        proposed = joined.withColumn(
            "newValue", getid_maximum_udf2("id", "agg", lit(i), "value"))
        proposed = proposed.drop("agg")
        merged = proposed.withColumn('max_by_rows',
                                     greatest('value', 'newValue'))
        trimmed = merged.drop("value", "newValue")
        new_vertices = trimmed.withColumnRenamed("max_by_rows", "value")

        # Rebuild the graph on a cached copy of the updated vertices.
        g = GraphFrame(AM.getCachedDataFrame(new_vertices), g.edges)
        i += 1
        g.vertices.show()

        unassigned = g.filterVertices("value == -1").dropIsolatedVertices()
        if unassigned.edges.count() != 0:
            continue

        # No edge joins two unassigned vertices any more: give the
        # leftovers the current counter value and finish.
        vertices = g.vertices
        still_unassigned = vertices["value"] == -1
        return vertices.withColumn(
            "value",
            F.when(still_unassigned, i).otherwise(vertices["value"]))
# Example #2
0
def do_triangles(conf: Conf, g: graphframes.GraphFrame, s: Stepper,
                 vertices_count: int) -> None:
    """
    Count triangles batch by batch, shrinking the batches on failure.

    Pattern for batch oriented iteration

    - we split the graph into batches using the filterVertices mechanism
    - we mark the total count of triangles and the partial count
    - in case of error:
       * we double the number of batches and the batch number
       * we restart the iteration at this point with smaller subgraph

    Args:
        conf: Supplies ``batches_for_triangles`` (initial number of
            batches), ``count_at_restart`` (initial running total) and
            ``batch_at_restart`` (first batch index to process).
        g: Graph to analyse; batches are selected with
            ``int(id / subset) == batch``, so vertex ids are presumably
            numeric.
        s: Stepper used to report the overall "triangleCount" step.
        vertices_count: Number of vertices, used to derive the batch size.

    Returns:
        None. Progress and the final total are printed.
    """

    full_set = vertices_count
    batches = conf.batches_for_triangles
    total_triangles = conf.count_at_restart
    batch = conf.batch_at_restart
    subset = int(full_set / batches)

    while batch < batches:
        st = Stepper()
        try:
            print("try batches=", batches, "subset=", subset, "at batch=",
                  batch)
            gc.collect()
            # Batches are selected on the vertex id.
            g1 = g.filterVertices("int(id/{}) == {}".format(subset, batch))
            triangles = g1.triangleCount()
            st.show_step("partial triangleCount")
            gc.collect()
            # NOTE(review): this sums the "cell" column of the result, not
            # a triangle-count column -- confirm that this is intended.
            count = triangles.agg({"cell": "sum"}).toPandas()["sum(cell)"][0]
            st.show_step("partial triangleCount sum")

            total_triangles += count

            print("batch=", batch, "vertices=", g1.vertices.count(), "edges=",
                  g1.edges.count(), "total=", total_triangles, "partial",
                  count)
        except Exception as exc:
            # Any failure of the batch is treated as memory pressure and
            # retried with batches half the size. Only Exception is caught,
            # so Ctrl-C and interpreter exit are no longer swallowed, and
            # the actual cause is printed for diagnosis.
            print("memory error", repr(exc))
            batches *= 2
            batch *= 2
            subset = int(full_set / batches)
            print("restarting with batches=", batches, "subset=", subset,
                  "at batch=", batch)
            if subset >= 1:
                continue
            # Batches cannot shrink below one vertex: give up.
            break

        batch += 1

    s.show_step("triangleCount")
    print("total=", total_triangles)
# Find the youngest user's age in the graph.
g.vertices.groupBy().min("age").show()

# Count the number of "follows" in the graph.
numFollows = g.edges.filter("relationship = 'follow'").count()

# Motif finding: two-hop chains a -> b -> c with distinct end points.
# NOTE(review): this result is replaced just below without being shown.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(c)").filter("a.id != c.id")

# Motif finding: pairs of vertices connected in both directions.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()
# More complex queries
motifs.filter("b.age > 30").show()

print("\ngenerate subgraph --- ")
g1 = (g.filterVertices("age > 30").filterEdges(
    "relationship = 'friend'").dropIsolatedVertices())
g1.vertices.show()
g1.edges.show()

# Breadth-first search (BFS)
print("\n BFS")
# Keep the BFS result itself: show() returns None, so chaining it onto the
# assignment left `paths` bound to None.
paths = g.bfs(
    "name = 'Esther'",
    "age < 32",
    edgeFilter="relationship != 'friend'",
    maxPathLength=3,
)
paths.show()

# In-Degree and Out-Degree Metrics
print("\n Degree--------------")
inDeg = g.inDegrees
# Example #4
0
    tmp = []
    for i in range(len(msg_ids)):
        tmp.append(int(msg_ids[i]))

    if max(tmp) < int(id):
        return superstep
    else:
        return -1


# Register check_max as a UDF with an integer return type.
check_max_udf = F.udf(check_max, types.IntegerType())

# Superstep counter, starting at 1.
superstep = 1

# Main loop; no exit condition is visible in this part of the file.
while (1):
    # Subgraph restricted to the vertices whose color is still -1.
    sub_g = g.filterVertices("color == -1")

    # Only one vertex is left uncolored.
    if (sub_g.vertices.count() == 1):
        # Give that vertex the current superstep as its "newColor".
        res = sub_g.vertices.withColumn("newColor",
                                        lit(superstep)).drop("color")

        # Color the remaining vertex: join the new color onto all vertices
        # and combine old and new colors with get_max_udf (defined
        # elsewhere; presumably returns the larger of the two -- confirm).
        new_vertices = g.vertices.join(res, on="id", how="left_outer") \
                    .withColumnRenamed("color", "oldColor") \
                    .withColumn("color", get_max_udf(F.col("oldColor"), F.col("newColor"))) \
                    .drop("oldColor").drop("newColor")

        print("------------ Graph Coloring Solution ------------")
        # Rebuild the graph on a cached copy of the updated vertices.
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)