Example No. 1
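# These snippets assume the standard GraphFrames/PySpark setup. The imports
# below are not part of the originals but are what the code throughout uses
# (including the sqlfunctions/sqlsum aliases in the later examples):
from pyspark.sql import functions as F
from pyspark.sql import functions as sqlfunctions
from pyspark.sql import types
from pyspark.sql.functions import sum as sqlsum
from graphframes import GraphFrame
from graphframes.lib import AggregateMessages as AM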
def min_rating_col(v, e, max_iterations=4):
    """
    :param v: vertices
    :param e: edges
    :param max_iterations: Iterative graph computations
    :return: print df
    """
    def new_rating(rating, id):
        return {"id": id, "rating": rating}

    player_type = types.StructType([
        types.StructField("id", types.StringType()),
        types.StructField("rating", types.IntegerType()),
    ])
    new_rating_udf = F.udf(new_rating, player_type)
    v = v.withColumn("minRating", new_rating_udf(v['rating'], v["id"]))
    cached_vertices = AM.getCachedDataFrame(v)

    g = GraphFrame(cached_vertices, e)
    g.vertices.show()
    g.edges.show()

    def min_rating(ratings):
        min_rating = -1
        min_rating_id = -1
        for rating in ratings:
            if min_rating == -1 or (rating.rating < min_rating):
                min_rating = rating.rating
                min_rating_id = rating.id
        return {"id": min_rating_id, "rating": min_rating}

    min_rating_udf = F.udf(min_rating, player_type)

    def compare_rating(old_rating, new_rating):
        # Vertices that received no message get a null newMinRating from the
        # left outer join; keep the old value in that case.
        if new_rating is None:
            return old_rating
        return old_rating if old_rating.rating < new_rating.rating else new_rating

    compare_rating_udf = F.udf(compare_rating, player_type)

    # Iterative graph computations

    for _ in range(max_iterations):
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=AM.src["minRating"])
        res = aggregates.withColumn("newMinRating",
                                    min_rating_udf("agg")).drop("agg")
        new_vertices = g.vertices.join(res, on="id", how="left_outer")\
            .withColumnRenamed("minRating", "oldMinRating")\
            .withColumn("minRating", compare_rating_udf(F.col("oldMinRating"), F.col("newMinRating")))\
            .drop("oldMinRating").drop("newMinRating")
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        g.vertices.show()
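# A minimal usage sketch for the function above (hypothetical data; assumes a
# running SparkSession named `spark`):
v = spark.createDataFrame([("a", 5), ("b", 3), ("c", 7)], ["id", "rating"])
e = spark.createDataFrame([("a", "b"), ("b", "c")], ["src", "dst"])
min_rating_col(v, e, max_iterations=2)  # shows vertices after each round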
Example No. 2
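# This snippet calls an add_path_udf helper that the excerpt does not define
# (and relies on the global `spark` and `sc` handles). The definition below
# matches the one given in Example No. 13:
from pyspark.sql.types import ArrayType, StringType
add_path_udf = F.udf(lambda paths, id: paths + [id], ArrayType(StringType()))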
def shortest_path(g, origin, destination, column_name="cost"):
    if g.vertices.filter(g.vertices.id == destination).count() == 0:
        return (spark.createDataFrame(sc.emptyRDD(), g.vertices.schema)
                .withColumn("path", F.array()))

    vertices = (g.vertices.withColumn("visited", F.lit(False))
                .withColumn("distance", F.when(g.vertices["id"] == origin, 0)
                            .otherwise(float("inf")))
                .withColumn("path", F.array()))
    cached_vertices = AM.getCachedDataFrame(vertices)
    g2 = GraphFrame(cached_vertices, g.edges)

    while g2.vertices.filter('visited == False').first():
        current_node_id = g2.vertices.filter('visited == False').sort("distance").first().id

        msg_distance = AM.edge[column_name] + AM.src['distance']
        msg_path = add_path_udf(AM.src["path"], AM.src["id"])
        msg_for_dst = F.when(AM.src['id'] == current_node_id, F.struct(msg_distance, msg_path))
        new_distances = g2.aggregateMessages(F.min(AM.msg).alias("aggMess"),
                                             sendToDst=msg_for_dst)

        new_visited_col = F.when(
            g2.vertices.visited | (g2.vertices.id == current_node_id), True).otherwise(False)
        new_distance_col = F.when(new_distances["aggMess"].isNotNull() &
                                  (new_distances.aggMess["col1"] < g2.vertices.distance),
                                  new_distances.aggMess["col1"]) \
            .otherwise(g2.vertices.distance)
        new_path_col = F.when(new_distances["aggMess"].isNotNull() &
                              (new_distances.aggMess["col1"] < g2.vertices.distance),
                              new_distances.aggMess["col2"].cast("array<string>")) \
            .otherwise(g2.vertices.path)

        new_vertices = (g2.vertices.join(new_distances, on="id", how="left_outer")
                        .drop(new_distances["id"])
                        .withColumn("visited", new_visited_col)
                        .withColumn("newDistance", new_distance_col)
                        .withColumn("newPath", new_path_col)
                        .drop("aggMess", "distance", "path")
                        .withColumnRenamed('newDistance', 'distance')
                        .withColumnRenamed('newPath', 'path'))
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g2 = GraphFrame(cached_new_vertices, g2.edges)
        if g2.vertices.filter(g2.vertices.id == destination).first().visited:
            return (g2.vertices.filter(g2.vertices.id == destination)
                    .withColumn("newPath", add_path_udf("path", "id"))
                    .drop("visited", "path")
                    .withColumnRenamed("newPath", "path"))
    return (spark.createDataFrame(sc.emptyRDD(), g.vertices.schema)
            .withColumn("path", F.array()))
Example No. 3
# Note: this snippet also depends on lit/greatest and on a getid_maximum_udf2
# UDF whose definition is not part of the excerpt; only the imports are certain.
from pyspark.sql.functions import lit, greatest

def algorithm1(i, g):
    while True:
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = (g.vertices
                        .join(aggregates, on="id", how="left_outer")
                        .withColumn("newValue",
                                    getid_maximum_udf2("id", "agg", lit(i), "value"))
                        .drop("agg")
                        .withColumn("max_by_rows", greatest("value", "newValue"))
                        .drop("value", "newValue")
                        .withColumnRenamed("max_by_rows", "value"))
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        g.vertices.createOrReplaceTempView("temp_table")
        if spark.sql("SELECT * FROM temp_table WHERE value = -1").count() == 0:
            final_df = g.vertices
            break
    return final_df
Example No. 4
def algorithm2(i, g):
    while True:
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = (g.vertices
                        .join(aggregates, on="id", how="left_outer")
                        .withColumn("newValue",
                                    getid_maximum_udf2("id", "agg", lit(i), "value"))
                        .drop("agg")
                        .withColumn("max_by_rows", greatest("value", "newValue"))
                        .drop("value", "newValue")
                        .withColumnRenamed("max_by_rows", "value"))
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        if g.filterVertices("value == -1").dropIsolatedVertices().edges.count() == 0:
            final_df = g.vertices
            final_df = final_df.withColumn(
                "value",
                F.when(final_df["value"] == -1, i).otherwise(final_df["value"]))
            break
    return final_df
Example No. 5
from pyspark.sql.functions import monotonically_increasing_id

def convert2undirect(g):
    # Mirror every edge (swap src and dst), then re-attach the remaining edge
    # attributes by joining on a per-row index. Note that joining two
    # DataFrames on monotonically_increasing_id() only lines up because both
    # derive from g.edges with the same partitioning.
    mirror = g.edges.select(F.col("dst").alias("src"), F.col("src").alias("dst"))\
        .withColumn("_id", monotonically_increasing_id())

    cached_mirror = AM.getCachedDataFrame(
        mirror.join(
            g.edges.drop("src").drop("dst").withColumn(
                "_id", monotonically_increasing_id()), "_id",
            "outer").drop("_id"))

    g2 = GraphFrame(g.vertices, cached_mirror)
    cached_edges = AM.getCachedDataFrame(g.edges.union(g2.edges))

    g = GraphFrame(g.vertices, cached_edges)
    return g
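# Quick sanity check (sketch; assumes a SparkSession named `spark`): after the
# conversion each directed edge has a mirrored twin carrying the same attributes.
v = spark.createDataFrame([("a",), ("b",)], ["id"])
e = spark.createDataFrame([("a", "b", 1.0)], ["src", "dst", "weight"])
convert2undirect(GraphFrame(v, e)).edges.show()  # a->b and b->a, weight 1.0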
Example No. 7
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)), colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(
                    AM.src['color'] == color,
                    AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(
                    AM.dst['color'] == color,
                    AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])
                ).otherwise(v['belief'])  # keep old beliefs for other colors
                newVertices = (v
                    .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                    .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief')
                )
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
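# The snippet above calls cls._sigmoid, described as a numerically stable
# sigmoid. A sketch of such a helper (assumed; its body is not shown here):
import math

def _sigmoid(x):
    # Compute 1 / (1 + exp(-x)) without letting exp() overflow for large |x|.
    if x >= 0:
        z = math.exp(-x)
        return 1 / (1 + z)
    else:
        z = math.exp(x)
        return z / (1 + z)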
Example No. 8
    def spread_activation_step(self, graph, attribute, spreading_factor,
                               transfer_function):
        """
        One step in the spread activation model.
        :param graph: graphframe object, network
        :param attribute: str, name of attribute/influence
        :param spreading_factor: 0 - 1, amount of influence to spread
        :param transfer_function: weighted or unweighted, how to transfer influence along edges
        :return: graphframe object, new network with updated new calculation of attribute in vertices
        """

        # Pass influence/message to neighboring nodes (weighted/unweighted option)
        if transfer_function == "unweighted":
            msgToSrc = (AM.src[attribute] /
                        AM.src["outDegree"]) * (1 - spreading_factor)
            msgToDst = sqlfunctions.when(
                AM.dst["outDegree"] != 0,
                ((AM.src[attribute] / AM.src["outDegree"]) * (spreading_factor)
                 )).otherwise(((1 / AM.dst["inDegree"]) * AM.dst[attribute]) +
                              ((AM.src[attribute] / AM.src["outDegree"]) *
                               (spreading_factor)))
        if transfer_function == "weighted":
            weight = AM.edge["weight"] / AM.src["w_outDegree"]
            msgToSrc = (AM.src[attribute] /
                        AM.src["outDegree"]) * (1 - spreading_factor)
            msgToDst = sqlfunctions.when(
                AM.dst["outDegree"] != 0,
                ((AM.src[attribute]) *
                 (spreading_factor * weight))).otherwise((
                     (1 / AM.dst["inDegree"]) * AM.dst[attribute]) + (
                         (AM.src[attribute]) * (spreading_factor * weight)))

        # Aggregate messages
        agg = graph.aggregateMessages(sqlsum(AM.msg).alias(attribute),
                                      sendToSrc=msgToSrc,
                                      sendToDst=msgToDst)

        # Create a new cached copy of the dataFrame to get new calculated attribute
        cachedNewVertices = AM.getCachedDataFrame(agg)
        tojoin = graph.vertices.select("id", "inDegree", "outDegree",
                                       "w_inDegree", "w_outDegree")
        new_cachedNewVertices = cachedNewVertices.join(tojoin, "id",
                                                       "left_outer")
        new_cachedNewVertices = new_cachedNewVertices.na.fill(0)

        # Return graph with new calculated attribute
        return GraphFrame(new_cachedNewVertices, graph.edges)
Example No. 9
# The loop below uses col, collect_list, concat and an initial msgToDst that
# the excerpt never defines. The imports are standard; the first message (and
# the pre-initialized "parents" array column on g.vertices) are assumptions
# based on how each pass ends:
from pyspark.sql.functions import col, collect_list, concat
msgToDst = AM.src["id"]  # first round: every vertex hears its immediate parent

for i in range(6):
    # AM.msg contains the next message i.e. next parent in our case
    agg = g.aggregateMessages(collect_list(AM.msg).alias("tmpParent"),
                              sendToDst=msgToDst)

    # Append this message to the parents array column of vertices and also keep it as a standalone column for next iteration
    currentV = g.vertices
    newV = currentV.join(agg, "id", how = "left") \
      .drop(agg["id"]) \
      .withColumn("parents", concat(agg["tmpParent"], currentV["parents"])) \
      .withColumn("lastParent", col("tmpParent")[0]) \
      .drop("tmpParent")

    # Caching the intermediate vertices DataFrame matters here; without it the Spark job takes a very long time to complete
    cachedNewV = AM.getCachedDataFrame(newV)
    g = GraphFrame(cachedNewV, g.edges)

    # Pass the standalone column, i.e. the most recent parent, to the next iteration
    msgToDst = AM.src["lastParent"]

g = GraphFrame(g.vertices.drop("lastParent"), g.edges)
display(g.vertices)

# COMMAND ----------

# Turn each vertex's parents array into a map keyed "L1", "L2", ... by level
dfParents = g.vertices.selectExpr(
    "id", "name",
    "map_from_arrays(transform(sequence(1, size(parents)), x -> concat('L', x)), parents) AS parents"
)
Example No. 10
            dispersion += 1

        dispersion_list.append({
            "id": common_neighbour['id'],
            "dispersion": dispersion
        })

    maximum = max(dispersion_list, key=lambda x: x['dispersion'])
    return maximum


calculate_dispersion_type = types.StructType([
    types.StructField("id", types.StringType()),
    types.StructField("dispersion", types.IntegerType())
])
calculate_dispersion_udf = F.udf(calculate_dispersion,
                                 calculate_dispersion_type)

dispersion = common.withColumn(
    "dispersion", calculate_dispersion_udf(
        common["common_neighbours"])).drop("common_neighbours")
dispersion.show()

# final graph with dispersion on edges
print("Final Graph:")
dispersion = dispersion.withColumnRenamed("node", "src")
cached_vertices = AM.getCachedDataFrame(dispersion)
graph = GraphFrame(cached_vertices, graph.edges)
graph.vertices.show(maxPrintSize)
graph.edges.show()
Example No. 11
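# The opening of this snippet was lost in extraction. Judging from the call
# merge_paths_udf("ids", "newIds", "id") and the dicts returned below, the
# missing pieces plausibly looked like this (a reconstruction, not original;
# new_paths_udf and flatten_udf are further helpers that are not shown):
from operator import itemgetter
from pyspark.sql.types import (ArrayType, DoubleType, IntegerType,
                               StringType, StructField, StructType)

paths_type = ArrayType(StructType([
    StructField("id", StringType()),
    StructField("distance", IntegerType())]))

def merge_paths(ids, new_ids, id):
    joined_ids = ids + (new_ids if new_ids else [])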
    merged_ids = [(col1, col2) for col1, col2 in joined_ids if col1 != id]
    best_ids = dict(sorted(merged_ids, key=itemgetter(1), reverse=True))
    return [{"id": col1, "distance": col2} for col1, col2 in best_ids.items()]

merge_paths_udf = F.udf(merge_paths, paths_type)

def calculate_closeness(ids):
    nodes = len(ids)
    total_distance = sum([col2 for col1, col2 in ids])
    return 0 if total_distance == 0 else nodes * 1.0 / total_distance

closeness_udf = F.udf(calculate_closeness, DoubleType())

vertices = g.vertices.withColumn("ids", F.array())

cached_vertices = AM.getCachedDataFrame(vertices)

g2 = GraphFrame(cached_vertices, g.edges)

print(g2.vertices.count())

for i in range(0, g2.vertices.count()):
    msg_dst = new_paths_udf(AM.src["ids"], AM.src["id"])
    msg_src = new_paths_udf(AM.dst["ids"], AM.dst["id"])
    agg = g2.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                               sendToSrc=msg_src, sendToDst=msg_dst)
    res = agg.withColumn("newIds", flatten_udf("agg")).drop("agg")
    new_vertices = (g2.vertices.join(res, on="id", how="left_outer")
                    .withColumn("mergedIds",
                                merge_paths_udf("ids", "newIds", "id"))
                    .drop("ids", "newIds")
                    .withColumnRenamed("mergedIds", "ids"))
Example No. 12
    # (This snippet additionally relies on: from collections import Counter,
    #  import numpy as np, and from pyspark.sql.functions import pandas_udf,
    #  PandasUDFType.)
    def LPAImp(self, numIter, modularity=True):
        """Label propagation algorithm for bipartite networks with a
        synchronous updating scheme. Returns a DataFrame containing the
        vertex IDs, the label assignments and the modularity (if requested).

        Keyword Arguments:

        numIter -- number of iterations of LPAb

        modularity -- boolean indicating whether the modularity should be
        calculated and returned.
        """
        # Assign initial label to the users
        initLabelUDF = F.udf(lambda i, j: i if j == 1 else None,
                             types.IntegerType())
        v = self.gf.vertices.withColumn(
            'label', initLabelUDF(F.col('id'), F.col('nodeType')))
        # Add a self-loop edge for every node
        E_self = self.SS.createDataFrame(v.select(F.col('id')).rdd)
        E = AM.getCachedDataFrame(
            self.gf.edges.union(
                E_self.withColumn('dst',
                                  F.col('id')).withColumnRenamed('id', 'src')))

        # Create a new graphframe object with labels attached
        LPAbgf = GraphFrame(v, E)

        # Create a UDAF (User Defined Aggregate Function) that returns the most frequent
        # label
        @pandas_udf("int", PandasUDFType.GROUPED_AGG)
        def maxLabel_udf(label_list):
            label_list = list(filter(None, label_list))
            LabelCounts = Counter(label_list)
            mostCommonLabels = [
                i[0] for i in LabelCounts.items()
                if i[1] == max(LabelCounts.values())
            ]
            return np.random.choice(mostCommonLabels)

        for iter_ in range(numIter):
            for nodeType in [1, 2]:
                # For user and repo nodes, send their labels to
                # their destination nodes in alternating order
                msgForDst = F.when(AM.src['nodeType'] == nodeType,
                                   AM.src['label'])
                # If it's repo's turn to send label to their destinations,
                # also send repo's label's to its contributors
                if nodeType == 2:
                    msgForSrc = F.when(AM.src['nodeType'] == 1,
                                       AM.dst['label'])
                else:
                    msgForSrc = None

                # Aggregate messages received from each node
                aggregates = LPAbgf.aggregateMessages(
                    aggCol=maxLabel_udf(AM.msg).alias("aggMess"),
                    sendToDst=msgForDst,
                    sendToSrc=msgForSrc)
                v = LPAbgf.vertices

                # Update Labels for each node; If there is message for
                # the node, update the node's Label
                newLabelCol = F.when(aggregates["aggMess"].isNotNull(),
                                     aggregates["aggMess"]).otherwise(
                                         v['label'])
                # Outer join aggregates and vertices
                vNew = (
                    v.join(aggregates,
                           on=(v['id'] == aggregates['id']),
                           how='left_outer').drop(aggregates['id'])
                    # Compute new column
                    .withColumn('newLabel', newLabelCol)
                    # Drop messages
                    .drop('aggMess')
                    # Drop old labels
                    .drop('label').withColumnRenamed('newLabel', 'label'))

                cachedvNew = AM.getCachedDataFrame(vNew)
                LPAbgf = GraphFrame(cachedvNew, E)
        # Drop the self-loop edges by restoring the original edge set
        LPAbgf = GraphFrame(LPAbgf.vertices, self.gf.edges)
        return LPAbgf
Example No. 13
def shortest_path(sql_context,
                  g,
                  origin,
                  destination,
                  column_name="cost",
                  directed=True,
                  weight=True):
    """

    :param sql_context:
    :param g:
    :param origin:
    :param destination:
    :param column_name:
    :param directed:
    :param weight:
    :return: all path shortest from origin to destination if distance==float("inf") them cann't reach node target
    """

    if g.vertices.filter(g.vertices.id == destination).count() == 0:
        # createDataFrame([], schema) avoids relying on an emptyRDD() method,
        # which SQLContext does not provide
        return sql_context.createDataFrame([], g.vertices.schema) \
            .withColumn("path", F.array())

    def add_path(paths, id):
        return paths + [id]

    def add_other_path(path1, path2):
        return [path1] + [path2]

    add_path_udf = F.udf(add_path, ArrayType(StringType()))
    add_other_path_udf = F.udf(add_other_path, ArrayType(StringType()))

    vertices = g.vertices.withColumn("visited", F.lit(False))\
        .withColumn("distance", F.when(g.vertices['id'] == origin, 0).otherwise(float("inf"))) \
        .withColumn("path", F.array())
    cached_vertices = AM.getCachedDataFrame(vertices)
    g2 = GraphFrame(cached_vertices, g.edges)
    while g2.vertices.filter('visited == False').first():
        current_node_id = g2.vertices.filter('visited == False')\
            .sort("distance").first().id

        if weight:
            msg_distance = AM.src['distance'] + AM.edge[column_name]
        else:
            msg_distance = AM.src['distance'] + 1

        msg_path = add_path_udf(AM.src["path"], AM.src["id"])
        msg_for_dst = F.when(AM.src["id"] == current_node_id,
                             F.struct(msg_distance, msg_path))
        if directed:
            new_distances = g2.aggregateMessages(
                F.min(AM.msg).alias("aggMess"), sendToDst=msg_for_dst)
        else:
            if weight:
                msg_distance = AM.dst['distance'] + AM.edge[column_name]
            else:
                msg_distance = AM.dst['distance'] + 1

            msg_path = add_path_udf(AM.dst["path"], AM.dst["id"])
            msg_for_src = F.when(AM.dst["id"] == current_node_id,
                                 F.struct(msg_distance, msg_path))
            new_distances = g2.aggregateMessages(
                F.min(AM.msg).alias("aggMess"),
                sendToDst=msg_for_dst,
                sendToSrc=msg_for_src)
        new_visited_col = F.when(
            g2.vertices.visited | (g2.vertices.id == current_node_id),
            True).otherwise(False)

        new_distances_col = F.when(
            new_distances["aggMess"].isNotNull() &
            (new_distances.aggMess["col1"] < g2.vertices.distance),
            new_distances.aggMess["col1"]) \
            .otherwise(g2.vertices.distance)

        new_path_col = \
            F.when(
                new_distances["aggMess"].isNotNull() & (new_distances.aggMess["col1"] < g2.vertices.distance),
                new_distances.aggMess["col2"]
            ) \
            .when(
                new_distances["aggMess"].isNotNull() & (new_distances.aggMess["col1"] == g2.vertices.distance),
                add_other_path_udf(g2.vertices.path, new_distances.aggMess["col2"])
            ) \
            .otherwise(g2.vertices.path)
        new_vertices = g2.vertices.join(new_distances, on="id", how="left_outer")\
            .drop(new_distances["id"])\
            .withColumn("visited", new_visited_col)\
            .withColumn("newDistance", new_distances_col)\
            .withColumn("newPath", new_path_col)\
            .drop("aggMess", "distance", "path")\
            .withColumnRenamed("newDistance", "distance")\
            .withColumnRenamed("newPath", "path")

        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g2 = GraphFrame(cached_new_vertices, g2.edges)
        if g2.vertices.filter(g2.vertices.id == destination).first().visited:
            return g2.vertices.filter(g2.vertices.id == destination)\
                .withColumn("newPath", add_path_udf("path", "id"))\
                .drop("visited", "path")\
                .withColumnRenamed("newPath", "path")
    return sql_context.createDataFrame([], g.vertices.schema) \
        .withColumn("path", F.array())