Пример #1
0
def get_activity_of_early_adopters(g,
                                   thank_you_page_url,
                                   skip_single_transients=False,
                                   limit=5):
    """
    Build the traversal that lists the earliest visitors of a thank-you page.

    Starting from the "website" vertex identified by ``thank_you_page_url``
    the traversal:
        * walks its incoming "visited" edges ordered by "ts";
        * maps every visit to the vertex reached through an incoming
          "has_identity" edge of the visitor (tagged "persistent") or, when
          there is none, to the visiting vertex itself (tagged "transient");
        * de-duplicates on that identity and keeps the first ``limit`` ones;
        * for each of them collects *all* "visited" edges that lead to pages
          sharing the thank-you page's "links_to" parent.

    Arguments:
        g: traversal source the query is started from.
        thank_you_page_url: id passed to ``g.V()`` to locate the page vertex.
        skip_single_transients: when True, visits whose visitor has no
            incoming "has_identity" edge are dropped up front.
        limit: maximum number of distinct identities to report.

    Returns:
        The traversal itself -- it is not iterated here. Each result is a map
        with the keys "persistent_id", "purchase_ts", "devices" and "visits".
    """

    def owning_identity():
        # Built fresh on every call so no anonymous traversal object is
        # shared between two parent steps.
        return outV().in_("has_identity")

    def identity_record(kind, who):
        # First projection: who visited, of which kind, and when.
        return (project("type", "id", "purchase_ts")
                .by(constant(kind))
                .by(who)
                .by(values("ts")))

    def device_record(persistent_id, devices):
        # Second projection: normalises both kinds to the same three keys.
        return (project("persistent_id", "transient_id", "purchase_ts")
                .by(persistent_id)
                .by(devices)
                .by(select("purchase_ts")))

    # Visits of the thank-you page, oldest first. "website_group" labels the
    # page's "links_to" parent so later steps can compare against it.
    page_visits = (g.V(thank_you_page_url)
                   .hasLabel("website").as_("thank_you")
                   .in_("links_to").as_("website_group")
                   .select("thank_you")
                   .inE("visited")
                   .order().by("ts"))

    # Optionally keep only visits made by somebody with a known identity.
    candidate_visits = page_visits.choose(
        constant(skip_single_transients).is_(P.eq(True)),
        where(owning_identity()),
        identity())

    # One record per distinct identity, capped at `limit`.
    first_adopters = (candidate_visits
                      .choose(owning_identity(),
                              identity_record("persistent",
                                              owning_identity()),
                              identity_record("transient", outV()))
                      .dedup("id")
                      .limit(limit))

    # Persistent identities expand to everything behind "has_identity";
    # transient ones are wrapped in a single-element list with an empty
    # persistent id.
    adopters_with_devices = first_adopters.choose(
        select("type").is_("persistent"),
        device_record(select("id").values("pid"),
                      select("id").out("has_identity").fold()),
        device_record(constant(""), select("id").fold()))

    devices_by_uid = (select("transient_id").unfold()
                      .group().by(values("uid")).by(values("type")))

    # Only visits to pages under the same "links_to" parent as the
    # thank-you page are reported.
    visits_on_domain = (select("transient_id").unfold()
                        .outE("visited")
                        .order().by("ts")
                        .where(inV().in_("links_to")
                               .where(P.eq("website_group")))
                        .project("transientId", "url", "ts")
                        .by("uid").by("visited_url").by("ts")
                        .fold())

    return (adopters_with_devices
            .project("persistent_id", "purchase_ts", "devices", "visits")
            .by(select("persistent_id"))
            .by(select("purchase_ts"))
            .by(devices_by_uid)
            .by(visits_on_domain))
Пример #2
0
def _calculate_arrivals(topo_traversal: GraphTraversalSource,
                        source_vertex: Vertex, arrival_rates: ARRIVAL_RATES,
                        output_rates: DefaultDict[int, Dict[str, float]],
                        i2i_rps: pd.DataFrame) -> ARRIVAL_RATES:
    """ Adds the output of one source vertex to the arrival rates of every
    task it is logically connected to.

    Arguments:
        topo_traversal: traversal source used to query the outgoing
            "logically_connected" edges of the source vertex.
        source_vertex: the vertex whose output is being distributed.
        arrival_rates: accumulator indexed by destination task and then by a
            (stream, source component) tuple. It is updated in place with
            ``+=`` -- presumably a nested defaultdict, verify against caller.
        output_rates: output rate per source task and stream.
        i2i_rps: routing probabilities, looked up by row with the key
            (source task, destination task, stream).

    Returns:
        The same ``arrival_rates`` object that was passed in.
    """

    # One record per outgoing logical edge of the source vertex
    edge_query = (
        topo_traversal.V(source_vertex).outE("logically_connected")
        .project("source_task", "source_component", "stream_name",
                 "destination_task", "destination_component")
        .by(outV().properties("task_id").value())
        .by(outV().properties("component").value())
        .by(properties("stream").value())
        .by(inV().properties("task_id").value())
        .by(inV().properties("component").value()))
    downstream: List[Dict[str, Union[str, int, float]]] = edge_query.toList()

    # Nothing downstream, so there is nothing to add
    if not downstream:
        return arrival_rates

    # Every edge starts at the same vertex, so the first record is enough to
    # identify the source
    first_edge = downstream[0]
    source_task: int = cast(int, first_edge["source_task"])
    source_component: str = cast(str, first_edge["source_component"])

    LOG.debug("Processing output from source instance %s_%d", source_component,
              source_task)

    for edge in downstream:
        stream: str = cast(str, edge["stream_name"])

        # The lookup deliberately stays inside the try block: a missing
        # stream means this edge carries no known output and is skipped
        try:
            emitted: float = cast(float, output_rates[source_task][stream])
        except KeyError:
            LOG.debug(
                "No output rate information for source task %d on "
                "stream %s. Skipping the outgoing edge", source_task, stream)
            continue

        destination_task: int = cast(int, edge["destination_task"])

        # An edge without a routing probability contributes nothing
        contribution: float = 0.0
        try:
            routing_prob: float = float(
                i2i_rps.loc(axis=0)[source_task, destination_task, stream])
        except KeyError:
            LOG.debug(
                "Unable to find routing probability for connection from "
                "task %d to %d on stream %s", source_task, destination_task,
                stream)
        else:
            contribution = emitted * routing_prob

            LOG.debug(
                "Output from %s-%d to %s-%d on stream %s is "
                "calculated as %f * %f = %f", source_component, source_task,
                edge["destination_component"], destination_task, stream,
                emitted, routing_prob, contribution)

        arrival_rates[destination_task][(stream,
                                         source_component)] += contribution

    return arrival_rates
Пример #3
0
def _create_physical_connections(graph_client: GremlinClient, topology_id: str,
                                 topology_ref: str) -> None:
    """ Adds "physically_connected" edges for every logical connection of a
    topology and marks each logical edge as "local" or "remote".

    For each "logically_connected" edge between bolt/spout vertices the
    physical path source -> stream manager(s) -> destination is created,
    skipping any hop that already exists. When both instances sit in the same
    container a single stream manager is used and the logical edge gets
    type "local"; otherwise the two stream managers are linked as well and
    the logical edge gets type "remote".

    Arguments:
        graph_client: client providing the topology sub-graph traversal and
            the ``graph_traversal`` source used for the writes.
        topology_id: identifier of the topology.
        topology_ref: reference of the topology.
    """

    LOG.info(
        "Creating physical connections for topology: %s, reference: "
        "%s", topology_id, topology_ref)

    topo_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    # Collect every logical edge together with both of its endpoints and the
    # container and stream manager each endpoint belongs to
    logical_edges: List[Dict[str, Union[Vertex, Edge]]] = (
        topo_traversal.V().hasLabel(P.within("bolt", "spout"))
        .outE("logically_connected")
        .project("source_instance", "source_container",
                 "source_stream_manager", "l_edge", "destination_instance",
                 "destination_container", "destination_stream_manager")
        .by(outV())
        .by(outV().out("is_within"))
        .by(outV().out("is_within").in_("is_within")
            .hasLabel("stream_manager"))
        .by()
        .by(inV())
        .by(inV().out("is_within"))
        .by(inV().out("is_within").in_("is_within")
            .hasLabel("stream_manager"))
        .toList())

    LOG.debug("Processing %d logical connected vertices", len(logical_edges))

    def ensure_link(origin: Vertex, target: Vertex) -> None:
        # coalesce() stops at the first branch that yields a result, so the
        # edge is only added when origin is not already linked to target
        (graph_client.graph_traversal.V(origin).coalesce(
            out("physically_connected").is_(target),
            addE("physically_connected").to(target)).next())

    for record in logical_edges:
        source: Vertex = record["source_instance"]
        source_container: Vertex = record["source_container"]
        source_manager: Vertex = record["source_stream_manager"]
        destination: Vertex = record["destination_instance"]
        destination_container: Vertex = record["destination_container"]
        destination_manager: Vertex = record["destination_stream_manager"]
        l_edge: Edge = record["l_edge"]

        # First hop: the source instance to its own stream manager
        ensure_link(source, source_manager)

        if source_container == destination_container:
            # Same container means same stream manager, so it delivers to
            # the destination instance directly
            final_hop, edge_type = source_manager, "local"
        else:
            # Different containers: route through both stream managers
            ensure_link(source_manager, destination_manager)
            final_hop, edge_type = destination_manager, "remote"

        # Last hop: the delivering stream manager to the destination instance
        ensure_link(final_hop, destination)

        # Record on the logical edge whether it crosses containers
        graph_client.graph_traversal.E(l_edge).property("type",
                                                        edge_type).next()