예제 #1
0
def get_sibling_attrs(g, transient_id):
    """
    Build a traversal summarising a transient id and its sibling nodes.

    The traversal emits a map with the keys ``identity_group_id``,
    ``persistent_id``, ``attributes``, ``ip_location`` and
    ``iab_categories``:
        * attributes - node property maps merged per key, values deduplicated
        * ip_location - property maps of the nodes reached over "uses" edges
        * iab_categories - distinct "categoryCode" values of the nodes that
          link to the visited websites

    When the vertex has an incoming "has_identity" edge, the summary is
    gathered across every node that shares that identity; otherwise only the
    vertex itself is inspected and both id fields are empty strings.

    The traversal is returned un-iterated; the caller decides how to run it.
    """
    summary_fields = (
        "identity_group_id",
        "persistent_id",
        "attributes",
        "ip_location",
        "iab_categories",
    )

    # --- branch 1: the transient id is attached to a persistent identity ---
    # Every sub-traversal here starts at the identity node and fans back out
    # over "has_identity" to reach all sibling nodes.
    sibling_attributes = (
        out("has_identity").valueMap().unfold()
        .group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    sibling_locations = (
        out("has_identity").out("uses").dedup().valueMap().fold()
    )
    sibling_categories = (
        out("has_identity").out("visited").in_("links_to")
        .values("categoryCode").dedup().fold()
    )
    with_identity = (
        in_("has_identity")
        .project(*summary_fields)
        .by(in_("member").values("igid"))
        .by(values("pid"))
        .by(sibling_attributes)
        .by(sibling_locations)
        .by(sibling_categories)
    )

    # --- branch 2: no persistent identity, describe the node on its own ---
    own_attributes = (
        valueMap().unfold()
        .group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    own_locations = out("uses").dedup().valueMap().fold()
    own_categories = (
        out("visited").in_("links_to")
        .values("categoryCode").dedup().fold()
    )
    without_identity = (
        project(*summary_fields)
        .by(constant(""))
        .by(constant(""))
        .by(own_attributes)
        .by(own_locations)
        .by(own_categories)
    )

    # The predicate checks whether this transient id has a persistent id.
    has_persistent_id = in_("has_identity")

    return g.V(transient_id).choose(
        has_persistent_id, with_identity, without_identity)
def undecided_user_audience_check(g, transient_id, website_url,
                                  thank_you_page_url, since,
                                  min_visited_count):
    """
    Build a yes/no traversal telling whether a transient id is in an audience.

    The visits of every node sharing an identity with ``transient_id`` are
    considered, restricted to "visited" edges whose ``ts`` is greater than
    ``since``. The traversal yields ``True`` when both hold:
        * ``website_url`` was visited strictly more than ``min_visited_count``
          times
        * ``thank_you_page_url`` was not visited at all
    and ``False`` otherwise.

    The traversal is returned un-iterated.
    """
    # All recent visit edges of the sibling nodes of this transient id.
    recent_visits = (
        g.V(transient_id)
        .hasLabel("transientId")
        .in_("has_identity")
        .out("has_identity")
        .outE("visited")
        .has("ts", P.gt(since))
    )

    # Tally both kinds of visit into the shared "visits" side-effect map,
    # then emit that map as a single traverser.
    visit_tally = (
        recent_visits
        .choose(
            has("visited_url", website_url),
            groupCount("visits").by(constant("page_visits")))
        .choose(
            has("visited_url", thank_you_page_url),
            groupCount("visits").by(constant("thank_you_page_vists")))
        .cap("visits")
    )

    # A missing thank-you counter is treated as zero visits.
    no_thank_you_visit = coalesce(
        select("thank_you_page_vists"), constant(0)).is_(0)
    enough_page_visits = select("page_visits").is_(P.gt(min_visited_count))
    meets_criteria = (
        and_(no_thank_you_visit, enough_page_visits)
        .choose(count().is_(1), constant(True))
    )

    # Fall back to False when the tally does not pass the filter above.
    return visit_tally.coalesce(meets_criteria, constant(False))
예제 #3
0
def query_transient_nodes_for_website(g, website_id, limit=10000):
    """
    List visitors of a website grouped by the persistent id they belong to.

    At most ``limit`` visitor nodes are inspected. Each result is a map with
    ``persistent-node-id`` (the ``pid``, or the placeholder
    ``"transient-nodes-connected-to-website"`` for visitors without one) and
    ``transient-nodes`` (up to 100 distinct ``uid`` values). Groups holding a
    single uid are dropped. The traversal is executed and returned as a list.
    """
    # Pair every visitor uid with the pids reachable over "has_identity".
    visitors = (
        g.V(website_id)
        .in_("visited")
        .limit(limit)
        .project("uid", "pid")
        .by("uid")
        .by(in_("has_identity").values("pid").fold())
    )

    # Group key: the pid when there is one, otherwise a fixed placeholder.
    group_key = coalesce(
        select("pid").unfold(),
        constant("transient-nodes-connected-to-website"))
    group_members = select("uid").dedup().limit(100).fold()

    grouped = (
        visitors
        .group().by(group_key).by(group_members)
        .unfold()
        .project("persistent-node-id", "transient-nodes")
        .by(select(Column.keys))
        .by(select(Column.values))
    )

    # Keep only groups with more than one transient node.
    has_several_nodes = (
        select("transient-nodes").unfold().count().is_(P.gt(1))
    )
    return grouped.where(has_several_nodes).toList()
예제 #4
0
def check_duplicate_ip_addresses(g):
    """networks with duplicate ip addresses
    """
    # NOTE: the docstring above is printed verbatim in the report header
    # (via ``__doc__``), so it doubles as user-facing text.
    #
    # For every "virtual_network" vertex, collect the vertex itself followed
    # by one single-entry map per "instance_ip_address" value that is shared
    # by more than one "instance_ip" neighbour. Networks without any such
    # map are filtered out (the folded list would only hold the vertex).
    # The matching rows are printed and also returned to the caller.
    shared_addresses = (
        __.in_()
        .hasLabel("instance_ip")
        .has("instance_ip_address")
        .group().by("instance_ip_address")
        .unfold()
        .filter(lambda: "it.get().value.size() > 1")
    )
    network_with_duplicates = (
        union(select('vn'), shared_addresses)
        .fold()
        .filter(lambda: "it.get().size() > 1")
    )
    findings = (
        g.V()
        .hasLabel("virtual_network")
        .as_('vn')
        .flatMap(network_with_duplicates)
        .toList()
    )

    if findings:
        headline = check_duplicate_ip_addresses.__doc__.strip()
        printo('Found %d %s:' % (len(findings), headline))

    for finding in findings:
        # First item is the vn
        network = finding[0]
        # FIXME:
        network.label = 'virtual_network'
        resource = v_to_r(network)
        printo('  - %s/%s - %s' %
               (resource.type, resource.uuid, resource.fq_name))
        # Remaining items map an ip address to the instance ips sharing it.
        for address_map in finding[1:]:
            for address, instance_ips in address_map.items():
                printo("      %s:" % address)
                for instance_ip in instance_ips:
                    resource = v_to_r(instance_ip)
                    printo('        - %s/%s - %s' %
                           (resource.type, resource.uuid, resource.fq_name))
    return findings
예제 #5
0
def recommend_similar_audience(g,
                               website_url,
                               categories_limit=3,
                               search_time_limit_in_seconds=15):
    """Recommend an audience similar to the visitors of ``website_url``.

    Two steps:
        1. Run a query computing, per "categoryCode", the mean visit count
           across the persistent ids that visited ``website_url``; keep the
           ``categories_limit`` categories with the highest mean.
        2. Build (without running) a traversal over persistent ids that
           visited pages in those categories, and also some url other than
           ``website_url``, keeping the ones whose own count exceeds the
           (integer-truncated) mean in at least one of the categories.

    The search part of step 2 is capped by a ``timeLimit`` of
    ``search_time_limit_in_seconds`` seconds. The returned traversal yields
    the ``uid`` values of the nodes attached to the matching persistent ids.
    """
    # Step 1: category profile of the "average" visitor of the website.
    visits_per_category = (
        out("has_identity").out("visited").in_("links_to")
        .groupCount().by("categoryCode")
    )
    average_guy = (
        g.V(website_url)
        .in_("visited")
        .in_("has_identity")
        .dedup()
        .hasLabel("persistentId")
        .group().by().by(visits_per_category)
        .select(Column.values)
        .unfold()
        .unfold()
        .group().by(Column.keys).by(select(Column.values).mean())
        .unfold()
        .order().by(Column.values, Order.desc)
        .limit(categories_limit)
    )

    # Each result row is a single-entry map; flatten it into a (key, value)
    # pair so the rows can be combined into one dict.
    most_popular_categories = dict(
        chain(*entry.items()) for entry in average_guy.toList())

    # Step 2: per-candidate stats limited to the popular categories, with
    # missing categories counted as zero.
    guy_stats_subquery = (
        out("has_identity").out("visited").in_("links_to")
        .groupCount().by("categoryCode")
        .project(*most_popular_categories.keys())
    )
    conditions_subqueries = []
    for category, mean_visits in most_popular_categories.items():
        guy_stats_subquery = guy_stats_subquery.by(
            choose(select(category), select(category), constant(0)))
        above_average = (
            select(Column.values).unfold().select(category)
            .is_(P.gt(int(mean_visits)))
        )
        conditions_subqueries.append(above_average)

    visited_another_url = (
        out("has_identity").out("visited").has("url", P.neq(website_url))
    )
    candidates = (
        g.V()
        .hasLabel("websiteGroup")
        .has("categoryCode", P.within(list(most_popular_categories.keys())))
        .out("links_to")
        .in_("visited")
        .dedup()
        .in_("has_identity")
        .dedup()
        .hasLabel("persistentId")
        .where(visited_another_url)
        .timeLimit(search_time_limit_in_seconds * 1000)
    )

    # A candidate qualifies when it beats the average in any category.
    beats_average_somewhere = (
        group().by().by(guy_stats_subquery)
        .where(or_(*conditions_subqueries))
    )
    return (
        candidates
        .local(beats_average_somewhere)
        .select(Column.keys)
        .unfold()
        .out("has_identity")
        .values("uid")
    )
예제 #6
0
def _query_users_activities_stats(g,
                                  website_url,
                                  most_popular_categories,
                                  search_time_limit_in_seconds=30):
    """
    Build a traversal with per-user category statistics.

    Candidates are persistent ids that visited a page linked from a
    "websiteGroup" whose "categoryCode" is one of the keys of
    ``most_popular_categories`` and that also visited some url other than
    ``website_url``. The candidate search is capped by a ``timeLimit`` of
    ``search_time_limit_in_seconds`` seconds.

    Each emitted map has:
        * pid  - the persistent id vertex
        * iabs - visit counts per "categoryCode"
        * tids - ``uid`` values of the nodes attached to that persistent id

    The traversal is returned un-iterated.
    """
    wanted_codes = list(most_popular_categories.keys())
    visited_another_url = (
        out("has_identity").out("visited").has("url", P.neq(website_url))
    )
    candidates = (
        g.V()
        .hasLabel("websiteGroup")
        .has("categoryCode", P.within(wanted_codes))
        .out("links_to")
        .in_("visited")
        .dedup()
        .in_("has_identity")
        .dedup()
        .hasLabel("persistentId")
        .where(visited_another_url)
        .timeLimit(search_time_limit_in_seconds * 1000)
    )

    # Evaluated per candidate: a {persistent id: category counts} map,
    # reshaped into the pid / iabs / tids record.
    category_counts = (
        out("has_identity").out("visited").in_("links_to")
        .groupCount().by("categoryCode")
    )
    attached_uids = (
        select(Column.keys).unfold()
        .out("has_identity").values("uid").fold()
    )
    per_user_stats = (
        group().by().by(category_counts)
        .project("pid", "iabs", "tids")
        .by(select(Column.keys).unfold())
        .by(select(Column.values).unfold())
        .by(attached_uids)
    )

    return candidates.local(per_user_stats)
예제 #7
0
def get_activity_of_early_adopters(g,
                                   thank_you_page_url,
                                   skip_single_transients=False,
                                   limit=5):
    """
    Build a traversal describing the first visitors of a thank you page.

    Steps:
        * walk the "visited" edges pointing at the thank you page, ordered
          by ``ts``
        * resolve each visitor to its persistent identity when it has one,
          otherwise keep the transient node itself; with
          ``skip_single_transients=True`` visitors without a persistent
          identity are dropped instead
        * keep the first ``limit`` distinct identities
        * for each, collect the activity on the website group that links to
          the thank you page

    Each emitted map has the keys ``persistent_id`` (empty string for
    unmatched transients), ``purchase_ts``, ``devices`` (uid -> type) and
    ``visits`` (list of ``transientId`` / ``url`` / ``ts`` records ordered by
    ``ts``). The traversal is returned un-iterated.
    """
    # Visit edges of the thank you page, oldest first. The group linking to
    # the page is labelled so the visits can be restricted to it later.
    purchases = (
        g.V(thank_you_page_url)
        .hasLabel("website")
        .as_("thank_you")
        .in_("links_to")
        .as_("website_group")
        .select("thank_you")
        .inE("visited")
        .order().by("ts")
    )

    # Optional filter: only visitors that belong to a persistent identity.
    purchases = purchases.choose(
        constant(skip_single_transients).is_(P.eq(True)),
        where(outV().in_("has_identity")),
        identity())

    # Tag each purchase with the identity that made it.
    persistent_buyer = (
        project("type", "id", "purchase_ts")
        .by(constant("persistent"))
        .by(outV().in_("has_identity"))
        .by(values("ts"))
    )
    transient_buyer = (
        project("type", "id", "purchase_ts")
        .by(constant("transient"))
        .by(outV())
        .by(values("ts"))
    )
    first_buyers = (
        purchases
        .choose(outV().in_("has_identity"), persistent_buyer, transient_buyer)
        .dedup("id")
        .limit(limit)
    )

    # Normalise both kinds of buyer into the same record shape, where
    # "transient_id" is always a list of nodes.
    persistent_record = (
        project("persistent_id", "transient_id", "purchase_ts")
        .by(select("id").values("pid"))
        .by(select("id").out("has_identity").fold())
        .by(select("purchase_ts"))
    )
    transient_record = (
        project("persistent_id", "transient_id", "purchase_ts")
        .by(constant(""))
        .by(select("id").fold())
        .by(select("purchase_ts"))
    )
    buyers = first_buyers.choose(
        select("type").is_("persistent"),
        persistent_record,
        transient_record)

    # Final report pieces.
    devices = (
        select("transient_id").unfold()
        .group().by(values("uid")).by(values("type"))
    )
    on_same_website_group = (
        inV().in_("links_to").where(P.eq("website_group"))
    )
    visits = (
        select("transient_id").unfold()
        .outE("visited")
        .order().by("ts")
        .where(on_same_website_group)
        .project("transientId", "url", "ts")
        .by("uid")
        .by("visited_url")
        .by("ts")
        .fold()
    )

    return (
        buyers
        .project("persistent_id", "purchase_ts", "devices", "visits")
        .by(select("persistent_id"))
        .by(select("purchase_ts"))
        .by(devices)
        .by(visits)
    )
예제 #8
0
def _get_categories_popular_across_audience_of_website(g,
                                                       website_url,
                                                       categories_limit=3):
    """
    Build a traversal ranking categories by mean visits per visitor.

    For every persistent id that visited ``website_url``, visits are counted
    per "categoryCode"; the counts are then averaged per category across
    those persistent ids. The traversal emits the ``categories_limit``
    entries with the highest mean, in descending order, and is returned
    un-iterated.
    """
    # Per-visitor histogram: categoryCode -> number of visits.
    visits_per_category = (
        out("has_identity").out("visited").in_("links_to")
        .groupCount().by("categoryCode")
    )
    audience_histograms = (
        g.V(website_url)
        .in_("visited")
        .in_("has_identity")
        .dedup()
        .hasLabel("persistentId")
        .group().by().by(visits_per_category)
    )

    # Flatten the histograms to single entries and average per category.
    mean_per_category = (
        audience_histograms
        .select(Column.values)
        .unfold()
        .unfold()
        .group().by(Column.keys).by(select(Column.values).mean())
    )

    return (
        mean_per_category
        .unfold()
        .order().by(Column.values, Order.desc)
        .limit(categories_limit)
    )
예제 #9
0
def query_users_active_in_given_date_intervals(g, dt_conditions, limit=300):
    """
    Build a traversal of persistent ids with visits matching date conditions.

    Persistent ids are randomly sampled (``coin(0.5)``) and capped at
    ``limit`` *before* the activity filter is applied, so fewer than
    ``limit`` results may be produced. A persistent id is kept when at least
    one of its "visited" edges satisfies any traversal in ``dt_conditions``.

    Each emitted map has ``persistent_id``, ``attributes`` (browser / email /
    uid values merged across the attached nodes, deduplicated) and
    ``ip_location`` (property maps of nodes reached over "uses" edges).
    The traversal is returned un-iterated.
    """
    sampled_users = (
        g.V().hasLabel("persistentId").coin(0.5).limit(limit)
    )

    # True when any visit edge matches one of the supplied conditions.
    active_in_interval = (
        out("has_identity").outE("visited").or_(*dt_conditions)
    )

    merged_attributes = (
        out("has_identity")
        .valueMap("browser", "email", "uid")
        .unfold()
        .group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    locations = out("has_identity").out("uses").dedup().valueMap().fold()

    return (
        sampled_users
        .where(active_in_interval)
        .project("persistent_id", "attributes", "ip_location")
        .by(values("pid"))
        .by(merged_attributes)
        .by(locations)
    )
예제 #10
0
def query_users_intersted_in_content(g, iab_codes, limit=10000):
    """
    Build a traversal of persistent ids that visited given IAB categories.

    Persistent ids are randomly sampled (``coin(0.8)``) and capped at
    ``limit`` *before* the content filter is applied, so fewer than
    ``limit`` results may be produced. A persistent id is kept when one of
    its visited websites is linked from a node whose "categoryCode" is in
    ``iab_codes``.

    Each emitted map has ``persistent_id``, ``attributes`` (browser / email /
    uid values merged across the attached nodes, deduplicated) and
    ``ip_location`` (property maps of nodes reached over "uses" edges).
    The traversal is returned un-iterated.
    """
    sampled_users = (
        g.V().hasLabel("persistentId").coin(0.8).limit(limit)
    )

    visited_matching_category = (
        out("has_identity")
        .out("visited")
        .in_("links_to")
        .has("categoryCode", P.within(iab_codes))
    )

    merged_attributes = (
        out("has_identity")
        .valueMap("browser", "email", "uid")
        .unfold()
        .group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    locations = out("has_identity").out("uses").dedup().valueMap().fold()

    return (
        sampled_users
        .where(visited_matching_category)
        .project("persistent_id", "attributes", "ip_location")
        .by(values("pid"))
        .by(merged_attributes)
        .by(locations)
    )
def undecided_users_audience(g, website_url, thank_you_page_url, since,
                             min_visited_count):
    """
    Build a traversal of all users that meet the audience conditions.

    The traversal yields the ``uid`` values of the transient identities
    attached to the matching persistent ids. A persistent id matches when:
        * it visited ``website_url`` strictly more than ``min_visited_count``
          times on edges whose ``ts`` is greater than ``since``
        * it is not among the persistent ids that visited
          ``thank_you_page_url`` after ``since``

    The traversal is returned un-iterated.
    """
    # Count recent visits of the website per persistent id.
    visit_counts = (
        g.V(website_url)
        .hasLabel("website")
        .inE("visited")
        .has("ts", P.gt(since))
        .outV()
        .in_("has_identity")
        .groupCount()
        .unfold()
        .dedup()
    )
    frequent_visitors = (
        visit_counts
        .where(select(Column.values).is_(P.gt(min_visited_count)))
        .select(Column.keys)
        .as_("pids")
    )

    # For each frequent visitor, the pids seen on the thank you page
    # since the given timestamp.
    converted_pids = (
        out("has_identity")
        .outE("visited")
        .has("visited_url", thank_you_page_url)
        .has("ts", P.gt(since))
        .outV()
        .in_("has_identity")
        .dedup()
        .values("pid")
        .fold()
    )
    is_converted = has("pid", where(P.within("pids_that_visited")))

    return (
        frequent_visitors
        .map(converted_pids)
        .as_("pids_that_visited")
        .select("pids")
        .not_(is_converted)
        .out("has_identity")
        .values("uid")
    )
def _get_persistent_ids_which_visited_website(g, root_url):
    """
    Build a traversal pairing a website with identities that visited it.

    The traversal emits one map with:
        * root_url       - property map (including id/label tokens) of the
                           starting vertex
        * persistent_ids - up to 50 distinct nodes reached by following
                           "visited" and then "has_identity" in-edges

    The traversal is returned un-iterated.
    """
    # Remember the starting vertex in a side effect so it can be read back
    # after the traversal has moved on to the visitors.
    visitors = (
        g.V(root_url)
        .aggregate("root_url")
        .in_("visited")
        .in_("has_identity")
        .dedup()
        .limit(50)
        .fold()
    )
    website_properties = select("root_url").unfold().valueMap(True)

    # The empty by() projects the folded visitor list as-is.
    return (
        visitors
        .project("root_url", "persistent_ids")
        .by(website_properties)
        .by()
    )