def _gen_lateral_bodies(self, within_time: Optional[str] = None):
    """Build one SQL body per entity in ``self._filter.entities``.

    Each step is first expressed as an ORM query over ``Event`` that uses
    sentinel literals, rendered to SQL with ``cursor.mogrify``, and then
    patched textually: the sentinels become the ``{prev_step_person_id}`` and
    ``{prev_step_ts}`` placeholders, ``distinct_id`` references are swapped
    for ``"pdi"."person_id"``, and a join onto ``posthog_persondistinctid``
    is spliced in right after ``FROM "posthog_event"``.

    Args:
        within_time: Optional SQL fragment. When truthy, steps after the first
            also require the event timestamp to be lower than the previous
            step's ``step_ts`` plus this fragment. It is interpolated into the
            SQL text verbatim (presumably an interval expression; confirm
            against callers).

    Returns:
        A dict mapping ``"step_<index>"`` to a ``sql.SQL`` object.
    """
    lateral_bodies = {}
    for index, step in enumerate(self._filter.entities):
        if step.type == TREND_FILTER_TYPE_EVENTS:
            filter_key = "event"
        else:
            filter_key = "action__pk"

        step_lookups = {filter_key: step.id, "team_id": self._team.pk}
        if index > 0:
            # Sentinel values: they only exist so they can be found and
            # replaced in the rendered SQL further down.
            step_lookups["distinct_id"] = "1234321"
            step_lookups["timestamp__gte"] = timezone.now().replace(
                year=2000, month=1, day=1, hour=0, minute=0, second=0, microsecond=0
            )

        step_events = (
            Event.objects.values("distinct_id")
            .annotate(
                step_ts=Min("timestamp"),
                person_id=Value("99999999", IntegerField()),
            )
            .filter(self._filter.date_filter_Q, **step_lookups)
            .filter(
                properties_to_Q(
                    self._filter.properties,
                    team_id=self._team.pk,
                    filter_test_accounts=self._filter.filter_test_accounts,
                )
            )
            .filter(properties_to_Q(step.properties, team_id=self._team.pk))
        )

        with connection.cursor() as cursor:
            rendered = cursor.mogrify(*step_events.query.sql_with_params())

        if within_time:
            within_clause = ' AND "posthog_event"."timestamp" < "step_{}"."step_ts" + {}'.format(
                index - 1, within_time
            )
        else:
            within_clause = ""

        # Replace placeholders injected by the Django ORM.
        # The ORM doesn't easily allow parameterizing SQL identifiers, so the
        # rendered text is rewritten instead. Order matters: later entries act
        # on the output of earlier ones.
        substitutions = (
            ("'1234321'", "{prev_step_person_id}"),
            ("'2000-01-01T00:00:00+00:00'::timestamptz", "{prev_step_ts} %s" % within_clause),
            ('"posthog_event"."distinct_id"', '"pdi"."person_id"'),
            ("99999999", '"pdi"."person_id"'),
            (', "pdi"."person_id" AS "person_id"', ""),
        )
        body = rendered.decode("utf-8")
        for needle, replacement in substitutions:
            body = body.replace(needle, replacement)

        body = re.sub(
            # Accommodate an alias such as W0 so that the join still ends up
            # right after `FROM posthog_event` and not after
            # `ON pdi.distinct_id = posthog_event.distinct_id`.
            r'FROM "posthog_event"( [A-Z][0-9])?',
            r"FROM posthog_event\1 JOIN posthog_persondistinctid pdi "
            r"ON pdi.distinct_id = posthog_event.distinct_id",
            body,
        )
        lateral_bodies["step_{}".format(index)] = sql.SQL(body)
    return lateral_bodies
def _retrieve_people(self, filter: RetentionFilter, team: Team):
    """Return serialized people for one retention cell.

    Selects person ids from events that fall in
    ``filter.recurring_date_filter_Q()`` and match the relevant entity
    condition, restricted to persons who also have a matching reference event.
    Up to 100 ids starting at ``filter.offset`` are loaded as ``Person`` rows
    and serialized with ``PersonSerializer``.

    Args:
        filter: Retention filter supplying period, entities, properties,
            selected interval and offset.
        team: Team whose events and persons are queried.

    Returns:
        ``PersonSerializer(people, many=True).data``.
    """
    trunc, _ = self._get_trunc_func("timestamp", filter.period)
    is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME

    entity_condition, _ = self.get_entity_condition(filter.target_entity, "events")
    returning_condition, _ = self.get_entity_condition(filter.returning_entity, "first_event_date")
    # Interval 0 is matched on the target entity, later intervals on the
    # returning entity.
    if filter.selected_interval > 0:
        interval_condition = returning_condition
    else:
        interval_condition = entity_condition

    team_events = Event.objects.filter(team_id=team.pk).add_person_id(team.pk)
    candidate_events = team_events.filter(filter.recurring_date_filter_Q()).filter(
        properties_to_Q(filter.properties, team_id=team.pk)
    )

    if is_first_time_retention:
        # The first matching event per person must fall in the reference window.
        reference_events = (
            Event.objects.filter(team_id=team.pk)
            .filter(properties_to_Q(filter.properties, team_id=team.pk))
            .add_person_id(team.pk)
            .filter(person_id=OuterRef("id"))
            .filter(entity_condition)
            .values("person_id")
            .annotate(first_date=Min(trunc))
            .filter(filter.reference_date_filter_Q("first_date"))
            .distinct()
        )
    else:
        # Any matching event inside the reference window qualifies.
        reference_events = (
            Event.objects.filter(team_id=team.pk)
            .filter(filter.reference_date_filter_Q())
            .filter(properties_to_Q(filter.properties, team_id=team.pk))
            .add_person_id(team.pk)
            .filter(person_id=OuterRef("id"))
            .filter(entity_condition)
        )

    persons_with_reference = (
        Person.objects.filter(id=OuterRef("person_id")).filter(Exists(reference_events)).only("id")
    )
    person_rows = (
        candidate_events.filter(interval_condition)
        .filter(Exists(persons_with_reference))
        .values("person_id")
        .distinct()
    ).all()

    page = person_rows[filter.offset:filter.offset + 100]
    people = Person.objects.filter(team=team, id__in=[row["person_id"] for row in page])
    people = people.prefetch_related(Prefetch("persondistinctid_set", to_attr="distinct_ids_cache"))

    from posthog.api.person import PersonSerializer

    return PersonSerializer(people, many=True).data
def _match_distinct_id(self, distinct_id: str) -> bool:
    """Return whether a person with ``distinct_id`` matches ``self.filters``.

    Looks up persons of ``self.team_id`` linked to the distinct id and applies
    the property conditions built from ``self.filters`` as a person query.
    """
    property_filter = Filter(data=self.filters)
    candidates = Person.objects.filter(
        team_id=self.team_id, persondistinctid__distinct_id=distinct_id
    )
    matching = candidates.filter(
        properties_to_Q(property_filter.properties, team_id=self.team_id, is_person_query=True)
    )
    return matching.exists()
def query_conditions(self) -> List[List[bool]]:
    """Evaluate every feature-flag condition in one query.

    The base query targets ``Person`` (looked up by ``self.distinct_id``) when
    the flag has no ``aggregation_group_type_index``, otherwise ``Group``
    (looked up by ``self.hashed_identifier``). Each condition is added as a
    boolean annotation named ``condition_<index>``.

    Returns:
        A list with one row per matched person/group, each row holding the
        annotation values in condition order.
    """
    flag = self.feature_flag
    if flag.aggregation_group_type_index is not None:
        query: QuerySet = Group.objects.filter(
            team_id=flag.team_id,
            group_type_index=flag.aggregation_group_type_index,
            group_key=self.hashed_identifier,
        )
    else:
        query = Person.objects.filter(
            team_id=flag.team_id,
            persondistinctid__distinct_id=self.distinct_id,
            persondistinctid__team_id=flag.team_id,
        )

    annotation_names = []
    for index, condition in enumerate(flag.conditions):
        name = f"condition_{index}"
        if len(condition.get("properties", {})) > 0:
            # Feature Flags don't support OR filtering yet
            expr: Any = properties_to_Q(
                Filter(data=condition).property_groups.flat,
                team_id=flag.team_id,
                is_direct_query=True,
            )
        else:
            # A condition without properties is annotated as constant true.
            expr = RawSQL("true", [])
        wrapped = ExpressionWrapper(expr, output_field=BooleanField())
        query = query.annotate(**{name: wrapped})
        annotation_names.append(name)

    return list(query.values_list(*annotation_names))
def stats(self, request: request.Request, **kwargs) -> response.Response:
    """Return the 100 most frequent ``$autocapture`` element groups.

    Events of the current team are filtered by the request's properties and
    date range, grouped by ``elements_hash`` and ordered by descending count.
    Each row is returned with its count, hash and the serialized elements of
    the matching ``ElementGroup`` (ordered by ``order``, ``id``).

    A hash with no matching ``ElementGroup`` yields an empty ``elements``
    list. Previously such a row raised ``IndexError`` because the lookup
    indexed ``[0]`` into an empty list.

    Args:
        request: Incoming request; used to build the ``Filter``.
        **kwargs: Unused.

    Returns:
        A response containing a list of ``{"count", "hash", "elements"}`` dicts.
    """
    team_id = self.team_id
    query_filter = Filter(request=request)
    events = (
        Event.objects.filter(team_id=team_id, event="$autocapture")
        .filter(properties_to_Q(query_filter.properties, team_id=team_id))
        .filter(query_filter.date_filter_Q)
    )
    # Materialise once: the rows are needed both for the hash lookup and for
    # building the response.
    top_rows = list(events.values("elements_hash").annotate(count=Count(1)).order_by("-count")[0:100])

    groups = ElementGroup.objects.filter(
        team_id=team_id, hash__in=[row["elements_hash"] for row in top_rows]
    ).prefetch_related(Prefetch("element_set", queryset=Element.objects.order_by("order", "id")))

    # Index groups by hash once instead of scanning all groups for every row.
    # The first group seen for a hash wins, matching the former `[0]` pick.
    elements_by_hash = {}
    for group in groups:
        if group.hash not in elements_by_hash:
            elements_by_hash[group.hash] = group.element_set.all()

    return response.Response(
        [
            {
                "count": row["count"],
                "hash": row["elements_hash"],
                "elements": [
                    ElementSerializer(element).data
                    for element in elements_by_hash.get(row["elements_hash"], [])
                ],
            }
            for row in top_rows
        ]
    )
def _filter_request(self, request: request.Request, queryset: QuerySet) -> QuerySet:
    """Narrow a person queryset according to the request's GET parameters.

    Supported parameters:
        id: Comma-separated ids.
        uuid: Comma-separated uuids.
        search: Space-separated terms. ``has:<key>`` requires the property
            ``<key>`` to exist. All other terms are joined with spaces and
            matched case-insensitively against properties or distinct ids.
        cohort: Cohort id the person must belong to.
        properties: JSON-encoded property filters.

    Search terms are split on the first colon only, so a term with several
    colons no longer raises ``ValueError``. A term whose prefix is not
    ``has`` (for example a URL) is treated as plain search text instead of
    being silently discarded.

    Args:
        request: Incoming request whose ``GET`` mapping is read.
        queryset: Person queryset to filter.

    Returns:
        The filtered queryset, with ``persondistinctid_set`` prefetched into
        ``distinct_ids_cache``.
    """
    if request.GET.get("id"):
        queryset = queryset.filter(id__in=request.GET["id"].split(","))
    if request.GET.get("uuid"):
        queryset = queryset.filter(uuid__in=request.GET["uuid"].split(","))
    if request.GET.get("search"):
        contains = []
        for part in request.GET["search"].split(" "):
            matcher, separator, key = part.partition(":")
            if separator and matcher == "has":
                # Matches for example has:email or has:name
                queryset = queryset.filter(properties__has_key=key)
            else:
                contains.append(part)
        needle = " ".join(contains)
        queryset = queryset.filter(
            Q(properties__icontains=needle) | Q(persondistinctid__distinct_id__icontains=needle)
        ).distinct("id")
    if request.GET.get("cohort"):
        queryset = queryset.filter(cohort__id=request.GET["cohort"])
    if request.GET.get("properties"):
        filter = Filter(data={"properties": json.loads(request.GET["properties"])})
        queryset = queryset.filter(properties_to_Q(filter.properties, team_id=self.team_id))

    queryset = queryset.prefetch_related(Prefetch("persondistinctid_set", to_attr="distinct_ids_cache"))
    return queryset
def query_people_in_range(self, limit: int) -> Dict[str, Optional[str]]:
    """Map distinct ids to the ``email`` property of their person.

    When ``self.filter.distinct_id`` is set, only that person is looked up and
    every one of its distinct ids maps to its email (an empty dict if no such
    person exists). Otherwise the filtered events are wrapped in raw SQL that
    joins persons, groups by distinct id and orders by the latest event
    timestamp.

    Args:
        limit: Maximum number of distinct ids returned by the raw query.

    Returns:
        A dict of ``distinct_id -> email`` (email may be ``None``).
    """
    if self.filter.distinct_id:
        person = Person.objects.filter(
            team=self.team, persondistinctid__distinct_id=self.filter.distinct_id
        ).first()
        if not person:
            return {}
        email = person.properties.get("email")
        return {distinct_id: email for distinct_id in person.distinct_ids}

    team_pk = self.team.pk
    events_query = (
        Event.objects.filter(team=self.team)
        .add_person_id(team_pk)
        .filter(properties_to_Q(self.filter.person_filter_properties, team_id=team_pk))
        .filter(self.date_filter())
        .order_by("-timestamp")
    )
    sql, params = events_query.query.sql_with_params()

    # The ORM-generated statement is embedded as a subquery; its parameters
    # are passed through unchanged to the cursor.
    query = f"""
        SELECT events.distinct_id, MIN(posthog_person.properties->>'email')
        FROM ({sql}) events
        LEFT OUTER JOIN posthog_persondistinctid ON posthog_persondistinctid.distinct_id = events.distinct_id AND posthog_persondistinctid.team_id = {team_pk}
        LEFT OUTER JOIN posthog_person ON posthog_person.id = posthog_persondistinctid.person_id
        GROUP BY events.distinct_id
        ORDER BY MAX(events.timestamp) DESC
        LIMIT {limit}
    """
    with connection.cursor() as cursor:
        cursor.execute(query, params)
        rows = cursor.fetchall()
    return dict(rows)
def query_people_in_range(
    self, team: Team, filter: SessionsFilter, date_filter: Q, limit: int
) -> Dict[str, Optional[str]]:
    """Map distinct ids seen in the filtered events to their person's email.

    Args:
        team: Team whose events are queried.
        filter: Supplies the property conditions.
        date_filter: Q object restricting the event time range.
        limit: Maximum number of rows returned by the raw query.

    Returns:
        A dict of ``distinct_id -> email`` (email may be ``None``).
    """
    team_pk = team.pk
    events_query = (
        Event.objects.filter(team=team)
        .add_person_id(team_pk)
        .filter(properties_to_Q(filter.properties, team_id=team_pk))
        .filter(date_filter)
        .order_by("-timestamp")
        .only("distinct_id")
    )
    sql, params = events_query.query.sql_with_params()

    # The ORM-generated statement is embedded as a subquery; its parameters
    # are passed through unchanged to the cursor.
    query = f"""
        SELECT DISTINCT ON(distinct_id) events.distinct_id, posthog_person.properties->>'email'
        FROM ({sql}) events
        LEFT OUTER JOIN posthog_persondistinctid ON posthog_persondistinctid.distinct_id = events.distinct_id AND posthog_persondistinctid.team_id = {team_pk}
        LEFT OUTER JOIN posthog_person ON posthog_person.id = posthog_persondistinctid.person_id
        LIMIT {limit}
    """
    with connection.cursor() as cursor:
        cursor.execute(query, params)
        rows = cursor.fetchall()
    return dict(rows)
def _filter_request(self, request: request.Request, queryset: EventManager) -> QuerySet:
    """Apply the request's GET parameters as filters on an event queryset.

    Recognised keys: ``event``, ``after``, ``before``, ``person_id``,
    ``distinct_id``, ``action_id`` and ``properties`` (JSON-encoded). Unknown
    keys are ignored.

    Args:
        request: Incoming request whose ``GET`` mapping is iterated.
        queryset: Event queryset/manager to narrow down.

    Returns:
        The filtered queryset.
    """
    params = request.GET
    for key, value in params.items():
        if key == "event":
            queryset = queryset.filter(event=params["event"])
        elif key == "after":
            queryset = queryset.filter(timestamp__gt=params["after"])
        elif key == "before":
            queryset = queryset.filter(timestamp__lt=params["before"])
        elif key == "person_id":
            # The result is not used, but `.get` raises when the person does
            # not exist for this team, so the lookup is kept.
            Person.objects.get(pk=params["person_id"], team_id=self.team_id)
            person_distinct_ids = PersonDistinctId.objects.filter(
                team_id=self.team_id, person_id=params["person_id"]
            ).values("distinct_id")
            queryset = queryset.filter(distinct_id__in=person_distinct_ids)
        elif key == "distinct_id":
            queryset = queryset.filter(distinct_id=params["distinct_id"])
        elif key == "action_id":
            action = Action.objects.get(pk=value)
            queryset = queryset.filter_by_action(action)  # type: ignore
        elif key == "properties":
            property_filter = Filter(data={"properties": json.loads(value)})
            queryset = queryset.filter(
                properties_to_Q(property_filter.properties, team_id=self.team_id)
            )
    return queryset
def _filter_request(self, request: request.Request, queryset: EventManager) -> QuerySet:
    """Apply the request's GET parameters as filters on an event queryset.

    Recognised keys: ``event``, ``after``, ``before``, ``person_id``,
    ``distinct_id``, ``action_id`` and ``properties`` (JSON-encoded). Unknown
    keys are ignored.

    Args:
        request: Incoming request whose ``GET`` mapping is iterated.
        queryset: Event queryset/manager to narrow down.

    Returns:
        The filtered queryset.

    Raises:
        ValidationError: If the ``properties`` value is not valid JSON.
    """
    params = request.GET
    for key, value in params.items():
        if key == "event":
            queryset = queryset.filter(event=params["event"])
        elif key == "after":
            queryset = queryset.filter(timestamp__gt=params["after"])
        elif key == "before":
            queryset = queryset.filter(timestamp__lt=params["before"])
        elif key == "person_id":
            person_distinct_ids = PersonDistinctId.objects.filter(
                team_id=self.team_id, person_id=params["person_id"]
            ).values("distinct_id")
            queryset = queryset.filter(distinct_id__in=person_distinct_ids)
        elif key == "distinct_id":
            queryset = queryset.filter(distinct_id=params["distinct_id"])
        elif key == "action_id":
            action = Action.objects.get(pk=value)
            queryset = queryset.filter_by_action(action)  # type: ignore
        elif key == "properties":
            try:
                parsed = json.loads(value)
            except json.decoder.JSONDecodeError:
                raise ValidationError("Properties are unparsable!")
            property_filter = Filter(data={"properties": parsed})
            queryset = queryset.filter(
                properties_to_Q(property_filter.properties, team_id=self.team_id)
            )
    return queryset
def query_db_by_action(self, action, order_by="-timestamp", start=None, end=None) -> models.QuerySet:
    """Return the events of ``action.team_id`` that match any step of ``action``.

    For each step an ``Exists`` subquery is built from the step's property,
    event, element, period and URL conditions; the subqueries are OR-ed.

    Args:
        action: Action whose ``steps`` define the match conditions.
        order_by: Field to order by; falsy disables ordering.
        start: Passed to ``filter_by_period``.
        end: Passed to ``filter_by_period``.

    Returns:
        A queryset of matching events, or ``self.none()`` when the action has
        no steps.
    """
    from posthog.queries.base import properties_to_Q

    steps = action.steps.all()
    if not steps:
        return self.none()

    matches_any_step = Q()
    for step in steps:
        step_filter = Filter(data={"properties": step.properties})
        step_events = Event.objects.add_person_id(team_id=action.team_id).filter(
            properties_to_Q(step_filter.properties, team_id=action.team_id),
            pk=OuterRef("id"),
            **self.filter_by_event(step),
            **self.filter_by_element(model_to_dict(step), team_id=action.team_id),
            **self.filter_by_period(start, end),
        )
        step_events = self.filter_by_url(step, step_events.only("id"))
        matches_any_step |= Q(Exists(step_events))

    matched = self.filter(team_id=action.team_id).filter(matches_any_step)
    return matched.order_by(order_by) if order_by else matched
def events_query(self, filter: Filter, team: Team) -> QuerySet:
    """Return the team's events matching ``filter.properties``, newest first.

    ``add_person_id(team.pk)`` is applied before the property conditions.
    """
    team_events = Event.objects.filter(team=team).add_person_id(team.pk)
    matching = team_events.filter(properties_to_Q(filter.properties, team_id=team.pk))
    return matching.order_by("-timestamp")
def _people_filter(self, extra_filter=None):
    """Build a Q object matching people who belong to any of ``self.groups``.

    A group with ``action_id`` matches people having a distinct id among the
    events of that action (optionally limited to the last ``days`` days and
    to ``extra_filter``). A group with ``properties`` matches on person
    properties. Groups with neither are skipped.

    Args:
        extra_filter: Optional dict of extra lookups applied to the action
            events.

    Returns:
        The OR of all group conditions (an empty ``Q`` if none apply).
    """
    from posthog.queries.base import properties_to_Q

    combined = Q()
    for group in self.groups:
        if group.get("action_id"):
            action = Action.objects.get(pk=group["action_id"], team_id=self.team_id)
            if group.get("days"):
                since = timezone.now() - relativedelta(days=int(group["days"]))
                recency = {"timestamp__gt": since}
            else:
                recency = {}
            distinct_ids = (
                Event.objects.filter_by_action(action)
                .filter(team_id=self.team_id, **recency, **(extra_filter or {}))
                .order_by("distinct_id")
                .distinct("distinct_id")
                .values("distinct_id")
            )
            combined |= Q(persondistinctid__distinct_id__in=distinct_ids)
        elif group.get("properties"):
            group_filter = Filter(data=group)
            combined |= Q(
                properties_to_Q(group_filter.properties, team_id=self.team_id, is_person_query=True)
            )
    return combined
def events_query(self, filter: Filter, team: Team) -> QuerySet:
    """Return the team's events matching the filter, newest first.

    ``$feature_flag_called`` events are excluded, and the property conditions
    honour ``filter.filter_test_accounts``.
    """
    team_events = Event.objects.filter(team=team).add_person_id(team.pk)
    without_flag_calls = team_events.filter(~Q(event="$feature_flag_called"))
    property_conditions = properties_to_Q(
        filter.properties,
        team_id=team.pk,
        filter_test_accounts=filter.filter_test_accounts,
    )
    return without_flag_calls.filter(property_conditions).order_by("-timestamp")
def test_person_cohort_properties(self):
    """A cohort property filter matches a person that was added to the cohort."""
    distinct_id = "person1"
    person = Person.objects.create(
        team=self.team, distinct_ids=[distinct_id], properties={"$some_prop": 1}
    )
    cohort = Cohort.objects.create(
        team=self.team, groups=[{"properties": {"$some_prop": 1}}], name="cohort1"
    )
    cohort.people.add(person)

    filter = Filter(data={"properties": [{"key": "id", "value": cohort.pk, "type": "cohort"}]})
    person_query = Person.objects.filter(
        team_id=self.team.pk, persondistinctid__distinct_id=distinct_id
    ).filter(properties_to_Q(filter.properties, team_id=self.team.pk, is_person_query=True))

    self.assertTrue(person_query.exists())
def test_selectors(self):
    """An element selector filter matches only the event carrying those elements."""
    # One event with an `a` and a `div` element, and one event without any.
    link = Element.objects.create(tag_name="a")
    container = Element.objects.create(tag_name="div")
    Event.objects.create(team=self.team, event="$autocapture", elements=[link, container])
    Event.objects.create(team=self.team, event="$autocapture")

    filter = Filter(data={"properties": [{"key": "selector", "value": "div > a", "type": "element"}]})
    matching = Event.objects.filter(properties_to_Q(filter.properties, team_id=self.team.pk))

    self.assertEqual(matching.count(), 1)
def _filter_events(filter: Filter, team: Team, person_query: Optional[bool] = False, order_by: Optional[str] = None):
    """Return ``.values()`` of the events matching ``filter.properties``.

    Args:
        filter: Supplies the property conditions.
        team: Team passed to ``properties_to_Q`` (and to ``add_person_id``).
        person_query: When truthy, ``add_person_id(team.pk)`` is applied first.
        order_by: Optional field name to order by.
    """
    source = Event.objects.add_person_id(team.pk) if person_query else Event.objects
    matching = source.filter(properties_to_Q(filter.properties, team_id=team.pk))
    if order_by:
        matching = matching.order_by(order_by)
    return matching.values()
def properties_filter(self, queryset, attr, value, *args, **kwargs):
    """Filter ``queryset`` by the person-type properties encoded in ``value``.

    Args:
        queryset: Queryset to filter.
        attr: Unused.
        value: JSON-encoded list of property filters; only entries whose
            ``type`` is ``"person"`` are applied.

    Returns:
        The filtered queryset.
    """
    from posthog.queries.base import properties_to_Q

    parsed_filter = Filter(data={"properties": json.loads(value)})
    person_properties = [
        prop for prop in parsed_filter.property_groups.flat if prop.type == "person"
    ]
    conditions = properties_to_Q(person_properties, team_id=self.team_id, is_direct_query=True)
    return queryset.filter(conditions)
def query_groups(self) -> List[List[bool]]:
    """Evaluate every feature-flag group for the current distinct id.

    Each group becomes a boolean annotation named ``group_<index>`` on the
    ``Person`` query.

    Two things are aligned with the sibling implementation of this method:
    a group without properties is annotated as constant ``true`` instead of
    wrapping an empty ``Q`` (which presumably has no SQL to render; confirm
    against the Django version in use), and the result is materialised so
    the return value matches the declared ``List`` type.

    Returns:
        A list with one row per matched person, each row holding the
        annotation values in group order.
    """
    # Local import so the block does not depend on a module-level import.
    from django.db.models.expressions import RawSQL

    query: QuerySet = Person.objects.filter(
        team_id=self.feature_flag.team_id,
        persondistinctid__distinct_id=self.distinct_id,
        persondistinctid__team_id=self.feature_flag.team_id,
    )
    fields = []
    for index, group in enumerate(self.feature_flag.groups):
        key = f"group_{index}"
        if len(group.get("properties", {})) > 0:
            expr = properties_to_Q(
                Filter(data=group).properties, team_id=self.feature_flag.team_id, is_person_query=True
            )
        else:
            expr = RawSQL("true", [])
        query = query.annotate(**{key: ExpressionWrapper(expr, output_field=BooleanField())})
        fields.append(key)
    return list(query.values_list(*fields))
def query_groups(self) -> List[List[bool]]:
    """Evaluate every feature-flag group for the current distinct id.

    Each group becomes a boolean annotation named ``group_<index>`` on the
    ``Person`` query; a group without properties is annotated as constant
    ``true``.

    Returns:
        A list with one row per matched person, each row holding the
        annotation values in group order.
    """
    flag = self.feature_flag
    query: QuerySet = Person.objects.filter(
        team_id=flag.team_id,
        persondistinctid__distinct_id=self.distinct_id,
        persondistinctid__team_id=flag.team_id,
    )

    annotation_names = []
    for index, group in enumerate(flag.groups):
        name = f"group_{index}"
        if len(group.get("properties", {})) > 0:
            expr: Any = properties_to_Q(
                Filter(data=group).properties, team_id=flag.team_id, is_person_query=True
            )
        else:
            expr = RawSQL("true", [])
        wrapped = ExpressionWrapper(expr, output_field=BooleanField())
        query = query.annotate(**{name: wrapped})
        annotation_names.append(name)

    return list(query.values_list(*annotation_names))
def calculate_paths(self, filter: PathFilter, team: Team):
    """Compute the 20 most common source-to-target transitions between events.

    The ORM query selects the team's events (with the previous timestamp per
    person attached) and is then wrapped in successive raw SQL layers that:

    1. flag an event as starting a new session when it has no previous event
       or the gap is at least 30 minutes (``60 * 30`` seconds),
    2. number the sessions with a running sum,
    3. optionally apply ``filter.start_point``,
    4. number events inside each person/session and keep the first four,
    5. pair every event with its predecessor and count the pairs.

    The database cursor is opened in a ``with`` block so it is closed even if
    the query raises.

    Args:
        filter: Path filter providing dates, target event, property filters,
            ``prop_type`` (interpolated into the SQL as the path expression),
            comparator and start point.
        team: Team whose events are analysed.

    Returns:
        A list of dicts with ``source``, ``target``, ``target_id``,
        ``source_id`` and ``value`` keys, sorted by ``value`` descending.
    """
    date_query = request_to_date_query({"date_from": filter._date_from, "date_to": filter._date_to}, exact=False)
    prop_type = filter.prop_type
    event, event_filter = filter.target_event
    start_comparator = filter.comparator

    sessions = (
        Event.objects.add_person_id(team.pk)
        .filter(team=team, **(event_filter), **date_query)
        .filter(
            ~Q(event__in=["$autocapture", "$pageview", "$identify", "$pageleave", "$screen"])
            if event is None
            else Q()
        )
        .filter(
            properties_to_Q(filter.properties, team_id=team.pk, filter_test_accounts=filter.filter_test_accounts)
            if filter and (filter.properties or filter.filter_test_accounts)
            else Q()
        )
        .annotate(
            previous_timestamp=Window(
                expression=Lag("timestamp", default=None),
                partition_by=F("person_id"),
                order_by=F("timestamp").asc(),
            )
        )
    )
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

    if event == "$autocapture":
        sessions_sql = self._add_elements(query_string=sessions_sql)

    events_notated = (
        " SELECT *, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)"
        " OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session"
        " FROM ({}) AS inner_sessions "
    ).format(sessions_sql)

    sessionified = (
        " SELECT events_notated.*, SUM(new_session) OVER ("
        " ORDER BY person_id"
        " ,timestamp"
        " ) AS session"
        " FROM ({}) as events_notated "
    ).format(events_notated)

    if filter and filter.start_point:
        sessionified = self._apply_start_point(
            start_comparator=start_comparator,
            query_string=sessionified,
            start_point=filter.start_point,
        )

    final = (
        " SELECT {} as path_type, id, sessionified.session"
        " ,ROW_NUMBER() OVER ("
        " PARTITION BY person_id"
        " ,session ORDER BY timestamp"
        " ) AS event_number"
        " FROM ({}) as sessionified "
    ).format(prop_type, sessionified)

    counts = (
        " SELECT event_number || '_' || path_type as target_event, id as target_id,"
        " LAG(event_number || '_' || path_type, 1) OVER ("
        " PARTITION BY session"
        " ) AS source_event , LAG(id, 1) OVER ("
        " PARTITION BY session"
        " ) AS source_id from "
        " ({}) as final"
        " where event_number <= 4 "
    ).format(final)

    query = (
        " SELECT source_event, target_event, MAX(target_id), MAX(source_id), count(*) from ({}) as counts"
        " where source_event is not null and target_event is not null"
        " group by source_event, target_event order by count desc limit 20 "
    ).format(counts)

    # Only the innermost ORM query carries parameters; they are passed to the
    # cursor unchanged.
    with connection.cursor() as cursor:
        cursor.execute(query, sessions_sql_params)
        rows = cursor.fetchall()

    resp = [
        {"source": row[0], "target": row[1], "target_id": row[2], "source_id": row[3], "value": row[4]}
        for row in rows
    ]
    return sorted(resp, key=lambda x: x["value"], reverse=True)
def _determine_query_params(self, filter: RetentionFilter, team: Team):
    """Build the SQL fragments and parameters for the retention query.

    Two ORM queries are compiled to SQL: the reference ("first date") events
    matching the target entity, and the union of returning events with those
    reference events. For first-time retention the reference date is the
    minimum truncated timestamp per person/event/action, restricted by
    ``filter.custom_date_filter_Q``; otherwise it is the truncated timestamp
    of each event within ``filter.date_filter_Q``.

    Args:
        filter: Retention filter supplying period, entities, dates and
            properties.
        team: Team whose events are queried.

    Returns:
        A ``(format_fields, params)`` tuple. ``format_fields`` is a dict with
        the keys ``event_query``, ``reference_event_query``, ``fields``,
        ``return_condition`` and ``target_condition``. ``params`` concatenates
        the start-date params, both queries' params and the entity ids, in
        that order.
    """
    period = filter.period
    is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME

    entity_condition, entity_condition_strigified = self.get_entity_condition(
        filter.target_entity, "first_event_date"
    )
    returning_condition, returning_condition_stringified = self.get_entity_condition(
        filter.returning_entity, "events"
    )

    events: QuerySet = (
        Event.objects.filter(team_id=team.pk).add_person_id(team.pk).annotate(event_date=F("timestamp"))
    )
    trunc, fields = self._get_trunc_func("timestamp", period)
    property_conditions = properties_to_Q(filter.properties, team_id=team.pk)

    if is_first_time_retention:
        filtered_events = events.filter(property_conditions)
        first_date = (
            filtered_events.filter(entity_condition)
            .values("person_id", "event", "action")
            .annotate(first_date=Min(trunc))
            .filter(filter.custom_date_filter_Q("first_date"))
            .distinct()
        )
        # The date range is applied to the returning events only.
        returning_events = filtered_events.filter(filter.date_filter_Q)
    else:
        filtered_events = events.filter(filter.date_filter_Q).filter(property_conditions)
        first_date = (
            filtered_events.filter(entity_condition)
            .annotate(first_date=trunc)
            .values("first_date", "person_id", "event", "action")
            .distinct()
        )
        returning_events = filtered_events

    final_query = (
        returning_events.filter(returning_condition)
        .values_list("person_id", "event_date", "event", "action")
        .union(first_date.values_list("first_date", "person_id", "event", "action"))
    )

    # The start date is supplied twice for the Month and Hour periods.
    if period in ("Month", "Hour"):
        start_params = (filter.date_from, filter.date_from)
    else:
        start_params = (filter.date_from,)

    event_query, events_query_params = final_query.query.sql_with_params()
    reference_event_query, first_date_params = first_date.query.sql_with_params()
    event_params = (filter.target_entity.id, filter.returning_entity.id, filter.target_entity.id)

    format_fields = {
        "event_query": event_query,
        "reference_event_query": reference_event_query,
        "fields": fields,
        "return_condition": returning_condition_stringified,
        "target_condition": entity_condition_strigified,
    }
    params = start_params + events_query_params + first_date_params + event_params
    return format_fields, params
def test_group_property_filters_direct(self):
    """A group property filter becomes a ``group_properties__<key>`` lookup in direct mode."""
    group_property = {"key": "some_prop", "value": 5, "type": "group", "group_type_index": 1}
    filter = Filter(data={"properties": [group_property]})

    query_filter = properties_to_Q(
        filter.property_groups.flat, team_id=self.team.pk, is_direct_query=True
    )

    expected = Q(group_properties__some_prop=5)
    self.assertEqual(query_filter, expected)
def _filter_persons(filter: Filter, team: Team):
    """Return the uuids (as strings) of the team's persons matching the filter.

    The flattened property groups are applied as a direct query.
    """
    # TODO: confirm what to do here?
    # Postgres only supports ANDing all properties :shrug:
    conditions = properties_to_Q(filter.property_groups.flat, team_id=team.pk, is_direct_query=True)
    matching = Person.objects.filter(conditions).filter(team_id=team.pk)
    return [str(person_uuid) for person_uuid in matching.values_list("uuid", flat=True)]
def test_group_property_filters_used(self):
    """A group property filter raises ``ValueError`` outside direct-query mode."""
    group_property = {"key": "some_prop", "value": 5, "type": "group", "group_type_index": 1}
    filter = Filter(data={"properties": [group_property]})

    with self.assertRaises(ValueError):
        properties_to_Q(filter.property_groups.flat, team_id=self.team.pk)