예제 #1
0
파일: retention.py 프로젝트: yianz/posthog
    def _retrieve_people(self, filter: RetentionFilter, team: Team):
        period = filter.period
        trunc, fields = self._get_trunc_func("timestamp", period)
        is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME
        entity_condition, _ = self.get_entity_condition(
            filter.target_entity, "events")
        returning_condition, _ = self.get_entity_condition(
            filter.returning_entity, "first_event_date")
        _entity_condition = returning_condition if filter.selected_interval > 0 else entity_condition

        events = Event.objects.filter(team_id=team.pk).add_person_id(team.pk)

        filtered_events = events.filter(
            filter.recurring_date_filter_Q()).filter(
                filter.properties_to_Q(team_id=team.pk))

        inner_events = (Event.objects.filter(team_id=team.pk).filter(
            filter.properties_to_Q(team_id=team.pk)).add_person_id(
                team.pk).filter(**{
                    "person_id": OuterRef("id")
                }).filter(entity_condition).values("person_id").annotate(
                    first_date=Min(trunc)).filter(
                        filter.reference_date_filter_Q("first_date")).distinct(
                        ) if is_first_time_retention else Event.objects.filter(
                            team_id=team.pk).filter(
                                filter.reference_date_filter_Q()).filter(
                                    filter.properties_to_Q(
                                        team_id=team.pk)).add_person_id(
                                            team.pk).filter(
                                                **{
                                                    "person_id": OuterRef("id")
                                                }).filter(entity_condition))

        filtered_events = (filtered_events.filter(_entity_condition).filter(
            Exists(
                Person.objects.filter(**{
                    "id": OuterRef("person_id"),
                }).filter(Exists(inner_events)).only("id"))).values(
                    "person_id").distinct()).all()

        people = Person.objects.filter(
            team=team,
            id__in=[
                p["person_id"]
                for p in filtered_events[filter.offset:filter.offset + 100]
            ],
        )

        people = people.prefetch_related(
            Prefetch("persondistinctid_set", to_attr="distinct_ids_cache"))

        from posthog.api.person import PersonSerializer

        return PersonSerializer(people, many=True).data
예제 #2
0
    def _determine_query_params(self, filter: RetentionFilter, team: Team):

        period = filter.period
        is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME

        events: QuerySet = QuerySet()
        entity_condition, entity_condition_strigified = self.get_entity_condition(
            filter.target_entity, "first_event_date")
        returning_condition, returning_condition_stringified = self.get_entity_condition(
            filter.returning_entity, "events")
        events = Event.objects.filter(team_id=team.pk).add_person_id(
            team.pk).annotate(event_date=F("timestamp"))

        trunc, fields = self._get_trunc_func("timestamp", period)

        if is_first_time_retention:
            filtered_events = events.filter(
                filter.properties_to_Q(team_id=team.pk))
            first_date = (filtered_events.filter(entity_condition).values(
                "person_id", "event",
                "action").annotate(first_date=Min(trunc)).filter(
                    filter.custom_date_filter_Q("first_date")).distinct())
            final_query = (filtered_events.filter(
                filter.date_filter_Q).filter(returning_condition).values_list(
                    "person_id", "event_date", "event", "action").union(
                        first_date.values_list("first_date", "person_id",
                                               "event", "action")))
        else:
            filtered_events = events.filter(filter.date_filter_Q).filter(
                filter.properties_to_Q(team_id=team.pk))
            first_date = (filtered_events.filter(entity_condition).annotate(
                first_date=trunc).values("first_date", "person_id", "event",
                                         "action").distinct())

            final_query = (
                filtered_events.filter(returning_condition).values_list(
                    "person_id", "event_date", "event", "action").union(
                        first_date.values_list("first_date", "person_id",
                                               "event", "action")))

        start_params = ((filter.date_from, filter.date_from)
                        if period == "Month" or period == "Hour" else
                        (filter.date_from, ))

        event_query, events_query_params = final_query.query.sql_with_params()
        reference_event_query, first_date_params = first_date.query.sql_with_params(
        )

        event_params = (filter.target_entity.id, filter.returning_entity.id,
                        filter.target_entity.id)

        return (
            {
                "event_query": event_query,
                "reference_event_query": reference_event_query,
                "fields": fields,
                "return_condition": returning_condition_stringified,
                "target_condition": entity_condition_strigified,
            },
            start_params + events_query_params + first_date_params +
            event_params,
        )
예제 #3
0
    def _execute_sql(
        self,
        filter: RetentionFilter,
        team: Team,
    ) -> Dict[Tuple[int, int], Dict[str, Any]]:

        period = filter.period
        is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME

        events: QuerySet = QuerySet()
        entity_condition, entity_condition_strigified = self.get_entity_condition(
            filter.target_entity, "first_event_date")
        returning_condition, returning_condition_stringified = self.get_entity_condition(
            filter.returning_entity, "events")
        events = Event.objects.filter(team_id=team.pk).add_person_id(
            team.pk).annotate(event_date=F("timestamp"))

        trunc, fields = self._get_trunc_func("timestamp", period)

        if is_first_time_retention:
            filtered_events = events.filter(
                filter.properties_to_Q(team_id=team.pk))
            first_date = (filtered_events.filter(entity_condition).values(
                "person_id", "event",
                "action").annotate(first_date=Min(trunc)).filter(
                    filter.custom_date_filter_Q("first_date")).distinct())
            final_query = (filtered_events.filter(
                filter.date_filter_Q).filter(returning_condition).values_list(
                    "person_id", "event_date", "event", "action").union(
                        first_date.values_list("first_date", "person_id",
                                               "event", "action")))
        else:
            filtered_events = events.filter(filter.date_filter_Q).filter(
                filter.properties_to_Q(team_id=team.pk))
            first_date = (filtered_events.filter(entity_condition).annotate(
                first_date=trunc).values("first_date", "person_id", "event",
                                         "action").distinct())

            final_query = (
                filtered_events.filter(returning_condition).values_list(
                    "person_id", "event_date", "event", "action").union(
                        first_date.values_list("first_date", "person_id",
                                               "event", "action")))

        event_query, events_query_params = final_query.query.sql_with_params()
        reference_event_query, first_date_params = first_date.query.sql_with_params(
        )

        final_query = """
            SELECT
                {fields}
                COUNT(DISTINCT "events"."person_id"),
                array_agg(DISTINCT "events"."person_id") as people
            FROM ({event_query}) events
            LEFT JOIN ({reference_event_query}) first_event_date
              ON (events.person_id = first_event_date.person_id)
            WHERE event_date >= first_date
            AND {target_condition} AND {return_condition}
            OR ({target_condition} AND event_date = first_date)
            GROUP BY date, first_date
        """.format(
            event_query=event_query,
            reference_event_query=reference_event_query,
            fields=fields,
            return_condition=returning_condition_stringified,
            target_condition=entity_condition_strigified,
        )
        event_params = (filter.target_entity.id, filter.returning_entity.id,
                        filter.target_entity.id)

        start_params = ((filter.date_from, filter.date_from)
                        if period == "Month" or period == "Hour" else
                        (filter.date_from, ))

        with connection.cursor() as cursor:
            cursor.execute(
                final_query,
                start_params + events_query_params + first_date_params +
                event_params,
            )
            data = namedtuplefetchall(cursor)

            scores: dict = {}
            for datum in data:
                key = round(datum.first_date, 1)
                if not scores.get(key, None):
                    scores.update({key: {}})
                for person in datum.people:
                    if not scores[key].get(person, None):
                        scores[key].update({person: 1})
                    else:
                        scores[key][person] += 1

        by_dates = {}
        for row in data:
            people = sorted(
                row.people,
                key=lambda p: scores[round(row.first_date, 1)][int(p)],
                reverse=True,
            )

            random_key = "".join(
                random.SystemRandom().choice(string.ascii_uppercase +
                                             string.digits) for _ in range(10))
            cache_key = generate_cache_key("{}{}{}".format(
                random_key, str(round(row.first_date, 0)), str(team.pk)))
            cache.set(
                cache_key,
                people,
                600,
            )
            by_dates.update({
                (int(row.first_date), int(row.date)): {
                    "count": row.count,
                    "people": people[0:100],
                    "offset": 100,
                    "next": cache_key if len(people) > 100 else None,
                }
            })

        return by_dates