Example No. 1
class AsyncMigrationOperationSQL(AsyncMigrationOperation):
    def __init__(
        self,
        *,
        sql: str,
        rollback: Optional[str],
        database: AnalyticsDBMS = AnalyticsDBMS.CLICKHOUSE,
        timeout_seconds: int = ASYNC_MIGRATIONS_DEFAULT_TIMEOUT_SECONDS,
    ):
        self.sql = sql
        self.rollback = rollback
        self.database = database
        self.timeout_seconds = timeout_seconds

    def fn(self, query_id: str):
        self._execute_op(query_id, self.sql)

    def rollback_fn(self, query_id: str):
        if self.rollback is not None:
            self._execute_op(query_id, self.rollback)

    def _execute_op(self, query_id: str, sql: str):
        from posthog.async_migrations.utils import execute_op_clickhouse, execute_op_postgres

        if self.database == AnalyticsDBMS.CLICKHOUSE:
            execute_op_clickhouse(sql, query_id, self.timeout_seconds)
        else:
            execute_op_postgres(sql, query_id)

    __repr__ = sane_repr("sql",
                         "rollback",
                         "database",
                         "timeout_seconds",
                         include_id=False)
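
A minimal usage sketch (not from the original source; the SQL strings and query id are illustrative only):

example_op = AsyncMigrationOperationSQL(
    sql="ALTER TABLE sharded_events ADD COLUMN IF NOT EXISTS example_col String",
    rollback="ALTER TABLE sharded_events DROP COLUMN IF EXISTS example_col",
)
example_op.fn("example-query-id")           # runs the forward SQL on ClickHouse, the default database
example_op.rollback_fn("example-query-id")  # runs the rollback SQL, since one was provided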
Example No. 2
class DashboardPrivilege(UUIDModel):
    dashboard: models.ForeignKey = models.ForeignKey(
        "posthog.Dashboard",
        on_delete=models.CASCADE,
        related_name="privileges",
        related_query_name="privilege",
    )
    user: models.ForeignKey = models.ForeignKey(
        "posthog.User",
        on_delete=models.CASCADE,
        related_name="explicit_dashboard_privileges",
        related_query_name="explicit_dashboard_privilege",
    )
    level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField(
        choices=Dashboard.RestrictionLevel.choices)
    added_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)

    class Meta:
        constraints = [
            models.UniqueConstraint(
                fields=["dashboard", "user"],
                name="unique_explicit_dashboard_privilege"),
        ]

    __repr__ = sane_repr("dashboard", "user", "level")
Example No. 3
class EventProperty(models.Model):
    team: models.ForeignKey = models.ForeignKey(Team, on_delete=models.CASCADE)
    event: models.CharField = models.CharField(max_length=400, null=False)
    property: models.CharField = models.CharField(max_length=400, null=False)

    class Meta:
        constraints = [
            models.UniqueConstraint(
                fields=["team", "event", "property"],
                name="posthog_event_property_unique_team_event_property"),
        ]
        indexes = [
            models.Index(fields=["team", "event"]),
            models.Index(fields=["team", "property"]),
        ]

    __repr__ = sane_repr("event", "property", "team_id")
Example No. 4
class BaseFilter(BaseParamMixin):
    def __init__(self,
                 data: Optional[Dict[str, Any]] = None,
                 request: Optional[request.Request] = None,
                 **kwargs) -> None:
        if request:
            data = {
                **request.GET.dict(),
                **request.data,
                **(data if data else {}),
            }
        elif not data:
            raise ValueError(
                "You need to define either a data dict or a request")
        self._data = data
        self.kwargs = kwargs

        if "team" in kwargs and hasattr(self, "simplify") and not getattr(
                self, "is_simplified", False):
            simplified_filter = getattr(self, "simplify")(kwargs["team"])
            self._data = simplified_filter._data

    def to_dict(self) -> Dict[str, Any]:
        ret = {}

        for _, func in inspect.getmembers(self, inspect.ismethod):
            if hasattr(func, "include_dict"):  # provided by @include_dict decorator
                ret.update(func())

        return ret

    def to_params(self) -> Dict[str, str]:
        return encode_get_request_params(data=self.to_dict())

    def toJSON(self):
        return json.dumps(self.to_dict(),
                          default=lambda o: o.__dict__,
                          sort_keys=True,
                          indent=4)

    def with_data(self, overrides: Dict[str, Any]):
        "Allow making copy of filter whilst preserving the class"
        return type(self)(data={**self._data, **overrides}, **self.kwargs)

    __repr__ = sane_repr("_data", "kwargs", include_id=False)
Example No. 5
class License(models.Model):
    objects: LicenseManager = LicenseManager()

    created_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    plan: models.CharField = models.CharField(max_length=200)
    valid_until: models.DateTimeField = models.DateTimeField()
    key: models.CharField = models.CharField(max_length=200)
    max_users: models.IntegerField = models.IntegerField(
        default=None, null=True)  # None = no restriction

    SCALE_PLAN = "scale"
    SCALE_FEATURES = [
        AvailableFeature.ZAPIER,
        AvailableFeature.ORGANIZATIONS_PROJECTS,
        AvailableFeature.GOOGLE_LOGIN,
        AvailableFeature.DASHBOARD_COLLABORATION,
        AvailableFeature.INGESTION_TAXONOMY,
        AvailableFeature.PATHS_ADVANCED,
        AvailableFeature.CORRELATION_ANALYSIS,
        AvailableFeature.GROUP_ANALYTICS,
        AvailableFeature.MULTIVARIATE_FLAGS,
        AvailableFeature.EXPERIMENTATION,
        AvailableFeature.TAGGING,
        AvailableFeature.BEHAVIORAL_COHORT_FILTERING,
        AvailableFeature.WHITE_LABELLING,
        AvailableFeature.SUBSCRIPTIONS,
    ]

    ENTERPRISE_PLAN = "enterprise"
    ENTERPRISE_FEATURES = SCALE_FEATURES + [
        AvailableFeature.DASHBOARD_PERMISSIONING,
        AvailableFeature.PROJECT_BASED_PERMISSIONING,
        AvailableFeature.SAML,
        AvailableFeature.SSO_ENFORCEMENT,
    ]
    PLANS = {SCALE_PLAN: SCALE_FEATURES, ENTERPRISE_PLAN: ENTERPRISE_FEATURES}
    # The higher the plan, the higher its sorting value - sync with front-end licenseLogic
    PLAN_TO_SORTING_VALUE = {SCALE_PLAN: 10, ENTERPRISE_PLAN: 20}

    @property
    def available_features(self) -> List[AvailableFeature]:
        return self.PLANS.get(self.plan, [])

    __repr__ = sane_repr("key", "plan", "valid_until")
Example No. 6
class ExplicitTeamMembership(UUIDModel):
    class Level(models.IntegerChoices):
        """Keep in sync with OrganizationMembership.Level (only difference being organizations having an Owner)."""

        MEMBER = 1, "member"
        ADMIN = 8, "administrator"

    team: models.ForeignKey = models.ForeignKey(
        "posthog.Team",
        on_delete=models.CASCADE,
        related_name="explicit_memberships",
        related_query_name="explicit_membership",
    )
    parent_membership: models.ForeignKey = models.ForeignKey(
        "posthog.OrganizationMembership",
        on_delete=models.CASCADE,
        related_name="explicit_team_memberships",
        related_query_name="explicit_team_membership",
    )
    level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField(
        default=Level.MEMBER, choices=Level.choices)
    joined_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)

    class Meta:
        constraints = [
            models.UniqueConstraint(fields=["team", "parent_membership"],
                                    name="unique_explicit_team_membership"),
        ]

    def __str__(self):
        return str(self.Level(self.level))

    @property
    def effective_level(self) -> "OrganizationMembership.Level":
        """If organization level is higher than project level, then that takes precedence over explicit project level.
        """
        return max(self.level, self.parent_membership.level)

    __repr__ = sane_repr("team", "parent_membership", "level")
Example No. 7
class Entity(PropertyMixin):
    """
    Entities represent either Action or Event objects, nested in Filter objects.
    This object isn't a table in the database. It gets stored against the specific models itself as JSON.
    This class just allows for stronger typing of this object.
    """

    id: Union[int, str]
    type: Literal["events", "actions"]
    order: Optional[int]
    name: Optional[str]
    custom_name: Optional[str]
    math: Optional[MATH_TYPE]
    math_property: Optional[str]
    math_group_type_index: Optional[GroupTypeIndex]
    # Index is not set at all by default (meaning: access = AttributeError) - it's populated in EntitiesMixin.entities
    # Used for identifying entities within a single query during query building,
    # which generally uses Entity objects processed by EntitiesMixin
    # The clean room way to do this would be passing the index _alongside_ the object, but OOP abuse is much less work
    index: int

    def __init__(self, data: Dict[str, Any]) -> None:
        self.id = data["id"]
        if not data.get("type") or data["type"] not in [
                TREND_FILTER_TYPE_ACTIONS,
                TREND_FILTER_TYPE_EVENTS,
        ]:
            raise TypeError(
                "Type needs to be either TREND_FILTER_TYPE_ACTIONS or TREND_FILTER_TYPE_EVENTS"
            )
        self.type = data["type"]
        order_provided = data.get("order")
        if order_provided is not None:
            order_provided = int(order_provided)
        self.order = order_provided
        self.name = data.get("name")
        custom_name = data.get("custom_name")
        if custom_name is not None:
            custom_name = str(custom_name).strip() or None
        self.custom_name = custom_name
        self.math = data.get("math")
        self.math_property = data.get("math_property")
        self.math_group_type_index = validate_group_type_index(
            "math_group_type_index", data.get("math_group_type_index"))

        self._action: Optional[Action] = None
        self._data = data  # push data to instance object so mixins are handled properly
        if self.type == TREND_FILTER_TYPE_EVENTS and not self.name:
            # It won't be an int if it's an event, but mypy...
            self.name = str(self.id)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "id": self.id,
            "type": self.type,
            "order": self.order,
            "name": self.name,
            "custom_name": self.custom_name,
            "math": self.math,
            "math_property": self.math_property,
            "math_group_type_index": self.math_group_type_index,
            "properties": self.property_groups.to_dict(),
        }

    def equals(self, other) -> bool:
        """ Checks if two entities are semantically equal."""
        # Not using __eq__ since that affects hashability

        if self.id != other.id:
            return False

        if self.type != other.type:
            return False

        # TODO: Check operators as well, not just the properties.
        # Effectively check within each property group, that they're the same.
        self_properties = sorted(
            str(prop) for prop in self.property_groups.flat)
        other_properties = sorted(
            str(prop) for prop in other.property_groups.flat)
        if self_properties != other_properties:
            return False

        return True

    def is_superset(self, other) -> bool:
        """ Checks if this entity is a superset version of other. The ids match and the properties of (this) is a subset of the properties of (other)"""

        self_properties = Counter(
            [str(prop) for prop in self.property_groups.flat])
        other_properties = Counter(
            [str(prop) for prop in other.property_groups.flat])

        return self.id == other.id and len(self_properties -
                                           other_properties) == 0

    def get_action(self) -> Action:
        if self.type != TREND_FILTER_TYPE_ACTIONS:
            raise ValueError(
                f"Action can only be fetched for entities of type {TREND_FILTER_TYPE_ACTIONS}, not {self.type}!"
            )

        if self._action and not settings.TEST:
            return self._action

        try:
            self._action = Action.objects.get(id=self.id)
            return self._action
        except Action.DoesNotExist:
            raise ValidationError(f"Action ID {self.id} does not exist!")

    __repr__ = sane_repr("id", "type", "order", "name", "custom_name", "math",
                         "math_property", "properties")
Example No. 8
class Organization(UUIDModel):
    class Meta:
        constraints = [
            models.UniqueConstraint(
                fields=["for_internal_metrics"],
                condition=Q(for_internal_metrics=True),
                name="single_for_internal_metrics",
            ),
        ]

    class PluginsAccessLevel(models.IntegerChoices):
        # None means the organization can't use plugins at all. They're hidden. Cloud default.
        NONE = 0, "none"
        # Config means the organization can only enable/disable/configure globally managed plugins.
        # This prevents config orgs from running untrusted code, which the next levels can do.
        CONFIG = 3, "config"
        # Install means the organization has config capabilities + can install own editor/GitHub/GitLab/npm plugins.
        # The plugin repository is off limits, as repository installations are managed by root orgs to avoid confusion.
        INSTALL = 6, "install"
        # Root means the organization has unrestricted plugins access on the instance. Self-hosted default.
        # This includes installing plugins from the repository and managing plugin installations for all other orgs.
        ROOT = 9, "root"

    members: models.ManyToManyField = models.ManyToManyField(
        "posthog.User",
        through="posthog.OrganizationMembership",
        related_name="organizations",
        related_query_name="organization",
    )
    name: models.CharField = models.CharField(max_length=64)
    slug: LowercaseSlugField = LowercaseSlugField(unique=True,
                                                  max_length=MAX_SLUG_LENGTH)
    created_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)
    domain_whitelist: ArrayField = ArrayField(
        models.CharField(max_length=256, blank=False),
        blank=True,
        default=list
    )  # Used to allow self-serve account creation based on social login (#5111)
    plugins_access_level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField(
        default=PluginsAccessLevel.CONFIG
        if settings.MULTI_TENANCY else PluginsAccessLevel.ROOT,
        choices=PluginsAccessLevel.choices,
    )
    available_features = ArrayField(models.CharField(max_length=64,
                                                     blank=False),
                                    blank=True,
                                    default=list)
    for_internal_metrics: models.BooleanField = models.BooleanField(
        default=False)
    is_member_join_email_enabled: models.BooleanField = models.BooleanField(
        default=True)

    # DEPRECATED attributes (should be removed on next major version)
    setup_section_2_completed: models.BooleanField = models.BooleanField(
        default=True)
    personalization: models.JSONField = models.JSONField(default=dict,
                                                         null=False,
                                                         blank=True)

    objects: OrganizationManager = OrganizationManager()

    def __str__(self):
        return self.name

    __repr__ = sane_repr("name")

    @property
    def _billing_plan_details(self) -> Tuple[Optional[str], Optional[str]]:
        """
        Obtains details on the billing plan for the organization.
        Returns a tuple with (billing_plan_key, billing_realm)
        """

        # If on Cloud, grab the organization's price
        if hasattr(self, "billing"):
            if self.billing is None:  # type: ignore
                return (None, None)
            return (self.billing.get_plan_key(), "cloud")  # type: ignore

        # Otherwise, try to find a valid license on this instance
        if License is not None:
            license = License.objects.first_valid()
            if license:
                return (license.plan, "ee")
        return (None, None)

    @property
    def billing_plan(self) -> Optional[str]:
        return self._billing_plan_details[0]

    def update_available_features(self) -> List[Union[AvailableFeature, str]]:
        """Updates field `available_features`. Does not `save()`."""
        plan, realm = self._billing_plan_details
        if not plan:
            self.available_features = []
        elif realm == "ee":
            self.available_features = License.PLANS.get(plan, [])
        else:
            self.available_features = self.billing.available_features  # type: ignore
        return self.available_features

    def is_feature_available(self, feature: Union[AvailableFeature,
                                                  str]) -> bool:
        return feature in self.available_features

    @property
    def active_invites(self) -> QuerySet:
        return self.invites.filter(
            created_at__gte=timezone.now() -
            timezone.timedelta(days=INVITE_DAYS_VALIDITY))

    def get_analytics_metadata(self):
        return {
            "member_count": self.members.count(),
            "project_count": self.teams.count(),
            "person_count": sum(team.person_set.count() for team in self.teams.all()),
            "name": self.name,
        }
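
A usage sketch of the feature pipeline (org is a hypothetical saved Organization on a self-hosted instance holding a valid enterprise license, so _billing_plan_details returns ("enterprise", "ee")):

org.update_available_features()                  # refreshes the field from the license
org.save(update_fields=["available_features"])   # update_available_features() deliberately doesn't save()
org.is_feature_available(AvailableFeature.SAML)  # True on the enterprise plan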
Example No. 9
class OrganizationInvite(UUIDModel):
    organization: models.ForeignKey = models.ForeignKey(
        "posthog.Organization",
        on_delete=models.CASCADE,
        related_name="invites",
        related_query_name="invite",
    )
    target_email: models.EmailField = models.EmailField(null=True,
                                                        db_index=True)
    first_name: models.CharField = models.CharField(max_length=30,
                                                    blank=True,
                                                    default="")
    created_by: models.ForeignKey = models.ForeignKey(
        "posthog.User",
        on_delete=models.SET_NULL,
        related_name="organization_invites",
        related_query_name="organization_invite",
        null=True,
    )
    emailing_attempt_made: models.BooleanField = models.BooleanField(
        default=False)
    created_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)

    def validate(self,
                 *,
                 user: Optional["User"] = None,
                 email: Optional[str] = None) -> None:
        _email = email or getattr(user, "email", None)

        if _email and _email != self.target_email:
            raise exceptions.ValidationError(
                f"This invite is intended for another email address: {mask_email_address(self.target_email)}"
                f". You tried to sign up with {_email}.",
                code="invalid_recipient",
            )

        if self.is_expired():
            raise exceptions.ValidationError(
                "This invite has expired. Please ask your admin for a new one.",
                code="expired",
            )

        if OrganizationMembership.objects.filter(
                organization=self.organization, user=user).exists():
            raise exceptions.ValidationError(
                "You already are a member of this organization.",
                code="user_already_member",
            )

        if OrganizationMembership.objects.filter(
                organization=self.organization,
                user__email=self.target_email,
        ).exists():
            raise exceptions.ValidationError(
                "Another user with this email address already belongs to this organization.",
                code="existing_email_address",
            )

    def use(self, user: "******", *, prevalidated: bool = False) -> None:
        if not prevalidated:
            self.validate(user=user)
        user.join(organization=self.organization)
        if is_email_available(
                with_absolute_urls=True
        ) and self.organization.is_member_join_email_enabled:
            from posthog.tasks.email import send_member_join

            send_member_join.apply_async(
                kwargs={
                    "invitee_uuid": user.uuid,
                    "organization_id": self.organization.id
                })
        OrganizationInvite.objects.filter(
            target_email__iexact=self.target_email).delete()

    def is_expired(self) -> bool:
        """Check if invite is older than INVITE_DAYS_VALIDITY days."""
        return self.created_at < timezone.now() - timezone.timedelta(
            INVITE_DAYS_VALIDITY)

    def __str__(self):
        return f"{settings.SITE_URL}/signup/{self.id}"

    __repr__ = sane_repr("organization", "target_email", "created_by")
Example No. 10
class OrganizationMembership(UUIDModel):
    class Level(models.IntegerChoices):
        """Keep in sync with TeamMembership.Level (only difference being projects not having an Owner)."""

        MEMBER = 1, "member"
        ADMIN = 8, "administrator"
        OWNER = 15, "owner"

    organization: models.ForeignKey = models.ForeignKey(
        "posthog.Organization",
        on_delete=models.CASCADE,
        related_name="memberships",
        related_query_name="membership")
    user: models.ForeignKey = models.ForeignKey(
        "posthog.User",
        on_delete=models.CASCADE,
        related_name="organization_memberships",
        related_query_name="organization_membership",
    )
    level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField(
        default=Level.MEMBER, choices=Level.choices)
    joined_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)

    class Meta:
        constraints = [
            models.UniqueConstraint(fields=["organization_id", "user_id"],
                                    name="unique_organization_membership"),
            models.UniqueConstraint(fields=["organization_id"],
                                    condition=models.Q(level=15),
                                    name="only_one_owner_per_organization"),
        ]

    def __str__(self):
        return str(self.Level(self.level))

    def validate_update(self,
                        membership_being_updated: "OrganizationMembership",
                        new_level: Optional[Level] = None) -> None:
        if new_level is not None:
            if membership_being_updated.id == self.id:
                raise exceptions.PermissionDenied(
                    "You can't change your own access level.")
            if new_level == OrganizationMembership.Level.OWNER:
                if self.level != OrganizationMembership.Level.OWNER:
                    raise exceptions.PermissionDenied(
                        "You can only pass on organization ownership if you're its owner."
                    )
                self.level = OrganizationMembership.Level.ADMIN
                self.save()
            elif new_level > self.level:
                raise exceptions.PermissionDenied(
                    "You can only change access level of others to lower or equal to your current one."
                )
        if membership_being_updated.id != self.id:
            if membership_being_updated.organization_id != self.organization_id:
                raise exceptions.PermissionDenied(
                    "You both need to belong to the same organization.")
            if self.level < OrganizationMembership.Level.ADMIN:
                raise exceptions.PermissionDenied(
                    "You can only edit others if you are an admin.")
            if membership_being_updated.level > self.level:
                raise exceptions.PermissionDenied(
                    "You can only edit others with level lower or equal to you."
                )

    __repr__ = sane_repr("organization", "user", "level")
Example No. 11
class Cohort(models.Model):
    name: models.CharField = models.CharField(max_length=400, null=True, blank=True)
    team: models.ForeignKey = models.ForeignKey("Team", on_delete=models.CASCADE)
    deleted: models.BooleanField = models.BooleanField(default=False)
    groups: models.JSONField = models.JSONField(default=list)
    people: models.ManyToManyField = models.ManyToManyField("Person", through="CohortPeople")

    created_by: models.ForeignKey = models.ForeignKey("User", on_delete=models.SET_NULL, blank=True, null=True)
    created_at: models.DateTimeField = models.DateTimeField(default=timezone.now, blank=True, null=True)
    is_calculating: models.BooleanField = models.BooleanField(default=False)
    last_calculation: models.DateTimeField = models.DateTimeField(blank=True, null=True)
    errors_calculating: models.IntegerField = models.IntegerField(default=0)

    is_static: models.BooleanField = models.BooleanField(default=False)

    objects = CohortManager()

    def get_analytics_metadata(self):
        action_groups_count: int = 0
        properties_groups_count: int = 0
        for group in self.groups:
            action_groups_count += 1 if group.get("action_id") else 0
            properties_groups_count += 1 if group.get("properties") else 0

        return {
            "name_length": len(self.name) if self.name else 0,
            "person_count_precalc": self.people.count(),
            "groups_count": len(self.groups),
            "action_groups_count": action_groups_count,
            "properties_groups_count": properties_groups_count,
            "deleted": self.deleted,
        }

    def calculate_people(self, use_clickhouse=is_clickhouse_enabled()):
        if self.is_static:
            return
        try:
            if not use_clickhouse:
                self.is_calculating = True
                self.save()

            persons_query = self._clickhouse_persons_query() if use_clickhouse else self._postgres_persons_query()
            try:
                sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
            except EmptyResultSet:
                query = DELETE_QUERY.format(cohort_id=self.pk)
                params = {}
            else:
                query = "{}{}".format(DELETE_QUERY, UPDATE_QUERY).format(
                    cohort_id=self.pk,
                    values_query=sql.replace('FROM "posthog_person"', ', {} FROM "posthog_person"'.format(self.pk), 1,),
                )

            cursor = connection.cursor()
            with transaction.atomic():
                cursor.execute(query, params)

                self.is_calculating = False
                self.last_calculation = timezone.now()
                self.errors_calculating = 0
                self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def calculate_people_ch(self):
        if is_clickhouse_enabled():
            from ee.clickhouse.models.cohort import recalculate_cohortpeople

            recalculate_cohortpeople(self)

    def insert_users_by_list(self, items: List[str]) -> None:
        """
        Items can be distinct_id or email
        """
        batchsize = 1000
        use_clickhouse = is_clickhouse_enabled()
        if use_clickhouse:
            from ee.clickhouse.models.cohort import insert_static_cohort
        try:
            cursor = connection.cursor()
            for i in range(0, len(items), batchsize):
                batch = items[i : i + batchsize]
                persons_query = (
                    Person.objects.filter(team_id=self.team_id)
                    .filter(Q(persondistinctid__team_id=self.team_id, persondistinctid__distinct_id__in=batch))
                    .exclude(cohort__id=self.id)
                )
                if use_clickhouse:
                    insert_static_cohort([p for p in persons_query.values_list("uuid", flat=True)], self.pk, self.team)
                sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
                query = UPDATE_QUERY.format(
                    cohort_id=self.pk,
                    values_query=sql.replace('FROM "posthog_person"', ', {} FROM "posthog_person"'.format(self.pk), 1,),
                )
                cursor.execute(query, params)
            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def insert_users_list_by_uuid(self, items: List[str]) -> None:
        batchsize = 1000
        try:
            cursor = connection.cursor()
            for i in range(0, len(items), batchsize):
                batch = items[i : i + batchsize]
                persons_query = (
                    Person.objects.filter(team_id=self.team_id).filter(uuid__in=batch).exclude(cohort__id=self.id)
                )
                sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
                query = UPDATE_QUERY.format(
                    cohort_id=self.pk,
                    values_query=sql.replace('FROM "posthog_person"', ', {} FROM "posthog_person"'.format(self.pk), 1,),
                )
                cursor.execute(query, params)

            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def __str__(self):
        return self.name or ""  # name is nullable; __str__ must not return None

    def _clickhouse_persons_query(self):
        from ee.clickhouse.models.cohort import get_person_ids_by_cohort_id

        uuids = get_person_ids_by_cohort_id(team=self.team, cohort_id=self.pk)
        return Person.objects.filter(uuid__in=uuids, team=self.team)

    def _postgres_persons_query(self):
        return Person.objects.filter(self._people_filter(), team=self.team)

    def _people_filter(self, extra_filter=None):
        from posthog.queries.base import properties_to_Q

        filters = Q()
        for group in self.groups:
            if group.get("action_id"):
                action = Action.objects.get(pk=group["action_id"], team_id=self.team_id)
                events = (
                    Event.objects.filter_by_action(action)
                    .filter(
                        team_id=self.team_id,
                        **(
                            {"timestamp__gt": timezone.now() - relativedelta(days=int(group["days"]))}
                            if group.get("days")
                            else {}
                        ),
                        **(extra_filter if extra_filter else {})
                    )
                    .order_by("distinct_id")
                    .distinct("distinct_id")
                    .values("distinct_id")
                )

                filters |= Q(persondistinctid__distinct_id__in=events)
            elif group.get("properties"):
                filter = Filter(data=group)
                filters |= Q(properties_to_Q(filter.properties, team_id=self.team_id, is_person_query=True))
        return filters

    __repr__ = sane_repr("id", "name", "last_calculation")
Example No. 12
class SessionRecordingList:
    SESSION_RECORDINGS_DEFAULT_LIMIT = 50
    _filter: SessionRecordingsFilter
    _team: Team

    def __init__(self, filter: SessionRecordingsFilter, team: Team) -> None:
        self._filter = filter
        self._team = team

    _recording_duration_select_statement = "EXTRACT(EPOCH FROM MAX(timestamp) - MIN(timestamp)) as duration,"
    _recording_full_snapshot_select_statement = "COUNT(*) FILTER(where snapshot_data->>'type' = '2' OR (snapshot_data->>'has_full_snapshot')::boolean) as full_snapshots"
    _session_recording_event_table = "posthog_sessionrecordingevent"
    _session_recording_select_statements = """
            MIN(session_recordings.start_time) as start_time,
            MIN(session_recordings.end_time) as end_time,
            MIN(session_recordings.duration) as duration,
            MIN(filtered_events.distinct_id) as distinct_id
    """
    _core_session_recording_query: str = """
        SELECT 
            all_recordings.session_id,
            all_recordings.start_time,
            all_recordings.end_time,
            all_recordings.duration,
            all_recordings.distinct_id
        FROM (
            SELECT
                session_id,
                distinct_id,
                MIN(timestamp) AS start_time,
                MAX(timestamp) AS end_time,
                {recording_duration_select_statement}
                {recording_full_snapshot_select_statement}
            FROM {session_recording_event_table}
            WHERE
                team_id = %(team_id)s
                {events_timestamp_clause}
                {distinct_id_clause}
            GROUP BY session_id, distinct_id
        ) as all_recordings
        WHERE full_snapshots > 0
        {recording_start_time_clause}
        {duration_clause} 
    """

    _limited_session_recordings_query: str = """
    {core_session_recording_query}
    ORDER BY start_time DESC
    LIMIT %(limit)s OFFSET %(offset)s
    """

    _session_recordings_query_with_entity_filter: str = """
    SELECT * FROM 
    (
        SELECT
            session_recordings.session_id,
            {session_recording_select_statements}
            {event_filter_aggregate_select_clause}
        FROM (
            {events_query}
        ) AS filtered_events
        JOIN (
            {core_session_recording_query}
        ) AS session_recordings
        ON session_recordings.distinct_id = filtered_events.distinct_id
        WHERE
            filtered_events.timestamp >= session_recordings.start_time 
            AND filtered_events.timestamp <= session_recordings.end_time
        GROUP BY session_recordings.session_id
    ) as session_recordings
    {event_filter_aggregate_where_clause}
    ORDER BY start_time DESC
    LIMIT %(limit)s OFFSET %(offset)s
    """

    def _has_entity_filters(self):
        return self._filter.entities and len(self._filter.entities) > 0

    def _get_limit(self):
        return self._filter.limit or self.SESSION_RECORDINGS_DEFAULT_LIMIT

    # We want to select events beyond the range of the recording to handle the case where
    # a recording spans the time boundaries
    def _get_events_timestamp_clause(self) -> Tuple[Dict[str, Any], str]:
        timestamp_clause = ""
        timestamp_params = {}
        if self._filter.date_from:
            timestamp_clause += "\nAND timestamp >= %(event_start_time)s"
            timestamp_params["event_start_time"] = self._filter.date_from - timedelta(hours=12)
        if self._filter.date_to:
            timestamp_clause += "\nAND timestamp <= %(event_end_time)s"
            timestamp_params["event_end_time"] = self._filter.date_to + timedelta(hours=12)
        return timestamp_params, timestamp_clause

    def _get_recording_start_time_clause(self) -> Tuple[Dict[str, Any], str]:
        start_time_clause = ""
        start_time_params = {}
        if self._filter.date_from:
            start_time_clause += "\nAND start_time >= %(start_time)s"
            start_time_params["start_time"] = self._filter.date_from
        if self._filter.date_to:
            start_time_clause += "\nAND start_time <= %(end_time)s"
            start_time_params["end_time"] = self._filter.date_to
        return start_time_params, start_time_clause

    def _get_distinct_id_clause(self) -> Tuple[Dict[str, Any], str]:
        distinct_id_clause = ""
        distinct_id_params = {}
        if self._filter.person_uuid:
            person = Person.objects.get(uuid=self._filter.person_uuid)
            distinct_id_clause = f"AND distinct_id IN (SELECT distinct_id from posthog_persondistinctid WHERE person_id = %(person_id)s AND team_id = %(team_id)s)"
            distinct_id_params = {"person_id": person.pk, "team_id": self._team.pk}
        return distinct_id_params, distinct_id_clause

    def _get_duration_clause(self) -> Tuple[Dict[str, Any], str]:
        duration_clause = ""
        duration_params = {}
        if self._filter.recording_duration_filter:
            if self._filter.recording_duration_filter.operator == "gt":
                operator = ">"
            else:
                operator = "<"
            duration_clause = "\nAND duration {operator} %(recording_duration)s".format(operator=operator)
            duration_params = {
                "recording_duration": self._filter.recording_duration_filter.value,
            }
        return duration_params, duration_clause

    def _get_events_query(self) -> Tuple[str, list]:
        events: Union[EventManager, QuerySet] = Event.objects.filter(team=self._team).order_by("-timestamp").only(
            "distinct_id", "timestamp"
        )
        if self._filter.date_from:
            events = events.filter(timestamp__gte=self._filter.date_from - timedelta(hours=12))
        if self._filter.date_to:
            events = events.filter(timestamp__lte=self._filter.date_to + timedelta(hours=12))

        keys = []
        event_q_filters = []

        for i, entity in enumerate(self._filter.entities):
            key = f"entity_{i}"
            q_filter = entity_to_Q(entity, self._team.pk)
            event_q_filters.append(q_filter)
            events = events.annotate(**{key: ExpressionWrapper(q_filter, output_field=BooleanField())})
            keys.append(key)

        combined_event_q_filter = Q()
        for events_q_filter in event_q_filters:
            combined_event_q_filter |= events_q_filter

        events = events.filter(combined_event_q_filter)
        events = events.values_list("distinct_id", "timestamp", *keys)

        with connection.cursor() as cursor:
            event_query = cursor.mogrify(*events.query.sql_with_params()).decode("utf-8")

        return event_query, keys

    def _get_events_query_with_aggregate_clauses(self) -> EventsQueryWithAggregateClausesSQL:
        event_query, keys = self._get_events_query()
        aggregate_select_clause = ""
        aggregate_having_conditions = []
        for key in keys:
            aggregate_field_name = f"count_{key}"
            aggregate_select_clause += f"\n, SUM(CASE WHEN {key} THEN 1 ELSE 0 END) as {aggregate_field_name}"
            aggregate_having_conditions.append(f"{aggregate_field_name} > 0")

        aggregate_where_clause = f"WHERE {' AND '.join(aggregate_having_conditions)}"

        return EventsQueryWithAggregateClausesSQL(event_query, {}, aggregate_select_clause, aggregate_where_clause)

    def _build_query(self) -> Tuple[str, Dict[str, Any]]:
        # One more is added to the limit to check if there are more results available
        limit = self._get_limit() + 1
        offset = self._filter.offset or 0
        base_params = {"team_id": self._team.pk, "limit": limit, "offset": offset}
        events_timestamp_params, events_timestamp_clause = self._get_events_timestamp_clause()
        recording_start_time_params, recording_start_time_clause = self._get_recording_start_time_clause()
        distinct_id_params, distinct_id_clause = self._get_distinct_id_clause()
        duration_params, duration_clause = self._get_duration_clause()

        core_session_recording_query = self._core_session_recording_query.format(
            recording_duration_select_statement=self._recording_duration_select_statement,
            recording_full_snapshot_select_statement=self._recording_full_snapshot_select_statement,
            session_recording_event_table=self._session_recording_event_table,
            distinct_id_clause=distinct_id_clause,
            events_timestamp_clause=events_timestamp_clause,
            recording_start_time_clause=recording_start_time_clause,
            duration_clause=duration_clause,
        )
        params = {
            **base_params,
            **distinct_id_params,
            **events_timestamp_params,
            **duration_params,
            **recording_start_time_params,
        }

        if self._has_entity_filters():
            (
                events_query,
                event_query_params,
                aggregate_select_clause,
                aggregate_where_clause,
            ) = self._get_events_query_with_aggregate_clauses()
            return (
                self._session_recordings_query_with_entity_filter.format(
                    session_recording_select_statements=self._session_recording_select_statements,
                    core_session_recording_query=core_session_recording_query,
                    events_query=events_query,
                    event_filter_aggregate_select_clause=aggregate_select_clause,
                    event_filter_aggregate_where_clause=aggregate_where_clause,
                ),
                {**params, **event_query_params},
            )
        return (
            self._limited_session_recordings_query.format(core_session_recording_query=core_session_recording_query),
            params,
        )

    def _data_to_return(self, results: List[Any]) -> List[Dict[str, Any]]:
        return [row._asdict() for row in results]

    def _paginate_results(self, session_recordings) -> SessionRecordingQueryResult:
        limit = self._get_limit()
        more_recordings_available = False
        if len(session_recordings) > limit:
            more_recordings_available = True
            session_recordings = session_recordings[0:limit]
        return SessionRecordingQueryResult(session_recordings, more_recordings_available)

    def run(self, *args, **kwargs) -> SessionRecordingQueryResult:
        with connection.cursor() as cursor:
            query, query_params = self._build_query()
            cursor.execute(query, query_params)
            query_results = namedtuplefetchall(cursor)
        session_recordings = self._data_to_return(query_results)
        return self._paginate_results(session_recordings)

    __repr__ = sane_repr("_team", "_filter")
Example No. 13
class Cohort(models.Model):
    name: models.CharField = models.CharField(max_length=400,
                                              null=True,
                                              blank=True)
    description: models.CharField = models.CharField(max_length=1000,
                                                     blank=True)
    team: models.ForeignKey = models.ForeignKey("Team",
                                                on_delete=models.CASCADE)
    deleted: models.BooleanField = models.BooleanField(default=False)
    groups: models.JSONField = models.JSONField(default=list)
    people: models.ManyToManyField = models.ManyToManyField(
        "Person", through="CohortPeople")
    version: models.IntegerField = models.IntegerField(blank=True, null=True)
    pending_version: models.IntegerField = models.IntegerField(blank=True,
                                                               null=True)
    count: models.IntegerField = models.IntegerField(blank=True, null=True)

    created_by: models.ForeignKey = models.ForeignKey(
        "User", on_delete=models.SET_NULL, blank=True, null=True)
    created_at: models.DateTimeField = models.DateTimeField(
        default=timezone.now, blank=True, null=True)

    is_calculating: models.BooleanField = models.BooleanField(default=False)
    last_calculation: models.DateTimeField = models.DateTimeField(blank=True,
                                                                  null=True)
    errors_calculating: models.IntegerField = models.IntegerField(default=0)

    is_static: models.BooleanField = models.BooleanField(default=False)

    objects = CohortManager()

    def get_analytics_metadata(self):
        action_groups_count: int = 0
        properties_groups_count: int = 0
        for group in self.groups:
            action_groups_count += 1 if group.get("action_id") else 0
            properties_groups_count += 1 if group.get("properties") else 0

        return {
            "name_length": len(self.name) if self.name else 0,
            "person_count_precalc": self.people.count(),
            "groups_count": len(self.groups),
            "action_groups_count": action_groups_count,
            "properties_groups_count": properties_groups_count,
            "deleted": self.deleted,
        }

    def calculate_people(self,
                         new_version: int,
                         batch_size=10000,
                         pg_batch_size=1000):
        if self.is_static:
            return
        try:
            # Fetch batch_size people at a time from ClickHouse and insert them
            # into Postgres in pg_batch_size chunks
            cursor = 0
            persons = self._clickhouse_persons_query(batch_size=batch_size,
                                                     offset=cursor)
            while persons:
                # TODO: Insert from a subquery instead of retrieving and then
                # sending large lists of data back and forth.
                to_insert = [
                    CohortPeople(person_id=person_id,
                                 cohort_id=self.pk,
                                 version=new_version)
                    #  Just pull out the person id as we don't need anything
                    #  else.
                    for person_id in persons.values_list("id", flat=True)
                ]
                #  TODO: make sure this bulk_create doesn't actually return anything
                CohortPeople.objects.bulk_create(to_insert,
                                                 batch_size=pg_batch_size)

                cursor += batch_size
                persons = self._clickhouse_persons_query(batch_size=batch_size,
                                                         offset=cursor)
                time.sleep(5)

        except Exception as err:
            # Clear the pending version people if there's an error
            batch_delete_cohort_people(self.pk, new_version)

            raise err

    def calculate_people_ch(self, pending_version):
        from ee.clickhouse.models.cohort import recalculate_cohortpeople
        from posthog.tasks.cohorts_in_feature_flag import get_cohort_ids_in_feature_flags

        logger.info("cohort_calculation_started",
                    id=self.pk,
                    current_version=self.version,
                    new_version=pending_version)
        start_time = time.monotonic()

        try:
            count = recalculate_cohortpeople(self)

            # only precalculate if the cohort is used in a feature flag
            ids = get_cohort_ids_in_feature_flags()

            if self.pk in ids:
                self.calculate_people(new_version=pending_version)
                # Update filter to match pending version if still valid
                Cohort.objects.filter(pk=self.pk).filter(
                    Q(version__lt=pending_version)
                    | Q(version__isnull=True)).update(version=pending_version,
                                                      count=count)
                self.refresh_from_db()
            else:
                self.count = count

            self.last_calculation = timezone.now()
            self.errors_calculating = 0
        except Exception:
            self.errors_calculating = F("errors_calculating") + 1
            logger.warning(
                "cohort_calculation_failed",
                id=self.pk,
                current_version=self.version,
                new_version=pending_version,
                exc_info=True,
            )
            raise
        finally:
            self.is_calculating = False
            self.save()

        logger.info(
            "cohort_calculation_completed",
            id=self.pk,
            version=pending_version,
            duration=(time.monotonic() - start_time),
        )

    def insert_users_by_list(self, items: List[str]) -> None:
        """
        Items can be distinct_id or email
        Important! Does not insert into clickhouse
        """
        batchsize = 1000
        from ee.clickhouse.models.cohort import insert_static_cohort

        try:
            cursor = connection.cursor()
            for i in range(0, len(items), batchsize):
                batch = items[i:i + batchsize]
                persons_query = (Person.objects.filter(
                    team_id=self.team_id).filter(
                        Q(persondistinctid__team_id=self.team_id,
                          persondistinctid__distinct_id__in=batch)).exclude(
                              cohort__id=self.id))
                insert_static_cohort(
                    [p for p in persons_query.values_list("uuid", flat=True)],
                    self.pk, self.team)
                sql, params = persons_query.distinct("pk").only(
                    "pk").query.sql_with_params()
                query = UPDATE_QUERY.format(
                    cohort_id=self.pk,
                    values_query=sql.replace(
                        'FROM "posthog_person"',
                        f', {self.pk}, {self.version or "NULL"} FROM "posthog_person"',
                        1,
                    ),
                )
                cursor.execute(query, params)
            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def insert_users_list_by_uuid(self, items: List[str]) -> None:
        batchsize = 1000
        try:
            cursor = connection.cursor()
            for i in range(0, len(items), batchsize):
                batch = items[i:i + batchsize]
                persons_query = (Person.objects.filter(
                    team_id=self.team_id).filter(uuid__in=batch).exclude(
                        cohort__id=self.id))
                sql, params = persons_query.distinct("pk").only(
                    "pk").query.sql_with_params()
                query = UPDATE_QUERY.format(
                    cohort_id=self.pk,
                    values_query=sql.replace(
                        'FROM "posthog_person"',
                        f', {self.pk}, {self.version or "NULL"} FROM "posthog_person"',
                        1,
                    ),
                )
                cursor.execute(query, params)

            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def __str__(self):
        return self.name or ""  # name is nullable; __str__ must not return None

    def _clickhouse_persons_query(self, batch_size=10000, offset=0):
        from ee.clickhouse.models.cohort import get_person_ids_by_cohort_id

        uuids = get_person_ids_by_cohort_id(team=self.team,
                                            cohort_id=self.pk,
                                            limit=batch_size,
                                            offset=offset)
        return Person.objects.filter(uuid__in=uuids, team=self.team)

    __repr__ = sane_repr("id", "name", "last_calculation")
Example No. 14
class Funnel(BaseQuery):

    _filter: Filter
    _team: Team

    def __init__(self, filter: Filter, team: Team) -> None:
        self._filter = filter
        self._team = team

    def _gen_lateral_bodies(self, within_time: Optional[str] = None):
        annotations = {}
        for index, step in enumerate(self._filter.entities):
            filter_key = "event" if step.type == TREND_FILTER_TYPE_EVENTS else "action__pk"
            event = (Event.objects.values("distinct_id").annotate(
                step_ts=Min("timestamp"),
                person_id=Value("99999999", IntegerField()),
            ).filter(
                self._filter.date_filter_Q,
                **{
                    filter_key: step.id
                },
                team_id=self._team.pk,
                **({
                    "distinct_id": "1234321"
                } if index > 0 else {}),
                **({
                    "timestamp__gte":
                    timezone.now().replace(year=2000,
                                           month=1,
                                           day=1,
                                           hour=0,
                                           minute=0,
                                           second=0,
                                           microsecond=0)
                } if index > 0 else {}),
            ).filter(
                properties_to_Q(
                    self._filter.properties,
                    team_id=self._team.pk,
                )).filter(
                    properties_to_Q(step.properties, team_id=self._team.pk)))
            with connection.cursor() as cursor:
                event_string = cursor.mogrify(*event.query.sql_with_params())
            # Replace placeholders injected by the Django ORM
            # We do this because the Django ORM doesn't easily allow us to parameterize sql identifiers
            # This is probably the most hacky part of the entire query generation
            event_string = (event_string.decode("utf-8").replace(
                "'1234321'", "{prev_step_person_id}"
            ).replace(
                "'2000-01-01T00:00:00+00:00'::timestamptz",
                "{prev_step_ts} %s" %
                (' AND "posthog_event"."timestamp" < "step_{}"."step_ts" + {}'.
                 format(index - 1, within_time) if within_time else ""),
            ).replace('"posthog_event"."distinct_id"',
                      '"pdi"."person_id"').replace(
                          "99999999", '"pdi"."person_id"').replace(
                              ', "pdi"."person_id" AS "person_id"', ""))
            event_string = re.sub(
                # accommodate for identifier e.g. W0 so that it still ends up right after `FROM posthog_event`
                # not after `ON pdi.distinct_id = posthog_event.distinct_id`
                r'FROM "posthog_event"( [A-Z][0-9])?',
                r"FROM posthog_event\1 JOIN posthog_persondistinctid pdi "
                #  NOTE: here we are joining on the unique identifier of the
                #  persondistinctid table, i.e. (team_id, distinct_id)
                r"ON pdi.distinct_id = posthog_event.distinct_id AND pdi.team_id = posthog_event.team_id",
                event_string,
            )
            query = sql.SQL(event_string)
            annotations["step_{}".format(index)] = query
        return annotations

    def _serialize_step(
            self,
            step: Entity,
            count: int,
            people: Optional[List[uuid.UUID]] = None) -> Dict[str, Any]:
        if step.type == TREND_FILTER_TYPE_ACTIONS:
            name = step.get_action().name
        else:
            name = step.id
        return {
            "action_id": step.id,
            "name": name,
            "custom_name": step.custom_name,
            "order": step.order,
            "people": people if people else [],
            "count": count,
            "type": step.type,
        }

    def _build_query(self, within_time: Optional[str] = None):
        """Build query using lateral joins using a combination of Django generated SQL
        and sql built using psycopg2
        """
        query_bodies = self._gen_lateral_bodies(within_time=within_time)

        ON_TRUE = "ON TRUE"
        LEFT_JOIN_LATERAL = "LEFT JOIN LATERAL"
        LAT_JOIN_BODY = ("""({query}) {step} {on_true} {join}"""
                         if len(query_bodies) > 1 else
                         """({query}) {step} {on_true} """)

        steps = [sql.Identifier(step) for step, _ in query_bodies.items()]
        select_steps = [
            sql.Composed([
                step,
                sql.SQL("."),
                sql.Identifier("step_ts"),
                sql.SQL(" as "),
                step,
            ]) for step in steps
        ]
        lateral_joins = []
        for i, (step, qb) in enumerate(query_bodies.items()):
            if i > 0:
                # Every step after the first must reference the previous
                # step's person_id and step_ts
                qb = qb.format(
                    prev_step_person_id=sql.Composed([
                        steps[i - 1],
                        sql.SQL("."),
                        sql.Identifier("person_id")
                    ]),
                    prev_step_ts=sql.Composed([
                        steps[i - 1],
                        sql.SQL("."),
                        sql.Identifier("step_ts")
                    ]),
                )

            # The join scaffolding differs for the first, middle, and last
            # steps
            if i == 0:
                # First step: alias and `LEFT JOIN LATERAL`, but no 'ON TRUE'
                base_body = sql.SQL(LAT_JOIN_BODY).format(
                    query=qb,
                    step=sql.SQL(step),
                    on_true=sql.SQL(""),
                    join=sql.SQL(LEFT_JOIN_LATERAL),
                )
            elif i == len(query_bodies) - 1:
                # Last step: alias and 'ON TRUE', but no trailing
                # `LEFT JOIN LATERAL`
                base_body = sql.SQL(LAT_JOIN_BODY).format(
                    query=qb,
                    step=sql.SQL(step),
                    on_true=sql.SQL(ON_TRUE),
                    join=sql.SQL(""),
                )
            else:
                # Middle steps: alias, 'ON TRUE', and `LEFT JOIN LATERAL`
                base_body = sql.SQL(LAT_JOIN_BODY).format(
                    query=qb,
                    step=sql.SQL(step),
                    on_true=sql.SQL(ON_TRUE),
                    join=sql.SQL(LEFT_JOIN_LATERAL),
                )
            lateral_joins.append(base_body)

        event_chain_query = sql.SQL(" ").join(lateral_joins).as_string(
            connection.connection)

        query = f"""
            SELECT
                DISTINCT ON (person.id)
                person.uuid,
                person.created_at,
                person.team_id,
                person.properties,
                person.is_user_id,
                {sql.SQL(",").join(select_steps).as_string(connection.connection)}
            FROM posthog_person person
            JOIN posthog_persondistinctid pdi ON pdi.person_id = person.id
            JOIN {event_chain_query}
            -- join on person_id for the first event.
            -- NOTE: there is some implicit coupling here in that I am
            -- assuming the name of the first event select is "step_0".
            -- Maybe worth cleaning up in the future
            ON person.id = step_0.person_id
            WHERE person.team_id = {self._team.pk} AND person.id IS NOT NULL
            ORDER BY person.id, step_0.step_ts ASC
        """
        return query

    def _build_trends_query(self, filter: Filter) -> sql.SQL:
        # TODO: Only select from_step and to_step in _build_steps_query
        particular_steps = (
            sql.SQL(f'COUNT("step_{index}") as "step_{index}_count"')
            for index in range(len(filter.entities)))
        trends_query = sql.SQL("""
            SELECT
                date_trunc({interval}, {interval_field}) as "date",
                {particular_steps}
            FROM (
                {steps_query}
            ) steps_at_dates GROUP BY "date"
        """).format(
            interval=sql.Literal(filter.interval),
            particular_steps=sql.SQL(",\n").join(particular_steps),
            steps_query=sql.SQL(self._build_query(within_time="'1 day'")),
            interval_field=sql.SQL("step_0") if filter.interval != "week" else
            sql.SQL("(\"step_0\" + interval '1 day') AT TIME ZONE 'UTC'"),
        )
        return trends_query
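
    # A sketch of what _build_trends_query renders (assuming two entities and
    # a daily interval; abbreviated):
    #
    #   SELECT
    #       date_trunc('day', step_0) as "date",
    #       COUNT("step_0") as "step_0_count",
    #       COUNT("step_1") as "step_1_count"
    #   FROM ( <lateral-join funnel query, within '1 day'> ) steps_at_dates
    #   GROUP BY "date"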

    def _get_last_step_attr(self, step: object) -> int:
        # With a single step there is no subsequent step to convert to
        if len(self._filter.entities) == 1:
            return 0
        return getattr(step,
                       "step_{}_count".format(len(self._filter.entities) - 1))

    def _get_trends(self) -> List[Dict[str, Any]]:
        serialized: Dict[str, Any] = {
            "count": 0,
            "data": [],
            "days": [],
            "labels": []
        }
        with connection.cursor() as cursor:
            qstring = self._build_trends_query(self._filter).as_string(
                cursor.connection)
            cursor.execute(qstring)
            steps_at_dates = namedtuplefetchall(cursor)

        date_range = get_daterange(self._filter.date_from
                                   or steps_at_dates[0].date,
                                   self._filter.date_to,
                                   frequency=self._filter.interval)

        data_array = [{
            "date": step.date,
            "count": round(
                self._get_last_step_attr(step) / step.step_0_count * 100),
        } for step in steps_at_dates]

        if self._filter.interval == "week":
            for df in data_array:
                df["date"] -= timedelta(days=df["date"].weekday() + 1)
        elif self._filter.interval == "month":
            for df in data_array:
                df["date"] = df["date"].replace(day=1)
        for df in data_array:
            df["date"] = df["date"].replace(tzinfo=pytz.utc).isoformat()

        datewise_data = {d["date"]: d["count"] for d in data_array}
        values = [(key, datewise_data.get(key.isoformat(), 0))
                  for key in date_range]

        for item in values:
            serialized["days"].append(item[0])
            serialized["data"].append(item[1])
            serialized["labels"].append(
                format_label_date(item[0], self._filter.interval))
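        # The result is shaped for charting, e.g. (hypothetical values):
        #   {"count": 0, "days": [datetime(...), ...], "data": [50, 0, 100],
        #    "labels": ["1-Jan-2021", ...]}
        # where each "data" entry is the conversion percentage for that bucket.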
        return [serialized]

    def data_to_return(self, results: List[Person]) -> List[Dict[str, Any]]:
        steps = []

        average_time: Dict[int, Dict[str, Any]] = {}
        for index in range(1, len(self._filter.entities)):
            average_time[index] = {
                "total_time": timedelta(0),
                "total_people": 0
            }

        person_score: Dict = defaultdict(int)
        for index, funnel_step in enumerate(self._filter.entities):
            relevant_people = []
            for person in results:
                if (index > 0 and getattr(person, "step_{}".format(index))
                        and getattr(person, "step_{}".format(index - 1))):
                    average_time[index]["total_time"] += getattr(
                        person, "step_{}".format(index)) - getattr(
                            person, "step_{}".format(index - 1))
                    average_time[index]["total_people"] += 1

                if getattr(person, "step_{}".format(index)):
                    person_score[person.uuid] += 1
                    relevant_people.append(person.uuid)
            steps.append(
                self._serialize_step(funnel_step, len(relevant_people),
                                     relevant_people))

        # Keep only the top 100 people per step, ranked by how many steps
        # they completed
        for index, _ in enumerate(steps):
            steps[index]["people"] = sorted(steps[index]["people"],
                                            key=lambda p: person_score[p],
                                            reverse=True)[:100]

        # average_time[index] accumulates the conversion time from step
        # index - 1 to step index; the average is attached to the earlier
        # step
        for index in average_time.keys():
            steps[index - 1]["average_time"] = (
                (average_time[index]["total_time"].total_seconds() /
                 average_time[index]["total_people"])
                if average_time[index]["total_people"] > 0 else 0)
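        # Worked example with hypothetical numbers: if two people converted
        # from step 0 to step 1 in 30s and 90s, average_time[1] holds
        # {"total_time": timedelta(seconds=120), "total_people": 2}, and
        # steps[0]["average_time"] becomes 60.0 above.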

        return steps

    def run(self, *args, **kwargs) -> List[Dict[str, Any]]:
        """
        Builds and runs a query to get all persons that have been in the funnel
        steps defined by `self._filter.entities`. For example, entities may be
        defined as:

            1. event with event name "user signed up"
            2. event with event name "user looked at report"

        For a person to match, they must have gone through all `entities` in
        order. We also return only one such chain of entities per person: the
        earliest one we find.
        """

        # If no steps are defined, then there's no point in querying the database
        if len(self._filter.entities) == 0:
            return []

        if self._filter.display == TRENDS_LINEAR:
            return self._get_trends()

        with connection.cursor() as cursor:
            # Build the full lateral-join query that checks the steps in order
            qstring = self._build_query(within_time=None)

            cursor.execute(qstring)
            results = namedtuplefetchall(cursor)
        return self.data_to_return(results)

    __repr__ = sane_repr("_team", "_filter")
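

# A minimal usage sketch (hypothetical: the funnel class's real name and
# constructor are defined earlier in this file, and the event names plus the
# `FunnelClass(filter=..., team=...)` signature below are assumptions):
#
#     filter = Filter(data={"events": [
#         {"id": "user signed up", "type": "events", "order": 0},
#         {"id": "user looked at report", "type": "events", "order": 1},
#     ]})
#     funnel = FunnelClass(filter=filter, team=team)  # hypothetical name
#     steps = funnel.run()
#     # -> [{"action_id": "user signed up", "count": ..., ...},
#     #     {"action_id": "user looked at report", "count": ..., ...}]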