class AsyncMigrationOperationSQL(AsyncMigrationOperation):
    """An async migration step driven by a single SQL statement, with an optional rollback statement.

    The target database defaults to ClickHouse; Postgres is used for any other value.
    """

    def __init__(
        self,
        *,
        sql: str,
        rollback: Optional[str],
        database: AnalyticsDBMS = AnalyticsDBMS.CLICKHOUSE,
        timeout_seconds: int = ASYNC_MIGRATIONS_DEFAULT_TIMEOUT_SECONDS,
    ):
        self.sql = sql
        self.rollback = rollback
        self.database = database
        self.timeout_seconds = timeout_seconds

    def fn(self, query_id: str):
        # Forward direction: run the configured statement.
        self._execute_op(query_id, self.sql)

    def rollback_fn(self, query_id: str):
        # A rollback statement is optional; with None this operation is irreversible and
        # rolling back is a no-op.
        if self.rollback is None:
            return
        self._execute_op(query_id, self.rollback)

    def _execute_op(self, query_id: str, sql: str):
        # Imported lazily so importing this module doesn't pull in the utils module at load time.
        from posthog.async_migrations.utils import execute_op_clickhouse, execute_op_postgres

        if self.database == AnalyticsDBMS.CLICKHOUSE:
            execute_op_clickhouse(sql, query_id, self.timeout_seconds)
        else:
            # NOTE: the timeout only applies to ClickHouse; the Postgres path does not take one.
            execute_op_postgres(sql, query_id)

    __repr__ = sane_repr("sql", "rollback", "database", "timeout_seconds", include_id=False)
class DashboardPrivilege(UUIDModel):
    """An explicit, per-user access level for a single dashboard.

    Levels come from Dashboard.RestrictionLevel; each (dashboard, user) pair can have
    at most one privilege row.
    """

    dashboard: models.ForeignKey = models.ForeignKey(
        "posthog.Dashboard",
        on_delete=models.CASCADE,
        related_name="privileges",
        related_query_name="privilege",
    )
    user: models.ForeignKey = models.ForeignKey(
        "posthog.User",
        on_delete=models.CASCADE,
        related_name="explicit_dashboard_privileges",
        related_query_name="explicit_dashboard_privilege",
    )
    # Access level granted to the user for this dashboard.
    level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField(
        choices=Dashboard.RestrictionLevel.choices)
    added_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)

    class Meta:
        # Enforce one privilege row per (dashboard, user) pair.
        constraints = [
            models.UniqueConstraint(
                fields=["dashboard", "user"], name="unique_explicit_dashboard_privilege"),
        ]

    __repr__ = sane_repr("dashboard", "user", "level")
class EventProperty(models.Model):
    """Records that a given property key has been seen on a given event name for a team.

    One row per unique (team, event, property) combination.
    """

    team: models.ForeignKey = models.ForeignKey(Team, on_delete=models.CASCADE)
    event: models.CharField = models.CharField(max_length=400, null=False)
    property: models.CharField = models.CharField(max_length=400, null=False)

    class Meta:
        constraints = [
            # A property is recorded at most once per team/event pair.
            models.UniqueConstraint(
                fields=["team", "event", "property"], name="posthog_event_property_unique_team_event_property"),
        ]
        # Support lookups by event and by property within a team.
        indexes = [
            models.Index(fields=["team", "event"]),
            models.Index(fields=["team", "property"]),
        ]

    __repr__ = sane_repr("event", "property", "team_id")
class BaseFilter(BaseParamMixin):
    """Base class for filter objects built from a raw data dict and/or an HTTP request.

    Mixins contribute serializable keys via methods decorated with @include_dict,
    which to_dict() collects.
    """

    def __init__(self, data: Optional[Dict[str, Any]] = None, request: Optional[request.Request] = None, **kwargs) -> None:
        if request:
            # Merge sources in increasing precedence: query params, request body,
            # then the explicitly supplied data dict.
            merged: Dict[str, Any] = {}
            merged.update(request.GET.dict())
            merged.update(request.data)
            if data:
                merged.update(data)
            data = merged
        elif not data:
            raise ValueError(
                "You need to define either a data dict or a request")
        self._data = data
        self.kwargs = kwargs

        # When a team is available, collapse this filter to its simplified form once
        # (guarded by is_simplified so already-simplified filters aren't re-processed).
        should_simplify = (
            "team" in kwargs
            and hasattr(self, "simplify")
            and not getattr(self, "is_simplified", False)
        )
        if should_simplify:
            self._data = getattr(self, "simplify")(kwargs["team"])._data

    def to_dict(self) -> Dict[str, Any]:
        """Collect key/value pairs from every @include_dict-decorated method."""
        collected: Dict[str, Any] = {}
        for _name, bound_method in inspect.getmembers(self, inspect.ismethod):
            if hasattr(bound_method, "include_dict"):  # provided by @include_dict decorator
                collected.update(bound_method())
        return collected

    def to_params(self) -> Dict[str, str]:
        """Serialize the filter as GET-request parameters."""
        return encode_get_request_params(data=self.to_dict())

    def toJSON(self):
        """Serialize the filter as pretty-printed JSON."""
        return json.dumps(self.to_dict(), default=lambda o: o.__dict__, sort_keys=True, indent=4)

    def with_data(self, overrides: Dict[str, Any]):
        """Allow making copy of filter whilst preserving the class"""
        merged_data = dict(self._data)
        merged_data.update(overrides)
        return type(self)(data=merged_data, **self.kwargs)

    __repr__ = sane_repr("_data", "kwargs", include_id=False)
class License(models.Model):
    """A license key unlocking premium features on a self-hosted instance.

    Each plan maps to a fixed list of AvailableFeature values via PLANS.
    """

    objects: LicenseManager = LicenseManager()

    created_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    plan: models.CharField = models.CharField(max_length=200)
    valid_until: models.DateTimeField = models.DateTimeField()
    key: models.CharField = models.CharField(max_length=200)
    max_users: models.IntegerField = models.IntegerField(
        default=None, null=True)  # None = no restriction

    SCALE_PLAN = "scale"
    SCALE_FEATURES = [
        AvailableFeature.ZAPIER,
        AvailableFeature.ORGANIZATIONS_PROJECTS,
        AvailableFeature.GOOGLE_LOGIN,
        AvailableFeature.DASHBOARD_COLLABORATION,
        AvailableFeature.INGESTION_TAXONOMY,
        AvailableFeature.PATHS_ADVANCED,
        AvailableFeature.CORRELATION_ANALYSIS,
        AvailableFeature.GROUP_ANALYTICS,
        AvailableFeature.MULTIVARIATE_FLAGS,
        AvailableFeature.EXPERIMENTATION,
        AvailableFeature.TAGGING,
        AvailableFeature.BEHAVIORAL_COHORT_FILTERING,
        AvailableFeature.WHITE_LABELLING,
        AvailableFeature.SUBSCRIPTIONS,
    ]

    ENTERPRISE_PLAN = "enterprise"
    # Enterprise is a strict superset of Scale.
    ENTERPRISE_FEATURES = SCALE_FEATURES + [
        AvailableFeature.DASHBOARD_PERMISSIONING,
        AvailableFeature.PROJECT_BASED_PERMISSIONING,
        AvailableFeature.SAML,
        AvailableFeature.SSO_ENFORCEMENT,
    ]
    PLANS = {SCALE_PLAN: SCALE_FEATURES, ENTERPRISE_PLAN: ENTERPRISE_FEATURES}
    # The higher the plan, the higher its sorting value - sync with front-end licenseLogic
    PLAN_TO_SORTING_VALUE = {SCALE_PLAN: 10, ENTERPRISE_PLAN: 20}

    @property
    def available_features(self) -> List[AvailableFeature]:
        # Unknown plans grant no features.
        return self.PLANS.get(self.plan, [])

    __repr__ = sane_repr("key", "plan", "valid_until")
class ExplicitTeamMembership(UUIDModel):
    """Project-level (team) membership, granted on top of an organization membership."""

    class Level(models.IntegerChoices):
        """Keep in sync with OrganizationMembership.Level (only difference being organizations having an Owner)."""

        MEMBER = 1, "member"
        ADMIN = 8, "administrator"

    team: models.ForeignKey = models.ForeignKey(
        "posthog.Team",
        on_delete=models.CASCADE,
        related_name="explicit_memberships",
        related_query_name="explicit_membership",
    )
    # The organization membership this team membership hangs off of;
    # deleting the org membership cascades here too.
    parent_membership: models.ForeignKey = models.ForeignKey(
        "posthog.OrganizationMembership",
        on_delete=models.CASCADE,
        related_name="explicit_team_memberships",
        related_query_name="explicit_team_membership",
    )
    level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField(
        default=Level.MEMBER, choices=Level.choices)
    joined_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)

    class Meta:
        constraints = [
            # One explicit membership per (team, organization membership) pair.
            models.UniqueConstraint(fields=["team", "parent_membership"], name="unique_explicit_team_membership"),
        ]

    def __str__(self):
        return str(self.Level(self.level))

    @property
    def effective_level(self) -> "OrganizationMembership.Level":
        """If organization level is higher than project level, then that takes precedence over explicit project level.
        """
        return max(self.level, self.parent_membership.level)

    __repr__ = sane_repr("team", "parent_membership", "level")
class Entity(PropertyMixin):
    """
    Entities represent either Action or Event objects, nested in Filter objects.
    This object isn't a table in the database. It gets stored against the specific models itself as JSON.
    This class just allows for stronger typing of this object.
    """

    id: Union[int, str]
    type: Literal["events", "actions"]
    order: Optional[int]
    name: Optional[str]
    custom_name: Optional[str]
    math: Optional[MATH_TYPE]
    math_property: Optional[str]
    math_group_type_index: Optional[GroupTypeIndex]
    # Index is not set at all by default (meaning: access = AttributeError) - it's populated in EntitiesMixin.entities
    # Used for identifying entities within a single query during query building,
    # which generally uses Entity objects processed by EntitiesMixin
    # The clean room way to do this would be passing the index _alongside_ the object, but OOP abuse is much less work
    index: int

    def __init__(self, data: Dict[str, Any]) -> None:
        """Build an Entity from its JSON-ish dict representation.

        Raises TypeError if "type" is missing or not one of the two supported kinds.
        """
        self.id = data["id"]
        if not data.get("type") or data["type"] not in [
                TREND_FILTER_TYPE_ACTIONS,
                TREND_FILTER_TYPE_EVENTS,
        ]:
            raise TypeError(
                "Type needs to be either TREND_FILTER_TYPE_ACTIONS or TREND_FILTER_TYPE_EVENTS"
            )
        self.type = data["type"]
        order_provided = data.get("order")
        if order_provided is not None:
            order_provided = int(order_provided)
        self.order = order_provided
        self.name = data.get("name")
        custom_name = data.get("custom_name")
        if custom_name is not None:
            # Normalize blank/whitespace-only custom names to None.
            custom_name = str(custom_name).strip() or None
        self.custom_name = custom_name
        self.math = data.get("math")
        self.math_property = data.get("math_property")
        self.math_group_type_index = validate_group_type_index(
            "math_group_type_index", data.get("math_group_type_index"))

        self._action: Optional[Action] = None
        self._data = data  # push data to instance object so mixins are handled properly

        if self.type == TREND_FILTER_TYPE_EVENTS and not self.name:
            # It won't be an int if it's an event, but mypy...
            self.name = str(self.id)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize back to the dict shape __init__ accepts (plus resolved properties)."""
        return {
            "id": self.id,
            "type": self.type,
            "order": self.order,
            "name": self.name,
            "custom_name": self.custom_name,
            "math": self.math,
            "math_property": self.math_property,
            "math_group_type_index": self.math_group_type_index,
            "properties": self.property_groups.to_dict(),
        }

    def equals(self, other) -> bool:
        """ Checks if two entities are semantically equal."""
        # Not using __eq__ since that affects hashability

        if self.id != other.id:
            return False

        if self.type != other.type:
            return False

        # TODO: Check operators as well, not just the properties.
        # Effectively check within each property group, that they're the same.
        self_properties = sorted(
            str(prop) for prop in self.property_groups.flat)
        other_properties = sorted(
            str(prop) for prop in other.property_groups.flat)
        if self_properties != other_properties:
            return False

        return True

    def is_superset(self, other) -> bool:
        """ Checks if this entity is a superset version of other. The ids match and the properties of (this) is a subset of the properties of (other)"""
        self_properties = Counter(
            [str(prop) for prop in self.property_groups.flat])
        other_properties = Counter(
            [str(prop) for prop in other.property_groups.flat])

        return self.id == other.id and len(self_properties - other_properties) == 0

    def get_action(self) -> Action:
        """Fetch (and cache) the Action this entity points at.

        Raises ValueError for event-type entities and ValidationError when the
        referenced action doesn't exist.
        """
        if self.type != TREND_FILTER_TYPE_ACTIONS:
            raise ValueError(
                f"Action can only be fetched for entities of type {TREND_FILTER_TYPE_ACTIONS}, not {self.type}!"
            )

        # Cache is bypassed in tests so fixtures created mid-test are picked up.
        if self._action and not settings.TEST:
            return self._action

        try:
            self._action = Action.objects.get(id=self.id)
            return self._action
        except Action.DoesNotExist:
            # Previously a bare `except:` - that converted *every* failure (DB outage,
            # bad id type, even KeyboardInterrupt) into "does not exist". Only catch
            # the actual missing-row case.
            raise ValidationError(f"Action ID {self.id} does not exist!")

    __repr__ = sane_repr("id", "type", "order", "name", "custom_name", "math", "math_property", "properties")
class Organization(UUIDModel):
    """Top-level account object: owns teams (projects), members and billing state."""

    class Meta:
        constraints = [
            # At most one organization in the instance may be flagged for internal metrics.
            models.UniqueConstraint(
                fields=["for_internal_metrics"],
                condition=Q(for_internal_metrics=True),
                name="single_for_internal_metrics",
            ),
        ]

    class PluginsAccessLevel(models.IntegerChoices):
        # None means the organization can't use plugins at all. They're hidden. Cloud default.
        NONE = 0, "none"
        # Config means the organization can only enable/disable/configure globally managed plugins.
        # This prevents config orgs from running untrusted code, which the next levels can do.
        CONFIG = 3, "config"
        # Install means the organization has config capabilities + can install own editor/GitHub/GitLab/npm plugins.
        # The plugin repository is off limits, as repository installations are managed by root orgs to avoid confusion.
        INSTALL = 6, "install"
        # Root means the organization has unrestricted plugins access on the instance. Self-hosted default.
        # This includes installing plugins from the repository and managing plugin installations for all other orgs.
        ROOT = 9, "root"

    members: models.ManyToManyField = models.ManyToManyField(
        "posthog.User",
        through="posthog.OrganizationMembership",
        related_name="organizations",
        related_query_name="organization",
    )
    name: models.CharField = models.CharField(max_length=64)
    slug: LowercaseSlugField = LowercaseSlugField(unique=True, max_length=MAX_SLUG_LENGTH)
    created_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)
    domain_whitelist: ArrayField = ArrayField(
        models.CharField(max_length=256, blank=False), blank=True, default=list
    )  # Used to allow self-serve account creation based on social login (#5111)
    plugins_access_level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField(
        # Cloud orgs default to CONFIG (no untrusted code); self-hosted gets full ROOT access.
        default=PluginsAccessLevel.CONFIG if settings.MULTI_TENANCY else PluginsAccessLevel.ROOT,
        choices=PluginsAccessLevel.choices,
    )
    # Cached feature list; refreshed by update_available_features().
    available_features = ArrayField(models.CharField(max_length=64, blank=False), blank=True, default=list)
    for_internal_metrics: models.BooleanField = models.BooleanField(
        default=False)
    is_member_join_email_enabled: models.BooleanField = models.BooleanField(
        default=True)

    # DEPRECATED attributes (should be removed on next major version)
    setup_section_2_completed: models.BooleanField = models.BooleanField(
        default=True)
    personalization: models.JSONField = models.JSONField(default=dict, null=False, blank=True)

    objects: OrganizationManager = OrganizationManager()

    def __str__(self):
        return self.name

    __repr__ = sane_repr("name")

    @property
    def _billing_plan_details(self) -> Tuple[Optional[str], Optional[str]]:
        """
        Obtains details on the billing plan for the organization.
        Returns a tuple with (billing_plan_key, billing_realm)
        """
        # If on Cloud, grab the organization's price
        if hasattr(self, "billing"):
            if self.billing is None:  # type: ignore
                return (None, None)
            return (self.billing.get_plan_key(), "cloud")  # type: ignore

        # Otherwise, try to find a valid license on this instance
        if License is not None:
            license = License.objects.first_valid()
            if license:
                return (license.plan, "ee")
        return (None, None)

    @property
    def billing_plan(self) -> Optional[str]:
        # Just the plan key, without the realm.
        return self._billing_plan_details[0]

    def update_available_features(self) -> List[Union[AvailableFeature, str]]:
        """Updates field `available_features`. Does not `save()`."""
        plan, realm = self._billing_plan_details
        if not plan:
            self.available_features = []
        elif realm == "ee":
            # Self-hosted: features come from the license plan mapping.
            self.available_features = License.PLANS.get(plan, [])
        else:
            # Cloud: features come straight from the billing object.
            self.available_features = self.billing.available_features  # type: ignore
        return self.available_features

    def is_feature_available(self, feature: Union[AvailableFeature, str]) -> bool:
        # NOTE: checks the cached list only; call update_available_features() to refresh.
        return feature in self.available_features

    @property
    def active_invites(self) -> QuerySet:
        # Invites not yet past their validity window.
        return self.invites.filter(
            created_at__gte=timezone.now() - timezone.timedelta(days=INVITE_DAYS_VALIDITY))

    def get_analytics_metadata(self):
        """Summary stats reported to product analytics."""
        return {
            "member_count": self.members.count(),
            "project_count": self.teams.count(),
            "person_count": sum(team.person_set.count() for team in self.teams.all()),
            "name": self.name,
        }
class OrganizationInvite(UUIDModel):
    """An invitation for a (possibly not-yet-registered) user to join an organization."""

    organization: models.ForeignKey = models.ForeignKey(
        "posthog.Organization",
        on_delete=models.CASCADE,
        related_name="invites",
        related_query_name="invite",
    )
    target_email: models.EmailField = models.EmailField(null=True, db_index=True)
    first_name: models.CharField = models.CharField(max_length=30, blank=True, default="")
    created_by: models.ForeignKey = models.ForeignKey(
        "posthog.User",
        on_delete=models.SET_NULL,
        related_name="organization_invites",
        related_query_name="organization_invite",
        null=True,
    )
    # Whether we already tried to send the invite email (regardless of success).
    emailing_attempt_made: models.BooleanField = models.BooleanField(
        default=False)
    created_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)

    def validate(self, *, user: Optional["User"] = None, email: Optional[str] = None) -> None:
        """Raise ValidationError unless the invite can be used by this user/email.

        Checks, in order: recipient email matches, invite not expired, user not already
        a member, and no existing member with the target email.
        """
        _email = email or getattr(user, "email", None)

        if _email and _email != self.target_email:
            raise exceptions.ValidationError(
                f"This invite is intended for another email address: {mask_email_address(self.target_email)}"
                f". You tried to sign up with {_email}.",
                code="invalid_recipient",
            )

        if self.is_expired():
            raise exceptions.ValidationError(
                "This invite has expired. Please ask your admin for a new one.",
                code="expired",
            )

        if OrganizationMembership.objects.filter(
                organization=self.organization, user=user).exists():
            raise exceptions.ValidationError(
                "You already are a member of this organization.",
                code="user_already_member",
            )

        if OrganizationMembership.objects.filter(
                organization=self.organization,
                user__email=self.target_email,
        ).exists():
            raise exceptions.ValidationError(
                "Another user with this email address already belongs to this organization.",
                code="existing_email_address",
            )

    def use(self, user: "User", *, prevalidated: bool = False) -> None:
        """Consume the invite: join the user to the organization and clean up.

        Also sends a member-join email when enabled, and deletes ALL pending invites
        for this email address (case-insensitive), not just this one.
        """
        if not prevalidated:
            self.validate(user=user)
        user.join(organization=self.organization)
        if is_email_available(
                with_absolute_urls=True
        ) and self.organization.is_member_join_email_enabled:
            from posthog.tasks.email import send_member_join

            send_member_join.apply_async(
                kwargs={
                    "invitee_uuid": user.uuid,
                    "organization_id": self.organization.id
                })
        OrganizationInvite.objects.filter(
            target_email__iexact=self.target_email).delete()

    def is_expired(self) -> bool:
        """Check if invite is older than INVITE_DAYS_VALIDITY days."""
        # timedelta's first positional argument is days.
        return self.created_at < timezone.now() - timezone.timedelta(
            INVITE_DAYS_VALIDITY)

    def __str__(self):
        # The signup URL the invitee follows.
        return f"{settings.SITE_URL}/signup/{self.id}"

    __repr__ = sane_repr("organization", "target_email", "created_by")
class OrganizationMembership(UUIDModel):
    """Link between a User and an Organization, carrying the member's access level."""

    class Level(models.IntegerChoices):
        """Keep in sync with TeamMembership.Level (only difference being projects not having an Owner)."""

        MEMBER = 1, "member"
        ADMIN = 8, "administrator"
        OWNER = 15, "owner"

    organization: models.ForeignKey = models.ForeignKey(
        "posthog.Organization",
        on_delete=models.CASCADE,
        related_name="memberships",
        related_query_name="membership")
    user: models.ForeignKey = models.ForeignKey(
        "posthog.User",
        on_delete=models.CASCADE,
        related_name="organization_memberships",
        related_query_name="organization_membership",
    )
    level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField(
        default=Level.MEMBER, choices=Level.choices)
    joined_at: models.DateTimeField = models.DateTimeField(auto_now_add=True)
    updated_at: models.DateTimeField = models.DateTimeField(auto_now=True)

    class Meta:
        constraints = [
            # A user belongs to an organization at most once.
            models.UniqueConstraint(fields=["organization_id", "user_id"], name="unique_organization_membership"),
            # Level 15 == OWNER; only one owner per organization.
            models.UniqueConstraint(fields=["organization_id"], condition=models.Q(level=15), name="only_one_owner_per_organization"),
        ]

    def __str__(self):
        return str(self.Level(self.level))

    def validate_update(self, membership_being_updated: "OrganizationMembership", new_level: Optional[Level] = None) -> None:
        """Raise PermissionDenied unless `self` (the acting member) may apply this update.

        NOTE: has a side effect when ownership is passed on - the current owner
        (`self`) is demoted to ADMIN and saved immediately.
        """
        if new_level is not None:
            if membership_being_updated.id == self.id:
                raise exceptions.PermissionDenied(
                    "You can't change your own access level.")
            if new_level == OrganizationMembership.Level.OWNER:
                # Ownership transfer: only the current owner can do it, and they
                # step down to admin in the process.
                if self.level != OrganizationMembership.Level.OWNER:
                    raise exceptions.PermissionDenied(
                        "You can only pass on organization ownership if you're its owner."
                    )
                self.level = OrganizationMembership.Level.ADMIN
                self.save()
            elif new_level > self.level:
                raise exceptions.PermissionDenied(
                    "You can only change access level of others to lower or equal to your current one."
                )
        if membership_being_updated.id != self.id:
            if membership_being_updated.organization_id != self.organization_id:
                raise exceptions.PermissionDenied(
                    "You both need to belong to the same organization.")
            if self.level < OrganizationMembership.Level.ADMIN:
                raise exceptions.PermissionDenied(
                    "You can only edit others if you are an admin.")
            if membership_being_updated.level > self.level:
                raise exceptions.PermissionDenied(
                    "You can only edit others with level lower or equal to you."
                )

    __repr__ = sane_repr("organization", "user", "level")
class Cohort(models.Model):
    """A saved group of people, defined by behavioral/property groups or a static list.

    Membership is materialized into the CohortPeople join table by the calculate_*
    methods below, which build raw INSERT ... SELECT SQL from Django querysets.
    """

    name: models.CharField = models.CharField(max_length=400, null=True, blank=True)
    team: models.ForeignKey = models.ForeignKey("Team", on_delete=models.CASCADE)
    deleted: models.BooleanField = models.BooleanField(default=False)
    # List of group dicts; each has either an "action_id" (+ optional "days") or "properties".
    groups: models.JSONField = models.JSONField(default=list)
    people: models.ManyToManyField = models.ManyToManyField("Person", through="CohortPeople")
    created_by: models.ForeignKey = models.ForeignKey("User", on_delete=models.SET_NULL, blank=True, null=True)
    created_at: models.DateTimeField = models.DateTimeField(default=timezone.now, blank=True, null=True)
    is_calculating: models.BooleanField = models.BooleanField(default=False)
    last_calculation: models.DateTimeField = models.DateTimeField(blank=True, null=True)
    errors_calculating: models.IntegerField = models.IntegerField(default=0)
    # Static cohorts are fixed person lists (see insert_users_*) and are never recalculated.
    is_static: models.BooleanField = models.BooleanField(default=False)

    objects = CohortManager()

    def get_analytics_metadata(self):
        """Summary stats about this cohort's definition, reported to product analytics."""
        action_groups_count: int = 0
        properties_groups_count: int = 0

        for group in self.groups:
            action_groups_count += 1 if group.get("action_id") else 0
            properties_groups_count += 1 if group.get("properties") else 0

        return {
            "name_length": len(self.name) if self.name else 0,
            "person_count_precalc": self.people.count(),
            "groups_count": len(self.groups),
            "action_groups_count": action_groups_count,
            "properties_groups_count": properties_groups_count,
            "deleted": self.deleted,
        }

    # NOTE(review): the default is evaluated ONCE at import time, so a ClickHouse
    # toggle flipped at runtime won't be picked up by existing processes - confirm intended.
    def calculate_people(self, use_clickhouse=is_clickhouse_enabled()):
        """Recompute CohortPeople rows for this cohort via raw DELETE+INSERT SQL.

        No-op for static cohorts. On failure, increments errors_calculating and
        reports to Sentry (re-raises only in DEBUG).
        """
        if self.is_static:
            return
        try:
            if not use_clickhouse:
                self.is_calculating = True
                self.save()

            persons_query = self._clickhouse_persons_query() if use_clickhouse else self._postgres_persons_query()
            try:
                sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
            except EmptyResultSet:
                # No matching people: just clear existing rows.
                query = DELETE_QUERY.format(cohort_id=self.pk)
                params = {}
            else:
                # Rewrite the persons SELECT to also emit this cohort's id for the INSERT.
                query = "{}{}".format(DELETE_QUERY, UPDATE_QUERY).format(
                    cohort_id=self.pk,
                    values_query=sql.replace('FROM "posthog_person"', ', {} FROM "posthog_person"'.format(self.pk), 1,),
                )

            cursor = connection.cursor()
            with transaction.atomic():
                cursor.execute(query, params)
                self.is_calculating = False
                self.last_calculation = timezone.now()
                self.errors_calculating = 0
                self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def calculate_people_ch(self):
        """Recalculate ClickHouse-side cohort membership (no-op without ClickHouse)."""
        if is_clickhouse_enabled():
            from ee.clickhouse.models.cohort import recalculate_cohortpeople

            recalculate_cohortpeople(self)

    def insert_users_by_list(self, items: List[str]) -> None:
        """
        Items can be distinct_id or email
        """
        batchsize = 1000
        use_clickhouse = is_clickhouse_enabled()
        if use_clickhouse:
            from ee.clickhouse.models.cohort import insert_static_cohort
        try:
            cursor = connection.cursor()
            for i in range(0, len(items), batchsize):
                batch = items[i : i + batchsize]
                # People in this team matching the batch of distinct_ids, minus those already in the cohort.
                persons_query = (
                    Person.objects.filter(team_id=self.team_id)
                    .filter(Q(persondistinctid__team_id=self.team_id, persondistinctid__distinct_id__in=batch))
                    .exclude(cohort__id=self.id)
                )
                if use_clickhouse:
                    insert_static_cohort([p for p in persons_query.values_list("uuid", flat=True)], self.pk, self.team)
                sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
                query = UPDATE_QUERY.format(
                    cohort_id=self.pk,
                    values_query=sql.replace('FROM "posthog_person"', ', {} FROM "posthog_person"'.format(self.pk), 1,),
                )
                cursor.execute(query, params)

            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def insert_users_list_by_uuid(self, items: List[str]) -> None:
        """Like insert_users_by_list, but the items are person UUIDs (Postgres only)."""
        batchsize = 1000
        try:
            cursor = connection.cursor()
            for i in range(0, len(items), batchsize):
                batch = items[i : i + batchsize]
                persons_query = (
                    Person.objects.filter(team_id=self.team_id).filter(uuid__in=batch).exclude(cohort__id=self.id)
                )
                sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
                query = UPDATE_QUERY.format(
                    cohort_id=self.pk,
                    values_query=sql.replace('FROM "posthog_person"', ', {} FROM "posthog_person"'.format(self.pk), 1,),
                )
                cursor.execute(query, params)

            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def __str__(self):
        # NOTE(review): name is nullable - __str__ would raise for a nameless cohort; confirm callers.
        return self.name

    def _clickhouse_persons_query(self):
        # Resolve cohort membership in ClickHouse, then mirror it as a Postgres queryset by uuid.
        from ee.clickhouse.models.cohort import get_person_ids_by_cohort_id

        uuids = get_person_ids_by_cohort_id(team=self.team, cohort_id=self.pk)
        return Person.objects.filter(uuid__in=uuids, team=self.team)

    def _postgres_persons_query(self):
        return Person.objects.filter(self._people_filter(), team=self.team)

    def _people_filter(self, extra_filter=None):
        """Build a Q combining all groups (OR-ed): action-based via event distinct_ids, property-based via properties_to_Q."""
        from posthog.queries.base import properties_to_Q

        filters = Q()
        for group in self.groups:
            if group.get("action_id"):
                action = Action.objects.get(pk=group["action_id"], team_id=self.team_id)
                events = (
                    Event.objects.filter_by_action(action)
                    .filter(
                        team_id=self.team_id,
                        **(
                            # Optional recency window: only events in the last N days count.
                            {"timestamp__gt": timezone.now() - relativedelta(days=int(group["days"]))}
                            if group.get("days")
                            else {}
                        ),
                        **(extra_filter if extra_filter else {})
                    )
                    .order_by("distinct_id")
                    .distinct("distinct_id")
                    .values("distinct_id")
                )
                filters |= Q(persondistinctid__distinct_id__in=events)
            elif group.get("properties"):
                filter = Filter(data=group)
                filters |= Q(properties_to_Q(filter.properties, team_id=self.team_id, is_person_query=True))
        return filters

    __repr__ = sane_repr("id", "name", "last_calculation")
class SessionRecordingList:
    """Builds and runs the raw Postgres query listing session recordings that match a filter.

    A "recording" is a group of posthog_sessionrecordingevent rows sharing a session_id;
    only sessions containing at least one full snapshot are returned. Optional entity
    (event/action) filters are applied by joining recordings against matching events.
    """

    SESSION_RECORDINGS_DEFAULT_LIMIT = 50

    _filter: SessionRecordingsFilter
    _team: Team

    def __init__(self, filter: SessionRecordingsFilter, team: Team) -> None:
        self._filter = filter
        self._team = team

    # Duration = wall-clock span between a session's first and last event.
    _recording_duration_select_statement = "EXTRACT(EPOCH FROM MAX(timestamp) - MIN(timestamp)) as duration,"
    # Count of full-snapshot events; sessions without any are filtered out below.
    _recording_full_snapshot_select_statement = "COUNT(*) FILTER(where snapshot_data->>'type' = '2' OR (snapshot_data->>'has_full_snapshot')::boolean) as full_snapshots"
    _session_recording_event_table = "posthog_sessionrecordingevent"
    _session_recording_select_statements = """
            MIN(session_recordings.start_time) as start_time,
            MIN(session_recordings.end_time) as end_time,
            MIN(session_recordings.duration) as duration,
            MIN(filtered_events.distinct_id) as distinct_id
    """
    # Inner query: aggregate events into one row per (session_id, distinct_id).
    _core_session_recording_query: str = """
        SELECT
            all_recordings.session_id,
            all_recordings.start_time,
            all_recordings.end_time,
            all_recordings.duration,
            all_recordings.distinct_id
        FROM (
            SELECT
                session_id,
                distinct_id,
                MIN(timestamp) AS start_time,
                MAX(timestamp) AS end_time,
                {recording_duration_select_statement}
                {recording_full_snapshot_select_statement}
            FROM {session_recording_event_table}
            WHERE
                team_id = %(team_id)s
                {events_timestamp_clause}
                {distinct_id_clause}
            GROUP BY session_id, distinct_id
        ) as all_recordings
        WHERE full_snapshots > 0
        {recording_start_time_clause}
        {duration_clause}
    """
    _limited_session_recordings_query: str = """
    {core_session_recording_query}
    ORDER BY start_time DESC
    LIMIT %(limit)s OFFSET %(offset)s
    """
    # Entity-filter variant: join recordings with the filtered events of the same
    # distinct_id that fall inside the recording's time span, then require each
    # entity's aggregate count to be > 0.
    _session_recordings_query_with_entity_filter: str = """
    SELECT * FROM
    (
        SELECT
            session_recordings.session_id,
            {session_recording_select_statements}
            {event_filter_aggregate_select_clause}
        FROM (
            {events_query}
        ) AS filtered_events
        JOIN (
            {core_session_recording_query}
        ) AS session_recordings
        ON session_recordings.distinct_id = filtered_events.distinct_id
        WHERE
            filtered_events.timestamp >= session_recordings.start_time
            AND filtered_events.timestamp <= session_recordings.end_time
        GROUP BY session_recordings.session_id
    ) as session_recordings
    {event_filter_aggregate_where_clause}
    ORDER BY start_time DESC
    LIMIT %(limit)s OFFSET %(offset)s
    """

    def _has_entity_filters(self):
        return self._filter.entities and len(self._filter.entities) > 0

    def _get_limit(self):
        return self._filter.limit or self.SESSION_RECORDINGS_DEFAULT_LIMIT

    # We want to select events beyond the range of the recording to handle the case where
    # a recording spans the time boundaries
    def _get_events_timestamp_clause(self) -> Tuple[Dict[str, Any], str]:
        timestamp_clause = ""
        timestamp_params = {}
        if self._filter.date_from:
            timestamp_clause += "\nAND timestamp >= %(event_start_time)s"
            timestamp_params["event_start_time"] = self._filter.date_from - timedelta(hours=12)
        if self._filter.date_to:
            timestamp_clause += "\nAND timestamp <= %(event_end_time)s"
            timestamp_params["event_end_time"] = self._filter.date_to + timedelta(hours=12)
        return timestamp_params, timestamp_clause

    def _get_recording_start_time_clause(self) -> Tuple[Dict[str, Any], str]:
        """SQL restricting recordings whose start_time falls inside the filter's date range."""
        start_time_clause = ""
        start_time_params = {}
        if self._filter.date_from:
            start_time_clause += "\nAND start_time >= %(start_time)s"
            start_time_params["start_time"] = self._filter.date_from
        if self._filter.date_to:
            start_time_clause += "\nAND start_time <= %(end_time)s"
            start_time_params["end_time"] = self._filter.date_to
        return start_time_params, start_time_clause

    def _get_distinct_id_clause(self) -> Tuple[Dict[str, Any], str]:
        """SQL restricting recordings to the distinct_ids of the person in the filter (if any)."""
        distinct_id_clause = ""
        distinct_id_params = {}
        if self._filter.person_uuid:
            person = Person.objects.get(uuid=self._filter.person_uuid)
            distinct_id_clause = f"AND distinct_id IN (SELECT distinct_id from posthog_persondistinctid WHERE person_id = %(person_id)s AND team_id = %(team_id)s)"
            distinct_id_params = {"person_id": person.pk, "team_id": self._team.pk}
        return distinct_id_params, distinct_id_clause

    def _get_duration_clause(self) -> Tuple[Dict[str, Any], str]:
        """SQL restricting recordings by minimum/maximum duration (gt -> '>', anything else -> '<')."""
        duration_clause = ""
        duration_params = {}
        if self._filter.recording_duration_filter:
            if self._filter.recording_duration_filter.operator == "gt":
                operator = ">"
            else:
                operator = "<"
            duration_clause = "\nAND duration {operator} %(recording_duration)s".format(operator=operator)
            duration_params = {
                "recording_duration": self._filter.recording_duration_filter.value,
            }
        return duration_params, duration_clause

    def _get_events_query(self) -> Tuple[str, list]:
        """Build (as a fully interpolated SQL string) the events query for entity filters.

        Each entity gets a boolean column `entity_<i>` marking rows that match it;
        returns the SQL plus the list of those column keys.
        """
        events: Union[EventManager, QuerySet] = Event.objects.filter(team=self._team).order_by("-timestamp").only(
            "distinct_id", "timestamp"
        )
        # Widen the window by 12h on each side, matching _get_events_timestamp_clause.
        if self._filter.date_from:
            events = events.filter(timestamp__gte=self._filter.date_from - timedelta(hours=12))
        if self._filter.date_to:
            events = events.filter(timestamp__lte=self._filter.date_to + timedelta(hours=12))

        keys = []
        event_q_filters = []
        for i, entity in enumerate(self._filter.entities):
            key = f"entity_{i}"
            q_filter = entity_to_Q(entity, self._team.pk)
            event_q_filters.append(q_filter)
            events = events.annotate(**{key: ExpressionWrapper(q_filter, output_field=BooleanField())})
            keys.append(key)

        # Keep only events matching at least one entity.
        combined_event_q_filter = Q()
        for events_q_filter in event_q_filters:
            combined_event_q_filter |= events_q_filter

        events = events.filter(combined_event_q_filter)
        events = events.values_list("distinct_id", "timestamp", *keys)

        with connection.cursor() as cursor:
            # mogrify interpolates params so the SQL can be embedded in the outer template.
            event_query = cursor.mogrify(*events.query.sql_with_params()).decode("utf-8")

        return event_query, keys

    def _get_events_query_with_aggregate_clauses(self) -> EventsQueryWithAggregateClausesSQL:
        """Wrap the events query with per-entity SUM(...) columns and a WHERE requiring each > 0."""
        event_query, keys = self._get_events_query()
        aggregate_select_clause = ""
        aggregate_having_conditions = []
        for key in keys:
            aggregate_field_name = f"count_{key}"
            aggregate_select_clause += f"\n, SUM(CASE WHEN {key} THEN 1 ELSE 0 END) as {aggregate_field_name}"
            aggregate_having_conditions.append(f"{aggregate_field_name} > 0")

        aggregate_where_clause = f"WHERE {' AND '.join(aggregate_having_conditions)}"

        return EventsQueryWithAggregateClausesSQL(event_query, {}, aggregate_select_clause, aggregate_where_clause)

    def _build_query(self) -> Tuple[str, Dict[str, Any]]:
        """Assemble the final SQL string and its parameter dict."""
        # One more is added to the limit to check if there are more results available
        limit = self._get_limit() + 1
        offset = self._filter.offset or 0
        base_params = {"team_id": self._team.pk, "limit": limit, "offset": offset}
        events_timestamp_params, events_timestamp_clause = self._get_events_timestamp_clause()
        recording_start_time_params, recording_start_time_clause = self._get_recording_start_time_clause()
        distinct_id_params, distinct_id_clause = self._get_distinct_id_clause()
        duration_params, duration_clause = self._get_duration_clause()

        core_session_recording_query = self._core_session_recording_query.format(
            recording_duration_select_statement=self._recording_duration_select_statement,
            recording_full_snapshot_select_statement=self._recording_full_snapshot_select_statement,
            session_recording_event_table=self._session_recording_event_table,
            distinct_id_clause=distinct_id_clause,
            events_timestamp_clause=events_timestamp_clause,
            recording_start_time_clause=recording_start_time_clause,
            duration_clause=duration_clause,
        )
        params = {
            **base_params,
            **distinct_id_params,
            **events_timestamp_params,
            **duration_params,
            **recording_start_time_params,
        }

        if self._has_entity_filters():
            (
                events_query,
                event_query_params,
                aggregate_select_clause,
                aggregate_where_clause,
            ) = self._get_events_query_with_aggregate_clauses()
            return (
                self._session_recordings_query_with_entity_filter.format(
                    session_recording_select_statements=self._session_recording_select_statements,
                    core_session_recording_query=core_session_recording_query,
                    events_query=events_query,
                    event_filter_aggregate_select_clause=aggregate_select_clause,
                    event_filter_aggregate_where_clause=aggregate_where_clause,
                ),
                {**params, **event_query_params},
            )
        return (
            self._limited_session_recordings_query.format(core_session_recording_query=core_session_recording_query),
            params,
        )

    def _data_to_return(self, results: List[Any]) -> List[Dict[str, Any]]:
        # Rows arrive as namedtuples (see run()); expose them as plain dicts.
        return [row._asdict() for row in results]

    def _paginate_results(self, session_recordings) -> SessionRecordingQueryResult:
        # We queried limit+1 rows; an extra row means more pages exist.
        limit = self._get_limit()
        more_recordings_available = False
        if len(session_recordings) > limit:
            more_recordings_available = True
            session_recordings = session_recordings[0:limit]
        return SessionRecordingQueryResult(session_recordings, more_recordings_available)

    def run(self, *args, **kwargs) -> SessionRecordingQueryResult:
        """Execute the built query and return paginated recording rows."""
        with connection.cursor() as cursor:
            query, query_params = self._build_query()
            cursor.execute(query, query_params)
            query_results = namedtuplefetchall(cursor)
        session_recordings = self._data_to_return(query_results)
        return self._paginate_results(session_recordings)

    __repr__ = sane_repr("_team", "_filter")
class Cohort(models.Model):
    """A saved group of persons, either computed from ``groups`` filters or static.

    Membership is materialized into the ``CohortPeople`` through-table; the
    ``version``/``pending_version`` pair allows recalculation without clobbering
    the currently-live membership until the new version is complete.
    """

    name: models.CharField = models.CharField(max_length=400, null=True, blank=True)
    description: models.CharField = models.CharField(max_length=1000, blank=True)
    team: models.ForeignKey = models.ForeignKey("Team", on_delete=models.CASCADE)
    deleted: models.BooleanField = models.BooleanField(default=False)
    # List of filter groups (dicts with e.g. "action_id" or "properties" keys).
    groups: models.JSONField = models.JSONField(default=list)
    people: models.ManyToManyField = models.ManyToManyField("Person", through="CohortPeople")
    # Currently-live membership version; pending_version is the one being computed.
    version: models.IntegerField = models.IntegerField(blank=True, null=True)
    pending_version: models.IntegerField = models.IntegerField(blank=True, null=True)
    count: models.IntegerField = models.IntegerField(blank=True, null=True)
    created_by: models.ForeignKey = models.ForeignKey("User", on_delete=models.SET_NULL, blank=True, null=True)
    created_at: models.DateTimeField = models.DateTimeField(default=timezone.now, blank=True, null=True)
    is_calculating: models.BooleanField = models.BooleanField(default=False)
    last_calculation: models.DateTimeField = models.DateTimeField(blank=True, null=True)
    # Consecutive failed calculations; reset to 0 on success.
    errors_calculating: models.IntegerField = models.IntegerField(default=0)
    # Static cohorts are populated by explicit user lists, never recalculated.
    is_static: models.BooleanField = models.BooleanField(default=False)

    objects = CohortManager()

    def get_analytics_metadata(self):
        """Summarize this cohort for product-analytics reporting (no PII)."""
        action_groups_count: int = 0
        properties_groups_count: int = 0
        for group in self.groups:
            action_groups_count += 1 if group.get("action_id") else 0
            properties_groups_count += 1 if group.get("properties") else 0
        return {
            "name_length": len(self.name) if self.name else 0,
            "person_count_precalc": self.people.count(),
            "groups_count": len(self.groups),
            "action_groups_count": action_groups_count,
            "properties_groups_count": properties_groups_count,
            "deleted": self.deleted,
        }

    def calculate_people(self, new_version: int, batch_size=10000, pg_batch_size=1000):
        """Copy membership for ``new_version`` from ClickHouse into Postgres.

        Pages person ids out of ClickHouse ``batch_size`` at a time and bulk
        inserts ``CohortPeople`` rows in ``pg_batch_size`` chunks. On any error
        the partially-written rows for ``new_version`` are deleted and the
        exception re-raised. No-op for static cohorts.
        """
        if self.is_static:
            return
        try:
            # Paginate fetch batch_size from clickhouse and paginate insert pg_batch_size into postgres
            cursor = 0
            persons = self._clickhouse_persons_query(batch_size=batch_size, offset=cursor)
            while persons:
                # TODO: Insert from a subquery instead of pulling retrieving
                # then sending large lists of data backwards and forwards.
                to_insert = [
                    CohortPeople(person_id=person_id, cohort_id=self.pk, version=new_version)
                    # Just pull out the person id as we don't need anything
                    # else.
                    for person_id in persons.values_list("id", flat=True)
                ]
                # TODO: make sure this bulk_create doesn't actually return anything
                CohortPeople.objects.bulk_create(to_insert, batch_size=pg_batch_size)

                cursor += batch_size
                persons = self._clickhouse_persons_query(batch_size=batch_size, offset=cursor)
                # Throttle between batches — presumably to reduce DB load; confirm before changing.
                time.sleep(5)

        except Exception as err:
            # Clear the pending version people if there's an error
            batch_delete_cohort_people(self.pk, new_version)
            raise err

    def calculate_people_ch(self, pending_version):
        """Recalculate cohort membership in ClickHouse, then optionally precalculate in Postgres.

        Postgres precalculation only happens when the cohort is referenced by a
        feature flag. ``is_calculating`` is always cleared (and the model saved)
        in the ``finally`` block, whether the calculation succeeded or not.
        """
        from ee.clickhouse.models.cohort import recalculate_cohortpeople
        from posthog.tasks.cohorts_in_feature_flag import get_cohort_ids_in_feature_flags

        logger.info("cohort_calculation_started", id=self.pk, current_version=self.version, new_version=pending_version)
        start_time = time.monotonic()

        try:
            count = recalculate_cohortpeople(self)

            # only precalculate if used in feature flag
            ids = get_cohort_ids_in_feature_flags()

            if self.pk in ids:
                self.calculate_people(new_version=pending_version)
                # Update filter to match pending version if still valid
                # (guards against a newer concurrent calculation having already
                # bumped the version past pending_version).
                Cohort.objects.filter(pk=self.pk).filter(
                    Q(version__lt=pending_version) | Q(version__isnull=True)
                ).update(version=pending_version, count=count)
                self.refresh_from_db()
            else:
                self.count = count

            self.last_calculation = timezone.now()
            self.errors_calculating = 0
        except Exception:
            # F() expression avoids a read-modify-write race on the error counter.
            self.errors_calculating = F("errors_calculating") + 1
            logger.warning(
                "cohort_calculation_failed",
                id=self.pk,
                current_version=self.version,
                new_version=pending_version,
                exc_info=True,
            )
            raise
        finally:
            self.is_calculating = False
            self.save()

        logger.info(
            "cohort_calculation_completed",
            id=self.pk,
            version=pending_version,
            duration=(time.monotonic() - start_time),
        )

    def insert_users_by_list(self, items: List[str]) -> None:
        """
        Items can be distinct_id or email
        Important! Does not insert into clickhouse (the static-cohort table in
        ClickHouse is written via ``insert_static_cohort``; CohortPeople rows are
        written here via raw SQL).
        """
        batchsize = 1000
        from ee.clickhouse.models.cohort import insert_static_cohort

        try:
            cursor = connection.cursor()
            for i in range(0, len(items), batchsize):
                batch = items[i:i + batchsize]
                # Persons on this team matching the given distinct_ids, excluding
                # those already in this cohort.
                persons_query = (
                    Person.objects.filter(team_id=self.team_id)
                    .filter(Q(persondistinctid__team_id=self.team_id, persondistinctid__distinct_id__in=batch))
                    .exclude(cohort__id=self.id)
                )
                insert_static_cohort([p for p in persons_query.values_list("uuid", flat=True)], self.pk, self.team)
                # Reuse the ORM-generated SELECT as the VALUES source of an
                # INSERT by splicing cohort_id/version literals before FROM.
                sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
                query = UPDATE_QUERY.format(
                    cohort_id=self.pk,
                    values_query=sql.replace(
                        'FROM "posthog_person"',
                        f', {self.pk}, {self.version or "NULL"} FROM "posthog_person"',
                        1,
                    ),
                )
                cursor.execute(query, params)

            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            # In production: record the failure and report it, but don't crash.
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def insert_users_list_by_uuid(self, items: List[str]) -> None:
        """Add persons by UUID to this cohort in Postgres (same splice trick as
        ``insert_users_by_list``, but matching on ``Person.uuid`` and without the
        ClickHouse static-cohort insert)."""
        batchsize = 1000
        try:
            cursor = connection.cursor()
            for i in range(0, len(items), batchsize):
                batch = items[i:i + batchsize]
                persons_query = (
                    Person.objects.filter(team_id=self.team_id).filter(uuid__in=batch).exclude(cohort__id=self.id)
                )
                sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
                query = UPDATE_QUERY.format(
                    cohort_id=self.pk,
                    values_query=sql.replace(
                        'FROM "posthog_person"',
                        f', {self.pk}, {self.version or "NULL"} FROM "posthog_person"',
                        1,
                    ),
                )
                cursor.execute(query, params)

            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
        except Exception as err:
            if settings.DEBUG:
                raise err
            self.is_calculating = False
            self.errors_calculating = F("errors_calculating") + 1
            self.save()
            capture_exception(err)

    def __str__(self):
        return self.name

    def _clickhouse_persons_query(self, batch_size=10000, offset=0):
        """Fetch one page of this cohort's person UUIDs from ClickHouse and
        return the matching Postgres ``Person`` queryset."""
        from ee.clickhouse.models.cohort import get_person_ids_by_cohort_id

        uuids = get_person_ids_by_cohort_id(team=self.team, cohort_id=self.pk, limit=batch_size, offset=offset)
        return Person.objects.filter(uuid__in=uuids, team=self.team)

    __repr__ = sane_repr("id", "name", "last_calculation")
class Funnel(BaseQuery):
    """Postgres funnel query: which persons completed the filter's ordered steps.

    The query is built by taking Django-ORM-generated SQL for each step and
    chaining the steps together with ``LEFT JOIN LATERAL``, so each step can
    reference the previous step's ``person_id`` and ``step_ts``.
    """

    _filter: Filter
    _team: Team

    def __init__(self, filter: Filter, team: Team) -> None:
        self._filter = filter
        self._team = team

    def _gen_lateral_bodies(self, within_time: Optional[str] = None):
        """Build one psycopg2 ``sql.SQL`` body per funnel step, keyed "step_N".

        Each body starts life as ORM-generated SQL containing sentinel literals
        ('1234321', a year-2000 timestamp, 99999999) which are then rewritten
        into ``{prev_step_person_id}`` / ``{prev_step_ts}`` placeholders and a
        join on ``posthog_persondistinctid``. ``within_time``, if given, is an
        SQL interval literal constraining each step to occur within that window
        of the previous step.
        """
        annotations = {}
        for index, step in enumerate(self._filter.entities):
            filter_key = "event" if step.type == TREND_FILTER_TYPE_EVENTS else "action__pk"
            # person_id is a dummy annotation (sentinel 99999999) so the column
            # exists in the SELECT; it is string-replaced with pdi.person_id below.
            event = (
                Event.objects.values("distinct_id")
                .annotate(
                    step_ts=Min("timestamp"),
                    person_id=Value("99999999", IntegerField()),
                )
                .filter(
                    self._filter.date_filter_Q,
                    **{filter_key: step.id},
                    team_id=self._team.pk,
                    # Sentinel distinct_id filter for non-first steps; replaced
                    # with the previous step's person_id placeholder below.
                    **({"distinct_id": "1234321"} if index > 0 else {}),
                    # Sentinel timestamp filter for non-first steps; replaced
                    # with the previous step's step_ts placeholder below.
                    **(
                        {
                            "timestamp__gte": timezone.now().replace(
                                year=2000, month=1, day=1, hour=0, minute=0, second=0, microsecond=0
                            )
                        }
                        if index > 0
                        else {}
                    ),
                )
                .filter(
                    properties_to_Q(
                        self._filter.properties,
                        team_id=self._team.pk,
                    )
                )
                .filter(properties_to_Q(step.properties, team_id=self._team.pk))
            )
            # mogrify interpolates the ORM's bind params into a concrete SQL string.
            with connection.cursor() as cursor:
                event_string = cursor.mogrify(*event.query.sql_with_params())
            # Replace placeholders injected by the Django ORM
            # We do this because the Django ORM doesn't easily allow us to parameterize sql identifiers
            # This is probably the most hacky part of the entire query generation
            event_string = (
                event_string.decode("utf-8")
                .replace("'1234321'", "{prev_step_person_id}")
                .replace(
                    "'2000-01-01T00:00:00+00:00'::timestamptz",
                    "{prev_step_ts} %s"
                    % (
                        ' AND "posthog_event"."timestamp" < "step_{}"."step_ts" + {}'.format(index - 1, within_time)
                        if within_time
                        else ""
                    ),
                )
                .replace('"posthog_event"."distinct_id"', '"pdi"."person_id"')
                .replace("99999999", '"pdi"."person_id"')
                .replace(', "pdi"."person_id" AS "person_id"', "")
            )
            event_string = re.sub(
                # accommodate for identifier e.g. W0 so that it still ends up right after `FROM posthog_event`
                # not after `ON pdi.distinct_id = posthog_event.distinct_id`
                r'FROM "posthog_event"( [A-Z][0-9])?',
                r"FROM posthog_event\1 JOIN posthog_persondistinctid pdi "
                # NOTE: here we are joining on the unique identifier of the
                # persondistinctid table, i.e. (team_id, distinct_id)
                r"ON pdi.distinct_id = posthog_event.distinct_id AND pdi.team_id = posthog_event.team_id",
                event_string,
            )
            query = sql.SQL(event_string)
            annotations["step_{}".format(index)] = query
        return annotations

    def _serialize_step(self, step: Entity, count: int, people: Optional[List[uuid.UUID]] = None) -> Dict[str, Any]:
        """Render one funnel step as the API response dict."""
        if step.type == TREND_FILTER_TYPE_ACTIONS:
            name = step.get_action().name
        else:
            name = step.id
        return {
            "action_id": step.id,
            "name": name,
            "custom_name": step.custom_name,
            "order": step.order,
            "people": people if people else [],
            "count": count,
            "type": step.type,
        }

    def _build_query(self, within_time: Optional[str] = None):
        """Build query using lateral joins using a combination of Django
        generated SQL and sql built using psycopg2

        Returns the final SQL string selecting, per person, the timestamp of
        each completed step (``step_0`` … ``step_N`` columns, NULL when not
        reached).
        """
        query_bodies = self._gen_lateral_bodies(within_time=within_time)
        ON_TRUE = "ON TRUE"
        LEFT_JOIN_LATERAL = "LEFT JOIN LATERAL"
        # Single-step funnels have no trailing join keyword.
        LAT_JOIN_BODY = (
            """({query}) {step} {on_true} {join}""" if len(query_bodies) > 1 else """({query}) {step} {on_true} """
        )
        steps = [sql.Identifier(step) for step, _ in query_bodies.items()]
        # "step_N"."step_ts" as "step_N" — one select column per step.
        select_steps = [
            sql.Composed([
                step,
                sql.SQL("."),
                sql.Identifier("step_ts"),
                sql.SQL(" as "),
                step,
            ])
            for step in steps
        ]
        lateral_joins = []
        i = 0
        for step, qb in query_bodies.items():
            if i > 0:
                # For each step after the first we must reference the previous step's person_id and step_ts
                q = qb.format(
                    prev_step_person_id=sql.Composed([steps[i - 1], sql.SQL("."), sql.Identifier("person_id")]),
                    prev_step_ts=sql.Composed([steps[i - 1], sql.SQL("."), sql.Identifier("step_ts")]),
                )
            if i == 0:
                # Generate first lateral join body
                # The join conditions are different for first, middles, and last
                # For the first step we include the alias, lateral join, but not 'ON TRUE'
                base_body = sql.SQL(LAT_JOIN_BODY).format(
                    query=qb,
                    step=sql.SQL(step),
                    on_true=sql.SQL(""),
                    join=sql.SQL(LEFT_JOIN_LATERAL),
                )
            elif i == len(query_bodies) - 1:
                # Generate last lateral join body
                # The join conditions are different for first, middles, and last
                # For the last step we include the alias, 'ON TRUE', but not another `LATERAL JOIN`
                base_body = sql.SQL(LAT_JOIN_BODY).format(
                    query=q,
                    step=sql.SQL(step),
                    on_true=sql.SQL(ON_TRUE),
                    join=sql.SQL(""),
                )
            else:
                # Generate middle lateral join body
                # The join conditions are different for first, middles, and last
                # For the middle steps we include the alias, 'ON TRUE', and `LATERAL JOIN`
                base_body = sql.SQL(LAT_JOIN_BODY).format(
                    query=q,
                    step=sql.SQL(step),
                    on_true=sql.SQL(ON_TRUE),
                    join=sql.SQL(LEFT_JOIN_LATERAL),
                )
            lateral_joins.append(base_body)
            i += 1
        event_chain_query = sql.SQL(" ").join(lateral_joins).as_string(connection.connection)

        # team.pk is a DB-assigned integer, so the f-string interpolation here
        # is not an injection vector.
        query = f"""
            SELECT DISTINCT ON (person.id)
                person.uuid,
                person.created_at,
                person.team_id,
                person.properties,
                person.is_user_id,
                {sql.SQL(",").join(select_steps).as_string(connection.connection)}
            FROM posthog_person person
            JOIN posthog_persondistinctid pdi ON pdi.person_id = person.id
            JOIN {event_chain_query}
            -- join on person_id for the first event.
            -- NOTE: there is some implicit coupling here in that I am
            -- assuming the name of the first event select is "step_0".
            -- Maybe worth cleaning up in the future
            ON person.id = step_0.person_id
            WHERE person.team_id = {self._team.pk}
            AND person.id IS NOT NULL
            ORDER BY person.id, step_0.step_ts ASC
        """
        return query

    def _build_trends_query(self, filter: Filter) -> sql.SQL:
        """Wrap the steps query in a per-interval GROUP BY for trend display."""
        # TODO: Only select from_step and to_step in _build_steps_query
        particular_steps = (
            sql.SQL(f'COUNT("step_{index}") as "step_{index}_count"') for index in range(len(filter.entities))
        )
        trends_query = sql.SQL(
            """
            SELECT date_trunc({interval}, {interval_field}) as "date", {particular_steps}
            FROM ( {steps_query} ) steps_at_dates
            GROUP BY "date"
            """
        ).format(
            interval=sql.Literal(filter.interval),
            particular_steps=sql.SQL(",\n").join(particular_steps),
            # Steps must complete within 1 day of the previous step for trends.
            steps_query=sql.SQL(self._build_query(within_time="'1 day'")),
            # For weekly buckets, shift by a day so date_trunc('week', ...)
            # groups Sunday-started weeks as intended.
            interval_field=sql.SQL("step_0")
            if filter.interval != "week"
            else sql.SQL("(\"step_0\" + interval '1 day') AT TIME ZONE 'UTC'"),
        )
        return trends_query

    def _get_last_step_attr(self, step: object) -> int:
        """Count of persons who reached the final step in this grouped row
        (0 for single-step funnels, where conversion is undefined)."""
        if len(self._filter.entities) == 1:
            return 0
        return getattr(step, "step_{}_count".format(len(self._filter.entities) - 1))

    def _get_trends(self) -> List[Dict[str, Any]]:
        """Run the trends variant and serialize conversion % per date bucket."""
        serialized: Dict[str, Any] = {"count": 0, "data": [], "days": [], "labels": []}
        with connection.cursor() as cursor:
            qstring = self._build_trends_query(self._filter).as_string(cursor.connection)
            cursor.execute(qstring)
            steps_at_dates = namedtuplefetchall(cursor)

        date_range = get_daterange(
            self._filter.date_from or steps_at_dates[0].date, self._filter.date_to, frequency=self._filter.interval
        )
        # Conversion = people reaching last step / people reaching first step, as %.
        data_array = [
            {"date": step.date, "count": round(self._get_last_step_attr(step) / step.step_0_count * 100)}
            for step in steps_at_dates
        ]

        if self._filter.interval == "week":
            # Normalize each bucket to the start of its (Sunday-based) week.
            for df in data_array:
                df["date"] -= timedelta(days=df["date"].weekday() + 1)
        elif self._filter.interval == "month":
            for df in data_array:
                df["date"] = df["date"].replace(day=1)
        for df in data_array:
            df["date"] = df["date"].replace(tzinfo=pytz.utc).isoformat()

        # Dense-fill the requested range: dates with no rows get 0.
        datewise_data = {d["date"]: d["count"] for d in data_array}
        values = [(key, datewise_data.get(key.isoformat(), 0)) for key in date_range]

        for item in values:
            serialized["days"].append(item[0])
            serialized["data"].append(item[1])
            serialized["labels"].append(format_label_date(item[0], self._filter.interval))
        return [serialized]

    def data_to_return(self, results: List[Person]) -> List[Dict[str, Any]]:
        """Serialize the raw person rows into per-step counts, top people and
        average step-to-step times.

        ``results`` rows are expected to carry ``step_N`` timestamp attributes
        (NULL/None when the person did not reach step N) — produced by
        ``_build_query``.
        """
        steps = []
        # average_time[N] accumulates time from step N-1 to step N (N >= 1).
        average_time: Dict[int, Dict[str, Any]] = {}
        for index, funnel_step in enumerate(self._filter.entities, start=0):
            if index != 0:
                average_time[index] = {"total_time": timedelta(0), "total_people": 0}

        person_score: Dict = defaultdict(int)
        for index, funnel_step in enumerate(self._filter.entities):
            relevant_people = []
            for person in results:
                if (
                    index > 0
                    and getattr(person, "step_{}".format(index))
                    and getattr(person, "step_{}".format(index - 1))
                ):
                    average_time[index]["total_time"] += getattr(person, "step_{}".format(index)) - getattr(
                        person, "step_{}".format(index - 1)
                    )
                    average_time[index]["total_people"] += 1

                if getattr(person, "step_{}".format(index)):
                    # Score counts how many steps this person completed, used
                    # to rank the "people" list below.
                    person_score[person.uuid] += 1
                    relevant_people.append(person.uuid)

            steps.append(
                self._serialize_step(funnel_step, len(relevant_people) if relevant_people else 0, relevant_people)
            )

        if len(steps) > 0:
            # Keep only the top 100 people per step, deepest-in-funnel first.
            for index, _ in enumerate(steps):
                steps[index]["people"] = sorted(steps[index]["people"], key=lambda p: person_score[p], reverse=True)[
                    0:100
                ]

        # average_time[N] is attached to the step *preceding* the transition.
        for index in average_time.keys():
            steps[index - 1]["average_time"] = (
                (average_time[index]["total_time"].total_seconds() / average_time[index]["total_people"])
                if average_time[index]["total_people"] > 0
                else 0
            )

        return steps

    def run(self, *args, **kwargs) -> List[Dict[str, Any]]:
        """
        Builds and runs a query to get all persons that have been in the
        funnel steps defined by `self._filter.entities`. For example, entities
        may be defined as:

            1. event with event name "user signed up"
            2. event with event name "user looked at report"

        For a person to match they have to have gone through all `entities`
        in order. We also only return one such chain of entities, the
        earliest one we find.
        """
        # If no steps are defined, then there's no point in querying the database
        if len(self._filter.entities) == 0:
            return []

        if self._filter.display == TRENDS_LINEAR:
            return self._get_trends()

        with connection.cursor() as cursor:
            # Then we build a query to query for them in order
            qstring = self._build_query(within_time=None)
            cursor.execute(qstring)
            results = namedtuplefetchall(cursor)
            return self.data_to_return(results)

    __repr__ = sane_repr("_team", "_filter")