def _format_total_volume_query(self, entity: Entity, filter: Filter, team_id: int) -> List[Dict[str, Any]]:
    """Build the time-series payload(s) for an entity's total event volume."""
    event_qs = process_entity_for_events(
        entity=entity,
        team_id=team_id,
        order_by="-timestamp",
    )
    interval_buckets, filtered_events = aggregate_by_interval(
        events=event_qs,
        team_id=team_id,
        entity=entity,
        filter=filter,
    )
    results: List[Dict[str, Any]] = []
    # One series per aggregation bucket; the bucket keys are not needed here.
    for bucket in interval_buckets.values():
        series = append_data(dates_filled=list(bucket.items()), interval=filter.interval)
        if filter.display in TRENDS_DISPLAY_BY_VALUE:
            # "By value" displays collapse the whole series into one aggregate number.
            series["aggregated_value"] = get_aggregate_total(filtered_events, entity)
        results.append(series)
    return results
def _serialize_breakdown(self, entity: Entity, filter: Filter, team_id: int) -> List[Dict[str, Any]]:
    """Build one time series per breakdown value for the given entity."""
    event_qs = process_entity_for_events(
        entity=entity,
        team_id=team_id,
        order_by="-timestamp",
    )
    breakdown_field = "properties__{}".format(filter.breakdown) if filter.breakdown else None
    interval_buckets, filtered_events = aggregate_by_interval(
        events=event_qs,
        team_id=team_id,
        entity=entity,
        filter=filter,
        breakdown=breakdown_field,
    )
    results: List[Dict[str, Any]] = []
    for breakdown_value, bucket in interval_buckets.items():
        series = append_data(dates_filled=list(bucket.items()), interval=filter.interval)
        # "Total" is the synthetic catch-all bucket and carries no breakdown label.
        if breakdown_value != "Total":
            series.update(breakdown_label(entity, breakdown_value))
        if filter.display in TRENDS_DISPLAY_BY_VALUE:
            series["aggregated_value"] = get_aggregate_breakdown_total(
                filtered_events, filter, entity, team_id, series["breakdown_value"]
            )
        results.append(series)
    return results
def _serialize_entity(self, entity: Entity, filter: Filter, team_id: int) -> List[Dict[str, Any]]:
    """Serialize an entity into trend chart series, one per breakdown value."""
    if filter.interval is None:
        filter.interval = "day"  # default granularity when the client sent none
    template: Dict[str, Any] = {
        "action": entity.to_dict(),
        "label": entity.name,
        "count": 0,
        "data": [],
        "labels": [],
        "days": [],
    }
    events = process_entity_for_events(entity=entity, team_id=team_id, order_by="-timestamp")
    events = events.filter(filter_events(team_id, filter, entity))
    buckets = aggregate_by_interval(
        filtered_events=events,
        team_id=team_id,
        entity=entity,
        filter=filter,
        breakdown="properties__{}".format(filter.breakdown) if filter.breakdown else None,
    )
    response: List[Dict[str, Any]] = []
    for breakdown_value, bucket in buckets.items():
        series = copy.deepcopy(template)
        # The synthetic "Total" bucket keeps the entity's own label.
        if breakdown_value != "Total":
            series.update(breakdown_label(entity, breakdown_value))
        series.update(append_data(dates_filled=list(bucket.items()), interval=filter.interval))
        if filter.display == TRENDS_CUMULATIVE:
            # Cumulative display shows a running total instead of per-interval counts.
            series["data"] = np.cumsum(series["data"])
        response.append(series)
    return response
def _format_normal_query(self, entity: Entity, filter: Filter, team_id: int) -> List[Dict[str, Any]]:
    """Build plain (non-breakdown) time series for the entity."""
    events = process_entity_for_events(entity=entity, team_id=team_id, order_by="-timestamp")
    events = events.filter(filter_events(team_id, filter, entity))
    buckets = aggregate_by_interval(
        filtered_events=events,
        team_id=team_id,
        entity=entity,
        filter=filter,
    )
    # One formatted series per bucket; bucket keys are irrelevant here.
    return [
        append_data(dates_filled=list(bucket.items()), interval=filter.interval)
        for bucket in buckets.values()
    ]
def calculate_avg(self, filter: Filter, team: Team):
    """Average session length time series for the team's filtered sessions.

    Returns a one-element list with the chart payload, or [] when no
    interval saw any sessions at all.
    """
    parsed_date_from, parsed_date_to, _ = parse_timestamps(filter, team.pk)
    prop_filters, params = parse_prop_clauses(
        filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts
    )
    interval_notation = get_trunc_func_ch(filter.interval)
    num_intervals, seconds_in_interval, _ = get_time_diff(
        filter.interval or "day", filter.date_from, filter.date_to, team.pk
    )
    entity_conditions, entity_params = entity_query_conditions(filter, team)
    if not entity_conditions:
        # Default condition when no entities were specified.
        entity_conditions = ["event != '$feature_flag_called'"]
    params = {**params, **entity_params}
    entity_query = " OR ".join(entity_conditions)
    sessions_sql = SESSIONS_NO_EVENTS_SQL.format(
        team_id=team.pk,
        date_from=parsed_date_from,
        date_to=parsed_date_to,
        filters=prop_filters,
        sessions_limit="",
        entity_filter=f"AND ({entity_query})",
    )
    per_period_sql = AVERAGE_PER_PERIOD_SQL.format(sessions=sessions_sql, interval=interval_notation)
    null_sql = NULL_SQL.format(
        date_to=filter.date_to.strftime("%Y-%m-%d 00:00:00"),
        interval=interval_notation,
        num_intervals=num_intervals,
        seconds_in_interval=seconds_in_interval,
    )
    final_sql = AVERAGE_SQL.format(sessions=per_period_sql, null_sql=null_sql)
    response = sync_execute(final_sql, {**params, "team_id": team.pk})
    values = self.clean_values(filter, response)
    series = append_data(values, interval=filter.interval, math=None)
    scaled, _ = scale_time_series(series["data"])
    series["data"] = scaled
    # Overall average is taken only over intervals that actually saw sessions.
    total = sum(val[1] for val in values)
    if total == 0:
        return []
    valid_intervals = sum(1 for val in values if val[1])
    series.update(self._format_avg(total / valid_intervals if valid_intervals else 0))
    return [series]
def _session_avg(
    self, base_query: str, params: Tuple[Any, ...], date_filter: Dict[str, datetime], interval: Optional[str]
) -> List[Dict[str, Any]]:
    """Average session length per interval bucket, plus an overall average.

    base_query: SQL producing sessionized (global_session_id, timestamp) rows.
    params: query parameters for base_query.
    date_filter: expects "timestamp__gte"/"timestamp__lte" datetimes bounding the range.
    interval: one of minute/hour/day/week/month (anything else falls back to day).

    Returns a one-element list with the chart payload, or [] when no sessions matched.
    """

    def _determine_interval(name: Optional[str]) -> Tuple[str, str]:
        # Maps the API interval name to (SQL date_trunc unit, pandas frequency alias).
        mapping = {
            "minute": ("minute", "min"),
            "hour": ("hour", "H"),
            "week": ("week", "W"),
            "month": ("month", "M"),
        }
        return mapping.get(name, ("day", "D"))

    interval, interval_freq = _determine_interval(interval)
    # Per-bucket AVG/SUM/COUNT of session lengths (seconds); one row per bucket
    # that actually contains sessions. Session length is MAX-MIN timestamp per
    # global_session_id. (SQL whitespace normalized; semantics unchanged.)
    average_length_time = (
        "SELECT date_trunc('{interval}', timestamp) as start_time,"
        " AVG(length) AS average_session_length_per_day,"
        " SUM(length) AS total_session_length_per_day,"
        " COUNT(1) as num_sessions_per_day"
        " FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))"
        " AS length,"
        " MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time"
    ).format(base_query, interval=interval)
    cursor = connection.cursor()
    cursor.execute(average_length_time, params)
    time_series_avg = cursor.fetchall()
    # Fix: with zero rows the DataFrame below has no "date" column and the
    # function crashed with a KeyError; report "no data" instead.
    if len(time_series_avg) == 0:
        return []
    date_range = pd.date_range(date_filter["timestamp__gte"], date_filter["timestamp__lte"], freq=interval_freq)
    df = pd.DataFrame([{"date": a[0], "count": a[1], "breakdown": "Total"} for a in time_series_avg])
    if interval == "week":
        # Align week buckets with the Sunday-based weeks of the date range.
        df["date"] = df["date"].apply(lambda x: x - pd.offsets.Week(weekday=6))
    elif interval == "month":
        # Align month buckets with the month-end labels of the date range.
        df["date"] = df["date"].apply(lambda x: x - pd.offsets.MonthEnd(n=0))
    # Reindex onto the canonical range so empty buckets become 0.
    df_dates = pd.DataFrame(df.groupby("date").mean(), index=date_range)
    df_dates = df_dates.fillna(0)
    # Fix: use .iloc for positional access — bare Series[0] positional indexing
    # is deprecated and breaks on label-indexed rows.
    values = [(key, round(row.iloc[0])) if len(row) > 0 else (key, 0) for key, row in df_dates.iterrows()]
    time_series_data = append_data(values, interval=interval, math=None)
    # Overall average = total session seconds / total number of sessions.
    totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
    overall_average = (totals[0] / totals[1]) if totals else 0
    avg_formatted = friendly_time(overall_average)
    avg_split = avg_formatted.split(" ")
    time_series_data.update(
        {"label": "Average Duration of Session ({})".format(avg_split[1]), "count": int(avg_split[0])}
    )
    time_series_data.update({"chartLabel": "Average Duration of Session (seconds)"})
    return [time_series_data]
def calculate_avg(self, filter: Filter, team: Team):
    """Average session length time series, defaulting to the last 7 days."""
    # Fall back to a default window when the client supplied no explicit range.
    if not filter._date_from:
        filter._date_from = relative_date_parse("-7d")
    if not filter._date_to:
        filter._date_to = timezone.now()
    parsed_date_from, parsed_date_to = parse_timestamps(filter)
    prop_filters, params = parse_prop_clauses("uuid", filter.properties, team)
    interval_notation = get_interval_annotation_ch(filter.interval)
    num_intervals, seconds_in_interval = get_time_diff(
        filter.interval or "day", filter.date_from, filter.date_to
    )
    sessions_sql = SESSIONS_NO_EVENTS_SQL.format(
        team_id=team.pk,
        date_from=parsed_date_from,
        date_to=parsed_date_to,
        filters="{}".format(prop_filters) if filter.properties else "",
        sessions_limit="",
    )
    per_period_sql = AVERAGE_PER_PERIOD_SQL.format(sessions=sessions_sql, interval=interval_notation)
    null_sql = NULL_SQL.format(
        date_to=(filter.date_to or timezone.now()).strftime("%Y-%m-%d 00:00:00"),
        interval=interval_notation,
        num_intervals=num_intervals,
        seconds_in_interval=seconds_in_interval,
    )
    final_sql = AVERAGE_SQL.format(sessions=per_period_sql, null_sql=null_sql)
    response = sync_execute(final_sql, {**params, "team_id": team.pk})
    values = self.clean_values(filter, response)
    series = append_data(values, interval=filter.interval, math=None)
    # Overall average is taken only over intervals that actually saw sessions.
    total = sum(val[1] for val in values)
    if total == 0:
        return []
    valid_intervals = sum(1 for val in values if val[1])
    series.update(self._format_avg(total / valid_intervals if valid_intervals else 0))
    return [series]
def serialize_entity(
    entity: Entity, filter: Filter, params: dict, team_id: int
) -> List[Dict[str, Any]]:
    """Serialize an entity into chart series, honoring Volume vs. Stickiness display."""
    interval = params["interval"] if params.get("interval") is not None else "day"
    template: Dict[str, Any] = {
        "action": entity.to_dict(),
        "label": entity.name,
        "count": 0,
        "data": [],
        "labels": [],
        "days": [],
    }
    response: List[Dict[str, Any]] = []
    # Stickiness needs no ordering; volume queries want newest-first events.
    order_by = None if params.get("shown_as") == "Stickiness" else "-timestamp"
    events = process_entity_for_events(entity=entity, team_id=team_id, order_by=order_by)
    events = events.filter(filter_events(team_id, filter, entity))
    if params.get("shown_as", "Volume") == "Volume":
        breakdown_prop = params.get("breakdown")
        items = aggregate_by_interval(
            filtered_events=events,
            team_id=team_id,
            entity=entity,
            filter=filter,
            interval=interval,
            params=params,
            breakdown="properties__{}".format(breakdown_prop) if breakdown_prop else None,
        )
        for value, item in items.items():
            series = copy.deepcopy(template)
            # The synthetic "Total" bucket keeps the entity's own label.
            if value != "Total":
                series.update(breakdown_label(entity, value))
            series.update(append_data(dates_filled=list(item.items()), interval=interval))
            if filter.display == TRENDS_CUMULATIVE:
                series["data"] = np.cumsum(series["data"])
            response.append(series)
    elif params.get("shown_as") == TRENDS_STICKINESS:
        series = copy.deepcopy(template)
        series.update(
            stickiness(filtered_events=events, entity=entity, filter=filter, team_id=team_id)
        )
        response.append(series)
    return response
def _serialize_entity(self, entity: Entity, filter: Filter, request: request.Request, team: Team) -> List[Dict[str, Any]]:
    """Serialize an entity into chart series (Volume or Stickiness display)."""
    interval = request.GET.get('interval')
    if interval is None:
        interval = 'day'
    template: Dict[str, Any] = {
        'action': {'id': entity.id, 'name': entity.name, 'type': entity.type},
        'label': entity.name,
        'count': 0,
        'data': [],
        'labels': [],
        'days': [],
    }
    response: List[Dict[str, Any]] = []
    # Stickiness needs no ordering; volume queries sort newest-first.
    order_by = None if request.GET.get('shown_as') == 'Stickiness' else '-timestamp'
    events = self._process_entity_for_events(entity=entity, team=team, order_by=order_by)
    events = events.filter(self._filter_events(filter))
    if request.GET.get('shown_as', 'Volume') == 'Volume':
        items = self._aggregate_by_interval(
            filtered_events=events,
            entity=entity,
            filter=filter,
            interval=interval,
            request=request,
            breakdown='properties__{}'.format(request.GET['breakdown']) if request.GET.get('breakdown') else None,
        )
        for value, item in items.items():
            series = copy.deepcopy(template)
            if value != 'Total':
                # Label each breakdown series; empty values render as "undefined".
                series['label'] = '{} - {}'.format(entity.name, value if value else 'undefined')
                series['breakdown_value'] = value
            series.update(append_data(dates_filled=list(item.items()), interval=interval))
            response.append(series)
    elif request.GET['shown_as'] == 'Stickiness':
        series = copy.deepcopy(template)
        series.update(self._stickiness(filtered_events=events, filter=filter))
        response.append(series)
    return response
def _serialize_entity(self, entity: Entity, filter: Filter, request: request.Request, team: Team) -> List[Dict[str, Any]]:
    """Serialize an entity into chart series (Volume or Stickiness display)."""
    interval = request.GET.get('interval')
    if interval is None:
        interval = 'day'
    template: Dict[str, Any] = {
        'action': entity.to_dict(),
        'label': entity.name,
        'count': 0,
        'data': [],
        'labels': [],
        'days': [],
    }
    response: List[Dict[str, Any]] = []
    # Stickiness needs no ordering; volume queries sort newest-first.
    order_by = None if request.GET.get('shown_as') == 'Stickiness' else '-timestamp'
    events = self._process_entity_for_events(entity=entity, team=team, order_by=order_by)
    events = events.filter(self._filter_events(filter, entity))
    if request.GET.get('shown_as', 'Volume') == 'Volume':
        items = self._aggregate_by_interval(
            filtered_events=events,
            team=team,
            entity=entity,
            filter=filter,
            interval=interval,
            request=request,
            breakdown='properties__{}'.format(request.GET['breakdown']) if request.GET.get('breakdown') else None,
        )
        for value, item in items.items():
            series = copy.deepcopy(template)
            # The synthetic "Total" bucket keeps the entity's own label.
            if value != 'Total':
                series.update(self._breakdown_label(entity, value))
            series.update(append_data(dates_filled=list(item.items()), interval=interval))
            if filter.display == TRENDS_CUMULATIVE:
                # Cumulative display shows a running total per interval.
                series['data'] = np.cumsum(series['data'])
            response.append(series)
    elif request.GET['shown_as'] == TRENDS_STICKINESS:
        series = copy.deepcopy(template)
        series.update(self._stickiness(filtered_events=events, entity=entity, filter=filter))
        response.append(series)
    return response
def _session_avg(self, base_query: str, params: Tuple[Any, ...], date_filter: Dict[str, datetime]) -> List[Dict[str, Any]]:
    """Daily average session length series plus an overall average.

    base_query: SQL producing sessionized (global_session_id, timestamp) rows.
    params: query parameters for base_query.
    date_filter: expects "timestamp__gte"/"timestamp__lte" datetimes bounding the range.
    """
    # Per-day AVG/SUM/COUNT of session lengths (seconds); only days that had
    # sessions produce a row. (SQL whitespace normalized; semantics unchanged.)
    average_length_time = (
        "SELECT date_trunc('day', timestamp) as start_time,"
        " AVG(length) AS average_session_length_per_day,"
        " SUM(length) AS total_session_length_per_day,"
        " COUNT(1) as num_sessions_per_day"
        " FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))"
        " AS length,"
        " MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time"
    ).format(base_query)
    cursor = connection.cursor()
    cursor.execute(average_length_time, params)
    time_series_avg = cursor.fetchall()
    date_range = pd.date_range(
        date_filter["timestamp__gte"].date(),
        date_filter["timestamp__lte"].date(),
        freq="D",
    )
    # Fix: the rows only cover days that actually had sessions, so pairing them
    # with the date range by position shifted averages onto the wrong days
    # whenever there was a gap. Key the averages by day instead and fill
    # missing days with 0 (matches the approach of the newer implementation).
    avg_by_day = {row[0].date(): row[1] for row in time_series_avg}
    time_series_avg_friendly = [(day, round(avg_by_day.get(day.date(), 0))) for day in date_range]
    time_series_data = append_data(time_series_avg_friendly, math=None)
    # Overall average = total session seconds / total number of sessions.
    totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
    overall_average = (totals[0] / totals[1]) if totals else 0
    avg_formatted = friendly_time(overall_average)
    avg_split = avg_formatted.split(" ")
    time_series_data.update({
        "label": "Average Duration of Session ({})".format(avg_split[1]),
        "count": int(avg_split[0]),
    })
    time_series_data.update({"chartLabel": "Average Duration of Session (seconds)"})
    return [time_series_data]
def calculate_sessions(self, events: QuerySet, session_type: str, date_filter) -> List[Dict[str, Any]]:
    """Compute session analytics over an event queryset.

    Sessions are derived in SQL: a new session starts when a distinct_id's gap
    to its previous event is >= 30 minutes, or when it has no previous event.

    events: base event queryset to sessionize.
    session_type: 'avg' yields the average-duration time series; any other
        value yields the session-length distribution buckets.
    date_filter: dict with 'timestamp__gte'/'timestamp__lte' datetimes
        (used only by the 'avg' branch to build the date range).
    """
    # Annotate each event with the previous event/timestamp of the same
    # distinct_id so the raw SQL below can detect session boundaries.
    sessions = events\
        .annotate(previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))\
        .annotate(previous_event=Window(
            expression=Lag('event', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))

    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

    # TODO: add midnight condition
    # Assign session ids via running sums of the "new_session" flag:
    # global_session_id is unique across all users, user_session_id per user.
    all_sessions = '\
        SELECT distinct_id, timestamp,\
        SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
        SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
        FROM (SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
            OR previous_timestamp IS NULL \
            THEN 1 ELSE 0 END AS new_session \
            FROM ({}) AS inner_sessions\
        ) AS outer_sessions'.format(sessions_sql)

    def distribution(query):
        # Bucket session lengths (seconds) into the ten histogram ranges that
        # correspond to dist_labels below.
        return 'SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,\
            COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,\
            COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,\
            COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,\
            COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,\
            COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,\
            COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,\
            COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,\
            COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,\
            COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth\
            FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                AS length FROM ({}) as count GROUP BY 1) agg'.format(query)

    def average_length_time(query):
        # Per-day AVG/SUM/COUNT of session lengths; session length is the
        # MAX-MIN timestamp spread per global_session_id.
        return 'SELECT date_trunc(\'day\', timestamp) as start_time,\
            AVG(length) AS average_session_length_per_day,\
            SUM(length) AS total_session_length_per_day, \
            COUNT(1) as num_sessions_per_day\
            FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                AS length,\
                MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time'.format(query)

    result: List = []
    if session_type == 'avg':
        cursor = connection.cursor()
        cursor.execute(average_length_time(all_sessions), sessions_sql_params)
        time_series_avg = cursor.fetchall()
        time_series_avg_friendly = []
        date_range = pd.date_range(date_filter['timestamp__gte'].date(),
                                   date_filter['timestamp__lte'].date(),
                                   freq='D')
        # NOTE(review): this pairs SQL rows with the date range by position,
        # which assumes one row per consecutive day starting at the range
        # start; days without sessions would shift values — verify.
        time_series_avg_friendly = [
            (day, round(time_series_avg[index][1] if index < len(time_series_avg) else 0))
            for index, day in enumerate(date_range)
        ]
        time_series_data = append_data(time_series_avg_friendly, math=None)

        # calculate average: total session seconds / number of sessions
        totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
        overall_average = (totals[0] / totals[1]) if totals else 0
        avg_formatted = friendly_time(overall_average)
        avg_split = avg_formatted.split(' ')
        time_series_data.update({
            'label': 'Average Duration of Session ({})'.format(avg_split[1]),
            'count': int(avg_split[0])
        })
        time_series_data.update(
            {"chartLabel": 'Average Duration of Session (seconds)'})
        result = [time_series_data]
    else:
        # Distribution display: one count per histogram bucket, in label order.
        dist_labels = [
            '0 seconds (1 event)', '0-3 seconds', '3-10 seconds',
            '10-30 seconds', '30-60 seconds', '1-3 minutes', '3-10 minutes',
            '10-30 minutes', '30-60 minutes', '1+ hours'
        ]
        cursor = connection.cursor()
        cursor.execute(distribution(all_sessions), sessions_sql_params)
        calculated = cursor.fetchall()
        result = [{
            'label': dist_labels[index],
            'count': calculated[0][index]
        } for index in range(len(dist_labels))]
    return result
def _session_avg(self, base_query: Query, params: QueryParams, filter: Filter) -> List[Dict[str, Any]]:
    """Average session length per interval bucket, plus an overall average.

    base_query: SQL producing sessionized (global_session_id, timestamp) rows.
    params: query parameters for base_query.
    filter: supplies the interval and date range.

    Returns a one-element list with the chart payload, or [] when no
    sessions matched the query.
    """
    def _determineInterval(interval):
        # Maps the API interval name to (SQL date_trunc unit, frequency code);
        # anything unrecognized falls back to daily buckets.
        if interval == "minute":
            return (
                "minute",
                "min",
            )
        elif interval == "hour":
            return "hour", "H"
        elif interval == "week":
            return "week", "W"
        elif interval == "month":
            return "month", "M"
        else:
            return "day", "D"

    interval, interval_freq = _determineInterval(filter.interval)

    # Per-bucket AVG/SUM/COUNT of session lengths (seconds); session length is
    # the MAX-MIN timestamp spread per global_session_id. Only buckets that
    # contain sessions produce a row.
    average_length_time = "SELECT date_trunc('{interval}', timestamp) as start_time,\
        AVG(length) AS average_session_length_per_day,\
        SUM(length) AS total_session_length_per_day, \
        COUNT(1) as num_sessions_per_day\
        FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))\
            AS length,\
            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time".format(
        base_query, interval=interval)
    cursor = connection.cursor()
    cursor.execute(average_length_time, params)
    time_series_avg = cursor.fetchall()
    # No sessions in range: nothing to chart.
    if len(time_series_avg) == 0:
        return []

    date_range = get_daterange(filter.date_from, filter.date_to, frequency=interval)
    data_array = [{
        "date": a[0],
        "count": a[1],
        "breakdown": "Total"
    } for a in time_series_avg]

    if interval == "week":
        # Snap each bucket back to the start of its week so it matches the
        # labels produced by get_daterange.
        for df in data_array:
            df["date"] -= datetime.timedelta(days=df["date"].weekday() + 1)
    elif interval == "month":
        # Snap each bucket to the last day of its month (first of next month
        # minus one day).
        for df in data_array:
            df["date"] = (df["date"].replace(day=1) + datetime.timedelta(
                days=32)).replace(day=1) - datetime.timedelta(days=1)

    # Fill buckets missing from the SQL result with zero by keying on the
    # canonical date range.
    datewise_data = {d["date"]: d["count"] for d in data_array}
    values = [(key, datewise_data.get(key, 0)) for key in date_range]

    time_series_data = append_data(values, interval=filter.interval, math=None)
    scaled_data, label = scale_time_series(time_series_data["data"])
    time_series_data.update({"data": scaled_data})
    # calculate average: total session seconds / total number of sessions
    totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
    overall_average = (totals[0] / totals[1]) if totals else 0
    avg_formatted = friendly_time(overall_average)
    avg_split = avg_formatted.split(" ")
    time_series_data.update({
        "label": "Average Session Length ({})".format(avg_split[1]),
        "count": int(avg_split[0]),
        "aggregated_value": int(avg_split[0]),
    })
    time_series_data.update(
        {"chartLabel": "Average Session Length ({})".format(label)})
    result = [time_series_data]
    return result