def _session_avg(
    self, base_query: str, params: Tuple[Any, ...], date_filter: Dict[str, datetime], interval: Optional[str]
) -> List[Dict[str, Any]]:
    """Return the average session length per interval plus the overall average.

    Args:
        base_query: SQL text embedded as a subquery; the wrapping query reads
            ``global_session_id`` and ``timestamp`` from it.
        params: Parameters handed to ``cursor.execute`` with the final SQL.
        date_filter: Must contain ``"timestamp__gte"`` and ``"timestamp__lte"``,
            the two ends of the generated date range.
        interval: ``"minute"``, ``"hour"``, ``"week"`` or ``"month"``; any other
            value (including ``None``) is treated as ``"day"``.

    Returns:
        A one-element list holding the ``append_data`` result extended with
        ``label``, ``count`` and ``chartLabel``; an empty list when the query
        yields no rows.
    """

    def _determineInterval(interval):
        # Maps the requested interval to (date_trunc unit, pandas frequency).
        if interval == "minute":
            return "minute", "min"
        elif interval == "hour":
            return "hour", "H"
        elif interval == "week":
            return "week", "W"
        elif interval == "month":
            return "month", "M"
        else:
            return "day", "D"

    # `interval` is whitelisted above, so formatting it into the SQL is safe.
    interval, interval_freq = _determineInterval(interval)

    average_length_time = (
        "SELECT date_trunc('{interval}', timestamp) as start_time,"
        " AVG(length) AS average_session_length_per_day,"
        " SUM(length) AS total_session_length_per_day,"
        " COUNT(1) as num_sessions_per_day"
        " FROM (SELECT global_session_id,"
        " EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp))) AS length,"
        " MIN(timestamp) as timestamp"
        " FROM ({}) as count GROUP BY 1) as agg"
        " group by 1 order by start_time"
    ).format(base_query, interval=interval)

    # Context manager guarantees the cursor is closed even if execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(average_length_time, params)
        time_series_avg = cursor.fetchall()

    # Without rows the DataFrame below would have no "date" column and the
    # groupby would raise KeyError, so bail out early.
    if not time_series_avg:
        return []

    date_range = pd.date_range(date_filter["timestamp__gte"], date_filter["timestamp__lte"], freq=interval_freq)
    df = pd.DataFrame([{"date": a[0], "count": a[1], "breakdown": "Total"} for a in time_series_avg])

    # NOTE(review): these shifts presumably line the bucket starts up with the
    # anchors pandas uses for "W"/"M" frequencies in date_range - confirm.
    if interval == "week":
        df["date"] = df["date"].apply(lambda x: x - pd.offsets.Week(weekday=6))
    elif interval == "month":
        df["date"] = df["date"].apply(lambda x: x - pd.offsets.MonthEnd(n=0))

    # Reindex onto the full range; buckets without data become 0.
    df_dates = pd.DataFrame(df.groupby("date").mean(), index=date_range)
    df_dates = df_dates.fillna(0)
    values = [(key, round(value[0])) if len(value) > 0 else (key, 0) for key, value in df_dates.iterrows()]

    time_series_data = append_data(values, interval=interval, math=None)

    # Overall average = total session seconds / number of sessions
    # (result columns 2 and 3), not the mean of the per-bucket averages.
    total_length, session_count = (sum(column) for column in list(zip(*time_series_avg))[2:4])
    overall_average = (total_length / session_count) if session_count else 0

    # The friendly_time output is split on a space: the first token must parse
    # as an int, the second is used as the unit in the label.
    avg_formatted = friendly_time(overall_average)
    avg_split = avg_formatted.split(" ")

    time_series_data.update(
        {"label": "Average Duration of Session ({})".format(avg_split[1]), "count": int(avg_split[0])}
    )
    time_series_data.update({"chartLabel": "Average Duration of Session (seconds)"})

    return [time_series_data]
def _format_avg(self, avg: float):
    """Build the label/count/chartLabel entries describing an average duration.

    ``friendly_time(avg)`` is split on a space: the second token becomes the
    unit shown in ``label`` and the first token is converted to an int for
    ``count``.
    """
    parts = friendly_time(avg).split(" ")
    return {
        "label": "Average Duration of Session ({})".format(parts[1]),
        "count": int(parts[0]),
        "chartLabel": "Average Duration of Session (seconds)",
    }
def _session_avg(self, base_query: str, params: Tuple[Any, ...], date_filter: Dict[str, datetime]) -> List[Dict[str, Any]]:
    """Return the daily average session length series plus the overall average.

    Args:
        base_query: SQL text embedded as a subquery; the wrapping query reads
            ``global_session_id`` and ``timestamp`` from it.
        params: Parameters handed to ``cursor.execute`` with the final SQL.
        date_filter: Must contain ``"timestamp__gte"`` and ``"timestamp__lte"``;
            their ``.date()`` values bound the daily range.

    Returns:
        A one-element list holding the ``append_data`` result extended with
        ``label``, ``count`` and ``chartLabel``.
    """
    average_length_time = (
        "SELECT date_trunc('day', timestamp) as start_time,"
        " AVG(length) AS average_session_length_per_day,"
        " SUM(length) AS total_session_length_per_day,"
        " COUNT(1) as num_sessions_per_day"
        " FROM (SELECT global_session_id,"
        " EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp))) AS length,"
        " MIN(timestamp) as timestamp"
        " FROM ({}) as count GROUP BY 1) as agg"
        " group by 1 order by start_time"
    ).format(base_query)

    # Context manager guarantees the cursor is closed even if execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(average_length_time, params)
        time_series_avg = cursor.fetchall()

    date_range = pd.date_range(
        date_filter["timestamp__gte"].date(),
        date_filter["timestamp__lte"].date(),
        freq="D",
    )

    # The query only returns days that have sessions, so rows must be matched
    # to calendar days by date. Matching by position would shift every value
    # after the first empty day onto the wrong date.
    average_by_day = {row[0].date(): row[1] for row in time_series_avg}
    time_series_avg_friendly = [(day, round(average_by_day.get(day.date(), 0))) for day in date_range]

    time_series_data = append_data(time_series_avg_friendly, math=None)

    # Overall average = total session seconds / number of sessions
    # (result columns 2 and 3); `totals` is empty when there are no rows.
    totals = [sum(column) for column in list(zip(*time_series_avg))[2:4]]
    overall_average = (totals[0] / totals[1]) if totals else 0

    # The friendly_time output is split on a space: the first token must parse
    # as an int, the second is used as the unit in the label.
    avg_formatted = friendly_time(overall_average)
    avg_split = avg_formatted.split(" ")

    time_series_data.update(
        {
            "label": "Average Duration of Session ({})".format(avg_split[1]),
            "count": int(avg_split[0]),
        }
    )
    time_series_data.update({"chartLabel": "Average Duration of Session (seconds)"})

    return [time_series_data]
def calculate_sessions(self, events, session_type):
    """Compute session statistics for ``events`` using raw SQL window queries.

    A new session starts for a ``distinct_id`` when the event has no previous
    event or when at least 30 minutes (``60 * 30`` seconds) have passed since
    the previous one.

    Args:
        events: Queryset that is annotated with ``Window``/``Lag`` expressions
            and whose SQL is embedded into the session query.
        session_type: ``'avg'`` returns the session count and the formatted
            average duration; any other value returns the duration
            distribution.

    Returns:
        A list of ``{'label': ..., 'count': ...}`` dicts.
    """
    sessions = events.annotate(
        previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc(),
        )
    ).annotate(
        previous_event=Window(
            expression=Lag('event', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc(),
        )
    )

    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

    # TODO: add midnight condition
    all_sessions = (
        "SELECT distinct_id, timestamp,"
        " SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,"
        " SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id"
        " FROM (SELECT *, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)"
        " OR previous_timestamp IS NULL"
        " THEN 1 ELSE 0 END AS new_session"
        " FROM ({}) AS inner_sessions"
        ") AS outer_sessions"
    ).format(sessions_sql)

    def overall_average_length(query):
        # Session count and mean session length (seconds) over all sessions.
        return (
            "SELECT COUNT(*) as sessions,"
            " AVG(length) AS average_session_length"
            " FROM (SELECT global_session_id,"
            " EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp))) AS length"
            " FROM ({}) as count GROUP BY 1) agg"
        ).format(query)

    def distribution(query):
        # One row with ten counters, one per session-length bucket (seconds).
        return (
            "SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,"
            " COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,"
            " COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,"
            " COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,"
            " COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,"
            " COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,"
            " COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,"
            " COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,"
            " COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,"
            " COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth"
            " FROM (SELECT global_session_id,"
            " EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp))) AS length"
            " FROM ({}) as count GROUP BY 1) agg"
        ).format(query)

    if session_type == 'avg':
        # Context manager guarantees the cursor is closed even on errors.
        with connection.cursor() as cursor:
            cursor.execute(overall_average_length(all_sessions), sessions_sql_params)
            calculated = cursor.fetchall()

        session_count, average_length = calculated[0][0], calculated[0][1]
        # AVG() is NULL when there are no sessions; treat that as 0 seconds
        # instead of letting round(None, 0) raise TypeError.
        avg_length = round(average_length or 0, 0)
        avg_formatted = friendly_time(avg_length)
        return [
            {'label': 'Number of Sessions', 'count': session_count},
            {'label': 'Average Duration of Session', 'count': avg_formatted},
        ]

    # Labels are in the same order as the columns selected by distribution().
    dist_labels = [
        '0 seconds (1 event)',
        '0-3 seconds',
        '3-10 seconds',
        '10-30 seconds',
        '30-60 seconds',
        '1-3 minutes',
        '3-10 minutes',
        '10-30 minutes',
        '30-60 minutes',
        '1+ hours',
    ]
    with connection.cursor() as cursor:
        cursor.execute(distribution(all_sessions), sessions_sql_params)
        calculated = cursor.fetchall()

    return [{'label': label, 'count': count} for label, count in zip(dist_labels, calculated[0])]
def calculate_sessions(self, events: QuerySet, session_type: str, date_filter) -> List[Dict[str, Any]]:
    """Compute session statistics for ``events`` using raw SQL window queries.

    A new session starts for a ``distinct_id`` when the event has no previous
    event or when at least 30 minutes (``60 * 30`` seconds) have passed since
    the previous one.

    Args:
        events: Queryset that is annotated with ``Window``/``Lag`` expressions
            and whose SQL is embedded into the session query.
        session_type: ``'avg'`` returns a daily average-duration series; any
            other value returns the duration distribution.
        date_filter: Read only for ``'avg'``; must contain
            ``'timestamp__gte'`` and ``'timestamp__lte'``, whose ``.date()``
            values bound the daily range.

    Returns:
        For ``'avg'``, a one-element list holding the ``append_data`` result
        extended with ``label``, ``count`` and ``chartLabel``; otherwise a
        list of ``{'label': ..., 'count': ...}`` dicts, one per bucket.
    """
    sessions = events.annotate(
        previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc(),
        )
    ).annotate(
        previous_event=Window(
            expression=Lag('event', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc(),
        )
    )

    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

    # TODO: add midnight condition
    all_sessions = (
        "SELECT distinct_id, timestamp,"
        " SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,"
        " SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id"
        " FROM (SELECT *, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)"
        " OR previous_timestamp IS NULL"
        " THEN 1 ELSE 0 END AS new_session"
        " FROM ({}) AS inner_sessions"
        ") AS outer_sessions"
    ).format(sessions_sql)

    def distribution(query):
        # One row with ten counters, one per session-length bucket (seconds).
        return (
            "SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,"
            " COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,"
            " COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,"
            " COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,"
            " COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,"
            " COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,"
            " COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,"
            " COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,"
            " COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,"
            " COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth"
            " FROM (SELECT global_session_id,"
            " EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp))) AS length"
            " FROM ({}) as count GROUP BY 1) agg"
        ).format(query)

    def average_length_time(query):
        # Per-day average, total length and session count, ordered by day.
        return (
            "SELECT date_trunc('day', timestamp) as start_time,"
            " AVG(length) AS average_session_length_per_day,"
            " SUM(length) AS total_session_length_per_day,"
            " COUNT(1) as num_sessions_per_day"
            " FROM (SELECT global_session_id,"
            " EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp))) AS length,"
            " MIN(timestamp) as timestamp"
            " FROM ({}) as count GROUP BY 1) as agg"
            " group by 1 order by start_time"
        ).format(query)

    if session_type == 'avg':
        # Context manager guarantees the cursor is closed even on errors.
        with connection.cursor() as cursor:
            cursor.execute(average_length_time(all_sessions), sessions_sql_params)
            time_series_avg = cursor.fetchall()

        date_range = pd.date_range(
            date_filter['timestamp__gte'].date(),
            date_filter['timestamp__lte'].date(),
            freq='D',
        )

        # The query only returns days that have sessions, so rows must be
        # matched to calendar days by date. Matching by position would shift
        # every value after the first empty day onto the wrong date.
        average_by_day = {row[0].date(): row[1] for row in time_series_avg}
        time_series_avg_friendly = [(day, round(average_by_day.get(day.date(), 0))) for day in date_range]

        time_series_data = append_data(time_series_avg_friendly, math=None)

        # Overall average = total session seconds / number of sessions
        # (result columns 2 and 3); `totals` is empty when there are no rows.
        totals = [sum(column) for column in list(zip(*time_series_avg))[2:4]]
        overall_average = (totals[0] / totals[1]) if totals else 0

        # The friendly_time output is split on a space: the first token must
        # parse as an int, the second is used as the unit in the label.
        avg_formatted = friendly_time(overall_average)
        avg_split = avg_formatted.split(' ')

        time_series_data.update(
            {
                'label': 'Average Duration of Session ({})'.format(avg_split[1]),
                'count': int(avg_split[0]),
            }
        )
        time_series_data.update({"chartLabel": 'Average Duration of Session (seconds)'})
        return [time_series_data]

    # Labels are in the same order as the columns selected by distribution().
    dist_labels = [
        '0 seconds (1 event)',
        '0-3 seconds',
        '3-10 seconds',
        '10-30 seconds',
        '30-60 seconds',
        '1-3 minutes',
        '3-10 minutes',
        '10-30 minutes',
        '30-60 minutes',
        '1+ hours',
    ]
    with connection.cursor() as cursor:
        cursor.execute(distribution(all_sessions), sessions_sql_params)
        calculated = cursor.fetchall()

    return [{'label': label, 'count': count} for label, count in zip(dist_labels, calculated[0])]
def _session_avg(self, base_query: Query, params: QueryParams, filter: Filter) -> List[Dict[str, Any]]:
    """Return the average session length per interval plus the overall average.

    The interval comes from ``filter.interval`` ("minute", "hour", "week",
    "month"; anything else is treated as "day") and the range from
    ``filter.date_from`` / ``filter.date_to``. Returns an empty list when the
    query yields no rows, otherwise a one-element list holding the
    ``append_data`` result with scaled data plus ``label``, ``count``,
    ``aggregated_value`` and ``chartLabel``.
    """
    # Requested interval -> (date_trunc unit, frequency code); the second
    # element is unpacked but not used further in this method.
    known_intervals = {
        "minute": ("minute", "min"),
        "hour": ("hour", "H"),
        "week": ("week", "W"),
        "month": ("month", "M"),
    }
    interval, interval_freq = known_intervals.get(filter.interval, ("day", "D"))

    sql_template = (
        "SELECT date_trunc('{interval}', timestamp) as start_time,"
        " AVG(length) AS average_session_length_per_day,"
        " SUM(length) AS total_session_length_per_day,"
        " COUNT(1) as num_sessions_per_day"
        " FROM (SELECT global_session_id,"
        " EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp))) AS length,"
        " MIN(timestamp) as timestamp"
        " FROM ({}) as count GROUP BY 1) as agg"
        " group by 1 order by start_time"
    )
    average_length_time = sql_template.format(base_query, interval=interval)

    cursor = connection.cursor()
    cursor.execute(average_length_time, params)
    rows = cursor.fetchall()
    if not rows:
        return []

    date_range = get_daterange(filter.date_from, filter.date_to, frequency=interval)

    one_day = datetime.timedelta(days=1)

    def _bucket_key(moment):
        # Week buckets move back by weekday() + 1 days; month buckets move to
        # the last day of their month; other intervals are left untouched.
        if interval == "week":
            return moment - datetime.timedelta(days=moment.weekday() + 1)
        if interval == "month":
            first_of_next_month = (moment.replace(day=1) + datetime.timedelta(days=32)).replace(day=1)
            return first_of_next_month - one_day
        return moment

    # Column 0 is the bucket start, column 1 the average length for it.
    datewise_data = {_bucket_key(row[0]): row[1] for row in rows}
    values = [(moment, datewise_data.get(moment, 0)) for moment in date_range]

    time_series_data = append_data(values, interval=filter.interval, math=None)
    scaled_data, label = scale_time_series(time_series_data["data"])
    time_series_data.update({"data": scaled_data})

    # Overall average = summed column 2 / summed column 3 of the result rows.
    columns = list(zip(*rows))
    totals = [sum(column) for column in columns[2:4]]
    overall_average = (totals[0] / totals[1]) if totals else 0

    # First token of the friendly_time output is the number, second the unit.
    avg_split = friendly_time(overall_average).split(" ")

    time_series_data.update(
        {
            "label": "Average Session Length ({})".format(avg_split[1]),
            "count": int(avg_split[0]),
            "aggregated_value": int(avg_split[0]),
            "chartLabel": "Average Session Length ({})".format(label),
        }
    )
    return [time_series_data]