示例#1
0
    def _session_avg(
        self, base_query: str, params: Tuple[Any, ...], date_filter: Dict[str, datetime], interval: Optional[str]
    ) -> List[Dict[str, Any]]:
        """Return the average session length as a single time series.

        Sessions are derived from ``base_query`` (rows carrying a
        ``global_session_id`` and a ``timestamp``), bucketed with ``date_trunc``
        at the requested interval and laid out over the range described by
        ``date_filter``.

        Args:
            base_query: SQL text embedded as the innermost sub-select.
            params: Values passed to ``cursor.execute`` alongside the query.
            date_filter: Must contain ``timestamp__gte`` and ``timestamp__lte``;
                they bound the generated date range.
            interval: ``"minute"``, ``"hour"``, ``"week"`` or ``"month"``; any
                other value (including ``None``) selects daily buckets.

        Returns:
            A one-element list holding the series dict built by ``append_data``
            with ``label``, ``count`` and ``chartLabel`` filled in, or an empty
            list when the query finds no sessions.
        """
        # (date_trunc unit, pandas frequency alias). Only values from this table
        # are ever interpolated into the SQL text below.
        interval_options = {
            "minute": ("minute", "min"),
            "hour": ("hour", "H"),
            "week": ("week", "W"),
            "month": ("month", "M"),
        }
        interval, interval_freq = interval_options.get(interval, ("day", "D"))

        average_length_time = "SELECT date_trunc('{interval}', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time".format(
            base_query, interval=interval
        )

        # The context manager closes the cursor even if execute/fetch raises.
        with connection.cursor() as cursor:
            cursor.execute(average_length_time, params)
            time_series_avg = cursor.fetchall()

        # Without rows the DataFrame below would have no "date" column and the
        # groupby would raise KeyError, so there is nothing to chart.
        if not time_series_avg:
            return []

        date_range = pd.date_range(date_filter["timestamp__gte"], date_filter["timestamp__lte"], freq=interval_freq)
        df = pd.DataFrame([{"date": row[0], "count": row[1], "breakdown": "Total"} for row in time_series_avg])
        # NOTE(review): the week/month shifts presumably move each bucket start
        # onto the anchor pandas uses for the "W"/"M" frequencies - confirm.
        if interval == "week":
            df["date"] = df["date"].apply(lambda x: x - pd.offsets.Week(weekday=6))
        elif interval == "month":
            df["date"] = df["date"].apply(lambda x: x - pd.offsets.MonthEnd(n=0))

        # Re-index onto the full range so buckets without sessions show up as 0.
        df_dates = pd.DataFrame(df.groupby("date").mean(), index=date_range)
        df_dates = df_dates.fillna(0)
        values = [(key, round(value[0])) if len(value) > 0 else (key, 0) for key, value in df_dates.iterrows()]

        time_series_data = append_data(values, interval=interval, math=None)

        # Overall average = total session seconds / total session count, rather
        # than a mean of the per-bucket means.
        total_length = sum(row[2] for row in time_series_avg)
        session_count = sum(row[3] for row in time_series_avg)
        overall_average = (total_length / session_count) if session_count else 0

        # friendly_time is expected to return "<number> <unit>".
        avg_split = friendly_time(overall_average).split(" ")

        time_series_data.update(
            {
                "label": "Average Duration of Session ({})".format(avg_split[1]),
                "count": int(avg_split[0]),
                "chartLabel": "Average Duration of Session (seconds)",
            }
        )
        return [time_series_data]
示例#2
0
 def _format_avg(self, avg: float):
     """Build the label/count/chartLabel entries for an average session length.

     ``avg`` is rendered with ``friendly_time`` and split on spaces; the first
     piece becomes the integer ``count`` and the second is shown inside the
     ``label``.
     """
     pieces = friendly_time(avg).split(" ")
     return {
         "label": "Average Duration of Session ({})".format(pieces[1]),
         "count": int(pieces[0]),
         "chartLabel": "Average Duration of Session (seconds)",
     }
示例#3
0
    def _session_avg(self, base_query: str, params: Tuple[Any, ...],
                     date_filter: Dict[str, datetime]) -> List[Dict[str, Any]]:
        """Return the daily average session length as a single time series.

        Args:
            base_query: SQL text embedded as the innermost sub-select; its rows
                must expose ``global_session_id`` and ``timestamp``.
            params: Values passed to ``cursor.execute`` alongside the query.
            date_filter: Must contain ``timestamp__gte`` and ``timestamp__lte``
                datetimes; their dates bound the generated day range.

        Returns:
            A one-element list holding the series dict built by ``append_data``
            with ``label``, ``count`` and ``chartLabel`` filled in.
        """
        average_length_time = "SELECT date_trunc('day', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time".format(
            base_query)

        # The context manager closes the cursor even if execute/fetch raises.
        with connection.cursor() as cursor:
            cursor.execute(average_length_time, params)
            time_series_avg = cursor.fetchall()

        date_range = pd.date_range(
            date_filter["timestamp__gte"].date(),
            date_filter["timestamp__lte"].date(),
            freq="D",
        )

        # The query only returns days that have sessions, so rows must be
        # matched to the range by date, not by list position; otherwise a
        # single empty day shifts every later value onto the wrong day.
        # Assumes start_time comes back as a datetime - TODO confirm for the
        # database driver in use.
        average_by_day = {row[0].date(): row[1] for row in time_series_avg}
        time_series_avg_friendly = [
            (day, round(average_by_day.get(day.date(), 0)))
            for day in date_range
        ]

        time_series_data = append_data(time_series_avg_friendly, math=None)

        # Overall average = total session seconds / total session count
        # (columns 2 and 3 of each row); 0 when the query returned nothing.
        totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
        overall_average = (totals[0] / totals[1]) if totals else 0

        # friendly_time is expected to return "<number> <unit>".
        avg_split = friendly_time(overall_average).split(" ")

        time_series_data.update({
            "label":
            "Average Duration of Session ({})".format(avg_split[1]),
            "count":
            int(avg_split[0]),
            "chartLabel":
            "Average Duration of Session (seconds)",
        })
        return [time_series_data]
示例#4
0
    def calculate_sessions(self, events, session_type):
        """Summarise user sessions derived from ``events``.

        Events are grouped into sessions per ``distinct_id``: a new session
        starts at a user's first event or after a gap of at least 30 minutes
        (``60 * 30`` seconds) since their previous event.

        Args:
            events: Queryset of events exposing ``distinct_id``, ``timestamp``
                and ``event``.
            session_type: ``'avg'`` for the session count and average length;
                any other value yields the session-length distribution.

        Returns:
            A list of ``{'label': ..., 'count': ...}`` dicts.
        """
        sessions = events\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))\
            .annotate(previous_event=Window(
                expression=Lag('event', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        # TODO: add midnight condition

        # Running sums over the new_session flag number the sessions, both
        # globally and per user.
        all_sessions = '\
            SELECT distinct_id, timestamp,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions'.format(sessions_sql)

        def overall_average_length(query):
            # Session length = seconds between a session's first and last event.
            return 'SELECT COUNT(*) as sessions,\
                        AVG(length) AS average_session_length\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length FROM ({}) as count GROUP BY 1) agg'.format(
                query)

        def distribution(query):
            # One COUNT per length bucket; column order matches dist_labels.
            return 'SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,\
                        COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,\
                        COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,\
                        COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,\
                        COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,\
                        COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,\
                        COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,\
                        COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,\
                        COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,\
                        COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length FROM ({}) as count GROUP BY 1) agg'.format(
                query)

        if session_type == 'avg':
            # The context manager closes the cursor even if the query fails.
            with connection.cursor() as cursor:
                cursor.execute(overall_average_length(all_sessions),
                               sessions_sql_params)
                calculated = cursor.fetchall()
            session_count, average_length = calculated[0][0], calculated[0][1]
            # AVG() is NULL when there are no sessions; report 0 instead of
            # letting round(None, 0) raise TypeError.
            avg_length = round(average_length, 0) if average_length is not None else 0
            avg_formatted = friendly_time(avg_length)
            return [{
                'label': 'Number of Sessions',
                'count': session_count
            }, {
                'label': 'Average Duration of Session',
                'count': avg_formatted
            }]

        dist_labels = [
            '0 seconds (1 event)', '0-3 seconds', '3-10 seconds',
            '10-30 seconds', '30-60 seconds', '1-3 minutes',
            '3-10 minutes', '10-30 minutes', '30-60 minutes', '1+ hours'
        ]
        with connection.cursor() as cursor:
            cursor.execute(distribution(all_sessions), sessions_sql_params)
            calculated = cursor.fetchall()
        # The aggregate query returns a single row with one column per bucket.
        return [{
            'label': label,
            'count': count
        } for label, count in zip(dist_labels, calculated[0])]
示例#5
0
    def calculate_sessions(self, events: QuerySet, session_type: str,
                           date_filter) -> List[Dict[str, Any]]:
        """Summarise user sessions derived from ``events``.

        Events are grouped into sessions per ``distinct_id``: a new session
        starts at a user's first event or after a gap of at least 30 minutes
        (``60 * 30`` seconds) since their previous event.

        Args:
            events: Queryset of events exposing ``distinct_id``, ``timestamp``
                and ``event``.
            session_type: ``'avg'`` for the daily average-length time series;
                any other value yields the session-length distribution.
            date_filter: Mapping with ``timestamp__gte`` and ``timestamp__lte``
                datetimes; only read for ``'avg'``, where their dates bound
                the day range.

        Returns:
            For ``'avg'``, a one-element list with the series dict built by
            ``append_data``; otherwise a list of ``{'label', 'count'}`` dicts,
            one per length bucket.
        """
        sessions = events\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))\
            .annotate(previous_event=Window(
                expression=Lag('event', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        # TODO: add midnight condition

        # Running sums over the new_session flag number the sessions, both
        # globally and per user.
        all_sessions = '\
            SELECT distinct_id, timestamp,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions'.format(sessions_sql)

        def distribution(query):
            # One COUNT per length bucket; column order matches dist_labels.
            return 'SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,\
                        COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,\
                        COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,\
                        COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,\
                        COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,\
                        COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,\
                        COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,\
                        COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,\
                        COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,\
                        COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length FROM ({}) as count GROUP BY 1) agg'.format(
                query)

        def average_length_time(query):
            # Per-day average, total and count of session lengths, keyed by the
            # day on which each session started.
            return 'SELECT date_trunc(\'day\', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time'.format(
                query)

        result: List = []
        if session_type == 'avg':
            # The context manager closes the cursor even if the query fails.
            with connection.cursor() as cursor:
                cursor.execute(average_length_time(all_sessions),
                               sessions_sql_params)
                time_series_avg = cursor.fetchall()

            date_range = pd.date_range(date_filter['timestamp__gte'].date(),
                                       date_filter['timestamp__lte'].date(),
                                       freq='D')

            # The query only returns days that have sessions, so rows must be
            # matched to the range by date, not by list position; otherwise a
            # single empty day shifts every later value onto the wrong day.
            # Assumes start_time comes back as a datetime - TODO confirm for
            # the database driver in use.
            average_by_day = {row[0].date(): row[1] for row in time_series_avg}
            time_series_avg_friendly = [
                (day, round(average_by_day.get(day.date(), 0)))
                for day in date_range
            ]

            time_series_data = append_data(time_series_avg_friendly, math=None)

            # Overall average = total session seconds / total session count
            # (columns 2 and 3 of each row); 0 when the query returned nothing.
            totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
            overall_average = (totals[0] / totals[1]) if totals else 0

            # friendly_time is expected to return "<number> <unit>".
            avg_split = friendly_time(overall_average).split(' ')

            time_series_data.update({
                'label':
                'Average Duration of Session ({})'.format(avg_split[1]),
                'count':
                int(avg_split[0]),
                "chartLabel":
                'Average Duration of Session (seconds)'
            })

            result = [time_series_data]
        else:
            dist_labels = [
                '0 seconds (1 event)', '0-3 seconds', '3-10 seconds',
                '10-30 seconds', '30-60 seconds', '1-3 minutes',
                '3-10 minutes', '10-30 minutes', '30-60 minutes', '1+ hours'
            ]
            with connection.cursor() as cursor:
                cursor.execute(distribution(all_sessions), sessions_sql_params)
                calculated = cursor.fetchall()
            # The aggregate query returns a single row with one column per bucket.
            result = [{
                'label': label,
                'count': count
            } for label, count in zip(dist_labels, calculated[0])]

        return result
示例#6
0
    def _session_avg(self, base_query: Query, params: QueryParams,
                     filter: Filter) -> List[Dict[str, Any]]:
        """Return the average session length as a single scaled time series.

        Sessions found in ``base_query`` are bucketed with ``date_trunc`` at
        ``filter.interval`` (minute, hour, week or month; anything else means
        day) and laid out over the range from ``filter.date_from`` to
        ``filter.date_to``. Buckets without sessions are reported as 0.

        Returns a one-element list with the series dict from ``append_data``
        (data scaled by ``scale_time_series``, plus ``label``, ``count``,
        ``aggregated_value`` and ``chartLabel``), or an empty list when the
        query returns no rows.
        """
        requested = filter.interval
        known_intervals = ("minute", "hour", "week", "month")
        interval = requested if requested in known_intervals else "day"

        average_length_time = "SELECT date_trunc('{interval}', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time".format(
            base_query,
            interval=interval,
        )

        cursor = connection.cursor()
        cursor.execute(average_length_time, params)
        time_series_avg = cursor.fetchall()
        if not time_series_avg:
            return []

        date_range = get_daterange(filter.date_from,
                                   filter.date_to,
                                   frequency=interval)

        one_day = datetime.timedelta(days=1)

        def _bucket_key(start):
            # Weekly buckets are moved back (weekday + 1) days from the
            # truncated start; monthly buckets are keyed by the last day of
            # their month. Other intervals keep the truncated start as-is.
            if interval == "week":
                return start - datetime.timedelta(days=start.weekday() + 1)
            if interval == "month":
                first_of_next = (start.replace(day=1) +
                                 datetime.timedelta(days=32)).replace(day=1)
                return first_of_next - one_day
            return start

        datewise_data = {
            _bucket_key(row[0]): row[1]
            for row in time_series_avg
        }
        values = [(point, datewise_data.get(point, 0)) for point in date_range]

        time_series_data = append_data(values,
                                       interval=filter.interval,
                                       math=None)
        scaled_data, label = scale_time_series(time_series_data["data"])
        time_series_data.update({"data": scaled_data})

        # Overall average = total session seconds / total session count, taken
        # from columns 2 and 3 of the query rows.
        columns = list(zip(*time_series_avg))
        totals = [sum(column) for column in columns[2:4]]
        if totals:
            overall_average = totals[0] / totals[1]
        else:
            overall_average = 0

        avg_split = friendly_time(overall_average).split(" ")

        time_series_data.update({
            "label":
            "Average Session Length ({})".format(avg_split[1]),
            "count":
            int(avg_split[0]),
            "aggregated_value":
            int(avg_split[0]),
            "chartLabel":
            "Average Session Length ({})".format(label),
        })
        return [time_series_data]