Example #1
    def build_all_sessions_query(
        self, events: QuerySet, _date_gte=Q()) -> Tuple[Query, QueryParams]:
        sessions = (events.filter(_date_gte).annotate(
            previous_timestamp=Window(
                expression=Lag("timestamp", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )).annotate(previous_event=Window(
                expression=Lag("event", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        all_sessions = "\
            SELECT *,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT id, team_id, distinct_id, event, elements_hash, timestamp, properties, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions".format(sessions_sql)

        return all_sessions, sessions_sql_params
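The query/params pair returned above is meant for a raw database cursor. A minimal sketch of that consuming step (assuming only a standard Django connection; the helper name is ours, not from the original code):

    from django.db import connection

    def run_all_sessions(query, params):
        # Execute the sessionized SQL and map each row to a dict keyed by column name.
        with connection.cursor() as cursor:
            cursor.execute(query, params)
            columns = [col[0] for col in cursor.description]
            return [dict(zip(columns, row)) for row in cursor.fetchall()]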
Example #2
    def calculate_sessions(self, events: QuerySet, session_type: Optional[str],
                           filter: Filter, team: Team,
                           offset: int) -> List[Dict[str, Any]]:

        # format date filter for session view
        _date_gte = Q()
        if session_type is None:
            # if _date_from is not explicitly set, we only want to get the last day's worth of data;
            # otherwise the query is very slow
            if filter._date_from and filter.date_to:
                _date_gte = Q(
                    timestamp__gte=filter.date_from,
                    timestamp__lte=filter.date_to + relativedelta(days=1),
                )
            else:
                dt = now()
                dt = dt.replace(hour=0, minute=0, second=0, microsecond=0)
                _date_gte = Q(timestamp__gte=dt,
                              timestamp__lte=dt + relativedelta(days=1))
        else:
            if not filter.date_from:
                filter._date_from = (Event.objects.filter(
                    team_id=team).order_by("timestamp")[0].timestamp.replace(
                        hour=0, minute=0, second=0, microsecond=0).isoformat())

        sessions = (events.filter(_date_gte).annotate(
            previous_timestamp=Window(
                expression=Lag("timestamp", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )).annotate(previous_event=Window(
                expression=Lag("event", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        all_sessions = "\
            SELECT *,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT id, distinct_id, event, elements_hash, timestamp, properties, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions".format(sessions_sql)

        result: List = []
        if session_type == "avg":
            result = self._session_avg(all_sessions, sessions_sql_params,
                                       filter)
        elif session_type == "dist":
            result = self._session_dist(all_sessions, sessions_sql_params)
        else:
            result = self._session_list(all_sessions, sessions_sql_params,
                                        team, filter, offset)

        return result
Example #3
    def calculate_sessions(
        self,
        events: QuerySet,
        session_type: Optional[str],
        date_filter: Dict[str, datetime],
        team: Team,
        request: request.Request,
    ) -> List[Dict[str, Any]]:

        # format date filter for session view
        _date_gte = Q()
        if session_type is None:
            if request.GET.get("date_from", None):
                _date_gte = Q(
                    timestamp__gte=date_filter["timestamp__gte"],
                    timestamp__lte=date_filter["timestamp__gte"] +
                    relativedelta(days=1),
                )
            else:
                dt = datetime.now()
                dt = dt.replace(hour=0, minute=0, second=0, microsecond=0)
                _date_gte = Q(timestamp__gte=dt,
                              timestamp__lte=dt + relativedelta(days=1))

        sessions = (events.filter(_date_gte).annotate(
            previous_timestamp=Window(
                expression=Lag("timestamp", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )).annotate(previous_event=Window(
                expression=Lag("event", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        all_sessions = "\
            SELECT *,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT id, distinct_id, event, elements_hash, timestamp, properties, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions".format(sessions_sql)

        result: List = []
        interval = request.GET.get("interval", None)
        if session_type == "avg":
            result = self._session_avg(all_sessions, sessions_sql_params,
                                       date_filter, interval)
        elif session_type == "dist":
            result = self._session_dist(all_sessions, sessions_sql_params)
        else:
            result = self._session_list(all_sessions, sessions_sql_params,
                                        team, request)

        return result
Example #4
    def calculate_sessions(self, events: QuerySet, session_type: Optional[str],
                           date_filter: Dict[str, datetime], team: Team,
                           request: request.Request) -> List[Dict[str, Any]]:

        if not events:
            return []

        # format date filter for session view
        _date_gte = Q()
        if session_type is None:
            if request.GET.get('date_from', None):
                _date_gte = Q(timestamp__gte=date_filter['timestamp__gte'],
                              timestamp__lte=date_filter['timestamp__gte'] +
                              relativedelta(days=1))
            else:
                dt = events.order_by('-timestamp').values(
                    'timestamp')[0]['timestamp']
                if dt:
                    dt = dt.replace(hour=0, minute=0, second=0, microsecond=0)
                _date_gte = Q(timestamp__gte=dt,
                              timestamp__lte=dt + relativedelta(days=1))

        sessions = events.filter(_date_gte)\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))\
            .annotate(previous_event=Window(
                expression=Lag('event', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        all_sessions = '\
            SELECT *,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT id, distinct_id, event, elements_hash, timestamp, properties, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions'.format(sessions_sql)

        result: List = []
        if session_type == 'avg':
            result = self._session_avg(all_sessions, sessions_sql_params,
                                       date_filter)
        elif session_type == 'dist':
            result = self._session_dist(all_sessions, sessions_sql_params)
        else:
            result = self._session_list(all_sessions, sessions_sql_params,
                                        team, date_filter, request)

        return result
Example #5
 def test_lag(self):
     """
     Compute the difference between an employee's salary and the next
     highest salary in the employee's department. Return None if the
     employee has the lowest salary.
     """
     qs = Employee.objects.annotate(lag=Window(
         expression=Lag(expression='salary', offset=1),
         partition_by=F('department'),
         order_by=[F('salary').asc(), F('name').asc()],
     )).order_by('department')
     self.assertQuerysetEqual(qs, [
         ('Williams', 37000, 'Accounting', None),
         ('Jenson', 45000, 'Accounting', 37000),
         ('Jones', 45000, 'Accounting', 45000),
         ('Adams', 50000, 'Accounting', 45000),
         ('Moore', 34000, 'IT', None),
         ('Wilkinson', 60000, 'IT', 34000),
         ('Johnson', 80000, 'Management', None),
         ('Miller', 100000, 'Management', 80000),
         ('Smith', 38000, 'Marketing', None),
         ('Johnson', 40000, 'Marketing', 38000),
         ('Brown', 53000, 'Sales', None),
         ('Smith', 55000, 'Sales', 53000),
     ], transform=lambda row: (row.name, row.salary, row.department, row.lag))
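The docstring above promises a salary difference, but the test only materializes the lagged value. A hedged sketch of the remaining step, referencing the window annotation from a second annotate call (the same combining pattern Example #18 below uses with F('c') - F('prev')):

    from django.db.models import F, Window
    from django.db.models.functions import Lag

    qs = Employee.objects.annotate(
        prev_salary=Window(
            expression=Lag('salary'),
            partition_by=F('department'),
            order_by=[F('salary').asc(), F('name').asc()],
        ),
    ).annotate(salary_delta=F('salary') - F('prev_salary'))
    # salary_delta is NULL for the lowest-paid employee in each department.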
Example #6
def _navigation_base(filter_class_function, reverse_url_function, user, obj, url_name):
    context = {"current_element": obj}
    search_parameters = SearchParametersCache(user, obj.__class__.__name__).cached_data
    if not search_parameters:
        return context

    search_type = search_parameters.get("search_type")
    filter_form_class = filter_class_function(search_type)
    order_by = filter_form_class(data=search_parameters).qs.query.order_by
    order_by_expressions = convert_order_by_strings_to_expressions(order_by) or None
    qs = filter_form_class(data=search_parameters).qs.annotate(
        previous_acronym=Window(
            expression=Lag("acronym"),
            order_by=order_by_expressions,
        ),
        next_acronym=Window(
            expression=Lead("acronym"),
            order_by=order_by_expressions,
        ),
        previous_id=Window(
            expression=Lag("id"),
            order_by=order_by_expressions,
        ),
        next_id=Window(
            expression=Lead("id"),
            order_by=order_by_expressions,
        )
    ).values_list(
        "id",
        "acronym",
        "previous_acronym",
        "previous_id",
        "next_acronym",
        "next_id",
        named=True
    ).order_by(*order_by)

    current_row = _get_current_row(qs, obj)

    if current_row:
        context.update({
            "next_element_title": current_row.next_acronym,
            "next_url": reverse_url_function(current_row.next_id, url_name) if current_row.next_id else None,
            "previous_element_title": current_row.previous_acronym,
            "previous_url": reverse_url_function(current_row.previous_id, url_name) if current_row.previous_id else None
        })
    return context
Example #7
    def list(self, request):
        team = request.user.team_set.get()
        resp = []
        date_query = request_to_date_query(request.GET)

        sessions = Event.objects.filter(
                team=team,
                event='$pageview',
                **date_query
            )\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

        cursor = connection.cursor()
        cursor.execute(
            '\
        SELECT source_event, target_event, count(*) from (\
            SELECT event_number || \'_\' || current_url as target_event,LAG(event_number || \'_\' || current_url, 1) OVER (\
                            PARTITION BY session\
                            ) AS source_event from \
        (\
            SELECT properties->> \'$current_url\' as current_url, sessionified.session\
                ,ROW_NUMBER() OVER (\
                        PARTITION BY distinct_id\
                        ,session ORDER BY timestamp\
                        ) AS event_number\
        FROM (\
            SELECT events_notated.*, SUM(new_session) OVER (\
                ORDER BY distinct_id\
                        ,timestamp\
                ) AS session\
            FROM (\
                SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30) OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session\
                FROM ({}) AS inner_sessions \
            ) as events_notated \
        ) as sessionified\
        ) as final\
        where event_number <= 4\
        ) as counts\
        where source_event is not null and target_event is not null and SUBSTRING(source_event, 3) != SUBSTRING(target_event, 3)\
        group by source_event, target_event order by count desc limit 15\
        '.format(sessions_sql), sessions_sql_params)
        rows = cursor.fetchall()

        for row in rows:
            resp.append({'source': row[0], 'target': row[1], 'value': row[2]})

        resp = sorted(resp, key=lambda x: x['value'], reverse=True)
        return Response(resp)
Example #8
 def test_lag_decimalfield(self):
     qs = Employee.objects.annotate(lag=Window(
         expression=Lag(expression='bonus', offset=1),
         partition_by=F('department'),
         order_by=[F('bonus').asc(), F('name').asc()],
     )).order_by('department', F('bonus').asc(), F('name').asc())
     self.assertQuerysetEqual(qs, [
         ('Williams', 92.5, 'Accounting', None),
         ('Jenson', 112.5, 'Accounting', 92.5),
         ('Jones', 112.5, 'Accounting', 112.5),
         ('Adams', 125, 'Accounting', 112.5),
         ('Moore', 85, 'IT', None),
         ('Wilkinson', 150, 'IT', 85),
         ('Johnson', 200, 'Management', None),
         ('Miller', 250, 'Management', 200),
         ('Smith', 95, 'Marketing', None),
         ('Johnson', 100, 'Marketing', 95),
         ('Brown', 132.5, 'Sales', None),
         ('Smith', 137.5, 'Sales', 132.5),
     ], transform=lambda row: (row.name, row.bonus, row.department, row.lag))
Example #9
 def test_null_source_lag(self):
     msg = "Lag requires a non-null source expression"
     with self.assertRaisesMessage(ValueError, msg):
         Lag(expression=None)
Example #10
    def handle(self, *args, **options):

        # Retrieve option from command
        option = options.get('frequency')
        option = option[0]

        # See if option is valid
        try:
            frequency_lookup = Frequency.objects.get(t_frequency=option)

            # Empty the Summary table for the specified frequency
            SummaryByCountyFrequency.objects.filter(
                n_frequency=frequency_lookup).delete()

            # Will always look at yesterday's date for retrieving information
            end_date = datetime.now().date() - timedelta(1)

            # Calculate how far back to pull data for based on the command option
            if option == 'Daily':
                start_date = datetime.now().date() - timedelta(2)

            if option == 'Bi-Weekly':
                day_of_week = datetime.now().weekday()
                # If it's Monday
                if day_of_week == 0:
                    # Look back to Friday
                    start_date = datetime.now().date() - timedelta(3)
                # Else when the job runs on Friday
                else:
                    # Look back to Monday
                    start_date = datetime.now().date() - timedelta(4)

            if option == 'Weekly':
                start_date = datetime.now().date() - timedelta(8)

            if option == 'Monthly':
                day_of_month = datetime.now().date().day
                start_date = (datetime.now().date() -
                              timedelta(day_of_month)) + timedelta(1)

            # Retrieve rows from DailyCountyKnownCases where the date is the start date or the end date.
            # Also use the lag function to get the previous row's value for cases.
            known_cases = DailyCountyKnownCases.objects.annotate(
                q_cases_lag=Window(expression=Lag('q_cases',
                                                  offset=1,
                                                  default=0),
                                   order_by=('n_county', 'd_date')),
            ).filter(Q(d_date=start_date) | Q(d_date=end_date))

            # Define variable to keep track of number of inserts
            summary_row_inserts = 0

            # Iterate through the results
            for known_case in known_cases:

                # Only want to look at the row that corresponds to yesterday's date since it has the lag value we need
                if known_case.d_date == end_date:

                    # Create and fill variables for SummaryByCountyFrequency columns
                    summary_n_county = known_case.n_county
                    summary_n_frequency = frequency_lookup
                    summary_d_updated = datetime.now().date()
                    summary_q_cases_change = known_case.q_cases - known_case.q_cases_lag
                    summary_q_total_cases = known_case.q_cases
                    summary_q_deaths_change = 0
                    summary_q_total_deaths = 0

                    # Create SummaryByCountyFrequency object with previously created variables
                    summary_row_insert = SummaryByCountyFrequency(
                        n_county=summary_n_county,
                        n_frequency=summary_n_frequency,
                        d_updated=summary_d_updated,
                        q_cases_change=summary_q_cases_change,
                        q_total_cases=summary_q_total_cases,
                        q_deaths_change=summary_q_deaths_change,
                        q_total_deaths=summary_q_total_deaths)

                    # Insert SummaryByCountyFrequency object
                    summary_row_insert.save()

                    # Increment insert counter
                    summary_row_inserts += 1

            self.stdout.write(
                self.style.SUCCESS('Inserted %s %s known cases summary rows' %
                                   (summary_row_inserts, option.lower())))

            ###################################################
            # Update death data on SummaryByCountyFrequency
            ###################################################

            summary_row_updates = 0

            # Retrieve rows from DailyCountyDeaths where the date is the start date or the end date.
            # Also use the lag function to get the previous row's value for deaths.
            deaths = DailyCountyDeaths.objects.annotate(q_deaths_lag=Window(
                expression=Lag('q_deaths', offset=1, default=0),
                order_by=('n_county', 'd_date')), ).filter(
                    Q(d_date=start_date) | Q(d_date=end_date))

            # Iterate through the results
            for death in deaths:

                # Only want to look at the row that corresponds to yesterday's date since it has the lag value we need
                if death.d_date == end_date:

                    # Create and fill variables for SummaryByCountyFrequency columns that we want to update
                    summary_q_deaths_change = death.q_deaths - death.q_deaths_lag
                    summary_q_total_deaths = death.q_deaths

                    # Update values by county and frequency
                    SummaryByCountyFrequency.objects.filter(
                        n_county=death.n_county,
                        n_frequency=frequency_lookup).update(
                            q_deaths_change=summary_q_deaths_change,
                            q_total_deaths=summary_q_total_deaths)

                    # Increment updates counter
                    summary_row_updates += 1

            self.stdout.write(
                self.style.SUCCESS('Updated %s summary rows' %
                                   summary_row_updates))

        # If the specified option was not in the Frequency table
        except:
            self.stdout.write(
                self.style.ERROR(
                    'Please try a different option. "%s" is invalid.' %
                    option))
Example #11
 def test_lag_negative_offset(self):
     msg = "Lag requires a positive integer for the offset"
     with self.assertRaisesMessage(ValueError, msg):
         Lag(expression="salary", offset=-1)
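For contrast with the offset validation above and the non-null source check in Example #9, a minimal sketch of a valid construction (queryset assumed, not from the original): Lag takes a source expression, a positive integer offset (defaulting to 1), and an optional default returned when no lagged row exists.

    from django.db.models import F, Window
    from django.db.models.functions import Lag

    Employee.objects.annotate(
        salary_two_back=Window(
            expression=Lag('salary', offset=2, default=0),  # look two rows back
            partition_by=F('department'),
            order_by=F('salary').asc(),
        ),
    )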
Example #12
    def calculate_sessions(self, events, session_type):
        sessions = events\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))\
            .annotate(previous_event=Window(
                expression=Lag('event', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        # TODO: add midnight condition

        all_sessions = '\
            SELECT distinct_id, timestamp,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions'.format(sessions_sql)

        def overall_average_length(query):
            return 'SELECT COUNT(*) as sessions,\
                        AVG(length) AS average_session_length\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length FROM ({}) as count GROUP BY 1) agg'.format(
                query)

        def distribution(query):
            return 'SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,\
                        COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,\
                        COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,\
                        COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,\
                        COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,\
                        COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,\
                        COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,\
                        COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,\
                        COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,\
                        COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length FROM ({}) as count GROUP BY 1) agg'.format(
                query)

        result = []
        if session_type == 'avg':
            cursor = connection.cursor()
            cursor.execute(overall_average_length(all_sessions),
                           sessions_sql_params)
            calculated = cursor.fetchall()
            avg_length = round(calculated[0][1], 0)
            avg_formatted = friendly_time(avg_length)
            result = [{
                'label': 'Number of Sessions',
                'count': calculated[0][0]
            }, {
                'label': 'Average Duration of Session',
                'count': avg_formatted
            }]
        else:
            dist_labels = [
                '0 seconds (1 event)', '0-3 seconds', '3-10 seconds',
                '10-30 seconds', '30-60 seconds', '1-3 minutes',
                '3-10 minutes', '10-30 minutes', '30-60 minutes', '1+ hours'
            ]
            cursor = connection.cursor()
            cursor.execute(distribution(all_sessions), sessions_sql_params)
            calculated = cursor.fetchall()
            result = [{
                'label': dist_labels[index],
                'count': calculated[0][index]
            } for index in range(len(dist_labels))]

        return result
Example #13
 def with_prev_attributes(self):
     prev_attributes = Window(expression=Lag('attributes'),
                              partition_by=F('station'),
                              order_by=F('datetime').asc())
     return self.annotate(prev_attributes=prev_attributes)
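A hypothetical usage sketch for the queryset method above (the model and field names are assumptions for illustration):

    # Flag readings whose attributes changed since the previous reading at the
    # same station; prev_attributes is None for the first reading per station.
    for reading in StationReading.objects.with_prev_attributes():
        changed = reading.attributes != reading.prev_attributes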
Example #14
def get_neighbour_pks(model, pk, filterset=None, ordering=None):
    '''
    Given a model and a pk that identify an object (model instance), returns, under a given
    ordering (defaulting to the model's ordering) and optionally a filterset (from url_filter),
    the PKs of the prior and next neighbour in the list of all objects by that ordering,
    or in the filtered list if a filterset is provided.

    :returns: a 4-tuple containing (prior_pk, next_pk, row_number, list_length)

    :param model:        The model the object is an instance of
    :param pk:           The primary key of the model instance being considered
    :param filterset:    An optional filterset (see https://github.com/miki725/django-url-filter)
    :param ordering:     An optional ordering (otherwise the default model ordering is used). See: https://docs.djangoproject.com/en/2.0/ref/models/options/#ordering
    '''
    # If a filterset is provided, ensure it is for the same model as specified (consistency).
    if filterset and not filterset.Meta.model == model:
        return (None, ) * 4

    # Get the ordering list for the model (a list of fields
    # See: https://docs.djangoproject.com/en/2.0/ref/models/options/#ordering
    if ordering is None:
        ordering = model._meta.ordering

    order_by = []
    for f in ordering:
        if f.startswith("-"):
            order_by.append(F(f[1:]).desc())
        else:
            order_by.append(F(f).asc())

    # A default order. We need an order or the window functions crash
    if len(order_by) == 0:
        order_by = ['pk']

    # Define the window functions for each neighbour
    window_lag = Window(expression=Lag("pk"), order_by=order_by)
    window_lead = Window(expression=Lead("pk"), order_by=order_by)
    window_rownnum = Window(expression=RowNumber(), order_by=order_by)

    # Get a queryset annotated with neighbours. If annotated attrs clash with existing attrs an exception
    # will be raised: https://code.djangoproject.com/ticket/11256
    try:
        # Start with all objects
        qs = model.objects.all()

        # Now apply a filterset if we have one
        if not filterset is None:
            # We respect the filterset. BUT we need to wrap it inside a sub query, so that
            # we can apply a DISTINCT ON pk to avoid duplicate tuples that the window
            # functions can introduce when we are matching multiple remote objects.
            # Alas that's what they do. So we have to constrain it to one tuple per
            # PK.
            #
            # FIXME: Aaargh this won't work for injecting the current PK into the query!
            # My desire is to make sure that the query results include the provided pk.
            # Needs testing in both cases. I can't think of a way to do it alas. This is
            # frustrating me. Problem is across related object filters, or JOINS.
            # qs = filterset.filter() | (model.objects.filter(pk=pk).distinct() & filterset.filter())
            qs = qs.filter(pk__in=Subquery(filterset.filter().distinct(
                'pk').order_by('pk').values('pk')))

        # Now order the objects properly
        qs = qs.order_by(*order_by)

        # Now annotate the queryset with the prior and next PKs
        qs = qs.annotate(neighbour_prior=window_lag,
                         neighbour_next=window_lead,
                         row_number=window_rownnum)
    except:
        return (None, ) * 4

    # Finally we need some trickery alas to do a query on the queryset! We can't add this WHERE
    # as a filter because the LAG and LEAD Window functions fail then, they are empty because
    # there is no lagger or leader on the one line result! So we have to run that query on the
    # whole table, then extract from the result the one line we want! Wish I could find a way to
    # do this in the Django ORM not with a raw() call.

    # First we need the SQL from the existing query. Many on-line sources seem to recommend
    # str(qs.query) but this does not return reliable SQL! A bug in Django and much discussed:
    #    https://code.djangoproject.com/ticket/30132
    #    https://code.djangoproject.com/ticket/25705
    #    https://code.djangoproject.com/ticket/25092
    #    https://code.djangoproject.com/ticket/24991
    #    https://code.djangoproject.com/ticket/17741
    #
    # But this, it seems, is the reliable method, which involves dipping into Django's
    # innards a little (the SQL compiler).
    sql, params = qs.query.get_compiler(using=qs.db).as_sql()

    # Now we wrap the SQL
    sql = "SELECT * FROM ({}) ao WHERE {}={}".format(sql, model._meta.pk.name,
                                                     pk)

    # And create a new QuerySet
    ao = model.objects.raw(sql, params)

    try:
        if ao:
            if len(ao) == 1:
                return (ao[0].neighbour_prior, ao[0].neighbour_next,
                        ao[0].row_number, qs.count())
            else:
                raise ValueError(
                    "Query error: object appears more than once in neighbour hunt."
                )
        else:
            return (None, ) * 4
    except:
        return (None, ) * 4
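A hypothetical usage sketch (the Article model and ordering are assumptions for illustration):

    prior_pk, next_pk, row_number, total = get_neighbour_pks(
        Article, article.pk, ordering=['-created'])
    # prior_pk / next_pk are None at the ends of the list, and all four
    # values are None if the query could not be built or run.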
Example #15
    def list(self, request):
        team = request.user.team_set.get()
        resp = []
        date_query = request_to_date_query(request.GET)
        event, path_type, event_filter, start_comparator = self._determine_path_type(
            request)
        properties = request.GET.get('properties')
        start_point = request.GET.get('start')

        sessions = Event.objects.add_person_id(team.pk).filter(
                team=team,
                **(event_filter),
                **date_query
            )\
            .filter(~Q(event__in=['$autocapture', '$pageview', '$identify', '$pageleave']) if event is None else Q())\
            .filter(Filter(data={'properties': json.loads(properties)}).properties_to_Q() if properties else Q())\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

        if event == "$autocapture":
            sessions_sql = self._add_elements(query_string=sessions_sql)

        events_notated = '\
        SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30) OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session\
        FROM ({}) AS inner_sessions\
        '.format(sessions_sql)

        sessionified = '\
        SELECT events_notated.*, SUM(new_session) OVER (\
            ORDER BY distinct_id\
                    ,timestamp\
            ) AS session\
        FROM ({}) as events_notated\
        '.format(events_notated)

        if start_point:
            sessionified = self._apply_start_point(
                start_comparator=start_comparator,
                query_string=sessionified,
                start_point=start_point)

        final = '\
        SELECT {} as path_type, id, sessionified.session\
            ,ROW_NUMBER() OVER (\
                    PARTITION BY distinct_id\
                    ,session ORDER BY timestamp\
                    ) AS event_number\
        FROM ({}) as sessionified\
        '.format(path_type, sessionified)

        counts = '\
        SELECT event_number || \'_\' || path_type as target_event, id as target_id, LAG(event_number || \'_\' || path_type, 1) OVER (\
            PARTITION BY session\
            ) AS source_event , LAG(id, 1) OVER (\
            PARTITION BY session\
            ) AS source_id from \
        ({}) as final\
        where event_number <= 4\
        '.format(final)

        cursor = connection.cursor()
        cursor.execute(
            '\
        SELECT source_event, target_event, MAX(target_id), MAX(source_id), count(*) from ({}) as counts\
        where source_event is not null and target_event is not null\
        group by source_event, target_event order by count desc limit 20\
        '.format(counts), sessions_sql_params)
        rows = cursor.fetchall()

        for row in rows:
            resp.append({
                'source': row[0],
                'target': row[1],
                'target_id': row[2],
                'source_id': row[3],
                'value': row[4]
            })

        resp = sorted(resp, key=lambda x: x['value'], reverse=True)
        return Response(resp)
Example #16
 def diff_vs_previous_order(self):
     return self.annotate(prev_order_id=models.Window(
         expression=Lag('id'),
         partition_by=[models.F('customer_id')],
         order_by=models.F('created_at').asc(),
     ))
Example #17
 def _window_helper(attr):
     return F(attr) - Window(expression=Lag(attr),
                             order_by=F('date').desc())
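A hedged usage sketch for the helper above (the Metric model and value field are hypothetical). Because the helper returns a combined expression, it drops straight into an annotate call:

    # Each row's value minus the lagged value, with rows ordered by date
    # descending (as the helper hard-codes).
    Metric.objects.annotate(value_change=_window_helper('value'))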
Example #18
def get_months_plot(queryset, field) -> list:
    return queryset.annotate(month=TruncMonth('date__month'))\
        .values('month').annotate(c=Sum(field)).values('month', 'c')\
            .annotate(prev=Window(Lag('c'))).annotate(repayment=F('c')-F('prev'))\
                .values('month', 'c', 'repayment')
Example #19
    def list(self, request):
        team = request.user.team_set.get()
        resp = []
        date_query = request_to_date_query(request.GET)
        event, path_type = self._determine_path_type(request)

        sessions = Event.objects.filter(
                team=team,
                **({"event":event} if event else {'event__regex':'^[^\$].*'}), #anything without $ (default)
                **date_query
            )\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

        if event == "$autocapture":
            element = 'SELECT \'<\'|| e."tag_name" || \'> \'  || e."text" as tag_name_source, e."text" as text_source FROM "posthog_element" e JOIN \
                    ( SELECT group_id, MIN("posthog_element"."order") as minOrder FROM "posthog_element" GROUP BY group_id) e2 ON e.order = e2.minOrder AND e.group_id = e2.group_id where e.group_id = v2.group_id'

            element_group = 'SELECT g."id" as group_id FROM "posthog_elementgroup" g where v1."elements_hash" = g."hash"'
            sessions_sql = 'SELECT * FROM ({}) as v1 JOIN LATERAL ({}) as v2 on true JOIN LATERAL ({}) as v3 on true'.format(
                sessions_sql, element_group, element)

        cursor = connection.cursor()
        cursor.execute(
            '\
        SELECT source_event, target_event, MAX(target_id), MAX(source_id), count(*) from (\
            SELECT event_number || \'_\' || path_type as target_event, id as target_id, LAG(event_number || \'_\' || path_type, 1) OVER (\
                            PARTITION BY session\
                            ) AS source_event , LAG(id, 1) OVER (\
                            PARTITION BY session\
                            ) AS source_id from \
        (\
            SELECT {} as path_type, id, sessionified.session\
                ,ROW_NUMBER() OVER (\
                        PARTITION BY distinct_id\
                        ,session ORDER BY timestamp\
                        ) AS event_number\
        FROM (\
            SELECT events_notated.*, SUM(new_session) OVER (\
                ORDER BY distinct_id\
                        ,timestamp\
                ) AS session\
            FROM (\
                SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30) OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session\
                FROM ({}) AS inner_sessions \
            ) as events_notated \
        ) as sessionified\
        ) as final\
        where event_number <= 4\
        ) as counts\
        where source_event is not null and target_event is not null and SUBSTRING(source_event, 3) != SUBSTRING(target_event, 3)\
        group by source_event, target_event order by count desc limit 15\
        '.format(path_type, sessions_sql), sessions_sql_params)
        rows = cursor.fetchall()

        for row in rows:
            resp.append({
                'source': row[0],
                'target': row[1],
                'target_id': row[2],
                'source_id': row[3],
                'value': row[4]
            })

        resp = sorted(resp, key=lambda x: x['value'], reverse=True)
        return Response(resp)
Example #20
    def calculate_sessions(self, events: QuerySet, session_type: str,
                           date_filter) -> List[Dict[str, Any]]:
        sessions = events\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))\
            .annotate(previous_event=Window(
                expression=Lag('event', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        # TODO: add midnight condition

        all_sessions = '\
            SELECT distinct_id, timestamp,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions'.format(sessions_sql)

        def distribution(query):
            return 'SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,\
                        COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,\
                        COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,\
                        COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,\
                        COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,\
                        COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,\
                        COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,\
                        COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,\
                        COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,\
                        COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length FROM ({}) as count GROUP BY 1) agg'.format(
                query)

        def average_length_time(query):
            return 'SELECT date_trunc(\'day\', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time'.format(
                query)

        result: List = []
        if session_type == 'avg':

            cursor = connection.cursor()
            cursor.execute(average_length_time(all_sessions),
                           sessions_sql_params)
            time_series_avg = cursor.fetchall()
            time_series_avg_friendly = []
            date_range = pd.date_range(date_filter['timestamp__gte'].date(),
                                       date_filter['timestamp__lte'].date(),
                                       freq='D')
            time_series_avg_friendly = [
                (day,
                 round(time_series_avg[index][1]
                       if index < len(time_series_avg) else 0))
                for index, day in enumerate(date_range)
            ]

            time_series_data = append_data(time_series_avg_friendly, math=None)

            # calculate average
            totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
            overall_average = (totals[0] / totals[1]) if totals else 0
            avg_formatted = friendly_time(overall_average)
            avg_split = avg_formatted.split(' ')

            time_series_data.update({
                'label':
                'Average Duration of Session ({})'.format(avg_split[1]),
                'count':
                int(avg_split[0])
            })
            time_series_data.update(
                {"chartLabel": 'Average Duration of Session (seconds)'})

            result = [time_series_data]
        else:
            dist_labels = [
                '0 seconds (1 event)', '0-3 seconds', '3-10 seconds',
                '10-30 seconds', '30-60 seconds', '1-3 minutes',
                '3-10 minutes', '10-30 minutes', '30-60 minutes', '1+ hours'
            ]
            cursor = connection.cursor()
            cursor.execute(distribution(all_sessions), sessions_sql_params)
            calculated = cursor.fetchall()
            result = [{
                'label': dist_labels[index],
                'count': calculated[0][index]
            } for index in range(len(dist_labels))]

        return result
Example #21
    def calculate_paths(self, filter: PathFilter, team: Team):
        date_query = request_to_date_query({"date_from": filter._date_from, "date_to": filter._date_to}, exact=False)
        resp = []
        prop_type = filter.prop_type
        event, event_filter = filter.target_event
        start_comparator = filter.comparator

        sessions = (
            Event.objects.add_person_id(team.pk)
            .filter(team=team, **(event_filter), **date_query)
            .filter(
                ~Q(event__in=["$autocapture", "$pageview", "$identify", "$pageleave", "$screen"])
                if event is None
                else Q()
            )
            .filter(
                properties_to_Q(filter.properties, team_id=team.pk, filter_test_accounts=filter.filter_test_accounts)
                if filter and (filter.properties or filter.filter_test_accounts)
                else Q()
            )
            .annotate(
                previous_timestamp=Window(
                    expression=Lag("timestamp", default=None),
                    partition_by=F("person_id"),
                    order_by=F("timestamp").asc(),
                )
            )
        )

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

        if event == "$autocapture":
            sessions_sql = self._add_elements(query_string=sessions_sql)

        events_notated = "\
        SELECT *, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30) OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session\
        FROM ({}) AS inner_sessions\
        ".format(
            sessions_sql
        )

        sessionified = "\
        SELECT events_notated.*, SUM(new_session) OVER (\
            ORDER BY person_id\
                    ,timestamp\
            ) AS session\
        FROM ({}) as events_notated\
        ".format(
            events_notated
        )

        if filter and filter.start_point:
            sessionified = self._apply_start_point(
                start_comparator=start_comparator, query_string=sessionified, start_point=filter.start_point,
            )

        final = "\
        SELECT {} as path_type, id, sessionified.session\
            ,ROW_NUMBER() OVER (\
                    PARTITION BY person_id\
                    ,session ORDER BY timestamp\
                    ) AS event_number\
        FROM ({}) as sessionified\
        ".format(
            prop_type, sessionified
        )

        counts = "\
        SELECT event_number || '_' || path_type as target_event, id as target_id, LAG(event_number || '_' || path_type, 1) OVER (\
            PARTITION BY session\
            ) AS source_event , LAG(id, 1) OVER (\
            PARTITION BY session\
            ) AS source_id from \
        ({}) as final\
        where event_number <= 4\
        ".format(
            final
        )

        query = "\
        SELECT source_event, target_event, MAX(target_id), MAX(source_id), count(*) from ({}) as counts\
        where source_event is not null and target_event is not null\
        group by source_event, target_event order by count desc limit 20\
        ".format(
            counts
        )

        cursor = connection.cursor()
        cursor.execute(query, sessions_sql_params)
        rows = cursor.fetchall()

        for row in rows:
            resp.append(
                {"source": row[0], "target": row[1], "target_id": row[2], "source_id": row[3], "value": row[4],}
            )

        resp = sorted(resp, key=lambda x: x["value"], reverse=True)
        return resp
Example #22
    def parse(self, fpn, path, zug_id, info):
        root_path = path.replace(zug_id, '')
        zug_tag = fpn.find('Zug')

        zug = FahrplanZug(path=zug_id, name=info['name'])
        zug.gattung = zug_tag.get('Gattung')
        zug.nummer = zug_tag.get('Nummer').split('_')
        zug.zug_lauf = zug_tag.get('Zuglauf')
        zug.fahrplan_gruppe = zug_tag.get('FahrplanGruppe')
        zug.deko = (zug_tag.get('Dekozug') == '1')
        zug.is_reisezug = (zug_tag.get('Zugtyp') == '1')
        self.stripRouteNumber(zug)

        zug.speed_anfang = self.toSpeed(self.getAsFloat(zug_tag, 'spAnfang'))
        zug.speed_zug = self.toSpeed(self.getAsFloat(zug_tag,
                                                     'spZugNiedriger'))

        zug.fahrzeug_tree = self.processVarianten(
            fpn.find('Zug/FahrzeugVarianten'))

        zug.save()
        zug.autor.add(*info['autor'])

        pos = 0
        for eintrag in fpn.findall("Zug/FahrplanEintrag"):
            an = self.getDateTime(eintrag, 'Ank')
            ab = self.getDateTime(eintrag, 'Abf')
            ort = eintrag.get('Betrst')
            bedarf = eintrag.get('FplEintrag') == '2'
            kopf = eintrag.get('FzgVerbandAktion') != None
            ereignis = eintrag.find('Ereignis') != None
            if ort == None:
                self.logger.warn("Ignoring eintrag with empty place")
                continue
            eintrag_obj = FahrplanZugEintrag(position=pos,
                                             ort=eintrag.get('Betrst'),
                                             ab=ab,
                                             an=an,
                                             zug=zug,
                                             bedarfshalt=bedarf,
                                             kopf_machen=kopf,
                                             ereignis=ereignis)
            eintrag_obj.save()
            zug.eintraege.add(eintrag_obj)
            pos += 1

        for fahrzeug in fpn.iter('FahrzeugInfo'):
            path = fahrzeug.find('Datei').get('Dateiname')
            try:
                fahrzeug_obj = FahrzeugVariante.objects.get(
                    root_file__iexact=path,
                    haupt_id=fahrzeug.get('IDHaupt'),
                    neben_id=fahrzeug.get('IDNeben'))
                zug.fahrzeuge.add(fahrzeug_obj)

                if not zug.steuerfahrzeug and fahrzeug_obj.fuehrerstand:
                    zug.steuerfahrzeug = fahrzeug_obj

                if not zug.triebfahrzeug and len(fahrzeug_obj.antrieb) > 0:
                    zug.triebfahrzeug = fahrzeug_obj
            except FahrzeugVariante.DoesNotExist:
                self.logger.error("Could not find Fahrzeug Variant " + path +
                                  "/" + fahrzeug.get('IDHaupt') + ":" +
                                  fahrzeug.get('IDNeben'))
                raise

        imgpath = os.path.join('trn', zug.path.replace('\\', '') + ".png")
        zugrenderer = self.ZugRenderer(root_path, self.getRenderer(20))
        zug.bild = zugrenderer.renderImage(zug, imgpath, 1)

        zeit_diff = FahrplanZugEintrag.objects.filter(
            zug_id=zug_id).exclude(Q(ab=None) & Q(an=None)).annotate(
                zeit_previous=Window(expression=Lag('ab'),
                                     order_by=F('position').asc()),
                zeit_diff=Coalesce('an', 'ab') -
                F('zeit_previous')).order_by().values_list('zeit_diff',
                                                           flat=True)

        zug.zeit_bewegung = sum(zeit_diff[1:], timedelta())

        zug.save()