def _get_trunc_func(
    self, subject: str, period: str
) -> Tuple[Union[TruncHour, TruncDay, TruncWeek, TruncMonth], str]:
    """
    Resolve *period* to a Django truncation expression over *subject* plus the
    raw-SQL select fields computing the bucket offsets for that granularity.

    Returns a ``(trunc_expression, fields_sql)`` pair. The SQL computes two
    columns: ``first_date`` (buckets since the ``%s`` placeholder date) and
    ``date`` (buckets between ``first_date`` and ``event_date``). Note that the
    "Hour" and "Month" SQL each consume the ``%s`` placeholder twice.

    Raises ``ValidationError`` for any unsupported period.
    """
    sql_by_period = {
        "Hour": (
            TruncHour,
            """
            FLOOR(DATE_PART('day', first_date - %s) * 24
                + DATE_PART('hour', first_date - %s)) AS first_date,
            FLOOR(DATE_PART('day', event_date - first_date) * 24
                + DATE_PART('hour', event_date - first_date)) AS date,
            """,
        ),
        "Day": (
            TruncDay,
            """
            FLOOR(DATE_PART('day', first_date - %s)) AS first_date,
            FLOOR(DATE_PART('day', event_date - first_date)) AS date,
            """,
        ),
        "Week": (
            TruncWeek,
            """
            FLOOR(DATE_PART('day', first_date - %s) / 7) AS first_date,
            FLOOR(DATE_PART('day', event_date - first_date) / 7) AS date,
            """,
        ),
        "Month": (
            TruncMonth,
            """
            FLOOR((DATE_PART('year', first_date) - DATE_PART('year', %s)) * 12
                + DATE_PART('month', first_date) - DATE_PART('month', %s)) AS first_date,
            FLOOR((DATE_PART('year', event_date) - DATE_PART('year', first_date)) * 12
                + DATE_PART('month', event_date) - DATE_PART('month', first_date)) AS date,
            """,
        ),
    }

    if period not in sql_by_period:
        raise ValidationError(f"Period {period} is unsupported.")

    trunc_cls, fields = sql_by_period[period]
    return trunc_cls(subject), fields
def _leads_period_unit_expr(period):
    """
    Map a period name to the matching date-truncation expression over ``created``.

    :param period: one of 'hour', 'day', 'week', 'month', 'year'
    :return: a Django ``Trunc*`` expression; any unrecognised value falls back
        to yearly truncation.

    >>> _leads_period_unit_expr('hour')
    TruncHour(F(created))
    >>> _leads_period_unit_expr('day')
    TruncDay(F(created))
    >>> _leads_period_unit_expr('week')
    Trunc(F(created))
    >>> _leads_period_unit_expr('month')
    TruncMonth(F(created))
    >>> _leads_period_unit_expr('year')
    TruncYear(F(created))
    """
    # 'week' has no dedicated Trunc class here; it goes through the generic Trunc.
    if period == 'week':
        return Trunc('created', 'week')

    truncator = {
        'hour': TruncHour,
        'day': TruncDay,
        'month': TruncMonth,
    }.get(period, TruncYear)  # Anything else (including 'year') truncates yearly.
    return truncator('created')
def _determineTrunc(
    subject: str, period: str
) -> Union[TruncHour, TruncDay, TruncWeek, TruncMonth]:
    """
    Build the truncation expression matching *period* over the *subject* field.

    Raises ``ValueError`` for any period outside Hour/Day/Week/Month.
    """
    trunc_by_period = {
        "Hour": TruncHour,
        "Day": TruncDay,
        "Week": TruncWeek,
        "Month": TruncMonth,
    }

    if period not in trunc_by_period:
        raise ValueError(f"Period {period} is unsupported.")

    return trunc_by_period[period](subject)
def trunc_func(
    self, field_name: str
) -> Union[TruncHour, TruncDay, TruncWeek, TruncMonth]:
    """
    Return the truncation expression for *field_name* matching ``self.interval``.

    Raises ``ValidationError`` when the interval is not hour/day/week/month.
    """
    interval_map = {
        "hour": TruncHour,
        "day": TruncDay,
        "week": TruncWeek,
        "month": TruncMonth,
    }

    if self.interval not in interval_map:
        raise ValidationError(f"{self.interval} not supported")

    return interval_map[self.interval](field_name)
def apply_data_retention():
    """
    When data retention is enabled, this discards all data applicable for retention.
    Keeps at least one data point per hour available.
    """
    settings = RetentionSettings.get_solo()

    if settings.data_retention_in_hours is None:
        # No retention enabled at all (default behaviour).
        return

    current_hour = timezone.now().hour

    # Only cleanup during nights. Allow from midnight to six a.m.
    # NOTE(review): '> 6' also lets the run proceed during the 06:00-06:59 hour.
    if current_hour > 6:
        return

    # Each run should be capped, for obvious performance reasons.
    MAX_HOURS_CLEANUP = 24

    # These models should be rotated with retention. Dict value is the datetime field used.
    MODELS_TO_CLEANUP = {
        DsmrReading.objects.processed(): 'timestamp',
        ElectricityConsumption.objects.all(): 'read_at',
        GasConsumption.objects.all(): 'read_at',
    }

    # Everything older than this timestamp is a candidate for thinning.
    retention_date = timezone.now() - timezone.timedelta(
        hours=settings.data_retention_in_hours)

    # We need to force UTC here, to avoid AmbiguousTimeError's on DST changes.
    timezone.activate(pytz.UTC)

    for base_queryset, datetime_field in MODELS_TO_CLEANUP.items():
        # Group retention-eligible rows per clock hour; only hours holding more
        # than two rows need thinning (two rows per hour are always kept below).
        hours_to_cleanup = base_queryset.filter(**{
            '{}__lt'.format(datetime_field): retention_date
        }).annotate(
            item_hour=TruncHour(datetime_field)).values('item_hour').annotate(
            item_count=Count('id')).order_by().filter(
            item_count__gt=2).order_by('item_hour').values_list(
            'item_hour', flat=True)[:MAX_HOURS_CLEANUP]
        hours_to_cleanup = list(hours_to_cleanup)  # Force evaluation.

        if not hours_to_cleanup:
            continue

        # NOTE(review): this loop variable shadows the 'current_hour' int above;
        # from here on it is a datetime (the truncated hour bucket).
        for current_hour in hours_to_cleanup:
            # Fetch all data per hour.
            data_set = base_queryset.filter(
                **{
                    '{}__gte'.format(datetime_field): current_hour,
                    '{}__lt'.format(datetime_field): current_hour + timezone.timedelta(hours=1),
                })

            # Extract the first/last item, so we can exclude it.
            # NOTE: Want to alter this? Please update "item_count__gt=2" above as well!
            keeper_pks = [
                data_set.order_by(datetime_field)[0].pk,
                data_set.order_by('-{}'.format(datetime_field))[0].pk
            ]

            # Now drop all others.
            print('Retention | Cleaning up: {} ({})'.format(
                current_hour, data_set[0].__class__.__name__))
            data_set.exclude(pk__in=keeper_pks).delete()

    # Restore the default timezone activated before this task ran.
    timezone.deactivate()
def run(scheduled_process):
    """
    Scheduled retention task: thin stored readings/consumption down to a fixed
    number of data points per hour, once they fall outside the retention window.
    """
    retention_settings = RetentionSettings.get_solo()

    if retention_settings.data_retention_in_hours == RetentionSettings.RETENTION_NONE:
        return scheduled_process.disable(
        )  # Changing the retention settings in the admin will re-activate it again.

    # These models should be rotated with retention. Dict value is the datetime field used.
    ITEM_COUNT_PER_HOUR = 2
    MODELS_TO_CLEANUP = {
        DsmrReading.objects.processed(): 'timestamp',
        ElectricityConsumption.objects.all(): 'read_at',
        GasConsumption.objects.all(): 'read_at',
    }

    # Everything older than this timestamp is a candidate for thinning.
    retention_date = timezone.now() - timezone.timedelta(
        hours=retention_settings.data_retention_in_hours)
    data_to_clean_up = False

    # We need to force UTC here, to avoid AmbiguousTimeError's on DST changes.
    timezone.activate(pytz.UTC)

    for base_queryset, datetime_field in MODELS_TO_CLEANUP.items():
        # Group retention-eligible rows per clock hour; only hours holding more
        # than ITEM_COUNT_PER_HOUR rows need thinning. Cap per run for performance.
        hours_to_cleanup = base_queryset.filter(**{
            '{}__lt'.format(datetime_field): retention_date
        }).annotate(
            item_hour=TruncHour(datetime_field)).values('item_hour').annotate(
            item_count=Count('id')).order_by().filter(
            item_count__gt=ITEM_COUNT_PER_HOUR
        ).order_by('item_hour').values_list(
            'item_hour', flat=True
        )[:settings.DSMRREADER_RETENTION_MAX_CLEANUP_HOURS_PER_RUN]
        hours_to_cleanup = list(hours_to_cleanup)  # Force evaluation.

        if not hours_to_cleanup:
            continue

        data_to_clean_up = True

        # Each 'current_hour' is a datetime marking the start of an hour bucket.
        for current_hour in hours_to_cleanup:
            # Fetch all data per hour.
            data_set = base_queryset.filter(
                **{
                    '{}__gte'.format(datetime_field): current_hour,
                    '{}__lt'.format(datetime_field): current_hour + timezone.timedelta(hours=1),
                })

            # Extract the first/last item, so we can exclude it.
            # NOTE: Want to alter this? Please update ITEM_COUNT_PER_HOUR above as well!
            keeper_pks = [
                data_set.order_by(datetime_field)[0].pk,
                data_set.order_by('-{}'.format(datetime_field))[0].pk
            ]

            # Now drop all others.
            logger.debug('Retention: Cleaning up: %s (%s)', current_hour,
                         data_set[0].__class__.__name__)
            data_set.exclude(pk__in=keeper_pks).delete()

    # Restore the default timezone activated before this task ran.
    timezone.deactivate()

    # Delay for a bit, as there is nothing to do.
    if not data_to_clean_up:
        scheduled_process.delay(timezone.timedelta(hours=12))