def __init__(
    self,
    queryset,
    atomically=True,
    status_thresholds=None,
    pk_range=None,
    chunk_time=0.5,
    chunk_size=2,
    chunk_min=1,
    chunk_max=10000,
    report_progress=False,
    total=None,
):
    self.queryset = self.sanitize_queryset(queryset)

    if atomically:
        self.maybe_atomic = atomic(using=self.queryset.db)
    else:
        # Workaround for the `with` statement not supporting a variable
        # number of contexts
        self.maybe_atomic = nullcontext()

    self.status_thresholds = status_thresholds
    self.pk_range = pk_range

    self.rate = WeightedAverageRate(chunk_time)
    assert (
        0 < chunk_min <= chunk_max
    ), "Minimum chunk size should not be greater than maximum chunk size."
    self.chunk_min = chunk_min
    self.chunk_max = chunk_max
    self.chunk_size = self.constrain_size(chunk_size)

    self.report_progress = report_progress
    self.total = total
def test_fast(self):
    # If we keep achieving a rate of 100 rows in 0.25 seconds, it should
    # recommend that we move to 200
    rate = WeightedAverageRate(0.5)
    assert rate.update(100, 0.25) == 200
    assert rate.update(100, 0.25) == 200
    assert rate.update(100, 0.25) == 200
def test_good_guess(self):
    # If we are first slow then hit the target at 50, we should be good
    rate = WeightedAverageRate(0.5)
    assert rate.update(100, 1.0) == 50
    assert rate.update(50, 0.5) == 50
    assert rate.update(50, 0.5) == 50
    assert rate.update(50, 0.5) == 50
def test_constant(self):
    # If we keep achieving a rate of 100 rows in 0.5 seconds, it should
    # recommend that we stay there
    rate = WeightedAverageRate(0.5)
    assert rate.update(100, 0.5) == 100
    assert rate.update(100, 0.5) == 100
    assert rate.update(100, 0.5) == 100
def test_slow(self):
    # If we keep achieving a rate of 100 rows in 1 second, it should
    # recommend that we move to 50
    rate = WeightedAverageRate(0.5)
    assert rate.update(100, 1.0) == 50
    assert rate.update(100, 1.0) == 50
    assert rate.update(100, 1.0) == 50
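# The tests above (and test_zero_division further down) pin down the
# behaviour of WeightedAverageRate.update(n, t): it returns the row count
# that would have hit the target time at the observed, weighted-average
# rate. Below is a minimal sketch reverse-engineered to satisfy exactly
# those assertions; the weight of 0.75 and the fallback rate of 1000
# rows/second for zero-time samples are assumptions, not necessarily the
# library's actual constants.
class WeightedAverageRate:
    def __init__(self, target_t, weight=0.75):
        self.target_t = target_t
        self.weight = weight  # how much of the old average to keep
        self.avg_n = 0.0  # weighted count of rows processed
        self.avg_t = 0.0  # weighted time taken

    def update(self, n, t):
        if self.avg_n and self.avg_t:
            self.avg_n = self.avg_n * self.weight + n
            self.avg_t = self.avg_t * self.weight + t
        else:
            # First sample: take it as-is
            self.avg_n = n
            self.avg_t = t
        return int(self.avg_rate * self.target_t)

    @property
    def avg_rate(self):
        try:
            return self.avg_n / self.avg_t
        except ZeroDivisionError:
            # Instantaneous chunk: assume an arbitrary high rate, so that
            # update(1, 0.0) with target_t=0.5 returns 500 as tested
            return 1000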
def __init__(
    self,
    queryset: models.QuerySet,
    *,
    atomically: bool = True,
    status_thresholds: dict[str, int | float] | None = None,
    pk_range: _SmartPkRangeType = None,
    chunk_time: float = 0.5,
    chunk_size: int = 2,
    chunk_min: int = 1,
    chunk_max: int = 10000,
    report_progress: bool = False,
    total: int | None = None,
):
    self.queryset = self.sanitize_queryset(queryset)

    if atomically:
        self.maybe_atomic = atomic(using=self.queryset.db)
    else:
        # Workaround for the `with` statement not supporting a variable
        # number of contexts
        self.maybe_atomic = nullcontext()

    self.status_thresholds = status_thresholds
    self.pk_range = pk_range

    self.rate = WeightedAverageRate(chunk_time)
    assert (
        0 < chunk_min <= chunk_max
    ), "Minimum chunk size should not be greater than maximum chunk size."
    self.chunk_min = chunk_min
    self.chunk_max = chunk_max
    self.chunk_size = self.constrain_size(chunk_size)

    self.report_progress = report_progress
    self.total = total
def __init__(self, queryset, atomically=True, status_thresholds=None,
             pk_range=None, chunk_time=0.5, chunk_size=2, chunk_max=10000,
             report_progress=False, total=None):
    self.queryset = self.sanitize_queryset(queryset)

    if atomically:
        self.maybe_atomic = atomic
    else:
        # Workaround for the `with` statement not supporting a variable
        # number of contexts
        self.maybe_atomic = noop_context

    self.status_thresholds = status_thresholds
    self.pk_range = pk_range

    self.rate = WeightedAverageRate(chunk_time)
    self.chunk_max = chunk_max
    self.chunk_size = min(chunk_size, chunk_max)

    self.report_progress = report_progress
    self.total = total
class SmartChunkedIterator:
    def __init__(
        self,
        queryset,
        atomically=True,
        status_thresholds=None,
        pk_range=None,
        chunk_time=0.5,
        chunk_size=2,
        chunk_min=1,
        chunk_max=10000,
        report_progress=False,
        total=None,
    ):
        self.queryset = self.sanitize_queryset(queryset)

        if atomically:
            self.maybe_atomic = atomic(using=self.queryset.db)
        else:
            # Workaround for the `with` statement not supporting a variable
            # number of contexts
            self.maybe_atomic = nullcontext()

        self.status_thresholds = status_thresholds
        self.pk_range = pk_range

        self.rate = WeightedAverageRate(chunk_time)
        assert (
            0 < chunk_min <= chunk_max
        ), "Minimum chunk size should not be greater than maximum chunk size."
        self.chunk_min = chunk_min
        self.chunk_max = chunk_max
        self.chunk_size = self.constrain_size(chunk_size)

        self.report_progress = report_progress
        self.total = total

    def __iter__(self):
        first_pk, last_pk = self.get_first_and_last()
        if first_pk <= last_pk:
            comp = operator.le  # <=
            direction = 1
        else:
            comp = operator.ge  # >=
            direction = -1
        current_pk = first_pk
        status = GlobalStatus(self.queryset.db)

        self.init_progress(direction)

        while comp(current_pk, last_pk):
            status.wait_until_load_low(self.status_thresholds)

            start_pk = current_pk
            current_pk = current_pk + self.chunk_size * direction
            # Don't process rows that didn't exist at start of iteration
            if direction == 1:
                end_pk = min(current_pk, last_pk + 1)
            else:
                end_pk = max(current_pk, last_pk - 1)

            with StopWatch() as timer, self.maybe_atomic:
                if direction == 1:
                    chunk = self.queryset.filter(pk__gte=start_pk, pk__lt=end_pk)
                else:
                    chunk = self.queryset.filter(pk__lte=start_pk, pk__gt=end_pk)
                # Attach the start_pk, end_pk onto the chunk queryset so they
                # can be read by SmartRangeIterator or other client code
                chunk._smart_iterator_pks = (start_pk, end_pk)

                yield chunk

                self.update_progress(direction, chunk, end_pk)

            self.adjust_chunk_size(chunk, timer.total_time)

        self.end_progress()

    def sanitize_queryset(self, queryset):
        if queryset.ordered:
            raise ValueError(
                "You can't use %s on a QuerySet with an ordering."
                % self.__class__.__name__
            )

        if queryset.query.low_mark or queryset.query.high_mark:
            raise ValueError(
                "You can't use %s on a sliced QuerySet." % self.__class__.__name__
            )

        pk = queryset.model._meta.pk
        allowed_field = isinstance(pk, self.ALLOWED_PK_FIELD_CLASSES) or (
            isinstance(pk, models.ForeignKey)
            and isinstance(pk.foreign_related_fields[0], self.ALLOWED_PK_FIELD_CLASSES)
        )
        if not allowed_field:
            # If your custom field class should be allowed, just add it to
            # ALLOWED_PK_FIELD_CLASSES
            raise ValueError(
                "You can't use %s on a model with a non-integer primary key."
                % self.__class__.__name__
            )

        return queryset.order_by("pk")

    ALLOWED_PK_FIELD_CLASSES = (
        models.IntegerField,  # Also covers e.g. PositiveIntegerField
        models.AutoField,  # Is an integer field but doesn't subclass it :(
    )

    def get_first_and_last(self):
        if isinstance(self.pk_range, tuple) and len(self.pk_range) == 2:
            should_be_reversed = (
                self.pk_range[1] < self.pk_range[0]
                and self.queryset.query.standard_ordering
            )
            if should_be_reversed:
                self.queryset = self.queryset.reverse()
            return self.pk_range
        elif self.pk_range == "all":
            base_qs = self.queryset.model.objects.using(self.queryset.db).all()
        elif self.pk_range is None:
            base_qs = self.queryset
        else:
            raise ValueError(
                "Unrecognized value for pk_range: {}".format(self.pk_range)
            )

        if not base_qs.query.standard_ordering:  # It's reverse()d
            base_qs = base_qs.reverse()

        min_qs = base_qs.order_by("pk").values_list("pk", flat=True)
        max_qs = base_qs.order_by("-pk").values_list("pk", flat=True)
        try:
            min_pk = min_qs[0]
        except IndexError:
            # We're working on an empty QuerySet, yield no chunks
            max_pk = min_pk = 0
        else:
            try:
                max_pk = max_qs[0]
            except IndexError:
                # Fix possible race condition - max_qs could find nothing if
                # all rows (including that with id min_pk) were processed
                # between finding min_pk and the above [0]
                max_pk = min_pk

        if self.queryset.query.standard_ordering:
            return (min_pk, max_pk)
        else:
            return (max_pk, min_pk)

    def constrain_size(self, chunk_size):
        return max(min(chunk_size, self.chunk_max), self.chunk_min)

    def adjust_chunk_size(self, chunk, chunk_time):
        # If the queryset is not being fetched as-is, e.g. its .delete() is
        # called, we can't know how many objects were affected, so we just
        # assume they all exist/existed
        if chunk._result_cache is None:
            num_processed = self.chunk_size
        else:
            num_processed = len(chunk)

        if num_processed > 0:
            new_chunk_size = self.rate.update(num_processed, chunk_time)
        else:
            new_chunk_size = self.chunk_size

        if new_chunk_size < 1:  # pragma: no cover
            new_chunk_size = 1

        self.chunk_size = self.constrain_size(new_chunk_size)

    def init_progress(self, direction):
        if not self.report_progress:
            return

        self.start_time = time.time()
        self.old_report = ""
        self.objects_done = 0
        self.chunks_done = 0
        if self.total is None:  # User didn't pass in a total
            try:
                self.total = approx_count(self.queryset)
                if self.total < 1000:
                    self.total = self.queryset.count()
            except ValueError:  # Cannot be approximately counted
                self.total = self.queryset.count()  # Fallback - will be slow

        self.update_progress(direction)

    def update_progress(self, direction, chunk=None, end_pk=None):
        if not self.report_progress:
            return

        if chunk is not None:
            self.chunks_done += 1
            if self.objects_done != "???":
                # If the queryset is not being fetched as-is, e.g. its
                # .delete() is called, we can't know how many objects were
                # affected, so we just bum out and write "???".
                if chunk._result_cache is None:
                    self.objects_done = "???"
                else:
                    self.objects_done += len(chunk)

        try:
            percent_complete = 100 * (float(self.objects_done) / self.total)
        except (ZeroDivisionError, ValueError):
            percent_complete = 0.0

        report = "{} {} processed {}/{} objects ({:.2f}%) in {} chunks".format(
            self.model_name,
            self.__class__.__name__,
            self.objects_done,
            self.total,
            percent_complete,
            self.chunks_done,
        )

        if end_pk is not None:
            report += "; {dir} pk so far {end_pk}".format(
                dir="highest" if direction == 1 else "lowest", end_pk=end_pk
            )

        if self.objects_done != "???" and self.rate.avg_rate:
            n_remaining = self.total - self.objects_done
            s_remaining = max(0, int(n_remaining // self.rate.avg_rate))
            report += ", {} remaining".format(format_duration(s_remaining))

        # Add spaces to avoid problem with reverse iteration, see #177.
        spacing = " " * max(0, len(self.old_report) - len(report))

        if self.old_report:
            # Reset line on successive outputs
            sys.stdout.write("\r")
        sys.stdout.write(report)
        sys.stdout.write(spacing)
        sys.stdout.flush()

        self.old_report = report

    def end_progress(self):
        if not self.report_progress:
            return

        total_time = int(time.time() - self.start_time)
        sys.stdout.write(
            "\nFinished! Iterated over {n} object{s} in {duration}.\n".format(
                n=self.objects_done,
                s="s" if self.objects_done != 1 else "",
                duration=format_duration(total_time),
            )
        )

    @cached_property
    def model_name(self):
        return self.queryset.model.__name__
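# A minimal usage sketch for the class above. `Author` is a hypothetical
# model used only for illustration; in django-mysql the same pattern is
# also exposed as QuerySet.iter_smart_chunks(). Each yielded chunk is an
# ordinary QuerySet bounded by a pk range (and wrapped in a transaction
# when atomically=True), so bulk operations on it stay small and fast.
for chunk in SmartChunkedIterator(
    Author.objects.filter(address=""),
    report_progress=True,
):
    chunk.update(address="Nowhere")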
def test_zero_division(self):
    rate = WeightedAverageRate(0.5)
    assert rate.update(1, 0.0) == 500
class SmartChunkedIterator(object):
    def __init__(self, queryset, atomically=True, status_thresholds=None,
                 pk_range=None, chunk_time=0.5, chunk_size=2, chunk_min=1,
                 chunk_max=10000, report_progress=False, total=None):
        self.queryset = self.sanitize_queryset(queryset)

        if atomically:
            self.maybe_atomic = atomic
        else:
            # Workaround for the `with` statement not supporting a variable
            # number of contexts
            self.maybe_atomic = noop_context

        self.status_thresholds = status_thresholds
        self.pk_range = pk_range

        self.rate = WeightedAverageRate(chunk_time)
        assert 0 < chunk_min <= chunk_max, \
            "Minimum chunk size should not be greater than maximum chunk size."
        self.chunk_min = chunk_min
        self.chunk_max = chunk_max
        self.chunk_size = self.constrain_size(chunk_size)

        self.report_progress = report_progress
        self.total = total

    def __iter__(self):
        first_pk, last_pk = self.get_first_and_last()
        if first_pk <= last_pk:
            comp = operator.le  # <=
            direction = 1
        else:
            comp = operator.ge  # >=
            direction = -1
        current_pk = first_pk
        db_alias = self.queryset.db
        status = GlobalStatus(db_alias)

        self.init_progress(direction)

        while comp(current_pk, last_pk):
            status.wait_until_load_low(self.status_thresholds)

            start_pk = current_pk
            current_pk = current_pk + self.chunk_size * direction
            # Don't process rows that didn't exist at start of iteration
            if direction == 1:
                end_pk = min(current_pk, last_pk + 1)
            else:
                end_pk = max(current_pk, last_pk - 1)

            with StopWatch() as timer, self.maybe_atomic(using=db_alias):
                if direction == 1:
                    chunk = self.queryset.filter(pk__gte=start_pk,
                                                 pk__lt=end_pk)
                else:
                    chunk = self.queryset.filter(pk__lte=start_pk,
                                                 pk__gt=end_pk)
                # Attach the start_pk, end_pk onto the chunk queryset so they
                # can be read by SmartRangeIterator or other client code
                chunk._smart_iterator_pks = (start_pk, end_pk)

                yield chunk

                self.update_progress(direction, chunk, end_pk)

            self.adjust_chunk_size(chunk, timer.total_time)

        self.end_progress()

    def sanitize_queryset(self, queryset):
        if queryset.ordered:
            raise ValueError(
                "You can't use %s on a QuerySet with an ordering."
                % self.__class__.__name__
            )

        if queryset.query.low_mark or queryset.query.high_mark:
            raise ValueError(
                "You can't use %s on a sliced QuerySet."
                % self.__class__.__name__
            )

        pk = queryset.model._meta.pk
        if not isinstance(pk, self.ALLOWED_PK_FIELD_CLASSES):
            # If your custom field class should be allowed, just add it to
            # ALLOWED_PK_FIELD_CLASSES
            raise ValueError(
                "You can't use %s on a model with a non-integer primary key."
                % self.__class__.__name__
            )

        return queryset.order_by('pk')

    ALLOWED_PK_FIELD_CLASSES = (
        models.IntegerField,  # Also covers e.g. PositiveIntegerField
        models.AutoField,  # Is an integer field but doesn't subclass it :(
        models.ForeignKey,  # Should always point to an integer
    )

    def get_first_and_last(self):
        if isinstance(self.pk_range, tuple) and len(self.pk_range) == 2:
            should_be_reversed = (
                self.pk_range[1] < self.pk_range[0]
                and self.queryset.query.standard_ordering
            )
            if should_be_reversed:
                self.queryset = self.queryset.reverse()
            return self.pk_range
        elif self.pk_range == 'all':
            base_qs = self.queryset.model.objects.using(self.queryset.db).all()
        elif self.pk_range is None:
            base_qs = self.queryset
        else:
            raise ValueError("Unrecognized value for pk_range: {}"
                             .format(self.pk_range))

        if not base_qs.query.standard_ordering:  # It's reverse()d
            base_qs = base_qs.reverse()

        min_qs = base_qs.order_by('pk').values_list('pk', flat=True)
        max_qs = base_qs.order_by('-pk').values_list('pk', flat=True)
        try:
            min_pk = min_qs[0]
        except IndexError:
            # We're working on an empty QuerySet, yield no chunks
            max_pk = min_pk = 0
        else:
            try:
                max_pk = max_qs[0]
            except IndexError:
                # Fix possible race condition - max_qs could find nothing if
                # all rows (including that with id min_pk) were processed
                # between finding min_pk and the above [0]
                max_pk = min_pk

        if self.queryset.query.standard_ordering:
            return (min_pk, max_pk)
        else:
            return (max_pk, min_pk)

    def constrain_size(self, chunk_size):
        return max(min(chunk_size, self.chunk_max), self.chunk_min)

    def adjust_chunk_size(self, chunk, chunk_time):
        # If the queryset is not being fetched as-is, e.g. its .delete() is
        # called, we can't know how many objects were affected, so we just
        # assume they all exist/existed
        if chunk._result_cache is None:
            num_processed = self.chunk_size
        else:
            num_processed = len(chunk)

        if num_processed > 0:
            new_chunk_size = self.rate.update(num_processed, chunk_time)
        else:
            new_chunk_size = self.chunk_size

        if new_chunk_size < 1:  # pragma: no cover
            new_chunk_size = 1

        self.chunk_size = self.constrain_size(new_chunk_size)

    def init_progress(self, direction):
        if not self.report_progress:
            return

        self.start_time = time.time()
        self.old_report = ""
        self.objects_done = 0
        self.chunks_done = 0
        if self.total is None:  # User didn't pass in a total
            try:
                self.total = approx_count(self.queryset)
                if self.total < 1000:
                    self.total = self.queryset.count()
            except ValueError:  # Cannot be approximately counted
                self.total = self.queryset.count()  # Fallback - will be slow

        self.update_progress(direction)

    def update_progress(self, direction, chunk=None, end_pk=None):
        if not self.report_progress:
            return

        if chunk is not None:
            self.chunks_done += 1
            if self.objects_done != "???":
                # If the queryset is not being fetched as-is, e.g. its
                # .delete() is called, we can't know how many objects were
                # affected, so we just bum out and write "???".
                if chunk._result_cache is None:
                    self.objects_done = "???"
                else:
                    self.objects_done += len(chunk)

        try:
            percent_complete = 100 * (float(self.objects_done) / self.total)
        except (ZeroDivisionError, ValueError):
            percent_complete = 0.0

        report = "{} {} processed {}/{} objects ({:.2f}%) in {} chunks".format(
            self.model_name,
            self.__class__.__name__,
            self.objects_done,
            self.total,
            percent_complete,
            self.chunks_done,
        )

        if end_pk is not None:
            report += "; {dir} pk so far {end_pk}".format(
                dir="highest" if direction == 1 else "lowest",
                end_pk=end_pk,
            )

        if self.objects_done != '???' and self.rate.avg_rate:
            n_remaining = self.total - self.objects_done
            s_remaining = max(0, int(n_remaining // self.rate.avg_rate))
            report += ', {} remaining'.format(
                format_duration(s_remaining)
            )

        # Add spaces to avoid problem with reverse iteration, see #177.
        spacing = " " * max(0, len(self.old_report) - len(report))

        if self.old_report:
            # Reset line on successive outputs
            sys.stdout.write("\r")
        sys.stdout.write(report)
        sys.stdout.write(spacing)
        sys.stdout.flush()

        self.old_report = report

    def end_progress(self):
        if not self.report_progress:
            return

        total_time = time.time() - self.start_time
        sys.stdout.write(
            "\nFinished! Iterated over {n} object{s} in {duration}.\n".format(
                n=self.objects_done,
                s='s' if self.objects_done != 1 else '',
                duration=format_duration(total_time)
            )
        )

    @cached_property
    def model_name(self):
        return self.queryset.model.__name__
class SmartChunkedIterator(object):
    def __init__(self, queryset, atomically=True, status_thresholds=None,
                 pk_range=None, chunk_time=0.5, chunk_max=10000,
                 report_progress=False, total=None):
        self.queryset = self.sanitize_queryset(queryset)

        if atomically:
            self.maybe_atomic = atomic
        else:
            # Workaround for the `with` statement not supporting a variable
            # number of contexts
            self.maybe_atomic = noop_context

        self.status_thresholds = status_thresholds
        self.pk_range = pk_range

        self.rate = WeightedAverageRate(chunk_time)
        self.chunk_size = 2  # Small but will expand rapidly anyhow
        self.chunk_max = chunk_max

        self.report_progress = report_progress
        self.total = total

    def __iter__(self):
        min_pk, max_pk = self.get_min_and_max()
        current_pk = min_pk
        db_alias = self.queryset.db
        status = GlobalStatus(db_alias)

        self.init_progress()

        while current_pk <= max_pk:
            status.wait_until_load_low(self.status_thresholds)

            start_pk = current_pk
            current_pk = current_pk + self.chunk_size
            # Don't process rows that didn't exist at start of iteration
            end_pk = min(current_pk, max_pk + 1)

            with StopWatch() as timer, self.maybe_atomic(using=db_alias):
                chunk = self.queryset.filter(pk__gte=start_pk, pk__lt=end_pk)

                yield chunk

                self.update_progress(chunk=chunk, end_pk=end_pk)

            self.adjust_chunk_size(chunk, timer.total_time)

        self.end_progress()

    def sanitize_queryset(self, queryset):
        if queryset.ordered:
            raise ValueError(
                "You can't use %s on a QuerySet with an ordering."
                % self.__class__.__name__
            )

        if queryset.query.low_mark or queryset.query.high_mark:
            raise ValueError(
                "You can't use %s on a sliced QuerySet."
                % self.__class__.__name__
            )

        pk = queryset.model._meta.pk
        if not isinstance(pk, (models.IntegerField, models.AutoField)):
            raise ValueError(
                "You can't use %s on a model with a non-integer primary key."
                % self.__class__.__name__
            )

        return queryset.order_by('pk')

    def get_min_and_max(self):
        if isinstance(self.pk_range, tuple) and len(self.pk_range) == 2:
            return self.pk_range
        elif self.pk_range == 'all':
            base_qs = self.queryset.model.objects.using(self.queryset.db).all()
        elif self.pk_range is None:
            base_qs = self.queryset
        else:
            raise ValueError("Unrecognized value for pk_range: {}"
                             .format(self.pk_range))

        min_qs = base_qs.order_by('pk').values_list('pk', flat=True)
        max_qs = base_qs.order_by('-pk').values_list('pk', flat=True)
        try:
            min_pk = min_qs[0]
        except IndexError:
            # We're working on an empty QuerySet, yield no chunks
            max_pk = min_pk = 0
        else:
            try:
                max_pk = max_qs[0]
            except IndexError:
                # Fix possible race condition - max_qs could find nothing if
                # all rows (including that with id min_pk) were processed
                # between finding min_pk and the above [0]
                max_pk = min_pk

        return (min_pk, max_pk)

    def adjust_chunk_size(self, chunk, chunk_time):
        # If the queryset is not being fetched as-is, e.g. its .delete() is
        # called, we can't know how many objects were affected, so we just
        # assume they all exist/existed
        if chunk._result_cache is None:
            num_processed = self.chunk_size
        else:
            num_processed = len(chunk)

        if num_processed > 0:
            new_chunk_size = self.rate.update(num_processed, chunk_time)
        else:
            new_chunk_size = self.chunk_size

        if new_chunk_size < 1:  # pragma: no cover
            new_chunk_size = 1
        if new_chunk_size > self.chunk_max:
            new_chunk_size = self.chunk_max
        self.chunk_size = new_chunk_size

    def init_progress(self):
        if not self.report_progress:
            return

        self.have_reported = False
        self.objects_done = 0
        self.chunks_done = 0
        if self.total is None:  # User didn't pass in a total
            try:
                self.total = approx_count(self.queryset)
                if self.total < 1000:
                    self.total = self.queryset.count()
            except ValueError:  # Cannot be approximately counted
                self.total = self.queryset.count()  # Fallback - will be slow

        self.update_progress()

    def update_progress(self, chunk=None, end_pk=None):
        if not self.report_progress:
            return

        if chunk is not None:
            self.chunks_done += 1
            if self.objects_done != "???":
                # If the queryset is not being fetched as-is, e.g. its
                # .delete() is called, we can't know how many objects were
                # affected, so we just bum out and write "???".
                if chunk._result_cache is None:
                    self.objects_done = "???"
                else:
                    self.objects_done += len(chunk)

        try:
            percent_complete = 100 * (float(self.objects_done) / self.total)
        except (ZeroDivisionError, ValueError):
            percent_complete = 0.0

        if not self.have_reported:
            self.have_reported = True
        else:
            # Reset line on successive outputs
            sys.stdout.write("\r")

        sys.stdout.write(
            "{} {} processed {}/{} objects ({:.2f}%) in {} chunks".format(
                self.model_name,
                self.__class__.__name__,
                self.objects_done,
                self.total,
                percent_complete,
                self.chunks_done,
            )
        )

        if end_pk is not None:
            sys.stdout.write("; highest pk so far {}".format(end_pk))

        sys.stdout.flush()

    def end_progress(self):
        if not self.report_progress:
            return

        sys.stdout.write("\nFinished!\n")

    @cached_property
    def model_name(self):
        return self.queryset.model.__name__
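# The older variants above assign `noop_context` so that the single
# `with ... self.maybe_atomic(using=db_alias):` statement works whether or
# not a transaction was requested. That helper isn't shown in this file;
# here is a minimal sketch of a compatible stand-in (the real helper may
# differ):
from contextlib import contextmanager

@contextmanager
def noop_context(*args, **kwargs):
    # Accept and ignore arguments such as using=..., mirroring atomic()'s
    # call signature, then do nothing
    yield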