Пример #1
0
 def test_good_guess(self):
     """One slow chunk, then on-target chunks, keeps recommending 50."""
     rate = WeightedAverageRate(0.5)
     # The first report is 100 rows in 1.0s; 50 is expected back.
     assert rate.update(100, 1.0) == 50
     # Reporting 50 rows in 0.5s from then on should leave the answer at 50.
     for _ in range(3):
         assert rate.update(50, 0.5) == 50
Пример #2
0
 def test_slow(self):
     """Repeated chunks of 100 rows in 1.0s should each recommend 50."""
     rate = WeightedAverageRate(0.5)
     for _ in range(3):
         assert rate.update(100, 1.0) == 50
Пример #3
0
 def test_fast(self):
     """Repeated chunks of 100 rows in 0.25s should each recommend 200."""
     rate = WeightedAverageRate(0.5)
     for _ in range(3):
         assert rate.update(100, 0.25) == 200
Пример #4
0
 def test_fast(self):
     """A steady 100 rows per 0.25s is expected to yield 200 every time."""
     rate = WeightedAverageRate(0.5)
     rows, seconds, expected = 100, 0.25, 200
     for _ in range(3):
         assert rate.update(rows, seconds) == expected
Пример #5
0
 def test_constant(self):
     """Repeated chunks of 100 rows in 0.5s should keep recommending 100."""
     rate = WeightedAverageRate(0.5)
     for _ in range(3):
         assert rate.update(100, 0.5) == 100
Пример #6
0
 def test_slow(self):
     """A steady 100 rows per 1.0s is expected to yield 50 every time."""
     rate = WeightedAverageRate(0.5)
     rows, seconds, expected = 100, 1.0, 50
     for _ in range(3):
         assert rate.update(rows, seconds) == expected
Пример #7
0
 def test_constant(self):
     """A steady 100 rows per 0.5s is expected to yield 100 every time."""
     rate = WeightedAverageRate(0.5)
     rows, seconds, expected = 100, 0.5, 100
     for _ in range(3):
         assert rate.update(rows, seconds) == expected
Пример #8
0
 def test_good_guess(self):
     """After a slow start, chunks of 50 rows in 0.5s stay rated at 50."""
     rate = WeightedAverageRate(0.5)
     # One slow sample (100 rows in 1.0s) followed by three on-target ones.
     samples = [(100, 1.0)] + [(50, 0.5)] * 3
     for rows, seconds in samples:
         assert rate.update(rows, seconds) == 50
Пример #9
0
class SmartChunkedIterator:
    """
    Walk a QuerySet in primary-key ranges, yielding one filtered QuerySet
    ("chunk") per range.

    After every chunk the width of the next range is recalculated by a
    ``WeightedAverageRate`` from the time the chunk took, and kept within
    ``chunk_min``..``chunk_max``.
    """

    def __init__(
        self,
        queryset,
        atomically=True,
        status_thresholds=None,
        pk_range=None,
        chunk_time=0.5,
        chunk_size=2,
        chunk_min=1,
        chunk_max=10000,
        report_progress=False,
        total=None,
    ):
        """
        :param queryset: unordered, unsliced QuerySet on a model with an
            allowed primary key type (see ``sanitize_queryset``).
        :param atomically: when true, each chunk is processed inside
            ``atomic(using=<queryset db>)``.
        :param status_thresholds: passed to
            ``GlobalStatus.wait_until_load_low`` before every chunk.
        :param pk_range: ``None``, ``"all"`` or a ``(first, last)`` tuple;
            see ``get_first_and_last``.
        :param chunk_time: value handed to ``WeightedAverageRate``.
        :param chunk_size: width of the first chunk, clamped to the limits.
        :param chunk_min: smallest chunk width allowed (must be > 0).
        :param chunk_max: largest chunk width allowed.
        :param report_progress: when true, progress is written to stdout.
        :param total: object count used for progress reports; worked out
            automatically when left as ``None``.
        """
        self.queryset = self.sanitize_queryset(queryset)

        # ``with`` cannot take a variable number of context managers, so the
        # non-atomic case plugs in a do-nothing context instead.
        if not atomically:
            self.maybe_atomic = nullcontext()
        else:
            self.maybe_atomic = atomic(using=self.queryset.db)

        self.status_thresholds = status_thresholds
        self.pk_range = pk_range

        self.rate = WeightedAverageRate(chunk_time)
        assert (
            0 < chunk_min <= chunk_max
        ), "Minimum chunk size should not be greater than maximum chunk size."
        self.chunk_min, self.chunk_max = chunk_min, chunk_max
        self.chunk_size = self.constrain_size(chunk_size)

        self.report_progress = report_progress
        self.total = total

    def __iter__(self):
        """Yield one chunk QuerySet per primary-key range, in order."""
        first_pk, last_pk = self.get_first_and_last()
        ascending = first_pk <= last_pk
        direction = 1 if ascending else -1
        still_in_range = operator.le if ascending else operator.ge
        status = GlobalStatus(self.queryset.db)

        self.init_progress(direction)

        current_pk = first_pk
        while still_in_range(current_pk, last_pk):
            status.wait_until_load_low(self.status_thresholds)

            start_pk = current_pk
            current_pk = start_pk + self.chunk_size * direction
            # The far bound never goes more than one step beyond last_pk, so
            # rows that appeared after iteration started are left alone.
            if ascending:
                end_pk = min(current_pk, last_pk + 1)
                bounds = {"pk__gte": start_pk, "pk__lt": end_pk}
            else:
                end_pk = max(current_pk, last_pk - 1)
                bounds = {"pk__lte": start_pk, "pk__gt": end_pk}

            with StopWatch() as timer, self.maybe_atomic:
                chunk = self.queryset.filter(**bounds)
                # Expose the bounds on the chunk itself for client code such
                # as SmartRangeIterator.
                chunk._smart_iterator_pks = (start_pk, end_pk)
                yield chunk
                self.update_progress(direction, chunk, end_pk)

            self.adjust_chunk_size(chunk, timer.total_time)

        self.end_progress()

    def sanitize_queryset(self, queryset):
        """
        Return ``queryset`` ordered by pk, or raise ``ValueError`` if it is
        already ordered, sliced, or keyed by a disallowed field type.
        """
        own_name = self.__class__.__name__

        if queryset.ordered:
            raise ValueError(
                "You can't use %s on a QuerySet with an ordering." % own_name
            )

        query = queryset.query
        if query.low_mark or query.high_mark:
            raise ValueError(
                "You can't use %s on a sliced QuerySet." % own_name
            )

        pk = queryset.model._meta.pk
        if isinstance(pk, self.ALLOWED_PK_FIELD_CLASSES):
            pk_is_usable = True
        elif isinstance(pk, models.ForeignKey):
            # A relation used as primary key is judged by the field it
            # points at.
            pk_is_usable = isinstance(
                pk.foreign_related_fields[0], self.ALLOWED_PK_FIELD_CLASSES
            )
        else:
            pk_is_usable = False

        if not pk_is_usable:
            # Custom field classes can be permitted by adding them to
            # ALLOWED_PK_FIELD_CLASSES
            raise ValueError(
                "You can't use %s on a model with a non-integer primary key."
                % own_name
            )

        return queryset.order_by("pk")

    ALLOWED_PK_FIELD_CLASSES = (
        models.IntegerField,  # Also covers e.g. PositiveIntegerField
        models.AutoField,  # Is an integer field but doesn't subclass it :(
    )

    def get_first_and_last(self):
        """
        Work out the pk to start from and the pk to finish on.

        A 2-tuple ``pk_range`` is returned as given (reversing the stored
        queryset first if the tuple runs downwards). ``"all"`` looks at the
        whole table, ``None`` at the queryset itself; anything else raises
        ``ValueError``.
        """
        pk_range = self.pk_range

        if isinstance(pk_range, tuple) and len(pk_range) == 2:
            runs_downwards = pk_range[1] < pk_range[0]
            if runs_downwards and self.queryset.query.standard_ordering:
                self.queryset = self.queryset.reverse()
            return pk_range

        if pk_range == "all":
            base_qs = self.queryset.model.objects.using(self.queryset.db).all()
        elif pk_range is None:
            base_qs = self.queryset
        else:
            raise ValueError(
                "Unrecognized value for pk_range: {}".format(pk_range)
            )

        if not base_qs.query.standard_ordering:
            # Undo a reverse() so the lookups below see the natural ordering
            base_qs = base_qs.reverse()

        lowest_first = base_qs.order_by("pk").values_list("pk", flat=True)
        highest_first = base_qs.order_by("-pk").values_list("pk", flat=True)
        try:
            min_pk = lowest_first[0]
        except IndexError:
            # Nothing matched at all: fall back to the range (0, 0)
            min_pk = max_pk = 0
        else:
            try:
                max_pk = highest_first[0]
            except IndexError:
                # Race: every row (the min_pk one included) may have been
                # processed elsewhere between the two lookups
                max_pk = min_pk

        if self.queryset.query.standard_ordering:
            return (min_pk, max_pk)
        return (max_pk, min_pk)

    def constrain_size(self, chunk_size):
        """Clamp ``chunk_size`` into ``chunk_min``..``chunk_max``."""
        capped = min(chunk_size, self.chunk_max)
        return max(capped, self.chunk_min)

    def adjust_chunk_size(self, chunk, chunk_time):
        """Pick the next chunk width from how the finished chunk went."""
        if chunk._result_cache is None:
            # The chunk was never fetched (e.g. .delete() was called on it),
            # so the real row count is unknown; assume the range was full.
            num_processed = self.chunk_size
        else:
            num_processed = len(chunk)

        # An empty chunk says nothing about speed: keep the current width.
        new_chunk_size = (
            self.rate.update(num_processed, chunk_time)
            if num_processed > 0
            else self.chunk_size
        )

        if new_chunk_size < 1:  # pragma: no cover
            new_chunk_size = 1

        self.chunk_size = self.constrain_size(new_chunk_size)

    def init_progress(self, direction):
        """Reset progress counters and print the first report, if enabled."""
        if not self.report_progress:
            return

        self.start_time = time.time()
        self.old_report = ""
        self.objects_done = self.chunks_done = 0

        if self.total is None:  # No total supplied by the caller
            try:
                estimate = approx_count(self.queryset)
                self.total = estimate
                if estimate < 1000:
                    # Below 1000 an exact count replaces the estimate
                    self.total = self.queryset.count()
            except ValueError:  # No approximate count available
                self.total = self.queryset.count()  # Fallback - will be slow

        self.update_progress(direction)

    def update_progress(self, direction, chunk=None, end_pk=None):
        """Rewrite the single-line progress report on stdout, if enabled."""
        if not self.report_progress:
            return

        if chunk is not None:
            self.chunks_done += 1
            count_is_known = self.objects_done != "???"
            if count_is_known and chunk._result_cache is None:
                # An unfetched chunk (e.g. .delete() was called) hides how
                # many objects were affected; from here on report "???".
                self.objects_done = "???"
            elif count_is_known:
                self.objects_done += len(chunk)

        try:
            percent_complete = 100 * (float(self.objects_done) / self.total)
        except (ZeroDivisionError, ValueError):
            # Zero total, or objects_done is the "???" placeholder
            percent_complete = 0.0

        report = "{} {} processed {}/{} objects ({:.2f}%) in {} chunks".format(
            self.model_name,
            self.__class__.__name__,
            self.objects_done,
            self.total,
            percent_complete,
            self.chunks_done,
        )

        if end_pk is not None:
            extreme = "highest" if direction == 1 else "lowest"
            report += "; {dir} pk so far {end_pk}".format(
                dir=extreme, end_pk=end_pk
            )

            if self.objects_done != "???" and self.rate.avg_rate:
                objects_left = self.total - self.objects_done
                seconds_left = max(
                    0, int(objects_left // self.rate.avg_rate)
                )
                report += ", {} remaining".format(
                    format_duration(seconds_left)
                )

        # Pad with spaces so a shorter report fully overwrites the previous
        # one (an issue with reverse iteration, see #177).
        shrinkage = len(self.old_report) - len(report)
        spacing = " " * max(0, shrinkage)

        if self.old_report:
            # Return to the start of the line before overwriting
            sys.stdout.write("\r")

        sys.stdout.write(report)
        sys.stdout.write(spacing)
        sys.stdout.flush()

        self.old_report = report

    def end_progress(self):
        """Print the closing summary line, if progress reporting is on."""
        if not self.report_progress:
            return

        elapsed = int(time.time() - self.start_time)
        plural = "" if self.objects_done == 1 else "s"
        summary = (
            "\nFinished! Iterated over {n} object{s} in {duration}.\n".format(
                n=self.objects_done,
                s=plural,
                duration=format_duration(elapsed),
            )
        )
        sys.stdout.write(summary)

    @cached_property
    def model_name(self):
        """Class name of the queryset's model, as shown in reports."""
        return self.queryset.model.__name__
Пример #10
0
 def test_zero_division(self):
     """An update with 0.0 elapsed seconds is expected to return 500."""
     recommended = WeightedAverageRate(0.5).update(1, 0.0)
     assert recommended == 500
Пример #11
0
 def test_zero_division(self):
     """Reporting 1 row in 0.0 seconds should give 500 rather than fail."""
     rate = WeightedAverageRate(0.5)
     rows, seconds = 1, 0.0
     assert rate.update(rows, seconds) == 500
Пример #12
0
class SmartChunkedIterator(object):
    """
    Iterate over a QuerySet in primary-key ranges, yielding one filtered
    QuerySet ("chunk") per range.

    After each chunk the size of the next range is recomputed by a
    ``WeightedAverageRate`` from how long the chunk took, then clamped to
    ``chunk_min``..``chunk_max``.
    """

    def __init__(self, queryset, atomically=True, status_thresholds=None,
                 pk_range=None, chunk_time=0.5, chunk_size=2, chunk_min=1,
                 chunk_max=10000, report_progress=False, total=None):
        """
        :param queryset: unordered, unsliced QuerySet whose model has an
            allowed primary key type (see ``sanitize_queryset``).
        :param atomically: when true, each chunk is processed inside
            ``atomic(using=<queryset db>)``.
        :param status_thresholds: passed to
            ``GlobalStatus.wait_until_load_low`` before every chunk.
        :param pk_range: ``None``, ``'all'`` or a ``(first, last)`` tuple;
            see ``get_first_and_last``.
        :param chunk_time: value handed to ``WeightedAverageRate``.
        :param chunk_size: size of the first chunk, clamped to the limits.
        :param chunk_min: smallest chunk size allowed (must be > 0).
        :param chunk_max: largest chunk size allowed.
        :param report_progress: when true, progress is written to stdout.
        :param total: object count used for progress reports; computed
            automatically when left as ``None``.
        :raises ValueError: if ``queryset`` is rejected by
            ``sanitize_queryset``.
        """
        self.queryset = self.sanitize_queryset(queryset)

        if atomically:
            self.maybe_atomic = atomic
        else:
            # Work around for `with` statement not supporting variable number
            # of contexts
            self.maybe_atomic = noop_context

        self.status_thresholds = status_thresholds
        self.pk_range = pk_range

        self.rate = WeightedAverageRate(chunk_time)
        assert 0 < chunk_min <= chunk_max, \
            "Minimum chunk size should not be greater than maximum chunk size."
        self.chunk_min = chunk_min
        self.chunk_max = chunk_max
        self.chunk_size = self.constrain_size(chunk_size)

        self.report_progress = report_progress
        self.total = total

    def __iter__(self):
        """Yield one chunk QuerySet per primary-key range, in order."""
        first_pk, last_pk = self.get_first_and_last()
        if first_pk <= last_pk:
            comp = operator.le  # <=
            direction = 1
        else:
            comp = operator.ge  # >=
            direction = -1
        current_pk = first_pk
        db_alias = self.queryset.db
        status = GlobalStatus(db_alias)

        self.init_progress(direction)

        while comp(current_pk, last_pk):
            status.wait_until_load_low(self.status_thresholds)

            start_pk = current_pk
            current_pk = current_pk + self.chunk_size * direction
            # Don't process rows that didn't exist at start of iteration
            if direction == 1:
                end_pk = min(current_pk, last_pk + 1)
            else:
                end_pk = max(current_pk, last_pk - 1)

            # The consumer's work on the chunk happens while this generator
            # is suspended at the yield, i.e. inside both the stopwatch and
            # the (optional) transaction.
            with StopWatch() as timer, self.maybe_atomic(using=db_alias):
                if direction == 1:
                    chunk = self.queryset.filter(pk__gte=start_pk,
                                                 pk__lt=end_pk)
                else:
                    chunk = self.queryset.filter(pk__lte=start_pk,
                                                 pk__gt=end_pk)
                # Attach the start_pk, end_pk onto the chunk queryset so they
                # can be read by SmartRangeIterator or other client code
                chunk._smart_iterator_pks = (start_pk, end_pk)
                yield chunk
                self.update_progress(direction, chunk, end_pk)

            self.adjust_chunk_size(chunk, timer.total_time)

        self.end_progress()

    def sanitize_queryset(self, queryset):
        """
        Return ``queryset`` ordered by pk after checking it can be chunked.

        :raises ValueError: if the queryset is already ordered or sliced, or
            if the model's primary key (after following any ForeignKey /
            OneToOneField to the field it references) is not one of
            ``ALLOWED_PK_FIELD_CLASSES``.
        """
        if queryset.ordered:
            raise ValueError(
                "You can't use %s on a QuerySet with an ordering." %
                self.__class__.__name__
            )

        if queryset.query.low_mark or queryset.query.high_mark:
            raise ValueError(
                "You can't use %s on a sliced QuerySet." %
                self.__class__.__name__
            )

        # A relation used as primary key is only chunkable when the field it
        # ultimately references is an integer, so follow the relation(s) to
        # their target before checking. Already-visited fields are tracked so
        # a self-referencing relation cannot loop forever.
        target = queryset.model._meta.pk
        followed = set()
        while (isinstance(target, models.ForeignKey) and
               id(target) not in followed):
            followed.add(id(target))
            target = target.foreign_related_fields[0]

        if not isinstance(target, self.ALLOWED_PK_FIELD_CLASSES):
            # If your custom field class should be allowed, just add it to
            # ALLOWED_PK_FIELD_CLASSES
            raise ValueError(
                "You can't use %s on a model with a non-integer primary key." %
                self.__class__.__name__
            )

        return queryset.order_by('pk')

    # ForeignKey is deliberately not listed: relations are resolved to their
    # target field in sanitize_queryset, because a relation can point at a
    # non-integer key.
    ALLOWED_PK_FIELD_CLASSES = (
        models.IntegerField,  # Also covers e.g. PositiveIntegerField
        models.AutoField,  # Is an integer field but doesn't subclass it :(
    )

    def get_first_and_last(self):
        """
        Return ``(first_pk, last_pk)``: where to start and where to finish.

        A 2-tuple ``pk_range`` is returned as given; if it runs downwards and
        the queryset is in standard ordering, the stored queryset is
        reversed first. ``'all'`` uses the whole table, ``None`` the queryset
        itself.

        :raises ValueError: for any other ``pk_range`` value.
        """
        if isinstance(self.pk_range, tuple) and len(self.pk_range) == 2:
            should_be_reversed = (
                self.pk_range[1] < self.pk_range[0] and
                self.queryset.query.standard_ordering
            )
            if should_be_reversed:
                self.queryset = self.queryset.reverse()
            return self.pk_range
        elif self.pk_range == 'all':
            base_qs = self.queryset.model.objects.using(self.queryset.db).all()
        elif self.pk_range is None:
            base_qs = self.queryset
        else:
            raise ValueError("Unrecognized value for pk_range: {}"
                             .format(self.pk_range))

        if not base_qs.query.standard_ordering:  # It's reverse()d
            base_qs = base_qs.reverse()

        min_qs = base_qs.order_by('pk').values_list('pk', flat=True)
        max_qs = base_qs.order_by('-pk').values_list('pk', flat=True)
        try:
            min_pk = min_qs[0]
        except IndexError:
            # We're working on an empty QuerySet: fall back to the range
            # (0, 0)
            max_pk = min_pk = 0
        else:
            try:
                max_pk = max_qs[0]
            except IndexError:
                # Fix possible race condition - max_qs could find nothing if
                # all rows (including that with id min_pk) were processed
                # between finding min_pk and the above [0]
                max_pk = min_pk

        if self.queryset.query.standard_ordering:
            return (min_pk, max_pk)
        else:
            return (max_pk, min_pk)

    def constrain_size(self, chunk_size):
        """Clamp ``chunk_size`` into ``chunk_min``..``chunk_max``."""
        return max(min(chunk_size, self.chunk_max), self.chunk_min)

    def adjust_chunk_size(self, chunk, chunk_time):
        """
        Set ``self.chunk_size`` for the next chunk.

        :param chunk: the chunk QuerySet that was just yielded.
        :param chunk_time: the time measured for it by the ``StopWatch``.
        """
        # If the queryset is not being fetched as-is, e.g. its .delete() is
        # called, we can't know how many objects were affected, so we just
        # assume they all exist/existed
        if chunk._result_cache is None:
            num_processed = self.chunk_size
        else:
            num_processed = len(chunk)

        # An empty chunk carries no rate information; keep the current size.
        if num_processed > 0:
            new_chunk_size = self.rate.update(num_processed, chunk_time)
        else:
            new_chunk_size = self.chunk_size

        if new_chunk_size < 1:  # pragma: no cover
            new_chunk_size = 1

        self.chunk_size = self.constrain_size(new_chunk_size)

    def init_progress(self, direction):
        """Reset progress counters and print the first report, if enabled."""
        if not self.report_progress:
            return

        self.start_time = time.time()
        self.old_report = ""
        self.objects_done = 0
        self.chunks_done = 0
        if self.total is None:  # User didn't pass in a total
            try:
                self.total = approx_count(self.queryset)
                # Below 1000 an exact count replaces the estimate
                if self.total < 1000:
                    self.total = self.queryset.count()
            except ValueError:  # Cannot be approximately counted
                self.total = self.queryset.count()  # Fallback - will be slow

        self.update_progress(direction)

    def update_progress(self, direction, chunk=None, end_pk=None):
        """
        Rewrite the single-line progress report on stdout, if enabled.

        :param direction: 1 when iterating upwards, -1 when downwards.
        :param chunk: the chunk just processed, or ``None`` for the initial
            report.
        :param end_pk: far bound of that chunk, shown in the report.
        """
        if not self.report_progress:
            return

        if chunk is not None:
            self.chunks_done += 1
            if self.objects_done != "???":
                # If the queryset is not being fetched as-is, e.g. its
                # .delete() is called, we can't know how many objects were
                # affected, so we just bum out and write "???".
                if chunk._result_cache is None:
                    self.objects_done = "???"
                else:
                    self.objects_done += len(chunk)

        try:
            percent_complete = 100 * (float(self.objects_done) / self.total)
        except (ZeroDivisionError, ValueError):
            # Zero total, or objects_done is the "???" placeholder
            percent_complete = 0.0

        report = "{} {} processed {}/{} objects ({:.2f}%) in {} chunks".format(
            self.model_name,
            self.__class__.__name__,
            self.objects_done,
            self.total,
            percent_complete,
            self.chunks_done,
        )

        if end_pk is not None:
            report += "; {dir} pk so far {end_pk}".format(
                dir="highest" if direction == 1 else "lowest",
                end_pk=end_pk,
            )

            if self.objects_done != '???' and self.rate.avg_rate:
                n_remaining = self.total - self.objects_done
                s_remaining = max(0, int(n_remaining // self.rate.avg_rate))
                report += ', {} remaining'.format(
                    format_duration(s_remaining)
                )

        # Add spaces to avoid problem with reverse iteration, see #177.
        spacing = " " * max(0, len(self.old_report) - len(report))

        if self.old_report:
            # Reset line on successive outputs
            sys.stdout.write("\r")

        sys.stdout.write(report)
        sys.stdout.write(spacing)
        sys.stdout.flush()

        self.old_report = report

    def end_progress(self):
        """Print the closing summary line, if progress reporting is on."""
        if not self.report_progress:
            return

        # Truncate to whole seconds: format_duration is given an integer
        # everywhere else in this class (see update_progress).
        total_time = int(time.time() - self.start_time)
        sys.stdout.write(
            "\nFinished! Iterated over {n} object{s} in {duration}.\n".format(
                n=self.objects_done,
                s='s' if self.objects_done != 1 else '',
                duration=format_duration(total_time)
            )
        )

    @cached_property
    def model_name(self):
        """Class name of the queryset's model, as shown in reports."""
        return self.queryset.model.__name__
Пример #13
0
class SmartChunkedIterator(object):
    """
    Walk a QuerySet in ascending primary-key ranges, yielding one filtered
    QuerySet ("chunk") per range.

    After each chunk the width of the next range is recalculated by a
    ``WeightedAverageRate`` from the time the chunk took, capped at
    ``chunk_max``.
    """

    def __init__(self, queryset, atomically=True, status_thresholds=None,
                 pk_range=None, chunk_time=0.5, chunk_max=10000,
                 report_progress=False, total=None):
        """
        :param queryset: unordered, unsliced QuerySet on a model whose
            primary key is an IntegerField or AutoField.
        :param atomically: when true, each chunk is processed inside
            ``atomic(using=<queryset db>)``.
        :param status_thresholds: passed to
            ``GlobalStatus.wait_until_load_low`` before every chunk.
        :param pk_range: ``None``, ``'all'`` or a ``(min, max)`` tuple; see
            ``get_min_and_max``.
        :param chunk_time: value handed to ``WeightedAverageRate``.
        :param chunk_max: largest chunk width allowed.
        :param report_progress: when true, progress is written to stdout.
        :param total: object count used for progress reports; worked out
            automatically when left as ``None``.
        """
        self.queryset = self.sanitize_queryset(queryset)

        # ``with`` cannot take a variable number of context managers, so the
        # non-atomic case uses a do-nothing context factory instead.
        self.maybe_atomic = atomic if atomically else noop_context

        self.status_thresholds = status_thresholds
        self.pk_range = pk_range

        self.rate = WeightedAverageRate(chunk_time)
        # Starts tiny; adjust_chunk_size re-tunes it after every chunk
        self.chunk_size = 2
        self.chunk_max = chunk_max

        self.report_progress = report_progress
        self.total = total

    def __iter__(self):
        """Yield one chunk QuerySet per primary-key range, lowest first."""
        min_pk, max_pk = self.get_min_and_max()
        db_alias = self.queryset.db
        status = GlobalStatus(db_alias)

        self.init_progress()

        current_pk = min_pk
        while current_pk <= max_pk:
            status.wait_until_load_low(self.status_thresholds)

            start_pk = current_pk
            current_pk = start_pk + self.chunk_size
            # Never reach more than one past max_pk, so rows created after
            # iteration started are left alone
            end_pk = min(current_pk, max_pk + 1)
            bounds = {'pk__gte': start_pk, 'pk__lt': end_pk}

            with StopWatch() as timer, self.maybe_atomic(using=db_alias):
                chunk = self.queryset.filter(**bounds)
                yield chunk
                self.update_progress(chunk=chunk, end_pk=end_pk)

            self.adjust_chunk_size(chunk, timer.total_time)

        self.end_progress()

    def sanitize_queryset(self, queryset):
        """
        Return ``queryset`` ordered by pk, or raise ``ValueError`` if it is
        already ordered, sliced, or not keyed by an integer field.
        """
        own_name = self.__class__.__name__

        if queryset.ordered:
            raise ValueError(
                "You can't use %s on a QuerySet with an ordering." % own_name
            )

        query = queryset.query
        if query.low_mark or query.high_mark:
            raise ValueError(
                "You can't use %s on a sliced QuerySet." % own_name
            )

        integer_pk_types = (models.IntegerField, models.AutoField)
        if not isinstance(queryset.model._meta.pk, integer_pk_types):
            raise ValueError(
                "You can't use %s on a model with a non-integer primary key."
                % own_name
            )

        return queryset.order_by('pk')

    def get_min_and_max(self):
        """
        Work out the lowest and highest pk to cover.

        A 2-tuple ``pk_range`` is returned as given. ``'all'`` looks at the
        whole table, ``None`` at the queryset itself; anything else raises
        ``ValueError``.
        """
        pk_range = self.pk_range

        if isinstance(pk_range, tuple) and len(pk_range) == 2:
            return pk_range

        if pk_range == 'all':
            base_qs = self.queryset.model.objects.using(self.queryset.db).all()
        elif pk_range is None:
            base_qs = self.queryset
        else:
            raise ValueError(
                "Unrecognized value for pk_range: {}".format(pk_range)
            )

        lowest_first = base_qs.order_by('pk').values_list('pk', flat=True)
        highest_first = base_qs.order_by('-pk').values_list('pk', flat=True)
        try:
            min_pk = lowest_first[0]
        except IndexError:
            # Nothing matched at all: fall back to the range (0, 0)
            min_pk = max_pk = 0
        else:
            try:
                max_pk = highest_first[0]
            except IndexError:
                # Race: every row (the min_pk one included) may have been
                # processed elsewhere between the two lookups
                max_pk = min_pk

        return (min_pk, max_pk)

    def adjust_chunk_size(self, chunk, chunk_time):
        """Pick the next chunk width from how the finished chunk went."""
        if chunk._result_cache is None:
            # The chunk was never fetched (e.g. .delete() was called on it),
            # so the real row count is unknown; assume the range was full.
            num_processed = self.chunk_size
        else:
            num_processed = len(chunk)

        # An empty chunk says nothing about speed: keep the current width.
        new_chunk_size = (
            self.rate.update(num_processed, chunk_time)
            if num_processed > 0
            else self.chunk_size
        )

        if new_chunk_size < 1:  # pragma: no cover
            new_chunk_size = 1

        self.chunk_size = min(new_chunk_size, self.chunk_max)

    def init_progress(self):
        """Reset progress counters and print the first report, if enabled."""
        if not self.report_progress:
            return

        self.have_reported = False
        self.objects_done = self.chunks_done = 0

        if self.total is None:  # No total supplied by the caller
            try:
                estimate = approx_count(self.queryset)
                self.total = estimate
                if estimate < 1000:
                    # Below 1000 an exact count replaces the estimate
                    self.total = self.queryset.count()
            except ValueError:  # No approximate count available
                self.total = self.queryset.count()  # Fallback - will be slow

        self.update_progress()

    def update_progress(self, chunk=None, end_pk=None):
        """Rewrite the single-line progress report on stdout, if enabled."""
        if not self.report_progress:
            return

        if chunk is not None:
            self.chunks_done += 1
            count_is_known = self.objects_done != "???"
            if count_is_known and chunk._result_cache is None:
                # An unfetched chunk (e.g. .delete() was called) hides how
                # many objects were affected; from here on report "???".
                self.objects_done = "???"
            elif count_is_known:
                self.objects_done += len(chunk)

        try:
            percent_complete = 100 * (float(self.objects_done) / self.total)
        except (ZeroDivisionError, ValueError):
            # Zero total, or objects_done is the "???" placeholder
            percent_complete = 0.0

        if self.have_reported:
            # Return to the start of the line before overwriting
            sys.stdout.write("\r")
        else:
            self.have_reported = True

        headline = "{} processed {}/{} objects ({:.2f}%) in {} chunks".format(
            self.model_name + self.__class__.__name__,
            self.objects_done,
            self.total,
            percent_complete,
            self.chunks_done,
        )
        sys.stdout.write(headline)
        if end_pk is not None:
            sys.stdout.write("; highest pk so far {}".format(end_pk))
        sys.stdout.flush()

    def end_progress(self):
        """Print the closing line, if progress reporting is on."""
        if self.report_progress:
            sys.stdout.write("\nFinished!\n")

    @cached_property
    def model_name(self):
        """Class name of the queryset's model, as shown in reports."""
        return self.queryset.model.__name__