def test_ibatch_with_list_no_progress(self):
    input = list(range(0, 100))
    # Neatly dividable batches
    iterator = ibatch(input, batch_size=20)
    self.assertIsInstance(iterator, Iterator)
    batches = [batch for batch in iterator]
    self.assertEqual(len(batches), 5)
    self.assertEqual(batches[0], list(range(0, 20)))
    self.assertEqual(batches[-1], list(range(80, 100)))
    # Rest batches
    iterator = ibatch(input, batch_size=11)
    self.assertIsInstance(iterator, Iterator)
    batches = [batch for batch in iterator]
    self.assertEqual(len(batches), 10)
    self.assertEqual(batches[0], list(range(0, 11)))
    self.assertEqual(batches[-1], list(range(99, 100)))
    # Batch size equals list length
    input = list(range(0, 10))
    iterator = ibatch(input, batch_size=10)
    self.assertIsInstance(iterator, Iterator)
    batches = [batch for batch in iterator]
    self.assertEqual(len(batches), 1)
    self.assertEqual(batches[0], list(range(0, 10)))
    # Batch size larger than list length
    input = list(range(0, 11))
    iterator = ibatch(input, batch_size=20)
    self.assertIsInstance(iterator, Iterator)
    batches = [batch for batch in iterator]
    self.assertEqual(len(batches), 1)
    self.assertEqual(batches[0], list(range(0, 11)))
def test_ibatch_with_list_progress_no_total(self):
    progress_bar_mock = Mock(spec=tqdm(disable=True, total=100))
    input = range(0, 100)
    with patch("datagrowth.utils.iterators.tqdm", return_value=progress_bar_mock) as tqdm_mock:
        # Neatly dividable batches
        iterator = ibatch(input, batch_size=20, progress_bar=True)
        self.assertIsInstance(iterator, Iterator)
        batches = [batch for batch in iterator]
        self.assertEqual(len(batches), 5)
        self.assertEqual(batches[0], list(range(0, 20)))
        self.assertEqual(batches[-1], list(range(80, 100)))
        tqdm_mock.assert_called_once_with()
        self.assertEqual(progress_bar_mock.update.call_count, 5)
        self.assertEqual(progress_bar_mock.close.call_count, 1)
    progress_bar_mock.reset_mock()
    with patch("datagrowth.utils.iterators.tqdm", return_value=progress_bar_mock) as tqdm_mock:
        # Rest batches
        iterator = ibatch(input, batch_size=11, progress_bar=True)
        self.assertIsInstance(iterator, Iterator)
        batches = [batch for batch in iterator]
        self.assertEqual(len(batches), 10)
        self.assertEqual(batches[0], list(range(0, 11)))
        self.assertEqual(batches[-1], list(range(99, 100)))
        tqdm_mock.assert_called_once_with()
        self.assertEqual(progress_bar_mock.update.call_count, 10)
        self.assertEqual(progress_bar_mock.close.call_count, 1)
    progress_bar_mock.reset_mock()
    input = range(0, 10)
    with patch("datagrowth.utils.iterators.tqdm", return_value=progress_bar_mock) as tqdm_mock:
        # Batch size equals list length
        iterator = ibatch(input, batch_size=10, progress_bar=True)
        self.assertIsInstance(iterator, Iterator)
        batches = [batch for batch in iterator]
        self.assertEqual(len(batches), 1)
        self.assertEqual(batches[0], list(range(0, 10)))
        tqdm_mock.assert_called_once_with()
        self.assertEqual(progress_bar_mock.update.call_count, 1)
        self.assertEqual(progress_bar_mock.close.call_count, 1)
    progress_bar_mock.reset_mock()
    input = range(0, 11)
    with patch("datagrowth.utils.iterators.tqdm", return_value=progress_bar_mock) as tqdm_mock:
        # Batch size larger than list length
        iterator = ibatch(input, batch_size=20, progress_bar=True)
        self.assertIsInstance(iterator, Iterator)
        batches = [batch for batch in iterator]
        self.assertEqual(len(batches), 1)
        self.assertEqual(batches[0], list(range(0, 11)))
        tqdm_mock.assert_called_once_with()
        self.assertEqual(progress_bar_mock.update.call_count, 1)
        self.assertEqual(progress_bar_mock.close.call_count, 1)
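# The two tests above pin down the observable behaviour of ibatch: it returns an Iterator that
# yields lists of at most batch_size items (the last one may be shorter), and with
# progress_bar=True it constructs a tqdm bar, calls update() once per batch and close() once.
# The following is only a minimal sketch consistent with those assertions, not the actual
# datagrowth.utils.iterators implementation.
from itertools import islice

from tqdm import tqdm


def ibatch(iterable, batch_size, progress_bar=False, total=None):
    iterator = iter(iterable)
    # Only create a bar when requested; pass total only when it is known.
    progress = (tqdm() if total is None else tqdm(total=total)) if progress_bar else None
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            break
        if progress is not None:
            progress.update(1)
        yield batch
    if progress is not None:
        progress.close()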
def __call__(self, queryset):
    # Prepare some values for serialization
    processor = self.__class__.__name__
    config = self.config.to_dict(private=True, protected=True)
    # Allow derived classes to filter the target Documents
    queryset = self.filter_documents(queryset)
    # Only target Documents that have no ProcessResult associated
    queryset = queryset.exclude(processresult__result_type=self.result_type)
    # Create batches of documents with no processing results
    batches = []
    for document_batch in ibatch(queryset, batch_size=self.config.batch_size):
        batch = self.Batch.objects.create(processor=processor)
        results = [
            self.ProcessResult(document=document, batch=batch)
            for document in document_batch
        ]
        self.ProcessResult.objects.bulk_create(results)
        batches.append(batch)
    # Create tasks and dispatch
    tasks = [
        process_and_merge.s(batch.id, config=config)
        for batch in batches
    ]
    finish = full_merge.s(processor, config=config)
    return self._dispatch_tasks(tasks, finish, asynchronous=self.config.asynchronous)
def batchify(self, phase, iterator, total):
    batches = int(math.floor(total / self.batch_size))
    rest = total % self.batch_size
    if rest:
        batches += 1
    for batch in ibatch(iterator, batch_size=self.batch_size):
        self.logger.progress(phase, batches)
        yield batch
def add(self, data, reset=False, batch_size=500, collection=None, modified_at=None, validate=True):
    """
    Add new data to the Collection in batches, possibly deleting all data before adding.

    :param data: The data to add
    :param reset: (optional) whether to delete existing data or not (no by default)
    :param batch_size: (optional) how many instances to add in a single batch (default: 500)
    :param collection: (optional) a collection instance to add the data to (default: self)
    :param modified_at: (optional) the datetime to use as modified_at value for the collection (default: now)
    :param validate: (deprecated) used to allow JSON schema validation before addition
    :return: The number of instances that were added
    """
    collection = collection or self
    modified_at = modified_at or make_aware(datetime.now())
    Document = collection.get_document_model()
    assert isinstance(data, (Iterator, list, tuple, dict, Document)), \
        f"Collection.add expects data to be formatted as iterable, dict or {type(Document)} not {type(data)}"

    if reset:
        self.documents.all().delete()

    def prepare_additions(data):
        prepared = []
        if isinstance(data, dict):
            document = self.init_document(data, collection=collection)
            document.clean()
            prepared.append(document)
        elif isinstance(data, Document):
            data = self.init_document(data.properties, collection=collection)
            data.clean()
            prepared.append(data)
        else:  # type is list
            for instance in data:
                prepared += prepare_additions(instance)
        return prepared

    count = 0
    for additions in ibatch(data, batch_size=batch_size):
        additions = prepare_additions(additions)
        count += len(additions)
        Document.objects.bulk_create(additions, batch_size=datagrowth_settings.DATAGROWTH_MAX_BATCH_SIZE)

    if collection.modified_at.replace(microsecond=0) != modified_at.replace(microsecond=0):
        collection.modified_at = modified_at
        collection.save()
    return count
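# A hedged usage sketch for Collection.add; "collection" is assumed to be a saved Collection
# instance from this project and the seed dicts are purely illustrative.
seeds = [{"external_id": index, "title": f"Document {index}"} for index in range(10)]
added_count = collection.add(seeds, reset=True, batch_size=500)
print(f"Added {added_count} documents")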
def copy_collection(self, collection):
    Document = collection.get_document_model()
    source_id = collection.id
    # Resetting the primary key and saving creates a new Collection row for this dataset version
    collection.pk = None
    collection.id = None
    collection.dataset_version = self
    collection.save()
    for batch in ibatch(Document.objects.filter(collection_id=source_id), batch_size=100):
        for doc in batch:
            doc.collection_id = collection.id
            doc.dataset_version = self
            # Clearing pk/id makes bulk_create insert copies instead of updating the originals
            doc.pk = None
            doc.id = None
        Document.objects.bulk_create(batch)
    return collection
def handle_label(self, label, **options):
    try:
        Model = apps.get_model(label)
    except LookupError as exc:
        log.error("Failed to find '{}': {}".format(label, exc))
        return
    assert issubclass(Model, HttpFileResource)
    log.info("Stripping from {}\r".format(Model.__name__))
    batch_size = 500
    queryset = Model.objects.filter(status=200)
    count = queryset.count()
    for batch in ibatch(queryset.iterator(), batch_size, progress_bar=True, total=count):
        for instance in batch:
            if instance.body and instance.body.startswith(options["path"]):
                instance.body = instance.body.replace(datagrowth_settings.DATAGROWTH_MEDIA_ROOT, "", 1)
                instance.save()
def update(self, data, by_reference, validate=True, batch_size=32, collection=None):
    collection = collection or self
    Document = collection.get_document_model()
    assert isinstance(data, (Iterator, list, tuple, dict, Document)), \
        f"Collection.update expects data to be formatted as iterable, dict or {type(Document)} not {type(data)}"

    count = 0
    for updates in ibatch(data, batch_size=batch_size):
        # First we bulk update by getting all objects whose identifier value matches any update's "by" value
        # and then updating these source objects.
        # One update object can potentially target multiple sources
        # if multiple objects with an identifier of "by" exist.
        updated = set()
        hashed = {update[by_reference]: update for update in updates}
        sources = {
            source[by_reference]: source
            for source in collection.documents.filter(reference__in=hashed.keys())
        }
        for source in sources.values():
            source.update(hashed[source.reference], validate=validate)
            count += 1
            updated.add(source.reference)
        Document.objects.bulk_update(
            sources.values(),
            ["properties"],
            batch_size=datagrowth_settings.DATAGROWTH_MAX_BATCH_SIZE
        )
        # After all updates we add all data that hasn't been used in any update operation
        additions = [update for identify, update in hashed.items() if identify not in updated]
        if len(additions):
            count += self.add(additions, validate=validate, batch_size=batch_size, collection=collection)
    return count
def handle_deletion_seeds(self, collection, deletion_seeds):
    self.info(f"Deleting for {collection.name} ...")
    document_delete_total = 0
    for seeds in ibatch(deletion_seeds, 32, progress_bar=self.show_progress):
        ids = [seed["external_id"] for seed in seeds]
        for id in ids:
            for doc in collection.documents.filter(collection=collection,
                                                   properties__contains={"external_id": id}):
                doc.delete()
                document_delete_total += 1
    arrangement_delete_count = 0
    for arrangement in Arrangement.objects.annotate(num_docs=Count('document')) \
            .filter(collection=collection, num_docs=0):
        arrangement.delete()
        arrangement_delete_count += 1
    return arrangement_delete_count, document_delete_total
def handle_label(self, label, **options):
    try:
        Model = apps.get_model(label)
    except LookupError as exc:
        log.error("Failed to find '{}': {}".format(label, exc))
        return
    assert issubclass(Model, HttpFileResource)
    log.info("Stripping from {}\r".format(Model.__name__))
    batch_size = 500
    queryset = Model.objects.filter(status=200)
    count = queryset.count()
    for batch in ibatch(queryset.iterator(), batch_size, progress_bar=True, total=count):
        for instance in batch:
            if instance.body and instance.body.startswith(options["path"]):
                instance.body = instance.body.replace(options["path"], "", 1)
                instance.save()
def update(self, data, by_property, batch_size=32, collection=None, modified_at=None, validate=True):
    """
    Update data in the Collection in batches, using a property value to identify which Documents to update.

    :param data: The data to use for the update
    :param by_property: The property to identify a Document with
    :param batch_size: (optional) how many instances to process in a single batch (default: 32)
    :param collection: (optional) a collection instance to update the data in (default: self)
    :param modified_at: (optional) the datetime to use as modified_at value for the collection (default: now)
    :param validate: (deprecated) used to allow JSON schema validation before updates
    :return: The number of instances that were updated or added
    """
    collection = collection or self
    modified_at = modified_at or make_aware(datetime.now())
    Document = collection.get_document_model()
    assert isinstance(data, (Iterator, list, tuple,)), \
        f"Collection.update expects data to be formatted as iterable not {type(data)}"

    count = 0
    for updates in ibatch(data, batch_size=batch_size):
        # We bulk update by getting all objects whose property matches
        # any update's "by_property" property value and then updating these source objects.
        # One update object can potentially target multiple sources
        # if multiple objects with the same value for the by_property property exist.
        updated = set()
        prepared = []
        sources_by_lookup = defaultdict(list)
        for update in updates:
            sources_by_lookup[update[by_property]].append(update)
        target_filters = Q()
        for lookup_value in sources_by_lookup.keys():
            target_filters |= Q(**{f"properties__{by_property}": lookup_value})
        for target in collection.documents.filter(target_filters):
            for update_value in sources_by_lookup[target.properties[by_property]]:
                target.update(update_value, commit=False)
                count += 1
                updated.add(target.properties[by_property])
            prepared.append(target)
        Document.objects.bulk_update(
            prepared,
            ["properties", "identity", "reference", "modified_at"],
            batch_size=datagrowth_settings.DATAGROWTH_MAX_BATCH_SIZE
        )
        # After all updates we add all data that hasn't been used in any update operation
        additions = []
        for lookup_value, sources in sources_by_lookup.items():
            if lookup_value not in updated:
                additions += sources
        if len(additions):
            count += self.add(additions, batch_size=batch_size, collection=collection, modified_at=modified_at)

    if collection.modified_at.replace(microsecond=0) != modified_at.replace(microsecond=0):
        collection.modified_at = modified_at
        collection.save()
    return count
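# A hedged usage sketch for Collection.update; "collection" is again an assumed Collection
# instance and "external_id" an assumed property present on its Documents. Updates that match
# no existing Document fall through to Collection.add.
updates = [{"external_id": 1, "title": "Updated title"}]
touched_count = collection.update(updates, by_property="external_id", batch_size=32)
print(f"Updated or added {touched_count} documents")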
def queryset_to_disk(queryset, json_file, batch_size=100):
    count = queryset.all().count()
    batch_iterator = ibatch(queryset.iterator(), batch_size=batch_size, progress_bar=True, total=count)
    for batch in batch_iterator:
        batch_data = serialize("json", batch, use_natural_foreign_keys=True)
        json_file.writelines([batch_data + "\n"])
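# A hedged usage sketch for queryset_to_disk; "myapp.Document" is a hypothetical model label
# used purely for illustration. Each batch is written as one serialized JSON line.
from django.apps import apps

Document = apps.get_model("myapp.Document")
with open("documents.json", "w") as json_file:
    queryset_to_disk(Document.objects.all(), json_file, batch_size=100)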