def _expand(results, key, queryset, serializer):
    from documentcloud.documents.tasks import solr_index

    ids = {r[key] for r in results if key in r}
    objs = queryset.filter(pk__in=ids)
    obj_dict = {obj.pk: serializer(obj).data for obj in objs}
    for result in results:
        # user and organization should always be available, re-index if they are not
        if key not in result:
            solr_index.delay(result["id"])
        else:
            result[key] = obj_dict.get(result[key])
    return results

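# A minimal usage sketch for _expand, assuming it is used to inline user and
# organization data into raw Solr search results. `User` and `UserSerializer`
# are hypothetical stand-ins for the real queryset and serializer arguments:
#
#   results = [{"id": "1", "user": 2}, {"id": "3"}]
#   results = _expand(results, "user", User.objects.all(), UserSerializer)
#   # the first result's "user" id is replaced by the serialized user;
#   # the second result is missing "user", so it is queued for re-indexing
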
def partial_update(self, request, pk=None, document_pk=None):
    document = self.get_object(edit=True)
    serializer = DataAddRemoveSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    if pk in document.data:
        document.data[pk].extend(serializer.data.get("values", []))
        document.data[pk] = [
            i for i in document.data[pk] if i not in serializer.data.get("remove", [])
        ]
    else:
        document.data[pk] = serializer.data.get("values", [])
    # remove duplicate values
    document.data[pk] = list(set(document.data[pk]))
    if not document.data[pk]:
        # remove key if all values are removed
        del document.data[pk]
    document.save()
    transaction.on_commit(
        lambda: solr_index.delay(document.pk, field_updates={f"data_{pk}": "set"})
    )
    return Response(document.data)

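# Note on the dedup above (the same pattern appears in update further down):
# list(set(values)) removes duplicates but does not preserve the submitted
# order. If order mattered, dict.fromkeys would be an order-preserving
# alternative; a self-contained sketch:

def _dedup_preserving_order(values):
    """Remove duplicates while keeping first-seen order (Python 3.7+)."""
    return list(dict.fromkeys(values))  # ["b", "a", "b"] -> ["b", "a"]
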
def perform_create(self, serializer):
    bulk = hasattr(serializer, "many") and serializer.many
    if bulk:
        file_urls = [d.pop("file_url", None) for d in serializer.validated_data]
        force_ocrs = [d.pop("force_ocr", False) for d in serializer.validated_data]
    else:
        file_urls = [serializer.validated_data.pop("file_url", None)]
        force_ocrs = [serializer.validated_data.pop("force_ocr", False)]
    documents = serializer.save(
        user=self.request.user, organization=self.request.user.organization
    )
    if not bulk:
        documents = [documents]
    for document, file_url, force_ocr in zip(documents, file_urls, force_ocrs):
        transaction.on_commit(lambda d=document: solr_index.delay(d.pk))
        if file_url is not None:
            transaction.on_commit(
                lambda d=document, fu=file_url, fo=force_ocr: fetch_file_url.delay(
                    fu, d.pk, fo
                )
            )

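# Why the deferred callbacks above bind loop variables as default arguments:
# a plain closure captures `document` by reference, so every on_commit
# callback would fire with the last document in the loop. A self-contained
# sketch of the difference:

def _late_binding_demo():
    late = [lambda: i for i in range(3)]       # all three share the same `i`
    bound = [lambda i=i: i for i in range(3)]  # each binds `i` at definition
    assert [f() for f in late] == [2, 2, 2]
    assert [f() for f in bound] == [0, 1, 2]
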
def create(self, request, *args, **kwargs):
    """Initiate asynchronous creation of entities"""
    # pylint: disable=unused-argument
    if not request.user.has_perm("documents.change_document", self.document):
        raise exceptions.PermissionDenied(
            "You do not have permission to edit this document"
        )
    with transaction.atomic():
        # We select for update here to lock the document between checking if it is
        # processing and starting the entity extraction to ensure another
        # thread does not start processing this document before we mark it as
        # processing
        document = Document.objects.select_for_update().get(pk=self.document.pk)
        if document.processing:
            return Response(
                {"error": "Already processing"}, status=status.HTTP_400_BAD_REQUEST
            )
        if document.entities.exists():
            return Response(
                {"error": "Entities already created"},
                status=status.HTTP_400_BAD_REQUEST,
            )
        document.status = Status.readable
        document.save()
        transaction.on_commit(
            lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
        )
        transaction.on_commit(lambda: extract_entities.delay(self.document.pk))
    return Response("OK")

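# A hedged sketch of the locking pattern above: select_for_update() takes a
# row-level lock that is held until the surrounding atomic block commits, so
# the "is it processing?" check and the status change are atomic with respect
# to concurrent requests, and on_commit defers the Celery dispatch until the
# new status is actually visible to the workers.
#
#   with transaction.atomic():
#       doc = Document.objects.select_for_update().get(pk=pk)  # lock the row
#       if doc.processing:   # no other request can pass this check until
#           ...              # we commit or roll back
#       doc.status = Status.readable
#       doc.save()
#       transaction.on_commit(lambda: extract_entities.delay(doc.pk))
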
def create(self, request, *args, **kwargs):
    document = self.get_object()
    serializer = self.get_serializer(data={"data": request.data})
    serializer.is_valid(raise_exception=True)
    with transaction.atomic():
        # We select for update here to lock the document between checking if it is
        # processing and starting the page modification to ensure another
        # thread does not start processing this document before we mark it as
        # processing
        document = Document.objects.select_for_update().get(pk=document.pk)
        if document.processing:
            return Response(
                {"error": "Already processing"}, status=status.HTTP_400_BAD_REQUEST
            )
        document.status = Status.pending
        document.save()
        transaction.on_commit(
            lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
        )
        modify.delay(
            document.pk,
            document.page_count,
            document.slug,
            document.access,
            serializer.data,
        )
    return Response(serializer.data, status=status.HTTP_201_CREATED)

def _add_asset_url(results):
    from documentcloud.documents.tasks import solr_index

    for result in results:
        # access and status should always be available, re-index if they are not
        if "access" not in result or "status" not in result:
            solr_index.delay(result["id"])
            result["asset_url"] = settings.PRIVATE_ASSET_URL
        elif result["access"] == "public" and result["status"] in (
            "success",
            "readable",
        ):
            result["asset_url"] = settings.PUBLIC_ASSET_URL
        else:
            result["asset_url"] = settings.PRIVATE_ASSET_URL
    return results

def create(self, request, *args, **kwargs):
    with transaction.atomic():
        document = self.get_object()
        serializer = self.get_serializer(data=request.data, many=True)
        serializer.is_valid(raise_exception=True)
        if document.processing:
            return Response(
                {"error": "Already processing"}, status=status.HTTP_400_BAD_REQUEST
            )
        document.status = Status.pending
        # we must invalidate the cache after a redaction
        document.cache_dirty = True
        document.save()
        transaction.on_commit(
            lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
        )
        redact.delay(
            document.pk,
            document.slug,
            document.access,
            Language.get_choice(document.language).ocr_code,
            serializer.data,
        )
    return Response(serializer.data, status=status.HTTP_201_CREATED)

def perform_destroy(self, instance):
    """Index the note changes in Solr"""
    super().perform_destroy(instance)
    transaction.on_commit(
        lambda: solr_index.delay(
            self.kwargs["document_pk"], field_updates={"notes": "set"}
        )
    )

def perform_update(self, serializer):
    """Index the note changes in Solr"""
    super().perform_update(serializer)
    transaction.on_commit(
        lambda: solr_index.delay(
            self.kwargs["document_pk"], field_updates={"notes": "set"}
        )
    )

def perform_create(self, serializer):
    """Specify the document

    Set the status of the document to error
    """
    serializer.save(document_id=self.document.pk)
    self.document.status = Status.error
    self.document.save()
    transaction.on_commit(
        lambda: solr_index.delay(self.document.pk, field_updates={"status": "set"})
    )

def perform_create(self, serializer):
    """Specify the document, user and organization"""
    serializer.save(
        document_id=self.kwargs["document_pk"],
        user=self.request.user,
        organization=self.request.user.organization,
    )
    transaction.on_commit(
        lambda: solr_index.delay(
            self.kwargs["document_pk"], field_updates={"notes": "set"}
        )
    )

def update(self, request, pk=None, document_pk=None):
    document = self.get_object(edit=True)
    serializer = self.serializer_class(data=request.data)
    serializer.is_valid(raise_exception=True)
    # remove duplicate values
    document.data[pk] = list(set(serializer.data["values"]))
    document.save()
    transaction.on_commit(
        lambda: solr_index.delay(document.pk, field_updates={f"data_{pk}": "set"})
    )
    return Response(document.data)

def destroy(self, request, pk=None, document_pk=None):
    document = self.get_object(edit=True)
    if pk in document.data:
        del document.data[pk]
        document.save()
        transaction.on_commit(
            lambda: solr_index.delay(
                document.pk, field_updates={f"data_{pk}": "set"}
            )
        )
    return Response(status=status.HTTP_204_NO_CONTENT)

def _process(self, document, force_ocr):
    """Process a document after you have uploaded the file"""
    transaction.on_commit(
        lambda: process.delay(
            document.pk,
            document.slug,
            document.access,
            Language.get_choice(document.language).ocr_code,
            force_ocr,
            document.original_extension,
        )
    )
    transaction.on_commit(
        lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
    )

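# A sketch of why both dispatches above go through transaction.on_commit
# rather than calling .delay() directly: a task queued inside an open
# transaction can be picked up by a worker before the document row is
# committed and visible, so the worker would see stale (or no) data.
#
#   transaction.on_commit(lambda: process.delay(document.pk))  # safe
#   process.delay(document.pk)  # racy: the worker may run before the commit
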
def cancel_process(self, request, pk=None):
    """Cancel processing for a document"""
    # pylint: disable=unused-argument
    document = self.get_object()
    if not document.processing:
        return Response(
            {"error": "Not processing"}, status=status.HTTP_400_BAD_REQUEST
        )
    with transaction.atomic():
        document.status = Status.error
        document.save()
        transaction.on_commit(
            lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
        )
        document.errors.create(message="Processing was cancelled")
        transaction.on_commit(lambda: process_cancel.delay(document.pk))
    return Response("OK", status=status.HTTP_200_OK)

def extract_entities(document):
    """The public entry point to the module.

    The document should be set to Status.readable before this function is
    called on it.  Mainly a wrapper with error handling to ensure the
    document doesn't get stuck in a processing state.
    """
    from documentcloud.documents.tasks import solr_index

    try:
        _extract_entities(document)
    finally:
        with transaction.atomic():
            document.status = Status.success
            document.save()
            transaction.on_commit(
                lambda: solr_index.delay(
                    document.pk, field_updates={"status": "set"}
                )
            )
        logger.info("Extracting entities for %s finished", document)

def _update_solr(self, document, old_processing, old_data_keys, validated_data):
    """Update solr index after updating a document"""
    # update solr index
    if old_processing and document.status == Status.success:
        # if it was processed successfully, do a full index with text
        kwargs = {"index_text": True}
    elif old_processing:
        # if it is still processing or errored, we may not be indexed yet;
        # do a full index without text, since the text has not been processed yet
        kwargs = {"index_text": False}
    else:
        # only update the fields that were updated
        # never try to update the id
        validated_data.pop("id", None)
        data = validated_data.pop("data", None)
        if data:
            # we want to update all data keys if data is set directly,
            # including old data keys which may have been removed
            all_keys = old_data_keys | data.keys()
            for key in all_keys:
                validated_data[f"data_{key}"] = None
        kwargs = {"field_updates": {f: "set" for f in validated_data}}

    transaction.on_commit(lambda: solr_index.delay(document.pk, **kwargs))

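# A worked sketch of the field_updates computed above, assuming a document
# that is not processing, had old data keys {"tag"}, and is patched with a
# new title and data (hypothetical values):
#
#   validated_data = {"title": "New", "data": {"year": ["2020"]}}
#   all_keys = {"tag"} | {"year"}  # removed keys are re-indexed too
#   # validated_data -> {"title": "New", "data_tag": None, "data_year": None}
#   kwargs = {"field_updates": {"title": "set", "data_tag": "set",
#             "data_year": "set"}}
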
def save_model(self, request, obj, form, change):
    # ModelAdmin's hook is save_model, not save
    super().save_model(request, obj, form, change)
    transaction.on_commit(
        lambda: solr_index.delay(
            obj.pk, field_updates={f: "set" for f in form.changed_data}
        )
    )

def post_process(document, modification_data):
    """Post process the notes and sections for the document as specified
    by modifications
    """
    from documentcloud.documents.tasks import solr_index

    # Remove entities (no matter what)
    document.entities.all().delete()

    # (document.id, old_page) -> [(new_page, rotation), ...]
    page_map = _build_page_map(document, modification_data["modifications"])

    # load all documents, notes and sections
    # prefetch all notes and sections
    documents = Document.objects.prefetch_related("notes", "sections").filter(
        id__in=[doc_id for doc_id, _page in page_map]
    )

    # map all notes and sections from involved documents to their correct places
    # the first occurrence of a note or section from the original document may be
    # moved instead of copied
    create_notes, update_notes, delete_notes = [], [], []
    create_sections, update_sections, delete_sections = [], [], []
    for source_document in documents:
        creates, updates, deletes = _process_page_objs(
            page_map,
            document,
            source_document,
            source_document.notes.all(),
            remove_note,
        )
        create_notes.extend(creates)
        update_notes.extend(updates)
        delete_notes.extend(deletes)

        creates, updates, deletes = _process_page_objs(
            page_map,
            document,
            source_document,
            source_document.sections.all(),
            remove_section,
        )
        create_sections.extend(creates)
        update_sections.extend(updates)
        delete_sections.extend(deletes)

    _commit_db(
        Note,
        ["page_number", "x1", "y1", "x2", "y2"],
        create_notes,
        update_notes,
        delete_notes,
    )
    _commit_db(
        Section, ["page_number"], create_sections, update_sections, delete_sections
    )

    document.status = Status.success
    document.page_spec = modification_data["pagespec"]
    if modification_data.get("filehash"):
        document.file_hash = modification_data["filehash"]
    document.save()
    transaction.on_commit(
        lambda: solr_index.delay(
            document.pk, field_updates={"status": "set", "page_count": "set"}
        )
    )

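# A hypothetical sketch of the page_map shape documented above, for a
# modification that keeps page 0 of document 1 and copies its page 1 to two
# places (all values invented for illustration; the real map is produced by
# _build_page_map from the submitted modifications):
#
#   page_map = {
#       (1, 0): [(0, None)],             # page kept in place
#       (1, 1): [(1, None), (2, "cw")],  # page duplicated, one copy rotated
#   }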