def _normalize_document_uris_window(session, window):
    """
    Normalize the DocumentURI rows whose ``updated`` time lies in ``window``.

    Rows are visited in ascending ``updated`` order. For each row:

    * if ``Document.find_by_uris`` reports more than one document for the
      row's uri, those documents are passed to ``merge_documents``;
    * if a different row of the same document already has the same
      normalized claimant, normalized uri, type and content type, the
      current row is deleted;
    * otherwise the row's ``_claimant_normalized`` and ``_uri_normalized``
      are overwritten with freshly normalized values.

    The session is flushed after each row.

    :param session: the database session to query and modify
    :param window: an object exposing ``start`` and ``end`` bounds
    """
    DocumentURI = models.DocumentURI

    rows_in_window = (
        session.query(DocumentURI)
        .filter(DocumentURI.updated.between(window.start, window.end))
        .order_by(DocumentURI.updated.asc())
    )

    for doc_uri in rows_in_window:
        matching_documents = models.Document.find_by_uris(session, [doc_uri.uri])
        if matching_documents.count() > 1:
            merge_documents(session, matching_documents)

        # Look for another row that would collide with this one once its
        # claimant and uri have been normalized.
        duplicates = session.query(DocumentURI).filter(
            DocumentURI.id != doc_uri.id,
            DocumentURI.document_id == doc_uri.document_id,
            DocumentURI.claimant_normalized == uri.normalize(doc_uri.claimant),
            DocumentURI.uri_normalized == uri.normalize(doc_uri.uri),
            DocumentURI.type == doc_uri.type,
            DocumentURI.content_type == doc_uri.content_type,
        )

        if duplicates.count() > 0:
            # An equivalent row already exists, so this one is redundant.
            session.delete(doc_uri)
        else:
            doc_uri._claimant_normalized = uri.normalize(doc_uri.claimant)
            doc_uri._uri_normalized = uri.normalize(doc_uri.uri)

        session.flush()
def test_raises_retryable_error_when_flush_fails(
    self, db_session, merge_data, monkeypatch
):
    """An IntegrityError from flush must surface as a TransientError."""

    def failing_flush():
        # Replacement for db_session.flush that always fails.
        raise sa.exc.IntegrityError(None, None, None)

    monkeypatch.setattr(db_session, "flush", failing_flush)

    with pytest.raises(transaction.interfaces.TransientError):
        document.merge_documents(db_session, merge_data)
def _normalize_document_uris_window(session, window):
    """
    Re-normalize all DocumentURI rows updated between ``window.start`` and
    ``window.end``, oldest first.

    Documents sharing a row's uri are handed to ``merge_documents`` when
    there is more than one of them. A row is then deleted if another row of
    the same document already matches its normalized claimant, normalized
    uri, type and content type; otherwise its ``_claimant_normalized`` and
    ``_uri_normalized`` attributes are refreshed. The session is flushed
    once per processed row.

    :param session: the database session to query and modify
    :param window: an object exposing ``start`` and ``end`` bounds
    """
    updated_in_window = models.DocumentURI.updated.between(window.start, window.end)
    oldest_first = models.DocumentURI.updated.asc()
    candidates = (
        session.query(models.DocumentURI)
        .filter(updated_in_window)
        .order_by(oldest_first)
    )

    for row in candidates:
        same_uri_docs = models.Document.find_by_uris(session, [row.uri])
        if same_uri_docs.count() > 1:
            merge_documents(session, same_uri_docs)

        # Another row is a twin of this one when every condition below holds.
        twin_conditions = (
            models.DocumentURI.id != row.id,
            models.DocumentURI.document_id == row.document_id,
            models.DocumentURI.claimant_normalized == uri.normalize(row.claimant),
            models.DocumentURI.uri_normalized == uri.normalize(row.uri),
            models.DocumentURI.type == row.type,
            models.DocumentURI.content_type == row.content_type,
        )
        twins = session.query(models.DocumentURI).filter(*twin_conditions)

        if twins.count() > 0:
            # Keep the existing twin and drop this row.
            session.delete(row)
        else:
            row._claimant_normalized = uri.normalize(row.claimant)
            row._uri_normalized = uri.normalize(row.uri)

        session.flush()
def test_merge_documents_logs_when_its_called(self, caplog, db_session, merge_data):
    """merge_documents logs exactly one INFO record with the document count."""
    caplog.set_level(logging.INFO)

    document.merge_documents(db_session, merge_data)

    # logging.INFO is the numeric level 20 carried in the record tuple.
    expected_records = [("h.models.document", logging.INFO, "Merging 3 documents")]
    assert caplog.record_tuples == expected_records
def test_raises_retryable_error_when_flush_fails(
    self, db_session, merge_data, monkeypatch
):
    """An IntegrityError from flush must surface as a ConcurrentUpdateError."""

    def broken_flush():
        # Replacement for db_session.flush that always fails.
        raise sa.exc.IntegrityError(None, None, None)

    monkeypatch.setattr(db_session, "flush", broken_flush)

    with pytest.raises(ConcurrentUpdateError):
        document.merge_documents(db_session, merge_data)
def test_merge_documents_rewires_document_meta(self, db_session, merge_data):
    """After merging, the master holds 3 meta entries and the duplicates none."""
    master, duplicate_1, duplicate_2 = merge_data

    document.merge_documents(db_session, merge_data)
    db_session.flush()

    assert len(master.meta) == 3
    for emptied in (duplicate_1, duplicate_2):
        assert len(emptied.meta) == 0
def move_uri(ctx, old, new):
    """
    Move annotations and document equivalence data from one URL to another.

    This will **replace** the annotation's ``target_uri`` and all the
    document uri's ``claimant``, plus the matching ``uri`` for self-claim and
    canonical uris.
    """
    request = ctx.obj["bootstrap"]()
    db = request.db

    # Gather everything that currently refers to the old URL.
    annotations = _fetch_annotations(db, old)
    docuris_claimant = _fetch_document_uri_claimants(db, old)
    docuris_uri = _fetch_document_uri_canonical_self_claim(db, old)

    # Show the operator how much data will change and ask for confirmation.
    prompt_template = (
        "Changing all annotations and document data matching:\n"
        '"{old}"\nto:\n"{new}"\n'
        "This will affect {ann_count} annotations, {doc_claimant} "
        "document uri claimants, and {doc_uri} document uri self-claims "
        "or canonical uris.\n"
        "Are you sure? [y/N]"
    )
    prompt = prompt_template.format(
        old=old,
        new=new,
        ann_count=len(annotations),
        doc_claimant=len(docuris_claimant),
        doc_uri=len(docuris_uri),
    )
    answer = click.prompt(prompt, default="n", show_default=False)

    # Only an exact lowercase "y" proceeds; any other answer aborts.
    if answer != "y":
        print("Aborted")
        return

    for annotation in annotations:
        annotation.target_uri = new

    for claimant_row in docuris_claimant:
        claimant_row.claimant = new

    for uri_row in docuris_uri:
        uri_row.uri = new

    if annotations:
        # Hand the ids of the changed annotations to the batch indexer.
        indexer = BatchIndexer(db, request.es, request)
        annotation_ids = [annotation.id for annotation in annotations]
        indexer.index(annotation_ids)

    db.flush()

    # The new URL may now be claimed by several documents; merge them if so.
    documents_at_new = models.Document.find_by_uris(db, [new])
    if documents_at_new.count() > 1:
        merge_documents(db, documents_at_new)

    request.tm.commit()
def test_merge_documents_deletes_duplicate_documents(self, db_session, merge_data):
    """After merging, neither duplicate document can still be queried by id."""
    _, duplicate_1, duplicate_2 = merge_data

    document.merge_documents(db_session, merge_data)
    db_session.flush()

    duplicate_ids = [duplicate_1.id, duplicate_2.id]
    remaining = (
        db_session.query(document.Document)
        .filter(document.Document.id.in_(duplicate_ids))
        .count()
    )
    assert remaining == 0
def move_uri(ctx, old, new):
    """
    Move annotations and document equivalence data from one URL to another.

    This will **replace** the annotation's ``target_uri`` and all the
    document uri's ``claimant``, plus the matching ``uri`` for self-claim and
    canonical uris.
    """
    request = ctx.obj["bootstrap"]()

    # Collect the rows that mention the old URL before asking for approval.
    annotations = _fetch_annotations(request.db, old)
    docuris_claimant = _fetch_document_uri_claimants(request.db, old)
    docuris_uri = _fetch_document_uri_canonical_self_claim(request.db, old)

    summary = {
        "old": old,
        "new": new,
        "ann_count": len(annotations),
        "doc_claimant": len(docuris_claimant),
        "doc_uri": len(docuris_uri),
    }
    prompt = (
        "Changing all annotations and document data matching:\n"
        '"{old}"\nto:\n"{new}"\n'
        "This will affect {ann_count} annotations, {doc_claimant} "
        "document uri claimants, and {doc_uri} document uri self-claims "
        "or canonical uris.\n"
        "Are you sure? [y/N]"
    ).format(**summary)

    reply = click.prompt(prompt, default="n", show_default=False)
    if reply != "y":
        # Anything except an exact "y" cancels the move.
        print("Aborted")
        return

    for moved_annotation in annotations:
        moved_annotation.target_uri = new

    for docuri in docuris_claimant:
        docuri.claimant = new

    for docuri in docuris_uri:
        docuri.uri = new

    if annotations:
        # Pass the ids of the touched annotations to the batch indexer.
        indexer = BatchIndexer(request.db, request.es, request)
        indexer.index([moved.id for moved in annotations])

    request.db.flush()

    # Merge the documents found for the new URL when there is more than one.
    merge_candidates = models.Document.find_by_uris(request.db, [new])
    if merge_candidates.count() > 1:
        merge_documents(request.db, merge_candidates)

    request.tm.commit()
def test_merge_documents_rewires_annotations(self, db_session, merge_data):
    """After merging, all 6 annotations point at the master document."""
    master, duplicate_1, duplicate_2 = merge_data

    document.merge_documents(db_session, merge_data)
    db_session.flush()

    def annotation_count(doc):
        # Number of Annotation rows whose document_id is this document's id.
        return (
            db_session.query(models.Annotation)
            .filter_by(document_id=doc.id)
            .count()
        )

    assert annotation_count(master) == 6
    assert annotation_count(duplicate_1) == 0
    assert annotation_count(duplicate_2) == 0
def test_merge_documents_rewires_annotations(self, db_session, merge_data):
    """Merging leaves the master with 6 annotations and each duplicate with 0."""
    master, duplicate_1, duplicate_2 = merge_data

    document.merge_documents(db_session, merge_data)
    db_session.flush()

    def count_for(doc):
        # Count the Annotation rows attached to the given document.
        attached = db_session.query(models.Annotation).filter_by(document_id=doc.id)
        return attached.count()

    assert count_for(master) == 6
    assert count_for(duplicate_1) == 0
    assert count_for(duplicate_2) == 0
def test_merge_documents_returns_master(self, db_session, merge_data):
    """merge_documents returns the first (master) document of the merge set."""
    master, _, _ = merge_data

    assert document.merge_documents(db_session, merge_data) == master