Exemplo n.º 1
0
def _normalize_document_uris_window(session, window):
    """
    Normalize the DocumentURI rows whose ``updated`` falls inside ``window``.

    Rows are visited in ascending ``updated`` order. For each row, the
    documents found for its URI are merged when there is more than one. The
    row is then deleted if another row already exists with the same document,
    normalized claimant, normalized URI, type and content type; otherwise its
    normalized columns are rewritten. The session is flushed after every row.
    """
    in_window = models.DocumentURI.updated.between(window.start, window.end)
    rows = (
        session.query(models.DocumentURI)
        .filter(in_window)
        .order_by(models.DocumentURI.updated.asc())
    )

    for row in rows:
        matching_docs = models.Document.find_by_uris(session, [row.uri])
        if matching_docs.count() > 1:
            merge_documents(session, matching_docs)

        # Conditions describing any *other* row that collides with this one
        # once claimant and uri are normalized.
        criteria = [
            models.DocumentURI.id != row.id,
            models.DocumentURI.document_id == row.document_id,
            models.DocumentURI.claimant_normalized == uri.normalize(row.claimant),
            models.DocumentURI.uri_normalized == uri.normalize(row.uri),
            models.DocumentURI.type == row.type,
            models.DocumentURI.content_type == row.content_type,
        ]
        duplicates = session.query(models.DocumentURI).filter(*criteria)

        if duplicates.count() > 0:
            session.delete(row)
        else:
            row._claimant_normalized = uri.normalize(row.claimant)
            row._uri_normalized = uri.normalize(row.uri)

        session.flush()
Exemplo n.º 2
0
    def test_raises_retryable_error_when_flush_fails(self, db_session, merge_data, monkeypatch):
        """merge_documents raises TransientError when flushing raises IntegrityError."""

        def failing_flush():
            raise sa.exc.IntegrityError(None, None, None)

        # Swap the session's flush for one that always fails.
        monkeypatch.setattr(db_session, 'flush', failing_flush)

        expected_error = transaction.interfaces.TransientError
        with pytest.raises(expected_error):
            document.merge_documents(db_session, merge_data)
Exemplo n.º 3
0
def _normalize_document_uris_window(session, window):
    """
    Clean up the DocumentURI rows last updated between the window bounds.

    Iterates the rows oldest-``updated`` first. When more than one document
    is found for a row's URI those documents are merged. A row that has a
    counterpart with identical document, normalized claimant, normalized URI,
    type and content type is deleted; any other row gets its normalized
    columns reassigned. Each iteration ends with a session flush.
    """
    doc_uris = session.query(models.DocumentURI)
    doc_uris = doc_uris.filter(
        models.DocumentURI.updated.between(window.start, window.end)
    )
    doc_uris = doc_uris.order_by(models.DocumentURI.updated.asc())

    for current in doc_uris:
        found = models.Document.find_by_uris(session, [current.uri])
        if found.count() > 1:
            merge_documents(session, found)

        # Count every row other than ``current`` that matches it after
        # normalization.
        has_duplicate = (
            session.query(models.DocumentURI)
            .filter(
                models.DocumentURI.id != current.id,
                models.DocumentURI.document_id == current.document_id,
                models.DocumentURI.claimant_normalized
                == uri.normalize(current.claimant),
                models.DocumentURI.uri_normalized == uri.normalize(current.uri),
                models.DocumentURI.type == current.type,
                models.DocumentURI.content_type == current.content_type,
            )
            .count()
            > 0
        )

        if has_duplicate:
            session.delete(current)
        else:
            current._claimant_normalized = uri.normalize(current.claimant)
            current._uri_normalized = uri.normalize(current.uri)

        session.flush()
Exemplo n.º 4
0
    def test_raises_retryable_error_when_flush_fails(self, db_session, merge_data, monkeypatch):
        """An IntegrityError from flush surfaces from merge_documents as TransientError."""

        def raise_integrity_error():
            raise sa.exc.IntegrityError(None, None, None)

        monkeypatch.setattr(db_session, 'flush', raise_integrity_error)

        with pytest.raises(transaction.interfaces.TransientError):
            # Both positional arguments are plain fixtures; only flush is patched.
            document.merge_documents(db_session, merge_data)
Exemplo n.º 5
0
    def test_merge_documents_logs_when_its_called(self, caplog, db_session,
                                                  merge_data):
        """merge_documents emits exactly one INFO record announcing the merge."""
        caplog.set_level(logging.INFO)

        document.merge_documents(db_session, merge_data)

        # logging.INFO is the numeric level 20.
        expected_record = ("h.models.document", logging.INFO, "Merging 3 documents")
        assert caplog.record_tuples == [expected_record]
Exemplo n.º 6
0
    def test_raises_retryable_error_when_flush_fails(self, db_session,
                                                     merge_data, monkeypatch):
        """merge_documents raises ConcurrentUpdateError when flushing raises IntegrityError."""

        def broken_flush():
            raise sa.exc.IntegrityError(None, None, None)

        # Replace the session's flush with one that always raises.
        monkeypatch.setattr(db_session, "flush", broken_flush)

        expected_error = ConcurrentUpdateError
        with pytest.raises(expected_error):
            document.merge_documents(db_session, merge_data)
Exemplo n.º 7
0
    def test_merge_documents_rewires_document_meta(self, db_session, merge_data):
        """After merging, the master holds three meta rows and the duplicates hold none."""
        master, duplicate_1, duplicate_2 = merge_data

        document.merge_documents(db_session, merge_data)
        db_session.flush()

        meta_counts = [
            len(doc.meta) for doc in (master, duplicate_1, duplicate_2)
        ]
        assert meta_counts == [3, 0, 0]
Exemplo n.º 8
0
    def test_merge_documents_rewires_document_meta(self, db_session, merge_data):
        """Merging leaves three meta rows on the master and zero on each duplicate."""
        master, duplicate_1, duplicate_2 = merge_data

        document.merge_documents(db_session, merge_data)
        db_session.flush()

        # (document, number of meta rows expected after the merge)
        expectations = ((master, 3), (duplicate_1, 0), (duplicate_2, 0))
        for doc, expected_count in expectations:
            assert len(doc.meta) == expected_count
Exemplo n.º 9
0
def move_uri(ctx, old, new):
    """
    Re-point annotations and document equivalence data at a different URL.

    Every fetched annotation gets ``new`` as its ``target_uri`` and every
    fetched document uri gets ``new`` as its ``claimant``; the self-claim and
    canonical document uris also get ``new`` as their ``uri``. The user must
    answer ``y`` at the prompt, otherwise nothing is changed. Afterwards the
    touched annotations are reindexed, documents found for ``new`` are merged
    if there is more than one, and the transaction is committed.
    """

    request = ctx.obj["bootstrap"]()

    annotations = _fetch_annotations(request.db, old)
    docuris_claimant = _fetch_document_uri_claimants(request.db, old)
    docuris_uri = _fetch_document_uri_canonical_self_claim(request.db, old)

    template = (
        "Changing all annotations and document data matching:\n"
        '"{old}"\nto:\n"{new}"\n'
        "This will affect {ann_count} annotations, {doc_claimant} "
        "document uri claimants, and {doc_uri} document uri self-claims "
        "or canonical uris.\n"
        "Are you sure? [y/N]"
    )
    message = template.format(
        old=old,
        new=new,
        ann_count=len(annotations),
        doc_claimant=len(docuris_claimant),
        doc_uri=len(docuris_uri),
    )
    answer = click.prompt(message, default="n", show_default=False)

    # Only an exact lowercase "y" proceeds.
    if answer != "y":
        print("Aborted")
        return

    for annotation in annotations:
        annotation.target_uri = new

    for claimant_row in docuris_claimant:
        claimant_row.claimant = new

    for uri_row in docuris_uri:
        uri_row.uri = new

    if annotations:
        BatchIndexer(request.db, request.es, request).index(
            [annotation.id for annotation in annotations]
        )

    request.db.flush()

    # The rewrite may leave several documents for the new URI.
    merge_candidates = models.Document.find_by_uris(request.db, [new])
    if merge_candidates.count() > 1:
        merge_documents(request.db, merge_candidates)

    request.tm.commit()
Exemplo n.º 10
0
    def test_merge_documents_deletes_duplicate_documents(self, db_session, merge_data):
        """No Document row with a duplicate's id is left after the merge is flushed."""
        _, duplicate_1, duplicate_2 = merge_data

        document.merge_documents(db_session, merge_data)
        db_session.flush()

        leftovers = db_session.query(document.Document).filter(
            document.Document.id.in_([duplicate_1.id, duplicate_2.id])
        )

        assert leftovers.count() == 0
Exemplo n.º 11
0
def move_uri(ctx, old, new):
    """
    Switch annotations and document equivalence data from ``old`` to ``new``.

    The annotations' ``target_uri`` and the document uris' ``claimant`` are
    **replaced** with ``new``, as is the ``uri`` of the self-claim and
    canonical document uris. A confirmation prompt is shown first; any answer
    other than ``y`` prints "Aborted" and returns without changes.
    """

    request = ctx.obj["bootstrap"]()

    annotations = _fetch_annotations(request.db, old)
    docuris_claimant = _fetch_document_uri_claimants(request.db, old)
    docuris_uri = _fetch_document_uri_canonical_self_claim(request.db, old)

    # Values interpolated into the confirmation message.
    details = {
        "old": old,
        "new": new,
        "ann_count": len(annotations),
        "doc_claimant": len(docuris_claimant),
        "doc_uri": len(docuris_uri),
    }
    prompt = (
        "Changing all annotations and document data matching:\n"
        '"{old}"\nto:\n"{new}"\n'
        "This will affect {ann_count} annotations, {doc_claimant} "
        "document uri claimants, and {doc_uri} document uri self-claims "
        "or canonical uris.\n"
        "Are you sure? [y/N]"
    ).format(**details)
    reply = click.prompt(prompt, default="n", show_default=False)

    if reply != "y":
        print("Aborted")
        return

    for ann in annotations:
        ann.target_uri = new

    for row in docuris_claimant:
        row.claimant = new

    for row in docuris_uri:
        row.uri = new

    if annotations:
        indexer = BatchIndexer(request.db, request.es, request)
        indexer.index([ann.id for ann in annotations])

    request.db.flush()

    # More than one document may now be found for the new URI; merge them.
    found = models.Document.find_by_uris(request.db, [new])
    if found.count() > 1:
        merge_documents(request.db, found)

    request.tm.commit()
Exemplo n.º 12
0
    def test_merge_documents_deletes_duplicate_documents(self, db_session, merge_data):
        """Querying for the duplicates' ids after merge and flush finds zero documents."""
        _, duplicate_1, duplicate_2 = merge_data

        document.merge_documents(db_session, merge_data)
        db_session.flush()

        remaining = (
            db_session.query(document.Document)
            .filter(document.Document.id.in_([duplicate_1.id, duplicate_2.id]))
            .count()
        )

        assert remaining == 0
Exemplo n.º 13
0
    def test_merge_documents_rewires_annotations(self, db_session, merge_data):
        """After merging, six annotations point at the master and none at the duplicates."""
        master, duplicate_1, duplicate_2 = merge_data

        document.merge_documents(db_session, merge_data)
        db_session.flush()

        def annotation_count(doc):
            # Number of Annotation rows whose document_id is this document's id.
            return db_session.query(
                models.Annotation).filter_by(document_id=doc.id).count()

        assert annotation_count(master) == 6
        assert annotation_count(duplicate_1) == 0
        assert annotation_count(duplicate_2) == 0
Exemplo n.º 14
0
    def test_merge_documents_rewires_annotations(self, db_session, merge_data):
        """Merging moves every annotation onto the master: counts become 6, 0 and 0."""
        master, duplicate_1, duplicate_2 = merge_data

        document.merge_documents(db_session, merge_data)
        db_session.flush()

        # (document, annotations expected to reference it after the merge)
        expectations = ((master, 6), (duplicate_1, 0), (duplicate_2, 0))
        for doc, expected_count in expectations:
            annotations = db_session.query(models.Annotation).filter_by(document_id=doc.id)
            assert annotations.count() == expected_count
Exemplo n.º 15
0
    def test_merge_documents_returns_master(self, db_session, merge_data):
        """merge_documents returns the first document of the merged data."""
        master, _first_duplicate, _second_duplicate = merge_data

        assert document.merge_documents(db_session, merge_data) == master
Exemplo n.º 16
0
    def test_merge_documents_returns_master(self, db_session, merge_data):
        """The value returned by merge_documents equals the master document."""
        expected_master, _, _ = merge_data

        returned = document.merge_documents(db_session, merge_data)

        assert returned == expected_master