Пример #1
0
def migrate_ebl_links():
    """Migrate EBL external links from documents.

    For every document that still carries ``_migration.eitems_ebl`` links,
    create one restricted (login-required) EItem per link, then clear the
    migration flag and re-index the document.

    :raises EItemMigrationError: when a document has EBL ebook links but no
        alternative identifier with scheme ``EBL``.
    """
    search = get_documents_with_ebl_eitems()
    click.echo("Found {} documents with ebl links.".format(search.count()))

    for hit in search.scan():
        # make sure the document is in DB not only ES
        document = Document.get_record_by_pid(hit.pid)
        click.echo("Processing document {}...".format(document["pid"]))

        # find the EBL identifier (None when no such scheme is present)
        ebl_id = next(
            (x for x in document["alternative_identifiers"]
             if x["scheme"] == "EBL"),
            None,
        )

        ebl_links = document["_migration"]["eitems_ebl"]

        # the identifier is only mandatory when EBL links exist; check once
        # up front instead of re-raising inside the loop
        if ebl_links and not ebl_id:
            raise EItemMigrationError(
                "Document {pid} has no EBL alternative identifier"
                " while EBL ebook link was found".format(
                    pid=document["pid"]))

        for _url in ebl_links:
            eitem = create_eitem(document["pid"], open_access=False)
            # NOTE(review): the stored link value is the literal "EBL", not
            # the dumped URL (the loop variable is unused) — presumably the
            # concrete URL is resolved at access time; confirm this is
            # intended.
            eitem["urls"] = [{"value": "EBL", "login_required": True}]
            eitem.commit()
            EItemIndexer().index(eitem)

        # flag the document as processed so links are not migrated twice
        document["_migration"]["eitems_has_ebl"] = False
        document.commit()
        db.session.commit()
        DocumentIndexer().index(document)
Пример #2
0
def migrate_safari_links(raise_exceptions=True):
    """Migrate Safari links from documents.

    Creates one restricted EItem per ``_migration.eitems_safari`` entry of
    each matching document, then clears the migration flag and re-indexes
    the document.

    :param raise_exceptions: when True, re-raise any exception that has no
        registered handler in ``eitems_exception_handlers``.
    """
    document_class = current_app_ils.document_record_cls

    search = get_documents_with_safari_eitems()
    click.echo("Found {} documents with safari links.".format(search.count()))

    for hit in search.params(scroll='2h').scan():
        # make sure the document is in DB not only ES
        document = document_class.get_record_by_pid(hit.pid)
        doc_pid = document["pid"]
        click.echo("Processing document {}...".format(doc_pid))

        try:
            for safari_item in document["_migration"]["eitems_safari"]:
                eitem = create_eitem(doc_pid, open_access=False)
                eitem["urls"] = [safari_item["url"]]
                add_eitem_extra_metadata(eitem, document)
                # preserve the creation date of the source document
                eitem.model.created = document.model.created
                eitem.commit()
                EItemIndexer().index(eitem)

            # all links migrated: clear the flag and persist
            document["_migration"]["eitems_has_safari"] = False
            document.commit()
            db.session.commit()
            DocumentIndexer().index(document)

        except Exception as exc:
            # dispatch to a registered handler when one exists for this
            # exception class; otherwise optionally re-raise
            handler = eitems_exception_handlers.get(exc.__class__)
            if handler is not None:
                handler(exc, document_pid=document["pid"])
            elif raise_exceptions:
                raise exc
    def test_on_eitem_update():
        """Test eitem resolvers.

        Indexing an EItem should trigger re-indexing of the records that
        reference it (here: its parent document).
        """
        # mock indexer capturing the records queued for re-indexing
        indexer = _get_mock()

        pid = "eitemid-1"
        eitem = EItem.get_record_by_pid(pid)
        EItemIndexer().index(eitem)

        # the origin of the indexing operation must be the eitem itself;
        # returns the referenced records queued for re-indexing
        referenced = _assert_origin(indexer, EITEM_PID_TYPE, pid)

        # should re-index documents
        n_documents = 1  # from test data
        _assert_contains(referenced, DOCUMENT_PID_TYPE)

        # nothing other than the expected document(s) was re-indexed
        assert len(referenced) == n_documents
Пример #4
0
def migrate_ezproxy_links():
    """Migrate external links from documents.

    For each document with EzProxy links, create one login-required EItem
    per link, then clear the migration flag and re-index the document.
    """
    search = get_documents_with_proxy_eitems()
    click.echo("Found {} documents with ezproxy links.".format(search.count()))
    for hit in search.scan():
        # make sure the document is in DB not only ES
        document = Document.get_record_by_pid(hit.pid)
        pid_value = document["pid"]
        click.echo("Processing document {}...".format(pid_value))

        for link in document["_migration"]["eitems_external"]:
            eitem = create_eitem(pid_value, open_access=False)
            # EzProxy links always require authentication
            link["login_required"] = True
            eitem["urls"] = [link]
            eitem.commit()
            EItemIndexer().index(eitem)

        # mark as processed so the links are not migrated twice
        document["_migration"]["eitems_has_proxy"] = False
        document.commit()
        db.session.commit()
        DocumentIndexer().index(document)
Пример #5
0
def migrate_ezproxy_links(raise_exceptions=True):
    """Migrate external links from documents.

    Creates one EItem per ``_migration.eitems_proxy`` entry of each matching
    document; access restriction is derived from the document- and item-level
    open-access flags. Clears the migration flag afterwards.

    :param raise_exceptions: when True, re-raise any exception that has no
        registered handler in ``eitems_exception_handlers``.
    """
    search = get_documents_with_proxy_eitems()
    click.echo("Found {} documents with ezproxy links.".format(search.count()))
    for hit in search.params(scroll='2h').scan():
        # make sure the document is in DB not only ES
        Document = current_app_ils.document_record_cls
        document = Document.get_record_by_pid(hit.pid)
        click.echo("Processing document {}...".format(document["pid"]))
        # document-level open-access flag; may be missing (None)
        open_access = document["_migration"].get("eitems_open_access")
        for item in document["_migration"]["eitems_proxy"]:
            # EzProxy links require login and therefore they need to be
            # restricted
            try:
                # open access when either the document or the item says so
                eitem = create_eitem(
                    document["pid"],
                    open_access=open_access or item["open_access"],
                )
                # only set login_required when the dump did not already
                # provide an explicit value
                if "login_required" not in item["url"]:
                    item["url"]["login_required"] = not open_access
                eitem["urls"] = [item["url"]]
                add_eitem_extra_metadata(eitem, document)
                # preserve the creation date of the source document
                eitem.model.created = document.model.created
                eitem.commit()
                EItemIndexer().index(eitem)
            except Exception as exc:
                # dispatch to a registered handler when one exists for this
                # exception class; otherwise optionally re-raise
                handler = eitems_exception_handlers.get(exc.__class__)
                if handler:
                    handler(exc, document_pid=document["pid"])
                else:
                    if raise_exceptions:
                        raise exc

        # mark as processed so the links are not migrated twice
        document["_migration"]["eitems_has_proxy"] = False
        document.commit()
        db.session.commit()
        DocumentIndexer().index(document)
Пример #6
0
def process_files_from_legacy():
    r"""Process legacy file.

    Downloads each file referenced by ``_migration.files``, attaches it to a
    new EItem bucket and re-indexes the document. On any per-document error
    the document is logged and skipped (best-effort migration).

    File dump object
    {
      "comment": null,
      "status": "",
      "version": 1,
      "encoding": null,
      "creation_date": "2014-08-15T16:27:10+00:00",
      "bibdocid": 952822,
      "mime": "application/pdf",
      "full_name": "075030183X_TOC.pdf",
      "superformat": ".pdf",
      "recids_doctype": [
        [
          262151,
          "Additional",
          "075030183X_TOC.pdf"
        ]
      ],
      "path": "/opt/cdsweb/var/data/files/g95/952822/content.pdf;1",
      "size": 264367,
      "license": {},
      "modification_date": "2014-08-15T16:27:10+00:00",
      "copyright": {},
      "url": "http://cds.cern.ch/record/262151/files/075030183X_TOC.pdf",
      "checksum": "a8b4bba8a2bbc6780cc7707387c4702f",
      "description": "1. Table of contents",
      "format": ".pdf",
      "name": "075030183X_TOC",
      "subformat": "",
      "etag": "\"952822.pdf1\"",
      "recid": 262151,
      "flags": [],
      "hidden": false,
      "type": "Additional",
      "full_path": "/opt/cdsweb/var/data/files/g95/952822/content.pdf;1"
    }
    """
    search = get_all_documents_with_files()
    click.echo("Found {} documents with files.".format(search.count()))
    for hit in search.scan():
        # try not to kill legacy server
        time.sleep(3)
        # make sure the document is in DB not only ES
        Document = current_app_ils.document_record_cls
        document = Document.get_record_by_pid(hit.pid)
        click.echo("Processing document {}...".format(document["pid"]))

        try:
            for file_dump in document["_migration"]["files"]:

                # check if url migrated from MARC
                url_in_marc = [
                    item
                    for item in document["_migration"]["eitems_file_links"]
                    if item["value"] == file_dump["url"]
                ]
                if not url_in_marc:
                    # fix: use document["pid"] (the pid value) as everywhere
                    # else in this function; document.pid is not the string
                    # the message expects
                    msg = ("DOCUMENT: {pid}: ERROR: File {file}"
                           " found in the dump but not in MARC".format(
                               pid=document["pid"], file=file_dump["url"]))
                    raise FileMigrationError(msg)

                click.echo("File: {}".format(file_dump["url"]))
                eitem, bucket = create_eitem_with_bucket_for_document(
                    document["pid"])

                # prefer the human-readable description as filename, fall
                # back to the original file name
                file_name = file_dump["description"]
                if not file_name:
                    file_name = file_dump["full_name"]

                file_stream = import_legacy_files(file_dump["url"])

                # upload into the bucket; checksum is verified on creation
                create_file(bucket, file_stream, file_name,
                            file_dump["checksum"])
                click.echo("Indexing...")
                EItemIndexer().index(eitem)
        except Exception as e:
            # best effort: log the failure and continue with the next
            # document instead of aborting the whole migration
            msg = "DOCUMENT: {pid} CAN'T MIGRATE FILES ERROR: {error}".format(
                pid=document["pid"], error=str(e))
            click.secho(msg)
            records_logger.error(msg)
            continue

        # make sure the files are not imported twice by setting the flag
        document["_migration"]["eitems_has_files"] = False
        document["_migration"]["has_files"] = False
        document.commit()
        db.session.commit()
        DocumentIndexer().index(document)
Пример #7
0
def process_files_from_legacy():
    r"""Process legacy file.

    Imports each file referenced by ``_migration.files`` from its relative
    path, attaches it to a new EItem bucket and re-indexes the document.
    Per-file errors are dispatched to ``eitems_exception_handlers`` when a
    handler is registered, otherwise re-raised.

    File dump object
    {
      "comment": null,
      "status": "",
      "version": 1,
      "encoding": null,
      "creation_date": "2014-08-15T16:27:10+00:00",
      "bibdocid": 952822,
      "mime": "application/pdf",
      "full_name": "075030183X_TOC.pdf",
      "superformat": ".pdf",
      "recids_doctype": [
        [
          262151,
          "Additional",
          "075030183X_TOC.pdf"
        ]
      ],
      "path": "/opt/cdsweb/var/data/files/g95/952822/content.pdf;1",
      "size": 264367,
      "license": {},
      "modification_date": "2014-08-15T16:27:10+00:00",
      "copyright": {},
      "url": "http://cds.cern.ch/record/262151/files/075030183X_TOC.pdf",
      "checksum": "a8b4bba8a2bbc6780cc7707387c4702f",
      "description": "1. Table of contents",
      "format": ".pdf",
      "name": "075030183X_TOC",
      "subformat": "",
      "etag": "\"952822.pdf1\"",
      "recid": 262151,
      "flags": [],
      "hidden": false,
      "type": "Additional",
      "full_path": "/opt/cdsweb/var/data/files/g95/952822/content.pdf;1"
    }
    """
    search = get_all_documents_with_files()
    click.echo("Found {} documents with files.".format(search.count()))
    for hit in search.params(scroll='4h').scan():
        # make sure the document is in DB not only ES
        Document = current_app_ils.document_record_cls
        document = Document.get_record_by_pid(hit.pid)
        click.echo("Processing document {}...".format(document["pid"]))

        for file_dump in document["_migration"]["files"]:
            try:
                # check if url migrated from MARC
                url_in_marc = [
                    item
                    for item in document["_migration"]["eitems_file_links"]
                    if item["url"]["value"] == file_dump["url"]
                ]
                if not url_in_marc:
                    # fix: use document["pid"] (the pid value) as everywhere
                    # else in this function; document.pid is not the string
                    # the message expects
                    msg = ("DOCUMENT: {pid}: ERROR: File {file}"
                           " found in the dump but not in MARC".format(
                               pid=document["pid"], file=file_dump["url"]))
                    raise FileMigrationError(msg)

                click.echo("File: {}".format(file_dump["url"]))

                # fix: default to "" so a dump without a "status" key does
                # not crash with AttributeError on None.upper()
                is_restricted = file_dump.get("status", "").upper() in \
                    ["SSO", "RESTRICTED"]
                eitem, bucket = create_eitem_with_bucket_for_document(
                    document["pid"], open_access=not is_restricted)
                add_eitem_extra_metadata(eitem, document)
                # preserve the creation date of the source document
                eitem.model.created = document.model.created
                eitem.commit()

                # filename: prefer "<description><format>", fall back to the
                # original file name
                file_description = file_dump.get("description")
                file_format = file_dump.get("format")
                if file_description and file_format:
                    file_name = f"{file_description}{file_format}"
                else:
                    file_name = file_dump["full_name"]

                relative_path = file_dump.get("ils_relative_path")
                if relative_path:
                    # strip a single leading slash so the path joins cleanly
                    if relative_path.startswith("/"):
                        relative_path = relative_path.replace("/", "", 1)
                    file_stream = import_legacy_files(relative_path)

                    create_file(bucket, file_stream, file_name,
                                file_dump["checksum"])
                    file_stream.close()
                else:
                    raise FileMigrationError("Source file path incorrect")
                click.echo("Indexing...")
                EItemIndexer().index(eitem)

            except Exception as exc:
                # dispatch to a registered handler when one exists for this
                # exception class; otherwise re-raise
                handler = eitems_exception_handlers.get(exc.__class__)
                if handler:
                    handler(exc, document_pid=document["pid"])
                else:
                    raise exc
        # make sure the files are not imported twice by setting the flag
        document["_migration"]["eitems_has_files"] = False
        document["_migration"]["has_files"] = False
        document.commit()
        db.session.commit()
        DocumentIndexer().index(document)