def migrate_ebl_links():
    """Migrate EBL external links from documents.

    For each document flagged with pending EBL e-book links, create one
    restricted e-item per recorded link, index it, then clear the
    ``eitems_has_ebl`` migration flag on the document.

    :raises EItemMigrationError: if a document has pending EBL links but
        no alternative identifier with scheme ``EBL``.
    """
    search = get_documents_with_ebl_eitems()
    click.echo("Found {} documents with ebl links.".format(search.count()))
    for hit in search.scan():
        # make sure the document is in DB not only ES
        document = Document.get_record_by_pid(hit.pid)
        click.echo("Processing document {}...".format(document["pid"]))

        # find the ebl identifier
        ebl_id = next(
            (
                x
                for x in document["alternative_identifiers"]
                if x["scheme"] == "EBL"
            ),
            None,
        )

        ebl_links = document["_migration"]["eitems_ebl"]
        # Fail fast before creating any e-item: the original code raised on
        # the first loop iteration, so guarding on a non-empty link list
        # preserves the exact same outcome.
        if ebl_links and not ebl_id:
            raise EItemMigrationError(
                "Document {pid} has no EBL alternative identifier"
                " while EBL ebook link was found".format(pid=document["pid"])
            )

        for _link in ebl_links:
            eitem = create_eitem(document["pid"], open_access=False)
            # NOTE(review): the stored URL value is the literal string "EBL"
            # rather than the dump link (_link is unused) — presumably the
            # real URL is resolved elsewhere via the EBL identifier; confirm
            # this is intentional.
            eitem["urls"] = [{"value": "EBL", "login_required": True}]
            eitem.commit()
            EItemIndexer().index(eitem)

        document["_migration"]["eitems_has_ebl"] = False
        document.commit()
        db.session.commit()
        DocumentIndexer().index(document)
def migrate_safari_links(raise_exceptions=True):
    """Migrate Safari links from documents."""
    doc_cls = current_app_ils.document_record_cls
    safari_search = get_documents_with_safari_eitems()
    click.echo(
        "Found {} documents with safari links.".format(safari_search.count())
    )
    for result in safari_search.params(scroll='2h').scan():
        # Fetch the record from the DB; the ES hit alone is not authoritative.
        record = doc_cls.get_record_by_pid(result.pid)
        click.echo("Processing document {}...".format(record["pid"]))
        try:
            for entry in record["_migration"]["eitems_safari"]:
                new_eitem = create_eitem(record["pid"], open_access=False)
                new_eitem["urls"] = [entry["url"]]
                add_eitem_extra_metadata(new_eitem, record)
                # Keep the document's original creation date on the e-item.
                new_eitem.model.created = record.model.created
                new_eitem.commit()
                EItemIndexer().index(new_eitem)
            record["_migration"]["eitems_has_safari"] = False
            record.commit()
            db.session.commit()
            DocumentIndexer().index(record)
        except Exception as error:
            # Dispatch to a registered per-exception handler when available;
            # otherwise re-raise only if the caller asked for it.
            on_error = eitems_exception_handlers.get(error.__class__)
            if on_error:
                on_error(error, document_pid=record["pid"])
            elif raise_exceptions:
                raise error
def test_on_eitem_update():
    """Test eitem resolvers."""
    mock_indexer = _get_mock()
    eitem_pid = "eitemid-1"
    record = EItem.get_record_by_pid(eitem_pid)
    EItemIndexer().index(record)
    linked = _assert_origin(mock_indexer, EITEM_PID_TYPE, eitem_pid)
    # should re-index documents
    expected_docs = 1  # from test data
    _assert_contains(linked, DOCUMENT_PID_TYPE)
    assert len(linked) == expected_docs
def migrate_ezproxy_links():
    """Migrate external links from documents.

    NOTE(review): a second ``migrate_ezproxy_links`` definition appears
    later in this file and shadows this one at import time; this version
    appears to be dead code.
    """
    proxy_search = get_documents_with_proxy_eitems()
    click.echo(
        "Found {} documents with ezproxy links.".format(proxy_search.count())
    )
    for result in proxy_search.scan():
        # Load from the DB; the ES hit alone is not authoritative.
        record = Document.get_record_by_pid(result.pid)
        click.echo("Processing document {}...".format(record["pid"]))
        for link in record["_migration"]["eitems_external"]:
            new_eitem = create_eitem(record["pid"], open_access=False)
            # EzProxy links always require login.
            link["login_required"] = True
            new_eitem["urls"] = [link]
            new_eitem.commit()
            EItemIndexer().index(new_eitem)
        record["_migration"]["eitems_has_proxy"] = False
        record.commit()
        db.session.commit()
        DocumentIndexer().index(record)
def migrate_ezproxy_links(raise_exceptions=True):
    """Migrate external links from documents."""
    # The record class is invariant across the loop, so resolve it once.
    doc_cls = current_app_ils.document_record_cls
    proxy_search = get_documents_with_proxy_eitems()
    click.echo(
        "Found {} documents with ezproxy links.".format(proxy_search.count())
    )
    for result in proxy_search.params(scroll='2h').scan():
        # make sure the document is in DB not only ES
        record = doc_cls.get_record_by_pid(result.pid)
        click.echo("Processing document {}...".format(record["pid"]))
        doc_open_access = record["_migration"].get("eitems_open_access")
        for entry in record["_migration"]["eitems_proxy"]:
            # EzProxy links require login and therefore they need to be
            # restricted
            try:
                new_eitem = create_eitem(
                    record["pid"],
                    open_access=doc_open_access or entry["open_access"],
                )
                if "login_required" not in entry["url"]:
                    entry["url"]["login_required"] = not doc_open_access
                new_eitem["urls"] = [entry["url"]]
                add_eitem_extra_metadata(new_eitem, record)
                # Keep the document's original creation date on the e-item.
                new_eitem.model.created = record.model.created
                new_eitem.commit()
                EItemIndexer().index(new_eitem)
            except Exception as error:
                on_error = eitems_exception_handlers.get(error.__class__)
                if on_error:
                    on_error(error, document_pid=record["pid"])
                elif raise_exceptions:
                    raise error
        record["_migration"]["eitems_has_proxy"] = False
        record.commit()
        db.session.commit()
        DocumentIndexer().index(record)
def process_files_from_legacy():
    r"""Process legacy files attached to documents (legacy variant).

    NOTE(review): a second ``process_files_from_legacy`` definition appears
    later in this file and shadows this one at import time; this version
    appears to be dead code.

    Example of a file dump object::

        {
          "comment": null,
          "status": "",
          "version": 1,
          "encoding": null,
          "creation_date": "2014-08-15T16:27:10+00:00",
          "bibdocid": 952822,
          "mime": "application/pdf",
          "full_name": "075030183X_TOC.pdf",
          "superformat": ".pdf",
          "recids_doctype": [[262151, "Additional", "075030183X_TOC.pdf"]],
          "path": "/opt/cdsweb/var/data/files/g95/952822/content.pdf;1",
          "size": 264367,
          "license": {},
          "modification_date": "2014-08-15T16:27:10+00:00",
          "copyright": {},
          "url": "http://cds.cern.ch/record/262151/files/075030183X_TOC.pdf",
          "checksum": "a8b4bba8a2bbc6780cc7707387c4702f",
          "description": "1. Table of contents",
          "format": ".pdf",
          "name": "075030183X_TOC",
          "subformat": "",
          "etag": "\"952822.pdf1\"",
          "recid": 262151,
          "flags": [],
          "hidden": false,
          "type": "Additional",
          "full_path": "/opt/cdsweb/var/data/files/g95/952822/content.pdf;1"
        }
    """
    search = get_all_documents_with_files()
    click.echo("Found {} documents with files.".format(search.count()))
    for hit in search.scan():
        # try not to kill legacy server
        time.sleep(3)
        # make sure the document is in DB not only ES
        Document = current_app_ils.document_record_cls
        document = Document.get_record_by_pid(hit.pid)
        click.echo("Processing document {}...".format(document["pid"]))
        try:
            for file_dump in document["_migration"]["files"]:
                # check if url migrated from MARC
                # NOTE(review): matches item["value"] here, while the later
                # variant matches item["url"]["value"] — confirm which schema
                # this version's data actually has.
                url_in_marc = [
                    item
                    for item in document["_migration"]["eitems_file_links"]
                    if item["value"] == file_dump["url"]
                ]
                if not url_in_marc:
                    msg = ("DOCUMENT: {pid}: ERROR: File {file}"
                           " found in the dump but not in MARC".format(
                               pid=document.pid, file=file_dump["url"]))
                    raise FileMigrationError(msg)
                click.echo("File: {}".format(file_dump["url"]))
                eitem, bucket = create_eitem_with_bucket_for_document(
                    document["pid"])
                # get filename: prefer the human-readable description, fall
                # back to the original file name
                file_name = file_dump["description"]
                if not file_name:
                    file_name = file_dump["full_name"]
                # download the file content from the legacy server
                file_stream = import_legacy_files(file_dump["url"])
                # NOTE(review): `file` shadows the builtin and is unused;
                # also the eitem is indexed without an explicit commit here
                # (the later variant commits it) — verify this is intended.
                file = create_file(bucket, file_stream,
                                   file_name, file_dump["checksum"])
                click.echo("Indexing...")
                EItemIndexer().index(eitem)
        except Exception as e:
            # best-effort: log the failure and move on to the next document
            msg = "DOCUMENT: {pid} CAN'T MIGRATE FILES ERROR: {error}".format(
                pid=document["pid"], error=str(e))
            click.secho(msg)
            records_logger.error(msg)
            continue
        # make sure the files are not imported twice by setting the flag
        document["_migration"]["eitems_has_files"] = False
        document["_migration"]["has_files"] = False
        document.commit()
        db.session.commit()
        DocumentIndexer().index(document)
def process_files_from_legacy():
    r"""Process legacy files attached to documents.

    For every document flagged with migrated files: validate that each
    dump file has a matching MARC link, create an e-item with a bucket
    (restricted when the dump status is ``SSO``/``RESTRICTED``), upload
    the file content and index the e-item. Finally clear the
    ``eitems_has_files``/``has_files`` migration flags on the document.

    Example of a file dump object::

        {
          "comment": null,
          "status": "",
          "version": 1,
          "encoding": null,
          "creation_date": "2014-08-15T16:27:10+00:00",
          "bibdocid": 952822,
          "mime": "application/pdf",
          "full_name": "075030183X_TOC.pdf",
          "superformat": ".pdf",
          "recids_doctype": [[262151, "Additional", "075030183X_TOC.pdf"]],
          "path": "/opt/cdsweb/var/data/files/g95/952822/content.pdf;1",
          "size": 264367,
          "license": {},
          "modification_date": "2014-08-15T16:27:10+00:00",
          "copyright": {},
          "url": "http://cds.cern.ch/record/262151/files/075030183X_TOC.pdf",
          "checksum": "a8b4bba8a2bbc6780cc7707387c4702f",
          "description": "1. Table of contents",
          "format": ".pdf",
          "name": "075030183X_TOC",
          "subformat": "",
          "etag": "\"952822.pdf1\"",
          "recid": 262151,
          "flags": [],
          "hidden": false,
          "type": "Additional",
          "full_path": "/opt/cdsweb/var/data/files/g95/952822/content.pdf;1"
        }

    :raises FileMigrationError: when a dump file has no matching MARC
        link, or when the dump entry has no ``ils_relative_path``.
    """
    search = get_all_documents_with_files()
    click.echo("Found {} documents with files.".format(search.count()))
    for hit in search.params(scroll='4h').scan():
        # make sure the document is in DB not only ES
        Document = current_app_ils.document_record_cls
        document = Document.get_record_by_pid(hit.pid)
        click.echo("Processing document {}...".format(document["pid"]))
        for file_dump in document["_migration"]["files"]:
            try:
                # check if url migrated from MARC
                url_in_marc = [
                    item
                    for item in document["_migration"]["eitems_file_links"]
                    if item["url"]["value"] == file_dump["url"]
                ]
                if not url_in_marc:
                    msg = ("DOCUMENT: {pid}: ERROR: File {file}"
                           " found in the dump but not in MARC".format(
                               pid=document.pid, file=file_dump["url"]))
                    raise FileMigrationError(msg)

                click.echo("File: {}".format(file_dump["url"]))
                # BUGFIX: `file_dump.get("status")` returns None when the key
                # is absent, which crashed on `.upper()`; default to "".
                status = (file_dump.get("status") or "").upper()
                is_restricted = status in ("SSO", "RESTRICTED")
                eitem, bucket = create_eitem_with_bucket_for_document(
                    document["pid"], open_access=not is_restricted)
                add_eitem_extra_metadata(eitem, document)
                # Keep the document's original creation date on the e-item.
                eitem.model.created = document.model.created
                eitem.commit()

                # get filename: prefer "<description><format>", fall back to
                # the original full name
                file_description = file_dump.get("description")
                file_format = file_dump.get("format")
                if file_description and file_format:
                    file_name = f"{file_description}{file_format}"
                else:
                    file_name = file_dump["full_name"]

                relative_path = file_dump.get("ils_relative_path")
                if relative_path:
                    # strip exactly one leading slash, if present
                    if relative_path.startswith("/"):
                        relative_path = relative_path.replace("/", "", 1)
                    file_stream = import_legacy_files(relative_path)
                    create_file(bucket, file_stream, file_name,
                                file_dump["checksum"])
                    file_stream.close()
                else:
                    raise FileMigrationError("Source file path incorrect")
                click.echo("Indexing...")
                EItemIndexer().index(eitem)
            except Exception as exc:
                # Dispatch to a registered per-exception handler when
                # available; otherwise propagate.
                handler = eitems_exception_handlers.get(exc.__class__)
                if handler:
                    handler(exc, document_pid=document["pid"])
                else:
                    raise exc
        # make sure the files are not imported twice by setting the flag
        document["_migration"]["eitems_has_files"] = False
        document["_migration"]["has_files"] = False
        document.commit()
        db.session.commit()
        DocumentIndexer().index(document)