def imprint(self, key, value):
    """Translate the imprint field and record the publication year.

    Subfield ``c`` (required) is parsed either as a single date or as a
    ``start-end`` range. ``self["publication_year"]`` is set as a side
    effect.

    :param key: field tag being translated (not used by this rule)
    :param value: mapping of subfield codes to their values
    :raises UnexpectedValue: when subfield ``c`` cannot be parsed, including
        when either side of a date range is unparsable
    :return: dict with ``date``, ``place``, ``publisher`` and ``reprint``
    """
    # Passed to dateutil as ``default`` so partial dates can still be parsed.
    default_date = datetime.datetime(1954, 1, 1)

    reprint = clean_val("g", value, str)
    date_value = clean_val("c", value, str, req=True)
    if reprint:
        reprint = reprint.lower().replace("repr.", "").strip()

    try:
        date = parser.parse(date_value, default=default_date)
    except ParserError:
        # Not a single date: accept exactly one "start-end" range.
        date_range = date_value.split("-")
        if len(date_range) != 2:
            raise UnexpectedValue(subfield="c")
        try:
            # Errors raised inside an ``except`` handler are not seen by the
            # sibling handlers of the same ``try``; guard the range parsing
            # explicitly so a bad range part is reported as UnexpectedValue.
            start_date = parser.parse(date_range[0], default=default_date)
            end_date = parser.parse(date_range[1], default=default_date)
        except Exception:
            raise UnexpectedValue(subfield="c")
        # NOTE(review): the trailing space is kept from the original output
        # format - confirm whether it is intentional.
        cleaned_date = (
            f"{start_date.date().isoformat()} - "
            f"{end_date.date().isoformat()} "
        )
        pub_year = f"{start_date.date().year} - {end_date.date().year}"
    except Exception:
        raise UnexpectedValue(subfield="c")
    else:
        cleaned_date = date.date().isoformat()
        pub_year = str(date.date().year)

    self["publication_year"] = pub_year
    return {
        "date": cleaned_date if cleaned_date else None,
        "place": clean_val("a", value, str),
        "publisher": clean_val("b", value, str),
        "reprint": reprint,
    }
def isbns(self, key, value):
    """Translate an ISBN field, routing per-volume ISBNs to ``_migration``.

    Subfield ``u`` carries a free-text description. Depending on what can be
    extracted from it, the ISBN in subfield ``a`` is either attached to a
    volume (via ``_insert_volume``) or returned as a record identifier.

    :param key: field tag being translated (not used by this rule)
    :param value: mapping of subfield codes to their values
    :raises IgnoreKey: when the ISBN was stored on a volume instead
    :raises UnexpectedValue: when neither ``u`` nor ``a`` is provided, or a
        volume marker is found in ``u`` but cannot be parsed
    :return: ``{"scheme": "ISBN", "value": ...}`` or ``None`` when that
        identifier is already present in ``self["identifiers"]``
    """
    _migration = self["_migration"]
    _identifiers = self.get("identifiers", [])
    val_u = clean_val("u", value, str)
    val_a = clean_val("a", value, str)
    val_b = clean_val("b", value, str)

    def _record_isbn():
        # Emit the ISBN only if it is not already among the identifiers.
        isbn = {"scheme": "ISBN", "value": val_a}
        return isbn if isbn not in _identifiers else None

    if not val_u:
        if val_a:
            # no volume info, only the isbn
            return _record_isbn()
        raise UnexpectedValue(subfield="a", message=" isbn not provided")

    volume_info = extract_volume_info(val_u)
    if volume_info:
        # a volume in the description means the ISBN belongs to that volume
        volume_obj = {
            "isbn": val_a,
            "physical_description": volume_info["description"].strip(),
            "is_electronic": val_b is not None,
        }
        _insert_volume(_migration, volume_info["volume"], volume_obj)
        raise IgnoreKey("identifiers")

    # a "(set)" marker means the isbn is for the whole multipart
    set_search = re.search(r"(.*?)\(set\.*\)", val_u)
    if set_search:
        self["physical_description"] = set_search.group(1).strip()
        return _record_isbn()

    # Try to find a bare volume number (volume without description).
    volume_number = extract_volume_number(val_u)
    if volume_number:
        volume_obj = {
            "isbn": val_a,
            "is_electronic": val_b is not None,
        }
        _insert_volume(_migration, volume_number, volume_obj)
        raise IgnoreKey("identifiers")
    if extract_volume_number(val_u, search=True):
        raise UnexpectedValue(
            subfield="u",
            message=" found volume but failed to parse description",
        )

    # Nothing volume-related found: the text is a plain description.
    self["physical_description"] = val_u
    return _record_isbn()
def number_of_pages(self, key, value):
    """Translate the page-count field.

    Reads subfield ``a``, skips excluded values, and splits the rest with
    ``extract_parts``. A copy description, when present, is stored on
    ``self["physical_copy_description"]``.

    :raises IgnoreKey: when ``is_excluded`` accepts the value
    :raises UnexpectedValue: when extra content is found or no page count
        could be extracted
    :return: the number of pages as a string
    """
    raw_pages = clean_val("a", value, str)
    if is_excluded(raw_pages):
        raise IgnoreKey("number_of_pages")

    extracted = extract_parts(raw_pages)
    if extracted["has_extra"]:
        raise UnexpectedValue(subfield="a")

    copy_description = extracted["physical_copy_description"]
    if copy_description:
        self["physical_copy_description"] = copy_description

    page_count = extracted["number_of_pages"]
    if not page_count:
        raise UnexpectedValue(subfield="a")
    return str(page_count)
def related_records(self, key, value):
    """Translate a related-record field into a ``_migration`` entry.

    The relation tag comes from subfield ``x`` and the related record id
    from subfield ``w`` (required). The new entry is appended to
    ``_migration["related"]`` and ``has_related`` is switched on.

    :param key: field tag; decides how the relation tag is interpreted
    :param value: mapping of subfield codes to their values
    :raises UnexpectedValue: when the relation tag is missing or unsupported
    :return: the updated ``_migration`` dict
    """
    migration_data = self.get("_migration", {})
    related_entries = migration_data.get("related", [])

    relation_type_tag = clean_val("x", value, str)
    if not relation_type_tag:
        raise UnexpectedValue('Relation type missing.', subfield="x")
    relation_type_tag = relation_type_tag.upper()
    if relation_type_tag not in ["LANGUAGE", "EDITION", "SEQUENCE", "OTHER"]:
        raise UnexpectedValue(f'Unsupported relation type {relation_type_tag}',
                              subfield="x")

    # Anything not recognised below stays an "other" relation.
    relation_type = OTHER_RELATION.name
    description = None
    sequence_order = None

    # language
    if key == "787__" and relation_type_tag == 'LANGUAGE':
        relation_type = LANGUAGE_RELATION.name

    # has supplement/supplement to
    if key in ("770__", "772__"):
        if "i" in value and relation_type_tag == 'OTHER':
            description = clean_val("i", value, str)

    # continues/is continued by
    if key in ("780__", "785__") and relation_type_tag == 'SEQUENCE':
        relation_type = SEQUENCE_RELATION.name
        sequence_order = "next" if key == "780__" else "previous"

    entry = {
        "related_recid": clean_val("w", value, str, req=True),
        "relation_type": relation_type,
        "relation_description": description,
    }
    if relation_type == SEQUENCE_RELATION.name:
        entry["sequence_order"] = sequence_order

    related_entries.append(entry)
    migration_data["related"] = related_entries
    migration_data["has_related"] = True
    return migration_data
def migration(self, key, value):
    """Collect the volumes (number, title, year) of a multipart record.

    For every value, subfield ``n`` is the volume number, ``p`` the volume
    title and ``y`` the publication year. Each volume is stored through
    ``_insert_volume``; a volume without its own title falls back to the
    record title.

    :param key: field tag being translated (not used by this rule)
    :param value: one subfield mapping or a list of them
    :raises UnexpectedValue: when the volume number is missing or ambiguous,
        or the publication year is not a number between 1600 and 2021
    :raises MissingRequiredField: when the record has no main title
    :return: the updated ``_migration`` dict
    """
    volume_title = self.get("title", None)
    _migration = self["_migration"]

    for v in force_list(value):
        # check if it is a multipart monograph
        val_n = clean_val("n", v, str)
        val_p = clean_val("p", v, str)
        val_y = clean_val("y", v, str)
        if not val_n and not val_p:
            raise UnexpectedValue(
                subfield="n", message=" this record is probably not a series"
            )
        if val_p and not val_n:
            raise UnexpectedValue(
                subfield="n",
                message=" volume title exists but no volume number",
            )

        # The two checks above guarantee val_n is set from here on.
        if len(re.findall(r"\d+", val_n)) > 1:
            raise UnexpectedValue(
                subfield="n", message=" volume has more than one digit "
            )
        volume_number = extract_volume_number(
            val_n, raise_exception=True, subfield="n"
        )

        obj = {"title": val_p or volume_title}
        if val_y:
            # fullmatch, not match: a prefix match would accept "1999a" and
            # then crash in int() with a ValueError instead of reporting
            # the bad subfield.
            # NOTE(review): the 2021 upper bound is hard-coded - confirm it
            # is still the intended cut-off.
            if re.fullmatch(r"\d+", val_y) and 1600 <= int(val_y) <= 2021:
                obj["publication_year"] = int(val_y)
            else:
                raise UnexpectedValue(
                    subfield="y", message=" unrecognized publication year"
                )
        _insert_volume(_migration, volume_number, obj)

    if not volume_title:
        raise MissingRequiredField(
            subfield="a", message=" this record is missing a main title"
        )
    # series created
    return _migration
def alternative_identifiers(self, key, value):
    """Translate external system identifier fields.

    Behaviour depends on the field tag:

    * ``0247_``: DOIs are delegated to ``dois``; ASIN entries are ignored;
      anything else is rejected.
    * ``035__``: CERCER entries are ignored, INSPIRE-CNUM entries are moved
      to ``self["conference_info"]``, known schemes are returned.
    * ``036__``: returned when both ``a`` and ``9`` are present.

    :raises IgnoreKey: when the entry is handled elsewhere or must be skipped
    :raises UnexpectedValue: on an unknown identifier type or scheme
    :return: dict with ``value`` and ``scheme`` (possibly empty)
    """
    field_type = clean_val("2", value, str)
    sub_a = clean_val("a", value, str, req=True)
    entry = {}

    if key == "0247_":
        source = field_type.lower() if field_type else None
        if source == "doi":
            # if 0247__2 == doi it is a DOI identifier
            self["identifiers"] = dois(self, key, value)
            raise IgnoreKey("alternative_identifiers")
        if source == "asin":
            raise IgnoreKey("alternative_identifiers")
        raise UnexpectedValue(subfield="2")

    if key == "035__":
        if "CERCER" in sub_a:
            raise IgnoreKey("alternative_identifiers")
        sub_9 = clean_val("9", value, str, req=True)
        if "CERCER" in sub_9:
            raise IgnoreKey("alternative_identifiers")

        scheme_upper = sub_9.upper()
        if scheme_upper == "INSPIRE-CNUM":
            # conference_info.identifiers mixed data
            conference = self.get("conference_info", {})
            conference_ids = conference.get("identifiers", [])
            conference_ids.append(
                {"scheme": "INSPIRE_CNUM", "value": sub_a}
            )
            conference["identifiers"] = conference_ids
            self["conference_info"] = conference
            raise IgnoreKey("alternative_identifiers")
        if scheme_upper in EXTERNAL_SYSTEM_IDENTIFIERS:
            # The scheme is stored exactly as it was written in the record.
            entry.update({"value": sub_a, "scheme": sub_9})
        elif scheme_upper in EXTERNAL_SYSTEM_IDENTIFIERS_TO_IGNORE:
            raise IgnoreKey("external_system_identifiers")
        else:
            raise UnexpectedValue(subfield="9")

    if key == "036__" and "a" in value and "9" in value:
        scheme = clean_val("9", value, str, req=True)
        entry.update({"value": sub_a, "scheme": scheme})

    return entry
def languages(self, key, value):
    """Translate a language field to its two-letter code.

    :param key: field tag being translated (not used by this rule)
    :param value: mapping of subfield codes to their values
    :raises UnexpectedValue: when subfield ``b`` is missing or the language
        cannot be resolved through ``pycountry``
    :return: the ``alpha_2`` code of the language
    """
    try:
        # clean_val may hand back None for a missing subfield; calling
        # .lower() inside the try lets the AttributeError handler below
        # turn that into UnexpectedValue instead of a raw crash.
        lang = clean_val("b", value, str).lower()
        return pycountry.languages.lookup(lang).alpha_2
    except (KeyError, AttributeError, LookupError):
        # NOTE(review): the value is read from subfield "b" but the error
        # reports subfield "a" - confirm which one is intended.
        raise UnexpectedValue(subfield="a")
def mapping(field_map, val, raise_exception=False, default_val=None,
            subfield=None):
    """
    Map an old value onto a new one using the given map.

    With a dict map, the key whose collection contains the upper-cased value
    is returned, so the dict's values must be upper case to match. With a
    list map, the value is returned unchanged when the list contains it.
    String values are stripped first; an empty value maps to ``None``.

    :param field_map: dict of ``new value -> old values`` or list of values
    :param val: old value
    :param raise_exception: raise when a non-empty value matches nothing and
        no default is given
    :param default_val: returned when a non-empty value matches nothing
    :param subfield: subfield reported in the raised exception
    :raises UnexpectedValue: see ``raise_exception``
    :return: output value matched in map, the default, or ``None``
    """
    if isinstance(val, str):
        val = val.strip()
    if not val:
        return None

    if isinstance(field_map, dict):
        for new_value, old_values in field_map.items():
            if val.upper() in old_values:
                return new_value
    elif isinstance(field_map, list) and val in field_map:
        return val

    # No match for a non-empty value: fall back, complain, or give up.
    if default_val:
        return default_val
    if raise_exception:
        raise UnexpectedValue(subfield=subfield)
    return None
def _get_correct_ils_contributor_role(subfield, role): """Clean up roles.""" translations = { "author": "AUTHOR", "author.": "AUTHOR", "dir.": "SUPERVISOR", "dir": "SUPERVISOR", "supervisor": "SUPERVISOR", "ed.": "EDITOR", "editor": "EDITOR", "editor.": "EDITOR", "ed": "EDITOR", "ill.": "ILLUSTRATOR", "ill": "ILLUSTRATOR", "ed. et al.": "EDITOR", } if role is None: return "AUTHOR" if isinstance(role, str): clean_role = role.lower() else: raise UnexpectedValue(subfield=subfield, message="unknown author role") if clean_role not in translations: return "AUTHOR" return translations[clean_role]
def licenses(self, key, value):
    """Translate a license field.

    The license id comes from subfield ``a``. When it is absent, a URL in
    subfield ``u`` that contains the arXiv non-exclusive license path is
    accepted instead.

    :raises UnexpectedValue: when no license id can be determined
    :return: dict with ``license`` and, when available, ``material`` and
        ``internal_notes``
    """
    arxiv_license_path = "arxiv.org/licenses/nonexclusive-distrib/1.0/"

    url = clean_val("u", value, str)
    material = mapping(
        MATERIALS,
        clean_val("3", value, str, transform="lower"),
        raise_exception=True,
        subfield="3",
    )
    notes = clean_val("g", value, str)
    license_id = clean_val("a", value, str)

    # No id given: check whether the URL is there instead.
    # The only known URL at the moment is the arXiv one.
    if not license_id and url and arxiv_license_path in url:
        license_id = "arXiv-nonexclusive-distrib-1.0"
    if not license_id:
        raise UnexpectedValue()

    result = {}
    if material:
        result["material"] = material
    if notes:
        result["internal_notes"] = notes
    result["license"] = {"id": license_id}
    return result
def barcode(self, key, value):
    """Translate barcodes and report numbers.

    A value carrying subfield ``a`` or ``9`` is stored as a
    ``report_number`` identifier (marked hidden when it came from ``9``).
    Otherwise subfields ``n`` and ``x`` are stored as a volume barcode via
    ``_insert_volume``.

    :raises UnexpectedValue: when report-number and barcode subfields are
        mixed, or both ``a`` and ``9`` are given
    :raises MissingRequiredField: when ``n`` or ``x`` is missing
    :raises IgnoreKey: always, once the value has been processed
    """
    _migration = self["_migration"]

    for item in force_list(value):
        report_number = clean_val("a", item, str)
        volume = clean_val("n", item, str)
        barcode_value = clean_val("x", item, str)
        hidden_report_number = clean_val("9", item, str)

        if report_number or hidden_report_number:
            both_numbers = report_number and hidden_report_number
            if volume or barcode_value or both_numbers:
                raise UnexpectedValue()
            identifier = {
                "scheme": "report_number",
                "value": report_number or hidden_report_number,
            }
            if hidden_report_number:
                identifier["hidden"] = True
            identifiers = self.get("identifiers", [])
            identifiers.append(identifier)
            self["identifiers"] = identifiers
            raise IgnoreKey("barcode")

        if not barcode_value:
            raise MissingRequiredField(
                subfield="x",
                message=" this record is missing a barcode number",
            )
        if not volume:
            raise MissingRequiredField(
                subfield="n", message=" this record is missing a volume number"
            )
        volume_number = extract_volume_number(
            volume, raise_exception=True, subfield="n"
        )
        _insert_volume(_migration, volume_number, {"barcode": barcode_value})

    raise IgnoreKey("barcode")
def document_type(self, key, value):
    """Translate the document type field.

    Subfields ``a`` and ``b`` are both mapped through ``DOCUMENT_TYPE``.
    Every mapped value must agree with the document type found so far
    (including one already stored on the record).

    :param key: field tag being translated (not used by this rule)
    :param value: one subfield mapping or a list of them
    :raises UnexpectedValue: when no document type can be determined
    :raises ManualImportRequired: when the values contradict each other
    :return: the document type
    """
    _doc_type = self.get("document_type", {})

    def doc_type_mapping(val):
        if val:
            return mapping(DOCUMENT_TYPE, val)

    for v in force_list(value):
        val_a = doc_type_mapping(clean_val("a", v, str))
        val_b = doc_type_mapping(clean_val("b", v, str))

        if not val_a and not val_b and not _doc_type:
            raise UnexpectedValue(subfield="a")

        if val_a and val_b and val_a != val_b and val_b != _doc_type:
            raise ManualImportRequired(
                subfield="a or b - inconsistent doc type")

        for code, mapped in (("a", val_a), ("b", val_b)):
            if not mapped:
                continue
            # Each subfield is compared against its OWN mapped value; the
            # "b" check used to compare against val_a, which let a
            # conflicting "b" overwrite the type and rejected a matching one.
            if _doc_type and _doc_type != mapped:
                raise ManualImportRequired(
                    subfield=f"{code} - inconsistent doc type")
            _doc_type = mapped

    return _doc_type
def sync_tag(self, key, value):
    """Translate the synchronisation tag.

    :param key: field tag being translated (not used by this rule)
    :param value: mapping of subfield codes to their values
    :raises UnexpectedValue: when subfield ``a`` is missing or is neither
        ``ILSSYNC`` nor ``ILSLINK`` (case-insensitive)
    :return: ``True`` for a recognised tag
    """
    tag = clean_val('a', value, str)
    # Guard against a missing subfield so it is reported as UnexpectedValue
    # rather than failing with AttributeError on None.upper().
    if tag and tag.upper() in ["ILSSYNC", "ILSLINK"]:
        return True
    raise UnexpectedValue(subfield='a')
def conference_info(self, key, value):
    """Translate conference information.

    Field ``111__`` supplies title, place, dates, country, series number and
    a ``CERN_CODE`` identifier; any other field tag only supplies the
    acronym from subfield ``a``.

    :param key: field tag; selects which of the two layouts is read
    :param value: one subfield mapping or a list of them
    :raises UnexpectedValue: on unparsable dates, unknown country code or a
        non-integer series number
    :return: the updated conference info dict
    """
    info = self.get("conference_info", {})

    for entry in force_list(value):
        if key != "111__":
            info["acronym"] = clean_val("a", entry, str)
            continue

        try:
            opening = parser.parse(clean_val("9", entry, str, req=True))
            closing = parser.parse(clean_val("z", entry, str, req=True))
            dates = "{0} - {1}".format(
                opening.date().isoformat(),
                closing.date().isoformat(),
            )
        except ValueError:
            raise UnexpectedValue(subfield="9 or z")

        country_code = clean_val("w", entry, str)
        if country_code:
            try:
                country = pycountry.countries.get(alpha_2=country_code)
                country_code = str(country.alpha_2)
            except (KeyError, AttributeError):
                raise UnexpectedValue(subfield="w")

        try:
            series_number = clean_val("n", entry, int)
        except TypeError:
            raise UnexpectedValue("n", message=" series number not an int")

        identifiers = info.get("identifiers", [])
        identifiers.append(
            {"scheme": "CERN_CODE", "value": clean_val("g", entry, str)}
        )

        title = clean_val("a", entry, str, req=True)
        place = clean_val("c", entry, str, req=True)
        info.update({
            "title": title,
            "place": place,
            "dates": dates,
            "identifiers": identifiers,
            "series": {"number": series_number},
            "country": country_code,
        })

    return info
def document_type(self, key, value):
    """Validate the document type markers and drop the key.

    Accepted combinations are ``PERI`` in subfield ``a`` of ``980__`` or
    ``690C_``, and ``31`` in subfield ``a`` of ``960__``.

    :raises IgnoreKey: for an accepted combination
    :raises UnexpectedValue: for anything else
    """
    for v in force_list(value):
        code = clean_val("a", v, str)
        is_peri_marker = key in ("980__", "690C_") and code == "PERI"
        is_31_marker = key == "960__" and code == "31"
        if is_peri_marker or is_31_marker:
            raise IgnoreKey("document_type")
        raise UnexpectedValue(subfield="a")
def imprint(self, key, value):
    """Translate the imprint field and record the publication year.

    Subfield ``c`` (required) must be parsable as a date; its year is stored
    on ``self["publication_year"]`` while the returned ``date`` keeps the
    cleaned subfield text.

    :raises UnexpectedValue: when subfield ``c`` is missing or unparsable
    :return: dict with ``date``, ``place``, ``publisher``, ``reprint_date``
    """
    reprint = clean_val("g", value, str)
    if reprint:
        reprint = reprint.lower().replace("repr.", "").strip()

    try:
        date_text = clean_val("c", value, str, req=True)
        parsed = parser.parse(date_text)
    except Exception:
        # Any failure here (parser errors included) is reported against "c".
        raise UnexpectedValue(subfield="c")

    self["publication_year"] = str(parsed.date().year)
    place = clean_val("a", value, str)
    publisher = clean_val("b", value, str)
    return {
        "date": date_text,
        "place": place,
        "publisher": publisher,
        "reprint_date": reprint,
    }
def clean_val(
    subfield,
    value,
    var_type,
    req=False,
    regex_format=None,
    default=None,
    manual=False,
    transform=None,
):
    """
    Read one subfield and convert it using common rules.

    :param subfield: marcxml subfield indicator
    :param value: marcxml value (mapping of subfield codes to values)
    :param var_type: expected type of the cleaned value (str, bool or int)
    :param req: specifies if the value is required in the end schema
    :param regex_format: pattern passed on to ``clean_str``
    :param default: returned when the value is missing and required
    :param manual: if the value should be cleaned manually during the
        migration
    :param transform: string transform function (or callable), passed on to
        ``clean_str``
    :raises ManualImportRequired: when ``manual`` is set and a value exists
    :raises MissingRequiredField: when a required value is missing and no
        default is given
    :raises UnexpectedValue: when the conversion fails
    :return: cleaned output value, or ``None`` for a missing optional value
    """
    raw = value.get(subfield)

    if manual and raw:
        raise ManualImportRequired

    if raw is None:
        if not req:
            return None
        if default:
            return default
        raise MissingRequiredField

    try:
        if var_type is str:
            return clean_str(raw, regex_format, req, transform)
        if var_type is bool:
            return bool(raw)
        if var_type is int:
            return int(raw)
        raise NotImplementedError
    except (ValueError, TypeError):
        raise UnexpectedValue(subfield=subfield)
def table_of_content(self, key, value):
    """Translate the table of content field into a list of chapters.

    Subfields ``a`` and ``t`` are joined with `` -- `` and the result is
    split on ``"; "``, ``" -- "`` and ``"--"``.

    :raises UnexpectedValue: when both subfields are empty
    :return: list of chapter strings
    """
    part_a = clean_val("a", value, str) or ""
    part_t = clean_val("t", value, str) or ""
    text = "{0} -- {1}".format(part_a, part_t).strip()

    if text == "--":
        raise UnexpectedValue(subfield="a or t")

    # NOTE(review): when only one subfield is filled the split leaves an
    # empty string in the list - presumably cleaned up by the caller; verify.
    return re.split(r"; | -- |--", text)
def imprint(self, key, value):
    """Translate imprint field.

    Stores subfield ``c`` (trailing dots removed) on
    ``self["publication_year"]`` and returns the place from subfield ``a``
    (trailing colons removed). The publisher is always ``"Springer"``.

    :param key: field tag being translated (not used by this rule)
    :param value: mapping of subfield codes to their values
    :raises UnexpectedValue: when a publication year was already set
    :return: dict with ``place`` and ``publisher``
    """
    if self.get("publication_year"):
        raise UnexpectedValue(subfield="e",
                              message="doubled publication year")

    # clean_val may return None for a missing subfield; only strip real
    # strings so a missing "c" or "a" no longer crashes with AttributeError.
    pub_year = clean_val("c", value, str)
    self["publication_year"] = pub_year.rstrip('.') if pub_year else pub_year

    place = clean_val("a", value, str)
    return {
        "place": place.rstrip(':') if place else place,
        "publisher": "Springer",
    }
def _clean(value_to_clean): if value_to_clean is not None: try: if var_type is str: return clean_str(value_to_clean, regex_format, req, transform) elif var_type is bool: return bool(value_to_clean) elif var_type is int: return int(value_to_clean) else: raise NotImplementedError except ValueError: raise UnexpectedValue(subfield=subfield) except TypeError: raise UnexpectedValue(subfield=subfield) except (UnexpectedValue, MissingRequiredField) as e: e.subfield = subfield e.message += str(force_list(value)) raise e
def title(self, key, value):
    """Translate the title field.

    A subtitle in subfield ``b`` is appended to
    ``self["alternative_titles"]`` with type ``SUBTITLE``.

    :raises UnexpectedValue: when the record already has a title
    :return: the title from subfield ``a`` (required)
    """
    if "title" in self:
        raise UnexpectedValue()

    if "b" in value:
        alternative_titles = self.get("alternative_titles", [])
        subtitle_entry = {
            "value": clean_val("b", value, str),
            "type": "SUBTITLE",
        }
        alternative_titles.append(subtitle_entry)
        self["alternative_titles"] = alternative_titles

    return clean_val("a", value, str, req=True)
def languages(self, key, value):
    """Translate a language field to its upper-cased three-letter code.

    :param key: field tag being translated (not used by this rule)
    :param value: mapping of subfield codes to their values
    :raises UnexpectedValue: when subfield ``b`` is missing or the language
        cannot be resolved through ``pycountry``
    :raises IgnoreKey: when the language is already in ``self["languages"]``
    :return: the upper-cased ``alpha_3`` code of the language
    """
    _languages = self.get("languages", [])
    try:
        # clean_val may hand back None for a missing subfield; calling
        # .lower() inside the try lets the AttributeError handler below
        # turn that into UnexpectedValue instead of a raw crash.
        lang = clean_val("b", value, str).lower()
        new_lang = pycountry.languages.lookup(lang).alpha_3.upper()
    except (KeyError, AttributeError, LookupError):
        # NOTE(review): the value is read from subfield "b" but the error
        # reports subfield "a" - confirm which one is intended.
        raise UnexpectedValue(subfield="a")

    if new_lang in _languages:
        raise IgnoreKey("languages")
    return new_lang
def standard_review(self, key, value):
    """Translate the standard review field into ``extensions`` entries.

    Subfield ``i`` is mapped through ``APPLICABILITY`` and collected in a
    list. Subfield ``z``, when present, must contain a month name and
    exactly one number (the year); it is stored as the ISO date of the first
    day of that month.

    :param key: field tag being translated (not used by this rule)
    :param value: mapping of subfield codes to their values
    :raises UnexpectedValue: when subfield ``z`` is empty, names no month,
        or does not contain exactly one usable year
    :return: the updated extensions dict
    """
    _extensions = self.get("extensions", {})
    applicability_list = _extensions.get("standard_review_applicability", [])
    applicability = mapping(
        APPLICABILITY,
        clean_val("i", value, str),
    )
    if applicability and applicability not in applicability_list:
        applicability_list.append(applicability)

    if "z" in value:
        check_date = clean_val("z", value, str)
        if not check_date:
            raise UnexpectedValue(subfield="z")

        # Normalise date: find the month by name (the last match wins).
        lowered = check_date.lower()
        check_date_month = None
        for month in month_name[1:]:
            if month.lower() in lowered:
                check_date_month = month

        check_date_year = re.findall(r"\d+", check_date)
        # A date without a month name used to fail with an uncaught
        # UnboundLocalError; report it like every other malformed date.
        if check_date_month is None or len(check_date_year) != 1:
            raise UnexpectedValue(subfield="z")

        try:
            datetime_object = datetime.datetime.strptime(
                "{} 1 {}".format(check_date_month, check_date_year[0]),
                "%B %d %Y",
            )
        except ValueError:
            raise UnexpectedValue(subfield="z")
        _extensions.update({
            "standard_review_checkdate": datetime_object.date().isoformat(),
        })

    _extensions.update({
        "standard_review_applicability": applicability_list,
        "standard_review_standard_validity": clean_val("v", value, str),
        "standard_review_expert": clean_val("p", value, str),
    })
    return _extensions
def alternative_identifiers(self, key, value):
    """Translate external system identifier fields.

    Behaviour depends on the field tag:

    * ``0247_``: DOIs are delegated to ``dois``; ASIN entries are ignored;
      anything else is rejected.
    * ``035__``: CERCER entries and ignorable schemes are skipped, known
      schemes are returned.
    * ``036__``: returned when both ``a`` and ``9`` are present, unless the
      scheme is ignorable.

    Schemes are upper-cased before being checked and stored.

    :raises IgnoreKey: when the entry is handled elsewhere or must be skipped
    :raises UnexpectedValue: on an unknown identifier type or scheme
    :return: dict with ``value`` and ``scheme`` (possibly empty)
    """
    field_type = clean_val("2", value, str)
    sub_a = clean_val("a", value, str, req=True)
    entry = {}

    if key == "0247_":
        source = field_type.lower() if field_type else None
        if source == "doi":
            # if 0247__2 == doi it is a DOI identifier
            self["identifiers"] = dois(self, key, value)
            raise IgnoreKey("alternative_identifiers")
        if source == "asin":
            raise IgnoreKey("alternative_identifiers")
        raise UnexpectedValue(subfield="2", )

    if key == "035__":
        if "CERCER" in sub_a:
            raise IgnoreKey("alternative_identifiers")
        scheme = clean_val("9", value, str, req=True).upper()
        if "CERCER" in scheme:
            raise IgnoreKey("alternative_identifiers")
        if scheme in EXTERNAL_SYSTEM_IDENTIFIERS:
            entry.update({"value": sub_a, "scheme": scheme})
        elif scheme in EXTERNAL_SYSTEM_IDENTIFIERS_TO_IGNORE:
            raise IgnoreKey("external_system_identifiers")
        else:
            raise UnexpectedValue(subfield="9")

    if key == "036__" and "a" in value and "9" in value:
        scheme = clean_val("9", value, str, req=True).upper()
        if scheme in EXTERNAL_SYSTEM_IDENTIFIERS_TO_IGNORE:
            raise IgnoreKey("external_system_identifiers")
        entry.update({"value": sub_a, "scheme": scheme})

    return entry
def imprint(self, key, value):
    """Translate imprint field.

    Subfields ``c``, ``a`` and ``b`` are each passed through
    ``reverse_replace`` with ``"."``, ``":"`` and ``","`` respectively
    (presumably to drop trailing punctuation - verify against the helper).
    The cleaned ``c`` value is also stored on ``self["publication_year"]``.

    :raises UnexpectedValue: when a publication year was already set
    :return: dict with ``place``, ``publisher`` and ``date``
    """
    if self.get("publication_year"):
        raise UnexpectedValue(subfield="e",
                              message="doubled publication year")

    pub_year = reverse_replace(clean_val("c", value, str), ".", "")
    self["publication_year"] = pub_year

    place = reverse_replace(clean_val("a", value, str), ":", "")
    publisher = reverse_replace(clean_val("b", value, str), ",", "")
    return {"place": place, "publisher": publisher, "date": pub_year}
def imprint(self, key, value):
    """Translate imprint field.

    Stores subfield ``c`` on ``self["publication_year"]`` and joins the
    publisher entries of subfield ``b`` with ``", "``.

    :param key: field tag being translated (not used by this rule)
    :param value: mapping of subfield codes to their values
    :raises UnexpectedValue: when a publication year was already set
    :return: dict with ``place``, ``publisher`` and ``date``
    """
    if self.get("publication_year"):
        raise UnexpectedValue(subfield="e",
                              message="doubled publication year")

    pub_year = clean_val("c", value, str)
    self["publication_year"] = pub_year

    publishers = value.get("b")
    if publishers is None:
        # A missing subfield used to crash the join with a TypeError.
        publisher = None
    elif isinstance(publishers, str):
        # A single publisher must not be iterated character by character
        # (that turned "Wiley" into "W, i, l, e, y").
        publisher = publishers
    else:
        publisher = ", ".join(publishers)

    return {
        "place": clean_val("a", value, str),
        "publisher": publisher,
        "date": pub_year,
    }
def number_of_pages(self, key, value):
    """Translates number_of_pages fields.

    Subfield ``x`` qualifies what subfield ``a`` holds: ``volume`` entries
    are dropped, ``phys.desc`` entries are stored as the physical
    description. Without ``x``, subfield ``a`` is split with
    ``extract_parts`` into a page count and an optional physical
    description.

    :raises IgnoreKey: when the value is not a page count
    :raises UnexpectedValue: when extra content is found or no page count
        could be extracted
    :return: the number of pages as a string; ``None`` when ``x`` holds a
        qualifier that is not handled here
    """
    val_x = clean_val("x", value, str)
    val_a = clean_val("a", value, str)
    if val_x:
        if val_x == "volume":
            raise IgnoreKey("number_of_pages")
        elif val_x.lower() in ["phys.desc.", "phys.desc"]:
            # Here subfield "a" is a description, not a page count.
            self["physical_description"] = val_a
            raise IgnoreKey("number_of_pages")
    else:
        # NOTE(review): this ``else`` is read as belonging to ``if val_x``;
        # confirm against the original layout.
        if is_excluded(val_a):
            raise IgnoreKey("number_of_pages")
        parts = extract_parts(val_a)
        if parts["has_extra"]:
            raise UnexpectedValue(subfield="a")
        if parts["physical_description"]:
            self["physical_description"] = parts["physical_description"]
        if parts["number_of_pages"]:
            return str(parts["number_of_pages"])
        # Nothing usable could be extracted from subfield "a".
        raise UnexpectedValue(subfield="a")
def dois(self, key, value):
    """Translate DOIs.

    Subfield ``q`` qualifies the DOI: when it names a volume the DOI is
    stored on that volume via ``_insert_volume``; otherwise it is mapped to
    a material and the DOI is added to ``self["identifiers"]``.

    NOTE(review): this function returns ``None``; callers that assign its
    result get ``None`` - confirm whether a return value is expected.

    :param key: field tag being translated (not used by this rule)
    :param value: one subfield mapping or a list of them
    :raises UnexpectedValue: when subfield ``2`` is set to something other
        than DOI (any letter case), or ``q`` looks like it contains a volume
        that could not be extracted
    """
    _migration = self["_migration"]
    _identifiers = self.get("identifiers", [])

    for v in force_list(value):
        val_2 = clean_val("2", v, str)
        # Case-insensitive: callers already dispatch here for "doi" in any
        # letter case, so a lower-case tag must not be rejected.
        if val_2 and val_2.upper() != "DOI":
            raise UnexpectedValue(
                subfield="2", message=" field is not equal to DOI"
            )
        val_q = clean_val("q", v, str, transform="lower")
        volume_info = extract_volume_info(val_q)
        doi = {
            "value": clean_val("a", v, str, req=True),
            "source": clean_val("9", v, str),
            "scheme": "DOI",
        }
        if volume_info:
            # this identifier is for a specific volume
            volume_obj = {
                "doi": doi["value"],
                "material": mapping(
                    MATERIALS, volume_info["description"],
                    raise_exception=True
                ),
                "source": doi["source"],
            }
            _insert_volume(_migration, volume_info["volume"], volume_obj)
        else:
            # val_q is None when the subfield is missing; re.match would
            # raise TypeError on it, so only test real strings.
            if val_q and re.match(r".* \(.*\)", val_q):
                raise UnexpectedValue(
                    subfield="q",
                    message=" found a volume number but could not extract it",
                )
            doi["material"] = mapping(MATERIALS, val_q, raise_exception=True)
            if doi not in _identifiers:
                _identifiers.append(doi)

    if _identifiers:
        self["identifiers"] = _identifiers
def multivolume_record_format(self, key, value):
    """Record the multivolume kind in ``_migration``.

    ``MULTIVOLUMES-1`` stores ``True`` and ``MULTIVOLUMES-X`` /
    ``MULTIVOLUMES-x`` store ``False`` under
    ``_migration["multivolume_record_format"]``.

    :raises Exception: for ``MULTIVOLUMES-MANUAL`` records
    :raises UnexpectedValue: for any other tag
    :raises IgnoreKey: always, once the flag has been stored
    """
    tag = clean_val("a", value, str)
    _migration = self["_migration"]

    if tag == "MULTIVOLUMES-MANUAL":
        raise Exception("This record should not be migrated!")

    parsed_by_tag = {
        "MULTIVOLUMES-1": True,
        "MULTIVOLUMES-X": False,
        "MULTIVOLUMES-x": False,
    }
    if tag not in parsed_by_tag:
        raise UnexpectedValue(
            subfield="a", message=" unrecognized migration multipart tag"
        )

    _migration["multivolume_record_format"] = parsed_by_tag[tag]
    raise IgnoreKey("multivolume_record_format")
def title(self, key, value):
    """Translate the title field.

    Trailing slashes are removed from title and subtitle, and runs of
    whitespace in the title are collapsed to single spaces. A subtitle in
    subfield ``b`` is appended to ``self["alternative_titles"]`` with type
    ``SUBTITLE``.

    :raises UnexpectedValue: when the record already has a title
    :return: the cleaned title from subfield ``a`` (required)
    """
    if "title" in self:
        raise UnexpectedValue(message="Ambiguous title")

    if "b" in value:
        alternative_titles = self.get("alternative_titles", [])
        subtitle = clean_val("b", value, str).rstrip('/')
        alternative_titles.append({"value": subtitle, "type": "SUBTITLE"})
        self["alternative_titles"] = alternative_titles

    raw_title = clean_val("a", value, str, req=True).rstrip("/")
    # remove excess white spaces
    return " ".join(raw_title.split())