def extract_convert_identifiers(sum_doc):
    """Collect the identifiers of a Summon record, split by what they identify.

    Every key of ``summon_identifier_type_to_metajson_identifier_type`` that is
    present in ``sum_doc`` is read; each of its values is converted with
    ``metajson.create_identifier``. Identifiers of type ``issn``, ``eissn`` and
    ``isbn`` go to the "is part of" list, all other types go to the item list.

    NOTE(review): the previous implementation ended with ``return identifiers``,
    a name that was never defined, so every call raised NameError. The tuple
    returned here exposes everything the function computes; confirm the shape
    with the intended callers.

    Args:
        sum_doc: Summon record, used as a mapping from Summon field name to an
            iterable of values.

    Returns:
        A 4-tuple ``(identifiers_item, identifiers_is_part_of, has_isbn,
        has_eissn)``: the two identifier lists, followed by flags telling
        whether at least one ``isbn`` / ``eissn`` value was seen.
    """
    has_isbn = False
    has_eissn = False
    identifiers_item = []
    identifiers_is_part_of = []

    for sum_key in summon_identifier_type_to_metajson_identifier_type:
        if sum_key not in sum_doc:
            continue
        # The target type depends only on the Summon key, not on the value.
        id_type = summon_identifier_type_to_metajson_identifier_type[sum_key]
        for id_value in sum_doc[sum_key]:
            identifier = metajson.create_identifier(id_type, id_value)
            if id_type in ("issn", "eissn", "isbn"):
                # Flags are raised per value, so an empty value list leaves
                # them False.
                if id_type == "eissn":
                    has_eissn = True
                elif id_type == "isbn":
                    has_isbn = True
                identifiers_is_part_of.append(identifier)
            else:
                identifiers_item.append(identifier)

    return identifiers_item, identifiers_is_part_of, has_isbn, has_eissn
def convert_mods_name_to_contributor(mods_name, dai_dict):
    """Convert a MODS ``name`` element into a Contributor.

    Only names whose ``type`` attribute is ``"personal"`` get a ``person``
    entry; for any other type an empty Contributor is returned.

    Args:
        mods_name: MODS name element, or None. It must provide ``get()`` and
            ``findall()`` (presumably an ElementTree element -- confirm).
        dai_dict: optional mapping from the element's ``ID`` attribute to a
            dict holding ``"authority"`` and ``"value"`` strings; when the ID
            is found, ``authority + "/" + value`` is attached to the person as
            a ``uri`` identifier.

    Returns:
        A Contributor, or None when ``mods_name`` is None.
    """
    if mods_name is None:
        return None

    contributor = Contributor()
    # TODO: the affiliation, role/roleTerm and description children of the
    # name element are not mapped yet.
    if mods_name.get("type") != "personal":
        return contributor

    person = Person()

    name_id = mods_name.get("ID")
    if name_id is not None and dai_dict is not None and name_id in dai_dict:
        dai = dai_dict[name_id]
        id_value = dai["authority"] + "/" + dai["value"]
        person.add_item_to_key(metajson.create_identifier("uri", id_value), "identifiers")

    for name_part in mods_name.findall(prefixtag("mods", "namePart")):
        part_type = name_part.get("type")
        if part_type == "given":
            person["name_given"] = name_part.text
        elif part_type == "family":
            person["name_family"] = name_part.text
        elif part_type == "date":
            if not name_part.text:
                # An empty <namePart type="date"/> has no text to parse.
                continue
            date = name_part.text.replace("(", "").replace(")", "")
            # "birth-death": split on the first dash; without a dash the whole
            # string is the birth date.
            birth, dash, death = date.partition("-")
            person["date_of_birth"] = birth
            if dash:
                person["date_of_death"] = death
        elif part_type == "termsOfAddress":
            # The kind of a namePart is carried by its "type" attribute; the
            # former test on a "termsOfAddress" attribute could never match.
            person["name_terms_of_address"] = name_part.text

    contributor["person"] = person
    return contributor
def convert_summon_json_document_to_metajson_document(sum_doc, source):
    """Convert one Summon JSON record into a metajson Document.

    Args:
        sum_doc: Summon record, used as a mapping from Summon field name to a
            list of values. ``"ID"`` and ``"ContentType"`` are read
            unconditionally (first element of each).
        source: value stored as the document's ``rec_source``.

    Returns:
        The populated Document. Host-level data (ISSN/eISSN/ISBN identifiers,
        publisher, peer-review flag, publication title...) is placed in a
        nested ``is_part_of`` Document when a host type can be determined, and
        on the document itself otherwise.

    Side effects:
        Prints a one-line trace per record, plus a warning line when no host
        type can be determined for a record that has a publication title.
    """
    document = Document()

    # Extract Summon properties
    rec_id = sum_doc["ID"][0].replace("FETCH-", "")
    sum_type = sum_doc["ContentType"][0]
    rec_type = summon_document_type_to_metajson_document_type[sum_type]

    # rec_id, rec_source, rec_type
    document["rec_id"] = rec_id
    document["rec_source"] = source
    document["rec_type"] = rec_type

    # languages: the first convertible language becomes the main language
    main_language = None
    if "Language" in sum_doc:
        languages = []
        for sum_lang in sum_doc["Language"]:
            lang = language_service.convert_english_to_rfc5646(sum_lang)
            if lang:
                languages.append(lang)
        if languages:
            main_language = languages[0]
            document["languages"] = languages

    # extract summon properties
    contributors = extract_contributors(sum_doc)
    copyright_statement = extract_value(sum_doc, "Copyright")
    date_issued = extract_date_issued(sum_doc)
    degree = extract_value(sum_doc, "DissertationDegree")
    descriptions = extract_convert_langstrings(sum_doc, "Abstract", main_language)
    edition = extract_value(sum_doc, "Edition")
    extent_pages = extract_value(sum_doc, "PageCount")
    genre = extract_value(sum_doc, "Genre")
    is_part_of_edition = extract_value(sum_doc, "PublicationEdition")
    is_part_of_title = extract_value(sum_doc, "PublicationTitle")
    is_part_of_title_sub = extract_value(sum_doc, "PublicationSubtitle")
    notes = extract_convert_langstrings(sum_doc, "Notes", main_language)
    part_issue = extract_value(sum_doc, "Issue")
    part_page_end = extract_value(sum_doc, "EndPage")
    part_page_start = extract_value(sum_doc, "StartPage")
    part_volume = extract_value(sum_doc, "Volume")
    peer_reviewed = extract_boolean_value(sum_doc, "IsPeerReviewed")
    publisher = extract_value(sum_doc, "Publisher")
    publisher_place = extract_value(sum_doc, "PublicationPlace")
    scholarly = extract_boolean_value(sum_doc, "IsScholarly")
    series_title = extract_value(sum_doc, "PublicationSeriesTitle")
    subject_keywords = extract_value(sum_doc, "Keywords", True)
    subject_names = convert_contributors(sum_doc, "RelatedPersons", None, "person", None)
    subject_terms = extract_value(sum_doc, "SubjectTerms", True)
    table_of_contents = extract_convert_langstrings(sum_doc, "TableOfContents", main_language)
    title = extract_value(sum_doc, "Title")
    title_sub = extract_value(sum_doc, "Subtitle")

    # identifiers: issn / eissn / isbn describe the host publication, every
    # other identifier type describes the item itself
    has_isbn = False
    has_eissn = False
    identifiers_item = []
    identifiers_is_part_of = []
    for sum_key in summon_identifier_type_to_metajson_identifier_type:
        if sum_key not in sum_doc:
            continue
        id_type = summon_identifier_type_to_metajson_identifier_type[sum_key]
        for id_value in sum_doc[sum_key]:
            identifier = metajson.create_identifier(id_type, id_value)
            if id_type in ("issn", "eissn", "isbn"):
                if id_type == "eissn":
                    has_eissn = True
                elif id_type == "isbn":
                    has_isbn = True
                identifiers_is_part_of.append(identifier)
            else:
                identifiers_item.append(identifier)

    # is_part_of_type determination
    is_part_of_type = None
    if sum_type in summon_document_type_to_metajson_document_is_part_of_type:
        is_part_of_type = summon_document_type_to_metajson_document_is_part_of_type[sum_type]
    elif (is_part_of_title and is_part_of_title != title
          and rec_type not in ("Book", "Journal", "Magazine", "Newspaper", "Periodical")):
        # No explicit mapping: guess the host type from identifiers and title.
        host_title = is_part_of_title.lower()
        if has_isbn:
            is_part_of_type = "Book"
        elif has_eissn:
            is_part_of_type = "Journal"
        elif "conference" in host_title:
            is_part_of_type = "Book"
        elif "review" in host_title or "journal" in host_title:
            # Substring tests: the former ``find(...) or find(...)`` was truthy
            # whenever the word was absent (find returns -1), which made the
            # two branches below unreachable.
            is_part_of_type = "Journal"
        elif rec_type == "Dataset":
            is_part_of_type = "Periodical"
        else:
            # Single-argument parenthesised form: same output under the
            # Python 2 print statement, and valid Python 3.
            print("unknown is_part_of_type for rec_type: %s" % rec_type)

    # is_part_of
    if is_part_of_type:
        is_part_of = Document()
        is_part_of.set_key_if_not_none("rec_type", is_part_of_type)
        is_part_of.set_key_if_not_none("edition", is_part_of_edition)
        is_part_of.add_items_to_key(identifiers_is_part_of, "identifiers")
        is_part_of.set_key_if_not_none("peer_reviewed", peer_reviewed)
        is_part_of.set_key_if_not_none("publisher", publisher)
        is_part_of.set_key_if_not_none("publisher_place", publisher_place)
        is_part_of.set_key_if_not_none("title", is_part_of_title)
        is_part_of.set_key_if_not_none("title_sub", is_part_of_title_sub)
        document.add_items_to_key(identifiers_item, "identifiers")
        document.add_items_to_key([is_part_of], "is_part_of")
    else:
        # No host document: keep the host-level data on the item itself.
        document.set_key_if_not_none("peer_reviewed", peer_reviewed)
        document.set_key_if_not_none("publisher", publisher)
        document.set_key_if_not_none("publisher_place", publisher_place)
        document.add_items_to_key(identifiers_is_part_of, "identifiers")
        document.add_items_to_key(identifiers_item, "identifiers")

    # series
    if series_title:
        series = Document()
        series.set_key_if_not_none("title", series_title)
        document.add_items_to_key([series], "series")

    # classifications
    extract_convert_add_classifications(sum_doc, document, "DEWEY", "ddc")
    extract_convert_add_classifications(sum_doc, document, "Discipline", "discipline")
    extract_convert_add_classifications(sum_doc, document, "NAICS", "NAICS")

    # set properties
    document.set_key_if_not_none("contributors", contributors)
    document.set_key_if_not_none("copyright_statement", copyright_statement)
    document.set_key_if_not_none("date_issued", date_issued)
    document.set_key_if_not_none("degree", degree)
    document.set_key_if_not_none("descriptions", descriptions)
    document.set_key_if_not_none("edition", edition)
    document.set_key_if_not_none("extent_pages", extent_pages)
    document.set_key_if_not_none("genre", genre)
    document.set_key_if_not_none("notes", notes)
    document.set_key_if_not_none("part_issue", part_issue)
    document.set_key_if_not_none("part_page_end", part_page_end)
    document.set_key_if_not_none("part_page_start", part_page_start)
    document.set_key_if_not_none("part_volume", part_volume)
    document.set_key_if_not_none("scholarly", scholarly)
    document.set_key_if_not_none("table_of_contents", table_of_contents)
    document.set_key_if_not_none("title", title)
    document.set_key_if_not_none("title_sub", title_sub)

    # subject
    subject = Subject()
    if subject_keywords:
        subject["keywords"] = subject_keywords
    if subject_names:
        subject["names"] = subject_names
    if subject_terms:
        subject["terms"] = subject_terms
    if subject:
        document["subjects"] = subject

    # Developer trace of the type mapping applied to this record.
    debug = True
    if debug:
        related_items_msg = "\t\t\t\t\t\t"
        if is_part_of_type:
            related_items_msg = "\tis_part_of: {} ".format(is_part_of_type)
        print("{}\t->\titem: {} {}\t:\t{}\t:\t{}".format(sum_type, rec_type, related_items_msg, rec_id, title))

    return document
def convert_mods_identifier(mods_identifier):
    """Turn a MODS identifier element into a metajson identifier.

    Args:
        mods_identifier: element providing ``get()`` and ``text`` (presumably
            an ElementTree element -- confirm), or None.

    Returns:
        The result of ``metajson.create_identifier`` called with the element's
        ``type`` attribute and its text, or None when no element is given.
    """
    if mods_identifier is None:
        return None
    id_type = mods_identifier.get("type")
    id_value = mods_identifier.text
    return metajson.create_identifier(id_type, id_value)