def convert_endnote_record_to_metajson_document(record, source):
    """Convert one EndNote XML record into a metajson Document.

    Args:
        record: element for a single EndNote record; it is queried with
            ``record.find(...)`` and passed to ``extract_text`` /
            ``extract_contributors`` together with EndNote XML paths.
        source: value stored as-is under the ``rec_source`` key.

    Returns:
        The populated ``Document``.

    Raises:
        KeyError: presumably, when the record's ``ref-type`` is missing from
            ``endnote_record_type_to_metajson_document_type`` (plain
            subscript lookup) -- confirm the mapping is a dict.
    """
    document = Document()

    # TODO
    # translated_contributors: /contributors/translated-authors/author/style
    # auth_address: /auth-address/style
    # label: /label/style
    # custom1

    # Extract endnote properties
    rec_id = record.find("rec-number").text
    endnote_type = record.find("ref-type").text
    rec_type = endnote_record_type_to_metajson_document_type[endnote_type]

    primary_contributors = extract_contributors(None, "aut", record, "./contributors/authors/author/style")
    secondary_contributors = extract_contributors(None, "edt", record, "./contributors/secondary-authors/author/style")

    # The role given to tertiary / subsidiary authors depends on the record
    # type. Both stay None for record types that never read them below.
    tertiary_contributors = None
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]:
        tertiary_contributors = extract_contributors(None, "edt", record, "./contributors/tertiary-authors/author/style")
    elif endnote_type == TYPE_THESIS:
        tertiary_contributors = extract_contributors(None, "ths", record, "./contributors/tertiary-authors/author/style")
    elif endnote_type == TYPE_FILM_OR_BROADCAST:
        tertiary_contributors = extract_contributors(None, "pro", record, "./contributors/tertiary-authors/author/style")

    subsidiary_contributors = None
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]:
        subsidiary_contributors = extract_contributors(None, "trl", record, "./contributors/subsidiary-authors/author/style")
    elif endnote_type == TYPE_FILM_OR_BROADCAST:
        subsidiary_contributors = extract_contributors(None, "act", record, "./contributors/subsidiary-authors/author/style")

    translated_contributors = extract_contributors(None, "trl", record, "./contributors/translated-authors/author/style")

    # Several of the values below (auth_address, date_pub, label, custom1-3,
    # custom5-7) are extracted but not mapped onto the document yet; see the
    # TODO list above.
    auth_address = extract_text(record, "./auth-address/style")
    title = extract_text(record, "./titles/title/style")
    title_secondary = extract_text(record, "./titles/secondary-title/style")
    title_tertiary = extract_text(record, "./titles/tertiary-title/style")
    title_alternative = extract_text(record, "./titles/alt-title/style")
    title_abbreviated = extract_text(record, "./titles/short-title/style")
    title_translated = extract_text(record, "./titles/translated-title/style")
    pages = extract_text(record, "./pages/style")
    part_volume = extract_text(record, "./volume/style")
    part_number = extract_text(record, "./number/style")
    extent_volumes = extract_text(record, "./num-vols/style")
    edition = extract_text(record, "./edition/style")
    part_section = extract_text(record, "./section/style")
    reprint_edition = extract_text(record, "./reprint-edition/style")
    keywords = extract_text(record, "./keywords/keyword/style")
    date_year = extract_text(record, "./dates/year/style")
    date_pub = extract_text(record, "./dates/pub-dates/date/style")
    publisher_place = extract_text(record, "./pub-location/style")
    publisher = extract_text(record, "./publisher/style")
    orig_pub = extract_text(record, "./orig-pub/style")
    isbn_or_issn = extract_text(record, "./isbn/style")
    accessionnumber = extract_text(record, "./accession-num/style")
    callnumber = extract_text(record, "./call-num/style")
    # For web pages the abstract is read from the pages field.
    if endnote_type == TYPE_WEB_PAGE:
        abstract = extract_text(record, "./pages/style")
    else:
        abstract = extract_text(record, "./abstract/style")
    label = extract_text(record, "./label/style")
    caption = extract_text(record, "./caption/style")
    note = extract_text(record, "./notes/style")
    reviewed_item = extract_text(record, "./reviewed-item/style")
    rec_type_description = extract_text(record, "./work-type/style")
    remote_url = extract_text(record, "./urls/related-urls/url/style")
    custom1 = extract_text(record, "./custom1/style")
    custom2 = extract_text(record, "./custom2/style")
    custom3 = extract_text(record, "./custom3/style")
    custom4 = extract_text(record, "./custom4/style")
    custom5 = extract_text(record, "./custom5/style")
    custom6 = extract_text(record, "./custom6/style")
    custom7 = extract_text(record, "./custom7/style")
    doi = extract_text(record, "./electronic-resource-num/style")
    remote_database_name = extract_text(record, "./remote-database-name/style")
    remote_database_provider = extract_text(record, "./remote-database-provider/style")
    research_notes = extract_text(record, "./research-notes/style")
    language = extract_text(record, "./language/style")
    access_date = extract_text(record, "./access-date/style")

    # rec_id, rec_source
    document["rec_id"] = rec_id
    document["rec_source"] = source

    # publisher, publisher_place: carriage returns become "; " separators
    if publisher:
        publisher = publisher.replace("\r", "; ")
    if publisher_place:
        publisher_place = publisher_place.replace("\r", "; ")

    # type, is_part_of.type, is_part_of.is_part_of.type
    # A record type without a container mapping simply has no is_part_of.
    is_part_of_type = endnote_record_type_to_metajson_document_is_part_of_type.get(endnote_type)
    is_part_of_is_part_of_type = None
    if title_secondary is not None:
        if endnote_type == TYPE_FIGURE:
            # how to determine the is_part_of type ?
            # if there is a volume or an issue number, it's a JournalArticle,
            # else it's a Book or BookChapter
            if part_volume is not None or part_number is not None:
                is_part_of_type = "Article"
                is_part_of_is_part_of_type = "Journal"
            elif title_translated is not None:
                is_part_of_type = "BookPart"
                is_part_of_is_part_of_type = "Book"
            else:
                is_part_of_type = "Book"
        elif endnote_type == TYPE_FILM_OR_BROADCAST:
            rec_type = "VideoPart"
            is_part_of_type = "VideoRecording"

    document["rec_type"] = rec_type
    document.set_key_if_not_none("rec_type_description", rec_type_description)

    # issn or isbn ?
    if is_part_of_type in ["Journal", "Newspaper", "Article"]:
        isbn_or_issn_type = "issn"
    else:
        isbn_or_issn_type = "isbn"

    # is_part_of, is_part_of.is_part_of
    if is_part_of_type is not None:
        is_part_of = Document()
        is_part_of.set_key_if_not_none("rec_type", is_part_of_type)
        is_part_of.set_key_if_not_none("title", title_secondary)

        if is_part_of_is_part_of_type is not None:
            # is_part_of in case of is_part_of.is_part_of
            # contributors with role aut
            is_part_of.add_contributors(contributor_service.change_contibutors_role(secondary_contributors, "aut"))

            # is_part_of.is_part_of
            is_part_of_is_part_of = Document()
            is_part_of_is_part_of.set_key_if_not_none("rec_type", is_part_of_is_part_of_type)
            is_part_of_is_part_of.set_key_if_not_none("title", title_translated)
            # contributors with role edt
            is_part_of_is_part_of.add_contributors(contributor_service.change_contibutors_role(translated_contributors, "edt"))
            is_part_of_is_part_of.set_key_if_not_none("publisher", publisher)
            is_part_of_is_part_of.set_key_if_not_none("publisher_place", publisher_place)
            is_part_of_is_part_of.set_key_with_value_type_in_list("identifiers", isbn_or_issn, isbn_or_issn_type)

            is_part_of.add_items_to_key([is_part_of_is_part_of], "is_part_of")
        else:
            # is_part_of in case of no is_part_of.is_part_of
            # contributors with role edt
            is_part_of.add_contributors(secondary_contributors)
            is_part_of.set_key_if_not_none("publisher", publisher)
            is_part_of.set_key_if_not_none("publisher_place", publisher_place)
            is_part_of.set_key_with_value_type_in_list("identifiers", isbn_or_issn, isbn_or_issn_type)

        document.add_items_to_key([is_part_of], "is_part_of")
    else:
        # No container: identifier and publisher data belong to the item.
        document.set_key_with_value_type_in_list("identifiers", isbn_or_issn, isbn_or_issn_type)
        if publisher:
            if endnote_type == TYPE_THESIS:
                document.add_contributors([contributor_service.convert_formatted_name_to_contributor(publisher, "orgunit", "dgg")])
            elif endnote_type == TYPE_FILM_OR_BROADCAST:
                document.add_contributors([contributor_service.convert_formatted_name_to_contributor(publisher, "orgunit", "dst")])
            else:
                document.set_key_if_not_none("publisher", publisher)
        # NOTE(review): publisher_place is stored whether or not a publisher
        # is present -- confirm this is the intended nesting.
        document.set_key_if_not_none("publisher_place", publisher_place)

    # series[]
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]:
        series = Document()
        if endnote_type == TYPE_BOOK and title_secondary:
            series.set_key_if_not_none("title", title_secondary)
            series.add_contributors(secondary_contributors)
            series.set_key_if_not_none("part_volume", part_number)
        if endnote_type == TYPE_BOOK_SECTION and title_tertiary:
            series.set_key_if_not_none("title", title_tertiary)
            series.add_contributors(tertiary_contributors)
            series.set_key_if_not_none("part_volume", part_number)
        if "title" in series and len(series) > 2:
            document.add_items_to_key([series], "series")

    # originals[]
    if (reprint_edition or orig_pub) and endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_JOURNAL_ARTICLE, TYPE_FILM_OR_BROADCAST]:
        original = Document()
        # reprint_edition wins over orig_pub when both are present.
        original.set_key_if_not_none("title", reprint_edition or orig_pub)
        original.set_key_if_not_none("rec_type", rec_type)
        document.add_items_to_key([original], "original")

    # review_of[]
    if reviewed_item and endnote_type in [TYPE_BOOK_SECTION, TYPE_JOURNAL_ARTICLE]:
        review_of = Document()
        review_of.set_key_if_not_none("title", reviewed_item)
        review_of.set_key_if_not_none("rec_type", "Book")
        document.add_items_to_key([review_of], "review_of")

    # abstracts[0].value
    if abstract:
        document["abstracts"] = [{"value": abstract}]

    # archive
    if endnote_type == TYPE_FIGURE and remote_database_provider:
        archive = Document()
        archive["title"] = remote_database_provider
        document.add_items_to_key([archive], "archive")

    # caption
    document.set_key_if_not_none("caption", caption)

    # contributors[]
    document.add_contributors(primary_contributors)
    if endnote_type in [TYPE_BOOK, TYPE_THESIS, TYPE_FILM_OR_BROADCAST]:
        document.add_contributors(tertiary_contributors)
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_FILM_OR_BROADCAST]:
        document.add_contributors(subsidiary_contributors)
    if custom4:
        document.add_contributors(convert_endnote_authors_to_contributors(custom4, "person", "rev"))
    if endnote_type == TYPE_FIGURE and remote_database_name:
        document.add_contributors(convert_endnote_authors_to_contributors(remote_database_name, None, "cph"))

    # edition
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_FILM_OR_BROADCAST, TYPE_WEB_PAGE] and edition:
        document["edition"] = edition

    # extent_pages, extent_volumes
    if endnote_type in [TYPE_BOOK, TYPE_THESIS] and pages:
        document["extent_pages"] = pages.replace("p.", "").strip()
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION] and extent_volumes:
        document["extent_volumes"] = extent_volumes

    # date_issued, date_issued_first
    if date_year:
        date_issued = ""
        date_issued_first = ""
        # A bracketed part of the year field ("[...]") is taken as the first
        # issue date; what remains is the issue date.
        orig_index_start = date_year.find("[")
        orig_index_end = date_year.find("]")
        if orig_index_start != -1 and orig_index_end != -1:
            date_issued_first = date_year[orig_index_start + 1: orig_index_end]
            date_issued = date_year.replace("[" + date_issued_first + "]", "").strip()
        else:
            date_issued = date_year.strip()

        # The dates go on the nearest enclosing "Book" container when there
        # is one, otherwise on the document itself.
        date_target = document
        if "is_part_of" in document:
            parent = document["is_part_of"][0]
            if parent["rec_type"] == "Book":
                date_target = parent
            elif "is_part_of" in parent and parent["is_part_of"][0]["rec_type"] == "Book":
                date_target = parent["is_part_of"][0]

        if date_target is document:
            document.set_key_if_not_none("date_issued_first", date_issued_first)
            document.set_key_if_not_none("date_issued", date_issued)
        else:
            date_target.set_key_if_not_none("date_issued", date_issued)
            date_target.set_key_if_not_none("date_issued_first", date_issued_first)

    # identifiers[]
    document.set_key_with_value_type_in_list("identifiers", accessionnumber, "accessionnumber")
    document.set_key_with_value_type_in_list("identifiers", callnumber, "callnumber")
    document.set_key_with_value_type_in_list("identifiers", doi, "doi")

    # language
    if language:
        iso639_2 = language_service.convert_unknown_format_to_iso639_2(language)
        if iso639_2:
            document["languages"] = [{"authority": "iso639-2b", "value": iso639_2}]

    # note
    if endnote_import_note and note:
        document.set_key_with_value_type_in_list("notes", note, "general")
    if endnote_import_research_note and research_notes:
        document.set_key_with_value_type_in_list("notes", research_notes, "user")

    # part_page_start & part_page_end
    if endnote_type in [TYPE_BOOK_SECTION, TYPE_FIGURE, TYPE_JOURNAL_ARTICLE] and pages:
        hyphen_index = pages.find("-")
        if hyphen_index == -1:
            document["part_page_start"] = pages.replace("p.", "").strip()
        else:
            document["part_page_start"] = pages[:hyphen_index].replace("p.", "").strip()
            document["part_page_end"] = pages[hyphen_index + 1:].replace("p.", "").strip()

    # NOTE(review): part_issue / part_number / part_section / part_volume are
    # treated as independent of the pages check above -- confirm.
    if endnote_type in [TYPE_JOURNAL_ARTICLE]:
        document.set_key_if_not_none("part_issue", part_number)
    elif endnote_type in [TYPE_FIGURE]:
        document.set_key_if_not_none("part_number", part_number)
    document.set_key_if_not_none("part_section", part_section)
    document.set_key_if_not_none("part_volume", part_volume)

    # resources[0]
    if remote_url is not None:
        resource = Resource()
        resource.set_key_if_not_none("remote_url", remote_url)
        # For web pages the access date is read from the number field.
        if endnote_type == TYPE_WEB_PAGE:
            resource.set_key_if_not_none("date_last_accessed", part_number)
        else:
            resource.set_key_if_not_none("date_last_accessed", access_date)
        document["resources"] = [resource]

    # subjects[]
    if endnote_import_keywords and keywords:
        for keyword in keywords.split():
            document.set_key_with_value_type_in_list("subjects", keyword, "topic")

    # title, title_alternative, title_abbreviated, title_translated
    document["title"] = title
    if title_alternative:
        document["title_alternative"] = [{"title": title_alternative}]
    if title_abbreviated:
        document["title_abbreviated"] = [{"title": title_abbreviated}]
    if not is_part_of_is_part_of_type and title_translated:
        # the title_translated is used for the is_part_of.is_part_of_type.title
        document["title_translated"] = [{"title": title_translated}]

    debug = True
    if debug:
        related_items_msg = ""
        if is_part_of_type:
            related_items_msg = "\tis_part_of: {} ".format(is_part_of_type)
        if is_part_of_is_part_of_type:
            related_items_msg += "\tis_part_of.is_part_of: {} ".format(is_part_of_is_part_of_type)
        # Five placeholders, five values: the title is not part of this trace
        # line. Single-argument print(...) behaves the same on Python 2 and 3.
        print("# {}\t:\t{}\t:\t{}\t->\titem: {} {}".format(source, rec_id, endnote_type, rec_type, related_items_msg))

    return document
def convert_summon_json_document_to_metajson_document(sum_doc, source):
    """Convert one Summon JSON document into a metajson Document.

    Args:
        sum_doc: mapping for a single Summon record. ``"ID"`` and
            ``"ContentType"`` are read as non-empty sequences; the other
            fields are read through the ``extract_*`` helpers.
        source: value stored as-is under the ``rec_source`` key.

    Returns:
        The populated ``Document``.

    Raises:
        KeyError: when ``"ID"`` or ``"ContentType"`` is absent from
            ``sum_doc``; presumably also when the content type is missing
            from ``summon_document_type_to_metajson_document_type`` (plain
            subscript lookup) -- confirm the mapping is a dict.
    """
    document = Document()

    # Extract Summon properties
    rec_id = sum_doc["ID"][0].replace("FETCH-", "")
    sum_type = sum_doc["ContentType"][0]
    rec_type = summon_document_type_to_metajson_document_type[sum_type]

    # rec_id, rec_source, rec_type
    document["rec_id"] = rec_id
    document["rec_source"] = source
    document["rec_type"] = rec_type

    # languages: the first convertible language becomes the main language
    main_language = None
    if "Language" in sum_doc:
        languages = []
        for sum_lang in sum_doc["Language"]:
            lang = language_service.convert_english_to_rfc5646(sum_lang)
            if lang:
                languages.append(lang)
        if languages:
            main_language = languages[0]
            document["languages"] = languages

    # extract summon properties
    contributors = extract_contributors(sum_doc)
    copyright_statement = extract_value(sum_doc, "Copyright")
    date_issued = extract_date_issued(sum_doc)
    degree = extract_value(sum_doc, "DissertationDegree")
    descriptions = extract_convert_langstrings(sum_doc, "Abstract", main_language)
    edition = extract_value(sum_doc, "Edition")
    extent_pages = extract_value(sum_doc, "PageCount")
    genre = extract_value(sum_doc, "Genre")
    is_part_of_edition = extract_value(sum_doc, "PublicationEdition")
    is_part_of_title = extract_value(sum_doc, "PublicationTitle")
    is_part_of_title_sub = extract_value(sum_doc, "PublicationSubtitle")
    notes = extract_convert_langstrings(sum_doc, "Notes", main_language)
    part_issue = extract_value(sum_doc, "Issue")
    part_page_end = extract_value(sum_doc, "EndPage")
    part_page_start = extract_value(sum_doc, "StartPage")
    part_volume = extract_value(sum_doc, "Volume")
    peer_reviewed = extract_boolean_value(sum_doc, "IsPeerReviewed")
    publisher = extract_value(sum_doc, "Publisher")
    publisher_place = extract_value(sum_doc, "PublicationPlace")
    scholarly = extract_boolean_value(sum_doc, "IsScholarly")
    series_title = extract_value(sum_doc, "PublicationSeriesTitle")
    subject_keywords = extract_value(sum_doc, "Keywords", True)
    subject_names = convert_contributors(sum_doc, "RelatedPersons", None, "person", None)
    subject_terms = extract_value(sum_doc, "SubjectTerms", True)
    table_of_contents = extract_convert_langstrings(sum_doc, "TableOfContents", main_language)
    title = extract_value(sum_doc, "Title")
    title_sub = extract_value(sum_doc, "Subtitle")

    # identifiers: issn / eissn / isbn are collected for the container,
    # every other identifier type for the item itself
    has_isbn = False
    has_eissn = False
    identifiers_item = []
    identifiers_is_part_of = []
    for sum_key in summon_identifier_type_to_metajson_identifier_type:
        if sum_key not in sum_doc:
            continue
        id_type = summon_identifier_type_to_metajson_identifier_type[sum_key]
        for id_value in sum_doc[sum_key]:
            identifier = metajson.create_identifier(id_type, id_value)
            if id_type in ("issn", "eissn", "isbn"):
                if id_type == "eissn":
                    has_eissn = True
                elif id_type == "isbn":
                    has_isbn = True
                identifiers_is_part_of.append(identifier)
            else:
                identifiers_item.append(identifier)

    # is_part_of_type determination
    is_part_of_type = None
    if sum_type in summon_document_type_to_metajson_document_is_part_of_type:
        is_part_of_type = summon_document_type_to_metajson_document_is_part_of_type[sum_type]
    elif is_part_of_title and is_part_of_title != title and rec_type not in ["Book", "Journal", "Magazine", "Newspaper", "Periodical"]:
        container_title = is_part_of_title.lower()
        if has_isbn:
            is_part_of_type = "Book"
        elif has_eissn:
            is_part_of_type = "Journal"
        elif "conference" in container_title:
            is_part_of_type = "Book"
        # Substring tests, not the truthiness of str.find(): find() returns
        # -1 (truthy) when the word is absent, which would make this branch
        # fire for nearly every title and hide the two branches below.
        elif "review" in container_title or "journal" in container_title:
            is_part_of_type = "Journal"
        elif rec_type == "Dataset":
            is_part_of_type = "Periodical"
        else:
            print("unknown is_part_of_type for rec_type: %s" % rec_type)

    # is_part_of
    if is_part_of_type:
        is_part_of = Document()
        is_part_of.set_key_if_not_none("rec_type", is_part_of_type)
        is_part_of.set_key_if_not_none("edition", is_part_of_edition)
        is_part_of.add_items_to_key(identifiers_is_part_of, "identifiers")
        is_part_of.set_key_if_not_none("peer_reviewed", peer_reviewed)
        is_part_of.set_key_if_not_none("publisher", publisher)
        is_part_of.set_key_if_not_none("publisher_place", publisher_place)
        is_part_of.set_key_if_not_none("title", is_part_of_title)
        is_part_of.set_key_if_not_none("title_sub", is_part_of_title_sub)

        document.add_items_to_key(identifiers_item, "identifiers")
        document.add_items_to_key([is_part_of], "is_part_of")
    else:
        # No container: its would-be properties stay on the item.
        document.set_key_if_not_none("peer_reviewed", peer_reviewed)
        document.set_key_if_not_none("publisher", publisher)
        document.set_key_if_not_none("publisher_place", publisher_place)
        document.add_items_to_key(identifiers_is_part_of, "identifiers")
        document.add_items_to_key(identifiers_item, "identifiers")

    # series
    if series_title:
        series = Document()
        series.set_key_if_not_none("title", series_title)
        document.add_items_to_key([series], "series")

    # classifications
    extract_convert_add_classifications(sum_doc, document, "DEWEY", "ddc")
    extract_convert_add_classifications(sum_doc, document, "Discipline", "discipline")
    extract_convert_add_classifications(sum_doc, document, "NAICS", "NAICS")

    # set properties
    document.set_key_if_not_none("contributors", contributors)
    document.set_key_if_not_none("copyright_statement", copyright_statement)
    document.set_key_if_not_none("date_issued", date_issued)
    document.set_key_if_not_none("degree", degree)
    document.set_key_if_not_none("descriptions", descriptions)
    document.set_key_if_not_none("edition", edition)
    document.set_key_if_not_none("extent_pages", extent_pages)
    document.set_key_if_not_none("genre", genre)
    document.set_key_if_not_none("notes", notes)
    document.set_key_if_not_none("part_issue", part_issue)
    document.set_key_if_not_none("part_page_end", part_page_end)
    document.set_key_if_not_none("part_page_start", part_page_start)
    document.set_key_if_not_none("part_volume", part_volume)
    document.set_key_if_not_none("scholarly", scholarly)
    document.set_key_if_not_none("table_of_contents", table_of_contents)
    document.set_key_if_not_none("title", title)
    document.set_key_if_not_none("title_sub", title_sub)

    # subject
    subject = Subject()
    if subject_keywords:
        subject["keywords"] = subject_keywords
    if subject_names:
        subject["names"] = subject_names
    if subject_terms:
        subject["terms"] = subject_terms
    if subject:
        document["subjects"] = subject

    debug = True
    if debug:
        related_items_msg = "\t\t\t\t\t\t"
        if is_part_of_type:
            related_items_msg = "\tis_part_of: {} ".format(is_part_of_type)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("{}\t->\titem: {} {}\t:\t{}\t:\t{}".format(sum_type, rec_type, related_items_msg, rec_id, title))

    return document