def ris_txt_lines_to_metajson_list(txt_lines, source, rec_id_prefix, only_first_record):
    """Convert RIS text lines into MetaJSON documents, one per RIS record.

    This is a generator. A new document is started on the record-begin tag
    (``RIS_KEY_BEGIN``), filled from the following ``KEY  - value`` lines and
    yielded when the record-end tag (``RIS_KEY_END``) is reached.

    The key is read from the first two characters of a line and the value
    from the seventh character onwards. A value ending with "/" is treated
    as continued: the whole next line is appended to it instead of being
    parsed as a new key/value pair. Date tags (PY, Y1, DA) are excluded from
    that rule because their values may legitimately end with "/".

    Args:
        txt_lines: iterable of text lines (trailing CR/LF is stripped here).
        source: stored as ``rec_source`` of each document when truthy; a DB
            or DP tag in the record overrides it.
        rec_id_prefix: not used by this function.
        only_first_record: not used by this function; every complete record
            is yielded.

    Yields:
        One document per record terminated by the record-end tag.

    Raises:
        KeyError: if a record's RIS type has no entry in
            ``ris_document_type_to_metajson_document_type``.
    """
    # Date values look like YYYY/MM/DD/other info, so a trailing "/" on these
    # tags is part of the value and must not trigger the continuation logic.
    date_keys = ("PY", "Y1", "DA")
    # NOTE(review): other values (e.g. a UR url) can also end with "/" and are
    # still read as continued lines -- confirm whether that is intended.

    document = None
    ris_type = None
    rec_type = None
    is_part_of_rec_type = None
    previous_key = None
    previous_value = None

    for line in txt_lines:
        if not line:
            continue
        line = line.rstrip('\r\n')

        # multi line management
        if previous_key:
            key = previous_key
            value = previous_value + line
            previous_key = None
            previous_value = None
        else:
            key = line[:2].strip()
            value = line[6:].strip()
        if value.endswith("/") and key not in date_keys:
            # value continues on the next line: remember it and move on
            previous_key = key
            previous_value = value.rstrip('/')
            continue

        if not key:
            # empty line
            continue

        if key == RIS_KEY_BEGIN:
            # record begin with document type -> create document
            document = Document()
            is_part_of_rec_type = None
            if source:
                document["rec_source"] = source
            ris_type = value
            rec_type = ris_document_type_to_metajson_document_type[ris_type]
            document["rec_type"] = rec_type
            if ris_type in ris_document_type_to_metajson_document_is_part_of_type:
                is_part_of_rec_type = ris_document_type_to_metajson_document_is_part_of_type[ris_type]
                is_part_of = Document()
                is_part_of["rec_type"] = is_part_of_rec_type
                document["is_part_ofs"] = [is_part_of]
            continue

        if document is None:
            # A field or an end tag before any record began: there is no
            # document to attach it to, so skip it instead of crashing.
            logging.warning("Ignored key outside of a record: %s with value: %s", key, value)
            continue

        if key == RIS_KEY_END:
            # record end -> yield the result
            # If the container only has an abbreviated title, promote the
            # first one to its title.
            if "is_part_ofs" in document:
                container = document["is_part_ofs"][0]
                if "title" not in container and "title_abbreviateds" in container:
                    container["title"] = container["title_abbreviateds"][0]["title"]
                    del container["title_abbreviateds"]
            logging.info("# RIS type: %s", ris_type)
            metajson_service.pretty_print_document(document)
            yield document
            continue

        # process key value
        if key == "ID":
            document["rec_id"] = value
        elif key in ("T1", "TI", "CT") or (key == "BT" and ris_type in (RIS_TYPE_BOOK, RIS_TYPE_UNPB)):
            # Title Primary -> title
            document["title"] = value
        elif key in ("JF", "JO", "BT"):
            # Title Secondary -> is_part_of["title"]
            # (BT of a book / unpublished work was handled just above)
            document.add_is_part_of_title(value)
        elif key in ("JA", "J1", "J2", "T2"):
            # Title Secondary -> is_part_of["title_abbreviateds"][0]["title"]
            document.add_is_part_of_title_abbreviated(value)
        elif key == "T3":
            # Title Series
            document.add_series_title(value)
        elif key in ("A1", "AU"):
            document.add_creator(creator_service.formatted_name_to_creator(value, None, "aut"))
        elif key in ("A2", "ED"):
            editor = creator_service.formatted_name_to_creator(value, None, "edt")
            if is_part_of_rec_type:
                document.add_is_part_of_creator(editor)
            else:
                document.add_creator(editor)
        elif key == "A3":
            document.add_series_creator(creator_service.formatted_name_to_creator(value, None, "aut"))
        elif key == "A4":
            document.add_creator(creator_service.formatted_name_to_creator(value, None, "ctb"))
        elif key in date_keys:
            # YYYY/MM/DD/other info (like season)
            # TODO: parse the parts; for now only the outer slashes are removed
            document["date_issued"] = value.strip("/")
        elif key == "SP":
            document["part_page_begin"] = value
        elif key == "EP":
            document["part_page_end"] = value
        elif key == "VL":
            document["part_volume"] = value
        elif key in ("IS", "CP"):
            document["part_issue"] = value
        elif key in ("AB", "N2"):
            document["descriptions"] = [{"language": "und", "value": value}]
        elif key == "N1":
            document["notes"] = [{"language": "und", "value": value}]
        elif key == "PB":
            document.add_item_to_key(value, "publishers")
        elif key == "CY":
            document.add_item_to_key(value, "publication_places")
        elif key == "RP":
            document["publication_status"] = value
        elif key == "ET":
            document["edition"] = value
        elif key == "UR":
            resource = Resource()
            resource["url"] = value
            document.add_item_to_key(resource, "resources")
        elif key == "AN":
            # Accession Number
            document.add_identifier(metajson_service.create_identifier("accessionnumber", value))
        elif key == "CN":
            # Call Number
            document.add_identifier(metajson_service.create_identifier("callnumber", value))
        elif key == "DO":
            # DOI
            document.add_identifier(metajson_service.create_identifier("doi", value))
        elif key == "SN":
            # ISBN or ISSN: decided from the record type
            if rec_type in (
                constants.DOC_TYPE_JOURNALARTICLE,
                constants.DOC_TYPE_MAGAZINEARTICLE,
                constants.DOC_TYPE_NEWSPAPERARTICLE,
                constants.DOC_TYPE_JOURNAL,
            ):
                id_type = "issn"
            else:
                id_type = "isbn"
            identifier = metajson_service.create_identifier(id_type, value)
            # When the record has a container, the number is attached to it
            if is_part_of_rec_type is None:
                document.add_identifier(identifier)
            else:
                document["is_part_ofs"][0].add_identifier(identifier)
        elif key == "CA":
            document["caption"] = value
        elif key == "DB":
            # Name of Database -> rec_source ?
            document["rec_source"] = value
        elif key == "DP":
            # Database Provider -> rec_source ?
            document["rec_source"] = value
        elif key == "KW":
            if "keywords" not in document:
                document["keywords"] = {"und": []}
            document["keywords"]["und"].append(value)
        else:
            logging.debug("Not managed key: %s with value: %s", key, value)
def ris_txt_lines_to_metajson_list(txt_lines, source, rec_id_prefix, only_first_record):
    """Convert RIS text lines into MetaJSON documents, one per RIS record.

    This is a generator. A new document is started on the record-begin tag
    (``RIS_KEY_BEGIN``), filled from the following ``KEY  - value`` lines and
    yielded when the record-end tag (``RIS_KEY_END``) is reached.

    The key is read from the first two characters of a line and the value
    from the seventh character onwards. A value ending with "/" is treated
    as continued: the whole next line is appended to it instead of being
    parsed as a new key/value pair. Date tags (PY, Y1, DA) are excluded from
    that rule because their values may legitimately end with "/".

    Args:
        txt_lines: iterable of text lines (trailing CR/LF is stripped here).
        source: stored as ``rec_source`` of each document when truthy; a DB
            or DP tag in the record overrides it.
        rec_id_prefix: not used by this function.
        only_first_record: not used by this function; every complete record
            is yielded.

    Yields:
        One document per record terminated by the record-end tag.

    Raises:
        KeyError: if a record's RIS type has no entry in
            ``ris_document_type_to_metajson_document_type``.
    """
    # Date values look like YYYY/MM/DD/other info, so a trailing "/" on these
    # tags is part of the value and must not trigger the continuation logic.
    date_keys = ("PY", "Y1", "DA")
    # NOTE(review): other values (e.g. a UR url) can also end with "/" and are
    # still read as continued lines -- confirm whether that is intended.

    document = None
    ris_type = None
    rec_type = None
    is_part_of_rec_type = None
    previous_key = None
    previous_value = None

    for line in txt_lines:
        if not line:
            continue
        line = line.rstrip('\r\n')

        # multi line management
        if previous_key:
            key = previous_key
            value = previous_value + line
            previous_key = None
            previous_value = None
        else:
            key = line[:2].strip()
            value = line[6:].strip()
        if value.endswith("/") and key not in date_keys:
            # value continues on the next line: remember it and move on
            previous_key = key
            previous_value = value.rstrip('/')
            continue

        if not key:
            # empty line
            continue

        if key == RIS_KEY_BEGIN:
            # record begin with document type -> create document
            document = Document()
            is_part_of_rec_type = None
            if source:
                document["rec_source"] = source
            ris_type = value
            rec_type = ris_document_type_to_metajson_document_type[ris_type]
            document["rec_type"] = rec_type
            if ris_type in ris_document_type_to_metajson_document_is_part_of_type:
                is_part_of_rec_type = ris_document_type_to_metajson_document_is_part_of_type[ris_type]
                is_part_of = Document()
                is_part_of["rec_type"] = is_part_of_rec_type
                document["is_part_ofs"] = [is_part_of]
            continue

        if document is None:
            # A field or an end tag before any record began: there is no
            # document to attach it to, so skip it instead of crashing.
            logging.warning("Ignored key outside of a record: %s with value: %s", key, value)
            continue

        if key == RIS_KEY_END:
            # record end -> yield the result
            # If the container only has an abbreviated title, promote the
            # first one to its title.
            if "is_part_ofs" in document:
                container = document["is_part_ofs"][0]
                if "title" not in container and "title_abbreviateds" in container:
                    container["title"] = container["title_abbreviateds"][0]["title"]
                    del container["title_abbreviateds"]
            logging.info("# RIS type: %s", ris_type)
            metajson_service.pretty_print_document(document)
            yield document
            continue

        # process key value
        if key == "ID":
            document["rec_id"] = value
        elif key in ("T1", "TI", "CT") or (key == "BT" and ris_type in (RIS_TYPE_BOOK, RIS_TYPE_UNPB)):
            # Title Primary -> title
            document["title"] = value
        elif key in ("JF", "JO", "BT"):
            # Title Secondary -> is_part_of["title"]
            # (BT of a book / unpublished work was handled just above)
            document.add_is_part_of_title(value)
        elif key in ("JA", "J1", "J2", "T2"):
            # Title Secondary -> is_part_of["title_abbreviateds"][0]["title"]
            document.add_is_part_of_title_abbreviated(value)
        elif key == "T3":
            # Title Series
            document.add_series_title(value)
        elif key in ("A1", "AU"):
            document.add_creator(creator_service.formatted_name_to_creator(value, None, "aut"))
        elif key in ("A2", "ED"):
            editor = creator_service.formatted_name_to_creator(value, None, "edt")
            if is_part_of_rec_type:
                document.add_is_part_of_creator(editor)
            else:
                document.add_creator(editor)
        elif key == "A3":
            document.add_series_creator(creator_service.formatted_name_to_creator(value, None, "aut"))
        elif key == "A4":
            document.add_creator(creator_service.formatted_name_to_creator(value, None, "ctb"))
        elif key in date_keys:
            # YYYY/MM/DD/other info (like season)
            # TODO: parse the parts; for now only the outer slashes are removed
            document["date_issued"] = value.strip("/")
        elif key == "SP":
            document["part_page_begin"] = value
        elif key == "EP":
            document["part_page_end"] = value
        elif key == "VL":
            document["part_volume"] = value
        elif key in ("IS", "CP"):
            document["part_issue"] = value
        elif key in ("AB", "N2"):
            document["descriptions"] = [{"language": "und", "value": value}]
        elif key == "N1":
            document["notes"] = [{"language": "und", "value": value}]
        elif key == "PB":
            document.add_item_to_key(value, "publishers")
        elif key == "CY":
            document.add_item_to_key(value, "publication_places")
        elif key == "RP":
            document["publication_status"] = value
        elif key == "ET":
            document["edition"] = value
        elif key == "UR":
            resource = Resource()
            resource["url"] = value
            document.add_item_to_key(resource, "resources")
        elif key == "AN":
            # Accession Number
            document.add_identifier(metajson_service.create_identifier("accessionnumber", value))
        elif key == "CN":
            # Call Number
            document.add_identifier(metajson_service.create_identifier("callnumber", value))
        elif key == "DO":
            # DOI
            document.add_identifier(metajson_service.create_identifier("doi", value))
        elif key == "SN":
            # ISBN or ISSN: decided from the record type
            if rec_type in (
                constants.DOC_TYPE_JOURNALARTICLE,
                constants.DOC_TYPE_MAGAZINEARTICLE,
                constants.DOC_TYPE_NEWSPAPERARTICLE,
                constants.DOC_TYPE_JOURNAL,
            ):
                id_type = "issn"
            else:
                id_type = "isbn"
            identifier = metajson_service.create_identifier(id_type, value)
            # When the record has a container, the number is attached to it
            if is_part_of_rec_type is None:
                document.add_identifier(identifier)
            else:
                document["is_part_ofs"][0].add_identifier(identifier)
        elif key == "CA":
            document["caption"] = value
        elif key == "DB":
            # Name of Database -> rec_source ?
            document["rec_source"] = value
        elif key == "DP":
            # Database Provider -> rec_source ?
            document["rec_source"] = value
        elif key == "KW":
            if "keywords" not in document:
                document["keywords"] = {"und": []}
            document["keywords"]["und"].append(value)
        else:
            logging.debug("Not managed key: %s with value: %s", key, value)