def append_subfield_element(xml_line, subfield_code, value):
    """ Append one subfield element to the XML built so far.
        @param xml_line: (string) The XML mark-up accumulated so far.
        @param subfield_code: The value written into the "code" attribute
         of the new subfield element.
        @param value: The content of the new subfield; it is passed through
         encode_for_xml before being inserted.
        @return: (string) xml_line followed by the new subfield element.
    """
    subfield_template = ('\n <subfield code="'
                         '%(sf-code-ref-auth)s">%(value)s</subfield>')
    substitutions = {
        'value': encode_for_xml(value),
        'sf-code-ref-auth': subfield_code,
    }
    return xml_line + subfield_template % substitutions
def append_datafield_element(line_marker, citation_structure, line_elements,
                             author, xml_line):
    """ Finish the current datafield element and start a new one, with a new
        marker subfield.
        @param line_marker: (string) The line marker which will be the sole
         content of the newly created marker subfield. This will always be the
         first subfield to be created for a new datafield element.
        @param citation_structure: (list) Receives a snapshot of the elements
         of the datafield being closed. It is mutated in place.
        @param line_elements: (list) The elements of the datafield being
         closed. It is emptied in place before returning.
        @param author: The current author element to be used for ibids; it is
         handed to check_author_for_ibid, which may replace it.
        @param xml_line: (string) The XML mark-up accumulated so far.
        @return: (tuple) - the extended xml_line, and the author element to be
         used for future ibids (as returned by check_author_for_ibid).
    """
    ## Add an author, if one must be added for ibid's, before splitting this line
    ## Also, if a standard title and an author are both present, save the author for future use
    author_subfield, author = check_author_for_ibid(line_elements, author)
    xml_line += author_subfield

    ## Close the previous datafield and start the new one
    datafield_template = """ </datafield> <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s"> <subfield code="%(sf-code-ref-marker)s">%(marker-val)s</subfield>"""
    xml_line += datafield_template % {
        'df-tag-ref': CFG_REFEXTRACT_TAG_ID_REFERENCE,
        'df-ind1-ref': CFG_REFEXTRACT_IND1_REFERENCE,
        'df-ind2-ref': CFG_REFEXTRACT_IND2_REFERENCE,
        'sf-code-ref-marker': CFG_REFEXTRACT_SUBFIELD_MARKER,
        'marker-val': encode_for_xml(format_marker(line_marker)),
    }

    ## Record the elements of the citation which has just ended.
    ## A copy is stored: line_elements is emptied in place right below, so
    ## storing the list object itself would leave only empty lists behind.
    citation_structure.append(list(line_elements))

    ## Clear the elements in the referenced list of elements
    ## (in place, so that the caller's list is emptied as well)
    del line_elements[:]

    return xml_line, author
def check_author_for_ibid(line_elements, author):
    """ Given a list of elements for an *entire* reference line, and the
        current author element to be used for ibids, check to see if that
        author element needs to be inserted into this line, depending on the
        presence of ibids and whether or not there is already an author
        paired with an ibid.
        Also, if no ibids are present in the line, see if the author element
        needs to be updated, depending on the presence of a normal title and
        a corresponding author group.
        @param line_elements: List of line elements for the entire processed
         reference line
        @param author: The current parent author element to be used with an
         ibid (None when there is none)
        @return: (tuple) - containing a possible new author subfield (an
         empty string when nothing has to be added), and the parent author
         element to be used for future ibids (if any)
    """
    ## Look each element type up once.
    ## (assumes is_in_line_elements has no side effects - its result is falsy
    ## when nothing matches, otherwise index 1 holds the matching element)
    journal_match = is_in_line_elements("JOURNAL", line_elements)
    if not journal_match:
        ## Without a title in the line there is nothing to pair or to update
        return "", author

    title_is_ibid = journal_match[1]['is_ibid']
    author_match = is_in_line_elements("AUTH", line_elements)

    if title_is_ibid:
        ## Upon splitting, check for ibids in the previous line,
        ## If an appropriate author was found, pair it with this ibid.
        ## (i.e., an author has not been explicitly paired with this ibid already
        ## and an author exists with the parent title to which this ibid refers)
        if author is not None and not author_match:
            author_subfield = """ <subfield code="%(sf-code-ref-auth)s">%(authors)s</subfield>""" % {
                'authors': encode_for_xml(author['auth_txt'].strip('()')),
                'sf-code-ref-auth': CFG_REFEXTRACT_SUBFIELD_AUTH,
            }
            ## No need to reset the author to be used for ibids, since this line holds an ibid
            return author_subfield, author
        ## The ibid already has its own author (or none is known): keep the
        ## current author untouched and append nothing
        return "", author

    ## A standard title is present in this line.
    if author_match:
        ## Remember its author: in the event that a subsequent ibid is found
        ## this author element will be repeated. It is only used when an ibid
        ## is in a line and there is no other author found in that line.
        author = author_match[1]
    else:
        ## If there is no author associated with this head title, clear the
        ## author to be used for ibids
        author = None

    ## An author does not need to be replicated for an ibid, append nothing to the xml line
    return "", author
def start_datafield_element(line_marker):
    """ Open a brand new datafield element whose first subfield is the marker.
        @param line_marker: (string) The line marker which becomes the content
         of the marker subfield, after being passed through format_marker and
         encode_for_xml. The marker subfield is the first subfield of the new
         datafield element.
        @return: (string) The opening datafield tag followed by the marker
         subfield. The datafield element is left open.
    """
    marker_values = {
        'sf-code-ref-marker': CFG_REFEXTRACT_SUBFIELD_MARKER,
        'marker-val': encode_for_xml(format_marker(line_marker)),
    }
    marker_subfield = """ <subfield code="%(sf-code-ref-marker)s">%(marker-val)s</subfield>""" % marker_values

    datafield_values = {
        'df-tag-ref': CFG_REFEXTRACT_TAG_ID_REFERENCE,
        'df-ind1-ref': CFG_REFEXTRACT_IND1_REFERENCE,
        'df-ind2-ref': CFG_REFEXTRACT_IND2_REFERENCE,
        'marker-subfield': marker_subfield,
    }
    return """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">%(marker-subfield)s""" % datafield_values
def _journal_subfield(journal, inspire_format):
    """ Return the title subfield mark-up for a journal (or extra ibid) element.
        @param journal: (dict) Element holding 'title', 'volume' and 'page';
         'year' is read as well when inspire_format is false.
        @param inspire_format: Truthy selects the "title,volume,page" layout,
         falsy selects the "title volume (year) page" layout.
        @return: (string) The subfield element, ready to be appended to the
         current datafield.
    """
    if inspire_format:
        return """ <subfield code="%(sf-code-ref-title)s">%(title)s,%(volume)s,%(page)s</subfield>""" % {
            'sf-code-ref-title': CFG_REFEXTRACT_SUBFIELD_TITLE,
            'title': encode_for_xml(journal['title']),
            'volume': encode_for_xml(journal['volume']),
            'page': encode_for_xml(journal['page']),
        }
    return """ <subfield code="%(sf-code-ref-title)s">%(title)s %(volume)s (%(year)s) %(page)s</subfield>""" % {
        'sf-code-ref-title': CFG_REFEXTRACT_SUBFIELD_TITLE,
        'title': encode_for_xml(journal['title']),
        'volume': encode_for_xml(journal['volume']),
        'year': encode_for_xml(journal['year']),
        'page': encode_for_xml(journal['page']),
    }


def build_xml_citation(citation_elements, line_marker, inspire_format=None):
    """ Create the MARC-XML string of the found reference information which
        was taken from a tagged reference line.
        @param citation_elements: (list) an ordered list of dictionary
         elements, with each element corresponding to a found piece of
         information from a reference line. Every element is read for its
         'type' and 'misc_txt' keys, plus the keys specific to its type.
        @param line_marker: (string) The line marker for this single reference
         line (e.g. [19])
        @param inspire_format: Selects the journal title layout (see
         _journal_subfield). When None, CFG_INSPIRE_SITE is used instead.
        @return xml_line: (string) The MARC-XML representation of the list of
         reference elements
    """
    if inspire_format is None:
        inspire_format = CFG_INSPIRE_SITE

    ## Begin the datafield element
    xml_line = start_datafield_element(line_marker)

    ## This will hold the ordering of tags which have been appended to the xml line
    ## This list will be used to control the decisions involving the creation of new citation lines
    ## (in the event of a new set of authors being recognised, or strange title ordering...)
    line_elements = []

    ## This is a list which will hold the current 'over-view' of a single reference line,
    ## as a list of lists, where each list corresponds to the contents of a datafield element
    ## in the xml mark-up
    citation_structure = []
    auth_for_ibid = None

    for element in citation_elements:
        ## Before going onto checking 'what' the next element is, handle misc text and semi-colons
        ## Multiple misc text subfields will be compressed later
        ## This will also be the only part of the code that deals with MISC tag_typed elements
        misc_txt = element['misc_txt'].strip(".,:;- []")
        if misc_txt:
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_MISC,
                                               misc_txt)

        # Now handle the type dependent actions
        element_type = element['type']

        # TITLE
        if element_type == "JOURNAL":
            # ADD to current datafield, in the selected journal title output format
            xml_line += _journal_subfield(element, inspire_format)

            # Any extra (numeration based) IBID's after this title are
            # outputted each into their own datafield
            for ibid in element['extra_ibids']:
                # %%%%% Set as NEW citation line %%%%%
                xml_line, auth_for_ibid = append_datafield_element(line_marker,
                                                                   citation_structure,
                                                                   line_elements,
                                                                   auth_for_ibid,
                                                                   xml_line)
                xml_line += _journal_subfield(ibid, inspire_format)

            # Add the Title element to the past elements list
            line_elements.append(element)

        # REPORT NUMBER
        elif element_type == "REPORTNUMBER":
            # ADD to current datafield
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_REPORT_NUM,
                                               element['report_num'])
            line_elements.append(element)

        # URL
        elif element_type == "URL":
            if element['url_string'] == element['url_desc']:
                # Build the datafield for the URL segment of the reference line:
                xml_line = append_subfield_element(xml_line,
                                                   CFG_REFEXTRACT_SUBFIELD_URL,
                                                   element['url_string'])
            else:
                # The url string and the description differ in some way, include them both
                xml_line += """ <subfield code="%(sf-code-ref-url)s">%(url)s</subfield> <subfield code="%(sf-code-ref-url-desc)s">%(url-desc)s</subfield>""" % {
                    'sf-code-ref-url': CFG_REFEXTRACT_SUBFIELD_URL,
                    'sf-code-ref-url-desc': CFG_REFEXTRACT_SUBFIELD_URL_DESCR,
                    'url': encode_for_xml(element['url_string']),
                    'url-desc': encode_for_xml(element['url_desc']),
                }
            line_elements.append(element)

        # DOI
        elif element_type == "DOI":
            ## Split on hitting another DOI in the same line
            if is_in_line_elements("DOI", line_elements):
                ## %%%%% Set as NEW citation line %%%%%
                xml_line, auth_for_ibid = append_datafield_element(line_marker,
                                                                   citation_structure,
                                                                   line_elements,
                                                                   auth_for_ibid,
                                                                   xml_line)
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_DOI,
                                               element['doi_string'])
            line_elements.append(element)

        # AUTHOR
        elif element_type == "AUTH":
            value = element['auth_txt']
            if element['auth_type'] == 'incl':
                value = "(%s)" % value

            if is_in_line_elements("AUTH", line_elements) and line_elements[-1]['type'] != "AUTH":
                # An author group was already seen earlier in this datafield
                # and something else followed it: output these authors as
                # misc text instead of as a second author subfield
                xml_line = append_subfield_element(xml_line,
                                                   CFG_REFEXTRACT_SUBFIELD_MISC,
                                                   value)
            else:
                xml_line = append_subfield_element(xml_line,
                                                   CFG_REFEXTRACT_SUBFIELD_AUTH,
                                                   value)
                # NOTE(review): only authors outputted as author subfields are
                # recorded in line_elements - confirm that authors demoted to
                # misc text are meant to be left out
                line_elements.append(element)

        elif element_type == "QUOTED":
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_QUOTED,
                                               element['title'])
            line_elements.append(element)

        elif element_type == "ISBN":
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_ISBN,
                                               element['ISBN'])
            line_elements.append(element)

        elif element_type == "BOOK":
            # A book title goes into the quoted subfield, followed by an
            # empty book subfield
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_QUOTED,
                                               element['title'])
            xml_line += '\n <subfield code="%s" />' % \
                CFG_REFEXTRACT_SUBFIELD_BOOK
            line_elements.append(element)

        elif element_type == "PUBLISHER":
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_PUBLISHER,
                                               element['publisher'])
            line_elements.append(element)

        elif element_type == "YEAR":
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_YEAR,
                                               element['year'])
            line_elements.append(element)

    # Append the author, if needed for an ibid, for the last element
    # in the entire line. Don't bother setting the author to be used
    # for ibids, since the line is finished
    xml_line += check_author_for_ibid(line_elements, auth_for_ibid)[0]

    # Close the ending datafield element
    xml_line += "\n </datafield>\n"

    return xml_line