Exemplo n.º 1
0
def append_subfield_element(xml_line, subfield_code, value):
    """ Append one subfield element to the XML built so far.
        @param xml_line: (string) The XML accumulated so far.
        @param subfield_code: The code written, as is, into the subfield's
        'code' attribute.
        @param value: The subfield content; it is passed through
        encode_for_xml before being inserted.
        @return: (string) xml_line followed by the new subfield element.
    """
    subfield_template = \
        '\n      <subfield code="%(sf-code-ref-auth)s">%(value)s</subfield>'
    substitutions = {
        'sf-code-ref-auth': subfield_code,
        'value': encode_for_xml(value),
    }
    return xml_line + subfield_template % substitutions
Exemplo n.º 2
0
def append_datafield_element(line_marker, citation_structure, line_elements,
                             author, xml_line):
    """ Finish the current datafield element and start a new one, with a new
        marker subfield.
        @param line_marker: (string) The line marker which will be the sole
        content of the newly created marker subfield. This will always be the
        first subfield to be created for a new datafield element.
        @param citation_structure: (list) Accumulator owned by the caller. A
        snapshot of line_elements is appended to it.
        @param line_elements: (list) The elements of the datafield that is
        being closed. The list is emptied in place before returning.
        @param author: The author element currently remembered for ibids,
        or None.
        @param xml_line: (string) The XML accumulated so far.
        @return: (tuple) The extended XML string, and the author element to
        be used for future ibids.
    """
    ## Add an author, if one must be added for ibid's, before splitting this line
    ## Also, if a standard title and an author are both present, save the author for future use
    author_subfield, author = check_author_for_ibid(line_elements, author)

    xml_line += author_subfield
    ## Close the current datafield and start the new one with its marker subfield
    xml_line += """
   </datafield>
   <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">
      <subfield code="%(sf-code-ref-marker)s">%(marker-val)s</subfield>""" \
    % {'df-tag-ref'         : CFG_REFEXTRACT_TAG_ID_REFERENCE,
       'df-ind1-ref'        : CFG_REFEXTRACT_IND1_REFERENCE,
       'df-ind2-ref'        : CFG_REFEXTRACT_IND2_REFERENCE,
       'sf-code-ref-marker' : CFG_REFEXTRACT_SUBFIELD_MARKER,
       'marker-val'         : encode_for_xml(format_marker(line_marker))
    }

    ## Record the elements of the citation that has just been closed.
    ## A copy must be stored: line_elements is emptied in place right below,
    ## so storing the list object itself would leave only empty entries
    ## (all of them the very same list) in citation_structure.
    citation_structure.append(list(line_elements))

    ## Clear the caller's list in place, so that it starts afresh for the new datafield
    del line_elements[:]

    return xml_line, author
Exemplo n.º 3
0
def check_author_for_ibid(line_elements, author):
    """ Given a list of elements for an *entire* reference line, and the current
        author element to be used for ibids, check to see if that author element needs
        to be inserted into this line, depending on the presence of ibids and whether
        or not there is already an author paired with an ibid.
        Also, if no ibids are present in the line, see if the author element needs
        to be updated, depending on the presence of a normal title and a corresponding
        author group.
        @param line_elements: List of line elements for the entire processed reference
        line
        @param author: The current parent author element to be used with an ibid
        (or None when there is none)
        @return: (tuple) - containing a possible new author subfield (the empty
        string when nothing has to be appended), and the parent author element
        to be used for future ibids (if any)
    """
    journal_match = is_in_line_elements("JOURNAL", line_elements)
    if not journal_match:
        ## No journal title in this line: nothing to append, and the
        ## remembered author is left untouched
        return "", author

    ## The title element for this line
    title_element = journal_match[1]

    if title_element['is_ibid']:
        ## The line holds an ibid: replicate the remembered author, unless the
        ## line already carries an author of its own. The author to be used
        ## for ibids is never reset here, since this line holds an ibid.
        if author is not None \
                and not is_in_line_elements("AUTH", line_elements):
            author_subfield = \
                '\n          <subfield code="%(sf-code-ref-auth)s">%(authors)s</subfield>' \
                % {'authors'          : encode_for_xml(author['auth_txt'].strip('()')),
                   'sf-code-ref-auth' : CFG_REFEXTRACT_SUBFIELD_AUTH,
                  }
            return author_subfield, author
        return "", author

    ## A standard (non-ibid) title: remember the author found in this line so
    ## that it can be repeated for a subsequent ibid, or forget the previous
    ## one if this head title has no author associated with it
    author_match = is_in_line_elements("AUTH", line_elements)
    if author_match:
        author = author_match[1]
    else:
        author = None

    ## An author does not need to be replicated, append nothing to the xml line
    return "", author
Exemplo n.º 4
0
def check_author_for_ibid(line_elements, author):
    """ Given a list of elements for an *entire* reference line, and the current
        author element to be used for ibids, check to see if that author element needs
        to be inserted into this line, depending on the presence of ibids and whether
        or not there is already an author paired with an ibid.
        Also, if no ibids are present in the line, see if the author element needs
        to be updated, depending on the presence of a normal title and a corresponding
        author group.
        @param line_elements: List of line elements for the entire processed reference
        line
        @param author: The current parent author element to be used with an ibid
        (or None when there is none)
        @return: (tuple) - containing a possible new author subfield (the empty
        string when nothing has to be appended), and the parent author element
        to be used for future ibids (if any)
    """
    journal_found = is_in_line_elements("JOURNAL", line_elements)
    if not journal_found:
        ## Without a journal title the line has neither an ibid to pair an
        ## author with, nor a head title to take a new author from
        return "", author

    ## The title element for this line
    title_element = journal_found[1]

    if not title_element['is_ibid']:
        ## Standard title: the author of this line (if any) becomes the one
        ## to be repeated for later ibids; with no author next to this head
        ## title, the remembered author is cleared
        auth_found = is_in_line_elements("AUTH", line_elements)
        if auth_found:
            author = auth_found[1]
        else:
            author = None
        return "", author

    ## Ibid title: the remembered author is kept as it is, and is only
    ## written out when the line has no explicit author of its own
    if author is None or is_in_line_elements("AUTH", line_elements):
        return "", author

    ibid_author_subfield = \
        '\n          <subfield code="%(sf-code-ref-auth)s">%(authors)s</subfield>' \
        % {'authors'          : encode_for_xml(author['auth_txt'].strip('()')),
           'sf-code-ref-auth' : CFG_REFEXTRACT_SUBFIELD_AUTH,
          }
    return ibid_author_subfield, author
Exemplo n.º 5
0
def start_datafield_element(line_marker):
    """ Open a brand new datafield element whose first subfield is the marker.
        @param line_marker: (string) The line marker; after format_marker and
        encode_for_xml it becomes the sole content of the marker subfield,
        which is always the first subfield of a new datafield element.
        @return: (string) The opening datafield tag followed by the marker
        subfield. The datafield is left open for further subfields.
    """
    ## The marker subfield is built first, then embedded in the datafield tag
    marker_values = {
        'sf-code-ref-marker': CFG_REFEXTRACT_SUBFIELD_MARKER,
        'marker-val': encode_for_xml(format_marker(line_marker)),
    }
    marker_xml = \
        '\n      <subfield code="%(sf-code-ref-marker)s">%(marker-val)s</subfield>' \
        % marker_values

    datafield_values = {
        'df-tag-ref': CFG_REFEXTRACT_TAG_ID_REFERENCE,
        'df-ind1-ref': CFG_REFEXTRACT_IND1_REFERENCE,
        'df-ind2-ref': CFG_REFEXTRACT_IND2_REFERENCE,
        'marker-subfield': marker_xml,
    }
    return '   <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">%(marker-subfield)s' \
        % datafield_values
Exemplo n.º 6
0
def append_datafield_element(line_marker,
                             citation_structure,
                             line_elements,
                             author,
                             xml_line):
    """ Finish the current datafield element and start a new one, with a new
        marker subfield.
        @param line_marker: (string) The line marker which will be the sole
        content of the newly created marker subfield. This will always be the
        first subfield to be created for a new datafield element.
        @param citation_structure: (list) Accumulator owned by the caller. A
        snapshot of line_elements is appended to it.
        @param line_elements: (list) The elements of the datafield that is
        being closed. The list is emptied in place before returning.
        @param author: The author element currently remembered for ibids,
        or None.
        @param xml_line: (string) The XML accumulated so far.
        @return: (tuple) The extended XML string, and the author element to
        be used for future ibids.
    """
    ## Add an author, if one must be added for ibid's, before splitting this line
    ## Also, if a standard title and an author are both present, save the author for future use
    ibid_author_xml, author = check_author_for_ibid(line_elements, author)

    xml_line += ibid_author_xml
    ## Close the current datafield and start the new one with its marker subfield
    xml_line += """
   </datafield>
   <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">
      <subfield code="%(sf-code-ref-marker)s">%(marker-val)s</subfield>""" \
    % {'df-tag-ref'         : CFG_REFEXTRACT_TAG_ID_REFERENCE,
       'df-ind1-ref'        : CFG_REFEXTRACT_IND1_REFERENCE,
       'df-ind2-ref'        : CFG_REFEXTRACT_IND2_REFERENCE,
       'sf-code-ref-marker' : CFG_REFEXTRACT_SUBFIELD_MARKER,
       'marker-val'         : encode_for_xml(format_marker(line_marker))
    }

    ## Store a snapshot of the elements of the citation just closed.
    ## The list object itself must not be stored: it is emptied in place on
    ## the next statement, which would leave citation_structure holding
    ## nothing but references to one and the same empty list.
    citation_structure.append(list(line_elements))

    ## Clear the caller's list in place, so that it starts afresh for the new datafield
    del line_elements[:]

    return xml_line, author
Exemplo n.º 7
0
def _journal_title_subfield(journal, inspire_format):
    """ Build the title subfield for a journal element, or for one of its
        extra ibids.
        @param journal: (dict) Must provide 'title', 'volume' and 'page', and
        also 'year' when inspire_format is false.
        @param inspire_format: When true the content is "title,volume,page";
        otherwise it is "title volume (year) page".
        @return: (string) The subfield markup, starting with a newline.
    """
    if inspire_format:
        return '\n      <subfield code="%(sf-code-ref-title)s">%(title)s,%(volume)s,%(page)s</subfield>' \
            % {'sf-code-ref-title': CFG_REFEXTRACT_SUBFIELD_TITLE,
               'title'            : encode_for_xml(journal['title']),
               'volume'           : encode_for_xml(journal['volume']),
               'page'             : encode_for_xml(journal['page']),
              }
    return '\n      <subfield code="%(sf-code-ref-title)s">%(title)s %(volume)s (%(year)s) %(page)s</subfield>' \
        % {'sf-code-ref-title': CFG_REFEXTRACT_SUBFIELD_TITLE,
           'title'            : encode_for_xml(journal['title']),
           'volume'           : encode_for_xml(journal['volume']),
           'year'             : encode_for_xml(journal['year']),
           'page'             : encode_for_xml(journal['page']),
          }


def build_xml_citation(citation_elements, line_marker, inspire_format=None):
    """ Create the MARC-XML string of the found reference information which was taken
        from a tagged reference line.
        @param citation_elements: (list) an ordered list of dictionary elements,
        with each element corresponding to a found piece of information from a reference line.
        Every element is read for its 'misc_txt' and 'type' keys, plus the keys
        specific to its type.
        @param line_marker: (string) The line marker for this single reference line (e.g. [19])
        @param inspire_format: Selects the journal title output format
        ("title,volume,page" when true, "title volume (year) page" otherwise).
        When None, CFG_INSPIRE_SITE is used.
        @return xml_line: (string) The MARC-XML representation of the list of reference elements
    """
    if inspire_format is None:
        inspire_format = CFG_INSPIRE_SITE

    ## Begin the datafield element
    xml_line = start_datafield_element(line_marker)

    ## This will hold the ordering of tags which have been appended to the xml line
    ## This list will be used to control the decisions involving the creation of new citation lines
    ## (in the event of a new set of authors being recognised, or strange title ordering...)
    line_elements = []

    ## This is a list which will hold the current 'over-view' of a single reference line,
    ## as a list of lists, where each list corresponds to the contents of a datafield element
    ## in the xml mark-up. It is only handed to append_datafield_element here;
    ## this function never reads it back.
    citation_structure = []
    auth_for_ibid = None

    for element in citation_elements:
        ## Before going onto checking 'what' the next element is, handle misc text and semi-colons
        ## Multiple misc text subfields will be compressed later
        ## This will also be the only part of the code that deals with MISC tag_typed elements
        misc_txt = element['misc_txt'].strip(".,:;- []")
        if misc_txt:
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_MISC,
                                               misc_txt)

        # Now handle the type dependent actions
        element_type = element['type']

        # TITLE
        if element_type == "JOURNAL":
            # ADD to current datafield, in the selected journal title format
            xml_line += _journal_title_subfield(element, inspire_format)

            # Any extra (numeration based) IBID's after this title are
            # outputted each into their own datafield
            for ibid in element['extra_ibids']:
                # %%%%% Set as NEW citation line %%%%%
                xml_line, auth_for_ibid = append_datafield_element(line_marker,
                                                                   citation_structure,
                                                                   line_elements,
                                                                   auth_for_ibid,
                                                                   xml_line)
                xml_line += _journal_title_subfield(ibid, inspire_format)

            # Add the Title element to the past elements list only now, i.e.
            # after any split caused by its IBID's has emptied that list
            line_elements.append(element)

        # REPORT NUMBER
        elif element_type == "REPORTNUMBER":
            # ADD to current datafield
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_REPORT_NUM,
                                               element['report_num'])
            line_elements.append(element)

        # URL
        elif element_type == "URL":
            # Build the subfield for the URL segment of the reference line
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_URL,
                                               element['url_string'])
            # In the case that the url string and the description differ
            # in some way, include them both
            if element['url_string'] != element['url_desc']:
                xml_line = append_subfield_element(xml_line,
                                                   CFG_REFEXTRACT_SUBFIELD_URL_DESCR,
                                                   element['url_desc'])
            line_elements.append(element)

        # DOI
        elif element_type == "DOI":
            ## Split on hitting another DOI in the same line
            if is_in_line_elements("DOI", line_elements):
                ## %%%%% Set as NEW citation line %%%%%
                xml_line, auth_for_ibid = append_datafield_element(line_marker,
                                                                   citation_structure,
                                                                   line_elements,
                                                                   auth_for_ibid,
                                                                   xml_line)
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_DOI,
                                               element['doi_string'])
            line_elements.append(element)

        # AUTHOR
        elif element_type == "AUTH":
            value = element['auth_txt']
            if element['auth_type'] == 'incl':
                value = "(%s)" % value

            if is_in_line_elements("AUTH", line_elements) \
                    and line_elements[-1]['type'] != "AUTH":
                # An author is already recorded for this datafield and the
                # previous element is not an author: output this author group
                # as misc text, and do not record it in the past elements
                xml_line = append_subfield_element(xml_line,
                                                   CFG_REFEXTRACT_SUBFIELD_MISC,
                                                   value)
            else:
                xml_line = append_subfield_element(xml_line,
                                                   CFG_REFEXTRACT_SUBFIELD_AUTH,
                                                   value)
                line_elements.append(element)

        elif element_type == "QUOTED":
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_QUOTED,
                                               element['title'])
            line_elements.append(element)

        elif element_type == "ISBN":
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_ISBN,
                                               element['ISBN'])
            line_elements.append(element)

        elif element_type == "BOOK":
            # The book title goes into the 'quoted' subfield, followed by an
            # empty book subfield
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_QUOTED,
                                               element['title'])
            xml_line += '\n      <subfield code="%s" />' % \
                CFG_REFEXTRACT_SUBFIELD_BOOK
            line_elements.append(element)

        elif element_type == "PUBLISHER":
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_PUBLISHER,
                                               element['publisher'])
            line_elements.append(element)

        elif element_type == "YEAR":
            xml_line = append_subfield_element(xml_line,
                                               CFG_REFEXTRACT_SUBFIELD_YEAR,
                                               element['year'])
            line_elements.append(element)

    # Append the author, if needed for an ibid, for the last element
    # in the entire line. Don't bother setting the author to be used
    # for ibids, since the line is finished
    xml_line += check_author_for_ibid(line_elements, auth_for_ibid)[0]

    # Close the ending datafield element
    xml_line += "\n   </datafield>\n"

    return xml_line