def record_xml_output(rec, tags=None, order_fn=None):
    """Serialize the fields of record 'rec' to XML and return the string.

    Subfields with code 'c' are left out of every '773' field.

    @rec: record, iterated as a mapping from tag to its list of fields;
        may be None
    @tags: a tag (string) or a list of tags to be printed; when empty or
        None, every tag of the record is printed
    @order_fn: optional second argument handed to record_order_fields();
        when None, record_order_fields() receives the field list only
    @return: the XML of the selected fields joined with newlines, or the
        empty string when 'rec' is None"""
    if rec is None:
        return ''

    # Normalise 'tags' to a list so that membership tests behave the same
    # for a single tag and for a list of tags.
    if tags is None:
        wanted = []
    elif isinstance(tags, str):
        wanted = [tags]
    else:
        wanted = tags

    # Pair every selected field with its tag.
    tagged_fields = []
    for tag in rec:
        if wanted and tag not in wanted:
            continue
        for field in rec[tag]:
            if tag == '773':
                kept = [subfield for subfield in field[0]
                        if subfield[0] != 'c']
                field = (kept, field[1], field[2], field[3], field[4])
            tagged_fields.append((tag, field))

    if order_fn is None:
        record_order_fields(tagged_fields)
    else:
        record_order_fields(tagged_fields, order_fn)

    return '\n'.join([field_xml_output(entry[1], entry[0])
                      for entry in tagged_fields])
Пример #2
0
def marcxml_filter_out_tags(recid, fields):
    """
    Return the MARCXML of the datafields of record 'recid' that have the
    same tag and indicators as the entries of 'fields', once the
    subfields named in 'fields' have been removed from them. This is
    handy to emulate a bibupload -c that corrects only specific
    subfields.

    Parameters:
           recid - *int* the id of the record to process

          fields - *list(str)* the list of fields that we want to filter
                   out, each made of tag (3 chars), ind1, ind2 and
                   subfield code. Eg ['909COp', '909COo']
    """
    record = get_record(recid)

    # First pass: drop the subfields that are going to be replaced
    for spec in fields:
        record_delete_subfield(record,
                               tag=spec[0:3],
                               ind1=spec[3:4],
                               ind2=spec[4:5],
                               subfield_code=spec[5:6])

    # Second pass: output the datafields sharing tag + indicators
    seen_tag_ind = []
    chunks = []
    for spec in fields:
        tag_ind = spec[0:5]
        if tag_ind in seen_tag_ind:
            # This tag + indicators combination was already output
            continue
        seen_tag_ind.append(tag_ind)

        tag = spec[0:3]
        for datafield in record.get(tag, []):
            # '_' in the specification stands for a blank indicator;
            # datafields left without subfields are skipped.
            same_ind1 = datafield[1] == spec[3:4].replace('_', ' ')
            if same_ind1 and \
                   datafield[2] == spec[4:5].replace('_', ' ') and \
                   datafield[0]:
                chunks.append(field_xml_output(datafield, tag) + '\n')

    return ''.join(chunks)
def marcxml_filter_out_tags(recid, fields):
    """
    Build the MARCXML of the datafields of record 'recid' whose tag and
    indicators match an entry of 'fields', with the subfields listed in
    'fields' stripped from them. Useful to emulate a bibupload -c that
    corrects only specific subfields.

    Parameters:
           recid - *int* the id of the record to process

          fields - *list(str)* the fields to filter out, written as
                   tag + ind1 + ind2 + subfield code.
                   Eg ['909COp', '909COo']
    """
    record = get_record(recid)

    # Remove the subfields that are meant to be replaced
    for wanted in fields:
        tag, ind1, ind2, code = \
            wanted[0:3], wanted[3:4], wanted[4:5], wanted[5:6]
        record_delete_subfield(record, tag=tag, ind1=ind1, ind2=ind2,
                               subfield_code=code)

    # Keep only the datafields that share tag + indicators
    done = []
    pieces = []
    for wanted in fields:
        prefix = wanted[0:5]
        if prefix not in done:
            # Remember the prefix so the same datafields are not output twice
            done.append(prefix)
            tag = wanted[0:3]
            for datafield in record.get(tag, []):
                # An underscore in the specification means a blank indicator
                if datafield[1] != wanted[3:4].replace('_', ' '):
                    continue
                if datafield[2] != wanted[4:5].replace('_', ' '):
                    continue
                if not datafield[0]:
                    # No subfield left in this datafield
                    continue
                pieces.append(field_xml_output(datafield, tag) + '\n')

    return ''.join(pieces)
def create_MARC(extracted_image_data, tarball, refno):
    """
    Build a MARCXML record out of the extracted images and their captions.

    Every entry produces "FFT" datafields: captions that have no image are
    attached one by one to DUMMY_IMAGE_TMP, images with a caption shorter
    than 3 characters are typed "PlotMisc" and flagged "HIDDEN", and the
    other images are typed "Plot", followed by a "HIDDEN" FFT for the
    "<image>.context" file when contexts are present.

    @param: extracted_image_data ([(string, string, list, list), ...]):
        a list of tuples of images matched to labels, captions and contexts
        from this document.
    @param: tarball (string): path of the tarball; the prefix
        "<tarball>_plots" + os.sep is removed from image locations when
        document names are built
    @param: refno (string): the name for the record number field, or None;
        a '001' controlfield is added only when it consists of digits

    @return: (string) the MARCXML record
    """
    root_dir = "%s%s%s_plots%s" % (os.path.dirname(tarball), os.sep,
                                   os.path.basename(tarball), os.sep)

    # Datafield := (subfields, ind1, ind2, controlfield)
    # Subfield := (code, value)
    xml_lines = ['<record>']

    def add_fft(subfields):
        """Append an FFT datafield made of 'subfields' to the record."""
        xml_lines.append(field_xml_output((subfields, ' ', ' ', None), "FFT"))

    #FIXME: Determine what to do without refno
    if refno and refno.isdigit():
        xml_lines.append(field_xml_output((None, ' ', ' ', refno), '001'))

    counter = 0
    for (image_location, caption, dummy, contexts) in extracted_image_data:
        if image_location == '':
            # Unknown image: each loose caption gets its own DUMMY-PLOT entry
            for loose_caption in caption.split(' : '):
                add_fft([('a', DUMMY_IMAGE_TMP),
                         ('t', "PlotMisc"),
                         ('d', "%05d %s" % (counter, loose_caption)),
                         ('n', "fig%05d" % (counter,)),
                         ('o', "HIDDEN")])
                counter += 1
            continue

        # Merge subfolder into docname, until root directory
        relative_image_path = image_location.replace(root_dir, '')
        stem = "_".join(relative_image_path.split('.')[:-1])
        docname = stem.replace('/', '_').replace(';', '').replace(':', '')

        if len(caption) < 3:
            add_fft([('a', image_location),
                     ('t', "PlotMisc"),
                     ('d', "%05d %s" % (counter, caption.replace(' : ', ''))),
                     ('n', docname),
                     ('o', "HIDDEN")])
        else:
            # PLOT entry
            add_fft([('a', image_location),
                     ('t', "Plot"),
                     ('d', "%05d %s" % (counter, caption.replace(' : ', ''))),
                     ('n', docname)])
            if contexts:
                # CONTEXT entry
                add_fft([('a', "%s.context" % (image_location,)),
                         ('t', "Plot"),
                         ('f', ".png;context"),
                         ('n', docname),
                         ('o', "HIDDEN")])
        counter += 1

    xml_lines.append('</record>')
    return '\n'.join(xml_lines)
Пример #5
0
def create_MARC(extracted_image_data, tarball, refno):
    """
    Build a MARCXML record out of the extracted images and their captions.

    Entries whose image location is shorter than 3 characters are skipped.
    Every other entry produces an "FFT" datafield: typed "PlotMisc" and
    flagged "HIDDEN" when its caption is shorter than 3 characters, typed
    "Plot" otherwise. A "Plot" entry with contexts is followed by a
    "HIDDEN" FFT for the "<image>.context" file.

    @param: extracted_image_data ([(string, string, list, list), ...]):
        a list of tuples of images matched to labels, captions and contexts
        from this document. A caption given as a list or tuple of strings
        is joined with single spaces.
    @param: tarball (string): path of the tarball; the prefix
        "<tarball>_plots" + os.sep is removed from image locations when
        document names are built
    @param: refno (string): the name for the record number field, or None;
        a '001' controlfield is added only when it consists of digits

    @return: (string) the MARCXML record, or "" when no FFT datafield
        was produced
    """
    root_dir = os.path.dirname(tarball) + os.sep + os.path.basename(tarball) + \
                 '_plots' + os.sep

    marcxml_fft = []
    index = 0
    for (image_location, caption, dummy, contexts) in extracted_image_data:
        if len(image_location) < 3:
            # If not useful URL -> move on to next
            continue

        # Merge subfolder into docname, until root directory
        relative_image_path = image_location.replace(root_dir, '')
        docname = "_".join(relative_image_path.split('.')[:-1]).replace(
            '/', '_').replace(';', '').replace(':', '')

        # isinstance() also accepts list subclasses and tuples, which an
        # exact type comparison would let through to caption.replace() below.
        if isinstance(caption, (list, tuple)):
            caption = " ".join(caption)

        if len(caption) < 3:
            subfields = [
                ('a', image_location),
                ('t', "PlotMisc"),
                ('d', "%05d %s" % (index, caption.replace(' : ', ''))),
                ('n', docname),
                ('o', "HIDDEN"),
            ]
            marcxml_fft.append(
                field_xml_output((subfields, ' ', ' ', None), "FFT"))
        else:
            # Add PLOT MARCXML
            subfields = [
                ('a', image_location),
                ('t', "Plot"),
                ('d', "%05d %s" % (index, caption.replace(' : ', ''))),
                ('n', docname),
            ]
            marcxml_fft.append(
                field_xml_output((subfields, ' ', ' ', None), "FFT"))
            if contexts:
                # Add CONTEXT MARCXML
                subfields = [
                    ('a', "%s.context" % (image_location, )),
                    ('t', "Plot"),
                    ('f', ".png;context"),
                    ('n', docname),
                    ('o', "HIDDEN"),
                ]
                marcxml_fft.append(
                    field_xml_output((subfields, ' ', ' ', None), "FFT"))
        index += 1

    if not marcxml_fft:
        # Nothing worth recording
        return ""

    # Datafield := (subfields, ind1, ind2, controlfield)
    # Subfield := (code, value)
    marcxml = ['<record>']

    #FIXME: Determine what to do without refno
    if refno and refno.isdigit():
        field = (None, ' ', ' ', refno)
        marcxml.append(field_xml_output(field, '001'))

    marcxml.extend(marcxml_fft)
    marcxml.append('</record>')
    return '\n'.join(marcxml)
Пример #6
0
def update_record(record_id, authors):
    """Build the XML needed to update the authors of a CDS record.

    The main author (`100`) field is extended when its name is a key of
    `authors`. Co-author (`700`) fields are processed when `authors` holds
    more than one entry or when the main author was not extended; in that
    case every `700` field is output, whether it was extended or not.

    :param int record_id: record to update author datafields
        Example:
            record_id = 2150939
    :param dict authors: dictionary where keys are author full names and
        values the CDS profile ids to be updated in the given record
        Example:
            authors = {'Ellis, John': '2108556'}

    :return: string representing the record XML element containing
        author (`100`) and/or co-author (`700`) datafields. Empty string if
        nothing to update, or if the record has several `100` fields
        Example:
            '<record>
                <controlfield tag="001">2150939</controlfield>
                <datafield tag="100" ind1=" " ind2=" ">
                    <subfield code="a">Ellis, John</subfield>
                    <subfield code="u">King's Coll. London</subfield>
                    <subfield code="u">CERN</subfield>
                    <subfield code="0">AUTHOR|(CDS)2108556</subfield>
                    <subfield code="9">#BEARD#</subfield>
                </datafield>
            </record>'
    """
    record = get_record(record_id)
    main_author_fields = record_get_field_instances(record, "100")
    coauthor_fields = record_get_field_instances(record, "700")

    if len(main_author_fields) > 1:
        print ("Oops: several '100' (main author) fields have been found in "
               "record '{0}'".format(record_id))
        return ""

    xml_parts = []
    updated = False

    for field in main_author_fields:
        try:
            name = field_get_subfield_values(field, 'a')[0]
            try:
                if extend_author_field(field, authors[name]):
                    xml_parts.append(field_xml_output(field, "100"))
                    updated = True
            except KeyError:
                # No CDS id was supplied for this author name
                pass
        except IndexError:
            # Author field (`100`) does not have subfield `a`
            pass

    if len(authors) > 1 or not updated:
        for field in coauthor_fields:
            try:
                name = field_get_subfield_values(field, 'a')[0]
                try:
                    if extend_author_field(field, authors[name]):
                        updated = True
                except KeyError:
                    # No CDS id was supplied for this co-author name
                    pass
            except IndexError:
                # Co-author field (`700`) does not have subfield `a`
                pass
            # Every `700` field is output, extended or not
            xml_parts.append(field_xml_output(field, "700"))

    if not updated:
        # Nothing to update
        return ""

    return ('<record><controlfield tag="001">{0}</controlfield>{1}'
            '</record>'.format(record_id, "".join(xml_parts)))
def create_MARC(extracted_image_data, tarball, refno):
    """
    Take the images and their captions and build a MARCXML record for them.

    Entries whose image location is shorter than 3 characters are ignored.
    Each remaining entry yields an "FFT" datafield, typed "PlotMisc" and
    flagged "HIDDEN" when the caption is shorter than 3 characters, and
    typed "Plot" otherwise; a "Plot" entry that has contexts also yields a
    "HIDDEN" FFT pointing at "<image>.context".

    @param: extracted_image_data ([(string, string, list, list), ...]):
        a list of tuples of images matched to labels, captions and contexts from
        this document. Captions supplied as a list or tuple of strings are
        joined with single spaces.
    @param: tarball (string): path of the tarball; "<tarball>_plots" + os.sep
        is stripped from image locations to derive document names
    @param: refno (string): the name for the record number field, or None;
        the "001" controlfield is only written when it consists of digits

    @return: (string) the MARCXML record, or "" if no FFT datafield
        was generated
    """
    root_dir = os.path.dirname(tarball) + os.sep + os.path.basename(tarball) + "_plots" + os.sep

    marcxml_fft = []
    index = 0
    for (image_location, caption, dummy, contexts) in extracted_image_data:
        if len(image_location) < 3:
            # If not useful URL -> move on to next
            continue

        # Merge subfolder into docname, until root directory
        relative_image_path = image_location.replace(root_dir, "")
        docname = "_".join(relative_image_path.split(".")[:-1]).replace("/", "_").replace(";", "").replace(":", "")

        # Use isinstance() instead of comparing exact types so that list
        # subclasses and tuples are joined too rather than failing on
        # caption.replace() further down.
        if isinstance(caption, (list, tuple)):
            caption = " ".join(caption)

        if len(caption) < 3:
            subfields = [
                ("a", image_location),
                ("t", "PlotMisc"),
                ("d", "%05d %s" % (index, caption.replace(" : ", ""))),
                ("n", docname),
                ("o", "HIDDEN"),
            ]
            marcxml_fft.append(field_xml_output((subfields, " ", " ", None), "FFT"))
        else:
            # Add PLOT MARCXML
            subfields = [
                ("a", image_location),
                ("t", "Plot"),
                ("d", "%05d %s" % (index, caption.replace(" : ", ""))),
                ("n", docname),
            ]
            marcxml_fft.append(field_xml_output((subfields, " ", " ", None), "FFT"))
            if contexts:
                # Add CONTEXT MARCXML
                subfields = [
                    ("a", "%s.context" % (image_location,)),
                    ("t", "Plot"),
                    ("f", ".png;context"),
                    ("n", docname),
                    ("o", "HIDDEN"),
                ]
                marcxml_fft.append(field_xml_output((subfields, " ", " ", None), "FFT"))
        index += 1

    if not marcxml_fft:
        # No image produced a datafield: there is no record to return
        return ""

    # Datafield := (subfields, ind1, ind2, controlfield)
    # Subfield := (code, value)
    marcxml = ["<record>"]

    # FIXME: Determine what to do without refno
    if refno and refno.isdigit():
        field = (None, " ", " ", refno)
        marcxml.append(field_xml_output(field, "001"))

    marcxml.extend(marcxml_fft)
    marcxml.append("</record>")
    return "\n".join(marcxml)