def record_xml_output(rec, tags=None, order_fn=None):
    """Serialize the fields of record 'rec' to MARCXML and return the string.

    Subfields with code 'c' are dropped from every '773' field before
    the field is serialized.

    @rec: record, indexed by tag, each tag giving an iterable of fields;
          None produces an empty string
    @tags: tag (string) or list of tags to be printed; None or an empty
           list prints every tag of the record
    @order_fn: optional ordering function handed to record_order_fields
    @return: the XML of the selected fields joined by newlines (no
             enclosing <record> element is added)"""
    if rec is None:
        # Nothing to serialize: joining zero fields yields ''.
        return ''

    if tags is None:
        wanted = []
    elif isinstance(tags, str):
        wanted = [tags]
    else:
        wanted = tags

    # Pair every selected field with its tag.
    tagged_fields = []
    for tag in rec:
        if wanted and tag not in wanted:
            continue
        for field in rec[tag]:
            if tag == '773':
                kept = [subfield for subfield in field[0]
                        if subfield[0] != 'c']
                field = (kept, field[1], field[2], field[3], field[4])
            tagged_fields.append((tag, field))

    # The return value is ignored, so the helper presumably reorders
    # the list in place.
    if order_fn is None:
        record_order_fields(tagged_fields)
    else:
        record_order_fields(tagged_fields, order_fn)

    return '\n'.join(field_xml_output(field, tag)
                     for tag, field in tagged_fields)
def marcxml_filter_out_tags(recid, fields):
    """
    Return the MARCXML of the datafields of record 'recid' that share
    tag and indicators with the entries of 'fields', after the
    subfields named in 'fields' have been deleted from them.

    This is nice to emulate a bibupload -c that corrects only specific
    subfields.

    Parameters:
           recid - *int* the id of the record to process

          fields - *list(str)* the list of fields that we want to filter
                   out, as tag (3 chars) + ind1 + ind2 + subfield code.
                   Eg ['909COp', '909COo']

    Returns the serialized datafields, each followed by a newline, or
    an empty string when nothing matches.
    """
    record = get_record(recid)

    # First pass: delete the subfields that are going to be replaced.
    for spec in fields:
        record_delete_subfield(record,
                               tag=spec[0:3],
                               ind1=spec[3:4],
                               ind2=spec[4:5],
                               subfield_code=spec[5:6])

    # Second pass: keep only datafields sharing tag + indicators.
    seen_tag_ind = []
    chunks = []
    for spec in fields:
        tag_and_ind = spec[0:5]
        if tag_and_ind in seen_tag_ind:
            # Several specs may target the same datafields; emit them once.
            continue
        seen_tag_ind.append(tag_and_ind)

        tag = spec[0:3]
        # '_' in the spec stands for a blank indicator in the record.
        ind1 = spec[3:4].replace('_', ' ')
        ind2 = spec[4:5].replace('_', ' ')
        for datafield in record.get(tag, []):
            # datafield[0] is falsy when no subfields are left.
            if datafield[1] == ind1 and datafield[2] == ind2 and datafield[0]:
                chunks.append(field_xml_output(datafield, tag) + '\n')

    return ''.join(chunks)
def create_MARC(extracted_image_data, tarball, refno):
    """
    Assemble a MARCXML record with one FFT datafield per extracted plot
    (plus one per context file) and return it as a string.

    @param: extracted_image_data ([(string, string, list, list), ...]):
        tuples of (image location, caption, unused, contexts). An empty
        image location means the caption holds several loose captions
        separated by ' : '; each gets a dummy-image FFT of its own.
    @param: tarball (string): path of the source tarball; the
        '<tarball>_plots' directory prefix is stripped from image
        locations when deriving document names
    @param: refno (string): the record number for controlfield 001, or
        None; the controlfield is only added for an all-digit string

    @return: (string) the MARCXML record; the <record> wrapper is
        emitted even when there is no plot at all
    """
    root_dir = (os.path.dirname(tarball) + os.sep +
                os.path.basename(tarball) + '_plots' + os.sep)

    def fft_xml(subfields):
        # Datafield := (subfields, ind1, ind2, controlfield)
        # Subfield := (code, value)
        return field_xml_output((subfields, ' ', ' ', None), "FFT")

    lines = ['<record>']

    #FIXME: Determine what to do without refno
    if refno and refno.isdigit():
        lines.append(field_xml_output((None, ' ', ' ', refno), '001'))

    counter = 0
    for image_location, caption, dummy, contexts in extracted_image_data:
        if image_location == '':
            # Unknown image: the captions describe separate things, so
            # attach each of them to the dummy image.
            for loose_caption in caption.split(' : '):
                lines.append(fft_xml([
                    ('a', DUMMY_IMAGE_TMP),
                    ('t', "PlotMisc"),
                    ('d', "%05d %s" % (counter, loose_caption)),
                    ('n', "fig%05d" % (counter,)),
                    ('o', "HIDDEN"),
                ]))
                counter += 1
            continue

        # Merge subfolder into docname, until root directory
        relative_image_path = image_location.replace(root_dir, '')
        stem = "_".join(relative_image_path.split('.')[:-1])
        docname = stem.replace('/', '_').replace(';', '').replace(':', '')

        caption_too_short = len(caption) < 3
        description = "%05d %s" % (counter, caption.replace(' : ', ''))
        if caption_too_short:
            # Not enough caption text: file it as a hidden misc plot.
            plot_subfields = [
                ('a', image_location),
                ('t', "PlotMisc"),
                ('d', description),
                ('n', docname),
                ('o', "HIDDEN"),
            ]
        else:
            plot_subfields = [
                ('a', image_location),
                ('t', "Plot"),
                ('d', description),
                ('n', docname),
            ]
        lines.append(fft_xml(plot_subfields))

        if contexts:
            # The context file is attached hidden under the same docname.
            lines.append(fft_xml([
                ('a', "%s.context" % (image_location,)),
                ('t', "Plot"),
                ('f', ".png;context"),
                ('n', docname),
                ('o', "HIDDEN"),
            ]))
        counter += 1

    lines.append('</record>')
    return '\n'.join(lines)
def create_MARC(extracted_image_data, tarball, refno):
    """
    Take the images and their captions and the name of the associated TeX
    file and build a MARCXML record for them.

    Nothing is written to disk here; the record is only returned.

    @param: extracted_image_data ([(string, string, list, list), ...]):
        a list of (image location, caption, unused, contexts) tuples
        from this document. The caption may also be a list of strings,
        which is joined with single spaces. Entries whose image location
        is shorter than 3 characters are skipped.
    @param: tarball (string): path of the source tarball; the
        '<tarball>_plots' directory prefix is stripped from image
        locations when deriving document names
    @param: refno (string): the name for the record number field, or None;
        controlfield 001 is only emitted for an all-digit string

    @return: (string) the MARCXML record, or "" if no plots
    """
    root_dir = os.path.dirname(tarball) + os.sep + os.path.basename(tarball) + \
        '_plots' + os.sep

    marcxml_fft = []
    index = 0
    for (image_location, caption, dummy, contexts) in extracted_image_data:
        if len(image_location) < 3:
            # If not useful URL -> move on to next
            continue

        # Merge subfolder into docname, until root directory
        relative_image_path = image_location.replace(root_dir, '')
        docname = "_".join(relative_image_path.split('.')[:-1]).replace(
            '/', '_').replace(';', '').replace(':', '')

        # isinstance rather than an exact type comparison, so that list
        # subclasses are joined too instead of failing on .replace() below.
        if isinstance(caption, list):
            caption = " ".join(caption)

        # Captions too short to be meaningful are filed as hidden
        # "PlotMisc" entries; everything else is a visible "Plot".
        if len(caption) < 3:
            plot_type, hidden = "PlotMisc", True
        else:
            plot_type, hidden = "Plot", False

        # Datafield := (subfields, ind1, ind2, controlfield)
        # Subfield := (code, value)
        subfields = [
            ('a', image_location),
            ('t', plot_type),
            ('d', "%05d %s" % (index, caption.replace(' : ', ''))),
            ('n', docname),
        ]
        if hidden:
            subfields.append(('o', "HIDDEN"))
        marcxml_fft.append(
            field_xml_output((subfields, ' ', ' ', None), "FFT"))

        if contexts:
            # Add CONTEXT MARCXML
            subfields = [
                ('a', "%s.context" % (image_location, )),
                ('t', "Plot"),
                ('f', ".png;context"),
                ('n', docname),
                ('o', "HIDDEN"),
            ]
            marcxml_fft.append(
                field_xml_output((subfields, ' ', ' ', None), "FFT"))

        # Only images that produced a field consume an index.
        index += 1

    if not marcxml_fft:
        return ""

    # For building result MARCXML
    marcxml = ['<record>']
    #FIXME: Determine what to do without refno
    if refno and refno.isdigit():
        marcxml.append(field_xml_output((None, ' ', ' ', refno), '001'))
    marcxml.extend(marcxml_fft)
    marcxml.append('</record>')
    return '\n'.join(marcxml)
def update_record(record_id, authors):
    """Build the correction XML that adds CDS profile ids to a record's authors.

    The main author field (`100`) is emitted only when it has been
    extended. The co-author fields (`700`) are looked at when more than
    one author was requested or the main author was not updated; in that
    case every `700` field is emitted, extended or not.

    :param int record_id: record to update author datafields
        Example:
            record_id = 2150939
    :param dict authors: dictionary where keys are author full names and
        values the CDS profile ids to be updated in the given record
        Example:
            authors = {'Ellis, John': '2108556'}

    :return: string representing the record XML element containing
        author (`100`) and/or co-author (`700`) datafields. Empty string if
        nothing to update, or if the record has several `100` fields
        Example:
            '<record>
                <controlfield tag="001">2150939</controlfield>
                <datafield tag="100" ind1=" " ind2=" ">
                    <subfield code="a">Ellis, John</subfield>
                    <subfield code="u">King's Coll. London</subfield>
                    <subfield code="u">CERN</subfield>
                    <subfield code="0">AUTHOR|(CDS)2108556</subfield>
                    <subfield code="9">#BEARD#</subfield>
                </datafield>
            </record>'
    """
    record = get_record(record_id)
    main_author_fields = record_get_field_instances(record, "100")
    coauthor_fields = record_get_field_instances(record, "700")

    if len(main_author_fields) > 1:
        print("Oops: several '100' (main author) fields have been found in "
              "record '{0}'".format(record_id))
        return ""

    xml_parts = []
    updated = False

    for main_field in main_author_fields:
        try:
            main_name = field_get_subfield_values(main_field, 'a')[0]
            try:
                profile_id = authors[main_name]
                if extend_author_field(main_field, profile_id):
                    xml_parts.append(field_xml_output(main_field, "100"))
                    updated = True
            except KeyError:
                # This author was not requested
                pass
        except IndexError:
            # Author field (`100`) does not have subfield `a`
            pass

    if len(authors) > 1 or not updated:
        for co_field in coauthor_fields:
            try:
                co_name = field_get_subfield_values(co_field, 'a')[0]
                try:
                    profile_id = authors[co_name]
                    if extend_author_field(co_field, profile_id):
                        updated = True
                except KeyError:
                    # This co-author was not requested
                    pass
            except IndexError:
                # Co-author field (`700`) does not have subfield `a`
                pass
            # Every co-author field is serialized, changed or not.
            xml_parts.append(field_xml_output(co_field, "700"))

    if not updated:
        # Nothing to update
        return ""

    return ('<record><controlfield tag="001">{0}</controlfield>{1}'
            '</record>'.format(record_id, "".join(xml_parts)))
def create_MARC(extracted_image_data, tarball, refno):
    """
    Take the images and their captions and the name of the associated TeX
    file and build a MARCXML record for them.

    The record is returned as a string; this function writes no file.

    @param: extracted_image_data ([(string, string, list, list), ...]):
        a list of (image location, caption, unused, contexts) tuples
        from this document. A caption given as a list of strings is
        joined with single spaces. Image locations shorter than 3
        characters are ignored.
    @param: tarball (string): path of the source tarball; the
        "<tarball>_plots" directory prefix is removed from image
        locations when deriving document names
    @param: refno (string): the name for the record number field, or None;
        controlfield 001 is only added when it is an all-digit string

    @return: (string) the MARCXML record, or "" if no plots
    """
    root_dir = os.path.dirname(tarball) + os.sep + os.path.basename(tarball) + "_plots" + os.sep

    def fft_field(subfields):
        # Datafield := (subfields, ind1, ind2, controlfield)
        # Subfield := (code, value)
        return field_xml_output((subfields, " ", " ", None), "FFT")

    marcxml_fft = []
    index = 0
    for image_location, caption, dummy, contexts in extracted_image_data:
        if len(image_location) < 3:
            # If not useful URL -> move on to next
            continue

        # Merge subfolder into docname, until root directory
        relative_image_path = image_location.replace(root_dir, "")
        docname = "_".join(relative_image_path.split(".")[:-1]).replace("/", "_").replace(";", "").replace(":", "")

        # isinstance instead of `type(...) == list`: list subclasses must
        # be flattened as well, otherwise .replace() below would fail.
        if isinstance(caption, list):
            caption = " ".join(caption)

        has_real_caption = len(caption) >= 3
        subfields = [
            ("a", image_location),
            ("t", "Plot" if has_real_caption else "PlotMisc"),
            ("d", "%05d %s" % (index, caption.replace(" : ", ""))),
            ("n", docname),
        ]
        if not has_real_caption:
            # Plots without a usable caption are kept but hidden.
            subfields.append(("o", "HIDDEN"))
        marcxml_fft.append(fft_field(subfields))

        if contexts:
            # Add CONTEXT MARCXML
            marcxml_fft.append(
                fft_field(
                    [
                        ("a", "%s.context" % (image_location,)),
                        ("t", "Plot"),
                        ("f", ".png;context"),
                        ("n", docname),
                        ("o", "HIDDEN"),
                    ]
                )
            )

        # Skipped images do not consume an index.
        index += 1

    if not marcxml_fft:
        return ""

    # For building result MARCXML
    marcxml = ["<record>"]
    # FIXME: Determine what to do without refno
    if refno and refno.isdigit():
        marcxml.append(field_xml_output((None, " ", " ", refno), "001"))
    marcxml.extend(marcxml_fft)
    marcxml.append("</record>")
    return "\n".join(marcxml)