def parseMetadataCSV(job, metadataCSVFilePath):
    """
    Read a metadata.csv and return its rows keyed by file or directory name.

    The first CSV row is the header. Its first column (the filename column)
    is dropped and the remaining header cells are whitespace-stripped to give
    the metadata names. Every following non-empty row yields one entry: the
    first cell is the entry name (one trailing "/" is removed) and the
    remaining cells are paired with the header names. Repeated header names
    collect their values into one list.

    Example CSV:
        Filename,dc.title,dc.type,dc.type,Other metadata
        objects/foo.jpg,Foo,Photograph,Still Image,Taken on a sunny day
        objects/bar/,Bar,Photograph,Still Image,All taken on a rainy day

    Produces:
        {
        'objects/foo.jpg': OrderedDict(dc.title=[Foo], dc.type=[Photograph, Still Image], Other metadata=[Taken on a sunny day])
        'objects/bar': OrderedDict(dc.title=[Bar], dc.type=[Photograph, Still Image], Other metadata=[All taken on a rainy day])
        }

    If the same entry name appears more than once with different values, a
    message is written through ``job.pyprint`` (to stderr) and the later row
    replaces the earlier one.

    :param job: Object whose ``pyprint`` method is used to report overwrites
    :param metadataCSVFilePath: Path to the metadata CSV to parse
    :return: {<filename>: OrderedDict(<metadata name>: [<metadata value>]) }
    """
    parsed = {}
    # use universal newline mode to support unusual newlines, like \r
    # NOTE(review): the "U" flag and binary-mode csv reading are Python 2
    # conventions; confirm the target interpreter before touching this mode.
    with open(metadataCSVFilePath, "rbU") as csv_file:
        rows = csv.reader(csv_file)
        # The first row names the columns; column 0 is the filename column.
        first_row = next(rows)
        columns = [name.strip() for name in first_row[1:]]
        for record in rows:
            # Skip blank lines
            if not record:
                continue
            target = record[0]
            # Directories may be written with a trailing slash; normalise it
            target = target[:-1] if target.endswith("/") else target
            # Pair each remaining cell with its (possibly repeated) column name
            values = archivematicaFunctions.OrderedListsDict(
                zip(columns, record[1:])
            )
            if target in parsed:
                previous = parsed[target]
                if previous != values:
                    job.pyprint(
                        "Metadata for",
                        target,
                        "being overwritten. Old:",
                        previous,
                        "New:",
                        values,
                        file=sys.stderr,
                    )
            parsed[target] = values

    # Return a normal OrderedDict
    return collections.OrderedDict(parsed)
def parseDmdSec(dmdSec, label='[Placeholder title]'):
    """
    Parses a dmdSec into a dict with child tag names and their values

    Every descendant of the dmdSec's mets:xmlData element contributes its
    text (or '' when it has none) under its tag name with the namespace
    stripped. Repeated tag names accumulate their values in one list, in
    document order. The 'dublincore' wrapper element itself is skipped.

    For Dublin Core dmdSecs (mets:mdWrap MDTYPE of 'DC') that contain no
    'title' element, a placeholder title is inserted as the first entry.

    :param dmdSec: dmdSec element, or None when no metadata is available
    :param label: Default title if not provided. Required by CONTENTdm
    :returns: OrderedDict of {<child element tag>: [<value>, ...]}
    :raises AttributeError: if dmdSec has no mets:xmlData descendant
    """
    # If the dmdSec object is empty (i.e, no DC metadata has been assigned
    # in the dashboard, and there was no metadata.csv or other metadata file
    # in the transfer), return a placeholder title.
    if dmdSec is None:
        return collections.OrderedDict([('title', [label])])

    # Collect (tag name, text) pairs for all descendants first, so that the
    # presence of a title can be decided before anything is emitted.
    xmldata = dmdSec.find('.//mets:xmlData', namespaces=ns.NSMAP)
    entries = []
    for element in xmldata.iterdescendants():
        # Strip namespace prefix
        # TODO can tag names be unicode?
        tagname = re.sub(r'{\S+}', '', element.tag)  # \S = non whitespace
        if tagname in ('dublincore', ):
            continue
        entries.append((tagname, element.text or ''))

    elementsDict = archivematicaFunctions.OrderedListsDict()

    # If we are dealing with Dublin Core metadata, check to see if there is a
    # title (required by CONTENTdm). If not, assign a placeholder title.
    # xpath() returns a list of attribute values, so test membership rather
    # than equality with a string (which would never be true). The title
    # check uses the namespace-stripped tag, matching the key emitted below.
    mdTypes = dmdSec.xpath('mets:mdWrap/@MDTYPE', namespaces=ns.NSMAP)
    if 'DC' in mdTypes:
        hasTitle = any(tagname == 'title' for tagname, _ in entries)
        if not hasTitle:
            elementsDict['title'] = label

    # Key is the element's tag name, value is a list of the element's text
    for tagname, text in entries:
        # OrderedListsDict appends to lists as needed
        elementsDict[tagname] = text

    return collections.OrderedDict(elementsDict)