def create_structmap(workspace, divs, structmap, filegrp, path=''):
    """Populate ``structmap`` recursively from a nested directory mapping.

    Keys of ``divs`` that end with ``-techmd.xml`` are handled as files: a
    file element is added to ``filegrp`` and an fptr element referring to
    it is collected. Every other key is handled as a directory: a div
    element is created for it and filled by a recursive call on
    ``divs[key]``.

    :workspace: workspace passed on to the metadata lookup helpers
    :divs: dict describing one directory level
    :structmap: element that receives the created fptr and div elements
    :filegrp: fileGrp element that receives the created file elements
    :path: path of the directory described by ``divs``; it is run through
           ``decode_path`` before the entry name is joined to it
    :returns: None
    """
    techmd_suffix = '-techmd.xml'
    file_pointers = []
    sub_divs = []

    for entry in divs:
        is_file = entry.endswith(techmd_suffix)
        name = entry[:-len(techmd_suffix)] if is_file else entry

        entry_path = encode_path(os.path.join(decode_path(path), name))
        amdids = get_links_event_agent(workspace, entry_path)

        if is_file:
            fileid = add_file_to_filesec(workspace, entry_path, filegrp,
                                         amdids)
            file_pointers.append(mets.fptr(fileid))
            continue

        # Not a file: build a div and descend into the sub-directory.
        _, dmdsec_id = ids_for_files(workspace, entry_path, 'dmdsec.xml')
        div_el = mets.div(type_attr=name, dmdid=dmdsec_id, admid=amdids)
        sub_divs.append(div_el)
        create_structmap(workspace, divs[entry], div_el, filegrp, entry_path)

    # File pointers are attached first, sub-directory divs after them.
    for child in file_pointers + sub_divs:
        structmap.append(child)
def test_encode_path():
    """Check encode_path for a plain path, affixes and non-ASCII input."""
    # The path separator is percent-encoded.
    assert utils.encode_path('tests/testpath') == 'tests%2Ftestpath'

    # Prefix and suffix are placed around the encoded path.
    with_affixes = utils.encode_path(
        'tests/testpath', suffix='-testsuffix', prefix='testprefix-'
    )
    assert with_affixes == 'testprefix-tests%2Ftestpath-testsuffix'

    # Non-ASCII characters are percent-encoded as well.
    non_ascii = utils.encode_path('tästs/tøstpath')
    assert non_ascii == 't%C3%A4sts%2Ft%C3%B8stpath'
def add_file_to_filesec(workspace, path, filegrp):
    """Create a file element for ``path`` and append it to ``filegrp``.

    A stream element is added under the file element for every stream
    returned by ``get_objectlist`` for the path.

    :param workspace: Workspace directory from which the metadata
                      references and the object list are looked up.
    :param path: url encoded path of the file
    :param lxml.etree.Element filegrp: fileGrp element
    :returns: unique identifier of file element
    """
    fileid = '_{}'.format(uuid4())

    # IDs of the administrative metadata sections linked to the file
    file_amdids = get_md_references(workspace, path=path)
    href = 'file://%s' % encode_path(path, safe='/')

    file_el = mets.file_elem(
        fileid,
        admid_elements=set(file_amdids),
        loctype='URL',
        xlink_href=href,
        xlink_type='simple',
        groupid=None,
    )

    # A falsy result (None or empty) simply adds no stream elements.
    for stream in get_objectlist(workspace, path) or []:
        stream_amdids = get_md_references(workspace, path=path,
                                          stream=stream)
        file_el.append(mets.stream(admid_elements=stream_amdids))

    filegrp.append(file_el)
    return fileid
def add_file_to_filesec(all_amd_refs, object_refs, path, filegrp):
    """Create a file element for ``path`` and append it to ``filegrp``.

    A stream element is added under the file element for every stream
    that ``get_objectlist`` returns for the path.

    :all_amd_refs: XML element tree of administrative metadata references
    :object_refs: XML tree of object references
    :path: url encoded path of the file
    :filegrp: fileGrp element
    :returns: unique identifier of file element
    """
    fileid = '_{}'.format(uuid4())

    # IDs of the administrative metadata sections linked to the file
    file_amdids = get_md_references(all_amd_refs, path=path)
    href = 'file://%s' % encode_path(path, safe='/')

    file_el = mets.file_elem(
        fileid,
        admid_elements=set(file_amdids),
        loctype='URL',
        xlink_href=href,
        xlink_type='simple',
        groupid=None,
    )

    # A falsy result (None or empty) simply adds no stream elements.
    for stream in get_objectlist(object_refs, path) or []:
        stream_amdids = get_md_references(all_amd_refs, path=path,
                                          stream=stream)
        file_el.append(mets.stream(admid_elements=stream_amdids))

    filegrp.append(file_el)
    return fileid
def create_premis_agent_file(workspace, event_type, agent_name, agent_type,
                             agent_identifier, event_target=None):
    """Write a PREMIS agent into ``<event_type>-agent-amd.xml``.

    When ``event_target`` is given the name becomes
    ``<event_target>-<event_type>-agent-amd.xml``. The complete name is
    passed through ``encode_path`` and the identifier handed to
    ``_create_mets`` is derived from the encoded name with ``encode_id``.

    :param workspace: path to directory where file is created
    :param event_type: event type (for filename)
    :param agent_name: PREMIS agentName
    :param agent_type: PREMIS agentType
    :param agent_identifier: PREMIS agentIdentifierValue
    :param event_target: event target file (for filename)
    :returns: output file path and METS XML element object
    """
    basename = '%s-agent-amd.xml' % event_type
    target_prefix = '%s-' % event_target if event_target else ''
    output_filename = encode_path(target_prefix + basename)
    output_path = os.path.join(workspace, output_filename)

    agent_id = encode_id(output_filename)
    premis_agent = create_premis_agent(agent_name, agent_type,
                                       agent_identifier)
    agent_mets = _create_mets(premis_agent, agent_id, 'PREMIS:AGENT')

    _write_mets(agent_mets, output_path)
    return (output_path, agent_mets)
def write_md(self, metadata, mdtype, mdtypeversion, othermdtype=None,
             section=None, stdout=False):
    """Wrap XML metadata into a METS document and write it to the workspace.

    The output filename is ``<digest>-<suffix>-amd.xml`` (passed through
    ``encode_path``), where ``<digest>`` is generated from the metadata and
    ``<suffix>`` is ``othermdtype`` when given, otherwise ``mdtype``.
    Nothing is written when a file with that name already exists.

    Serializing and hashing the root xml element can be rather time
    consuming and as such this method should not be called for each file
    unless a more efficient way of separating files by the metadata can't
    be easily implemented. That should be done by the subclasses of
    metadata_creator.

    :metadata (Element): metadata XML element
    :mdtype (string): Value of mdWrap MDTYPE attribute
    :mdtypeversion (string): Value of mdWrap MDTYPEVERSION attribute
    :othermdtype (string): Value of mdWrap OTHERMDTYPE attribute
    :section (string): 'digiprovmd' selects a digiprovMD section, any
                       other value a techMD section
    :stdout (boolean): Print also to stdout
    :returns: md_id, filename - Metadata id and filename
    """
    digest = generate_digest(metadata)
    md_id = '_{}'.format(digest)
    name_suffix = othermdtype or mdtype
    filename = os.path.join(
        self.workspace,
        encode_path("%s-%s-amd.xml" % (digest, name_suffix)))

    # The name is derived from the metadata digest, so an existing file
    # is left untouched.
    if os.path.exists(filename):
        return md_id, filename

    xmldata = mets.xmldata()
    xmldata.append(metadata)
    mdwrap = mets.mdwrap(mdtype, mdtypeversion, othermdtype)
    mdwrap.append(xmldata)

    make_section = mets.digiprovmd if section == 'digiprovmd' else mets.techmd
    amd = make_section(md_id)
    amd.append(mdwrap)

    amdsec = mets.amdsec()
    amdsec.append(amd)
    mets_ = mets.mets()
    mets_.append(amdsec)

    with open(filename, 'wb+') as outfile:
        outfile.write(xml_helpers.utils.serialize(mets_))
        if stdout:
            print(xml_helpers.utils.serialize(mets_).decode("utf-8"))
        print("Wrote lxml.etree %s administrative metadata to file "
              "%s" % (mdtype, outfile.name))

    return md_id, filename
def main(arguments=None):
    """The main method for import_description.

    Reads the descriptive metadata XML from ``args.dmdsec_location``, wraps
    it into a METS dmdSec and writes the METS document into the workspace.
    The file is named ``dmdsec.xml``, or the encoded ``args.dmdsec_target``
    followed by ``-dmdsec.xml`` when a target is given.

    :arguments: list of command line arguments handed to parse_arguments
    :returns: 0
    :raises TypeError: if the namespace of the metadata is not a key of
                       METS_MDTYPES
    """
    args = parse_arguments(arguments)

    if args.dmdsec_target:
        url_t_path = encode_path(args.dmdsec_target, suffix='-dmdsec.xml')
    else:
        url_t_path = 'dmdsec.xml'

    with open(args.dmdsec_location, 'r') as content_file:
        content = content_file.read()

    _mets = mets.mets()
    tree = lxml.etree.fromstring(content)

    # With desc_root 'remove' the root element is dropped and only its
    # children are wrapped.
    if args.desc_root == 'remove':
        childs = tree.findall('*')
    else:
        childs = [tree]

    xmldata_e = mets.xmldata(child_elements=childs)

    # The metadata type is chosen by the namespace of the first element.
    ns = h.get_namespace(childs[0])
    if ns not in METS_MDTYPES:
        raise TypeError("Invalid namespace: %s" % ns)
    md_info = METS_MDTYPES[ns]
    mdt = md_info['mdtype']
    mdo = md_info['othermdtype'] if 'othermdtype' in md_info else None
    mdv = md_info['version']

    mdwrap_e = mets.mdwrap(mdtype=mdt, othermdtype=mdo, mdtypeversion=mdv,
                           child_elements=[xmldata_e])
    dmdsec_e = mets.dmdsec(encode_id(url_t_path), child_elements=[mdwrap_e])
    _mets.append(dmdsec_e)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    if args.stdout:
        print(h.serialize(_mets))

    output_file = os.path.join(args.workspace, url_t_path)
    output_dir = os.path.dirname(output_file)
    # An empty dirname means the current directory; makedirs('') would fail.
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(output_file, 'w+') as outfile:
        outfile.write(h.serialize(_mets))

    print("import_description created file: %s" % output_file)

    return 0
def ead3_c_div(parent, structmap, filegrp, workspace, cnum=None):
    """Create a div for an EAD3 c element and append it to ``structmap``.

    The div's type is ``'c'`` followed by ``cnum`` (plain ``'c'`` when
    ``cnum`` is not given) and its label is the element's ``@otherlevel``
    attribute, falling back to ``@level``. Child elements whose local name
    is ``c`` or ``c01``..``c12`` are handled recursively with the next
    two-digit number. For every ``dao``/``daoset`` element under the
    element's ``did`` a file is added to ``filegrp`` and an fptr element is
    appended to the div; for a ``daoset`` only the first ``dao/@href`` is
    used.

    :parent: EAD3 c element
    :structmap: element that receives the created div
    :filegrp: fileGrp element that receives the created file elements
    :workspace: workspace passed on to the metadata lookup helpers
    :cnum: number of the c level, e.g. '01'; None for unnumbered c
    :returns: None
    """
    allowed_c_subs = (
        'c', 'c01', 'c02', 'c03', 'c04', 'c05', 'c06', 'c07', 'c08', 'c09',
        'c10', 'c11', 'c12',
    )

    otherlevel = parent.xpath("./@otherlevel")
    level = otherlevel[0] if otherlevel else parent.xpath("./@level")[0]

    if cnum:
        c_div = mets.div(type_attr=('c' + str(cnum)), label=level)
        # Pad to two digits. Prefixing a literal '0' would turn the level
        # after '09' into '010' instead of '10'.
        cnum_sub = '%02d' % (int(cnum) + 1)
    else:
        c_div = mets.div(type_attr='c', label=level)
        cnum_sub = None

    for elem in parent.findall("./*"):
        if ET.QName(elem.tag).localname in allowed_c_subs:
            ead3_c_div(elem, c_div, filegrp, workspace, cnum=cnum_sub)

    for files in parent.xpath("./ead3:did/*", namespaces=NAMESPACES):
        localname = ET.QName(files.tag).localname
        if localname not in ('dao', 'daoset'):
            continue

        if localname == 'daoset':
            href = files.xpath("./ead3:dao/@href", namespaces=NAMESPACES)[0]
        else:
            href = files.xpath("./@href")[0]
        tech_file = encode_path(href)

        amdids = get_links_event_agent(workspace, tech_file)
        fileid = add_file_to_filesec(workspace, tech_file, filegrp, amdids)
        c_div.append(mets.fptr(fileid=fileid))

    structmap.append(c_div)
def main(arguments=None):
    """The main method for import_object.

    For every file collected from ``args.files`` a PREMIS object is
    created, wrapped into a METS techMD section and written into the
    workspace. The output file is named after the encoded file path
    (relative to ``args.base_path`` when that is set) followed by
    ``-techmd.xml``.

    :arguments: list of command line arguments handed to parse_arguments
    :returns: 0
    """
    args = parse_arguments(arguments)

    # Loop files and create premis objects
    files = collect_filepaths(dirs=args.files, base=args.base_path)
    for filepath in files:
        if args.base_path != '':
            filerel = os.path.relpath(filepath, args.base_path)
        else:
            filerel = filepath

        xmldata = mets.xmldata()
        # The return value is not needed here.
        # NOTE(review): presumably the PREMIS object is attached to xmldata
        # by the call itself; confirm against create_premis_object.
        create_premis_object(
            xmldata, filepath, args.skip_inspection,
            args.format_name, args.format_version,
            args.digest_algorithm, args.message_digest,
            args.date_created, args.charset)

        # The same encoded name serves as the techMD ID source and as the
        # output file name.
        techmd_name = encode_path(filerel, suffix="-techmd.xml")

        mdwrap = mets.mdwrap('PREMIS:OBJECT', '2.3',
                             child_elements=[xmldata])
        techmd = mets.techmd(encode_id(techmd_name),
                             child_elements=[mdwrap])
        amdsec = mets.amdsec(child_elements=[techmd])
        _mets = mets.mets(child_elements=[amdsec])

        # Single-argument print(...) behaves the same on Python 2 and 3.
        if args.stdout:
            print(h.serialize(_mets))

        if not os.path.exists(args.workspace):
            os.makedirs(args.workspace)

        with open(os.path.join(args.workspace, techmd_name), 'w+') as outfile:
            outfile.write(h.serialize(_mets))

        print("Wrote METS technical metadata to file %s" % outfile.name)

    return 0
def test_import_object_ok(input_file, testpath):
    """Import one file and expect a METS file with exactly one techMD."""
    return_code = import_object.main(['--workspace', testpath, input_file])

    techmd_path = os.path.join(
        testpath, encode_path(input_file, suffix='-techmd.xml'))
    root = ET.parse(techmd_path).getroot()

    techmd_elements = root.xpath('/mets:mets/mets:amdSec/mets:techMD',
                                 namespaces=NAMESPACES)
    assert len(techmd_elements) == 1
    assert return_code == 0
def write_dict(self, file_metadata_dict, premis_amd_id):
    """Dump the file metadata dict as JSON for further scripts.

    The file is named ``<digest>-scraper.json`` (passed through
    ``encode_path``) and placed in the workspace, where ``<digest>`` is
    ``premis_amd_id`` without its first character. An existing file is
    left untouched.

    :file_metadata_dict: File metadata dict
    :premis_amd_id: The AMDID of corresponding premis FILE object
    :returns: None
    """
    digest = premis_amd_id[1:]
    filename = os.path.join(self.workspace,
                            encode_path("%s-scraper.json" % digest))

    if os.path.exists(filename):
        return

    with open(filename, 'wt') as outfile:
        json.dump(file_metadata_dict, outfile)
        print("Wrote technical data to: %s" % (outfile.name))
def get_fileid(filesec, path):
    """Return the ID of the file element in fileSec that points to `path`.

    The file is located by an FLocat element whose ``xlink:href`` equals
    ``file://`` followed by the encoded path (slashes kept as they are).

    :param lxml.etree Element filesec: fileSec element
    :param path: path of the file
    :returns: file element identifier
    :raises IndexError: if no matching file element is found
    """
    xpath_query = (
        '//mets:fileGrp/mets:file/mets:FLocat[@xlink:href="file://%s"]/..'
        % encode_path(path, safe='/')
    )
    matches = filesec.xpath(xpath_query, namespaces=NAMESPACES)
    return matches[0].attrib['ID']
def test_import_object_skip_inspection_nodate_ok(input_file, testpath):
    """Import with --skip_inspection and no creation date given."""
    arguments = [
        '--workspace', testpath, input_file,
        '--skip_inspection',
        '--format_name', 'image/dpx',
        '--format_version', '1.0',
        '--digest_algorithm', 'MD5',
        '--message_digest', '1qw87geiewgwe9',
    ]
    return_code = import_object.main(arguments)

    techmd_path = os.path.join(
        testpath, encode_path(input_file, suffix='-techmd.xml'))
    root = ET.parse(techmd_path).getroot()

    techmd_elements = root.xpath('/mets:mets/mets:amdSec/mets:techMD',
                                 namespaces=NAMESPACES)
    assert len(techmd_elements) == 1
    assert return_code == 0
def get_provenance_ids(self):
    """List identifiers of provenance events.

    Gets the dataset metadata from Metax and reads the event type of each
    referenced event from its ``<id>-PREMIS%3AEVENT-amd.xml`` file in the
    SIP creation directory. The provenance events of the dataset are then
    mapped to event IDs by their localized event type.

    :returns: list of provenance IDs
    :raises KeyError: if no event file was found for the event type of a
                      provenance event
    """
    config_object = Configuration(self.config)
    metax_client = Metax(
        config_object.get('metax_url'),
        config_object.get('metax_user'),
        config_object.get('metax_password'),
        verify=config_object.getboolean('metax_ssl_verification'))
    metadata = metax_client.get_dataset(self.dataset_id)
    languages = get_dataset_languages(metadata)

    # The Luigi task input holds the full path of the reference file;
    # only its base name is needed because the workspace is given
    # separately.
    reference_file = os.path.basename(
        self.input()['create_provenance_information'].path)
    event_ids = get_md_references(
        read_md_references(self.workspace, reference_file))

    # Map event type -> event ID, skipping IDs without an event file.
    event_type_ids = {}
    for event_id in event_ids:
        event_file_path = os.path.join(
            self.sip_creation_path,
            event_id[1:] + "-PREMIS%3AEVENT-amd.xml")
        if os.path.exists(event_file_path):
            # NOTE(review): the path is URL-encoded before parsing even
            # though the existence check uses the plain path - presumably
            # because the parser unescapes file names; confirm before
            # changing.
            root = ET.parse(encode_path(event_file_path)).getroot()
            event_type = root.xpath("//premis:eventType",
                                    namespaces=NAMESPACES)[0].text
            event_type_ids[event_type] = event_id

    return [
        event_type_ids[
            get_localized_value(
                provenance["preservation_event"]["pref_label"],
                languages=languages)
        ]
        for provenance in metadata["research_dataset"]["provenance"]
    ]
def test_import_object_validate_pdf_ok(input_file, testpath):
    """Import a PDF and check the detected format name and version."""
    # NOTE(review): main() is run on a hard-coded PDF while the output path
    # is derived from the input_file fixture - confirm both refer to the
    # same file.
    return_code = import_object.main(
        ['--workspace', testpath, 'tests/data/test_import.pdf'])

    techmd_path = os.path.join(
        testpath, encode_path(input_file, suffix='-techmd.xml'))
    root = ET.parse(techmd_path).getroot()

    techmd_elements = root.xpath('/mets:mets/mets:amdSec/mets:techMD',
                                 namespaces=NAMESPACES)
    format_names = root.xpath('//premis:formatName/text()',
                              namespaces=NAMESPACES)
    format_versions = root.xpath('//premis:formatVersion/text()',
                                 namespaces=NAMESPACES)

    assert len(techmd_elements) == 1
    assert format_names[0] == 'application/pdf'
    assert format_versions[0] == '1.4'
    assert return_code == 0
def test_import_object_structured_ok(testpath):
    """Import every file of the structured test data one by one.

    Each import must return 0 and produce a METS file with exactly one
    techMD section.
    """
    workspace = os.path.abspath(testpath)
    do = os.path.abspath(os.path.join(os.curdir, 'tests/data/structured'))
    test_file = ""

    for element in iterate_files(do):
        return_code = import_object.main(
            ['--workspace', workspace, os.path.relpath(element, os.curdir)])

        test_file = os.path.relpath(element, os.curdir)
        techmd_path = os.path.join(
            testpath, encode_path(test_file, suffix='-techmd.xml'))
        root = ET.parse(techmd_path).getroot()

        techmd_elements = root.xpath('/mets:mets/mets:amdSec/mets:techMD',
                                     namespaces=NAMESPACES)
        assert len(techmd_elements) == 1
        assert return_code == 0
def create_premis_event_file(workspace, event_type, event_datetime,
                             event_detail, event_outcome,
                             event_outcome_detail, event_target=None,
                             agent_identifier=None):
    """Write a PREMIS event into ``<event_type>-event-amd.xml``.

    When ``event_target`` is given the name becomes
    ``<event_target>-<event_type>-event-amd.xml``. The complete name is
    passed through ``encode_path`` and the identifier handed to
    ``_create_mets`` is derived from the encoded name with ``encode_id``.

    :param workspace: path to directory where file is created
    :param event_type: PREMIS eventType
    :param event_datetime: PREMIS eventDateTime
    :param event_detail: PREMIS eventDetail
    :param event_outcome: PREMIS eventOutcome
    :param event_outcome_detail: PREMIS eventOutcomeDetail
    :param event_target: event target file (for filename)
    :param agent_identifier: PREMIS linkingAgentIdentifierValue
    :returns: output file path and METS XML element object
    """
    basename = '%s-event-amd.xml' % event_type
    target_prefix = '%s-' % event_target if event_target else ''
    output_filename = encode_path(target_prefix + basename)
    output_path = os.path.join(workspace, output_filename)

    event_id = encode_id(output_filename)
    premis_event_elem = create_premis_event(
        event_type, event_datetime, event_detail, event_outcome,
        event_outcome_detail, agent_identifier)
    event_mets = _create_mets(premis_event_elem, event_id, 'PREMIS:EVENT')

    _write_mets(event_mets, output_path)
    return (output_path, event_mets)
def _find_event(workspace, event_type, event_datetime, event_detail,
                event_outcome, event_outcome_detail):
    """Tell whether a file for an identical event exists in the workspace.

    A PREMIS event is built from the given values and its digest is used
    to form the expected file name ``<digest>-PREMIS:EVENT-amd.xml``
    (passed through ``encode_path``).

    :returns: True if the expected file exists in ``workspace``
    """
    event_fields = {
        'event_type': event_type,
        'event_datetime': event_datetime,
        'event_detail': event_detail,
        'event_outcome': event_outcome,
        'event_outcome_detail': event_outcome_detail,
    }
    digest = generate_digest(create_premis_event(**event_fields))
    expected_path = os.path.join(
        workspace, encode_path("%s-PREMIS:EVENT-amd.xml" % digest))
    return os.path.exists(expected_path)
def get_fileid(filesec, path, file_ids=None):
    """Return the ID of the file with the given path.

    When a non-empty ``file_ids`` dict is given the ID is read from it.
    Otherwise the file element is searched from fileSec by an FLocat
    element whose ``xlink:href`` equals ``file://`` followed by the encoded
    path (slashes kept as they are).

    :filesec: fileSec element
    :path: path of the file
    :file_ids: Dict of file paths and file IDs
    :returns: file identifier
    """
    if file_ids:
        return file_ids[path]

    xpath_query = (
        '//mets:fileGrp/mets:file/mets:FLocat[@xlink:href="file://%s"]/..'
        % encode_path(path, safe='/')
    )
    matches = filesec.xpath(xpath_query, namespaces=NAMESPACES)
    return matches[0].attrib['ID']
def write(self, mdtype="OTHER", mdtypeversion="8.3", othermdtype="ADDML",
          filerel=None, section=None, stdout=False,
          file_metadata_dict=None):
    """Write all the METS XML files and the md-reference file.

    Overrides the base class write to handle the references correctly and
    to add flatFile fields to the METS XML files. For every collected
    metadata tree one METS file is written, a reference is added for each
    file that shares the metadata, and a flatFile line per file is
    inserted into the written METS file. Finally the references are
    written and the instance is re-initialized with the same workspace.

    :returns: None
    """
    for key, metadata in self.etrees.items():
        filenames = self.filenames[key]

        # Create METS XML file
        amd_id, amd_fname = self.write_md(
            metadata, mdtype, mdtypeversion, othermdtype)

        # Add all the files to references
        for filename in filenames:
            self.add_reference(amd_id, filerel or filename)

        # Append all the flatFile elements to the METS XML file
        flat_file_lines = [
            flat_file_str(encode_path(filename), "ref001")
            for filename in filenames
        ]
        append_lines(amd_fname, "<addml:flatFiles>", flat_file_lines)

    # Write md-references
    self.write_references()

    # Clear filenames and etrees
    self.__init__(self.workspace)
def run(self):
    """Create a METS document that contains the logical structural map.

    The files are grouped into one div per category returned by
    ``find_file_categories``. All category divs are placed under a
    wrapper div that carries the dmdSec ID read from the physical
    structmap and the provenance event IDs.

    :returns: ``None``
    """
    # The DMDID is read from the first child of the first child of the
    # root of the previously generated physical structmap.
    structmap_path = os.path.join(self.sip_creation_path, 'structmap.xml')
    physical_root = ET.parse(structmap_path).getroot()
    dmdsec_id = physical_root[0][0].attrib['DMDID']

    provenance_ids = self.get_provenance_ids()

    # Empty logical structmap inside a METS root element
    logical_structmap = mets.structmap(type_attr='Fairdata-logical')
    mets_structmap = mets.mets(child_elements=[logical_structmap])

    categories = self.find_file_categories()
    wrapper_div = mets.div(type_attr='logical',
                           dmdid=[dmdsec_id],
                           admid=provenance_ids)

    for category in categories:
        category_div = mets.div(type_attr=category)
        for filename in categories.get(category):
            encoded_name = encode_path(filename, safe='/')
            category_div.append(mets.fptr(self.get_fileid(encoded_name)))
        wrapper_div.append(category_div)

    logical_structmap.append(wrapper_div)

    with self.output().open('wb') as output:
        output.write(h.serialize(mets_structmap))
def main(arguments=None):
    """The main method for premis_event.

    Writes a METS file containing a PREMIS event into the workspace. When
    ``args.agent_name`` is given, a METS file containing the PREMIS agent
    is written first and the agent is linked to the event. The files are
    named ``<event_type>-agent.xml`` and ``<event_type>-event.xml``,
    prefixed with ``<event_target>-`` when a target is given, and the
    names are passed through ``encode_path``.

    :arguments: list of command line arguments handed to parse_arguments
    :returns: 0
    """
    args = parse_arguments(arguments)

    def _encoded_name(kind):
        """Return the encoded file name for ``kind`` ('agent' or 'event')."""
        if args.event_target:
            return encode_path('%s-%s-%s.xml' % (args.event_target,
                                                 args.event_type, kind))
        return encode_path('%s-%s.xml' % (args.event_type, kind))

    def _write(mets_root, output_file):
        """Serialize ``mets_root`` to ``output_file`` and report it."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if args.stdout:
            print(h.serialize(mets_root))

        output_dir = os.path.dirname(output_file)
        # An empty dirname means the current directory; makedirs('') would
        # fail.
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with open(output_file, 'w+') as outfile:
            outfile.write(h.serialize(mets_root))

        print("premis_event created file: %s" % output_file)

    if args.agent_name:
        _mets = mets.mets()
        amdsec = mets.amdsec()
        _mets.append(amdsec)

        agent_name = _encoded_name('agent')
        linking_agent_identifier = create_premis_agent(
            amdsec, encode_id(agent_name), args.agent_name, args.agent_type)

        _write(_mets, os.path.join(args.workspace, agent_name))
    else:
        linking_agent_identifier = None

    # Create event
    _mets = mets.mets()
    amdsec = mets.amdsec()
    _mets.append(amdsec)

    event_name = _encoded_name('event')
    create_premis_event(amdsec, args.event_type, args.event_datetime,
                        args.event_detail, args.event_outcome,
                        args.event_outcome_detail, linking_agent_identifier,
                        encode_id(event_name))

    _write(_mets, os.path.join(args.workspace, event_name))

    return 0
def create_addml_metadata(csv_file, delimiter, isheader, charset,
                          record_separator, quoting_char,
                          flatfile_name=None):
    """Create ADDML metadata for a CSV file.

    By default the metadata has no flatFile element; that is added by the
    write() method of the AddmlCreator class. This avoids getting
    different hashes for the same metadata under different file names. A
    flatFile element is added only if the optional parameter
    flatfile_name is provided. csv_file is not used as the name attribute
    of the flatFile element since it can differ from the original
    filepath, e.g. when it is a tmpfile downloaded from IDA.

    :csv_file: Path to the CSV file
    :delimiter: Delimiter used in the CSV file
    :isheader: True if CSV has a header else False
    :charset: Charset used in the CSV file
    :record_separator: Record separator handed to addml.delimfileformat
    :quoting_char: Quotation char used in the CSV file
    :flatfile_name: flatFile elements name attribute
    :returns: ADDML metadata XML element
    """
    column_names = csv_header(csv_file, delimiter, isheader).split(delimiter)

    description = ET.Element(addml.addml_ns('description'))
    reference = ET.Element(addml.addml_ns('reference'))

    # flatFileDefinitions: one String field definition per CSV column
    field_definitions = addml.wrapper_elems('fieldDefinitions')
    for column in column_names:
        field_definitions.append(
            addml.definition_elems('fieldDefinition', column, 'String'))

    record_definition = addml.definition_elems(
        'recordDefinition', 'record', 'rdef001', [field_definitions])
    record_definitions = addml.wrapper_elems(
        'recordDefinitions', [record_definition])
    flat_file_definition = addml.definition_elems(
        'flatFileDefinition', 'ref001', 'rec001', [record_definitions])
    flat_file_definitions = addml.wrapper_elems(
        'flatFileDefinitions', [flat_file_definition])

    # structureTypes: field, record and flat file types
    data_type = addml.addml_basic_elem('dataType', 'string')
    field_type = addml.definition_elems(
        'fieldType', 'String', child_elements=[data_type])
    field_types = addml.wrapper_elems('fieldTypes', [field_type])

    trimmed = ET.Element(addml.addml_ns('trimmed'))
    record_type = addml.definition_elems(
        'recordType', 'rdef001', child_elements=[trimmed])
    record_types = addml.wrapper_elems('recordTypes', [record_type])

    delim_file_format = addml.delimfileformat(
        record_separator, delimiter, quoting_char)
    charset_elem = addml.addml_basic_elem('charset', charset)
    flat_file_type = addml.definition_elems(
        'flatFileType', 'rec001',
        child_elements=[charset_elem, delim_file_format])
    flat_file_types = addml.wrapper_elems('flatFileTypes', [flat_file_type])

    structure_types = addml.wrapper_elems(
        'structureTypes', [flat_file_types, record_types, field_types])

    # The optional flatFile element goes before the definitions.
    flatfiles_children = [flat_file_definitions, structure_types]
    if flatfile_name:
        flatfile = addml.definition_elems(
            'flatFile', encode_path(flatfile_name), 'ref001')
        flatfiles_children.insert(0, flatfile)

    flatfiles = addml.wrapper_elems('flatFiles', flatfiles_children)
    return addml.addml([description, reference, flatfiles])