def test_collect_mdsec_elements(self):
    f1 = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
    f1.amdsecs.append(metsrw.AMDSec())
    f1.dmdsecs.append(metsrw.SubSection('dmdSec', None))
    f2 = metsrw.FSEntry('file2.txt', file_uuid=str(uuid.uuid4()))
    f2.dmdsecs.append(metsrw.SubSection('dmdSec', None))
    mw = metsrw.METSDocument()
    elements = mw._collect_mdsec_elements([f1, f2])
    # Check ordering - dmdSec before amdSec
    assert isinstance(elements, list)
    assert len(elements) == 3
    assert isinstance(elements[0], metsrw.SubSection)
    assert elements[0].subsection == 'dmdSec'
    assert isinstance(elements[1], metsrw.SubSection)
    assert elements[1].subsection == 'dmdSec'
    assert isinstance(elements[2], metsrw.AMDSec)
def add_new_files(mets, sip_uuid, sip_dir):
    """
    Add new files to structMap, fileSec.

    This supports adding new metadata or preservation files.

    If a new file is a metadata.csv, parse it to create dmdSecs.
    """
    # Find new files
    # How do we tell a new file from an old one with the same name? Check the hash?
    # QUESTION: should the metadata.csv be parsed and only updated if it
    # differs, even if one already existed?
    new_files = []
    old_mets_rel_path = _get_old_mets_rel_path(sip_uuid)
    metadata_csv = None
    objects_dir = os.path.join(sip_dir, 'objects')
    for dirpath, _, filenames in os.walk(objects_dir):
        for filename in filenames:
            # Find in METS
            current_loc = os.path.join(dirpath, filename).replace(
                sip_dir, '%SIPDirectory%', 1)
            rel_path = current_loc.replace('%SIPDirectory%', '', 1)
            print('Looking for', rel_path, 'in METS')
            fsentry = mets.get_file(path=rel_path)
            if fsentry is None:
                # If not in METS (and is not old METS), get File object and
                # store for later
                if rel_path != old_mets_rel_path:
                    print(rel_path, 'not found in METS, must be new file')
                    f = models.File.objects.get(
                        currentlocation=current_loc, sip_id=sip_uuid)
                    new_files.append(f)
                    if rel_path == 'objects/metadata/metadata.csv':
                        metadata_csv = f
            else:
                print(rel_path, 'found in METS, no further work needed')

    if not new_files:
        return mets

    # Set global counters so getAMDSec will work
    createmets2.globalAmdSecCounter = int(
        mets.tree.xpath('count(mets:amdSec)', namespaces=ns.NSMAP))
    createmets2.globalTechMDCounter = int(
        mets.tree.xpath('count(mets:amdSec/mets:techMD)', namespaces=ns.NSMAP))
    createmets2.globalDigiprovMDCounter = int(
        mets.tree.xpath('count(mets:amdSec/mets:digiprovMD)', namespaces=ns.NSMAP))

    objects_fsentry = mets.get_file(label='objects', type='Directory')

    for f in new_files:
        # Create amdSecs
        print('Adding amdSec for', f.currentlocation, '(', f.uuid, ')')
        amdsec, amdid = createmets2.getAMDSec(
            fileUUID=f.uuid,
            filePath=None,  # Only needed if use=original
            use=f.filegrpuse,
            type=None,  # Not used
            sip_uuid=sip_uuid,
            transferUUID=None,  # Only needed if use=original
            itemdirectoryPath=None,  # Only needed if use=original
            typeOfTransfer=None,  # Only needed if use=original
            baseDirectoryPath=sip_dir,
        )
        print(f.uuid, 'has amdSec with ID', amdid)

        # Create parent directories if needed
        dirs = os.path.dirname(
            f.currentlocation.replace('%SIPDirectory%objects/', '', 1)).split('/')
        parent_fsentry = objects_fsentry
        for dirname in (d for d in dirs if d):
            child = mets.get_file(type='Directory', label=dirname)
            if child is None:
                child = metsrw.FSEntry(
                    path=None,
                    type='Directory',
                    label=dirname,
                )
                parent_fsentry.add_child(child)
            parent_fsentry = child

        derived_from = None
        if f.original_file_set.exists():
            original_f = f.original_file_set.get().source_file
            derived_from = mets.get_file(file_uuid=original_f.uuid)
        entry = metsrw.FSEntry(
            path=f.currentlocation.replace('%SIPDirectory%', '', 1),
            use=f.filegrpuse,
            type='Item',
            file_uuid=f.uuid,
            derived_from=derived_from,
        )
        metsrw_amdsec = metsrw.AMDSec(tree=amdsec, section_id=amdid)
        entry.amdsecs.append(metsrw_amdsec)
        parent_fsentry.add_child(entry)

    # Parse metadata.csv and add dmdSecs
    if metadata_csv:
        mets = update_metadata_csv(mets, metadata_csv, sip_uuid, sip_dir)

    return mets
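
# Worked example of the path rewriting used in the walk above. The sip_dir value
# is hypothetical; the point is that a path discovered by os.walk() is rewritten
# to its %SIPDirectory%-prefixed form for the database lookup (currentlocation)
# and to a SIP-relative form for the METS lookup.
import os

sip_dir = '/var/archivematica/sharedDirectory/currentlyProcessing/sip-1/'
found = os.path.join(sip_dir, 'objects/metadata/metadata.csv')
current_loc = found.replace(sip_dir, '%SIPDirectory%', 1)
rel_path = current_loc.replace('%SIPDirectory%', '', 1)
assert current_loc == '%SIPDirectory%objects/metadata/metadata.csv'
assert rel_path == 'objects/metadata/metadata.csv'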
def test_identifier(self):
    # Should be in the format 'amdSec_1'
    amdsec = metsrw.AMDSec()
    assert amdsec.id_string()
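
# Hedged illustration of the generated-id format noted in the comment above (the
# exact counter value depends on how many AMDSec instances already exist, so only
# the prefix is checked here):
import metsrw

amdsec = metsrw.AMDSec()
assert amdsec.id_string().startswith('amdSec_')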
def test_tree_overwrites_serialize(self):
    elem = etree.Element('temp')
    amdsec = metsrw.AMDSec(tree=elem, section_id='id1')
    assert amdsec.serialize() == elem
def test_tree_no_id(self):
    with pytest.raises(ValueError) as excinfo:
        metsrw.AMDSec(tree=etree.Element('amdSec'))
    assert 'section_id' in str(excinfo.value)
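
# Counterpart sketch to the error case above (assumed construction, mirroring
# test_tree_overwrites_serialize): supplying a prebuilt amdSec tree is valid as
# long as a section_id accompanies it, and serialize() then returns that tree
# unchanged. The 'amdSec_42' id is an arbitrary illustrative value.
from lxml import etree
import metsrw

elem = etree.Element('amdSec')
prebuilt = metsrw.AMDSec(tree=elem, section_id='amdSec_42')
assert prebuilt.serialize() == elem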
def add_new_files(job, mets, sip_uuid, sip_dir):
    """
    Add new files to structMap, fileSec.

    This supports adding new metadata or preservation files.

    If a new file is a metadata.csv, parse it to create dmdSecs.
    """
    # Find new files
    # How do we tell a new file from an old one with the same name? Check the hash?
    # QUESTION: should the metadata.csv be parsed and only updated if it
    # differs, even if one already existed?
    new_files = []
    old_mets_rel_path = _get_old_mets_rel_path(sip_uuid)
    metadata_csv = None
    objects_dir = os.path.join(sip_dir, "objects")
    for dirpath, _, filenames in os.walk(objects_dir):
        for filename in filenames:
            # Find in METS
            current_loc = os.path.join(dirpath, filename).replace(
                sip_dir, "%SIPDirectory%", 1
            )
            rel_path = current_loc.replace("%SIPDirectory%", "", 1)
            job.pyprint("Looking for", rel_path, "in METS")
            fsentry = mets.get_file(path=rel_path)
            if fsentry is None:
                # If not in METS (and is not old METS), get File object and
                # store for later
                if rel_path != old_mets_rel_path:
                    job.pyprint(rel_path, "not found in METS, must be new file")
                    f = models.File.objects.get(
                        currentlocation=current_loc, sip_id=sip_uuid
                    )
                    new_files.append(f)
                    if rel_path == "objects/metadata/metadata.csv":
                        metadata_csv = f
            else:
                job.pyprint(rel_path, "found in METS, no further work needed")

    if not new_files:
        return mets

    # Seed the MetsState counters from metsrw's running ID counts so getAMDSec
    # continues numbering after the sections that already exist
    state = createmets2.MetsState(
        globalAmdSecCounter=metsrw.AMDSec.get_current_id_count(),
        globalTechMDCounter=metsrw.SubSection.get_current_id_count("techMD"),
        globalDigiprovMDCounter=metsrw.SubSection.get_current_id_count("digiprovMD"),
    )

    objects_fsentry = mets.get_file(label="objects", type="Directory")

    for f in new_files:
        # Create amdSecs
        job.pyprint("Adding amdSec for", f.currentlocation, "(", f.uuid, ")")
        amdsec, amdid = createmets2.getAMDSec(
            job,
            fileUUID=f.uuid,
            filePath=None,  # Only needed if use=original
            use=f.filegrpuse,
            sip_uuid=sip_uuid,
            transferUUID=None,  # Only needed if use=original
            itemdirectoryPath=None,  # Only needed if use=original
            typeOfTransfer=None,  # Only needed if use=original
            baseDirectoryPath=sip_dir,
            state=state,
        )
        job.pyprint(f.uuid, "has amdSec with ID", amdid)

        # Create parent directories if needed
        dirs = os.path.dirname(
            f.currentlocation.replace("%SIPDirectory%objects/", "", 1)
        ).split("/")
        parent_fsentry = objects_fsentry
        for dirname in (d for d in dirs if d):
            child = mets.get_file(type="Directory", label=dirname)
            if child is None:
                child = metsrw.FSEntry(path=None, type="Directory", label=dirname)
                parent_fsentry.add_child(child)
            parent_fsentry = child

        derived_from = None
        if f.original_file_set.exists():
            original_f = f.original_file_set.get().source_file
            derived_from = mets.get_file(file_uuid=original_f.uuid)
        entry = metsrw.FSEntry(
            path=f.currentlocation.replace("%SIPDirectory%", "", 1),
            use=f.filegrpuse,
            type="Item",
            file_uuid=f.uuid,
            derived_from=derived_from,
        )
        metsrw_amdsec = metsrw.AMDSec(tree=amdsec, section_id=amdid)
        entry.amdsecs.append(metsrw_amdsec)
        parent_fsentry.add_child(entry)

    # Parse metadata.csv and add dmdSecs
    if metadata_csv:
        mets = update_metadata_csv(job, mets, metadata_csv, sip_uuid, sip_dir, state)

    return mets
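
# Hedged follow-up sketch, not taken from the source: once add_new_files() has
# attached the new FSEntries and their amdSecs, the caller would typically write
# the updated METSDocument back to disk. The job, mets, sip_uuid, and sip_dir
# variables are the same ones passed in above; the output filename is a
# hypothetical choice, and mets.write() is assumed from the metsrw API.
mets = add_new_files(job, mets, sip_uuid, sip_dir)
mets.write(os.path.join(sip_dir, "METS.{}.xml".format(sip_uuid)), pretty_print=True)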