def validate_volmets(volume_file, s3_items_by_type, path_prefix):
    """
    Confirm that all paths and hashes in the volmets match files in S3.

    Returns None when no mismatches are detected, otherwise a dict with
    'only_in_mets' (entries listed in the METS but absent from S3) and
    'only_in_s3' (S3 entries not listed in the METS). Each entry is a
    (path, checksum) pair.
    """
    parsed = parse_xml(volume_file)
    mets_only = set()
    s3_only = set()

    for file_type in ('jp2', 'tiff', 'alto', 'casemets'):
        # collect (path, checksum) pairs declared in the METS for this file group
        mets_entries = set()
        selector = 'mets|fileGrp[USE="%s"] mets|file' % file_type
        for entry in parsed(selector).items():
            href = entry.children('mets|FLocat').attr(resolve_namespace('xlink|href'))
            mets_entries.add((path_prefix + href, entry.attr('CHECKSUM')))

        s3_entries = set(s3_items_by_type[file_type])
        if s3_entries != mets_entries:
            mets_only.update(mets_entries - s3_entries)
            s3_only.update(s3_entries - mets_entries)

    if mets_only or s3_only:
        return {
            'only_in_mets': list(mets_only),
            'only_in_s3': list(s3_only),
        }
    return None
def fix_file_group(group_name, new_mime_type=None, new_id_prefix=None):
    """
    Rewrite every <mets:file> in the fileGrp named `group_name`.

    Each file's CHECKSUM, SIZE, and xlink:href are replaced from the
    `new_file_info` lookup (keyed by href with `relative_path_prefix`
    stripped). Optionally also updates MIMETYPE and rewrites the file IDs,
    the fileGrp USE attribute, and matching <fptr> FILEID references to use
    `new_id_prefix`.

    NOTE(review): relies on enclosing-scope names `mets_xml`,
    `new_file_info`, `relative_path_prefix`, and `fptr_elements`.
    """
    group_el = mets_xml('mets|fileGrp[USE="%s"]' % group_name)
    for raw_file_el in group_el('mets|file'):
        wrapped = parse_xml(raw_file_el)
        locat = wrapped('mets|FLocat')
        href = locat.attr(resolve_namespace('xlink|href'))
        # look up replacement metadata by the path relative to the volume
        info = new_file_info[href.replace(relative_path_prefix, '')]
        if new_id_prefix:
            wrapped.attr('ID', wrapped.attr('ID').replace(group_name, new_id_prefix))
        if new_mime_type:
            wrapped.attr('MIMETYPE', new_mime_type)
        wrapped.attr('CHECKSUM', info['digest'])
        wrapped.attr('SIZE', str(info['length']))
        locat.attr(resolve_namespace('xlink|href'), relative_path_prefix + info['new_path'])
    if new_id_prefix:
        # fix fileGrp element
        group_el.attr('USE', new_id_prefix)
        # fix <fptr> elements
        for fptr_el in fptr_elements:
            file_id = fptr_el.attrib.get('FILEID', '')
            if file_id.startswith(group_name):
                fptr_el.attrib['FILEID'] = file_id.replace(group_name, new_id_prefix)
def test_serialize_xml_should_not_modify_input_xml(unaltered_alto_xml):
    """Round-tripping through parse_xml/serialize_xml must be lossless."""
    doc = parse_xml(unaltered_alto_xml)

    # make a change
    doc('[ID="b17-15"]').attr('ID', 'replace_me')

    # serialize parsed xml
    serialized = serialize_xml(doc)

    # undo the change for comparison
    assert b'replace_me' in serialized  # make sure modification worked
    serialized = serialized.replace(b'replace_me', b'b17-15')

    # serialized xml should be identical
    assert unaltered_alto_xml == serialized
def handle_alto_file(volume_file_path, tempdir, storage_name):
    """
    Rewrite an ALTO file so its image references point at .png instead of
    .tif, then write the result back out as gzipped XML.

    Returns the new-file-info record produced by format_new_file_info.
    """
    storage, out_path = single_file_setup(volume_file_path, tempdir, storage_name)

    with storage.open(volume_file_path, "r") as f:
        parsed_alto = parse_xml(f.read())

    # point the referenced image filename at the .png version
    name_el = parsed_alto('alto|fileName')
    name_el.text(name_el.text().replace('.tif', '.png'))

    # update the processing step description to match the new format
    step_el = parsed_alto('alto|processingStepDescription')
    step_el.text(step_el.text().replace('TIFF', 'PNG'))

    # write out xml
    gz_file, gz_path = write_xml_gz(parsed_alto, out_path)
    return format_new_file_info(volume_file_path, gz_path, gz_file)
def test_versioning(transactional_db, versioned_fixture_name, request):
    """Saving a versioned model should record the pre-save state in history."""
    # load initial volume_xml/case_xml/page_xml
    instance = request.getfixturevalue(versioned_fixture_name)
    snapshot = deepcopy(instance)

    # starts with no history
    assert instance.history.count() == 0

    # versions are only created once per transaction. since tests run in
    # transactions, run an initial sub-transaction so our next save creates a
    # new version. note: this is not sufficient when using the temporal_tables
    # extension, which additionally requires (transaction=True) as an argument
    # to the pytest.mark.django_db decorator
    with transaction.atomic(using='capdb'):
        instance.save()

    # make some modifications:
    instance.s3_key = 'changed'
    doc = parse_xml(instance.orig_xml)
    doc('mets').append("<new_element/>")
    instance.orig_xml = serialize_xml(doc)

    # save modified version:
    with transaction.atomic(using='capdb'):
        instance.save()

    # historical version should now exist:
    previous = instance.history.first()
    assert previous

    # current version's sys_period should start where historical version's sys_period ends:
    instance.refresh_from_db()  # load current sys_period
    assert instance.sys_period.lower == previous.sys_period.upper

    # historical version should have values from before latest save:
    assert previous.s3_key == snapshot.s3_key
    assert previous.orig_xml == snapshot.orig_xml
def get_case_metadata(case_xml):
    """
    Extract a metadata dict from a case METS XML string.

    Returns a (metadata, parsed) tuple, where `parsed` is the parsed XML
    document. For duplicative cases the metadata holds only volume_barcode,
    case_id, first/last page, and duplicative=True; otherwise it also
    includes name, citations, jurisdiction, court, dates, docket number,
    judges, attorneys, parties, and opinions.
    """
    # strip soft hyphens before parsing
    parsed = parse_xml(case_xml.replace('\xad', ''))

    # duplicative cases won't have a case section, so rather than using case.caseid we get the volume barcode from the
    # first alto file entry, and the case number from the casebody:
    alto_name = parsed('mets|fileGrp[USE="alto"] mets|FLocat')[0].attrib[
        resolve_namespace('xlink|href')].split('/')[-1]
    volume_barcode = re.match(r'([A-Za-z0-9_]+)_(un)?redacted([0-9_]*)', alto_name).group(1)
    case_number = parsed(
        'mets|fileGrp[USE="casebody"] > mets|file').attr["ID"].split('_')[1]
    case_id = "%s_%s" % (volume_barcode, case_number)

    metadata = {'volume_barcode': volume_barcode, 'case_id': case_id}

    # duplicative casebody: return the minimal metadata and stop here
    if parsed('duplicative|casebody'):
        first_page = parsed('duplicative|casebody').attr.firstpage
        last_page = parsed('duplicative|casebody').attr.lastpage
        return dict(
            metadata, **{
                'duplicative': True,
                'first_page': first_page,
                'last_page': last_page,
            }), parsed

    # one dict per <case:citation>, keyed by its category attribute
    citation_entries = parsed('case|case').find('case|citation')
    citations = [{
        'citation_type': cite.attrib['category'],
        'citation_text': cite.text,
        'is_duplicative': False
    } for cite in citation_entries]

    jurisdiction = parsed('case|court').attr('jurisdiction').strip()
    name = parsed('case|name').text()
    name_abbreviation = parsed('case|name').attr('abbreviation')
    first_page = parsed('casebody|casebody').attr.firstpage
    last_page = parsed('casebody|casebody').attr.lastpage
    decision_date_original = parsed('case|decisiondate').text()
    decision_date = parse_decision_date(decision_date_original)
    docket_number = parsed('case|docketnumber').text()
    court = {
        'name_abbreviation': parsed('case|court').attr.abbreviation,
        'name': parsed('case|court').text(),
    }

    # apply manual fixes
    jurisdiction, court['name'], court['name_abbreviation'] = fix_court_tag(
        jurisdiction, court['name'], court['name_abbreviation'])

    judges = [judge.text for judge in parsed('casebody|judges')]
    attorneys = [attorney.text for attorney in parsed('casebody|attorneys')]
    parties = [party.text for party in parsed('casebody|parties')]
    # one entry per opinion; author is None when no <casebody:author> text
    opinions = [{
        'type': opinion.attr('type'),
        'author': opinion('casebody|author').text() or None,
    } for opinion in parsed.items('casebody|opinion')]

    return dict(
        metadata, **{
            'name': name,
            'name_abbreviation': name_abbreviation,
            'jurisdiction': jurisdiction,
            'citations': citations,
            'first_page': first_page,
            'last_page': last_page,
            'decision_date_original': decision_date_original,
            'decision_date': decision_date,
            'court': court,
            'docket_number': docket_number,
            'duplicative': False,
            'judges': judges,
            'attorneys': attorneys,
            'parties': parties,
            'opinions': opinions
        }), parsed
def xml_equal(s1, s2, **kwargs):
    """Return whether two XML strings parse to equivalent root elements.

    Extra keyword arguments are forwarded to elements_equal.
    """
    root1 = parse_xml(s1)[0]
    root2 = parse_xml(s2)[0]
    return elements_equal(root1, root2, **kwargs)
def validate_volume(volume_path):
    """
    Perform basic sanity checks on captar archives, and write "ok" or an
    error into `validation` folder.

    Relative to captar_storage:
        volume_path looks like 'redacted/32044031754302_redacted'
        output file looks like 'validation/redacted/32044031754302_redacted.txt'
    """
    # helpers
    # the only acceptable sets of top-level file suffixes left over after
    # removing files listed in the volmets (see only_in_tar check below)
    top_level_file_sets = {("METS.md5", "METS.xml.gz"), ("BOXES.xml.gz", "METS.md5", "METS.xml.gz")}

    # used for control flow: every outcome (including "ok") is raised and
    # caught below, so the result is always recorded the same way
    class ValidationResult(Exception):
        pass

    # check last result
    result_path = str(Path('validation', volume_path).with_suffix('.txt'))
    if captar_storage.exists(result_path):
        last_result = json.loads(captar_storage.contents(result_path))
        if last_result[0] == "ok":
            print("Volume %s already validated; skipping." % volume_path)
            return

    try:
        # load tar file as a storage wrapper and get list of items
        with open_captar_volume(volume_path, raise_on_not_found=False) as volume_storage:
            if not volume_storage:
                raise ValidationResult(
                    "index_missing",
                    "Failed to load index for %s" % volume_path)
            # tar_items is a set of (path, md5) pairs
            tar_items = set(volume_storage.iter_files_recursive(with_md5=True))

            # volmets_path is path with no slashes ending in METS.xml.gz
            volmets_path = next(
                (item for item in tar_items
                 if item[0].count("/") == 0 and item[0].endswith("METS.xml.gz")
                 ), None)

            # check for missing volmets
            if not volmets_path:
                raise ValidationResult("volmets_missing", volume_path)

            # check md5 of volmets
            md5_path = next(
                (item[0] for item in tar_items
                 if item[0].count("/") == 0 and item[0].endswith(".md5")), None)
            if not md5_path:
                raise ValidationResult("md5_missing")
            if volmets_path[1] != volume_storage.contents(md5_path):
                raise ValidationResult("volmets_md5_mismatch")

            # strip .gz so the storage will decompress for us
            volmets_path = volmets_path[0][:-3]

            # check for mismatched files
            orig_xml = volume_storage.contents(volmets_path)
            parsed = parse_xml(orig_xml)
            tar_item_checksum_lookup = dict(tar_items)
            volmets_files = set()
            for i in parsed('mets|file').items():
                file_name = i.children('mets|FLocat').attr(
                    resolve_namespace('xlink|href'))
                checksum = i.attr('CHECKSUM')
                # special case -- because of a processing error, pdf files in volmets don't always have checksums
                # in that case, default the checksum to the actual file checksum so it will match
                if checksum is None and file_name.endswith('.pdf'):
                    checksum = tar_item_checksum_lookup.get(file_name)
                volmets_files.add((file_name, checksum))

            # check that all files in METS are expected
            only_in_mets = volmets_files - tar_items
            if only_in_mets:
                raise ValidationResult("only_in_mets",
                                       [list(i) for i in only_in_mets])

            # check that all files only_in_tar are expected (should be one volmets and one volmets md5)
            only_in_tar = tuple(
                sorted(item[0].rsplit('_', 1)[-1]
                       for item in tar_items - volmets_files))
            if only_in_tar not in top_level_file_sets:
                raise ValidationResult("only_in_tar", list(only_in_tar))

            # count suffixes
            suffix_counts = defaultdict(int)
            for item in volmets_files:
                suffix_counts[item[0].split('.', 1)[1]] += 1
            # a volume has either jpg or pdf color images; their count must
            # match the tif count, and xml.gz files must outnumber them
            color_image_count = suffix_counts['jpg'] or suffix_counts['pdf']
            if color_image_count == 0 or color_image_count != suffix_counts[
                    'tif'] or suffix_counts['xml.gz'] <= color_image_count:
                raise ValidationResult("unexpected_suffix_counts", suffix_counts)

            raise ValidationResult("ok")

    except ValidationResult as result:
        # record the outcome (success or failure) as a JSON list
        print(result.args)
        captar_storage.save(result_path,
                            BytesIO(json.dumps(result.args).encode()))
def handle_mets_file(volume_file_path, tempdir, storage_name, new_file_info,
                     relative_path_prefix=''):
    """
    Update a volume METS file after its referenced files were recompressed:
    append a PREMIS provenance record, rewrite each fileGrp's checksums,
    sizes, paths (and, for jp2, mime type and IDs) from `new_file_info`,
    then write the METS back out as gzipped XML.

    Returns the new-file-info record produced by format_new_file_info.
    """
    storage, out_path = single_file_setup(volume_file_path, tempdir, storage_name)

    with storage.open(volume_file_path, "r") as in_file:
        mets_xml = parse_xml(in_file.read())

    # add provenance data
    # spacing at start and end of string matters here -- makes sure formatting matches surrounding elements
    # NOTE(review): internal whitespace of this literal reconstructed from a
    # collapsed source -- confirm against the original file
    mets_xml('mets|amdSec').append("""
        <digiprovMD ID="digi004">
            <mdWrap MDTYPE="PREMIS">
                <xmlData>
                    <event xmlns="info:lc/xmlns/premis-v2">
                        <eventIdentifier>
                            <eventIdentifierType>Local</eventIdentifierType>
                            <eventIdentifierValue>proc0001</eventIdentifierValue>
                        </eventIdentifier>
                        <eventType>compression</eventType>
                        <eventDateTime>%s</eventDateTime>
                        <eventDetail>File compression</eventDetail>
                    </event>
                    <agent xmlns="info:lc/xmlns/premis-v2">
                        <agentIdentifier>
                            <agentIdentifierType>Local</agentIdentifierType>
                            <agentIdentifierValue>HLSL</agentIdentifierValue>
                        </agentIdentifier>
                        <agentName>Harvard Law School Library</agentName>
                        <agentType>organization</agentType>
                    </agent>
                </xmlData>
            </mdWrap>
        </digiprovMD>
    """ % (datetime.utcnow().isoformat().split('.')[0] + 'Z'))

    # update <fileGrp> sections
    fptr_elements = mets_xml('mets|fptr')

    def fix_file_group(group_name, new_mime_type=None, new_id_prefix=None):
        # rewrite checksum/size/href (and optionally mimetype and ID prefix)
        # for every <file> in the named fileGrp, using new_file_info
        file_group = mets_xml('mets|fileGrp[USE="%s"]' % group_name)
        for file_el in file_group('mets|file'):
            file_el = parse_xml(file_el)
            flocat_el = file_el('mets|FLocat')
            old_href = flocat_el.attr(resolve_namespace('xlink|href'))
            new_data = new_file_info[old_href.replace(relative_path_prefix, '')]
            if new_id_prefix:
                file_el.attr(
                    'ID',
                    file_el.attr('ID').replace(group_name, new_id_prefix))
            if new_mime_type:
                file_el.attr('MIMETYPE', new_mime_type)
            file_el.attr('CHECKSUM', new_data['digest'])
            file_el.attr('SIZE', str(new_data['length']))
            flocat_el.attr(resolve_namespace('xlink|href'),
                           relative_path_prefix + new_data['new_path'])
        if new_id_prefix:
            # fix fileGrp element
            file_group.attr('USE', new_id_prefix)
            # fix <fptr> elements
            for fptr in fptr_elements:
                fileid = fptr.attrib.get('FILEID', '')
                if fileid.startswith(group_name):
                    fptr.attrib['FILEID'] = fileid.replace(
                        group_name, new_id_prefix)

    fix_file_group('jp2', 'image/jpg', 'jpg')
    # use this if doing jp2 -> compressed jp2
    # fix_file_group('jp2')
    # use this if compressing tiff -> png
    # fix_file_group('tiff', 'image/png', 'png')
    fix_file_group('alto', 'text/xml+gzip')
    fix_file_group('casemets', 'text/xml+gzip')

    # write out xml
    out_file, out_path = write_xml_gz(mets_xml, out_path)
    return format_new_file_info(volume_file_path, out_path, out_file)