def prepared_ead(request, archive, filename, mode):
    """Display information about changes made by preparing an EAD file for
    publication.  If no changes are made, user will be redirected to main
    admin page with a message to that effect.

    In **summary** mode, displays a brief, color-coded summary of changes
    between original and prepped version of the file.  In **diff** mode,
    displays a full, side-by-side diff generated by
    :class:`difflib.HtmlDiff`.  (Note: because it is very large, the full
    diff is *not* embedded in the site template, and is intended to be
    opened in a new window.)

    :param request: the current request; passed through to
        ``prepared_eadxml`` and used for messages and template rendering
    :param archive: slug of the :class:`Archive` whose ``svn_local_path``
        contains the file
    :param filename: name of the file to prep; should be base filename only,
        document will be pulled from the configured source directory.
    :param mode: one of **diff** or **summary**
    :returns: in diff mode, an :class:`HttpResponse` containing the HTML
        diff; in summary mode with no changes, a 303 redirect to the admin
        index; otherwise the rendered ``fa_admin/prepared.html`` template
        with the change list, any errors, and the status code returned by
        ``prepared_eadxml``
    """
    # determine full path based on archive / svn
    # TODO: consider get_object_or_404(Archive, slug=archive) so that an
    # unknown archive slug is reported as a 404
    arch = Archive.objects.get(slug=archive)
    fullpath = os.path.join(arch.svn_local_path, filename)
    changes = []
    # Always bind errors: a successful prep with a mode other than
    # 'diff' or 'summary' previously reached render() below with the
    # name unbound and raised UnboundLocalError.
    errors = []

    # TODO: expire cache if file has changed since prepped eadxml was cached
    prep_ead = prepared_eadxml(request, arch.slug, filename)

    if prep_ead.status_code == 200:
        orig_ead = load_xmlobject_from_file(fullpath, FindingAid)  # validate or not?
        # store as serialized by xml object, so xml output will be the same
        original_xml = orig_ead.serializeDocument()

        prep_xml = prep_ead.content
        ead = load_xmlobject_from_string(prep_xml, FindingAid)  # validate?

        original_lines = original_xml.split('\n')
        prep_lines = prep_xml.split('\n')

        if mode == 'diff':
            diff = difflib.HtmlDiff(8, 80)  # set columns to wrap at 80 characters
            # generate a html table with line-by-line comparison
            # (meant to be called in a new window)
            changes = diff.make_file(original_lines, prep_lines)
            return HttpResponse(changes)

        elif mode == 'summary':
            # prepared EAD should pass sanity checks required for publication
            errors = utils.check_eadxml(ead)
            changes = list(difflib.unified_diff(original_lines, prep_lines))
            if not changes:
                messages.info(
                    request,
                    'No changes made to <b>%s</b>; EAD is already prepared.'
                    % filename)
                # redirect to main admin page with code 303 (See Other)
                return HttpResponseSeeOtherRedirect(reverse('fa-admin:index'))

    elif prep_ead.status_code == 500:
        # something went wrong with generating prep xml; could be one of:
        #  - non-well-formed xml (failed to load original document at all)
        #  - error generating an ARK for the document
        errors = [prep_ead.content]

    else:
        # this shouldn't happen; not 200 or 500 == something went dreadfully wrong
        errors = [
            'Something went wrong trying to load the specified document.',
            prep_ead.content,  # pass along the output in case it is useful?
        ]

    return render(request, 'fa_admin/prepared.html', {
        'filename': filename,
        'changes': changes,
        'errors': errors,
        'xml_status': prep_ead.status_code,
        'archive': arch})
def test_prep_ead(self):
    # Exercise utils.prep_ead against several fixture documents and check
    # the ids, ARK values and whitespace cleanup on the returned FindingAid.
    fixture_dir = os.path.join(settings.BASE_DIR, 'fa', 'tests', 'fixtures')

    # valid fixtures is an ead with series/subseries, and index
    # - clear out fixture ark url to trigger generating a new one (simulated)
    del self.valid_ead.eadid.url
    del self.valid_ead.eadid.identifier
    ead = utils.prep_ead(self.valid_ead, self.valid_eadfile)
    self.assertIsInstance(ead, FindingAid,
                          "prep_ead should return an instance of FindingAid")
    self.assertEqual(u'hartsfield558', ead.eadid.value)
    self.assertEqual(u'hartsfield558_series1', ead.dsc.c[0].id)
    self.assertEqual(u'hartsfield558_subseries6.1', ead.dsc.c[5].c[0].id)
    self.assertEqual(u'hartsfield558_index1', ead.archdesc.index[0].id)
    # ark should be generated and stored in eadid url
    self.assertEqual(MockDjangoPidmanClient.test_ark, ead.eadid.url)
    # short-form ark should be stored in identifier attribute
    self.assertTrue(
        MockDjangoPidmanClient.test_ark.endswith(ead.eadid.identifier))

    # ead with no series
    eadfile = os.path.join(fixture_dir, 'pittsfreeman1036.xml')
    ead = load_xmlobject_from_file(eadfile, FindingAid)
    ead = utils.prep_ead(ead, eadfile)
    self.assertIsInstance(ead, FindingAid,
                          "prep_ead should return an instance of FindingAid")
    self.assertEqual(u'pittsfreeman1036', ead.eadid.value)

    # series with no unitid
    eadfile = os.path.join(fixture_dir, 'raoul548.xml')
    ead = load_xmlobject_from_file(eadfile, FindingAid)
    ead = utils.prep_ead(ead, eadfile)
    self.assertEqual(u'raoul548_series3', ead.dsc.c[2].id)

    # whitespace cleanup
    ead = utils.prep_ead(self.invalid_ead, self.invalid_eadfile)
    # - no leading whitespace in list title
    # ead.archdesc.origination is getting normalized, so can't be used for testing
    origination = ead.node.xpath('//e:origination/e:persname',
                                 namespaces={'e': EAD_NAMESPACE})
    self.assertEqual(u'Hartsfield, William Berry.', origination[0].text)
    # test the node text directly (does not include unitdate)
    self.assertEqual(u'William Berry Hartsfield papers, ',
                     ead.unittitle.node.text)
    controlaccess = ead.archdesc.controlaccess.controlaccess
    self.assertEqual(u'Gone with the wind (Motion picture)',
                     controlaccess[0].title[0].value)
    self.assertEqual(u'Allen, Ivan.',
                     controlaccess[1].person_name[0].value)
    self.assertEqual(u'Mines and mineral resources--Georgia.',
                     controlaccess[3].subject[1].value)
    # unicode characters
    self.assertEqual(u'Motion pictures--Georgia. \u2026',
                     controlaccess[3].subject[2].value)
    self.assertEqual(u'Motion pictures.',
                     controlaccess[-1].genre_form[0].value)

    # remaining errors after clean-up:
    #  1 - duplicate origination
    #  2 - > 2 containers in a did (summary error and list of problem dids)
    #  2 - 1 container in a did (summary error and list of problem dids)
    #  = 5
    self.assertEqual(5, len(utils.check_eadxml(ead)),
        "only 3 errors (duplicate origination, 3 containers in a did, 1 container in a did) should be left in invalid test fixture after cleaning")

    # special case - unittitle begins with a <title>
    eadfile = os.path.join(fixture_dir, 'pittsfreeman1036.xml')
    ead = load_xmlobject_from_file(eadfile, FindingAid)
    ead = utils.prep_ead(ead, eadfile)
    self.assertFalse(unicode(ead.list_title).startswith('None'),
        'cleaned unittitle with leading <title> should not start with "None"')
def test_check_eadxml(self):
    # Check that utils.check_eadxml reports each of the problems present in
    # (or injected into) the invalid EAD fixture, and does not report
    # problems that are absent.

    # use invalid ead fixture to check error detection
    ead = self.invalid_ead
    ead.eadid.value = 'foo#~@/'  # set invalid eadid for this test only
    # invalid fixture has several errors
    errors = utils.check_eadxml(ead)
    self.assertNotEqual(0, len(errors))
    # - series/subseries ids missing, index id missing
    self.assertIn(
        "series c01 id attribute is not set for Series 1: Personal papers, 1918-1986",
        errors, 'c01 missing id error reported')
    self.assertIn(
        "subseries c02 id attribute is not set for Subseries 6.1: Minerals and mining files, 1929-1970",
        errors, 'c02 missing id error reported')
    self.assertIn(
        "index id attribute is not set for Index of Selected Correspondents",
        errors, 'index missing id error reported')
    # - origination count error
    self.assertIn(
        "Site expects only one archdesc/did/origination; found 2",
        errors, 'multiple origination error reported')
    # - whitespace in list title
    self.assertIn(
        "Found leading whitespace in list title field (origination/persname): " +
        "' Hartsfield, William Berry.'",
        errors, 'leading whitespace in origination reported')
    # - eadid regex
    self.assertIn(
        "eadid '%s' does not match site URL regular expression" % ead.eadid.value,
        errors, 'eadid regex error reported')

    # ARK in url and identifier not set or invalid
    self.assertIn(
        "eadid url is either not set or not an ARK. " +
        "To correct, run the prep process again.",
        errors, 'eadid ark not in url')
    # NOTE: there is no separator between the two halves of this message;
    # presumably it mirrors the text emitted by check_eadxml, so it is
    # left exactly as-is.
    self.assertIn(
        "eadid identifier is either not set or not an ARK" +
        "To correct, run the prep process again.",
        errors, 'eadid ark not in identifier')

    # valid ARKs in url and identifier but do not match
    ark1 = "http://testpid.library.emory.edu/ark:/25593/1234"
    ark1_short = "ark:/25593/1234"
    ark2_short = "ark:/25593/567"
    ead.eadid.url = ark1
    ead.eadid.identifier = ark2_short
    errors = utils.check_eadxml(ead)

    self.assertNotIn(
        "eadid url is either not set or not an ARK. " +
        "To correct, run the prep process again.",
        errors, 'valid eadid ark set in url')
    self.assertNotIn(
        "eadid identifier is either not set or not an ARK" +
        "To correct, run the prep process again.",
        errors, 'valid eadid ark set in identifier')
    self.assertIn(
        "eadid url and identifier do not match: url '%s' should end with identifier '%s'"
        % (ark1, ark2_short),
        errors, 'eadid url and identifier do not match')

    # Change url and identifier to match
    ead.eadid.url = ark1
    ead.eadid.identifier = ark1_short
    errors = utils.check_eadxml(ead)
    self.assertNotIn(
        "eadid url and identifier do not match: url '%s' should end with identifier '%s'"
        % (ark1, ark1_short),
        errors, 'eadid url and identifier match')

    # - list title first letter regex
    # simulate non-whitespace, non-alpha first letter in list title
    ead.list_title.node.text = "1234"  # list title is not normally settable; overriding for test
    errors = utils.check_eadxml(ead)
    self.assertIn(
        "First letter ('1') of list title field origination/persname does not match browse letter URL regex '%s'"
        % TITLE_LETTERS,
        errors, 'title first letter regex error reported')

    # empty/unset list title field
    ead.list_title.node.text = None
    errors = utils.check_eadxml(ead)
    self.assertIn("List title seems to be empty", errors)

    # - whitespace in control access terms
    self.assertIn(
        "Found leading whitespace in controlaccess term ' Gone with the wind (Motion picture)' (title)",
        errors, 'controlaccess title leading whitespace reported')
    self.assertIn(
        "Found leading whitespace in controlaccess term ' \t Selznick, David O., 1902-1965.' (persname)",
        errors, 'controlaccess name leading whitespace reported')
    self.assertIn(
        "Found leading whitespace in controlaccess term ' \t Mines and mineral resources--Georgia.' (subject)",
        errors, 'controlaccess subject leading whitespace reported')
    self.assertIn(
        "Found leading whitespace in controlaccess term ' Motion pictures.' (genreform)",
        errors, 'controlaccess genre leading whitespace reported')

    # - did with > 2 containers
    self.assertIn(
        'Site expects maximum of 2 containers per did; found 1 did(s) with more than 2',
        errors, 'did with more than 2 containers reported')
    # - did with only 1 container
    self.assertIn(
        'Site expects 2 containers per did; found 1 did(s) with only 1',
        errors, 'did with only 1 container reported')

    # make sure we handle quirky document with a <title> at the beginning of the <unittitle>
    eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests', 'fixtures',
                           'pittsfreeman1036.xml')
    ead_nested_title = load_xmlobject_from_file(eadfile, FindingAid)
    errors = utils.check_eadxml(ead_nested_title)
    self.assertTrue(
        all('list title' not in err for err in errors),
        'nested <title> in <unittitle> should not generate a list title whitespace error')