Пример #1
0
    def test_extref(self):
        """Check that extref content is rendered as an HTML anchor.

        Covers both an extref fixture that yields an ``href`` and one
        (``EXTREF_NOLINK``) that yields a bare ``<a>`` element.
        """
        # extref with a link target: anchor carries the href
        self.content.node = etree.fromstring(self.EXTREF)
        fmt = format_ead(self.content)
        # assertIn replaces the deprecated assert_ alias and reports both
        # operands when the check fails
        self.assertIn(
            '<a href="http://pid.emory.edu/ark:/25593/8zgst">Irish Literary Miscellany</a>',
            fmt, 'extref tag converted to a href')

        # extref without a link target: still rendered, just without href
        self.content.node = etree.fromstring(self.EXTREF_NOLINK)
        fmt = format_ead(self.content)
        self.assertIn('<a>Irish Literary Miscellany</a>', fmt,
                      'formatter should not fail when extref has no href')
Пример #2
0
 def test_title_emph(self):
     """Check rendering of a section that mixes emph and title tags.

     The formatted output must contain both the ``<em>`` markup and the
     ``ead-title`` span.
     """
     self.content.node = etree.fromstring(self.TITLE_EMPH)
     fmt = format_ead(self.content)
     # assertIn replaces the deprecated assert_ alias and reports both
     # operands when the check fails
     self.assertIn('<em>Biographical source:</em> "Shaw, George', fmt,
         "emph tag rendered correctly in section with title")
     self.assertIn('<span class="ead-title">Contemporary Authors Online</span>, Gale', fmt,
         "title rendered correctly in section with emph tag")
Пример #3
0
    def test_title(self):
        """Check how format_ead renders title tags in several variants.

        Covers the plain title (``ead-title`` span), a double-quoted title,
        multiple titles in one node, and the multiple-title case again with
        ``rdfa=True``.
        """
        self.content.node = etree.fromstring(self.TITLE)
        fmt = format_ead(self.content)
        # assertIn replaces the deprecated assert_ alias and reports both
        # operands when the check fails
        self.assertIn('magazine <span class="ead-title">The Smart Set</span> from', fmt,
                      "title tag converted correctly to span class ead-title")

        # title variants
        # - doublequotes
        self.content.node = etree.fromstring(self.TITLE_QUOT)
        fmt = format_ead(self.content)
        self.assertEqual('"Terminus"', fmt)
        # - multiple
        self.content.node = etree.fromstring(self.TITLE_MULTI)
        fmt = format_ead(self.content)
        self.assertEqual('Some Author: "Terminus", "Saturday"', fmt)

        # - multiple titles + RDFa
        # (self.content still holds the TITLE_MULTI node set just above)
        fmt = format_ead(self.content, rdfa=True)
        self.assertEqual('<span rel="dc:creator"><span typeof="schema:Person"><span property="schema:name">Some Author</span></span></span>: "<span inlist="inlist" property="dc:title">Terminus</span>", "<span inlist="inlist" property="dc:title">Saturday</span>"',
                         fmt)
Пример #4
0
 def test_exist_match(self):
     """Check that an exist:match tag becomes an ``exist-match`` span."""
     self.content.node = etree.fromstring(self.EXIST_MATCH)
     fmt = format_ead(self.content)
     # assertIn replaces the deprecated assert_ alias and reports both
     # operands when the check fails
     self.assertIn('Pitts v. <span class="exist-match">Freeman</span>', fmt,
         'exist:match tag converted to span for highlighting')
Пример #5
0
 def test_notrans(self):
     """Check that nested formatting in the NOTRANS fixture is rendered.

     The output must contain the ``ead-title`` span followed by the
     literal ``...`` text.
     """
     self.content.node = etree.fromstring(self.NOTRANS)
     fmt = format_ead(self.content)
     # assertIn replaces the deprecated assert_ alias and reports both
     # operands when the check fails
     self.assertIn('magazine <span class="ead-title">The Smart Set</span>...', fmt,
         "nested format rendered correctly")
Пример #6
0
 def test_bold(self):
     """Check that bold rendering becomes an ``ead-bold`` span."""
     self.content.node = etree.fromstring(self.BOLD)
     fmt = format_ead(self.content)
     # assertIn replaces the deprecated assert_ alias and reports both
     # operands when the check fails
     self.assertIn('<span class="ead-bold">Pitts v. Freeman</span> school desegregation', fmt,
         "render bold converted correctly to span class ead-bold")
Пример #7
0
 def test_italics(self):
     """Check that italic rendering becomes an ``ead-italic`` span."""
     self.content.node = etree.fromstring(self.ITALICS)
     fmt = format_ead(self.content)
     # assertIn replaces the deprecated assert_ alias and reports both
     # operands when the check fails
     self.assertIn('<span class="ead-italic">Pitts v. Freeman</span> school desegregation', fmt,
         "render italic converted correctly to span class ead-italic")
Пример #8
0
def check_eadxml(ead):
    """Sanity checks specific to the EAD xml, independent of file or eXist.

    Checks the following:
     - series and index ids are present
     - eadid matches site URL regex
     - fields used for search/browse title match code expectations:
        - at most one top-level origination
        - no leading whitespace in list-title (origination or unittitle)
        - alphabetical first letter (for first-letter browse)
     - every did that has containers has exactly two of them
     - no leading whitespace in unittitle or in controlaccess terms
     - eadid url and identifier are both ARKs and agree with each other

    :param ead: :class:`~findingaids.fa.models.FindingAid` ead instance to be checked
    :returns: list of all errors found.  Entries are message strings, except
        that each container-count message is immediately followed by one
        nested list holding a ``Line N: <xml>`` string per offending did.
    :rtype: list
    """
    # NOTE: throughout, be sure to use unicode instead of string
    errors = []
    ns = {'e': EAD_NAMESPACE}

    # check that series ids are set
    if ead.dsc and ead.dsc.hasSeries():
        for series in ead.dsc.c:
            errors.extend(check_series_ids(series))

    # check that any index ids are set
    for index in ead.archdesc.index:
        if not index.id:
            errors.append("%(node)s id attribute is not set for %(label)s"
                % {'node': local_name(index.node), 'label': unicode(index.head)})

    # eadid matches appropriate site URL regex
    # (anchored at both ends: the entire eadid should match the regex)
    if not re.match('^%s$' % EADID_URL_REGEX, ead.eadid.value):
        errors.append("eadid '%s' does not match site URL regular expression"
                      % ead.eadid.value)

    # multiple tests to ensure xml used for search/browse list-title matches what code expects
    # -- since list title is pulled from multiple places, give enough context so it can be found & corrected
    list_title_path = "%s/%s" % (local_name(ead.list_title.node.getparent()),
                                 local_name(ead.list_title.node))
    # - check for at most one top-level origination
    origination_count = ead.node.xpath('count(e:archdesc/e:did/e:origination)',
                                       namespaces=ns)
    if int(origination_count) > 1:
        errors.append("Site expects only one archdesc/did/origination; found %d"
                      % origination_count)

    # container list formatting (based on encoding practice) expects only 2 containers per did
    container_checks = (
        # - dids with more than 2 containers
        ('//e:did[count(e:container) > 2]',
         "Site expects maximum of 2 containers per did; found %d did(s) with more than 2"),
        # - dids with only one container
        ('//e:did[count(e:container) = 1]',
         "Site expects 2 containers per did; found %d did(s) with only 1"),
    )
    for did_xpath, summary in container_checks:
        dids = ead.node.xpath(did_xpath, namespaces=ns)
        if dids:
            errors.append(summary % len(dids))
            # NOTE(review): the per-did details are appended as ONE nested
            # list (not extended into errors); preserved because callers may
            # depend on that shape -- confirm against the code that displays
            # these errors.
            errors.append(['Line %d: %s' % (did.sourceline, tostring(did))
                           for did in dids])

    # - no leading whitespace in list title
    title_nodes = ead.node.xpath("%s/text()" % ead.list_title_xpath,
                                 namespaces=ns)
    if not title_nodes:
        # no text nodes at all: report as empty instead of raising IndexError
        title_text = None
    elif hasattr(title_nodes[0], 'text'):
        title_text = title_nodes[0].text
    else:
        # convert the first text node itself; converting the whole result
        # list would yield its repr ("[u'...']"), which can never start
        # with whitespace and so would disable the check below
        title_text = unicode(title_nodes[0])

    if title_text is None:
        errors.append("List title seems to be empty")
    elif re.match(r'\s+', title_text):
        # using node.text because unicode() normalizes, which obscures whitespace problems
        # report with enough context that they can find the appropriate element to fix
        errors.append("Found leading whitespace in list title field (%s): '%s'" %
                      (list_title_path, ead.list_title.node.text))

    # - first letter of title matches regex   -- only checked when the
    #   title is non-empty and has no leading whitespace
    elif not re.match(TITLE_LETTERS, ead.first_letter):
        errors.append("First letter ('%s') of list title field %s does not match browse letter URL regex '%s'" %
                      (ead.first_letter, list_title_path, TITLE_LETTERS))

    # leading space in unit title (could be list title but might not be)
    # NOTE: title can contain and even start with subtags such as <title>
    # or <emph>, which is hard to account for with lxml or text() xpath.
    # Using format_ead to generate html that would be displayed, and then
    # stripping tags to check for any leading whitespace within a leading tag
    title = striptags(format_ead(ead.unittitle))
    if re.match(r'\s+', title):
        errors.append("Found leading whitespace in unittitle: '%s'" %
                      title)

    # leading whitespace in control access fields (if any)
    if ead.archdesc.controlaccess and ead.archdesc.controlaccess.controlaccess:
        for ca in ead.archdesc.controlaccess.controlaccess:
            for term in ca.terms:
                # NOTE: using node text because term.value is now normalized
                if re.match(r'\s+', unicode(term.node.text)):
                    errors.append("Found leading whitespace in controlaccess term '%s' (%s)"
                                  % (term.node.text, local_name(term.node)))

    # eadid url should contain resolvable ARK
    if ead.eadid.url is None or not is_ark(ead.eadid.url):
        errors.append("eadid url is either not set or not an ARK. " +
            "To correct, run the prep process again.")

    # eadid identifier should contain short-form ARK
    if ead.eadid.identifier is None or not is_ark(ead.eadid.identifier):
        # ". " separator added so the two sentences no longer run together
        errors.append("eadid identifier is either not set or not an ARK. " +
            "To correct, run the prep process again.")

    # short- and long-form ARKs should match each other
    if ead.eadid.url is not None and ead.eadid.identifier is not None and \
        not ead.eadid.url.endswith(ead.eadid.identifier):
        errors.append("eadid url and identifier do not match: url '%s' should end with identifier '%s'"
                      % (ead.eadid.url, ead.eadid.identifier))

    return errors