Пример #1
0
def load_alma_bib_record(kdip):
    """
    Fetch the bib record for a KDip from the Alma API.

    Accepts either a ``models.KDip`` instance or a barcode string; stores
    the resolved MMS id on the KDip before returning the parsed bib record.
    """
    # Allow callers to pass a bare barcode string instead of a KDip object.
    if isinstance(kdip, basestring):
        kdip = models.KDip.objects.get(kdip_id=kdip)

    item_response = requests.get('%sitems' % settings.ALMA_API_ROOT,
        params={
            'item_barcode': kdip.kdip_id,
            'apikey': settings.ALMA_APIKEY
        }
    )

    item_xml = item_response.text.encode('utf-8').strip()
    alma_item = load_xmlobject_from_string(item_xml, models.AlmaBibItem)

    # Persist the MMS id so the bib record can be fetched directly.
    kdip.mms_id = alma_item.mms_id
    kdip.save()

    bib_response = requests.get('%sbibs/%s' % (settings.ALMA_API_ROOT, kdip.mms_id),
        params={'apikey': settings.ALMA_APIKEY}
    )

    bib_xml = bib_response.text.encode('utf-8').strip()
    return load_xmlobject_from_string(bib_xml, models.AlmaBibRecord)
Пример #2
0
    def query(self, xquery=None, start=1, how_many=10, cache=False, session=None,
        release=None, result_type=None):
        """Execute an XQuery query, returning the results directly.

        :param xquery: a string XQuery query
        :param start: first index to return (1-based)
        :param how_many: maximum number of items to return
        :param cache: boolean, to cache a query and return a session id (optional)
        :param session: session id, to retrieve a cached session (optional)
        :param release: session id to be released (optional)
        :param result_type: class to parse the response into; defaults to the
            resultType specified at the creation of this ExistDB
        :rtype: the resultType specified at the creation of this ExistDB;
                defaults to :class:`QueryResult`.
        :raises ExistDBException: on a bad-request or any other error response

        """

        # xml_s = self.server.query(xquery, how_many, start, kwargs)
        params = {
            '_howmany': how_many,
            '_start': start,
        }
        if xquery is not None:
            params['_query'] = xquery
        if cache:
            params['_cache'] = 'yes'
        if release is not None:
            params['_release'] = release
        if session is not None:
            params['_session'] = session
        if result_type is None:
            result_type = self.resultType

        # summarize request options for debug logging, excluding the query
        # text itself (logged separately below)
        opts = ' '.join('%s=%s' % (key.lstrip('_'), val)
                        for key, val in params.iteritems() if key != '_query')
        if xquery:
            debug_query = '\n%s' % xquery
        else:
            debug_query = ''
        # lazy %-style args: the message is only built if debug is enabled
        logger.debug('query %s%s', opts, debug_query)

        response = self.session.get(self.restapi_path(''), params=params, stream=False)

        if response.status_code == requests.codes.ok:
            # successful release doesn't return any content
            if release is not None:
                return True  # successfully released

            # TODO: test unicode handling
            return xmlmap.load_xmlobject_from_string(response.content, result_type)

        # 400 bad request returns an xml error we can parse
        elif response.status_code == requests.codes.bad_request:
            err = xmlmap.load_xmlobject_from_string(response.content, ExistExceptionResponse)
            raise ExistDBException(err.message)

        # not sure if any information is available on other error codes
        else:
            raise ExistDBException(response.content)
Пример #3
0
 def test_index_data(self):
     """Index data generated from the sample MODS fixture should expose
     every mapped field with the expected (often unicode) values."""
     loaded = load_xmlobject_from_string(SAMPLE_MODS, mods.Mods)
     index_data = loaded.index_data()
     self.assertEqual(index_data['abstract'], [u'Poétry description...'])
     self.assertEqual(index_data['contributor_display'], ['Smith, Tom, 1803 or 4-1860 (creator)', 'Baker, Jim, 1718-1762 (director)', 'Wilson, Jane', 'Brown University. English (sponsor)'])
     self.assertEqual(index_data['copyrightDate'], '2008-01-01T00:00:00Z')
     self.assertEqual(index_data['dateCreated'], '2008-02-03T00:00:00Z')
     self.assertEqual(index_data['dateModified'], '2008-05-06T00:00:00Z')
     self.assertEqual(index_data['dateModified_ssim'], ['2008-06-07-2009-01-02', 'invalid date', '2008-06-07'])
     self.assertEqual(index_data['genre'], [u'aat theses', u'bdr theses', u'local theses'])
     self.assertEqual(index_data['mods_genre_aat_ssim'], [u'aat theses'])
     self.assertEqual(index_data['mods_genre_bdr_ssim'], [u'bdr theses'])
     self.assertEqual(index_data['mods_genre_local_ssim'], [u'local theses'])
     self.assertEqual(index_data['mods_access_condition_logo_ssim'], [u'http://i.creativecommons.org/p/zero/1.0/88x31.png'])
     self.assertEqual(index_data['mods_access_condition_use_text_tsim'], [u'To the extent possible under law, the person who associated CC0 with this work has waived all copyright and related or neighboring rights to this work.'])
     self.assertEqual(index_data['mods_access_condition_use_link_ssim'], [u'http://creativecommons.org/publicdomain/zero/1.0/'])
     self.assertEqual(index_data['mods_id'], 'id101')
     self.assertEqual(index_data['mods_id_test_type_ssim'], ['Test type id'])
     self.assertEqual(index_data['mods_note_random_type_ssim'], [u'random type note'])
     self.assertEqual(index_data['mods_note_display_label_ssim'], [u'display label note'])
     self.assertEqual(index_data['mods_title_alt'], [u'alternative title'])
     self.assertEqual(index_data['name'], ['Smith, Tom', 'Baker, Jim', 'Wilson, Jane', 'Brown University. English'])
     self.assertEqual(index_data['note'], [u'Thésis (Ph.D.)', u'discarded: random type note', u'Short: Without ending', u'Display @#$label? display label note'])
     self.assertEqual(index_data['other_title'], [u'Other title'])
     self.assertEqual(index_data['primary_title'], u'Poétry')
     self.assertEqual(index_data['keyword'], [u'Display Labél! modernism', u'metalepsis', u'Display Label: Yeats', u'Stevens', u'Merrill', u'Eliot', u"label missing colon: post modernism"])
     self.assertEqual(index_data['mods_subject_ssim'], [u'Display Labél! modernism', u'metalepsis', u'Display Label: Yeats', u'Stevens', u'Merrill', u'Eliot', u"label missing colon: post modernism"])
     self.assertEqual(index_data['mods_subject_display_label_ssim'], [u'modernism', u'Yeats'])
     self.assertEqual(index_data['mods_subject_local_ssim'], [u'Stevens', u'Eliot'])
Пример #4
0
 def test_geographic_subjects(self):
     """Hierarchical geographic subject parts should map to their own fields."""
     record = load_xmlobject_from_string(SAMPLE_MODS, mods.Mods)
     geo_subjects = [s for s in record.subjects if s.hierarchical_geographic]
     hierarchy = geo_subjects[0].hierarchical_geographic
     self.assertEqual(hierarchy.country, 'United States')
     self.assertEqual(hierarchy.state, 'Louisiana')
     self.assertEqual(hierarchy.city, 'New Orleans')
     self.assertEqual(hierarchy.city_section, 'Lower Ninth Ward')
Пример #5
0
 def test_main(self):
     """A pid set on the fixture should survive a serialize/parse round trip."""
     expected_pid = 'sample:123'
     self.fox.pid = expected_pid
     # round trip through xml to make sure the value is actually persisted
     round_tripped = load_xmlobject_from_string(self.fox.serialize(), Fox)
     self.assertEqual(expected_pid, round_tripped.pid)
Пример #6
0
 def test_isvalid(self):
     """is_valid() should accept the fixture MODS and reject the invalid xml."""
     # if additions to the MODS test fixture cause validation errors,
     # uncomment the next 2 lines to debug
     #self.mods.is_valid()
     #print self.mods.validation_errors()
     self.assertTrue(self.mods.is_valid())
     bad_mods = load_xmlobject_from_string(self.invalid_xml, mods.MODS)
     self.assertFalse(bad_mods.is_valid())
    def process_article(self, pid, symp_pub, options):
        """PUT the publication xml for *pid* to the publication API.

        Skips empty xml, rejects invalid xml, and honors options['noact']
        by logging what would be sent without performing the PUT.  Outcomes
        (skipped / errors / warnings / processed) are tallied in self.counts.
        """
        self.output(1, "Processing Article %s" % pid)

        # put article xml
        url = '%s/%s' % (self.pub_create_url, pid)
        status = None
        if symp_pub.is_empty():
            self.output(1, "Skipping because XML is empty")
            self.counts['skipped'] += 1
            return
        valid = symp_pub.is_valid()
        self.output(2, "XML valid: %s" % valid)
        if not valid:
            self.output(0, "Error publication xml is not valid for pid %s %s" % (pid, symp_pub.validation_errors()))
            self.counts['errors'] += 1
            return
        if not options['noact']:
            response = self.session.put(url, data=symp_pub.serialize())
            status = response.status_code
        self.output(2, "PUT %s %s" % (url, status if status else "<NO ACT>"))
        self.output(2, "=====================================================================")
        self.output(2, symp_pub.serialize(pretty=True).decode('utf-8', 'replace'))
        self.output(2, "---------------------------------------------------------------------")
        if status and status not in [200, 201]:
            self.output(0, "Error publication PUT returned code %s for %s" % (status, pid))
            self.counts['errors'] += 1
            return
        elif not options['noact']:
            # check for warnings returned by the service
            for w in load_xmlobject_from_string(response.raw.read(), OESympImportPublication).warnings:
                self.output(0, 'Warning: %s %s' % (pid, w.message))
                self.counts['warnings'] += 1
        self.counts['articles_processed'] += 1
Пример #8
0
 def test_round_trip(self):
     """Title and publisher should survive serialization and re-parsing."""
     self.mods.title = "Sample title"
     self.mods.publisher = "BUL"
     serialized = self.mods.serialize(pretty=False)
     parsed = load_xmlobject_from_string(serialized, Mods)
     self.assertEqual(parsed.title, 'Sample title')
     self.assertEqual(parsed.publisher, 'BUL')
Пример #9
0
 def test_subjects(self):
     """Subjects appended as topics should round-trip through serialization."""
     self.mods.title = "Sample"
     topics = ['sample', 'test']
     for topic in topics:
         self.mods.subjects.append(mods.Subject(topic=topic))
     reparsed = load_xmlobject_from_string(self.mods.serialize(), mods.Mods)
     self.assertEqual(topics, [subj.topic for subj in reparsed.subjects])
Пример #10
0
    def test_load_sample_mods(self):
        """Loading the sample MODS fixture should populate every mapped field:
        titles, origin info, names, genres, subjects, notes, physical
        description, classifications, locations, and related items."""
        loaded = load_xmlobject_from_string(SAMPLE_MODS, mods.Mods)
        self.assertEqual(loaded.id, 'id101')
        self.assertEqual(loaded.title, 'Poétry\n    Title')
        self.assertEqual(loaded.title_info[1].title, 'Other title')
        self.assertEqual(loaded.title_info[2].title, 'alternative title')
        self.assertEqual(loaded.title_info[2].type, 'alternative')
        self.assertEqual(loaded.title_info[2].label, 'First line')
        self.assertEqual(loaded.origin_info.label, 'date added')
        self.assertEqual(loaded.origin_info.places[0].place_terms[0].text, 'USA')
        self.assertEqual(loaded.origin_info.places[0].place_terms[0].authority, 'auth')
        self.assertEqual(loaded.origin_info.places[0].place_terms[0].authority_uri, 'http://auth.com')
        self.assertEqual(loaded.origin_info.places[0].place_terms[0].value_uri, 'http://auth.com/usa')

        #test names
        personal_names = [name.name_parts[0].text for name in loaded.names if name.type == 'personal' and name.name_parts[0].text]
        self.assertEqual(len(personal_names), 3)
        personal_name_list = ['Smith, Tom', 'Baker, Jim', 'Wilson, Jane']
        for i in range(3):
            self.assertTrue(personal_names[i] in personal_name_list)
        corporate_names = [name.name_parts[0].text for name in loaded.names if name.type == 'corporate']
        corporate_name_list = ['Brown University. English', 'Providence, RI']
        self.assertEqual(corporate_names, corporate_name_list)
        tom_smith = [name for name in loaded.names if name.name_parts[0].text == 'Smith, Tom'][0]
        self.assertEqual(tom_smith.authority, 'fast')
        self.assertEqual(tom_smith.authority_uri, 'http://fast.com')
        self.assertEqual(tom_smith.value_uri, 'http://fast.com/1')
        self.assertEqual(tom_smith.roles[0].authority, 'marcrelator')
        self.assertEqual(tom_smith.roles[0].authority_uri, 'http://id.loc.gov/vocabulary/relators')
        self.assertEqual(tom_smith.roles[0].value_uri, 'http://id.loc.gov/vocabulary/relators/cre')

        # resource type, genres, and fast subjects
        self.assertEqual(loaded.resource_type, 'text')
        self.assertEqual(loaded.genres[1].text, 'aat theses')
        self.assertEqual(loaded.genres[4].text, '123')
        self.assertEqual(loaded.genres[4].authority, 'fast')
        self.assertEqual(loaded.genres[4].authority_uri, 'http://fast.com')
        self.assertEqual(loaded.genres[4].value_uri, 'http://fast.com/123')
        s = [s for s in loaded.subjects if s.topic == '456'][0]
        self.assertEqual(s.authority, 'fast')
        self.assertEqual(s.authority_uri, 'http://fast.com')
        self.assertEqual(s.value_uri, 'http://fast.com/456')
        self.assertEqual(loaded.notes[0].text, 'Thésis (Ph.D.)')
        self.assertEqual(loaded.target_audiences[0].text, 'Target Audience')
        self.assertEqual(loaded.target_audiences[0].authority, 'local')
        self.assertEqual(loaded.physical_description.extent, 'viii, 208 p.')
        self.assertEqual(loaded.physical_description.digital_origin, 'born digital')
        self.assertEqual(loaded.physical_description.note, 'note 1')
        self.assertEqual(loaded.classifications[0].text, 'Some classification')
        self.assertEqual(loaded.classifications[0].label, 'Test classification')
        self.assertEqual(loaded.classifications[0].authority, 'classauth')
        self.assertEqual(loaded.classifications[0].authority_uri, 'http://classauth.com')
        self.assertEqual(loaded.classifications[0].value_uri, 'http://classauth.com/some')
        self.assertEqual(loaded.locations[0].physical.text, 'Random location')
        self.assertEqual(loaded.locations[0].physical.authority, 'locauth')
        self.assertEqual(loaded.locations[0].physical.authority_uri, 'http://locauth.com')
        self.assertEqual(loaded.locations[0].physical.value_uri, 'http://locauth.com/random')
        self.assertEqual(loaded.locations[0].holding_simple.copy_information[0].notes[0].text, 'location note')
        self.assertEqual(loaded.related_items[1].label, 'location of original')
        self.assertEqual(loaded.related_items[1].classifications[0].text, 'Classification')
Пример #11
0
 def test_setting_xlink_href(self):
     """An xlink:href set directly on the node should survive a round trip."""
     href_attr = '{%s}href' % mods.XLINK_NAMESPACE
     condition = mods.AccessCondition(text='access condition')
     condition.node.set(href_attr, 'http://example.com')
     self.mods.access_conditions.append(condition)
     reloaded = load_xmlobject_from_string(self.mods.serialize(pretty=False), mods.Mods)
     self.assertEqual(reloaded.access_conditions[0].node.get(href_attr), 'http://example.com')
Пример #12
0
 def loadFixtureData(self, fname):
     """Load fixture data, swapping in a fresh pid when a pidspace is set."""
     data = load_fixture_data(fname)
     if not hasattr(self, "pidspace"):
         return data
     # pidspace is specified: get a new pid from fedora and substitute
     # it into the fixture xml before returning
     minimal = xmlmap.load_xmlobject_from_string(data, _MinimalFoxml)
     minimal.pid = self.getNextPid()
     return minimal.serialize()
Пример #13
0
 def test_multiContextAndHolder(self):
     """Several contexts plus a holder should serialize and parse intact."""
     for context_id in ("rights1", "rights2", "rights3"):
         self.init_context(context_id)
     self.init_holder()
     reloaded = load_xmlobject_from_string(self.rights.serialize(pretty=True), Rights)
     self.assertEqual(len(reloaded.ctext), 3)
     self.assertEqual(reloaded.holder.context_ids, "rights1 rights2 rights3")
Пример #14
0
    def from_string(cls, xml_string, validate=True):
        """Build an instance of this class from an XML string.

        :param xml_string: XML string
        :param validate: whether to validate against the embedded XSD definition
        :type validate: Boolean
        :returns: the Python object
        """
        return xmlmap.load_xmlobject_from_string(
            xml_string, xmlclass=cls, validate=validate)
Пример #15
0
 def test_subjects(self):
     """Local topics added to the record should survive a round trip."""
     self.mods.title = "Sample"
     keywords = ['sample', 'test']
     for word in keywords:
         topic = LocalTopic()
         topic.topic = word
         self.mods.local_topic.append(topic)
     reparsed = load_xmlobject_from_string(self.mods.serialize(), Mods)
     self.assertEqual(keywords, [t.topic for t in reparsed.local_topic])
Пример #16
0
    def test_update_instance(self):
        """update_instance() should apply cleaned form data to a bound
        xmlobject, create a fresh instance for an unbound form, and honor
        formset DELETE flags."""
        # initialize data the same way a view processing a POST would
        update_form = TestForm(self.post_data, instance=self.testobj)
        # check that form is valid - if no errors, this populates cleaned_data
        self.assertTrue(update_form.is_valid())

        instance = update_form.update_instance()
        self.assert_(isinstance(instance, TestObject))
        self.assertEqual(21, instance.int)
        self.assertEqual(False, instance.bool)
        self.assertEqual('b', instance.id)
        self.assertEqual('completely new text content', instance.longtext)
        self.assertEqual(0, instance.other_child.val)

        # spot check that values were set properly in the xml
        xml = instance.serialize()
        self.assert_('id="b"' in xml)
        self.assert_('<boolean>no</boolean>' in xml)

        # test save on form with no pre-existing xmlobject instance
        class SimpleForm(XmlObjectForm):
            class Meta:
                model = TestObject
                fields = ['id', 'bool', 'longtext'] # fields with simple, top-level xpaths
                # creation for nested node not yet supported in xmlmap - excluding int
                exclude = ['child']      # exclude subform to simplify testing

        new_form = SimpleForm({'id': 'A1', 'bool': True, 'longtext': 'la-di-dah'})
        self.assertTrue(new_form.is_valid())
        instance = new_form.update_instance()
        self.assert_(isinstance(instance, TestObject),
            "update_instance on unbound xmlobjectform returns correct xmlobject instance")
        self.assertEqual(True, instance.bool)
        self.assertEqual('A1', instance.id)
        self.assertEqual('la-di-dah', instance.longtext)
        # spot check values in created-from-scratch xml
        xml = instance.serialize()
        self.assert_('id="A1"' in xml)
        self.assert_('<boolean>yes</boolean>' in xml)

        # formset deletion
        data = self.post_data.copy()
        # update post data to test deleting items
        data.update({
            'children-INITIAL_FORMS': 4,        # only initial forms can be deleted
            'children-0-DELETE': True,
            'children-2-DELETE': True,
        })
        # make a copy object, since the instance will be updated by the form
        testobj = xmlmap.load_xmlobject_from_string(self.testobj.serialize(), TestObject)
        update_form = TestForm(data, instance=self.testobj)
        # check that form is valid - if no errors, this populates cleaned_data
        self.assertTrue(update_form.is_valid())
        instance = update_form.update_instance()
        # children 0 and 2 should be removed from the updated instance
        self.assert_(testobj.children[0] not in instance.children)
        self.assert_(testobj.children[2] not in instance.children)
Пример #17
0
    def generate_tei(self, ocrpage):
        '''Generate TEI facsimile for the current page.

        :param ocrpage: OCR page xml object to transform
        :returns: :class:`tei.Facsimile`, or None if the OCR xml is
            not well-formed
        '''
        try:
            result = ocrpage.xsl_transform(filename=self.ocr_to_teifacsimile_xsl,
                return_type=unicode, **self.tei_options)
            # returns _XSLTResultTree, which is not JSON serializable;
            # re-parse into an xmlmap object so it can be handled downstream
            return xmlmap.load_xmlobject_from_string(result, tei.Facsimile)

        except etree.XMLSyntaxError:
            # logger.warn is a deprecated alias for logger.warning
            logger.warning('OCR xml for %s is invalid', self.pid)
Пример #18
0
   def test_isvalid(self):
       self.assertTrue(self.dc.is_valid())
       
       invalid = """<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/">
 <dc:title>Feet in the Fire</dc:title>
       <not_a_dc_field>bogus</not_a_dc_field>
       </oai_dc:dc>
       """
       invalid_dc = load_xmlobject_from_string(invalid, DublinCore)
       self.assertFalse(invalid_dc.is_valid())
Пример #19
0
    def test_isvalid(self):
        self.assertTrue(self.dc.is_valid())

        invalid = """<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/">
  <dc:title>Feet in the Fire</dc:title>
        <not_a_dc_field>bogus</not_a_dc_field>
        </oai_dc:dc>
        """
        invalid_dc = load_xmlobject_from_string(invalid, DublinCore)
        self.assertFalse(invalid_dc.is_valid())
Пример #20
0
    def test_pageV1_0(self):
        """Generating TEI for a PageV1_0 with abbyy ocr content should produce
        schema-valid TEI carrying the ocr id, page display label, configured
        distributor, and brief bibliographic data from the volume."""
        # page 1.0 - abbyy ocr content

        page = PageV1_0(
            Mock())  # use mock for fedora api, since we won't make any calls
        page.pid = 'rdxtest:4607'
        page.page_order = 5
        # vol = VolumeV1_0(Mock())
        with patch('readux.books.models.PageV1_0.volume') as mockvolume:
            mockvolume.uriref = rdflib.URIRef('vol:1')
            mockvolume.display_label = 'Mabel Meredith'
            mockvolume.volume = None
            mockvolume.creator = ['Townley, Arthur']
            mockvolume.date = '1863'

            # update fixture xml with ids
            with open(VolumeV1_0.ocr_add_ids_xsl) as xslfile:
                result = self.fr6v1.xsl_transform(filename=xslfile,
                                                  return_type=unicode)
                fr6v1_with_ids = load_xmlobject_from_string(
                    result, abbyyocr.Document)

            # use the first page with substantial text content as input
            ocr_page = fr6v1_with_ids.pages[5]
            teipage = page.generate_tei(ocr_page)
            # NOTE: uncomment to see generated TEI
            # print tei.serialize()

            # should be generating valid tei
            # if not tei.schema_valid():
            # print tei.schema_validation_errors()
            self.assertTrue(teipage.schema_valid(),
                            'generated TEI facsimile should be schema-valid')
            # inspect the tei and check for expected values
            # - page identifier based on page_order value passed in
            self.assertEqual(ocr_page.id, teipage.page.id,
                             'tei id should be carried through from ocr xml')
            self.assertEqual(page.display_label, teipage.title,
                             'tei title should be set from page diplay label')
            # distributor not mapped in teimap, so just use xpath to check
            self.assertEqual(
                settings.TEI_DISTRIBUTOR,
                teipage.node.xpath('string(//t:publicationStmt/t:distributor)',
                                   namespaces={'t': teipage.ROOT_NS}),
                'configured tei distributor should be set in publication statement'
            )
            # recognized as abbyy input
            self.assert_('Abbyy file' in teipage.header.source_description,
                         'input should be recognized as Abbyy ocr')
            # brief bibliographic data
            self.assert_(
                mockvolume.display_label in teipage.header.source_description)
            self.assert_(
                mockvolume.creator[0] in teipage.header.source_description)
            self.assert_(mockvolume.date in teipage.header.source_description)
    def test_WriteRead(self):
        """IR depositor fields should survive serialization and re-parsing."""
        self.ir.depositor_name = "Johnny"
        self.ir.depositor_email = "*****@*****.**"
        self.ir.date = "2012-05-31"
        self.ir.filename = "Multiple files"
        self.ir.collections_date = "2012-05-31"
        self.ir.collection = '598'

        serialized = self.ir.serializeDocument(pretty=True)
        reloaded = load_xmlobject_from_string(serialized, IR)
        self.assertEqual(reloaded.collection, '598')
Пример #22
0
    def process_relations(self, pid, relations, options):
        """POST relationship xml records for *pid* to the relation API.

        Each relation is validated before posting; options['noact'] logs
        what would be sent without performing any POSTs.  Errors and
        warnings are tallied in self.counts.
        """
        self.output(1, "Processing Relations for %s" % pid)

        # put relationship xml
        url = self.relation_create_url
        status = None
        for r in relations:
            self.output(0, "%s %s" % (r.from_object, r.to_object))
            status = None
            valid = r.is_valid()
            self.output(2, "XML valid: %s" % valid)
            if not valid:
                self.output(
                    0,
                    "Error because a relation xml is not valid for pid %s %s" %
                    (pid, r.validation_errors()))
                self.counts['errors'] += 1
                continue
            if not options['noact']:
                response = self.session.post(self.relation_create_url,
                                             data=r.serialize())
                status = response.status_code

            self.output(2,
                        "POST %s %s" % (url, status if status else "<NO ACT>"))
            self.output(2, r.serialize(pretty=True))
            self.output(
                2,
                "---------------------------------------------------------------------"
            )
        self.output(
            2,
            "====================================================================="
        )
        # NOTE: status and response reflect only the *last* relation posted
        if status and status not in [200, 201]:
            self.output(
                0,
                "Error relation POST returned code %s for %s" % (status, pid))
            self.counts['errors'] += 1
            return
        elif not options['noact']:
            # check for warnings returned by the service
            try:
                for w in load_xmlobject_from_string(
                        response.raw.read(), OESympImportArticle).warnings:
                    self.output(0, 'Warning: %s %s' % (pid, w.message))
                    self.counts['warnings'] += 1
            # narrowed from a bare except; still guards NameError when
            # `relations` was empty and `response` never got bound
            except Exception:
                self.output(
                    0,
                    "Trouble reading warnings for relation record in %s" % pid)

        self.counts['relations_processed'] += 1
Пример #23
0
    def from_string(cls, xml_string, validate=True):
        """Create an instance of this class from an XML string.

        :param xml_string: XML string
        :param validate: whether to validate against the embedded XSD definition
        :type validate: Boolean
        :returns: the Python object
        """
        return xmlmap.load_xmlobject_from_string(
            xml_string, xmlclass=cls, validate=validate)
Пример #24
0
 def test_index_title_parts(self):
     """Subtitle, part name/number and non-sort values should be indexed."""
     record = load_xmlobject_from_string(SAMPLE_MODS, mods.Mods)
     title = record.title_info_list[0]
     title.subtitle = "Primary Subtitle"
     title.part_name = "Primary Part 1"
     title.part_number = "4"
     title.non_sort = "The"
     indexed = record.index_data()
     self.assertEqual(indexed['subtitle'], u'Primary Subtitle')
     self.assertEqual(indexed['partnumber'], u'4')
     self.assertEqual(indexed['partname'], u'Primary Part 1')
     self.assertEqual(indexed['nonsort'], u'The')
Пример #25
0
 def test_relsIsMemberOf(self):
   """isMemberOf relations should survive a serialize/parse round trip."""
   rels = RelsExt()
   rels.about = 'info:fedora/test:124'
   ## attach a MemberOf relation
   member = MemberOf()
   member.name = 'info:fedora/test:master'
   rels.is_member_of.append(member)
   ## attach the RelsExt to the fox object and round-trip it
   self.fox.rels_ext = rels
   reloaded = load_xmlobject_from_string(self.fox.serialize(), Fox)
   self.assertTrue(reloaded.rels_ext.is_member_of[0].name == 'info:fedora/test:master')
   self.assertTrue('<rel:isMemberOf rdf:resource="info:fedora/test:master"/>' in reloaded.serialize())
Пример #26
0
 def add_ocr_ids(self, regenerate_ids=False):
     """Update OCR xml with ids for pages, blocks, lines, etc.

     :param regenerate_ids: when True, replace any existing ids
     :returns: True on success; False if the transform output is not
         well-formed xml
     """
     with open(self.ocr_add_ids_xsl) as xslfile:
         try:
             result = self.ocr.content.xsl_transform(filename=xslfile,
                 return_type=unicode, id_prefix='rdx_%s.' % self.noid,
                 regenerate_ids='true' if regenerate_ids else '')
             # set the result as ocr datastream content
             self.ocr.content = xmlmap.load_xmlobject_from_string(result)
             return True
         except etree.XMLSyntaxError:
             # logger.warn is a deprecated alias for logger.warning
             logger.warning('OCR xml for %s is invalid', self.pid)
             return False
Пример #27
0
    def test_pageV1_0(self):
        """Generating TEI for a PageV1_0 with abbyy ocr content should produce
        schema-valid TEI carrying the ocr id, page display label, configured
        distributor, and brief bibliographic data from the volume."""
        # page 1.0 - abbyy ocr content

        page = PageV1_0(Mock()) # use mock for fedora api, since we won't make any calls
        page.pid = 'rdxtest:4607'
        page.page_order = 5
        # vol = VolumeV1_0(Mock())
        with patch('readux.books.models.PageV1_0.volume') as mockvolume:
            mockvolume.uriref = rdflib.URIRef('vol:1')
            mockvolume.display_label = 'Mabel Meredith'
            mockvolume.volume = None
            mockvolume.creator = ['Townley, Arthur']
            mockvolume.date = '1863'

            # update fixture xml with ids
            with open(VolumeV1_0.ocr_add_ids_xsl) as xslfile:
                result = self.fr6v1.xsl_transform(filename=xslfile,
                    return_type=unicode)
                fr6v1_with_ids = load_xmlobject_from_string(result,
                    abbyyocr.Document)

            # use the first page with substantial text content as input
            ocr_page = fr6v1_with_ids.pages[5]
            teipage = page.generate_tei(ocr_page)
            # NOTE: uncomment to see generated TEI
            # print tei.serialize()

            # should be generating valid tei
            # if not tei.schema_valid():
                # print tei.schema_validation_errors()
            self.assertTrue(teipage.schema_valid(),
                'generated TEI facsimile should be schema-valid')
            # inspect the tei and check for expected values
            # - page identifier based on page_order value passed in
            self.assertEqual(ocr_page.id, teipage.page.id,
                'tei id should be carried through from ocr xml')
            self.assertEqual(page.display_label, teipage.title,
                'tei title should be set from page diplay label')
            # distributor not mapped in teimap, so just use xpath to check
            self.assertEqual(settings.TEI_DISTRIBUTOR,
                teipage.node.xpath('string(//t:publicationStmt/t:distributor)',
                    namespaces={'t': teipage.ROOT_NS}),
                'configured tei distributor should be set in publication statement')
            # recognized as abbyy input
            self.assert_('Abbyy file' in teipage.header.source_description,
                'input should be recognized as Abbyy ocr')
            # brief bibliographic data
            self.assert_(mockvolume.display_label in teipage.header.source_description)
            self.assert_(mockvolume.creator[0] in teipage.header.source_description)
            self.assert_(mockvolume.date in teipage.header.source_description)
Пример #28
0
 def test_multiple_cmodels(self):
     """Both content models appended to RELS-EXT should round-trip."""
     rels = RelsExt()
     rels.about = 'info:fedora/test:123'
     # attach both content models
     for model_uri in ('info:fedora/bdr-cmodel:commonMetadata',
                       'info:fedora/bdr-cmodel:masterImage'):
         cmodel = Cmodel()
         cmodel.name = model_uri
         rels.model.append(cmodel)
     self.fox.rels_ext = rels
     reloaded = load_xmlobject_from_string(self.fox.serialize(), Fox)
     model_names = [m.name for m in reloaded.rels_ext.model]
     self.assertTrue('info:fedora/bdr-cmodel:commonMetadata' in model_names)
     self.assertTrue('info:fedora/bdr-cmodel:masterImage' in model_names)
Пример #29
0
    def _render_item_to_rdf(self, xmlstring):
        """Render an ead file component through the item display template and
        parse the RDFa output into an :class:`rdflib.Graph`."""
        # load the xml as an ead series item and render it with the
        # file_item template used in findingaid display
        component = load_xmlobject_from_string(xmlstring, Series)
        self.ctxt.update({'component': component})
        rendered = self.item_tmpl.render(self.ctxt)
        # patch in the namespaces RDFa parsing needs, then parse the
        # result and return the rdflib graph
        html = '<html xmlns:schema="%s" xmlns:bibo="%s">%s</html>' % \
            (self.SCHEMA_ORG, self.BIBO, rendered)
        graph = rdflib.Graph()
        graph.parse(data=html, format='rdfa')
        return graph
Пример #30
0
 def test_add_users_and_build_hydra(self):
     """Readers, editors and discoverers should land in the hydra rights."""
     self.builder.addReader('*****@*****.**').addReader('BROWN:GROUP')
     self.builder.addReader('*****@*****.**')
     self.builder.addEditor('*****@*****.**')
     self.builder.addReader('*****@*****.**').addDiscoverer('*****@*****.**')
     serialized = self.builder.build_hydra().serialize(pretty=True)
     hydra = load_xmlobject_from_string(serialized, HydraRights)
     self.assertEqual(hydra.discover_access_group, [])
     self.assertEqual(hydra.discover_access_person, ['*****@*****.**'])
     self.assertEqual(hydra.read_access_group, ['BROWN:GROUP'])
     self.assertEqual(sorted(hydra.read_access_person), ['*****@*****.**', '*****@*****.**'])
     self.assertEqual(hydra.edit_access_group, [])
     self.assertEqual(hydra.edit_access_person, ['*****@*****.**'])
     self.assertEqual(hydra.delete_access_group, [])
     self.assertEqual(hydra.delete_access_person, [])
Пример #31
0
    def process_step(self, form):
        """Handle wizard step '0': send the submitted text to the UIMA worker
        over RabbitMQ as a blocking RPC call and store the XSL-transformed
        HTML result on ``self.uima_result``.

        :param form: the current wizard step's form
        :returns: the form's step data, unchanged
        """
        if self.steps.current == '0':
            text_type = form.data['0-text_type']
            text = form.data['0-text']

            # Prepare message
            uima_response = {}
            uima_response['response'] = None
            # correlation id ties the reply back to this specific request
            uima_corr_id = str(uuid.uuid4())
            uima_body = json.dumps({
                'text': text,
                'mode': text_type,
            })

            def uima_on_response(channel, method, props, body):
                # only accept the reply that matches our correlation id
                if uima_corr_id == props.correlation_id:
                    uima_response['response'] = body

            # Call UIMA
            uima_connection = BlockingConnection(
                ConnectionParameters(host=RABBITMQ_SERVER))
            uima_channel = uima_connection.channel()
            # exclusive, auto-named queue for receiving the single reply
            uima_result = uima_channel.queue_declare(exclusive=True)
            uima_callback_queue = uima_result.method.queue
            uima_channel.basic_consume(uima_on_response,
                                       no_ack=True,
                                       queue=uima_callback_queue)
            uima_channel.basic_publish(exchange='',
                                       routing_key='uima_plain_worker',
                                       properties=BasicProperties(
                                           reply_to=uima_callback_queue,
                                           content_type='application/json',
                                           correlation_id=uima_corr_id,
                                       ),
                                       body=uima_body)

            # block, pumping connection events, until the correlated reply
            # arrives (NOTE: no timeout — hangs if the worker never replies)
            while uima_response['response'] is None:
                uima_connection.process_data_events()

            # Transform result into HTML
            result = uima_response['response']
            result = xmlmap.load_xmlobject_from_string(result,
                                                       xmlclass=RocheTEI)
            result = result.body.xsl_transform(xsl=XSL_TRANSFORM_1).serialize()
            self.uima_result = result

        return self.get_form_step_data(form)
Пример #32
0
    def get_items(self, **kwargs):
        '''Query the DigWF API getItems method.  If no search terms
        are specified, getItems returns any items that are in the
        **Ready for Repository** state.  Any keyword arguments will be
        passed to getItems as query arguments.  Currently supports:

          * control_key (e.g., ocm or ocn number) - may match more
            than one item
          * item_id - the item id for the record in the DigWF
          * pid - the noid portion of the pid/ARK for the item

        :returns: :class:`Items`, or None on a non-OK response
        '''
        response = requests.get('%s/getItems' % self.base_url, params=kwargs)
        if response.status_code != requests.codes.ok:
            # no usable response body; fall through returning None
            return
        return xmlmap.load_xmlobject_from_string(response.content, Items)  # possible response.text ?
Пример #33
0
def load_bib_record(kdip):
    """
    Load a MARC XML bib record, e.g.
    http://discovere.emory.edu:8991/cgi-bin/get_alma_record?item_id=010002483050

    :param kdip: a KDip object, or a barcode as a string
    :returns: :class:`models.Marc` parsed from the response body
    """
    if isinstance(kdip, basestring):
        barcode = kdip
    else:
        barcode = kdip.kdip_id

    # BUG FIX: the URL previously ended in '?item_id=', which combined
    # with the params dict to yield a duplicate, empty query argument
    # ('?item_id=&item_id=<barcode>').  Let requests build the query
    # string from params alone.
    get_bib_rec = requests.get(
        'https://kleene.library.emory.edu/cgi-bin/get_alma_record',
        params={'item_id': barcode})

    return load_xmlobject_from_string(
        get_bib_rec.text.encode('utf-8'), models.Marc)
Пример #34
0
    def process_article(self, pid, symp_pub, options):
        """PUT a single article's publication XML to Symplectic-Elements.

        Skips empty or invalid XML, honors options['noact'] (dry run),
        updates self.counts, and reports any warnings returned by the
        import service.

        :param pid: article pid, appended to the publication create URL
        :param symp_pub: publication xmlobject to serialize and upload
        :param options: command options dict (uses 'noact')
        """
        self.output(1, "Processing Article %s" % pid)

        # put article xml
        url = '%s/%s' % (self.pub_create_url, pid)
        status = None
        if symp_pub.is_empty():
            # typo fix: message previously read "Skipping becase ..."
            self.output(1, "Skipping because XML is empty")
            self.counts['skipped'] += 1
            return
        valid = symp_pub.is_valid()
        self.output(2, "XML valid: %s" % valid)
        if not valid:
            self.output(
                0, "Error publication xml is not valid for pid %s %s" %
                (pid, symp_pub.validation_errors()))
            self.counts['errors'] += 1
            return
        if not options['noact']:
            response = self.session.put(url, data=symp_pub.serialize())
            status = response.status_code
        self.output(2, "PUT %s %s" % (url, status if status else "<NO ACT>"))
        self.output(
            2,
            "====================================================================="
        )
        self.output(2,
                    symp_pub.serialize(pretty=True).decode('utf-8', 'replace'))
        self.output(
            2,
            "---------------------------------------------------------------------"
        )
        if status and status not in [200, 201]:
            self.output(
                0, "Error publication PUT returned code %s for %s" %
                (status, pid))
            self.counts['errors'] += 1
            return
        elif not options['noact']:
            # check for warnings returned by the import service
            # NOTE(review): response.raw.read() depends on the session
            # being configured with stream=True by the caller; raw can
            # only be read once -- confirm.
            for w in load_xmlobject_from_string(response.raw.read(),
                                                OESympImportArticle).warnings:
                self.output(0, 'Warning: %s %s' % (pid, w.message))
                self.counts['warnings'] += 1
        self.counts['articles_processed'] += 1
Пример #35
0
    def get_all_xpaths(self):
        """Collect every xpath expression referenced by this object:
        its nodeset, variables, action stack frames, graph templates,
        field header/template functions, and (recursively) any child
        details.

        :returns: set of xpath strings, with ``None`` entries removed
        """
        result = set()

        if self.nodeset:
            result.add(self.nodeset)
        if self.has_variables():
            for variable in self.get_variables():
                result.add(variable.function)

        if self.actions:
            for action in self.actions:
                for frame in action.stack.frames:
                    result.add(frame.if_clause)
                    # not every frame type has datums
                    for datum in getattr(frame, 'datums', []):
                        result.add(datum.value)

        def _get_graph_config_xpaths(configuration):
            # xpath functions from a graph configuration block
            result = set()
            for config in configuration.configs:
                result.add(config.xpath_function)
            return result

        for field in self.fields:
            if field.template.form == 'graph':
                # graph templates are stored as raw XML nodes;
                # serialize and reparse to inspect them
                s = etree.tostring(field.template.node)
                template = load_xmlobject_from_string(s,
                                                      xmlclass=GraphTemplate)
                result.update(
                    _get_graph_config_xpaths(template.graph.configuration))
                for series in template.graph.series:
                    result.add(series.nodeset)
                    result.update(
                        _get_graph_config_xpaths(series.configuration))
            else:
                result.add(field.header.text.xpath_function)
                result.add(field.template.text.xpath_function)
                if field.template.text.xpath:
                    for variable in field.template.text.xpath.variables:
                        if variable.xpath:
                            result.add(str(variable.xpath.function))

        for detail in self.details:
            result.update(detail.get_all_xpaths())
        # optional attributes add None to the set; drop it before returning
        result.discard(None)
        return result
Пример #36
0
def index(request):
    """Render the activity index: the ten most recent UIMA annotations
    (transformed to HTML via XSLT) plus the latest plain annotations."""
    uima_latest = []
    for annotation in TextAnnotation.objects.all()[:10]:
        tei_doc = xmlmap.load_xmlobject_from_string(
            annotation.text.encode("utf-8"), xmlclass=RocheTEI)
        html = tei_doc.body.xsl_transform(xsl=XSL_TRANSFORM_1).serialize()
        # Remove div and p
        uima_latest.append([annotation, html])

    annotation_latest = Annotation.objects.all()[:100]

    data = {'uima_latest': uima_latest, 'annotation_latest': annotation_latest, }
    return render(request, 'activity/index.html', data)
Пример #37
0
    def get_items(self, **kwargs):
        '''Query the DigWF API getItems method.  If no search terms
        are specified, getItems returns any items that are in the
        **Ready for Repository** state.  Any keyword arguments will be
        passed to getItems as query arguments.  Currently supports:

          * control_key (e.g., ocm or ocn number) - may match more
            than one item
          * item_id - the item id for the record in the DigWF
          * pid - the noid portion of the pid/ARK for the item

        :returns: :class:`Items` on HTTP 200; implicitly None for any
            other status code
        '''
        url = '%s/getItems' % self.base_url
        r = requests.get(url, params=kwargs)
        # NOTE: returns None implicitly on any non-OK status
        if r.status_code == requests.codes.ok:
            return xmlmap.load_xmlobject_from_string(
                r.content, Items)  # possible r.text ?
Пример #38
0
def UpdateCustom(server, path, purge=False):
    """
    Function to update custom xml datastream for all existing objects.

    :param server: server key used to look up repository credentials
    :param path: directory containing *DATA.xml source files
    :param purge: when True, purge the existing CUSTOM datastream
        before saving the new one
    """
    i = 0  # NOTE(review): unused counter
    # NOTE(review): ``repo`` is reassigned two lines below, which makes
    # it local to this whole function -- as written this first call
    # raises UnboundLocalError.  The local Repository should get a
    # distinct name (e.g. ``repository``); confirm which object
    # Get_Pid / DatastreamXml actually expect before fixing.
    username, password, root = repo.Get_Configs(server)
    repo = Repository(root=root, username=username, password=password)

    xml_files = (x for x in os.listdir(path) if "DATA.xml" in x)
    for xml in xml_files:
        pid = repo.Get_Pid(xml, repo)
        dsx = DatastreamXml(pid, server=server, repo=repo)
        if pid is not None:
            print "Object found for {0}".format(xml)
            custom_ds = CustomEtd(os.path.join(path, xml),
                                  server=server,
                                  pid=pid)
            # NOTE(review): rebinds ``root`` (previously the repo root URL)
            root = custom_ds.CustomDs()

            # only used by the commented-out debug dump below
            custom_xml = os.path.join(path, xml.replace("DATA", "CUSTOM"))

            #with open(custom_xml, "w") as f:
            #    f.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True))

            # no xmlclass is given, so this parses into a generic XmlObject
            xml_object = xmlmap.load_xmlobject_from_string(
                etree.tostring(root,
                               pretty_print=True,
                               encoding='utf-8',
                               xml_declaration=True))

            if purge is True:
                dsx.digital_object.api.purgeDatastream(pid, "CUSTOM")
                print "PURGED CUSTOM"

            new_datastream = DatastreamObject(
                dsx.digital_object,
                "CUSTOM",
                "Custom metadata compiled by MSUL",
                mimetype="text/xml",
                control_group="X")
            new_datastream.content = xml_object
            new_datastream.label = "Custom metadata compiled by MSUL"
            new_datastream.save()
Пример #39
0
def show_annotated(request, uima_id):
    """
    Show previously annotated UIMA result.

    :param uima_id: primary key of the TextAnnotation to display
    """
    try:
        uima = TextAnnotation.objects.get(pk=int(uima_id))
        result = uima.text
    except Exception:
        # narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
        # are not swallowed; covers DoesNotExist and a non-numeric id
        result = ''

    # TODO: catch XMLSyntaxError
    # XSLT transform result
    q = xmlmap.load_xmlobject_from_string(result.encode("utf-8"), xmlclass=RocheTEI)
    result = q.body.xsl_transform(xsl=XSL_TRANSFORM_1).serialize()

    # TODO: need an other template...
    data = {'tei_documents': [q], 'tei_transform': result, 'max_juan': 0, }
    return render_to_response('browser/text_view.html', data, context_instance=RequestContext(request))
Пример #40
0
 def __init__(self,**kwargs):
     """
     Initializes web OPAC address from passed in variable.

     :keyword opac_url: base URL of the web OPAC (optional)
     :keyword item_id: item id; when given, the item XML is fetched
         immediately from opac_url + item_id
     """
     # dict.get defaults to None, replacing the deprecated has_key checks
     self.opac_url = kwargs.get('opac_url')
     self.item_id = kwargs.get('item_id')
     # always define item_xml so later attribute access cannot fail
     self.item_xml = None
     if self.item_id is not None:
         raw_xml_url = self.opac_url + self.item_id
         try:
             raw_xml = urllib2.urlopen(raw_xml_url).read()
             self.item_xml = xmlmap.load_xmlobject_from_string(raw_xml,xmlclass=ItemRecord)
         except Exception:
             # narrowed from a bare except; log and leave item_xml unset
             logging.error("ERROR with %s" % raw_xml_url)
             self.item_xml = None
Пример #41
0
def call_api(**kwargs):
    """Call the SHERPA/RoMEO API with the given query arguments and
    parse the body into a :class:`Response`.  Adds the configured API
    key when the caller did not supply one."""
    if 'ak' not in kwargs:
        if hasattr(settings, 'ROMEO_API_KEY'):
            kwargs['ak'] = settings.ROMEO_API_KEY
    request_url = '%s?%s' % (API_BASE_URL, urlencode(kwargs))
    resp_file = None
    parsed = None
    try:
        resp_file = urlopen(request_url)
        raw = resp_file.read()
        parsed = xmlmap.load_xmlobject_from_string(raw, xmlclass=Response)
    finally:
        if resp_file is not None:
            resp_file.close()
    if parsed is None:
        return Response() # dummy value to return when things have gone horribly wrong
    return parsed
Пример #42
0
def index(request):
    """Show recent activity: the latest UIMA annotations transformed to
    HTML, plus the latest plain annotations."""

    def _transform(uima):
        # parse the stored TEI and run the display XSL transform
        doc = xmlmap.load_xmlobject_from_string(uima.text.encode("utf-8"),
                                                xmlclass=RocheTEI)
        # Remove div and p
        return doc.body.xsl_transform(xsl=XSL_TRANSFORM_1).serialize()

    uima_latest = [[u, _transform(u)]
                   for u in TextAnnotation.objects.all()[:10]]

    annotation_latest = Annotation.objects.all()[:100]

    data = {
        'uima_latest': uima_latest,
        'annotation_latest': annotation_latest,
    }
    return render(request, 'activity/index.html', data)
Пример #43
0
    def query(self,
              xquery=None,
              start=1,
              how_many=10,
              cache=False,
              session=None,
              release=None,
              result_type=None):
        """Execute an XQuery query, returning the results directly.

        :param xquery: a string XQuery query
        :param start: first index to return (1-based)
        :param how_many: maximum number of items to return
        :param cache: boolean, to cache a query and return a session id (optional)
        :param session: session id, to retrieve a cached session (optional)
        :param release: session id to be released (optional)
        :param result_type: override the instance resultType for this call
        :rtype: the resultType specified at the creation of this ExistDB;
                defaults to :class:`QueryResult`.
        :raises ExistDBException: on a 400 response (parsed server error)
                or any other non-OK status.
        """

        params = {
            '_howmany': how_many,
            '_start': start,
        }
        if xquery is not None:
            params['_query'] = xquery
        if cache:
            params['_cache'] = 'yes'
        if release is not None:
            params['_release'] = release
        if session is not None:
            params['_session'] = session
        if result_type is None:
            result_type = self.resultType

        opts = ' '.join('%s=%s' % (key.lstrip('_'), val)
                        for key, val in params.items() if key != '_query')
        if xquery:
            debug_query = '\n%s' % xquery
        else:
            debug_query = ''
        logger.debug('query %s%s', opts, debug_query)
        # BUG FIX: this previously reassigned ``start``, clobbering the
        # start-index parameter, so the xquery_called signal below
        # reported a timestamp as its 'start' argument.
        start_time = time.time()
        response = self.session.get(self.restapi_path(''),
                                    params=params,
                                    stream=False,
                                    **self.session_opts)

        if xquery_called is not None:
            args = {
                'xquery': xquery,
                'start': start,
                'how_many': how_many,
                'cache': cache,
                'session': session,
                'release': release,
                'result_type': result_type
            }
            xquery_called.send(sender=self.__class__,
                               time_taken=time.time() - start_time,
                               name='query',
                               return_value=response,
                               args=[],
                               kwargs=args)

        if response.status_code == requests.codes.ok:
            # successful release doesn't return any content
            if release is not None:
                return True  # successfully released

            # TODO: test unicode handling
            return xmlmap.load_xmlobject_from_string(response.content,
                                                     result_type)

        # 400 bad request returns an xml error we can parse
        elif response.status_code == requests.codes.bad_request:
            err = xmlmap.load_xmlobject_from_string(response.content,
                                                    ExistExceptionResponse)
            raise ExistDBException(err.message)

        # not sure if any information is available on other error codes
        else:
            raise ExistDBException(response.content)
Пример #44
0
 def from_xml(cls, node):
     """Build an instance of this class from an ElementTree node."""
     serialized = ElementTree.tostring(node)
     return load_xmlobject_from_string(serialized, cls)
Пример #45
0
    def handle(self, *args, **options):
        """Sync published Articles from the repository into
        Symplectic-Elements.

        Positional args, if given, are treated as a list of pids;
        otherwise all Article objects are processed.  Articles that
        already exist in Elements (matched by PMC id or by fuzzy title
        match) are skipped unless options['force'] is set; when
        options['rel'] is set, relations are still updated for skipped
        matches.  Prints a summary of counts at the end.
        """
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters for the summary report printed at the end
        self.counts = defaultdict(int)

        # connection to repository
        repo = Repository(username=settings.FEDORA_MANAGEMENT_USER,
                          password=settings.FEDORA_MANAGEMENT_PASSWORD)

        # Symplectic-Elements setup
        # stream=True is required: responses below are read via raw.read()
        self.session = requests.Session()
        self.session.auth = (settings.SYMPLECTIC_USER,
                             settings.SYMPLECTIC_PASSWORD)
        self.session.verify = False
        self.session.stream = True
        self.session.headers.update({'Content-Type': 'text/xml'})

        self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                        "publications")
        self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                         "publication/records/manual")
        self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                              "relationships")

        # if pids specified, use that list
        try:
            if len(args) != 0:
                pids = list(args)
                pid_set = [repo.get_object(pid=p, type=Article) for p in pids]

            else:
                # search for Articles.
                pid_set = repo.get_objects_with_cmodel(
                    Article.ARTICLE_CONTENT_MODEL, Article)

        except Exception as e:
            raise CommandError('Error getting pid list (%s)' % e.message)

        try:
            articles = Paginator(pid_set, 20)
            self.counts['total'] = articles.count
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))

        # process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                self.counts['errors'] += 1
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(
                            1, "Skipping %s because pid does not exist" %
                            article.pid)
                        self.counts['skipped'] += 1
                        continue
                    # title is required for the title-match query below
                    title = article.descMetadata.content.title_info.title if (
                        article.descMetadata.content.title_info
                        and article.descMetadata.content.title_info.title
                    ) else None
                    if title is None or title == '':
                        self.output(
                            1, "Skipping %s because OE Title does not exist" %
                            (article.pid))
                        self.counts['skipped'] += 1
                        continue

                    if not article.is_published:
                        self.output(
                            1, "Skipping %s because pid is not published" %
                            article.pid)
                        self.counts['skipped'] += 1
                        continue

                    # try to detect article by PMC
                    if article.pmcid and not options['force']:
                        response = self.session.get(
                            self.pub_query_url,
                            params={
                                'query':
                                'external-identifiers.pmc="PMC%s"' %
                                article.pmcid,
                                'detail':
                                'full'
                            })
                        entries = load_xmlobject_from_string(
                            response.raw.read(), OESympImportArticle).entries
                        self.output(
                            2, "Query for PMC Match: GET %s %s" %
                            (response.url, response.status_code))
                        if response.status_code == 200:
                            if len(entries) >= 1:
                                self.output(
                                    1,
                                    "Skipping %s because PMC PMC%s already exists"
                                    % (article.pid, article.pmcid))
                                self.counts['skipped'] += 1

                                # update relations for the existing record
                                if options['rel']:
                                    symp_pub, relations = article.as_symp(
                                        source=entries[0].source,
                                        source_id=entries[0].source_id)
                                    self.process_relations(
                                        entries[0].source_id, relations,
                                        options)
                                    sleep(1)
                                continue
                        else:
                            # NOTE(review): entries was parsed from the
                            # failed response body; entries[0] here may
                            # raise IndexError if it is empty -- confirm.
                            self.output(
                                1,
                                "Skipping %s because trouble with request %s %s"
                                % (article.pid, response.status_code,
                                   entries[0].title))
                            self.counts['skipped'] += 1
                            continue

                    # try to detect article by Title if it does not have PMC
                    if not options['force']:
                        response = self.session.get(self.pub_query_url,
                                                    params={
                                                        'query':
                                                        'title~"%s"' % title,
                                                        'detail': 'full'
                                                    })
                        entries = load_xmlobject_from_string(
                            response.raw.read(), OESympImportArticle).entries
                        # Account for multiple results
                        titles = [e.title for e in entries]
                        self.output(
                            2, "Query for Title Match: GET %s %s" %
                            (response.url, response.status_code))
                        if response.status_code == 200:
                            found = False
                            for t in titles:
                                # fuzzy match at a 90% threshold
                                success, percent = percent_match(title, t, 90)
                                self.output(
                                    1, "Percent Title Match '%s' '%s' %s " %
                                    (title, t, percent))
                                if success:
                                    found = True
                            if found:
                                self.output(
                                    1,
                                    "Skipping %s because Title \"%s\" already exists"
                                    % (article.pid, title))
                                self.counts['skipped'] += 1

                                # update relations if rel is set
                                if options['rel']:
                                    symp_pub, relations = article.as_symp(
                                        source=entries[0].source,
                                        source_id=entries[0].source_id)
                                    self.process_relations(
                                        entries[0].source_id, relations,
                                        options)
                                    sleep(1)
                                continue
                        else:
                            # NOTE(review): same entries[0] concern as in
                            # the PMC failure branch above.
                            self.output(
                                1,
                                "Skipping %s because trouble with request %s %s"
                                % (article.pid, response.status_code,
                                   entries[0].title))
                            self.counts['skipped'] += 1
                            continue

                    # Process article and relations
                    symp_pub, relations = article.as_symp()
                    self.process_article(article.pid, symp_pub, options)
                    self.process_relations(article.pid, relations, options)
                    sleep(1)

                except Exception as e:
                    self.output(
                        0, "Error processing pid: %s : %s " %
                        (article.pid, e.message))
                    import traceback
                    traceback.print_exc()
                    self.counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
        self.stdout.write("Articles Processed: %s\n" %
                          self.counts['articles_processed'])
        self.stdout.write("Relations Processed: %s\n" %
                          self.counts['relations_processed'])
Пример #46
0
 def setUp(self):
     # run the parent fixture setup, then parse the MODS fixture string
     # into an xmlobject once per test
     super(TestMods, self).setUp()
     self.mods = load_xmlobject_from_string(self.FIXTURE, mods.MODS)
Пример #47
0
 def from_string(cls, xml_string):
     """Create a Report instance by parsing an XML string."""
     parsed = xmlmap.load_xmlobject_from_string(xml_string, xmlclass=cls)
     return parsed
Пример #48
0
 def from_xml(cls, node):
     """Build an instance of this class from an ElementTree node,
     serializing it as UTF-8 first."""
     xml_bytes = ElementTree.tostring(node, encoding='utf-8')
     return load_xmlobject_from_string(xml_bytes, cls)
Пример #49
0
def annotation_to_tei(annotation, teivol):
    '''Generate a tei note from an annotation.  Sets annotation id,
    slugified tags as ana attribute, username as resp attribute, and
    annotation content is converted from markdown to TEI.

    :param annotation: :class:`~readux.annotations.models.Annotation`
    :param teivol: :class:`~readux.books.tei.AnnotatedFacsimile` tei
        document, for converting related page ARK uris into TEI ids
    :returns: :class:`readux.books.tei.Note`
    '''
    # NOTE: annotation created/edited dates are not included here
    # because they were determined not to be relevant for our purposes

    # sample note provided by Alice
    # <note resp="JPK" xml:id="oshnp50n1" n="1"><p>This is an example note.</p></note>

    # convert markdown-formatted text content to tei
    note_content = markdown_tei.convert(annotation.text)
    # markdown results could be a list of paragraphs, and not a proper
    # xml tree; also, tags do not include a namespace.
    # wrap in a note element and set the default namespace as tei
    teinote = load_xmlobject_from_string('<note xmlns="%s">%s</note>' % \
        (teimap.TEI_NAMESPACE, note_content),
        tei.Note)

    # what id do we want? annotation uuid? url?
    teinote.id = 'annotation-%s' % annotation.id  # can't start with numeric
    teinote.href = absolutize_url(annotation.get_absolute_url())
    teinote.type = 'annotation'

    # if an annotation includes tags, reference them by slugified id in @ana
    if 'tags' in annotation.info() and annotation.info()['tags']:
        tags = ' '.join(
            set('#%s' % slugify(t.strip()) for t in annotation.info()['tags']))
        teinote.ana = tags

    # if the annotation has an associated user, mark the author
    # as responsible for the note
    if annotation.user:
        teinote.resp = annotation.user.username

    # include full markdown of the annotation, as a backup for losing
    # content converting from markdown to tei, and for easy display
    teinote.markdown = annotation.text

    # if annotation contains related pages, generate a link group
    if annotation.related_pages:
        for rel_page in annotation.related_pages:
            page_ref = tei.Ref(text=rel_page, type='related page')
            # find tei page identifier from the page ark
            target = teivol.page_id_by_xlink(rel_page)
            if target is not None:
                page_ref.target = '#%s' % target
            teinote.related_pages.append(page_ref)

    # if annotation includes citations, add them to the tei
    # NOTE: expects these citations to be TEI encoded already (generated
    # by the zotero api and added via meltdown-zotero annotator plugin)
    if annotation.extra_data.get('citations', None):
        for bibl in annotation.extra_data['citations']:
            # zotero tei export currently includes an id that is not
            # a valid ncname (contains : and /)
            bibsoup = BeautifulSoup(bibl, 'xml')
            # convert xml id into the format we want:
            # zotero-#### (zotero item id)
            for bibl_struct in bibsoup.find_all('biblStruct'):
                bibl_struct['xml:id'] = 'zotero-%s' % \
                    bibl_struct['xml:id'].split('/')[-1]

            teibibl = load_xmlobject_from_string(bibsoup.biblStruct.prettify(),
                                                 tei.BiblStruct)
            teinote.citations.append(teibibl)

    return teinote
Пример #50
0
 def from_pid(cls, pid):
     """Fetch the annotation MODS XML for *pid* and wrap it in an
     instance of this class.  Raises on any non-OK HTTP response."""
     response = requests.get(annotation_xml_url(pid))
     if not response.ok:
         raise Exception('error retrieving annotation data for %s: %s - %s' % (pid, response.status_code, response.content))
     parsed_mods = load_xmlobject_from_string(response.content, mods.Mods)
     return cls(pid=pid, mods_obj=parsed_mods)
Пример #51
0
 def test_SubordinateComponents_noseries(self):
     # simple finding aid with no series but only a container list
     simple_dsc = """<dsc><c01 level="file"/></dsc>"""
     dsc = load_xmlobject_from_string(simple_dsc, eadmap.SubordinateComponents)
     # a lone file-level c01 should not be reported as a series
     self.assertFalse(dsc.hasSeries())
Пример #52
0
 def parse(cls, xml):
     """Run the parser: build an instance of this class from an XML string."""
     return load_xmlobject_from_string(xml, cls)
Пример #53
0
    def handle(self, *paths, **options):

        if not len(paths):
            raise CommandError('Please specify path to content for import.')
        if len(paths) > 1:
            # this limitation is kind of arbitrary, but keep thing simple for now
            raise CommandError(
                'Import currently only supports a single volume.')
        path = paths[0]

        dry_run = options.get('dry_run', False)
        verbosity = options.get('verbosity', self.v_normal)

        repo = ManagementRepository()

        # make collection required to avoid accidentally forgetting it
        coll = options.get('collection', None)
        if coll is None:
            raise CommandError('Please specify collection pid')

        collection = repo.get_object(coll, type=Collection)
        if not collection.exists:
            raise CommandError('Collection %s does not exist' % coll)
        if not collection.has_requisite_content_models:
            raise CommandError('%s is not a collection' % coll)

        try:
            start = time.time()
            bag = bagit.Bag(path)
            # NOTE: could consider using fast validation, but files probably are
            # not so large or so numerous that this will be an issue
            if verbosity > self.v_normal:
                self.stdout.write('Validating bag %s' % path)
            fast_validate = options.get('fast_validate')
            bag.validate(fast=fast_validate)
            if verbosity >= self.v_normal:
                self.stdout.write(
                    'Validated %s in %.02fs %s' %
                    (path, time.time() - start,
                     '(fast validation enabled)' if fast_validate else ''))
        except bagit.BagError as err:
            # failed to load directory as a bag
            raise CommandError('Please supply a valid BagIt as input. %s' %
                               err)
        except bagit.BagValidationError as err:
            # bag is not valid
            raise CommandError('Input is not a valid bag. %s' % err)

        files = {'pdf': None, 'marcxml': None, 'dc': None}
        checksums = {}

        # this is potentially a long list, but go ahead and store since we will
        # be consulting it multiple times
        # NOTE(review): this span is the tail of a larger import method; the
        # names bound above this chunk (path, repo, collection, dry_run,
        # verbosity, files, checksums, bag) are assumed from the enclosing
        # definition — TODO confirm against the method header.
        payload_files = list(bag.payload_files())

        # identify required contents within the bag by extension and name
        for data_path in payload_files:
            # path is relative to bag root dir
            filename = os.path.join(path, data_path)

            # get extension and name
            basename = os.path.basename(filename)
            basefile, ext = os.path.splitext(basename)
            # NOTE: splitext leaves . on the ext portion

            if ext.lower() == '.pdf':
                files['pdf'] = filename
                checksums['pdf'] = bag.entries[data_path].get('md5', None)

            elif ext.lower() == '.xml':

                if basefile.lower() == 'marc':
                    files['marcxml'] = filename
                    checksums['marcxml'] = bag.entries[data_path].get(
                        'md5', None)

                elif basefile.lower() == 'dc':
                    files['dc'] = filename
                    checksums['dc'] = bag.entries[data_path].get('md5', None)

        # check that required components are present
        err = False
        # NOTE(review): dict.iteritems is Python 2 only; would need .items()
        # under Python 3
        for label, filepath in files.iteritems():
            if filepath is None:
                self.stderr.write('%s not found' % label.upper())
                err = True

            elif checksums[label] is None:
                self.stderr.write('No MD5 checksum found for %s' %
                                  label.upper())
                err = True

        if err:
            raise CommandError(
                'Cannot import without all required files and checksums.')

        # all pieces are available, so proceed with ingest

        # construct book and ingest
        if verbosity > self.v_normal:
            self.stdout.write('Creating book object with marxml %s' %
                              files['marcxml'])
        try:
            marcxml = load_xmlobject_from_file(files['marcxml'], MinMarcxml)
        except XMLSyntaxError as err:
            raise CommandError('Failed to load %s as xml: %s' %
                               (files['marcxml'], err))
        try:
            dcxml = load_xmlobject_from_file(files['dc'], DublinCore)
        except XMLSyntaxError as err:
            raise CommandError('Failed to load %s as xml: %s' %
                               (files['dc'], err))

        # look for book by ocm number first, in case a previous ingest failed
        book_pids = Book.pids_by_label(marcxml.ocm_number)
        # error if we find more than one
        if len(book_pids) > 1:
            raise CommandError('Multiple books exist with label %s. Please correct this first.' \
                                % marcxml.ocm_number)

        # if we find exactly one, use that instead of creating a new book
        elif len(book_pids) == 1:
            book = repo.get_object(book_pids[0], type=Book)
            if verbosity >= self.v_normal:
                self.stdout.write('Using existing book %s with ocm number %s' % \
                    (book.pid, marcxml.ocm_number))

        # otherwise, ingest new book
        else:
            book = repo.get_object(type=Book)
            # set book label to ocm number from the marc
            book.label = marcxml.ocm_number
            if verbosity > self.v_normal:
                self.stdout.write('Book label %s' % book.label)

            # associate with collection
            if collection is not None:
                book.collection = collection
                if verbosity > self.v_normal:
                    self.stdout.write('Associating with collection %s' %
                                      collection.short_label)
            book.marcxml.content = marcxml
            # NOTE: import checksum can't be used because xml may be serialized differently
            # book.marcxml.checksum = checksums['marcxml']
            book.dc.content = dcxml
            # NOTE: import checksum can't be used because DC is modified to add ARK
            # book.dc.checksum = checksums['dc']

            # save; bail if error
            if not dry_run:
                try:
                    saved = book.save('ingest')
                    if not saved:
                        raise CommandError(
                            'Failed to ingest book into repository')
                    if verbosity >= self.v_normal:
                        self.stdout.write('Successfully ingested book %s' \
                                    % book.pid)
                except RequestFailed as err:
                    raise CommandError('Error ingesting book: %s' % err)

        # in case of pre-existing book object, check for existing volume
        if book.volume_set:
            if len(book.volume_set) > 1:
                raise CommandError('Book %s has multiple volumes; import not supported' \
                    % book.pid)
            else:
                # use existing volume object
                vol = book.volume_set[0]
                if verbosity >= self.v_normal:
                    self.stdout.write('Using existing volume %s' % vol.pid)

        # otherwise, create new volume object
        else:
            # construct volume (v1.1), associate with book, and ingest
            if verbosity > self.v_normal:
                self.stdout.write('Creating volume with %s' % files['pdf'])
            # NOTE(review): PDF opened in text mode; binary content probably
            # warrants open(..., 'rb') — TODO confirm eulfedora expectations
            with open(files['pdf']) as pdf_file:
                vol = repo.get_object(type=VolumeV1_1)
                # set volume label to ocm number from the marc + volume number
                # for consistency with lsdi content, use ocm_v# notation
                # V.0 indicates single-volume book
                vol.label = '%s_V.0' % marcxml.ocm_number
                # set pdf content
                vol.pdf.content = pdf_file
                vol.pdf.checksum = checksums['pdf']
                # set relation to parent book object
                vol.book = book
                # minimal DC metadata derived from book metadata
                vol.dc.content.title = book.dc.content.title
                for t in book.dc.content.type_list:
                    vol.dc.content.type_list.append(t)
                vol.dc.content.format = book.dc.content.format
                vol.dc.content.language = book.dc.content.language
                vol.dc.content.rights = book.dc.content.rights

                if not dry_run:
                    try:
                        saved = vol.save('ingest')
                        if not saved:
                            # NOTE: possibly, if this fails, we should deactivate the book object
                            # but will leave that to manual processing for now
                            raise CommandError(
                                'Failed to ingest volume into repository')
                        else:
                            if verbosity >= self.v_normal:
                                self.stdout.write('Successfully ingested volume %s' \
                                    % vol.pid)
                    except RequestFailed as err:
                        raise CommandError('Error ingesting volume: %s' % err)

        #### page import

        # if volume has existing pages, bail
        if len(vol.pages):
            raise CommandError('Volume %s already has %s page%s' % \
                (vol.pid, len(vol.pages), '' if len(vol.pages) == 1 else 's'))

        # should page import happen here?
        # - identify numeric jp2/jpf files in the bag and get total count
        # - identify numeric .xml files in the bag and get total count
        # - make sure counts match up
        # Question: can we assume no start/end blank pages for now?
        # - start looping through, create page-1.1 and associate with book,
        #   and ingest
        # - set first page as primary image on the volume
        # - report number of pages ingested

        image_files = []

        # identify page files (images and ocr xml)
        for data_path in payload_files:
            # get extension and name
            basename = os.path.basename(data_path)
            basefile, ext = os.path.splitext(basename)
            if ext in ['.jp2', '.jpf']:
                image_files.append(data_path)
                # check that MD5 is present and bail if not
                # - this is probably redundant since by this point validation
                # has passed and previous content has checksums, but
                # ingest will assume checksums are available so better to error
                # *before* starting to ingest page-level content
                if bag.entries[data_path].get('md5', None) is None:
                    raise CommandError('No MD5 checksum for %s' % data_path)

        # ensure pages are sorted into page-order
        # NOTE(review): this is a lexicographic sort; correct page order relies
        # on zero-padded filenames — TODO confirm bag naming convention
        image_files.sort()

        # NOTE: disabled for now; tunebook does not appear to include alto
        # for pages with no text content
        ## find matching page ocr files
        # for imgfile in image_files:
        #     basefile, ext = os.path.splitext(imgfile)
        #     ocrfile = '%s.xml' % basefile
        #     if ocrfile not in payload_files:
        #         raise CommandError('No OCR xml page present for %s (expected %s)' % \
        #             (imgfile, ocrfile))

        # pre-generate empty xml in case we need it to force eulfedora to not
        # create ocr datastream when no ocr is present
        emptyxml = load_xmlobject_from_string('<empty/>')

        # iterate through page images and put into fedora
        pageindex = 1
        for imgfile in image_files:
            if verbosity > self.v_normal:
                # NOTE(review): Python 2 print statement; inconsistent with the
                # self.stdout.write used elsewhere in this command
                print 'Creating Page object for %s' % imgfile
            # path is relative to bag root dir
            img_filename = os.path.join(path, imgfile)

            page = repo.get_object(type=PageV1_1)
            # set page label
            page.label = '%s page %d' % (vol.label, pageindex)
            # set the relation to the volume object
            page.volume = vol
            logger.debug('Page %s volume %s' % (page.pid, page.volume.pid))
            # set a dc:title based on volume title
            page.dc.content.title = '%s page %d' % (vol.dc.content.title,
                                                    pageindex)
            # set page order
            page.page_order = pageindex

            with open(img_filename) as img_content:
                # set image content
                page.image.content = img_content
                page.image.checksum = bag.entries[imgfile]['md5']
                # assume jpeg2000 for now (only looking for jp2/jpf)
                page.image.mimetype = 'image/jp2'

                # check for ocr xml within the bag, same base name as image
                basefile, ext = os.path.splitext(imgfile)
                ocrfile = '%s.xml' % basefile

                if ocrfile in payload_files:
                    page.ocr.content = load_xmlobject_from_file(
                        os.path.join(path, ocrfile))
                    # NOTE: can't use MD5 from bag because XML may be
                    # serialized differently when sent to Fedora
                    # (unless we treat as file instead of xml...)
                    # page.ocr.checksum = bag.entries[ocrfile]['md5']

                    if verbosity > self.v_normal:
                        print 'Setting OCR for Page from %s' % ocrfile

                else:
                    # warn but do not error if ocr xml is not found
                    self.stdout.write('Warning: no OCR xml found for %s' %
                                      imgfile)
                    # explicitly set xml content to empty so eulfedora doesn't
                    # attempt to bootstrap & ingest (and error)
                    page.ocr.content = emptyxml

                if not dry_run:
                    try:
                        # for now, if any page ingest errors, bail out
                        # (unclear what would cause load to fail midway)

                        saved = page.save()

                        if not saved:
                            raise CommandError('Failed to ingest page %d into repository' \
                                % pageindex)

                    except RequestFailed as err:
                        raise CommandError('Error ingesting page %d: %s' %
                                           (pageindex, err))

            # set first page as primary image for the volume
            if not dry_run and pageindex == 1:
                vol.primary_image = page
                vol.save('adding primary image relation')

            # increase page index for next page
            pageindex += 1

        if verbosity >= self.v_normal:
            # total is pageindex - 1 since pageindex incremented at end of loop
            self.stdout.write('Created %d pages' % (pageindex - 1))
Example #54
0
    def test_update_instance(self):
        """Exercise ``XmlObjectForm.update_instance`` in three scenarios:

        1. updating a pre-existing bound instance from POSTed data,
        2. creating a brand-new instance from an unbound form, and
        3. removing children via formset DELETE flags.

        Uses the modern unittest assertion methods (``assertTrue``,
        ``assertIsInstance``, ``assertIn``/``assertNotIn``) in place of the
        long-deprecated ``assert_`` alias; the checked behavior is unchanged.
        """
        # initialize data the same way a view processing a POST would
        update_form = TestForm(self.post_data, instance=self.testobj)
        # check that form is valid - if no errors, this populates cleaned_data
        self.assertTrue(update_form.is_valid())

        instance = update_form.update_instance()
        self.assertIsInstance(instance, TestObject)
        self.assertEqual(21, instance.int)
        self.assertEqual(False, instance.bool)
        self.assertEqual('b', instance.id)
        self.assertEqual('completely new text content', instance.longtext)
        self.assertEqual(0, instance.other_child.val)

        # spot check that values were set properly in the xml
        xml = instance.serialize()
        self.assertIn('id="b"', xml)
        self.assertIn('<boolean>no</boolean>', xml)

        # test save on form with no pre-existing xmlobject instance
        class SimpleForm(XmlObjectForm):
            class Meta:
                model = TestObject
                fields = ['id', 'bool',
                          'longtext']  # fields with simple, top-level xpaths
                # creation for nested node not yet supported in xmlmap - excluding int
                exclude = ['child']  # exclude subform to simplify testing

        new_form = SimpleForm({
            'id': 'A1',
            'bool': True,
            'longtext': 'la-di-dah'
        })
        self.assertTrue(new_form.is_valid())
        instance = new_form.update_instance()
        self.assertIsInstance(
            instance, TestObject,
            "update_instance on unbound xmlobjectform returns correct xmlobject instance"
        )
        self.assertEqual(True, instance.bool)
        self.assertEqual('A1', instance.id)
        self.assertEqual('la-di-dah', instance.longtext)
        # spot check values in created-from-scratch xml
        xml = instance.serialize()
        self.assertIn('id="A1"', xml)
        self.assertIn('<boolean>yes</boolean>', xml)

        # formset deletion
        data = self.post_data.copy()
        # update post data to test deleting items
        data.update({
            'children-INITIAL_FORMS': 4,  # only initial forms can be deleted
            'children-0-DELETE': True,
            'children-2-DELETE': True,
        })
        # make a copy object, since the instance will be updated by the form
        testobj = xmlmap.load_xmlobject_from_string(self.testobj.serialize(),
                                                    TestObject)
        update_form = TestForm(data, instance=self.testobj)
        # check that form is valid - if no errors, this populates cleaned_data
        self.assertTrue(update_form.is_valid())
        instance = update_form.update_instance()
        # children 0 and 2 should be removed from the updated instance
        self.assertNotIn(testobj.children[0], instance.children)
        self.assertNotIn(testobj.children[2], instance.children)
Example #55
0
 def setUp(self):
     """Parse the shared class fixture into a fresh DublinCore object."""
     fixture_xml = self.FIXTURE
     self.dc = load_xmlobject_from_string(fixture_xml, DublinCore)
Example #56
0
 def setUp(self):
     """Build the two form fixtures used by the tests: an unbound form and
     one bound to a TestObject parsed from the fixture text."""
     # unbound: no backing xmlobject instance
     self.new_form = TestForm()
     # bound: parse the fixture, keep the object, and wrap it in a form
     parsed_obj = xmlmap.load_xmlobject_from_string(FIXTURE_TEXT, TestObject)
     self.testobj = parsed_obj
     self.update_form = TestForm(instance=parsed_obj)