Example #1
    def _query(self, base_url, qargs, response_xmlclass):
        '''Utility method: Adds required query arguments, returns response
        as a caller-specified :class:`~eulxml.xmlmap.XmlObject`. Delays if
        necessary to enforce EUtils query speed policy.
        '''
        self._enforce_query_timing()
        qargs = qargs.copy()
        if 'tool' not in qargs:
            qargs['tool'] = self.EUTILS_TOOL
        if 'email' not in qargs:
            qargs['email'] = self.EUTILS_EMAIL
        # TODO: When we start making more than one query we need to sleep to
        # avoid making more than 3 requests per second per E-Utilities
        # policies.
        qurl = base_url + urlencode(qargs)
        logger.debug('EntrezClient querying: ' + qurl)

        # use a URL validator to decide whether qurl is a remote URL or a
        # local file path; open remote resources with urllib's urlopen,
        # otherwise pass the path straight to the XML loader
        url_validator = URLValidator()
        try:
            url_validator(qurl)
            target_file = urlopen(qurl)
            # pass the open file object (not its contents) to the XML loader
            return xmlmap.load_xmlobject_from_file(target_file, xmlclass=response_xmlclass)
        except ValidationError:
            return xmlmap.load_xmlobject_from_file(qurl, xmlclass=response_xmlclass)
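For orientation, a minimal sketch of the core call all of these examples share, assuming a hypothetical local fixture path and the default XmlObject class:

    from eulxml import xmlmap

    # parse a local XML file into a generic XmlObject (path is hypothetical)
    doc = xmlmap.load_xmlobject_from_file('fixture.xml')
    print(doc.serialize(pretty=True))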
Example #2
File: models.py Project: WSULib/readux
    def test_get_fulltext(self):
        with patch.object(self.vol, 'ocr') as mockocr:
            mockocr.exists = True
            # abbyy finereader v8
            ocr_xml = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
                'abbyyocr_fr8v2.xml'))
            mockocr.content = ocr_xml

            text = self.vol.get_fulltext()
            # check for arbitrary text content
            self.assert_('In presenting this,  the initial volume of  the' in text,
                'ocr text content should be present in plain text')
            self.assert_('Now, kind reader, we ask that you do not crit' in text,
                'ocr text content should be present in plain text')
            self.assert_(re.search(r'Baldwin\s+Dellinger\s+Brice', text),
                'table row content should be displayed on a single line')

            # abbyy finereader v6
            ocr_xml = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
                'abbyyocr_fr6v1.xml'))
            mockocr.content = ocr_xml

            text = self.vol.get_fulltext()
            # check for arbitrary text content
            self.assert_('was late in the autumn, the vines yet kept their leaves,' in text,
                'ocr text content should be present in plain text')
            self.assert_('walked up the steps. The lady had not moved, and made' in text,
                'ocr text content should be present in plain text')
            self.assert_(re.search(r'Modern\.\s+New Standard\.\s+Popular\.', text),
                'table row content should be displayed on a single line')
Example #3
File: tei.py Project: saracarl/readux
 def setUp(self):
     # tei generated from mets alto
     self.alto_tei = load_xmlobject_from_file(
         os.path.join(FIXTURE_DIR, 'teifacsimile.xml'), tei.Facsimile)
     # tei generated from abbyy ocr
     self.abbyy_tei = load_xmlobject_from_file(
         os.path.join(FIXTURE_DIR, 'teifacsimile_abbyy.xml'), tei.Facsimile)
Example #4
File: tei.py Project: WSULib/readux
 def setUp(self):
     # tei generated from mets alto
     self.alto_tei = load_xmlobject_from_file(os.path.join(FIXTURE_DIR, 'teifacsimile.xml'),
         tei.Facsimile)
     # tei generated from abbyy ocr
     self.abbyy_tei = load_xmlobject_from_file(os.path.join(FIXTURE_DIR, 'teifacsimile_abbyy.xml'),
         tei.Facsimile)
Example #5
File: tests.py Project: mprefer/OpenEmory
    def setUp(self):
        search_fixture_path = self.fixture_path('esearch-response-withhist.xml')
        self.search_response = xmlmap.load_xmlobject_from_file(search_fixture_path,
                xmlclass=ESearchResponse)

        fetch_fixture_path = self.fixture_path('efetch-retrieval-from-hist.xml')
        self.fetch_response = xmlmap.load_xmlobject_from_file(fetch_fixture_path,
                xmlclass=EFetchResponse)

        self.mock_client = Mock(spec=EntrezClient)
Example #6
    def setUp(self):
        search_fixture_path = self.fixture_path(
            'esearch-response-withhist.xml')
        self.search_response = xmlmap.load_xmlobject_from_file(
            search_fixture_path, xmlclass=ESearchResponse)

        fetch_fixture_path = self.fixture_path(
            'efetch-retrieval-from-hist.xml')
        self.fetch_response = xmlmap.load_xmlobject_from_file(
            fetch_fixture_path, xmlclass=EFetchResponse)

        self.mock_client = Mock(spec=EntrezClient)
Example #7
 def setUp(self):
     super(HarvestRecordTest, self).setUp()
     article_fixture_path = fixture_path('efetch-retrieval-from-hist.xml')
     self.fetch_response = xmlmap.load_xmlobject_from_file(
         article_fixture_path, xmlclass=EFetchResponse)
     # one corresponding author with an emory email
     self.article = self.fetch_response.articles[0]
Example #8
File: annotate.py Project: saracarl/readux
    def test_consolidate_bibl(self):
        teidoc = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
                                                       'teifacsimile.xml'),
                                          tei.AnnotatedFacsimile)
        teinote = annotation_to_tei(self.zotero_note, teidoc)
        teidoc.annotations.append(teinote)
        consolidate_bibliography(teidoc)

        self.assertEqual(2, len(teidoc.citations),
            'annotation citations should be present in main document bibl')
        teinote = teidoc.annotations[0]
        self.assertEqual(0, len(teinote.citations),
            'citations should not be present on individual annotation')
        self.assertEqual(None, teinote.works_cited)
        self.assertEqual(None, teinote.zotero_items)
        self.assertEqual(None, teinote.works_cited_milestone)
        teinote_xml = teinote.serialize()
        self.assertFalse('<item><anchor xml:id="zotero-' in teinote_xml)
        self.assertFalse('<listBibl/>' in teinote_xml)

        # repeated zotero ids should only appear once in document bibl
        # load the same note and add it again
        teinote = annotation_to_tei(self.zotero_note, teidoc)
        teidoc.annotations.append(teinote)
        consolidate_bibliography(teidoc)
        self.assertEqual(2, len(teidoc.citations),
            'citations repeated in annotations should only appear once')
Example #9
File: symp.py Project: alexBLR/OpenEmory
    def language(self):
        '''
        wrapper around the field that chooses the preferred source
        :returns: a tuple containing the language code and name
        '''
        marc_languages_xml = 'http://www.loc.gov/standards/codelists/languages.xml'
        langs = xmlmap.load_xmlobject_from_file(marc_languages_xml)

        ns = {'lang':'info:lc/xmlns/codelist-v1'}

        if self.wos and self.wos.language:
            lang = self.wos.language
        elif self.scopus and self.scopus.language:
            lang = self.scopus.language
        elif self.pubmed and self.pubmed.language:
            lang = self.pubmed.language
        elif self.crossref and self.crossref.language:
            lang = self.crossref.language
        elif self.arxiv and self.arxiv.language:
            lang = self.arxiv.language
        elif self.repec and self.repec.language:
            lang = self.repec.language
        elif self.dblp and self.dblp.language:
            lang = self.dblp.language
        else:
            lang = ''

        nodes = langs.node.xpath("//lang:language[lang:name='%s' or lang:code='%s']" % (lang, lang), namespaces=ns)
        if nodes:
            return (nodes[0].findtext('lang:code', namespaces=ns), nodes[0].findtext('lang:name', namespaces=ns))

        else:
            return ('', '')
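The XPath lookup above can be exercised on its own; a sketch under the same code-list URL and namespace, using 'eng' as a sample language value:

    from eulxml import xmlmap

    langs = xmlmap.load_xmlobject_from_file(
        'http://www.loc.gov/standards/codelists/languages.xml')
    ns = {'lang': 'info:lc/xmlns/codelist-v1'}
    # match either the code or the human-readable name
    nodes = langs.node.xpath(
        "//lang:language[lang:name='%s' or lang:code='%s']" % ('eng', 'eng'),
        namespaces=ns)
    if nodes:
        print(nodes[0].findtext('lang:code', namespaces=ns),
              nodes[0].findtext('lang:name', namespaces=ns))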
Example #10
def test_can_serialize_xsd300_ds():
    from eulxml.xmlmap import load_xmlobject_from_file

    with open(os.path.join(FIXTURE_ROOT, '1023796ar.xml')) as f:
        xmlobject = load_xmlobject_from_file(f)
        serialized_object = serializers._xsd300_serializer(xmlobject)
        assert type(serialized_object) == bytes
Example #11
 def setUp(self):
     #load the three xml doc objects
     self.docs = dict()
     for file in self.FIXTURES:
         filebase = file.split('.')[0]
         self.docs[filebase] = xmlmap.load_xmlobject_from_file(
             path.join(exist_fixture_path, file), TestDocTitle)
Example #12

def process(spreadsheet, xml_files_dir, sheet=1, control_row=None, force_dates=False,
        object_type='parent', input_encoding='utf8', copy_parent_to_children=False):
    '''Function to go through all the data and process it.'''
    #make sure we have a directory to put the mods files in
    os.makedirs(xml_files_dir, exist_ok=True)
    data_handler = DataHandler(spreadsheet, sheet=sheet, control_row=control_row, force_dates=force_dates,
            object_type=object_type, input_encoding=input_encoding)
    index = 1
    for record in data_handler.get_xml_records():
        filename = '%s.%s.xml' % (record.xml_id, record.record_type)
        full_path = os.path.join(xml_files_dir, filename)
        if os.path.exists(full_path):
            raise DataError('%s file already exists from previous record! Possible duplicate %s IDs?' % (filename, record.xml_id))
        if copy_parent_to_children:
            #load parent mods object if desired (& it exists)
            parent_filename = os.path.join(xml_files_dir, u'%s.%s' % (record.group_id, record.record_type))
            parent_xml = None
            if os.path.exists(parent_filename):
                parent_xml = load_xmlobject_from_file(parent_filename, mods.Mods)
                mapper = Mapper(record.record_type, record.field_data(), parent_mods=parent_xml)
        else:
            mapper = Mapper(record.record_type, record.field_data())
        xml_obj = mapper.get_xml()
        xml_bytes = xml_obj.serializeDocument(pretty=True) #serializes as UTF-8
        with open(full_path, 'wb') as f:
            f.write(xml_bytes)
        index = index + 1
Example #13
    def test_can_retrieve_xml_of_existing_articles(self, mock_ds, mock_pdf):

        with open(os.path.join(FIXTURE_ROOT, '1023796ar.xml'), 'r') as f:
            from eulxml.xmlmap import load_xmlobject_from_file
            mock_pdf.content = load_xmlobject_from_file(f)
        mock_ds = ['ERUDITXSD300', ]  # noqa

        issue = IssueFactory.create(
            journal=self.journal, year=2010,
            date_published=dt.datetime.now() - dt.timedelta(days=1000))
        article = ArticleFactory.create(issue=issue)
        journal_id = self.journal.localidentifier
        issue_id = issue.localidentifier
        article_id = article.localidentifier
        url = reverse('public:journal:article_raw_xml', args=(
            journal_id, issue.volume_slug, issue_id, article_id
        ))
        request = self.factory.get(url)
        request.user = AnonymousUser()
        request.subscription = None

        # Run
        response = ArticleXmlView.as_view()(
            request, journal_code=journal_id, issue_slug=issue.volume_slug, issue_localid=issue_id,
            localid=article_id)

        # Check
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/xml')
Example #14
File: tests.py Project: mprefer/OpenEmory
 def setUp(self):
     super(HarvestRecordTest, self).setUp()
     article_fixture_path = fixture_path('efetch-retrieval-from-hist.xml')
     self.fetch_response = xmlmap.load_xmlobject_from_file(article_fixture_path,
                                                           xmlclass=EFetchResponse)
     # one corresponding author with an emory email
     self.article = self.fetch_response.articles[0]
Example #15
    def test_rdf_type(self):
        # not enough information to determine type
        self.assertEqual(None, self.c1.rdf_type)
        # infer book, article, etc from title attributes
        self.assertEqual('bibo:Book', self.c3.rdf_type)
        self.assertEqual('bibo:Article', self.c4.rdf_type)

        # type inferred based on series; requires access to series, so load from fixtures
        # - bailey findingaid contains printed material, photographs, and audiovisual
        bailey = load_xmlobject_from_file(path.join(exist_fixture_path, 'bailey807.xml'),
            FindingAid)

        # patch in unittitles so it looks as though items have semantic data
        with patch('findingaids.fa.models.Series.unittitle_titles', new=[Title()]):

            # series 4 is printed material
            self.assertEqual('bibo:Document', bailey.dsc.c[3].c[0].rdf_type,
                'items in printed materials series should default to document type')

            # series 5 is photographs
            self.assertEqual('bibo:Image', bailey.dsc.c[4].c[0].rdf_type,
                'items in photograph series should default to image type')

            # series 9 is audiovisual
            self.assertEqual('bibo:AudioVisualDocument', bailey.dsc.c[8].c[0].rdf_type,
                'items in audiovisual series should default to audiovisualdocument type')

            # fallback type is manuscript
            self.assertEqual('bibo:Manuscript', bailey.dsc.c[0].c[0].rdf_type,
                'items without an identifiable series type should default to manuscript')
Example #16
    def add_xml_datastream(self, xml_path, ds_id, label, control_group,
                           mimetype, checksum_type):
        """Add XML object."""
        xml_object = xmlmap.load_xmlobject_from_file(xml_path)

        if checksum_type == "SHA-512":
            checksum = self.generate_checksum(xml_path)

        else:
            checksum = None
            logging.warning(
                "Unable to generate checksum for specified type: {0}".format(
                    checksum_type))

        logging.info("----adding datastream {0}: {1}".format(ds_id, label))

        new_datastream = DatastreamObject(self.obj,
                                          ds_id,
                                          label,
                                          mimetype=mimetype,
                                          control_group=control_group,
                                          checksum_type=checksum_type,
                                          checksum=checksum)

        new_datastream.content = xml_object
        new_datastream.label = label
        new_datastream.save()
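A hedged usage sketch for the method above; the instance name, file path, datastream id, and label are all hypothetical:

    # attach a MODS record as an inline XML (X) datastream with a SHA-512 checksum
    loader.add_xml_datastream('mods.xml', 'MODS', 'MODS metadata',
                              control_group='X', mimetype='text/xml',
                              checksum_type='SHA-512')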
Example #17

    def setUp(self):

        # load the three xml issue objects
        self.issue = dict()
        for file in self.FIXTURES:
            filebase = file.split('.')[0]
            self.issue[filebase] = xmlmap.load_xmlobject_from_file(path.join(exist_fixture_path, file), TestIssue)
Example #18
def process(dataHandler, copy_parent_to_children=False):
    '''Function to go through all the data and process it.'''
    #get dicts of columns that should be mapped & where they go in MODS
    index = 1
    for record in dataHandler.get_mods_records():
        filename = record.mods_filename
        if os.path.exists(os.path.join(MODS_DIR, filename)):
            raise Exception('%s already exists!' % filename)
        logger.info('Processing row %d to %s.' % (index, filename))
        if copy_parent_to_children:
            #load parent mods object if desired (& it exists)
            parent_filename = os.path.join(MODS_DIR, record.parent_mods_filename)
            parent_mods = None
            if os.path.exists(parent_filename):
                parent_mods = load_xmlobject_from_file(parent_filename, mods.Mods)
                mapper = Mapper(parent_mods=parent_mods)
        else:
            mapper = Mapper()
        for field in record.field_data():
            mapper.add_data(field['mods_path'], field['data'])
        mods_obj = mapper.get_mods()
        mods_data = unicode(mods_obj.serializeDocument(pretty=True), 'utf-8')
        with codecs.open(os.path.join(MODS_DIR, filename), 'w', 'utf-8') as f:
            f.write(mods_data)
        index = index + 1
Example #19

 def setUp(self):
     # load the fixture file as a generic tei document
     self.tei = xmlmap.load_xmlobject_from_file(self.simmons_xml,
                                                teimap.Tei)
     # find the first groupsheet via xpath and load
     groups = self.tei.node.xpath('//t:text/t:group/t:group',
                                  namespaces={'t': teimap.TEI_NAMESPACE})
     self.groupsheet = TeiGroupSheet(groups[0])
Example #20
def lsdibag():
    # create and return a LsdiBaggee object to use in tests
    digwf_item_response = os.path.join(FIXTURE_DIR, 'digwf_getitems_3031.xml')
    response = load_xmlobject_from_file(digwf_item_response, digwf.Items)
    # update path to use local fixture for marc xml
    item = response.items[0]
    item.marc_path = os.path.join(FIXTURE_DIR, 'ocm08951025_MRC.xml')
    return LsdiBaggee(item)
Example #21
 def set_attr_xml_content(self, attr, path):
     """Add xml content to datastream."""
     # parse the file into an XmlObject to use as the datastream content
     xml_object = xmlmap.load_xmlobject_from_file(path)
     if attr == "dc":
         self.set_attr(attr + ".content", xml_object, sub_attr="dc")
     else:
         self.set_attr(attr, xml_object, sub_attr="content")
Example #22
def xml(request):
  "Display xml of a single issue."
  try:
    doc = xmlmap.load_xmlobject_from_file(filename=os.path.join(settings.BASE_DIR, 'static', 'xml', 'luther_text.xml'))
  except Exception:
    raise Http404
  tei_xml = doc.serializeDocument(pretty=True)
  return HttpResponse(tei_xml, content_type='application/xml')
Example #23
def test_no_content():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    # this file has text content
    assert not tei.no_content()

    # if we delete the lines and labels, it does not
    tei.lines = []
    tei.labels = []
    assert tei.no_content()
Example #24
File: export.py Project: saracarl/readux
    def setUp(self):
        self.vol = Volume(Mock())  # use a real volume, but Mock for api
        self.vol.pid = 'testvol:123'
        self.tei = load_xmlobject_from_file(
            os.path.join(FIXTURE_DIR, 'teifacsimile.xml'), AnnotatedFacsimile)

        self.tmpdir = tempfile.mkdtemp(prefix='rdx-export-test')
        # for now, use defaults for page one, callback, images
        self.exporter = VolumeExport(self.vol, self.tei)
Example #25
def test_fields():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    assert tei.pgpid == 968
    # should have text, lines, and labels
    assert tei.text
    assert tei.lines
    assert tei.labels
    assert len(tei.labels) == 4
    assert tei.source_authors == ["Gil"]
Example #26
    def from_file(cls, file_path, validate=True):
        """ Creates a Python object from a XML file

        :param file_path: Path to the XML file
        :param validate: XML should be validated against the embedded XSD definition
        :type validate: Boolean
        :returns: the Python object
        """
        return xmlmap.load_xmlobject_from_file(file_path, xmlclass=cls, validate=validate)
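A usage sketch for this classmethod, assuming a hypothetical XmlObject subclass named Record that defines it:

    # parse record.xml into a Record instance, skipping XSD validation
    record = Record.from_file('record.xml', validate=False)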
Example #27
    def setUp(self):
        self.vol = Volume(Mock())   # use a real volume, but Mock for api
        self.vol.pid = 'testvol:123'
        self.tei = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
                                                         'teifacsimile.xml'),
                                            AnnotatedFacsimile)

        self.tmpdir = tempfile.mkdtemp(prefix='rdx-export-test')
        # for now, use defaults for page one, callback, images
        self.exporter = VolumeExport(self.vol, self.tei)
Example #28
def update_999a(path, kdip_id, enumcron):
    """
    Method to update the 999a MARC field if/when it is changed
    in the database.
    """
    marc_file = '%s/%s/marc.xml' %(path, kdip_id)
    marc = load_xmlobject_from_file(marc_file, models.Marc)
    marc.tag_999a = enumcron
    with open(marc_file, 'w') as marcxml:
        marcxml.write(marc.serialize(pretty=True))
Example #29
File: annotate.py Project: saracarl/readux
    def test_annotation_to_tei(self):
        teidoc = load_xmlobject_from_file(os.path.join(FIXTURE_DIR, 'teifacsimile.xml'),
            tei.AnnotatedFacsimile)

        note = Annotation(text="Here's the thing", quote="really",
            extra_data=json.dumps({'sample data': 'foobar',
                'tags': ['test', 'one', 'two']}))

        teinote = annotation_to_tei(note, teidoc)
        self.assert_(isinstance(teinote, tei.Note))
        self.assertEqual('annotation-%s' % note.id, teinote.id)
        self.assert_(teinote.href.endswith(note.get_absolute_url()))
        self.assertEqual(note.text, teinote.paragraphs[0])

        # todo: add a schema validation once we get the output to be valid
        # teidoc.schema_valid()
        # access errors with teidoc.schema_validation_errors()

        # annotation user should be set as note response
        user = get_user_model()(username='******')
        user.save()
        note.user = user
        teinote = annotation_to_tei(note, teidoc)
        self.assertEqual(user.username, teinote.resp)

        # tags should be set as interp ids ana attribute
        for tag in note.info()['tags']:
            self.assert_('#%s' % tag in teinote.ana)

        # test that markdown formatting is coming through
        footnote = '''Footnotes[^1] have a label and content.

[^1]: This is some footnote content.'''
        note.text = footnote
        teinote = annotation_to_tei(note, teidoc)
        self.assert_('<ref target="#fn1" type="noteAnchor">1</ref>' in
            teinote.serialize())

        # markdown should be included in a code element
        self.assertEqual(note.text, teinote.markdown)

        # related page references
        rel_pages = [
            'http://testpid.co/ark:/1234/11',
            'http://testpid.co/ark:/1234/22',
            'http://testpid.co/ark:/1234/qq'
        ]
        note.extra_data = json.dumps({'related_pages': rel_pages})
        teinote = annotation_to_tei(note, teidoc)
        self.assertEqual(len(rel_pages), len(teinote.related_pages))
        # first ark has a corresponding id in the fixture, should be converted
        self.assertEqual('#%s' % teidoc.page_id_by_xlink(rel_pages[0]),
            teinote.related_pages[0].target)
        for idx in range(len(rel_pages)):
            self.assertEqual(rel_pages[idx], teinote.related_pages[idx].text)
Example #30
File: models.py Project: WSULib/readux
    def test_ocr_ids(self):
        # patch in fixture ocr content
        with patch.object(self.vol, 'ocr') as mockocr:
            mockocr.exists = True
            ocr_xml = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
                'abbyyocr_fr8v2.xml'))
            mockocr.content = ocr_xml

            self.assertFalse(self.vol.ocr_has_ids)
            self.vol.add_ocr_ids()
            self.assertTrue(self.vol.ocr_has_ids)
Example #31
    def test_ocr_ids(self):
        # patch in fixture ocr content
        with patch.object(self.vol, 'ocr') as mockocr:
            mockocr.exists = True
            ocr_xml = load_xmlobject_from_file(
                os.path.join(FIXTURE_DIR, 'abbyyocr_fr8v2.xml'))
            mockocr.content = ocr_xml

            self.assertFalse(self.vol.ocr_has_ids)
            self.vol.add_ocr_ids()
            self.assertTrue(self.vol.ocr_has_ids)
Example #32
    def from_file(cls, file_path, validate=True):
        """ Creates a Python object from a XML file

        :param file_path: Path to the XML file
        :param validate: XML should be validated against the embedded XSD definition
        :type validate: Boolean
        :returns: the Python object
        """
        return xmlmap.load_xmlobject_from_file(file_path,
                                               xmlclass=cls,
                                               validate=validate)
Example #33
def test_text_to_plaintext_longlines():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    # replace the text of the last line with an excessively long line
    # - because the xmlobject isn't configured with an eye to updates,
    #   update the lxml node text directly
    tei.lines[-1].node.text = "superlongline" * 100
    plaintext = tei.text_to_plaintext()
    plaintext_lines = plaintext.split("\n")
    # line is slightly more than 100 because of ltr/rtl marks & line number
    # but should NOT be padded to match the superlongline
    assert len(plaintext_lines[1]) < 110
Example #34
def test_text_to_plaintext():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    plaintext = tei.text_to_plaintext()
    assert plaintext.count("\n") == 43
    # two section breaks
    assert plaintext.count("\n\n") == 4
    # includes labels
    assert "Right Margin" in plaintext
    assert "מא" in plaintext
    assert "الحسن بن ابرهيم" in plaintext
    # includes line numbers and ltr/rtl marks
    assert ("\u200f        כתאבי אטאל אללה בקא מולי אלשיך ואדאם \u200e   1\n"
            in plaintext)
Example #35
    def test_items_xml(self):
        # basic inspection of sample result / xml mapping
        response = load_xmlobject_from_file(self.item_response, digwf.Items)
        assert response.count == 1
        assert len(response.items) == 1
        assert isinstance(response.items[0], digwf.Item)

        item = response.items[0]
        assert item.pid == '7svgb'
        assert item.item_id == '3031'
        assert item.control_key == 'ocm08951025'
        assert item.display_image_path == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output'
        assert item.display_image_count == 2218
        assert item.ocr_file_path == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output'
        assert item.ocr_file_count == 2218
        assert item.pdf == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output/Output.pdf'
        assert item.marc_path == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/ocm08951025_MRC.xml'
        assert item.ocr_file == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output/Output.xml'
        assert item.collection_id == 10
        assert item.collection_name == 'Atlanta City Directories'

        response = load_xmlobject_from_file(self.empty_response, digwf.Items)
        assert response.count == 0
Example #36
File: annotate.py Project: saracarl/readux
    def test_annotation_citation_to_tei(self):
        teidoc = load_xmlobject_from_file(os.path.join(FIXTURE_DIR, 'teifacsimile.xml'),
            tei.AnnotatedFacsimile)

        teinote = annotation_to_tei(self.zotero_note, teidoc)
        # print teinote.serialize(pretty=True)
        # number of citations should match
        self.assertEqual(len(self.zotero_note.extra_data['citations']),
                         len(teinote.citations))
        # minimal inspection to check that values carried through as expected
        self.assertEqual('webpage', teinote.citations[0].type)
        self.assertEqual('journalArticle', teinote.citations[1].type)

        self.assertEqual('zotero-7CBCH6E8', teinote.citations[0].id)
        self.assertEqual('zotero-MUXAEE89', teinote.citations[1].id)
Example #37
    def test_get_fulltext(self):
        with patch.object(self.vol, 'ocr') as mockocr:
            mockocr.exists = True
            # abbyy finereader v8
            ocr_xml = load_xmlobject_from_file(
                os.path.join(FIXTURE_DIR, 'abbyyocr_fr8v2.xml'))
            mockocr.content = ocr_xml

            text = self.vol.get_fulltext()
            # check for arbitrary text content
            self.assert_(
                'In presenting this,  the initial volume of  the' in text,
                'ocr text content should be present in plain text')
            self.assert_(
                'Now, kind reader, we ask that you do not crit' in text,
                'ocr text content should be present in plain text')
            self.assert_(
                re.search(r'Baldwin\s+Dellinger\s+Brice', text),
                'table row content should be displayed on a single line')

            # abbyy finereader v6
            ocr_xml = load_xmlobject_from_file(
                os.path.join(FIXTURE_DIR, 'abbyyocr_fr6v1.xml'))
            mockocr.content = ocr_xml

            text = self.vol.get_fulltext()
            # check for arbitrary text content
            self.assert_(
                'was late in the autumn, the vines yet kept their leaves,'
                in text, 'ocr text content should be present in plain text')
            self.assert_(
                'walked up the steps. The lady had not moved, and made'
                in text, 'ocr text content should be present in plain text')
            self.assert_(
                re.search(r'Modern\.\s+New Standard\.\s+Popular\.', text),
                'table row content should be displayed on a single line')
Example #38
        def mock_load(url, xmlclass):
            '''mock-like method wrapping load_xmlobject_from_file without
            actually making a network query, but still calling the requested
            xmlclass constructor.
            '''
            # figure out what fixture to return
            fixture = (mock_load.return_fixtures[mock_load.call_count]
                       if mock_load.call_count < len(mock_load.return_fixtures)
                       else mock_load.return_fixtures[-1])

            mock_load.call_count += 1
            mock_load.urls.append(url)
            test_response_path = fixture_path(fixture)
            test_response_obj = xmlmap.load_xmlobject_from_file(
                test_response_path, xmlclass=xmlclass)
            return test_response_obj
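Because the wrapper keeps its bookkeeping on the function object itself, a test has to prime those attributes and swap the wrapper in before use; a sketch using fixture names from the tests above, with patch assumed to come from unittest.mock:

        # prime the attributes mock_load consults before patching it in
        mock_load.return_fixtures = ['esearch-response-withhist.xml',
                                     'efetch-retrieval-from-hist.xml']
        mock_load.call_count = 0
        mock_load.urls = []
        with patch.object(xmlmap, 'load_xmlobject_from_file', new=mock_load):
            pass  # exercise the client code under test here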
Example #39
File: tests.py Project: mprefer/OpenEmory
        def mock_load(url, xmlclass):
            '''mock-like method wrapping load_xmlobject_from_file without
            actually making a network query, but still calling the requested
            xmlclass constructor.
            '''
            # figure out what fixture to return
            fixture = (mock_load.return_fixtures[mock_load.call_count]
                       if mock_load.call_count < len(mock_load.return_fixtures)
                       else mock_load.return_fixtures[-1])

            mock_load.call_count += 1
            mock_load.urls.append(url)
            test_response_path = fixture_path(fixture)
            test_response_obj = xmlmap.load_xmlobject_from_file(test_response_path,
                    xmlclass=xmlclass)
            return test_response_obj
Example #40
    def init_xml_object(self):
        '''Initialize an xmlobject based on user-specified arguments
        for filename and type.  Returns an instance of the
        appropriate :class:`~eulxml.xmlmap.XmlObject`, or displays
        an error message if the document could not be parsed as XML.'''

        if self.args.input == 'ead':
            xmlobj_class = EAD
        elif self.args.input == 'tei':
            xmlobj_class = Tei

        try:
            return load_xmlobject_from_file(self.args.filename, xmlobj_class)
        except Exception as err:
            print('Error loading %s as XML: %s' % (self.args.filename, err))
            exit(-1)
Example #41
    def init_xml_object(self):
        '''Initialize an xmlobject based on user-specified arguments
        for filename and type.  Returns an instance of the
        appropriate :class:`~eulxml.xmlmap.XmlObject`, or displays
        an error message if the document could not be parsed as XML.'''

        if self.args.input == 'ead':
            xmlobj_class = EAD
        elif self.args.input == 'tei':
            xmlobj_class = Tei

        try:
            return load_xmlobject_from_file(self.args.filename, xmlobj_class)
        except Exception as err:
            print('Error loading %s as XML: %s' % (self.args.filename, err))
            exit(-1)
Example #42
def test_html():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    html = tei.text_to_html()
    # should result in 3 sections
    assert html.count("<section>") == 3
    assert "<h1>Right Margin</h1>" in html
    assert "<li value='1'>מא</li>" in html
    # three different lines that are # 1
    assert html.count("<li value='1'>") == 3

    # check that the last line / last block is included
    assert "<li value='6'>الحسن بن ابرهيم</li>" in html

    # assert that missing line number does not result in a line number of "None"
    assert "<li value='None'>" not in html
    assert "<li value=''>" not in html
Example #43
    def test_page_index_data(self, mockzipfile):
        mockzip_obj = mockzipfile.return_value.__enter__.return_value
        page_files = ['0001.txt', '00002.txt']
        mockzip_obj.namelist.return_value = page_files
        # simulate reading zip file contents
        contents = ('page content for one', 'hello! pshaw! what?')
        mockzip_obj.open.return_value.__enter__.return_value \
            .read.return_value.decode.side_effect = contents

        work = DigitizedWork(source_id='chi.79279237')

        # page data comes from mets
        mets = load_xmlobject_from_file(self.metsfile, hathi.MinimalMETS)
        with patch.object(DigitizedWork, 'hathi') as mock_hathiobj:
            mock_hathiobj.zipfile_path.return_value = '/path/to/79279237.zip'
            mock_hathiobj.metsfile_path.return_value = self.metsfile
            mock_hathiobj.content_dir = 'data'

            page_data = work.page_index_data()
            assert isinstance(page_data, types.GeneratorType)

            for i, data in enumerate(page_data):
                mets_page = mets.structmap_pages[i]
                assert data['id'] == '.'.join([work.source_id, mets_page.text_file.sequence])
                assert data['source_id'] == work.source_id
                assert data['content'] == contents[i]
                assert data['order'] == mets_page.order
                assert data['item_type'] == 'page'
                assert data['label'] == mets_page.display_label
                assert 'tags' in data
                assert data['tags'] == mets_page.label.split(', ')

            # not suppressed, but no data available
            mock_hathiobj.metsfile_path.side_effect = \
                storage_exceptions.ObjectNotFoundException
            # should log an error, not currently tested
            assert not list(work.page_index_data())

        # if item is suppressed - no page data
        work.status = DigitizedWork.SUPPRESSED
        assert not list(work.page_index_data())

        # non hathi item - no page data
        nonhathi_work = DigitizedWork(source=DigitizedWork.OTHER)
        assert not list(nonhathi_work.page_index_data())
Example #44
def process(spreadsheet,
            xml_files_dir,
            sheet=1,
            control_row=None,
            force_dates=False,
            object_type='parent',
            input_encoding='utf8',
            copy_parent_to_children=False):
    '''Function to go through all the data and process it.'''
    #make sure we have a directory to put the mods files in
    os.makedirs(xml_files_dir, exist_ok=True)
    data_handler = DataHandler(spreadsheet,
                               sheet=sheet,
                               control_row=control_row,
                               force_dates=force_dates,
                               object_type=object_type,
                               input_encoding=input_encoding)
    index = 1
    for record in data_handler.get_xml_records():
        filename = '%s.%s.xml' % (record.xml_id, record.record_type)
        full_path = os.path.join(xml_files_dir, filename)
        if os.path.exists(full_path):
            raise DataError(
                '%s file already exists from previous record! Possible duplicate %s IDs?'
                % (filename, record.xml_id))
        if copy_parent_to_children:
            #load parent mods object if desired (& it exists)
            parent_filename = os.path.join(
                xml_files_dir,
                u'%s.%s' % (record.group_id, record.record_type))
            parent_xml = None
            if os.path.exists(parent_filename):
                parent_xml = load_xmlobject_from_file(parent_filename,
                                                      mods.Mods)
                mapper = Mapper(record.record_type,
                                record.field_data(),
                                parent_mods=parent_xml)
        else:
            mapper = Mapper(record.record_type, record.field_data())
        xml_obj = mapper.get_xml()
        xml_bytes = xml_obj.serializeDocument(
            pretty=True)  #serializes as UTF-8
        with open(full_path, 'wb') as f:
            f.write(xml_bytes)
        index = index + 1
Example #45
File: web_export.py Project: WSULib/readux
    def handle(self, *args, **options):
        repo = Repository()
        for pid in options['pid']:
            vol = repo.get_object(pid, type=Volume)
            if options['tei']:
                tei = load_xmlobject_from_file(options['tei'], Facsimile)
            else:
                tei = annotate.annotated_tei(vol.generate_volume_tei(),
                    vol.annotations())
            try:
                zipfile = export.website(vol, tei)
            except export.ExportException as err:
                raise CommandError(err)

            zipfilename = '%s-annotated-site.zip' % vol.noid
            shutil.copyfile(zipfile.name, zipfilename)

            print('Export for %s complete, zipfile is %s' % (vol.noid, zipfilename))
Example #46
File: entrez.py Project: mprefer/OpenEmory
 def _query(self, base_url, qargs, response_xmlclass):
     '''Utility method: Adds required query arguments, returns response
     as a caller-specified :class:`~eulxml.xmlmap.XmlObject`. Delays if
     necessary to enforce EUtils query speed policy.
     '''
     self._enforce_query_timing()
     qargs = qargs.copy()
     if 'tool' not in qargs:
         qargs['tool'] = self.EUTILS_TOOL
     if 'email' not in qargs:
         qargs['email'] = self.EUTILS_EMAIL
     # TODO: When we start making more than one query we need to sleep to
     # avoid making more than 3 requests per second per E-Utilities
     # policies.
     qurl = base_url + urlencode(qargs)
     logger.debug('EntrezClient querying: ' + qurl)
     return xmlmap.load_xmlobject_from_file(qurl,
             xmlclass=response_xmlclass)
Example #47
 def _query(self, base_url, qargs, response_xmlclass):
     '''Utility method: Adds required query arguments, returns response
     as a caller-specified :class:`~eulxml.xmlmap.XmlObject`. Delays if
     necessary to enforce EUtils query speed policy.
     '''
     self._enforce_query_timing()
     qargs = qargs.copy()
     if 'tool' not in qargs:
         qargs['tool'] = self.EUTILS_TOOL
     if 'email' not in qargs:
         qargs['email'] = self.EUTILS_EMAIL
     # TODO: When we start making more than one query we need to sleep to
     # avoid making more than 3 requests per second per E-Utilities
     # policies.
     qurl = base_url + urlencode(qargs)
     logger.debug('EntrezClient querying: ' + qurl)
     return xmlmap.load_xmlobject_from_file(qurl,
                                            xmlclass=response_xmlclass)
Example #48
    def handle(self, *args, **options):
        repo = Repository()
        for pid in options['pid']:
            vol = repo.get_object(pid, type=Volume)
            if options['tei']:
                tei = load_xmlobject_from_file(options['tei'], Facsimile)
            else:
                tei = annotate.annotated_tei(vol.generate_volume_tei(),
                                             vol.annotations())
            try:
                zipfile = export.website(vol, tei)
            except export.ExportException as err:
                raise CommandError(err)

            zipfilename = '%s-annotated-site.zip' % vol.noid
            shutil.copyfile(zipfile.name, zipfilename)

            print('Export for %s complete, zipfile is %s' % (vol.noid,
                                                             zipfilename))
Example #49
def create_ht_marc(kdip):

    if isinstance(kdip, basestring):
        barcode = kdip
    else:
        barcode = kdip.kdip_id

    record = load_bib_record(barcode)
    cleanup_035s(record)
    remove_most_999_fields(record, barcode)
    transform_035(record)

    marc_file = '%s/%s/marc.xml' % (settings.KDIP_DIR, barcode)

    # Write the marc.xml to disk.
    with open(marc_file, 'w') as marcxml:
        # When we insert the 035 field, an empty datafield is inserted
        # at the bottom, so we get rid of that.
        marcxml.write(re.sub(r'<datafield/>\n', '', record.serialize(pretty=True)))

    return load_xmlobject_from_file(marc_file, models.Marc)
Example #50
    def as_publication_article(self, repo=None):
        '''Initialize (but do not save) a new
        :class:`~openemory.publication.models.Article` instance based
        on harvested record information and Article XML.

        :param repo: optional; pass in an existing
           :class:`eulfedora.server.Repository` object initialized
           with the desired credentials

        :returns: unsaved :class:`~openemory.publication.models.Article`
        '''
        if repo is None:
            repo = Repository()
        article = repo.get_object(type=Article)
        # using comma-delimited usernames to indicate object has multiple owners
        # should work with existing XACML owner policy;
        # for more detail, see https://jira.duraspace.org/browse/FCREPO-82
        article.owner = ', '.join(auth.username for auth in self.authors.all())
        # VERY preliminary, minimal metadata mapping 
        article.label = self.title
        article.dc.content.title = self.title
        article.dc.content.creator_list.extend([auth.get_full_name()
                                                for auth in self.authors.all()])
        article.dc.content.identifier_list.extend([self.access_url,
                                               'PMC%d' % self.pmcid])

        # set the XML article content as the contentMetadata datastream
        # - record content is a file field with a read method, which should be
        #   handled correctly by eulfedora for ingest
        if hasattr(self.content, 'read'):
            article.contentMetadata.content = load_xmlobject_from_file(self.content, NlmArticle)

        if article.contentMetadata.content:
            article.descMetadata.content = article.contentMetadata.content.as_article_mods()

            
        # FIXME: datastream checksum!
        # TODO: format uri for this datastream ? 

        return article
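A hedged sketch of the call pattern the docstring describes; the record variable and the ingest log message are hypothetical:

        # build the article from a harvested record and ingest it
        article = record.as_publication_article(repo=repo)
        article.save('ingest harvested article')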
Example #51
File: models.py Project: mprefer/OpenEmory
    def as_publication_article(self, repo=None):
        '''Initialize (but do not save) a new
        :class:`~openemory.publication.models.Article` instance based
        on harvested record information and Article XML.

        :param repo: optional; pass in an existing
           :class:`eulfedora.server.Repository` object initialized
           with the desired credentials

        :returns: unsaved :class:`~openemory.publication.models.Article`
        '''
        if repo is None:
            repo = Repository()
        article = repo.get_object(type=Article)
        # using comma-delimited usernames to indicate object has multiple owners
        # should work with existing XACML owner policy;
        # for more detail, see https://jira.duraspace.org/browse/FCREPO-82
        article.owner = ', '.join(auth.username for auth in self.authors.all())
        # VERY preliminary, minimal metadata mapping 
        article.label = self.title
        article.dc.content.title = self.title
        article.dc.content.creator_list.extend([auth.get_full_name()
                                                for auth in self.authors.all()])
        article.dc.content.identifier_list.extend([self.access_url,
                                               'PMC%d' % self.pmcid])

        # set the XML article content as the contentMetadata datastream
        # - record content is a file field with a read method, which should be
        #   handled correctly by eulfedora for ingest
        if hasattr(self.content, 'read'):
            article.contentMetadata.content = load_xmlobject_from_file(self.content, NlmArticle)

        if article.contentMetadata.content:
            article.descMetadata.content = article.contentMetadata.content.as_article_mods()

            
        # FIXME: datastream checksum!
        # TODO: format uri for this datastream ? 

        return article
Example #52
    def page_index_data(self):
        '''Get page content for this work from Hathi pairtree and return
        data to be indexed in solr.'''

        # If an item has been suppressed or is from a source other than
        # hathi, bail out. No pages to index.
        if self.is_suppressed or self.source != self.HATHI:
            return

        # load mets record to pull metadata about the images
        try:
            mmets = load_xmlobject_from_file(self.hathi.metsfile_path(),
                                             MinimalMETS)
        except storage_exceptions.ObjectNotFoundException:
            logger.error('Pairtree data for %s not found but status is %s',
                         self.source_id, self.get_status_display())
            return

        # read zipfile contents in place, without unzipping
        with ZipFile(self.hathi.zipfile_path()) as ht_zip:

            # yield a generator of index data for each page; iterate
            # over pages in METS structmap
            for page in mmets.structmap_pages:
                # zipfile spec uses / for path regardless of OS
                pagefilename = '/'.join([self.hathi.content_dir, page.text_file_location])
                with ht_zip.open(pagefilename) as pagefile:
                    try:
                        yield {
                            'id': '%s.%s' % (self.source_id, page.text_file.sequence),
                            'source_id': self.source_id,   # for grouping with work record
                            'content': pagefile.read().decode('utf-8'),
                            'order': page.order,
                            'label': page.display_label,
                            'tags': page.label.split(', ') if page.label else [],
                            'item_type': 'page'
                        }
                    except StopIteration:
                        return
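Since page_index_data() is a generator, callers can stream pages into an index without holding them all in memory; a sketch with a hypothetical solr_client:

        for page in work.page_index_data():
            solr_client.add(page)  # hypothetical indexing client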
Example #53
    def test_check_ht(self):
        test_xml = [
            'digitizedbooks/apps/publish/fixtures/bib1.xml',
            'digitizedbooks/apps/publish/fixtures/bib2.xml',
            'digitizedbooks/apps/publish/fixtures/bib3.xml'
        ]

        job = Job(pk=1)
        job.save()
        kdip0 = KDip.objects.create(kdip_id='10002350302', oclc="12345", note='0', pid='r8d9b', create_date = '2015-12-30 15:43:17', job_id=1)
        kdip1 = KDip.objects.create(kdip_id='10002350304', oclc="12345", note='1', pid='r8d9y', create_date = '2015-12-30 15:43:17', job_id=1)
        kdip2 = KDip.objects.create(kdip_id='10002350306', oclc="67890", note='2', pid='r8d9s', create_date = '2015-12-30 15:43:17', job_id=1)
        text590 = "The online edition of this book in the public domain, i.e., not protected by copyright, has been produced by the Emory University Digital library Publications Program."

        for xml in test_xml:
            index = test_xml.index(xml)
            kdip = KDip.objects.get(note=index)
            marc = load_xmlobject_from_file(xml, AlmaBibRecord)
            marc = check_ht.add_856(marc, kdip)
            marc = Utils.remove_all_999_fields(marc)
            marc = Utils.update_583(marc)

            text_856 = '<datafield tag="856" ind1="4" ind2="1"><subfield code="3">%s</subfield><subfield code="u">http://pid.emory.edu/ark:/25593/%s/HT</subfield><subfiled code="y">HathiTrust version</subfiled></datafield>' % (index, kdip.pid)
            field856s = []
            for tag856 in marc.field856:
                field856s.append(tag856.serialize())

            self.assertIn(text_856, field856s)

            self.assertEqual(len(marc.field999), 0)

            self.assertNotIn(text590.lower(), marc.serialize().lower())

            marc = check_ht.add_590(marc)

            self.assertEqual(marc.field590, text590)

            self.assertEqual(marc.tag583a, 'digitized')
Example #54
    def test_can_retrieve_xml_of_existing_articles(self, mock_ds, mock_pdf):

        with open(os.path.join(FIXTURE_ROOT, '1023796ar.xml'), 'r') as f:
            from eulxml.xmlmap import load_xmlobject_from_file
            mock_pdf.content = load_xmlobject_from_file(f)
        mock_ds = [
            'ERUDITXSD300',
        ]  # noqa

        issue = IssueFactory.create(journal=self.journal,
                                    year=2010,
                                    date_published=dt.datetime.now() -
                                    dt.timedelta(days=1000))
        IssueFactory.create(journal=self.journal,
                            year=2010,
                            date_published=dt.datetime.now())
        article = ArticleFactory.create(issue=issue)
        journal_id = self.journal.localidentifier
        issue_id = issue.localidentifier
        article_id = article.localidentifier
        url = reverse('public:journal:article_raw_xml',
                      args=(journal_id, issue.volume_slug, issue_id,
                            article_id))
        request = self.factory.get(url)
        request.user = AnonymousUser()
        request.subscription = None

        # Run
        response = ArticleXmlView.as_view()(request,
                                            journal_code=journal_id,
                                            issue_slug=issue.volume_slug,
                                            issue_localid=issue_id,
                                            localid=article_id)

        # Check
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/xml')
Example #55
    def handle(self, *paths, **options):

        if not len(paths):
            raise CommandError('Please specify path to content for import.')
        if len(paths) > 1:
            # this limitation is kind of arbitrary, but keep things simple for now
            raise CommandError('Import currently only supports a single volume.')
        path = paths[0]

        dry_run = options.get('dry_run', False)
        verbosity = options.get('verbosity', self.v_normal)

        repo = ManagementRepository()

        # make collection required to avoid accidentally forgetting it
        coll = options.get('collection', None)
        if coll is None:
            raise CommandError('Please specify collection pid')

        collection = repo.get_object(coll, type=Collection)
        if not collection.exists:
            raise CommandError('Collection %s does not exist' % coll)
        if not collection.has_requisite_content_models:
            raise CommandError('%s is not a collection' % coll)

        try:
            start = time.time()
            bag = bagit.Bag(path)
            # NOTE: could consider using fast validation, but files probably are
            # not so large or so numerous that this will be an issue
            if verbosity > self.v_normal:
                self.stdout.write('Validating bag %s' % path)
            fast_validate = options.get('fast_validate')
            bag.validate(fast=fast_validate)
            if verbosity >= self.v_normal:
                self.stdout.write('Validated %s in %.02fs %s' % (path, time.time() - start,
                    '(fast validation enabled)' if fast_validate else ''))
        except bagit.BagError as err:
            # failed to load directory as a bag
            raise CommandError('Please supply a valid BagIt as input. %s' % err)
        except bagit.BagValidationError as err:
            # bag is not valid
            raise CommandError('Input is not a valid bag. %s' % err)

        files = {'pdf': None, 'marcxml': None, 'dc': None}
        checksums = {}

        # this is potentially a long list, but go ahead and store since we will
        # be consulting it multiple times
        payload_files = list(bag.payload_files())

        # identify required contents within the bag by extension and name
        for data_path in payload_files:
            # path is relative to bag root dir
            filename = os.path.join(path, data_path)

            # get extension and name
            basename = os.path.basename(filename)
            basefile, ext = os.path.splitext(basename)
            # NOTE: splitext leaves . on the ext portion

            if ext.lower() == '.pdf':
                files['pdf'] = filename
                checksums['pdf'] = bag.entries[data_path].get('md5', None)

            elif ext.lower() == '.xml':

                if basefile.lower() == 'marc':
                    files['marcxml'] = filename
                    checksums['marcxml'] = bag.entries[data_path].get('md5', None)

                elif basefile.lower() == 'dc':
                    files['dc'] = filename
                    checksums['dc'] = bag.entries[data_path].get('md5', None)

        # check that required components are present
        err = False
        for label, filepath in files.iteritems():
            if filepath is None:
                self.stderr.write('%s not found' % label.upper())
                err = True

            elif checksums[label] is None:
                self.stderr.write('No MD5 checksum found for %s' % label.upper())
                err = True

        if err:
            raise CommandError('Cannot import without all required files and checksums.')

        # all pieces are available, so proceed with ingest


        # construct book and ingest
        if verbosity > self.v_normal:
            self.stdout.write('Creating book object with marcxml %s' % files['marcxml'])
        try:
            marcxml = load_xmlobject_from_file(files['marcxml'], MinMarcxml)
        except XMLSyntaxError as err:
            raise CommandError('Failed to load %s as xml: %s' % (files['marcxml'], err))
        try:
            dcxml = load_xmlobject_from_file(files['dc'], DublinCore)
        except XMLSyntaxError as err:
            raise CommandError('Failed to load %s as xml: %s' % (files['dc'], err))

        # look for book by ocm number first, in case a previous ingest failed
        book_pids = Book.pids_by_label(marcxml.ocm_number)
        # error if we find more than one
        if len(book_pids) > 1:
            raise CommandError('Multiple books exist with label %s. Please correct this first.' \
                                % marcxml.ocm_number)

        # if we find exactly one, use that instead of creating a new book
        elif len(book_pids) == 1:
            book = repo.get_object(book_pids[0], type=Book)
            if verbosity >= self.v_normal:
                self.stdout.write('Using existing book %s with ocm number %s' % \
                    (book.pid, marcxml.ocm_number))

        # otherwise, ingest new book
        else:
            book = repo.get_object(type=Book)
            # set book label to ocm number from the marc
            book.label = marcxml.ocm_number
            if verbosity > self.v_normal:
                self.stdout.write('Book label %s' % book.label)

            # associate with collection
            if collection is not None:
                book.collection = collection
                if verbosity > self.v_normal:
                    self.stdout.write('Associating with collection %s' % collection.short_label)
            book.marcxml.content = marcxml
            # NOTE: import checksum can't be used because xml may be serialized differently
            # book.marcxml.checksum = checksums['marcxml']
            book.dc.content = dcxml
            # NOTE: import checksum can't be used because DC is modified to add ARK
            # book.dc.checksum = checksums['dc']

            # save; bail if error
            if not dry_run:
                try:
                    saved = book.save('ingest')
                    if not saved:
                        raise CommandError('Failed to ingest book into repository')
                    if verbosity >= self.v_normal:
                        self.stdout.write('Successfully ingested book %s' \
                                    % book.pid)
                except RequestFailed as err:
                    raise CommandError('Error ingesting book: %s' % err)

        # in case of pre-existing book object, check for existing volume
        if book.volume_set:
            if len(book.volume_set) > 1:
                raise CommandError('Book %s has multiple volumes; import not supported' \
                    % book.pid)
            else:
                # use existing volume object
                vol = book.volume_set[0]
                if verbosity >= self.v_normal:
                    self.stdout.write('Using existing volume %s' % vol.pid)

        # otherwise, create new volume object
        else:
            # construct volume (v1.1), associate with book, and ingest
            if verbosity > self.v_normal:
                self.stdout.write('Creating volume with %s' % files['pdf'])
            with open(files['pdf'], 'rb') as pdf_file:
                vol = repo.get_object(type=VolumeV1_1)
                # set volume label to ocm number from the marc + volume number
                # for consistency with lsdi content, use ocm_v# notation
                # V.0 indicates single-volume book
                vol.label = '%s_V.0' % marcxml.ocm_number
                # set pdf content
                vol.pdf.content = pdf_file
                vol.pdf.checksum = checksums['pdf']
                # set relation to parent book object
                vol.book = book
                # minimal DC metadata derived from book metadata
                vol.dc.content.title = book.dc.content.title
                for t in book.dc.content.type_list:
                    vol.dc.content.type_list.append(t)
                vol.dc.content.format = book.dc.content.format
                vol.dc.content.language = book.dc.content.language
                vol.dc.content.rights = book.dc.content.rights

                if not dry_run:
                    try:
                        saved = vol.save('ingest')
                        if not saved:
                            # NOTE: possibly, if this fails, we should deactivate the book object
                            # but will leave that to manual processing for now
                            raise CommandError('Failed to ingest volume into repository')
                        else:
                            if verbosity >= self.v_normal:
                                self.stdout.write('Successfully ingested volume %s' \
                                    % vol.pid)
                    except RequestFailed as err:
                        raise CommandError('Error ingesting volume: %s' % err)


        #### page import

        # if volume has existing pages, bail
        if len(vol.pages):
            raise CommandError('Volume %s already has %s page%s' % \
                (vol.pid, len(vol.pages), '' if len(vol.pages) == 1 else 's'))

        # should page import happen here?
        # - identify numeric jp2/jpf files in the bag and get total count
        # - identify numeric .xml files in the bag and get total count
        # - make sure counts match up
        # Question: can we assume no start/end blank pages for now?
        # - start looping through, create page-1.1 and associate with book,
        #   and ingest
        # - set first page as primary image on the volume
        # - report number of pages ingested

        image_files = []

        # identify page files (images and ocr xml)
        for data_path in payload_files:
            # get extension and name
            basename = os.path.basename(data_path)
            basefile, ext = os.path.splitext(basename)
            if ext in ['.jp2', '.jpf']:
                image_files.append(data_path)
                # check that MD5 is present and bail if not
                # - this is probably redundant since by this point validation
                # has passed and previous content has checksums, but
                # ingest will assume checksums are available so better to error
                # *before* starting to ingest page-level content
                if bag.entries[data_path].get('md5', None) is None:
                    raise CommandError('No MD5 checksum for %s' % data_path)

        # ensure pages are sorted into page-order
        image_files.sort()
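        # NOTE: sort() is lexicographic, which assumes zero-padded numeric
        # filenames (e.g. 0002.jp2 sorts before 0010.jp2). If names were not
        # padded, a numeric key would be needed, e.g. (sketch):
        #   image_files.sort(key=lambda p: int(
        #       os.path.splitext(os.path.basename(p))[0]))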

        # NOTE: disabled for now; tunebook does not appear to include alto
        # for pages with no text content
        ## find matching page ocr files
        # for imgfile in image_files:
        #     basefile, ext = os.path.splitext(imgfile)
        #     ocrfile = '%s.xml' % basefile
        #     if ocrfile not in payload_files:
        #         raise CommandError('No OCR xml page present for %s (expected %s)' % \
        #             (imgfile, ocrfile))
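
        # A lighter-weight version of the count check described above could
        # warn instead of erroring (sketch; assumes OCR files are the only
        # .xml files in the payload):
        #   ocr_count = len([p for p in payload_files if p.endswith('.xml')])
        #   if ocr_count and ocr_count != len(image_files):
        #       self.stdout.write('Warning: %d image(s) but %d OCR file(s)' \
        #           % (len(image_files), ocr_count))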

        # pre-generate empty xml in case we need it to force eulfedora to not
        # create ocr datastream when no ocr is present
        emptyxml = load_xmlobject_from_string('<empty/>')

        # iterate through page images and put into fedora
        pageindex = 1
        for imgfile in image_files:
            if verbosity > self.v_normal:
                self.stdout.write('Creating Page object for %s' % imgfile)
            # imgfile is relative to the bag root dir
            img_filename = os.path.join(path, imgfile)

            page = repo.get_object(type=PageV1_1)
            # set page label
            page.label = '%s page %d' % (vol.label, pageindex)
            # set the relation to the volume object
            page.volume = vol
            logger.debug('Page %s volume %s' % (page.pid, page.volume.pid))
            # set a dc:title based on volume title
            page.dc.content.title = '%s page %d' % (vol.dc.content.title, pageindex)
            # set page order
            page.page_order = pageindex

            with open(img_filename, 'rb') as img_content:
                # set image content
                page.image.content = img_content
                page.image.checksum = bag.entries[imgfile]['md5']
                # assume jpeg2000 for now (only looking for jp2/jpf)
                page.image.mimetype = 'image/jp2'
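                # If .jpf content should be typed separately, a lookup could be
                # used instead (sketch; image/jpx is the registered MIME type
                # for JPEG 2000 part 2 files):
                #   page.image.mimetype = {'.jp2': 'image/jp2',
                #       '.jpf': 'image/jpx'}[os.path.splitext(imgfile)[1]]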

                # check for ocr xml within the bag, same base name as image
                basefile, ext = os.path.splitext(imgfile)
                ocrfile = '%s.xml' % basefile

                if ocrfile in payload_files:
                    page.ocr.content = load_xmlobject_from_file(os.path.join(path, ocrfile))
                    # NOTE: can't use MD5 from bag because XML may be
                    # serialized differently when sent to Fedora
                    # (unless we treat as file instead of xml...)
                    # page.ocr.checksum = bag.entries[ocrfile]['md5']

                    if verbosity > self.v_normal:
                        self.stdout.write('Setting OCR for Page from %s' % ocrfile)

                else:
                    # warn but do not error if ocr xml is not found
                    self.stdout.write('Warning: no OCR xml found for %s' % imgfile)
                    # explicitly set xml content to empty so eulfedora doesn't
                    # attempt to bootstrap & ingest (and error)
                    page.ocr.content = emptyxml

                if not dry_run:
                    try:
                        # for now, if any page ingest errors, bail out
                        # (unclear what would cause load to fail midway)

                        saved = page.save()

                        if not saved:
                            raise CommandError('Failed to ingest page %d into repository' \
                                % pageindex)

                    except RequestFailed as err:
                        raise CommandError('Error ingesting page %d: %s' % (pageindex, err))

            # set first page as primary image for the volume
            if not dry_run and pageindex == 1:
                vol.primary_image = page
                vol.save('adding primary image relation')

            # increase page index for next page
            pageindex += 1

        if verbosity >= self.v_normal:
            # total is pageindex - 1 since pageindex incremented at end of loop
            self.stdout.write('Created %d pages' % (pageindex - 1))
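
        # Hypothetical command-line usage (the actual command name and option
        # spellings are not shown in this excerpt; <bag> is the BagIt
        # directory whose payload was validated earlier):
        #   python manage.py <command> <bag> [--dry-run] [-v 2]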
Example #56
0
File: test_cerp.py Project: bcail/eulxml
 def setUp(self):
     self.account = load_xmlobject_from_file(self.FIXTURE_FILE, cerp.Account)
     self.folder = self.account.folders[0]
     self.message = self.folder.messages[0]
Example #57
0
 def setUp(self):
     self.fr6v1 = load_xmlobject_from_file(self.fr6v1_doc,
                                           abbyyocr.Document)
     self.fr8v2 = load_xmlobject_from_file(self.fr8v2_doc,
                                           abbyyocr.Document)