def index_data(request, id, repo=None):
    '''Return the fields and values to be indexed for a single object
    as JSON.  Index content is generated via
    :meth:`eulfedora.models.DigitalObject.index_data`.

    :param request: incoming HTTP request; access is checked with
        ``_permission_denied_check`` and an optional ``Authorization``
        header supplies Basic Auth credentials for Fedora access
    :param id: id of the object to be indexed; in this case a Fedora pid
    :param repo: optional repository object to use; when not specified,
        a ``TypeInferringRepository`` is initialized, using any
        Basic Auth credentials found on the request
    :returns: ``HttpResponse`` with JSON content on success, or
        ``HttpResponseForbidden`` when the permission check denies access
    :raises Http404: when retrieving the object or its index data
        raises ``RequestFailed``
    '''
    # Ensure permission to this resource is allowed.  Currently based on IP only.
    if _permission_denied_check(request):
        return HttpResponseForbidden('Access to this web service was denied.',
                                     content_type='text/html')

    if repo is None:
        repo_opts = {}
        # if credentials are specified via Basic Auth, use them for Fedora access
        auth_info = request.META.get('HTTP_AUTHORIZATION', None)
        basic = 'Basic '
        if auth_info and auth_info.startswith(basic):
            basic_info = auth_info[len(basic):]
            basic_info_decoded = codecs.decode(force_bytes(basic_info), 'base64')
            # split on the first colon only: the password portion of
            # Basic Auth credentials may itself contain ':'
            u, p = force_text(basic_info_decoded).split(':', 1)
            repo_opts.update({'username': u, 'password': p})

        repo = TypeInferringRepository(**repo_opts)

    try:
        obj = repo.get_object(id)
        return HttpResponse(json.dumps(obj.index_data()),
                            content_type='application/json')
    except RequestFailed:
        # for now, treat any failure getting the object from Fedora as a 404
        # (could also potentially be a permission error)
        raise Http404
def index_data(request, id, repo=None):
    """Return the fields and values to be indexed for a single object
    as JSON.  Index content is generated via
    :meth:`eulfedora.models.DigitalObject.index_data`.

    :param request: incoming HTTP request; access is checked with
        ``_permission_denied_check`` and an optional ``Authorization``
        header supplies Basic Auth credentials for Fedora access
    :param id: id of the object to be indexed; in this case a Fedora pid
    :param repo: optional repository object to use; when not specified,
        a ``TypeInferringRepository`` is initialized, using any
        Basic Auth credentials found on the request
    :returns: ``HttpResponse`` with JSON content on success, or
        ``HttpResponseForbidden`` when the permission check denies access
    :raises Http404: when retrieving the object or its index data
        raises ``RequestFailed``
    """
    # Ensure permission to this resource is allowed. Currently based on IP only.
    if _permission_denied_check(request):
        return HttpResponseForbidden(
            "Access to this web service was denied.", content_type="text/html"
        )

    if repo is None:
        repo_opts = {}
        # if credentials are specified via Basic Auth, use them for Fedora access
        auth_info = request.META.get("HTTP_AUTHORIZATION", None)
        basic = "Basic "
        if auth_info and auth_info.startswith(basic):
            basic_info = auth_info[len(basic) :]
            basic_info_decoded = codecs.decode(force_bytes(basic_info), "base64")
            # split on the first colon only: the password portion of
            # Basic Auth credentials may itself contain ':'
            u, p = force_text(basic_info_decoded).split(":", 1)
            repo_opts.update({"username": u, "password": p})

        repo = TypeInferringRepository(**repo_opts)

    try:
        obj = repo.get_object(id)
        return HttpResponse(
            json.dumps(obj.index_data()), content_type="application/json"
        )
    except RequestFailed:
        # for now, treat any failure getting the object from Fedora as a 404
        # (could also potentially be a permission error)
        raise Http404
def curl_upload_file(filename):
    """Upload a local file to the Fedora ``upload`` url with pycurl,
    displaying a progress bar and printing the HTTP status code and
    total transfer time when the request completes.

    :param filename: path to the file to be uploaded
    """
    print("curl upload")
    auth = base64.b64encode(
        force_bytes("%s:%s" % (testsettings.FEDORA_USER, testsettings.FEDORA_PASSWORD))
    )
    headers = {"Authorization": "Basic %s" % force_text(auth)}

    filesize = os.path.getsize(filename)
    widgets = [
        "Upload: ",
        progressbar.widgets.Percentage(),
        " ",
        progressbar.widgets.Bar(),
        " ",
        progressbar.widgets.ETA(),
        " ",
        progressbar.widgets.FileTransferSpeed(),
    ]
    # set initial progressbar size based on file; will be slightly larger because
    # of multipart boundary content
    # NOTE: max_value (not the deprecated maxval) matches the attribute
    # updated in the progress callback below
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=filesize).start()

    def progress(dl_total, dl, up_total, up):
        # update the progressbar to actual maxval (content + boundary)
        pbar.max_value = up_total
        # update current status
        pbar.update(up)

    conn = pycurl.Curl()
    try:
        conn.setopt(conn.URL, "%supload" % testsettings.FEDORA_ROOT_NONSSL)
        conn.setopt(pycurl.VERBOSE, 1)
        conn.setopt(pycurl.HTTPHEADER, ["%s: %s" % t for t in headers.items()])
        conn.setopt(
            conn.HTTPPOST,
            [
                (
                    "file",
                    (
                        # upload the contents of this file
                        conn.FORM_FILE,
                        filename,
                        # specify a different file name for the upload
                        conn.FORM_FILENAME,
                        "file",
                    ),
                )
            ],
        )
        conn.setopt(conn.XFERINFOFUNCTION, progress)
        conn.setopt(conn.NOPROGRESS, False)
        conn.perform()

        # HTTP response code, e.g. 200.
        print("Status: %d" % conn.getinfo(conn.RESPONSE_CODE))
        # Elapsed time for the transfer.
        print("Time: %f" % conn.getinfo(conn.TOTAL_TIME))
    finally:
        # release the curl handle even if the transfer fails
        conn.close()
def test_ingest_utf8(self):
    '''Ingest the basic fixture object with unicode content, first in
    the log message and then in the object label, and check that the
    unicode text is present in what the repository returns.'''
    # ingest with unicode log message
    obj = self.loadFixtureData('basic-object.foxml')
    response = self.rest_api.ingest(obj, logMessage=self.unicode_test_str)
    pid = response.text
    self.assertTrue(pid)
    response = self.rest_api.getObjectXML(pid)
    response.encoding = 'utf-8'  # ensure requests decodes as utf-8
    # assertIn instead of the deprecated assert_ alias
    self.assertIn(
        u'<audit:justification>%s</audit:justification>' % self.unicode_test_str,
        response.text)
    self.rest_api.purgeObject(force_text(pid))

    # ingest with unicode object label
    # convert to text to replace string, then convert back to bytes
    obj = force_bytes(force_text(obj).replace(
        u"A test object", self.unicode_test_str))
    response = self.rest_api.ingest(obj)
    pid = response.text
    self.assertTrue(pid)

    # object label in profile should match the unicode sent
    response = self.rest_api.getObjectProfile(pid)
    response.encoding = 'utf-8'  # ensure requests decodes as utf-8
    self.assertIn(u'<objLabel>%s</objLabel>' % self.unicode_test_str,
                  response.text)
    self.rest_api.purgeObject(force_text(pid))
def index_data(request, id, repo=None):
    '''Return the fields and values to be indexed for a single object
    as JSON.  Index content is generated via
    :meth:`eulfedora.models.DigitalObject.index_data`.

    :param request: incoming HTTP request; access is checked with
        ``_permission_denied_check`` and an optional ``Authorization``
        header supplies Basic Auth credentials for Fedora access
    :param id: id of the object to be indexed; in this case a Fedora pid
    :param repo: optional repository object to use; when not specified,
        a ``TypeInferringRepository`` is initialized, using any
        Basic Auth credentials found on the request
    :returns: ``HttpResponse`` with JSON content on success, or
        ``HttpResponseForbidden`` when the permission check denies access
    :raises Http404: when retrieving the object or its index data
        raises ``RequestFailed``
    '''
    # Ensure permission to this resource is allowed.  Currently based on IP only.
    if _permission_denied_check(request):
        return HttpResponseForbidden('Access to this web service was denied.',
                                     content_type='text/html')

    if repo is None:
        repo_opts = {}
        # if credentials are specified via Basic Auth, use them for Fedora access
        auth_info = request.META.get('HTTP_AUTHORIZATION', None)
        basic = 'Basic '
        if auth_info and auth_info.startswith(basic):
            basic_info = auth_info[len(basic):]
            # NOTE: codecs.decode works everywhere but python 3.3, which
            # complains about an unknown encoding, so use base64 directly
            basic_info_decoded = base64.b64decode(force_bytes(basic_info))
            # split on the first colon only: the password portion of
            # Basic Auth credentials may itself contain ':'
            u, p = force_text(basic_info_decoded).split(':', 1)
            repo_opts.update({'username': u, 'password': p})

        repo = TypeInferringRepository(**repo_opts)

    try:
        obj = repo.get_object(id)
        return HttpResponse(json.dumps(obj.index_data()),
                            content_type='application/json')
    except RequestFailed:
        # for now, treat any failure getting the object from Fedora as a 404
        # (could also potentially be a permission error)
        raise Http404
def test_raw_audit_trail(self):
    '''Check that ``raw_audit_trail`` raises a 404 for an object with no
    audit trail, and returns the audit trail xml once the object has
    been modified with a log message.'''
    rqst = Mock()
    rqst.method = 'GET'

    # created with no ingest message = no audit trail
    self.assertRaises(Http404, raw_audit_trail, rqst, self.obj.pid)

    # modify object so it will have an audit trail
    self.obj.dc.content.title = 'audit this!'
    changelog = 'I just changed the title'
    self.obj.save(changelog)
    response = raw_audit_trail(rqst, self.obj.pid)
    expected, got = 200, response.status_code
    self.assertEqual(expected, got,
                     'Expected %s but returned %s for raw_audit_trail'
                     % (expected, got))
    expected, got = 'text/xml', response['Content-Type']
    self.assertEqual(expected, got,
                     'Expected %s but returned %s for mimetype on raw_audit_trail'
                     % (expected, got))
    # assertIn instead of the deprecated assert_ alias
    self.assertIn(b'<audit:auditTrail', response.content)
    self.assertIn(
        force_bytes('<audit:justification>%s</audit:justification>' % changelog),
        response.content)
    self.assertIn('Last-Modified', response)
def upload(self, data, callback=None, content_type=None, size=None):
    '''
    Upload a multi-part file for content to ingest.  Returns a
    temporary upload id that can be used as a datastream location.

    :param data: content string, file-like object, or iterable with
        content to be uploaded
    :param callback: optional callback method to monitor the upload;
        see :mod:`requests-toolbelt` documentation for more
        details: https://toolbelt.readthedocs.org/en/latest/user.html#uploading-data
    :param content_type: optional content type of the data
    :param size: optional size of the data; required when using an
        iterable for the data
    :returns: upload id on success; ``None`` if the response status
        is anything other than accepted
    :raises Exception: if data is an iterable and no size is specified
    :raises OverflowError: if the content is larger than system maxint
    '''
    url = 'upload'
    # fedora only expects content uploaded as multipart file;
    # make string content into a file-like object so requests.post
    # sends it the way Fedora expects.
    if not hasattr(data, 'read'):
        # NOTE: checking for both python 2.x next method and
        # python 3.x __next__ to test if data is iterable
        if hasattr(data, '__next__') or hasattr(data, 'next'):
            # if data is an iterable, wrap in a readable iterator that
            # requests-toolbelt can read data from
            if size is None:
                raise Exception('Cannot upload iterable with unknown size')
            data = ReadableIterator(data, size)
        else:
            data = six.BytesIO(force_bytes(data))

    # use requests-toolbelt multipart encoder to avoid reading
    # the full content of large files into memory
    menc = MultipartEncoder(fields={'file': ('file', data, content_type)})

    if callback is not None:
        menc = MultipartEncoderMonitor(menc, callback)

    headers = {'Content-Type': menc.content_type}

    if size:
        # header values must be strings; newer versions of requests
        # reject an integer header value
        headers['Content-Length'] = str(size)

    try:
        response = self.post(url, data=menc, headers=headers)
    except OverflowError:
        # Python __len__ uses integer so it is limited to system maxint,
        # and requests and requests-toolbelt use len() throughout.
        # This results in an overflow error when trying to upload a file
        # larger than system maxint (2GB on 32-bit OSes).
        # See http://bugs.python.org/issue12159
        msg = 'upload content larger than system maxint (32-bit OS limitation)'
        logger.error('OverflowError: %s', msg)
        raise OverflowError(msg)

    if response.status_code == requests.codes.accepted:
        return response.text.strip()
def curl_upload_file(filename):
    '''Upload a local file to the Fedora ``upload`` url with pycurl,
    displaying a progress bar and printing the HTTP status code and
    total transfer time when the request completes.

    :param filename: path to the file to be uploaded
    '''
    print('curl upload')
    auth = base64.b64encode(
        force_bytes("%s:%s" % (testsettings.FEDORA_USER,
                               testsettings.FEDORA_PASSWORD)))
    headers = {'Authorization': 'Basic %s' % force_text(auth)}

    filesize = os.path.getsize(filename)
    widgets = [
        'Upload: ',
        progressbar.widgets.Percentage(),
        ' ',
        progressbar.widgets.Bar(),
        ' ',
        progressbar.widgets.ETA(),
        ' ',
        progressbar.widgets.FileTransferSpeed()
    ]
    # set initial progressbar size based on file; will be slightly larger because
    # of multipart boundary content
    # NOTE: max_value (not the deprecated maxval) matches the attribute
    # updated in the progress callback below
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=filesize).start()

    def progress(dl_total, dl, up_total, up):
        # update the progressbar to actual maxval (content + boundary)
        pbar.max_value = up_total
        # update current status
        pbar.update(up)

    conn = pycurl.Curl()
    try:
        conn.setopt(conn.URL, '%supload' % testsettings.FEDORA_ROOT_NONSSL)
        conn.setopt(pycurl.VERBOSE, 1)
        conn.setopt(pycurl.HTTPHEADER, ["%s: %s" % t for t in headers.items()])
        conn.setopt(
            conn.HTTPPOST,
            [
                (
                    'file',
                    (
                        # upload the contents of this file
                        conn.FORM_FILE,
                        filename,
                        # specify a different file name for the upload
                        conn.FORM_FILENAME,
                        'file',
                    )),
            ])
        conn.setopt(conn.XFERINFOFUNCTION, progress)
        conn.setopt(conn.NOPROGRESS, False)
        conn.perform()

        # HTTP response code, e.g. 200.
        print('Status: %d' % conn.getinfo(conn.RESPONSE_CODE))
        # Elapsed time for the transfer.
        print('Time: %f' % conn.getinfo(conn.TOTAL_TIME))
    finally:
        # release the curl handle even if the transfer fails
        conn.close()
def test_compareDatastreamChecksum(self):
    '''Check that ``compareDatastreamChecksum`` reports the MD5 of the
    text datastream content and that the datastream log message appears
    in the object xml.'''
    # create datastream with checksum
    (added, ds) = self._add_text_datastream()
    r = self.rest_api.compareDatastreamChecksum(self.pid, ds['id'])

    text_md5 = hashlib.md5(force_bytes(self.TEXT_CONTENT)).hexdigest()
    # assertIn instead of the deprecated assert_ alias
    self.assertIn('<dsChecksum>%s</dsChecksum>' % text_md5, r.text)
    # FIXME: how to test that checksum has actually been checked?

    # check for log message in audit trail
    r = self.rest_api.getObjectXML(self.pid)
    self.assertIn(ds['logMessage'], r.text)
def test_compareDatastreamChecksum(self):
    """Check that ``compareDatastreamChecksum`` reports the MD5 of the
    text datastream content and that the datastream log message appears
    in the object xml."""
    # create datastream with checksum
    (added, ds) = self._add_text_datastream()
    r = self.rest_api.compareDatastreamChecksum(self.pid, ds["id"])

    text_md5 = hashlib.md5(force_bytes(self.TEXT_CONTENT)).hexdigest()
    # assertIn instead of the deprecated assert_ alias
    self.assertIn("<dsChecksum>%s</dsChecksum>" % text_md5, r.text)
    # FIXME: how to test that checksum has actually been checked?

    # check for log message in audit trail
    r = self.rest_api.getObjectXML(self.pid)
    self.assertIn(ds["logMessage"], r.text)
def test_ingest_without_pid(self):
    '''Ingest the basic fixture object without specifying a pid, once
    with no log message and once with one, purging the ingested object
    each time.'''
    fixture = self.loadFixtureData('basic-object.foxml')

    # plain ingest: should return a pid; remove the object afterwards
    first_pid = self.repo.ingest(force_bytes(fixture))
    self.assertTrue(first_pid)
    self.repo.purge_object(force_text(first_pid))

    # test ingesting with log message
    ingest_message = "this is my test ingest message"
    second_pid = self.repo.ingest(fixture, ingest_message)

    # ingest message is stored in AUDIT datastream
    # - can currently only be accessed by retrieving entire object xml
    object_xml = self.repo.api.getObjectXML(force_text(second_pid))
    self.assertTrue(ingest_message in object_xml.text)

    was_purged = self.repo.purge_object(force_text(second_pid),
                                        "removing test ingest object")
    self.assertTrue(was_purged)
def curl_download_file(pid, dsid):
    '''Download the content of a single datastream to a temporary file
    with pycurl, displaying a progress bar and printing the HTTP status
    code and total transfer time.  The temporary file is created with
    ``delete=False``, so it is left on disk; its name is printed.

    :param pid: pid of the object the datastream belongs to
    :param dsid: id of the datastream to download
    '''
    repo = Repository(testsettings.FEDORA_ROOT_NONSSL,
                      testsettings.FEDORA_USER, testsettings.FEDORA_PASSWORD)
    obj = repo.get_object(pid)
    ds = obj.getDatastreamObject(dsid)

    auth = base64.b64encode(
        force_bytes("%s:%s" % (testsettings.FEDORA_USER,
                               testsettings.FEDORA_PASSWORD)))
    headers = {'Authorization': 'Basic %s' % force_text(auth)}

    widgets = [
        'Download: ',
        progressbar.widgets.Percentage(),
        ' ',
        progressbar.widgets.Bar(),
        ' ',
        progressbar.widgets.ETA(),
        ' ',
        progressbar.widgets.FileTransferSpeed()
    ]

    tmpfile = tempfile.NamedTemporaryFile(prefix='%s-%s_' % (pid, dsid),
                                          delete=False)
    print('writing to ', tmpfile.name)

    # set progressbar size based on the size of the datastream
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=ds.size).start()

    def progress(dl_total, dl, up_total, up):
        # update current status
        pbar.update(dl)

    c = pycurl.Curl()
    try:
        c.setopt(pycurl.VERBOSE, 1)
        c.setopt(pycurl.HTTPHEADER, ["%s: %s" % t for t in headers.items()])
        # /objects/{pid}/datastreams/{dsID}/content ? [asOfDateTime] [download]
        c.setopt(c.URL, '%sobjects/%s/datastreams/%s/content' %
                 (testsettings.FEDORA_ROOT_NONSSL, pid, dsid))
        c.setopt(c.WRITEFUNCTION, tmpfile.write)
        c.setopt(c.XFERINFOFUNCTION, progress)
        c.setopt(c.NOPROGRESS, False)
        c.perform()

        # HTTP response code, e.g. 200.
        print('Status: %d' % c.getinfo(c.RESPONSE_CODE))
        # Elapsed time for the transfer.
        print('Time: %f' % c.getinfo(c.TOTAL_TIME))
    finally:
        # release the curl handle, and close the temporary file so the
        # downloaded content is flushed to disk
        c.close()
        tmpfile.close()
def curl_download_file(pid, dsid):
    '''Download the content of a single datastream to a temporary file
    with pycurl, displaying a progress bar and printing the HTTP status
    code and total transfer time.  The temporary file is created with
    ``delete=False``, so it is left on disk; its name is printed.

    :param pid: pid of the object the datastream belongs to
    :param dsid: id of the datastream to download
    '''
    repo = Repository(testsettings.FEDORA_ROOT_NONSSL,
                      testsettings.FEDORA_USER, testsettings.FEDORA_PASSWORD)
    obj = repo.get_object(pid)
    ds = obj.getDatastreamObject(dsid)

    auth = base64.b64encode(force_bytes("%s:%s" % (testsettings.FEDORA_USER,
                                                   testsettings.FEDORA_PASSWORD)))
    headers = {'Authorization': 'Basic %s' % force_text(auth)}

    widgets = ['Download: ', progressbar.widgets.Percentage(), ' ',
               progressbar.widgets.Bar(), ' ', progressbar.widgets.ETA(),
               ' ', progressbar.widgets.FileTransferSpeed()]

    tmpfile = tempfile.NamedTemporaryFile(
        prefix='%s-%s_' % (pid, dsid), delete=False)
    print('writing to ', tmpfile.name)

    # set progressbar size based on the size of the datastream
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=ds.size).start()

    def progress(dl_total, dl, up_total, up):
        # update current status
        pbar.update(dl)

    c = pycurl.Curl()
    try:
        c.setopt(pycurl.VERBOSE, 1)
        c.setopt(pycurl.HTTPHEADER, ["%s: %s" % t for t in headers.items()])
        # /objects/{pid}/datastreams/{dsID}/content ? [asOfDateTime] [download]
        c.setopt(c.URL, '%sobjects/%s/datastreams/%s/content' %
                 (testsettings.FEDORA_ROOT_NONSSL, pid, dsid))
        c.setopt(c.WRITEFUNCTION, tmpfile.write)
        c.setopt(c.XFERINFOFUNCTION, progress)
        c.setopt(c.NOPROGRESS, False)
        c.perform()

        # HTTP response code, e.g. 200.
        print('Status: %d' % c.getinfo(c.RESPONSE_CODE))
        # Elapsed time for the transfer.
        print('Time: %f' % c.getinfo(c.TOTAL_TIME))
    finally:
        # release the curl handle, and close the temporary file so the
        # downloaded content is flushed to disk
        c.close()
        tmpfile.close()
def test_raw_audit_trail(self):
    '''Check that ``raw_audit_trail`` raises a 404 for an object with no
    audit trail, and returns the audit trail xml once the object has
    been modified with a log message.'''
    rqst = Mock()
    rqst.method = 'GET'

    # created with no ingest message = no audit trail
    self.assertRaises(Http404, raw_audit_trail, rqst, self.obj.pid)

    # modify object so it will have an audit trail
    self.obj.dc.content.title = 'audit this!'
    changelog = 'I just changed the title'
    self.obj.save(changelog)
    response = raw_audit_trail(rqst, self.obj.pid)
    expected, got = 200, response.status_code
    self.assertEqual(expected, got,
                     'Expected %s but returned %s for raw_audit_trail'
                     % (expected, got))
    expected, got = 'text/xml', response['Content-Type']
    self.assertEqual(expected, got,
                     'Expected %s but returned %s for mimetype on raw_audit_trail'
                     % (expected, got))
    # assertIn instead of the deprecated assert_ alias
    self.assertIn(b'<audit:auditTrail', response.content)
    self.assertIn(force_bytes('<audit:justification>%s</audit:justification>'
                              % changelog),
                  response.content)
    self.assertIn('Last-Modified', response)
def load_fixture_data(fname):
    '''Return the contents of a fixture file as bytes.

    :param fname: fixture filename, resolved with ``fixture_path``
    :returns: the file contents as bytes
    '''
    # read in binary mode so the content is returned exactly as stored,
    # rather than decoding with the locale's default encoding and then
    # re-encoding the text as bytes
    with open(fixture_path(fname), 'rb') as f:
        return f.read()
def object_data(self):
    '''Process the archival export and return a buffer with foxml
    content for ingest into the destination repository.

    Sections are read with :meth:`get_next_section` until it raises
    ``StopIteration``.  At the start of each block of binary content,
    a ``foxml:contentLocation`` element is written in its place,
    referencing either a versioned datastream content url on the source
    repository (when ``xml_only`` is set and the datastream mimetype is
    not ``text/xml``) or the upload id returned by uploading
    :meth:`encoded_datastream` to the destination repository.  All
    sections outside of binary content are written as is.

    This method replaces ``self.foxml_buffer`` and updates
    ``self.within_file`` as it goes.

    :returns: :class:`io.BytesIO` for ingest, with references to
        uploaded datastream content or content location urls
    :raises Exception: if no datastream information is found in the
        section preceding the start of binary content
    '''
    self.foxml_buffer = io.BytesIO()

    if self.progress_bar:
        self.progress_bar.start()

    previous_section = None
    while True:
        try:
            section = self.get_next_section()
        except StopIteration:
            break

        if section == BINARY_CONTENT_START:
            self.within_file = True

            # get datastream info from the end of the section just before this one
            # (needed to provide size to upload request)
            dsinfo = self.get_datastream_info(previous_section)
            if not dsinfo:
                # error if datastream info is not found, because either
                # size or version date is required to handle content
                raise Exception('Failed to find datastream information for %s from \n%s' \
                    % (self.obj.pid, previous_section))

            logger.info('Found encoded datastream %(id)s (%(mimetype)s, size %(size)s, %(type)s %(digest)s)',
                        dsinfo)

            if self.xml_only and not dsinfo['mimetype'] == 'text/xml':  # possibly others?
                try:
                    dsid, _ = dsinfo['id'].split('.')
                except ValueError:
                    # if dsid doesn't include a .# (for versioning),
                    # use the id as is.
                    dsid = dsinfo['id']

                if self.url_credentials:
                    # if url credentials are set, parse the base fedora api
                    # url so they can be inserted at the right place
                    parsed_url = urlparse(self.obj.api.base_url)
                    # reassemble base url, adding in credentials
                    base_url = ''.join([parsed_url.scheme, '://',
                                        self.url_credentials,
                                        parsed_url.netloc, parsed_url.path])
                else:
                    base_url = self.obj.api.base_url

                # versioned datastream dissemination url
                content_location = '%sobjects/%s/datastreams/%s/content?asOfDateTime=%s' % \
                    (base_url, self.obj.pid, dsid, dsinfo['created'])
            else:
                upload_args = {}
                if self.progress_bar:
                    def upload_callback(monitor):
                        self.progress_bar.upload = monitor.bytes_read
                    upload_args = {'callback': upload_callback}

                # use upload id as content location
                content_location = self.dest_repo.api.upload(
                    self.encoded_datastream(),
                    size=int(dsinfo['size']), **upload_args)

            self.foxml_buffer.write(force_bytes('<foxml:contentLocation REF="%s" TYPE="URL"/>' \
                % content_location))

        elif section == BINARY_CONTENT_END:
            # should not occur here; this section will be processed by
            # encoded_datastream method
            self.within_file = False

        elif self.within_file:
            # should not occur here; this section will be pulled by
            # encoded_datastream method

            # binary content within a file - ignore here
            # (handled by encoded_datastream method)
            # NOTE: deliberately not `continue`, so previous_section is
            # still updated below as before
            pass

        else:
            # not start or end of binary content, and not
            # within a file, so yield as is (e.g., datastream tags
            # between small files)
            self.foxml_buffer.write(section)

        previous_section = section

    return self.foxml_buffer
def object_data(self):
    '''Process the archival export and return a buffer with foxml
    content for ingest into the destination repository.

    Sections are read with :meth:`get_next_section` until it raises
    ``StopIteration``.  At the start of each block of binary content,
    a ``foxml:contentLocation`` element is written in its place,
    referencing either a versioned datastream content url on the source
    repository (when ``xml_only`` is set and the datastream mimetype is
    not ``text/xml``) or the upload id returned by uploading
    :meth:`encoded_datastream` to the destination repository.  All
    sections outside of binary content are written as is.

    This method replaces ``self.foxml_buffer`` and updates
    ``self.within_file`` as it goes.

    :returns: :class:`io.BytesIO` for ingest, with references to
        uploaded datastream content or content location urls
    :raises Exception: if no datastream information is found in the
        section preceding the start of binary content
    '''
    self.foxml_buffer = io.BytesIO()

    if self.progress_bar:
        self.progress_bar.start()

    previous_section = None
    while True:
        try:
            section = self.get_next_section()
        except StopIteration:
            break

        if section == BINARY_CONTENT_START:
            self.within_file = True

            # get datastream info from the end of the section just before this one
            # (needed to provide size to upload request)
            dsinfo = self.get_datastream_info(previous_section)
            if not dsinfo:
                # error if datastream info is not found, because either
                # size or version date is required to handle content
                raise Exception(
                    'Failed to find datastream information from \n%s' %
                    previous_section)

            logger.info(
                'Found encoded datastream %(id)s (%(mimetype)s, size %(size)s, %(type)s %(digest)s)',
                dsinfo)

            if self.xml_only and not dsinfo['mimetype'] == 'text/xml':  # possibly others?
                try:
                    dsid, _ = dsinfo['id'].split('.')
                except ValueError:
                    # if dsid doesn't include a .# (for versioning),
                    # use the id as is.
                    dsid = dsinfo['id']

                if self.url_credentials:
                    # if url credentials are set, parse the base fedora api
                    # url so they can be inserted at the right place
                    parsed_url = urlparse(self.obj.api.base_url)
                    # reassemble base url, adding in credentials
                    base_url = ''.join([
                        parsed_url.scheme, '://', self.url_credentials,
                        parsed_url.netloc, parsed_url.path
                    ])
                else:
                    base_url = self.obj.api.base_url

                # versioned datastream dissemination url
                content_location = '%sobjects/%s/datastreams/%s/content?asOfDateTime=%s' % \
                    (base_url, self.obj.pid, dsid, dsinfo['created'])
            else:
                upload_args = {}
                if self.progress_bar:
                    def upload_callback(monitor):
                        self.progress_bar.upload = monitor.bytes_read
                    upload_args = {'callback': upload_callback}

                # use upload id as content location
                content_location = self.dest_repo.api.upload(
                    self.encoded_datastream(),
                    size=int(dsinfo['size']),
                    **upload_args)

            self.foxml_buffer.write(force_bytes('<foxml:contentLocation REF="%s" TYPE="URL"/>' \
                % content_location))

        elif section == BINARY_CONTENT_END:
            # should not occur here; this section will be processed by
            # encoded_datastream method
            self.within_file = False

        elif self.within_file:
            # should not occur here; this section will be pulled by
            # encoded_datastream method

            # binary content within a file - ignore here
            # (handled by encoded_datastream method)
            # NOTE: deliberately not `continue`, so previous_section is
            # still updated below as before
            pass

        else:
            # not start or end of binary content, and not
            # within a file, so yield as is (e.g., datastream tags
            # between small files)
            self.foxml_buffer.write(section)

        previous_section = section

    return self.foxml_buffer