Пример #1
0
def index_data(request, id, repo=None):
    '''Return the fields and values to be indexed for a single object
    as JSON.  Index content is generated via
    :meth:`eulfedora.models.DigitalObject.index_data`.

    When no repository is passed in, a ``TypeInferringRepository`` is
    created; if the request carries HTTP Basic Auth credentials they are
    handed to that repository as ``username`` and ``password``.

    :param request: the incoming HTTP request
    :param id: id of the object to be indexed; in this case a Fedora pid
    :param repo: optional repository object to use instead of creating
        a new one from the request credentials
    :returns: JSON response with the index data for the object, or a
        forbidden response when the permission check denies the request
    :raises Http404: if the repository request for the object fails
    '''

    # Ensure permission to this resource is allowed. Currently based on IP only.
    if _permission_denied_check(request):
        return HttpResponseForbidden('Access to this web service was denied.',
                                     content_type='text/html')

    if repo is None:
        repo_opts = {}
        # if credentials are specified via Basic Auth, use them for Fedora access
        auth_info = request.META.get('HTTP_AUTHORIZATION', None)
        basic = 'Basic '
        if auth_info and auth_info.startswith(basic):
            basic_info = auth_info[len(basic):]
            basic_info_decoded = codecs.decode(force_bytes(basic_info),
                                               'base64')
            # split on the first colon only: the password portion of
            # Basic Auth credentials may itself contain colons
            u, p = force_text(basic_info_decoded).split(':', 1)
            repo_opts.update({'username': u, 'password': p})

        repo = TypeInferringRepository(**repo_opts)
    try:
        obj = repo.get_object(id)
        return HttpResponse(json.dumps(obj.index_data()),
                            content_type='application/json')
    except RequestFailed:
        # for now, treat any failure getting the object from Fedora as a 404
        # (could also potentially be a permission error)
        raise Http404
Пример #2
0
def index_data(request, id, repo=None):
    """Return the fields and values to be indexed for a single object
    as JSON.  Index content is generated via
    :meth:`eulfedora.models.DigitalObject.index_data`.

    If ``repo`` is not supplied, a ``TypeInferringRepository`` is created,
    using HTTP Basic Auth credentials from the request (when present) as
    the repository ``username`` and ``password``.

    :param request: the incoming HTTP request
    :param id: id of the object to be indexed; in this case a Fedora pid
    :param repo: optional repository object to use instead of creating one
    :returns: JSON response with the index data for the object, or a
        forbidden response when the permission check denies the request
    :raises Http404: if the repository request for the object fails
    """

    # Ensure permission to this resource is allowed. Currently based on IP only.
    if _permission_denied_check(request):
        return HttpResponseForbidden("Access to this web service was denied.", content_type="text/html")

    if repo is None:
        repo_opts = {}
        # if credentials are specified via Basic Auth, use them for Fedora access
        auth_info = request.META.get("HTTP_AUTHORIZATION", None)
        basic = "Basic "
        if auth_info and auth_info.startswith(basic):
            basic_info = auth_info[len(basic) :]
            basic_info_decoded = codecs.decode(force_bytes(basic_info), "base64")
            # only the first colon separates username from password;
            # any later colons belong to the password
            u, p = force_text(basic_info_decoded).split(":", 1)
            repo_opts.update({"username": u, "password": p})

        repo = TypeInferringRepository(**repo_opts)
    try:
        obj = repo.get_object(id)
        return HttpResponse(json.dumps(obj.index_data()), content_type="application/json")
    except RequestFailed:
        # for now, treat any failure getting the object from Fedora as a 404
        # (could also potentially be a permission error)
        raise Http404
Пример #3
0
def curl_upload_file(filename):
    """Upload a file to the Fedora ``upload`` url with pycurl, displaying
    a progress bar, then print the HTTP status and elapsed time.

    The file is sent as a multipart form POST in a field named ``file``,
    authenticated with the Fedora test credentials via Basic Auth.

    :param filename: path of the file to upload
    """
    print("curl upload")
    auth = base64.b64encode(force_bytes("%s:%s" % (testsettings.FEDORA_USER, testsettings.FEDORA_PASSWORD)))
    headers = {"Authorization": "Basic %s" % force_text(auth)}

    filesize = os.path.getsize(filename)
    widgets = [
        "Upload: ",
        progressbar.widgets.Percentage(),
        " ",
        progressbar.widgets.Bar(),
        " ",
        progressbar.widgets.ETA(),
        " ",
        progressbar.widgets.FileTransferSpeed(),
    ]
    # set initial progressbar size based on file; will be slightly larger because
    # of multipart boundary content
    # (max_value matches the attribute updated in the progress callback)
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=filesize).start()

    def progress(dl_total, dl, up_total, up):
        # update the progressbar to actual maxval (content + boundary)
        pbar.max_value = up_total
        # update current status
        pbar.update(up)

    conn = pycurl.Curl()
    # make sure the curl handle is released even if the transfer fails
    try:
        conn.setopt(conn.URL, "%supload" % testsettings.FEDORA_ROOT_NONSSL)
        conn.setopt(pycurl.VERBOSE, 1)
        conn.setopt(pycurl.HTTPHEADER, ["%s: %s" % t for t in headers.items()])
        conn.setopt(
            conn.HTTPPOST,
            [
                (
                    "file",
                    (
                        # upload the contents of this file
                        conn.FORM_FILE,
                        filename,
                        # specify a different file name for the upload
                        conn.FORM_FILENAME,
                        "file",
                    ),
                )
            ],
        )
        conn.setopt(conn.XFERINFOFUNCTION, progress)
        # progress callbacks are only invoked when NOPROGRESS is disabled
        conn.setopt(conn.NOPROGRESS, False)

        conn.perform()

        # HTTP response code, e.g. 200.
        print("Status: %d" % conn.getinfo(conn.RESPONSE_CODE))
        # Elapsed time for the transfer.
        print("Time: %f" % conn.getinfo(conn.TOTAL_TIME))
    finally:
        conn.close()
Пример #4
0
    def test_ingest_utf8(self):
        # Ingest the basic fixture object twice: once with a unicode log
        # message and once with a unicode object label, checking that the
        # unicode text comes back unchanged from the repository.

        # ingest with unicode log message
        obj = self.loadFixtureData('basic-object.foxml')
        response = self.rest_api.ingest(obj, logMessage=self.unicode_test_str)
        pid = response.text
        self.assertTrue(pid)

        response = self.rest_api.getObjectXML(pid)
        response.encoding = 'utf-8'  # ensure requests decodes as utf-8
        # assertIn rather than the deprecated assert_ alias
        self.assertIn(u'<audit:justification>%s</audit:justification>' %
                      self.unicode_test_str, response.text)
        self.rest_api.purgeObject(force_text(pid))

        # ingest with unicode object label
        # convert to text to replace string, then convert back to bytes
        obj = force_bytes(force_text(obj).replace(
            u"A test object", self.unicode_test_str))
        response = self.rest_api.ingest(obj)
        pid = response.text
        self.assertTrue(pid)

        # object label in profile should match the unicode sent
        response = self.rest_api.getObjectProfile(pid)
        response.encoding = 'utf-8'  # ensure requests decodes as utf-8
        self.assertIn(u'<objLabel>%s</objLabel>' % self.unicode_test_str,
                      response.text)
        self.rest_api.purgeObject(force_text(pid))
Пример #5
0
def index_data(request, id, repo=None):
    '''Return the fields and values to be indexed for a single object
    as JSON.  Index content is generated via
    :meth:`eulfedora.models.DigitalObject.index_data`.

    Unless a repository is supplied, a ``TypeInferringRepository`` is
    created; HTTP Basic Auth credentials found on the request are passed
    to it as ``username`` and ``password``.

    :param request: the incoming HTTP request
    :param id: id of the object to be indexed; in this case a Fedora pid
    :param repo: optional repository object to use instead of creating one
    :returns: JSON response with the index data for the object, or a
        forbidden response when the permission check denies the request
    :raises Http404: if the repository request for the object fails
    '''

    # Ensure permission to this resource is allowed. Currently based on IP only.
    if _permission_denied_check(request):
        return HttpResponseForbidden('Access to this web service was denied.', content_type='text/html')

    if repo is None:
        repo_opts = {}
        # if credentials are specified via Basic Auth, use them for Fedora access
        auth_info = request.META.get('HTTP_AUTHORIZATION', None)
        basic = 'Basic '
        if auth_info and auth_info.startswith(basic):
            basic_info = auth_info[len(basic):]
            # NOTE: base64.b64decode is used because codecs.decode with
            # 'base64' works everywhere but python 3.3, which complains
            # about an unknown encoding
            basic_info_decoded = base64.b64decode(force_bytes(basic_info))
            # the username ends at the first colon; the password may
            # contain further colons, so split only once
            u, p = force_text(basic_info_decoded).split(':', 1)
            repo_opts.update({'username': u, 'password': p})

        repo = TypeInferringRepository(**repo_opts)
    try:
        obj = repo.get_object(id)
        return HttpResponse(json.dumps(obj.index_data()),
                            content_type='application/json')
    except RequestFailed:
        # for now, treat any failure getting the object from Fedora as a 404
        # (could also potentially be a permission error)
        raise Http404
Пример #6
0
    def test_raw_audit_trail(self):
        # The raw_audit_trail view should 404 for an object with no audit
        # trail, and return the audit trail XML once the object has been
        # modified with a log message.
        rqst = Mock()
        rqst.method = 'GET'

        # created with no ingest message = no audit trail
        self.assertRaises(Http404, raw_audit_trail, rqst, self.obj.pid)

        # modify object so it will have an audit trail
        self.obj.dc.content.title = 'audit this!'
        changelog = 'I just changed the title'
        self.obj.save(changelog)
        response = raw_audit_trail(rqst, self.obj.pid)
        expected, got = 200, response.status_code
        self.assertEqual(expected, got,
            'Expected %s but returned %s for raw_audit_trail' \
                % (expected, got))
        expected, got = 'text/xml', response['Content-Type']
        self.assertEqual(expected, got,
            'Expected %s but returned %s for mimetype on raw_audit_trail' \
                % (expected, got))
        # assertIn rather than the deprecated assert_ alias
        self.assertIn(b'<audit:auditTrail', response.content)
        self.assertIn(
            force_bytes('<audit:justification>%s</audit:justification>' %
                        changelog), response.content)
        self.assertIn('Last-Modified', response)
Пример #7
0
    def test_ingest_utf8(self):
        # Check that unicode content survives ingest, both in the ingest
        # log message and in the object label.

        # ingest with unicode log message
        obj = self.loadFixtureData('basic-object.foxml')
        response = self.rest_api.ingest(obj, logMessage=self.unicode_test_str)
        pid = response.text
        self.assertTrue(pid)

        response = self.rest_api.getObjectXML(pid)
        response.encoding = 'utf-8'  # ensure requests decodes as utf-8
        expected_justification = \
            u'<audit:justification>%s</audit:justification>' % \
            self.unicode_test_str
        # assertIn rather than the deprecated assert_ alias
        self.assertIn(expected_justification, response.text)
        self.rest_api.purgeObject(force_text(pid))

        # ingest with unicode object label
        # convert to text to replace string, then convert back to bytes
        obj = force_bytes(force_text(obj).replace(
            u"A test object", self.unicode_test_str))
        response = self.rest_api.ingest(obj)
        pid = response.text
        self.assertTrue(pid)

        # object label in profile should match the unicode sent
        response = self.rest_api.getObjectProfile(pid)
        response.encoding = 'utf-8'  # ensure requests decodes as utf-8
        expected_label = u'<objLabel>%s</objLabel>' % self.unicode_test_str
        self.assertIn(expected_label, response.text)
        self.rest_api.purgeObject(force_text(pid))
Пример #8
0
    def upload(self, data, callback=None, content_type=None, size=None):
        '''
        Upload a multi-part file for content to ingest.  Returns a
        temporary upload id that can be used as a datastream location.

        :param data: content string, file-like object, or iterable with
            content to be uploaded
        :param callback: optional callback method to monitor the upload;
            see :mod:`requests-toolbelt` documentation for more
            details: https://toolbelt.readthedocs.org/en/latest/user.html#uploading-data
        :param content_type: optional content type of the data
        :param size: optional size of the data; required when using an
            iterable for the data

        :returns: upload id on success; ``None`` if the response status
            is not ``requests.codes.accepted``
        :raises Exception: if data is an iterable and no size is given
        :raises OverflowError: if the content is too large to be sent
            (system maxint limitation)
        '''
        url = 'upload'
        # fedora only expects content uploaded as multipart file;
        # anything without a read method has to be wrapped before it can
        # be handed to the multipart encoder.
        if not hasattr(data, 'read'):
            # NOTE: checking for both python 2.x next method and
            # python 3.x __next__ to test if data is iterable
            if hasattr(data, '__next__') or hasattr(data, 'next'):
                # wrap the iterable in a readable iterator that
                # requests-toolbelt can read data from
                if size is None:
                    raise Exception('Cannot upload iterable with unknown size')
                data = ReadableIterator(data, size)
            else:
                # make string content into a file-like object so
                # requests.post sends it the way Fedora expects.
                data = six.BytesIO(force_bytes(data))

        # use requests-toolbelt multipart encoder to avoid reading
        # the full content of large files into memory
        menc = MultipartEncoder(fields={'file': ('file', data, content_type)})

        if callback is not None:
            menc = MultipartEncoderMonitor(menc, callback)

        headers = {'Content-Type': menc.content_type}
        if size:
            # requests requires header values to be str or bytes, not int
            headers['Content-Length'] = str(size)

        try:
            response = self.post(url, data=menc, headers=headers)
        except OverflowError:
            # Python __len__ uses integer so it is limited to system maxint,
            # and requests and requests-toolbelt use len() throughout.
            # This results in an overflow error when trying to upload a file
            # larger than system maxint (2GB on 32-bit OSes).
            # See http://bugs.python.org/issue12159
            msg = 'upload content larger than system maxint (32-bit OS limitation)'
            logger.error('OverflowError: %s', msg)
            raise OverflowError(msg)

        if response.status_code == requests.codes.accepted:
            return response.text.strip()
        # any other status falls through and returns None
Пример #9
0
def curl_upload_file(filename):
    '''Upload a file to the Fedora ``upload`` url with pycurl, displaying
    a progress bar, then print the HTTP status and elapsed time.

    The file is sent as a multipart form POST in a field named ``file``,
    authenticated with the Fedora test credentials via Basic Auth.

    :param filename: path of the file to upload
    '''
    print('curl upload')
    auth = base64.b64encode(
        force_bytes("%s:%s" %
                    (testsettings.FEDORA_USER, testsettings.FEDORA_PASSWORD)))
    headers = {'Authorization': 'Basic %s' % force_text(auth)}

    filesize = os.path.getsize(filename)
    widgets = [
        'Upload: ',
        progressbar.widgets.Percentage(), ' ',
        progressbar.widgets.Bar(), ' ',
        progressbar.widgets.ETA(), ' ',
        progressbar.widgets.FileTransferSpeed()
    ]
    # set initial progressbar size based on file; will be slightly larger because
    # of multipart boundary content
    # (max_value matches the attribute updated in the progress callback)
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=filesize).start()

    def progress(dl_total, dl, up_total, up):
        # update the progressbar to actual maxval (content + boundary)
        pbar.max_value = up_total
        # update current status
        pbar.update(up)

    conn = pycurl.Curl()
    # release the curl handle whether or not the transfer succeeds
    try:
        conn.setopt(conn.URL, '%supload' % testsettings.FEDORA_ROOT_NONSSL)
        conn.setopt(pycurl.VERBOSE, 1)
        conn.setopt(pycurl.HTTPHEADER,
                    ["%s: %s" % t for t in headers.items()])
        conn.setopt(
            conn.HTTPPOST,
            [
                (
                    'file',
                    (
                        # upload the contents of this file
                        conn.FORM_FILE,
                        filename,
                        # specify a different file name for the upload
                        conn.FORM_FILENAME,
                        'file',
                    )),
            ])
        conn.setopt(conn.XFERINFOFUNCTION, progress)
        # progress callbacks are only invoked when NOPROGRESS is disabled
        conn.setopt(conn.NOPROGRESS, False)

        conn.perform()

        # HTTP response code, e.g. 200.
        print('Status: %d' % conn.getinfo(conn.RESPONSE_CODE))
        # Elapsed time for the transfer.
        print('Time: %f' % conn.getinfo(conn.TOTAL_TIME))
    finally:
        conn.close()
Пример #10
0
    def test_compareDatastreamChecksum(self):
        # The compareDatastreamChecksum response should report the MD5 of
        # the text content, and the datastream log message should appear
        # in the object XML.

        # create datastream with checksum
        (added, ds) = self._add_text_datastream()
        r = self.rest_api.compareDatastreamChecksum(self.pid, ds['id'])

        text_md5 = hashlib.md5(force_bytes(self.TEXT_CONTENT)).hexdigest()
        # assertIn rather than the deprecated assert_ alias
        self.assertIn('<dsChecksum>%s</dsChecksum>' % text_md5, r.text)
        # FIXME: how to test that checksum has actually been checked?

        # check for log message in audit trail
        r = self.rest_api.getObjectXML(self.pid)
        self.assertIn(ds['logMessage'], r.text)
Пример #11
0
    def test_compareDatastreamChecksum(self):
        # Verify the checksum reported by compareDatastreamChecksum matches
        # the MD5 of the text content, and that the datastream log message
        # is present in the object XML.

        # create datastream with checksum
        (added, ds) = self._add_text_datastream()
        r = self.rest_api.compareDatastreamChecksum(self.pid, ds["id"])

        mdsum = hashlib.md5()
        mdsum.update(force_bytes(self.TEXT_CONTENT))
        text_md5 = mdsum.hexdigest()
        # assertIn rather than the deprecated assert_ alias
        self.assertIn("<dsChecksum>%s</dsChecksum>" % text_md5, r.text)
        # FIXME: how to test that checksum has actually been checked?

        # check for log message in audit trail
        r = self.rest_api.getObjectXML(self.pid)
        self.assertIn(ds["logMessage"], r.text)
Пример #12
0
    def test_ingest_without_pid(self):
        # Ingest the basic fixture object, first without and then with a
        # log message, purging the ingested object each time.
        foxml = self.loadFixtureData('basic-object.foxml')

        # plain ingest: a pid should be returned
        first_pid = self.repo.ingest(force_bytes(foxml))
        self.assertTrue(first_pid)
        self.repo.purge_object(force_text(first_pid))

        # test ingesting with log message
        ingest_message = "this is my test ingest message"
        second_pid = self.repo.ingest(foxml, ingest_message)
        # ingest message is stored in AUDIT datastream
        # - can currently only be accessed by retrieving entire object xml
        object_xml = self.repo.api.getObjectXML(force_text(second_pid))
        self.assertTrue(ingest_message in object_xml.text)

        # clean up, and confirm the purge reported success
        was_purged = self.repo.purge_object(
            force_text(second_pid), "removing test ingest object")
        self.assertTrue(was_purged)
Пример #13
0
def curl_download_file(pid, dsid):
    '''Download the content of a single datastream to a named temporary
    file using pycurl, displaying a progress bar, then print the HTTP
    status and elapsed time.

    The temporary file is created with ``delete=False`` so it is kept
    after the download; its name is printed before the transfer starts.

    :param pid: pid of the object that the datastream belongs to
    :param dsid: id of the datastream to download
    '''
    repo = Repository(testsettings.FEDORA_ROOT_NONSSL,
                      testsettings.FEDORA_USER, testsettings.FEDORA_PASSWORD)
    obj = repo.get_object(pid)
    ds = obj.getDatastreamObject(dsid)

    # use the file as a context manager so downloaded content is flushed
    # and the file handle is closed even if the transfer fails
    with tempfile.NamedTemporaryFile(prefix='%s-%s_' % (pid, dsid),
                                     delete=False) as tmpfile:
        print('writing to ', tmpfile.name)

        widgets = [
            'Download: ',
            progressbar.widgets.Percentage(), ' ',
            progressbar.widgets.Bar(), ' ',
            progressbar.widgets.ETA(), ' ',
            progressbar.widgets.FileTransferSpeed()
        ]
        # set progressbar size based on the datastream size
        pbar = progressbar.ProgressBar(widgets=widgets,
                                       max_value=ds.size).start()

        def progress(dl_total, dl, up_total, up):
            # update current status
            pbar.update(dl)

        auth = base64.b64encode(
            force_bytes("%s:%s" %
                        (testsettings.FEDORA_USER,
                         testsettings.FEDORA_PASSWORD)))
        headers = {'Authorization': 'Basic %s' % force_text(auth)}

        c = pycurl.Curl()
        # release the curl handle whether or not the transfer succeeds
        try:
            c.setopt(pycurl.VERBOSE, 1)
            c.setopt(pycurl.HTTPHEADER,
                     ["%s: %s" % t for t in headers.items()])

            # /objects/{pid}/datastreams/{dsID}/content ? [asOfDateTime] [download]
            c.setopt(c.URL, '%sobjects/%s/datastreams/%s/content' % \
                (testsettings.FEDORA_ROOT_NONSSL, pid, dsid))
            c.setopt(c.WRITEFUNCTION, tmpfile.write)
            c.setopt(c.XFERINFOFUNCTION, progress)
            # progress callbacks are only invoked when NOPROGRESS is disabled
            c.setopt(c.NOPROGRESS, False)
            c.perform()

            # HTTP response code, e.g. 200.
            print('Status: %d' % c.getinfo(c.RESPONSE_CODE))
            # Elapsed time for the transfer.
            print('Time: %f' % c.getinfo(c.TOTAL_TIME))
        finally:
            c.close()
Пример #14
0
def curl_download_file(pid, dsid):
    '''Download the content of a single datastream to a named temporary
    file using pycurl, displaying a progress bar, then print the HTTP
    status and elapsed time.

    The temporary file is created with ``delete=False`` so it is kept
    after the download; its name is printed before the transfer starts.

    :param pid: pid of the object that the datastream belongs to
    :param dsid: id of the datastream to download
    '''
    repo = Repository(testsettings.FEDORA_ROOT_NONSSL, testsettings.FEDORA_USER,
                      testsettings.FEDORA_PASSWORD)
    obj = repo.get_object(pid)
    ds = obj.getDatastreamObject(dsid)

    tmpfile = tempfile.NamedTemporaryFile(
        prefix='%s-%s_' % (pid, dsid), delete=False)
    # from here on, make sure the file is flushed and closed (and the curl
    # handle released) even if the download fails
    c = None
    try:
        print('writing to ', tmpfile.name)

        widgets = ['Download: ', progressbar.widgets.Percentage(), ' ',
                   progressbar.widgets.Bar(), ' ', progressbar.widgets.ETA(),
                   ' ', progressbar.widgets.FileTransferSpeed()]
        # set progressbar size based on the datastream size
        pbar = progressbar.ProgressBar(widgets=widgets, max_value=ds.size).start()

        def progress(dl_total, dl, up_total, up):
            # update current status
            pbar.update(dl)

        c = pycurl.Curl()
        auth = base64.b64encode(force_bytes("%s:%s" % (testsettings.FEDORA_USER, testsettings.FEDORA_PASSWORD)))
        headers = {'Authorization' : 'Basic %s' % force_text(auth)}
        c.setopt(pycurl.VERBOSE, 1)
        c.setopt(pycurl.HTTPHEADER, ["%s: %s" % t for t in headers.items()])

        # /objects/{pid}/datastreams/{dsID}/content ? [asOfDateTime] [download]
        c.setopt(c.URL, '%sobjects/%s/datastreams/%s/content' % \
            (testsettings.FEDORA_ROOT_NONSSL, pid, dsid))
        c.setopt(c.WRITEFUNCTION, tmpfile.write)
        c.setopt(c.XFERINFOFUNCTION, progress)
        # progress callbacks are only invoked when NOPROGRESS is disabled
        c.setopt(c.NOPROGRESS, False)
        c.perform()

        # HTTP response code, e.g. 200.
        print('Status: %d' % c.getinfo(c.RESPONSE_CODE))
        # Elapsed time for the transfer.
        print('Time: %f' % c.getinfo(c.TOTAL_TIME))
    finally:
        if c is not None:
            c.close()
        tmpfile.close()
Пример #15
0
    def test_raw_audit_trail(self):
        # raw_audit_trail should raise Http404 while the object has no
        # audit trail, and return the audit XML after the object is saved
        # with a log message.
        rqst = Mock()
        rqst.method = 'GET'

        # created with no ingest message = no audit trail
        self.assertRaises(Http404, raw_audit_trail, rqst, self.obj.pid)

        # modify object so it will have an audit trail
        self.obj.dc.content.title = 'audit this!'
        changelog = 'I just changed the title'
        self.obj.save(changelog)
        response = raw_audit_trail(rqst, self.obj.pid)
        expected, got = 200, response.status_code
        self.assertEqual(expected, got,
            'Expected %s but returned %s for raw_audit_trail' \
                % (expected, got))
        expected, got = 'text/xml', response['Content-Type']
        self.assertEqual(expected, got,
            'Expected %s but returned %s for mimetype on raw_audit_trail' \
                % (expected, got))
        # assertIn rather than the deprecated assert_ alias
        self.assertIn(b'<audit:auditTrail', response.content)
        expected_justification = force_bytes(
            '<audit:justification>%s</audit:justification>' % changelog)
        self.assertIn(expected_justification, response.content)
        self.assertIn('Last-Modified', response)
Пример #16
0
    def upload(self, data, callback=None, content_type=None,
        size=None):
        '''
        Upload a multi-part file for content to ingest.  Returns a
        temporary upload id that can be used as a datastream location.

        :param data: content string, file-like object, or iterable with
            content to be uploaded
        :param callback: optional callback method to monitor the upload;
            see :mod:`requests-toolbelt` documentation for more
            details: https://toolbelt.readthedocs.org/en/latest/user.html#uploading-data
        :param content_type: optional content type of the data
        :param size: optional size of the data; required when using an
            iterable for the data

        :returns: upload id on success; ``None`` if the response status
            is not ``requests.codes.accepted``
        :raises Exception: if data is an iterable and no size is given
        :raises OverflowError: if the content is too large to be sent
            (system maxint limitation)
        '''
        url = 'upload'
        # NOTE: checking for both python 2.x next method and
        # python 3.x __next__ to test if data is iterable
        readable = hasattr(data, 'read')
        iterable = hasattr(data, '__next__') or hasattr(data, 'next')

        if not readable and not iterable:
            # fedora only expects content uploaded as multipart file;
            # make string content into a file-like object so requests.post
            # sends it the way Fedora expects.
            data = six.BytesIO(force_bytes(data))
        elif not readable:
            # data is an iterable; wrap in a readable iterator that
            # requests-toolbelt can read data from
            if size is None:
                raise Exception('Cannot upload iterable with unknown size')
            data = ReadableIterator(data, size)

        # use requests-toolbelt multipart encoder to avoid reading
        # the full content of large files into memory
        menc = MultipartEncoder(fields={'file': ('file', data, content_type)})

        if callback is not None:
            menc = MultipartEncoderMonitor(menc, callback)

        headers = {'Content-Type': menc.content_type}
        if size:
            # requests requires header values to be str or bytes, not int
            headers['Content-Length'] = str(size)

        try:
            response = self.post(url, data=menc, headers=headers)
        except OverflowError:
            # Python __len__ uses integer so it is limited to system maxint,
            # and requests and requests-toolbelt use len() throughout.
            # This results in an overflow error when trying to upload a file
            # larger than system maxint (2GB on 32-bit OSes).
            # See http://bugs.python.org/issue12159
            msg = 'upload content larger than system maxint (32-bit OS limitation)'
            logger.error('OverflowError: %s', msg)
            raise OverflowError(msg)

        if response.status_code == requests.codes.accepted:
            return response.text.strip()
        # any other status falls through and returns None
Пример #17
0
def load_fixture_data(fname):
    '''Return the contents of a fixture file as bytes.

    :param fname: fixture file name, resolved via ``fixture_path``
    :returns: the raw bytes of the file
    '''
    # read in binary mode so the content is returned exactly as stored,
    # rather than being decoded with the platform default text encoding
    # and then re-encoded
    with open(fixture_path(fname), 'rb') as f:
        return f.read()
Пример #18
0
    def object_data(self):
        '''Process the archival export and return a buffer with foxml
        content for ingest into the destination repository.

        Sections are read with :meth:`get_next_section` until it raises
        :class:`StopIteration`.  At the start of each block of binary
        content, datastream information is looked up from the preceding
        section and a ``foxml:contentLocation`` element is written in
        place of the content: a versioned dissemination url when
        ``xml_only`` is set and the datastream is not ``text/xml``,
        otherwise the id returned by uploading the encoded datastream
        to the destination repository.  All other sections are copied
        to the buffer unchanged.

        :returns: :class:`io.BytesIO` for ingest, with references
            to uploaded datastream content or content location urls
        :raises Exception: if no datastream information is found in the
            section preceding a block of binary content
        '''
        self.foxml_buffer = io.BytesIO()

        if self.progress_bar:
            self.progress_bar.start()

        previous_section = None
        while True:
            try:
                section = self.get_next_section()
            except StopIteration:
                break

            if section == BINARY_CONTENT_START:
                self.within_file = True

                # get datastream info from the end of the section just before this one
                # (needed to provide size to upload request)
                dsinfo = self.get_datastream_info(previous_section)
                if not dsinfo:
                    # error if datastream info is not found, because either
                    # size or version date is required to handle content
                    raise Exception('Failed to find datastream information for %s from \n%s' \
                        % (self.obj.pid, previous_section))

                logger.info('Found encoded datastream %(id)s (%(mimetype)s, size %(size)s, %(type)s %(digest)s)',
                    dsinfo)

                if self.xml_only and dsinfo['mimetype'] != 'text/xml':  # possibly others?
                    try:
                        # version suffix is not needed, only the datastream id
                        dsid, _dsversion = dsinfo['id'].split('.')
                    except ValueError:
                        # if dsid doesn't include a .# (for versioning),
                        # use the id as is.
                        dsid = dsinfo['id']

                    if self.url_credentials:
                        # if url credentials are set, parse the base fedora api
                        # url so they can be inserted at the right place
                        parsed_url = urlparse(self.obj.api.base_url)
                        # reassemble base url, adding in credentials
                        base_url = ''.join([parsed_url.scheme, '://',
                            self.url_credentials, parsed_url.netloc,
                            parsed_url.path])
                    else:
                        base_url = self.obj.api.base_url

                    # versioned datastream dissemination url
                    content_location = '%sobjects/%s/datastreams/%s/content?asOfDateTime=%s' % \
                        (base_url, self.obj.pid, dsid, dsinfo['created'])
                else:
                    upload_args = {}
                    if self.progress_bar:
                        def upload_callback(monitor):
                            self.progress_bar.upload = monitor.bytes_read
                        upload_args = {'callback': upload_callback}

                    # use upload id as content location
                    content_location = self.dest_repo.api.upload(self.encoded_datastream(),
                        size=int(dsinfo['size']), **upload_args)

                self.foxml_buffer.write(force_bytes('<foxml:contentLocation REF="%s" TYPE="URL"/>' \
                    % content_location))

            elif section == BINARY_CONTENT_END:
                # should not occur here; this section will be processed by
                # encoded_datastream method
                self.within_file = False

            elif self.within_file:
                # should not occur here; this section will be pulled by
                # encoded_datastream method

                # binary content within a file - nothing to write here
                # (handled by encoded_datastream method); the section is
                # still recorded as previous_section below
                pass

            else:
                # not start or end of binary content, and not
                # within a file, so write as is (e.g., datastream tags
                # between small files)
                self.foxml_buffer.write(section)

            previous_section = section

        return self.foxml_buffer
Пример #19
0
    def object_data(self):
        '''Process the archival export and return a buffer with foxml
        content for ingest into the destination repository.

        Sections are read with :meth:`get_next_section` until it raises
        :class:`StopIteration`.  At the start of each block of binary
        content, datastream information is looked up from the preceding
        section and a ``foxml:contentLocation`` element is written in
        place of the content: a versioned dissemination url when
        ``xml_only`` is set and the datastream is not ``text/xml``,
        otherwise the id returned by uploading the encoded datastream
        to the destination repository.  All other sections are copied
        to the buffer unchanged.

        :returns: :class:`io.BytesIO` for ingest, with references
            to uploaded datastream content or content location urls
        :raises Exception: if no datastream information is found in the
            section preceding a block of binary content
        '''
        self.foxml_buffer = io.BytesIO()

        if self.progress_bar:
            self.progress_bar.start()

        previous_section = None
        while True:
            try:
                section = self.get_next_section()
            except StopIteration:
                break

            if section == BINARY_CONTENT_START:
                self.within_file = True

                # get datastream info from the end of the section just before this one
                # (needed to provide size to upload request)
                dsinfo = self.get_datastream_info(previous_section)
                if not dsinfo:
                    # error if datastream info is not found, because either
                    # size or version date is required to handle content
                    raise Exception(
                        'Failed to find datastream information from \n%s' %
                        previous_section)

                logger.info(
                    'Found encoded datastream %(id)s (%(mimetype)s, size %(size)s, %(type)s %(digest)s)',
                    dsinfo)

                if self.xml_only and dsinfo['mimetype'] != 'text/xml':  # possibly others?
                    try:
                        # version suffix is not needed, only the datastream id
                        dsid, _dsversion = dsinfo['id'].split('.')
                    except ValueError:
                        # if dsid doesn't include a .# (for versioning),
                        # use the id as is.
                        dsid = dsinfo['id']

                    if self.url_credentials:
                        # if url credentials are set, parse the base fedora api
                        # url so they can be inserted at the right place
                        parsed_url = urlparse(self.obj.api.base_url)
                        # reassemble base url, adding in credentials
                        base_url = ''.join([
                            parsed_url.scheme, '://', self.url_credentials,
                            parsed_url.netloc, parsed_url.path
                        ])
                    else:
                        base_url = self.obj.api.base_url

                    # versioned datastream dissemination url
                    content_location = '%sobjects/%s/datastreams/%s/content?asOfDateTime=%s' % \
                        (base_url, self.obj.pid, dsid, dsinfo['created'])
                else:
                    upload_args = {}
                    if self.progress_bar:

                        def upload_callback(monitor):
                            self.progress_bar.upload = monitor.bytes_read

                        upload_args = {'callback': upload_callback}

                    # use upload id as content location
                    content_location = self.dest_repo.api.upload(
                        self.encoded_datastream(),
                        size=int(dsinfo['size']),
                        **upload_args)

                self.foxml_buffer.write(force_bytes('<foxml:contentLocation REF="%s" TYPE="URL"/>' \
                    % content_location))

            elif section == BINARY_CONTENT_END:
                # should not occur here; this section will be processed by
                # encoded_datastream method
                self.within_file = False

            elif self.within_file:
                # should not occur here; this section will be pulled by
                # encoded_datastream method

                # binary content within a file - nothing to write here
                # (handled by encoded_datastream method); the section is
                # still recorded as previous_section below
                pass

            else:
                # not start or end of binary content, and not
                # within a file, so write as is (e.g., datastream tags
                # between small files)
                self.foxml_buffer.write(section)

            previous_section = section

        return self.foxml_buffer