Exemplo n.º 1
0
class Book(ArkDigitalObject):
    """Minimal digitized Book object.

    Declares the ScannedBook-1.0 content model and the relation to the
    collection the book is a member of.
    """

    #: content model for books
    BOOK_CONTENT_MODEL = "info:fedora/emory-control:ScannedBook-1.0"
    CONTENT_MODELS = [BOOK_CONTENT_MODEL]

    #: :class:`~readux.collection.models.Collection` this book belongs to,
    #: via the fedora rels-ext ``isMemberOfCollection`` relation
    collection = Relation(
        relsext.isMemberOfCollection,
        type=Collection,
    )
Exemplo n.º 2
0
class Volume(ArkDigitalObject):
    """Minimal object for ScannedVolume-1.0.

    ScannedVolume-1.0 objects include an Abbyy FineReader OCR XML datastream
    with the OCR content for the entire volume.
    """

    #: volume content model
    VOLUME_CONTENT_MODEL = "info:fedora/emory-control:ScannedVolume-1.0"
    CONTENT_MODELS = [VOLUME_CONTENT_MODEL]

    #: :class:`Book` this volume is associated with, via the rels-ext
    #: ``isConstituentOf`` relation
    book = Relation(
        relsext.isConstituentOf,
        type=Book,
    )
Exemplo n.º 3
0
class LocalObject(DigitalObject):
    """Object class for features common to all repository objects.

    Every attribute below is a ``Relation`` descriptor keyed on an RDF
    predicate; ``ns_prefix`` maps the prefix to register for that
    predicate's namespace, and ``rdf_type`` (where given) names the XSD
    datatype of the literal value.
    """

    # --- provenance: which batch generated the object, and when ---
    batch_id = Relation(
        prov_ns.wasGeneratedBy,
        ns_prefix={"prov": prov_ns},
        rdf_type=XSD.int,
    )
    batch_timestamp = Relation(
        prov_ns.wasGeneratedAtTime,
        ns_prefix={"prov": prov_ns},
        rdf_type=XSD.dateTime,
    )

    # --- content model ---
    has_model = Relation(
        fedora_model.hasModel,
        ns_prefix={"fedora-model": fedora_model},
        rdf_type=XSD.anyURI,
    )

    # --- membership / structure ---
    is_member_of_collection = Relation(
        fedora_ns.isMemberOfCollection,
        ns_prefix={"fedora-rels-ext": fedora_ns},
    )
    # NOTE(review): this relation registers fedora_ns under the prefix
    # "fedora" while its siblings use "fedora-rels-ext" -- confirm whether
    # the difference is intentional.
    constituent = Relation(
        fedora_ns.isConstituentOf,
        ns_prefix={"fedora": fedora_ns},
    )
    is_member_of = Relation(
        fedora_ns.isMemberOf,
        ns_prefix={"fedora-rels-ext": fedora_ns},
    )

    # --- islandora sequencing ---
    is_sequence_number = Relation(
        islandora_ns.isSequenceNumber,
        ns_prefix={"islandora": islandora_ns},
    )
    is_sequence_of = Relation(
        islandora_ns.isSequenceOf,
        ns_prefix={"islandora": islandora_ns},
    )
Exemplo n.º 4
0
class ArrangementObject(boda.Arrangement, ArkPidDigitalObject):
    '''Subclass of :class:`eulfedora.models.DigitalObject` for
    "arrangement" content.'''

    NEW_OBJECT_VIEW = 'arrangement:edit'

    mods = XmlDatastream('MODS',
                         'MODS Metadata',
                         LocalArrangementMods,
                         defaults={
                             'control_group': 'M',
                             'format': mods.MODS_NAMESPACE,
                             'versionable': True,
                         })
    '''MODS :class:`~eulfedora.models.XmlDatastream` with content as
    :class:`LocalArrangementMods`; datastream ID ``MODS``'''

    provenance = XmlDatastream('provenanceMetadata',
                               'Provenance metadata',
                               ArrangementPremis,
                               defaults={'versionable': False})
    '''provenance :class:`~eulfedora.models.XmlDatastream` with content as
    :class:`ArrangementPremis`; datastream ID ``provenanceMetadata``'''

    # map datastream IDs to human-readable component labels
    component_key = {
        'FileMasterTech': 'file technical metadata',
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT':
        'collection membership',  # TODO: revise when/if we add more relations
    }

    # map arrangement status to fedora object state
    status_codes = {'processed': 'A', 'accessioned': 'I'}

    def _get_arrangement_status(self):
        '''Return the arrangement status name whose fedora state code
        matches the current object state, or None if no status matches.'''
        # items() rather than iteritems() so this works on Python 2 and 3
        for status, code in self.status_codes.items():
            if self.state == code:
                return status
        return None

    def _set_arrangement_status(self, status):
        '''Set the fedora object state from an arrangement status name.

        :param status: one of the keys of :attr:`status_codes`
        :raises ValueError: if the status is not recognized
        '''
        if status not in self.status_codes:
            raise ValueError('%s is not a recognized arrangement status' %
                             status)
        self.state = self.status_codes[status]

    arrangement_status = property(_get_arrangement_status,
                                  _set_arrangement_status)
    'arrangement status, i.e., whether this item is processed or accessioned'

    _deprecated_collection = Relation(relsext.isMemberOf,
                                      type=CollectionObject)
    ''':class:`~keep.collection.models.collection.v1_1.Collection` that this
    object is a member of, via `isMemberOf` relation.

    **deprecated** because these objects should be using **isMemberOfCollection**
    '''

    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject` that this object is
    a member of, via `isMemberOfCollection` relation.
    '''

    # access to the processing batch aka simple collection this object
    # is associated with; reverse because rel is stored on the simplecollection
    process_batch = ReverseRelation(relsext.hasMember, type=SimpleCollection)

    def save(self, logMessage=None):
        '''Save the object.  If the content of the rights datastream
        has changed, update content models used to control access to
        match the current rights access code.

        :param logMessage: optional log message
        :returns: the result of the parent class ``save``
        '''
        # isModified is a method and must be called: the bare bound method
        # is always truthy, which rewrote the access content models on
        # every save whether or not the rights had changed.
        if self.rights.isModified():
            self._update_access_cmodel()

        return super(ArrangementObject, self).save(logMessage)

    def get_original_datastream(self):
        '''Retrieve the datastream object holding the original content;
        used to generate object information for premis checksums.

        Checks the ``ORIGINAL`` datastream first, then ``MIME`` (for email
        content, the MIME datastream is the original content).

        :returns: the first of those datastream objects that exists
        :raises Exception: if neither datastream exists
        '''
        for dsid in ('ORIGINAL', 'MIME'):
            orig_ds = self.getDatastreamObject(dsid)
            if orig_ds.exists:
                return orig_ds

        # TODO: what to do for email folders ?

        raise Exception('No original datastream found')

    def set_premis_object(self):
        '''Populate the premis object section of the provenance datastream:
        ark identifier, object type, composition level, MD5 and SHA-1
        checksums for the original datastream, and its mimetype as the
        format name.

        The original content is written to a temporary file in order to
        calculate the SHA-1; that file is removed before returning, even
        if streaming or checksumming fails.
        '''
        # NOTE: could add a check to see if premis exists before setting
        # these values (although we don't expect arrangement objects
        # to have premis by default)

        # NOTE: should be using the ark:/####/#### id form here
        # using ArkPidDigitalObject ark property
        self.provenance.content.create_object()
        self.provenance.content.object.id_type = 'ark'
        self.provenance.content.object.id = self.mods.content.ark

        # add basic object premis information
        # object type required to be schema valid, must be in premis namespace
        self.provenance.content.object.type = 'p:file'
        self.provenance.content.object.composition_level = 0

        original_ds = self.get_original_datastream()

        # add MD5 and SHA-1 checksums for original datastream content
        # Fedora should already have an MD5
        # (NOTE: could confirm that this is MD5 using checksum_type)
        self.provenance.content.object.checksums.append(
            PremisFixity(algorithm='MD5'))
        self.provenance.content.object.checksums[
            0].digest = original_ds.checksum

        # save original content to disk in order to calculate a SHA-1
        # checksum; don't delete when file handle is closed, because the
        # checksum helper re-opens the file by name
        origtmpfile = tempfile.NamedTemporaryFile(prefix='%s-orig-' %
                                                  self.noid,
                                                  delete=False)
        try:
            try:
                for data in original_ds.get_chunked_content():
                    origtmpfile.write(data)
            finally:
                # close to flush contents before calculating checksum
                origtmpfile.close()

            # calculate SHA-1 and add to premis
            self.provenance.content.object.checksums.append(
                PremisFixity(algorithm='SHA-1'))
            self.provenance.content.object.checksums[1].digest = sha1sum(
                origtmpfile.name)
        finally:
            # always clean up the temporary copy of the original file,
            # including when streaming or checksumming raised
            os.remove(origtmpfile.name)

        # set object format - using original file mimetype
        self.provenance.content.object.create_format()
        self.provenance.content.object.format.name = original_ds.mimetype

    def identifier_change_event(self, oldpid):
        '''Add an identifier change event to the premis for this object.

        :param oldpid: the pid this object was previously known by; the
            event outcome detail records the change from it to the
            current pid
        '''
        detail_msg = 'Persistent identifier reassigned from %s to %s' % \
            (oldpid, self.pid)
        idchange_event = PremisEvent()
        idchange_event.id_type = 'UUID'
        idchange_event.id = uuid.uuid1()
        idchange_event.type = 'identifier assignment'
        idchange_event.date = datetime.now().isoformat()
        idchange_event.detail = 'program="keep"; version="%s"' % __version__
        idchange_event.outcome = 'Pass'
        idchange_event.outcome_detail = detail_msg
        idchange_event.agent_type = 'fedora user'
        idchange_event.agent_id = self.api.username
        self.provenance.content.events.append(idchange_event)

    def _update_access_cmodel(self):
        '''Update the access/restriction content models in RELS-EXT based
        on the rights access code: access status code ``'2'`` gets the
        access-allowed content model, anything else (including no access
        status) gets the access-restricted one.'''

        # FIXME: is there not a better way to add/remove cmodels ?
        _allowed_triple = (self.uriref, modelns.hasModel,
                           URIRef(ACCESS_ALLOWED_CMODEL))
        _restricted_triple = (self.uriref, modelns.hasModel,
                              URIRef(ACCESS_RESTRICTED_CMODEL))

        if self.rights.content.access_status and \
           self.rights.content.access_status.code == '2':
            # FIXME: shouldn't have to hard code this number;
            # can we use researcher_access check instead ?

            # allow access.
            # remove restricted if present, add allowed if not present
            if _restricted_triple in self.rels_ext.content:
                self.rels_ext.content.remove(_restricted_triple)
            if _allowed_triple not in self.rels_ext.content:
                self.rels_ext.content.add(_allowed_triple)

        else:
            # deny access.
            # remove allowed if present, add restricted if not present
            if _allowed_triple in self.rels_ext.content:
                self.rels_ext.content.remove(_allowed_triple)
            if _restricted_triple not in self.rels_ext.content:
                self.rels_ext.content.add(_restricted_triple)

    def update_ark_label(self, force_update=False):
        """Update the name of this object's ARK in Pidman so that it
        matches the Dublin Core title.

        While arrangement objects do have a MODS datastream, the
        descriptive metadata about the object (including title) is
        currently stored in the DC, so the DC title is what gets compared
        against the name Pidman holds and pushed when the two differ.

        Nothing is done if the object does not exist yet.  Errors from the
        Pidman client are logged rather than raised.

        :param force_update: optional flag that will enforce the check and
            update regardless of ``dc.isModified()``, when supplied as True
        """
        if not self.exists:
            return

        # act when the force_update flag is provided, or otherwise only
        # when the DC has been modified
        if not (force_update or self.dc.isModified()):
            return

        if pidman is None:
            logger.warning("Pidman client does not exist.")
            return

        try:
            pidman_label = pidman.get_ark(self.noid)['name']
            # only push an update when the title is actually different
            if self.dc.content.title != pidman_label:
                pidman.update_ark(noid=self.noid,
                                  name=self.dc.content.title)

        except KeyError:
            # the ark record returned by Pidman has no 'name' entry
            logger.error(
                "Object %s doesn't have a name attribute in Pidman.",
                self.noid)

        except urllib2.HTTPError as e:
            # e.g. 401, 404
            logger.error(
                "Object %s errored out in Pidman HTTP requests. "
                "HTTP status code: %s", self.noid, e.code)

        except Exception as e:
            # deliberate best-effort: a Pidman failure should not
            # propagate to the caller
            logger.error(
                "Object %s errored out in Pidman. Error message: %s",
                self.noid, e)

    @property
    def content_md5(self):
        '''MD5 checksum recorded for the first file in the file technical
        metadata, or None if there is no file entry or no MD5 value.'''
        if self.filetech.content.file and self.filetech.content.file[0].md5:
            return self.filetech.content.file[0].md5
        return None

    def index_data(self):
        '''Extend the default
        :meth:`eulfedora.models.DigitalObject.index_data` method to
        include additional fields specific to Keep Arrangement
        objects.  Includes collection information, arrangement id,
        content MD5, access status, and any simple collections that
        list this object as a member.

        :returns: dictionary of index data
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        repo = Repository()  # FIXME: use relation from current object instead

        # FIXME: is it worth splitting out descriptive index data here?
        data = super(ArrangementObject, self).index_data()

        if self.has_model(boda.EmailMessage.EMAIL_MESSAGE_CMODEL) or \
           self.has_model(boda.Mailbox.MAILBOX_CONTENT_MODEL):
            data['object_type'] = 'email'
        # elif self.has_model(boda.RushdieFile.RUSHDIE_FILE_CMODEL):
        # data['object_type'] = 'file'
        else:
            # generic fallback
            data['object_type'] = 'born-digital'

        # Collection Info: the deprecated isMemberOf relation takes
        # precedence over isMemberOfCollection when both are present
        if self._deprecated_collection:
            collection = self._deprecated_collection
        elif self.collection:
            collection = self.collection
        else:
            collection = None

        if collection and collection.exists:

            # collection_source_id
            if collection.mods.content.source_id is not None:  # allowed to be 0
                data[
                    'collection_source_id'] = collection.mods.content.source_id
            data['collection_id'] = collection.pid
            try:
                # pull collection label directly from fedora
                data['collection_label'] = collection.label

                # FIXME: this shouldn't be indexed here; are we actually
                # using it anywhere?
                # if collection.collection:
                #     data['archive_id'] = collection.collection.uri
                #     data['archive_label'] = collection.collection.label

            except RequestFailed as rf:
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s',
                    rf)

        # Arrangement unique id
        try:
            if self.filetech.content.file:
                if self.filetech.content.file[0].local_id:
                    data["arrangement_id"] = self.filetech.content.file[
                        0].local_id
                if self.filetech.content.file[0].md5:
                    data['content_md5'] = self.filetech.content.file[0].md5
        except Exception as e:
            # pass the values as logging arguments: applying % to a single
            # value with two placeholders raised TypeError from inside
            # this handler
            logger.error(
                "Error getting arrangement id or content MD5 for %s: %s",
                self.pid, e)

        # rights access status code
        if self.rights.content.access_status:
            data['access_code'] = self.rights.content.access_status.code
            # normally this should be picked up via dc:rights, but arrangement
            # objects don't seem to have DC fields populated
            # NOTE: migrated items don't seem to have rights text set
            if self.rights.content.access_status.text:
                data['rights'] = self.rights.content.access_status.text

        # get simple collections that have an association with this object.
        # The result lists are initialized before the try so they are still
        # defined if the risearch query itself fails.
        sc_ids = []
        sc_labels = []
        try:
            simple_collections = list(
                repo.risearch.get_subjects(relsext.hasMember, self.uriref))

            for sc in simple_collections:
                obj = repo.get_object(pid=sc, type=repo.infer_object_subtype)
                if isinstance(obj, SimpleCollection):
                    sc_ids.append("info:fedora/%s" % obj.pid)
                    sc_labels.append(obj.label)
        except RequestFailed as rf:
            logger.error('Error accessing simpleCollection in Fedora: %s', rf)

        if sc_ids:
            data["simpleCollection_id"] = sc_ids
        if sc_labels:
            data["simpleCollection_label"] = sc_labels

        return data

    @staticmethod
    def by_arrangement_id(id, repo=None):
        '''
        Static method to find an :class:`ArrangementObject` by its
        local or arrangement id.  Looks for the item in Solr and
        returns an :class:`ArrangementObject` instance initialized
        from the repository if a single match is found for the
        requested id.

        Raises :class:`django.core.exceptions.MultipleObjectsReturned`
        if more than one match is found; raises
        :class:`django.core.exceptions.ObjectDoesNotExist` if no
        matches are found in the Solr index.

        :param id: arrangement id or local id

        :param repo: optional :class:`eulfedora.server.Repository`
            to use an existing connection with specific credentials

        :returns: :class:`ArrangementObject`
        '''
        solr = solr_interface()
        q = solr.query(arrangement_id=id,
                       content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL) \
                .field_limit('pid')

        # check that we found one and only one
        found = len(q)
        # borrowing custom django exceptions for not found / too many
        # matches
        if found > 1:
            raise MultipleObjectsReturned(
                'Found %d records with arrangement id %s' % (found, id))
        if not found:
            raise ObjectDoesNotExist('No record found with arrangement id %s' %
                                     id)

        if repo is None:
            repo = Repository()

        return repo.get_object(q[0]['pid'], type=ArrangementObject)
Exemplo n.º 5
0
class EmailMessage(boda.EmailMessage, ArrangementObject):
    '''Arrangement object for a single email message; carries both the
    email message and the arrangement content models.'''

    CONTENT_MODELS = [
        boda.EmailMessage.EMAIL_MESSAGE_CMODEL,
        boda.Arrangement.ARRANGEMENT_CONTENT_MODEL,
    ]

    NEW_OBJECT_VIEW = 'arrangement:view'

    # messages are related to mailbox via is part of
    mailbox = Relation(relsext.isPartOf, type=Mailbox)

    @property
    def headers(self):
        '''
        CERP headers as a dictionary of header name to header value.
        '''
        return {hdr.name: hdr.value for hdr in self.cerp.content.headers}

    def get_original_datastream(self):
        '''Return the ``MIME`` datastream object; for email content the
        MIME datastream is the original content.'''
        mime_ds = self.getDatastreamObject('MIME')
        return mime_ds

    def email_label(self):
        '''
        Return the object label if one is set; otherwise build a label
        from the first sender, first recipient, first subject and the
        ``Date`` header stored in :attr:`EmailMessage.cerp.content`.

        Falls back to a generic label when the CERP data is missing or
        lists neither a sender nor a recipient.
        '''
        existing_label = self.label
        if existing_label:
            return existing_label

        # if cerp is not present or has no data, return a generic label
        if not self.cerp or not self.cerp.content:
            return 'Email message'
        if not (self.cerp.content.from_list or self.cerp.content.to_list):
            return 'Email message'

        # sender & recipient should generally be present
        sender = 'unknown sender'
        if self.cerp.content.from_list:
            sender = self.cerp.content.from_list[0]

        recipient = 'unknown recipient'
        if self.cerp.content.to_list:
            recipient = self.cerp.content.to_list[0]
            # more than one recipient: only name the first
            if len(self.cerp.content.to_list) > 1:
                recipient = '%s et al.' % recipient

        label = 'Email from %s to %s' % (sender, recipient)

        if self.cerp.content.subject_list:
            label += ' %s' % self.cerp.content.subject_list[0]

        sent_date = self.headers.get('Date')
        if sent_date is not None:
            label += ' on %s' % sent_date

        return label

    def index_data(self):
        '''Extend :meth:`keep.arrangement.models.ArrangementObject.index_data`
        with data specific to email messages: the constructed email label
        and, when the MIME datastream exists, its checksum as the content
        MD5 and its ``message-id`` value as the arrangement id.
        '''
        index_fields = super(EmailMessage, self).index_data()
        index_fields['label'] = self.email_label()

        # email does not have filetech or content; use mime data checksum
        # for content md5
        if self.mime_data.exists:
            index_fields['content_md5'] = self.mime_data.checksum
            index_fields['arrangement_id'] = \
                self.mime_data.content.get('message-id')

        return index_fields

    @property
    def content_md5(self):
        '''Checksum of the MIME datastream.'''
        mime_datastream = self.mime_data
        return mime_datastream.checksum

    @staticmethod
    def find_by_field(field, value, repo=None):
        '''
        Static method to find a single :class:`EmailMessage` by an indexed
        value.  Queries Solr for arrangement objects where ``field``
        matches ``value`` and, if exactly one record matches, returns an
        :class:`EmailMessage` initialized from the repository.

        Raises :class:`django.core.exceptions.MultipleObjectsReturned`
        if more than one match is found; raises
        :class:`django.core.exceptions.ObjectDoesNotExist` if no
        matches are found in the Solr index.

        :param field: solr field to search
        :param value: value to search on in the specified field

        :param repo: optional :class:`eulfedora.server.Repository`
            to use an existing connection with specific credentials

        :returns: :class:`EmailMessage`
        '''
        criteria = {
            field: value,
            'content_model': ArrangementObject.ARRANGEMENT_CONTENT_MODEL,
        }
        results = solr_interface().query(**criteria).field_limit('pid')

        # exactly one match is required; borrow django's exceptions for
        # the too-many and not-found cases
        match_count = len(results)
        if match_count > 1:
            raise MultipleObjectsReturned(
                'Found %d records with %s %s' % (match_count, field, value))
        if match_count == 0:
            raise ObjectDoesNotExist(
                'No record found with %s %s' % (field, value))

        repository = Repository() if repo is None else repo
        return repository.get_object(results[0]['pid'], type=EmailMessage)

    @staticmethod
    def by_checksum(md5sum, repo=None):
        '''
        Static method to find an :class:`EmailMessage` by the content
        md5 checksum.  Wrapper around :meth:`EmailMessage.find_by_field`
        searching the ``content_md5`` field.

        :param md5sum: MD5 checksum to search for
        :param repo: optional repository connection, passed through
        '''
        return EmailMessage.find_by_field('content_md5', md5sum, repo=repo)

    @staticmethod
    def by_message_id(id, repo=None):
        '''
        Static method to find an :class:`EmailMessage` by its
        message id.  Wrapper around :meth:`EmailMessage.find_by_field`
        searching the ``arrangement_id`` field.

        :param id: message id to search for
        :param repo: optional repository connection, passed through
        '''
        return EmailMessage.find_by_field('arrangement_id', id, repo=repo)
Exemplo n.º 6
0
class AudioObject(DigitalObject):
    '''Fedora Audio Object.  Extends :class:`~eulfedora.models.DigitalObject`.'''
    AUDIO_CONTENT_MODEL = 'info:fedora/emory-control:EuterpeAudio-1.0'
    CONTENT_MODELS = [AUDIO_CONTENT_MODEL]
    NEW_OBJECT_VIEW = 'audio:view'

    allowed_mimetypes = ['audio/x-wav', 'audio/wav']

    mods = XmlDatastream("MODS",
                         "MODS Metadata",
                         AudioMods,
                         defaults={
                             'control_group': 'M',
                             'format': mods.MODS_NAMESPACE,
                             'versionable': True,
                         })
    'MODS :class:`~eulfedora.models.XmlDatastream` with content as :class:`AudioMods`'

    audio = FileDatastream("AUDIO",
                           "Audio datastream",
                           defaults={
                               'mimetype': 'audio/x-wav',
                               'versionable': True,
                           })
    'master audio :class:`~eulfedora.models.FileDatastream`'

    compressed_audio = FileDatastream("CompressedAudio",
                                      "Compressed audio datastream",
                                      defaults={
                                          'mimetype': 'audio/mpeg',
                                          'versionable': True,
                                      })
    'access copy of audio :class:`~eulfedora.models.FileDatastream`'

    digitaltech = XmlDatastream("DigitalTech",
                                "Technical Metadata - Digital",
                                DigitalTech,
                                defaults={
                                    'control_group': 'M',
                                    'versionable': True,
                                })
    '''digital technical metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`DigitalTech`'''

    sourcetech = XmlDatastream("SourceTech",
                               "Technical Metadata - Source",
                               SourceTech,
                               defaults={
                                   'control_group': 'M',
                                   'versionable': True,
                               })
    '''source technical metadata :class:`~eulfedora.models.XmlDatastream` with content as
    :class:`SourceTech`'''

    rights = XmlDatastream("Rights",
                           "Usage rights and access control metadata",
                           Rights,
                           defaults={
                               'control_group': 'M',
                               'versionable': True,
                           })
    '''access control metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`Rights`'''

    jhove = FileDatastream(
        "JHOVE",
        "JHOVE datastream",
        defaults={
            'mimetype': 'application/xml',
            'control_group': 'M',
            'versionable': True,
            'format': 'http://hul.harvard.edu/ois/xml/xsd/jhove/jhove.xsd',
        })
    'JHOVE technical metadata for the master audio :class:`~eulfedora.models.FileDatastream`'
    # JHOVE is xml, but treat it as a file for now since we're just storing it,
    # not doing any processing, updating, etc.

    # map datastream IDs to human-readable names for inherited history_events method
    component_key = {
        'AUDIO': 'audio (master)',
        'CompressedAudio': 'audio (access version)',
        'SourceTech': 'source technical metadata',
        'DigitalTech': 'digital technical metadata',
        'JHOVE': 'technical metadata',
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT':
        'collection membership',  # TODO: revise when/if we add more relations
    }

    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject that this object is a member of,
    via `isMemberOfCollection` relation.
    '''
    @property
    def content_md5(self):
        '''Checksum of the master audio datastream.'''
        master_audio = self.audio
        return master_audio.checksum

    def save(self, logMessage=None):
        '''Save the object.  The DC is regenerated first when the object
        is new, or when any of the :class:`~AudioObject.mods`,
        :class:`AudioObject.rels_ext`, :class:`AudioObject.digitaltech`
        or :class:`AudioObject.rights` datastreams has been modified.

        :param logMessage: optional log message
        :returns: the result of the parent class ``save``
        '''
        # DC is derivative metadata: rebuild it for a new item (one that
        # does not yet exist in Fedora) or when a source datastream changed
        dc_is_stale = (
            not self.exists
            or self.mods.isModified()
            or self.rels_ext.isModified()
            or self.digitaltech.isModified()
            or self.rights.isModified()
        )
        if dc_is_stale:
            self._update_dc()

        # for now, keep object label in sync with MODS title
        if self.mods.isModified() and self.mods.content.title:
            self.label = self.mods.content.title

        return super(AudioObject, self).save(logMessage)

    @models.permalink
    def get_absolute_url(self):
        'Absolute url to view this object within the site'
        # the permalink decorator receives a (view name, args) pair
        view_name = 'audio:view'
        view_args = [str(self.pid)]
        return (view_name, view_args)

    def get_access_url(self):
        """Absolute url to hear this object's access version, or None
        when the object has no compressed audio datastream."""
        # as of file migration (1.2), legacy DM access path is no longer needed
        if not self.compressed_audio.exists:
            return None

        url_args = [str(self.pid), self.access_file_extension()]
        return reverse('audio:download-compressed-audio', args=url_args)

    def access_file_extension(self):
        '''Return the expected file extension for the compressed audio
        datastream, based on the datastream mimetype: ``mp3`` for
        ``audio/mpeg`` and ``m4a`` for ``audio/mp4``.

        Returns None when there is no compressed audio datastream or its
        mimetype is neither of those.'''
        if not self.compressed_audio.exists:
            return None

        extension_by_mimetype = {
            'audio/mpeg': 'mp3',
            'audio/mp4': 'm4a',
        }
        return extension_by_mimetype.get(self.compressed_audio.mimetype)

    @property
    def conversion_result(self):
        '''Return the :class:`~eulcommon.djangoextras.taskresult.models.TaskResult`
        for the most recently created access copy conversion for this
        object's pid, or None if there are none.
        '''
        newest_first = TaskResult.objects.filter(object_id=self.pid) \
            .order_by('-created')
        if not newest_first:
            return None
        return newest_first[0]

    @property
    def researcher_access(self):
        '''Result of ``allow_researcher_access`` for the content of this
        object's rights datastream.'''
        rights_metadata = self.rights.content
        return allow_researcher_access(rights_metadata)

    def _update_dc(self):
        '''Update Dublin Core (derivative metadata) from the master
        metadata in order to keep data synchronized and make fields that
        need to be searchable accessible to the Fedora findObjects API
        method.

        Clears the DC identifier, creator, date, description and rights
        lists, then repopulates title, type, creators, dates and
        description from MODS and rights from the Rights access status.
        Also sets the object label to the MODS title when there is one.
        '''
        dc_xml = self.dc.content
        mods_xml = self.mods.content

        # identifiers: clear out any existing values
        del dc_xml.identifier_list

        # title (also used as the object label) and resource type
        if mods_xml.title:
            self.label = mods_xml.title
            dc_xml.title = mods_xml.title
        if mods_xml.resource_type:
            dc_xml.type = mods_xml.resource_type

        # creator names: clear out any existing names, then re-add
        del dc_xml.creator_list
        for mods_name in mods_xml.names:
            # for now, use unicode conversion as defined in mods.Name
            dc_xml.creator_list.append(unicode(mods_name))

        # dates: clear out anything previously in DC, then add the first
        # created date followed by the first issued date, when present
        del dc_xml.date_list
        if mods_xml.origin_info and \
           len(mods_xml.origin_info.created) and \
           mods_xml.origin_info.created[0].date:
            dc_xml.date_list.append(mods_xml.origin_info.created[0].date)
        if mods_xml.origin_info and \
           len(mods_xml.origin_info.issued) and \
           mods_xml.origin_info.issued[0].date:
            dc_xml.date_list.append(mods_xml.origin_info.issued[0].date)

        # descriptions: clear out and set from the MODS general note
        del dc_xml.description_list
        general_note = mods_xml.general_note
        if general_note and general_note.text:
            dc_xml.description_list.append(mods_xml.general_note.text)

        # rights: clear out and set from the Rights access status text
        del dc_xml.rights_list
        if self.rights.content.access_status:
            # access code no longer needs to be included, since we will not be searching
            dc_xml.rights_list.append(self.rights.content.access_status.text)

    def index_data(self):
        '''Build the index data for this audio object.

        Starts from the default
        :meth:`eulfedora.models.DigitalObject.index_data` output and adds
        the fields specific to Keep Audio objects: collection information,
        identifiers, selected descriptive, technical and rights metadata,
        and flags describing which datastreams are available.

        :returns: dictionary of index data
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        # FIXME: is it worth splitting out descriptive index data here?
        data = super(AudioObject, self).index_data()
        data['object_type'] = 'audio'

        if self.collection and self.collection.exists:
            # collection_source_id  (0 is an allowable id, so check not None)
            source_id = self.collection.mods.content.source_id
            if source_id is not None:
                data['collection_source_id'] = source_id

            # FIXME: previously indexing URI; is this needed for any reason or can we
            # use pid?  (needs to match collection index pid field for solr join)
            # data['collection_id'] = self.collection.uri
            data['collection_id'] = self.collection.pid
            try:
                # pull parent & archive collection objects directly from fedora
                parent = CollectionObject(self.api, self.collection.uri)
                data['collection_label'] = parent.label
                # NB: as of 2011-08-23, eulindexer doesn't support automatic
                # reindexing of audio objects when their collection changes.
                # as a result, archive_id and archive_label may be stale.
                # disable indexing them until eulindexer supports those
                # chained updates.
                #data['archive_id'] = parent.collection_id
                #archive = CollectionObject(self.api, parent.collection_id)
                #data['archive_label'] = archive.label
            except RequestFailed as rf:
                message = \
                    'Error accessing collection or archive object in Fedora: %s' \
                    % rf
                logger.error(message)

        mods = self.mods.content

        # resolvable ARK, when one is available
        ark_uri = mods.ark_uri
        if ark_uri:
            data['ark_uri'] = ark_uri

        # old identifiers from previous digital masters
        dm1_ids = [
            old_id for old_id in (mods.dm1_id, mods.dm1_other_id) if old_id
        ]
        if dm1_ids:
            data['dm1_id'] = dm1_ids

        # digitization purpose, if not empty
        digitaltech = self.digitaltech.content
        if digitaltech.digitization_purpose_list:
            # convert nodelist to a normal list that can be serialized as json
            data['digitization_purpose'] = list(
                digitaltech.digitization_purpose_list)

        # related files (again converted to a plain list)
        sourcetech = self.sourcetech.content
        if sourcetech.related_files_list:
            data['related_files'] = list(sourcetech.related_files_list)

        # part note
        part_note = mods.part_note
        if part_note and part_note.text:
            data['part'] = part_note.text

        # sublocation
        sublocation = sourcetech.sublocation
        if sublocation:
            data['sublocation'] = sublocation

        rights = self.rights.content
        # rights access status code
        access_status = rights.access_status
        if access_status:
            data['access_code'] = access_status.code
        # copyright date from rights metadata
        copyright_date = rights.copyright_date
        if copyright_date:
            data['copyright_date'] = copyright_date
        # ip note from rights metadata
        ip_note = rights.ip_note
        if ip_note:
            data['ip_note'] = ip_note

        # boolean values that should always be available:
        # should this item be accessible to researchers?
        # (if None, we want False)
        data['researcher_access'] = bool(self.researcher_access)
        # flags to indicate which datastreams are available
        data['has_access_copy'] = self.compressed_audio.exists
        data['has_original'] = self.audio.exists

        if self.compressed_audio.exists:
            data['access_copy_size'] = self.compressed_audio.size
            data['access_copy_mimetype'] = self.compressed_audio.mimetype

        duration = digitaltech.duration
        if duration:
            data['duration'] = duration

        # issued and created dates, each indexed as a list of unicode
        # strings when present and not empty
        origin_info = mods.origin_info
        if origin_info:
            date_fields = (
                ('date_issued', 'issued'),
                ('date_created', 'created'),
            )
            for index_key, field in date_fields:
                dates = getattr(origin_info, field)
                if dates and not dates.is_empty():
                    data[index_key] = [unicode(date) for date in dates]

        if self.audio.exists:
            data['content_md5'] = self.audio.checksum

        return data

    @staticmethod
    def init_from_file(filename,
                       initial_label=None,
                       request=None,
                       checksum=None,
                       mimetype=None):
        '''Static method to create a new :class:`AudioObject` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.  Calculates and stores the duration
        based on the file. Also sets the following default metadata values:

            * mods:typeOfResource = "sound recording"
            * dt:codecQuality = "lossless"

        The audio datastream label is the initial label with a trailing
        ``.wav`` extension (any case) removed; labels without that extension
        are used unchanged.

        :param filename: full path to the audio file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param checksum: the checksum of the file being sent to fedora.
        :param mimetype: optional mimetype for the audio datastream; only
            set on the datastream when specified
        :returns: :class:`AudioObject` initialized from the file
        '''
        if initial_label is None:
            initial_label = os.path.basename(filename)
        repo = Repository(request=request)
        obj = repo.get_object(type=AudioObject)
        # set initial object label from the base filename
        obj.label = initial_label
        obj.dc.content.title = obj.mods.content.title = obj.label
        # Audio is binary data, so open in binary mode to avoid any newline
        # translation or decoding.  The handle is deliberately left open:
        # the datastream content is read later, when the object is saved.
        # FIXME: at what point does/should this get closed?
        obj.audio.content = open(filename, 'rb')
        # Set the file checksum, if set.
        obj.audio.checksum = checksum
        # set content datastream mimetype if passed in
        if mimetype is not None:
            obj.audio.mimetype = mimetype
        # Datastream label is the label minus the ".wav" (mimetype indicates
        # that).  Only strip when the extension is actually present, so
        # labels that are not file names are not truncated.
        wav_ext = '.wav'
        if initial_label.lower().endswith(wav_ext):
            obj.audio.label = initial_label[:-len(wav_ext)]
        else:
            obj.audio.label = initial_label
        # set initial mods:typeOfResource - all AudioObjects default to sound recording
        obj.mods.content.resource_type = 'sound recording'
        # set codec quality to lossless in digital tech metadata
        # - default for AudioObjects, should only accept lossless audio for master file
        obj.digitaltech.content.codec_quality = 'lossless'
        # get wav duration and store in digital tech metadata
        obj.digitaltech.content.duration = '%d' % round(wav_duration(filename))

        return obj

    @staticmethod
    def all():
        'Find all Audio objects by content model within the configured pidspace.'
        # restrict to objects in configured pidspace
        pidspace_filter = '%s:*' % settings.FEDORA_PIDSPACE
        # restrict by cmodel in dc:format
        cmodel_filter = AudioObject.AUDIO_CONTENT_MODEL
        repo = Repository()
        return repo.find_objects(type=AudioObject,
                                 pid__contains=pidspace_filter,
                                 format__contains=cmodel_filter)
Exemplo n.º 7
0
class SimpleCollection(Collectionv1_0, ArkPidDigitalObject):
    '''Simple collection object.

    Originally described as a DC-only collection, but it does carry a
    MODS datastream and an rdf:type relation (see the FIXME below).
    '''

    NEW_OBJECT_VIEW = 'collection:simple_edit'

    # FIXME: why do we have mods on a dc-only collection ?
    mods = XmlDatastream(
        'MODS',
        'Descriptive Metadata (MODS)',
        MODS,
        defaults={
            'control_group': 'M',
            'format': mods.MODS_NAMESPACE,
            'versionable': True,
        })
    '''MODS :class:`~eulfedora.models.XmlDatastream` with content as
    :class:`eulxml.xmlmap.mods.MODS`; versionable, datastream ID
    ``MODS``'''

    type = Relation(RDF.type)

    def __init__(self, *args, **kwargs):
        '''Initialize as usual via the parent classes; when a new object is
        being created, also set its rdf:type to ``REPO.SimpleCollection``.'''
        super(SimpleCollection, self).__init__(*args, **kwargs)

        # existing objects keep whatever type they already have; whether
        # this is a new object is determined by the base class
        if not self._create:
            return
        self.type = REPO.SimpleCollection

    def index_data(self):
        '''Extend the default
        :meth:`eulfedora.models.DigitalObject.index_data`
        output with the fields specific to Keep SimpleCollection
        objects: the object type and, when set, the rdf type.'''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        # FIXME: is it worth splitting out descriptive index data here?
        data = super(SimpleCollection, self).index_data()
        # FIXME: do we need to differentiate collection vs. simple collection ?
        data['object_type'] = 'collection'

        rdf_type = self.type
        if rdf_type:
            data['type'] = rdf_type

        return data

    @staticmethod
    def find_by_pid(pid):
        '''Find a collection by pid in Solr.

        :param pid: collection pid; an ``info:fedora/`` uri is also accepted
        :returns: the Solr result for the collection when exactly one
            match is found, otherwise None
        '''
        # NOTE: this method added as a replacement for
        # get_cached_collection_dict that was used elsewhere
        # throughout the site (audio app, etc.)  It should probably be
        # consolidated with other find methods...

        uri_prefix = 'info:fedora/'
        if pid.startswith(uri_prefix):  # allow passing in uri
            pid = pid[len(uri_prefix):]

        query = solr_interface().query(
            content_model=SimpleCollection.COLLECTION_CONTENT_MODEL,
            pid=pid)
        matches = query.execute()
        if len(matches) != 1:
            return None
        return matches[0]

    @staticmethod
    def simple_collections():
        """Find simpleCollection objects via Solr, searching by the
        collection content model and the SimpleCollection type.

        :returns: Solr response for the query (up to 1000 results);
            described in the original code as a list of dict with
            collection info
        :rtype: list
        """
        query = solr_interface().query(
            content_model=SimpleCollection.COLLECTION_CONTENT_MODEL,
            type=REPO.SimpleCollection)

        # by default, only returns 10; get everything
        # use dictsort and regroup in templates for sorting where appropriate
        everything = query.paginate(start=0, rows=1000)
        return everything.execute()

    @property
    def total_members(self):
        'Number of members related to this collection via hasMember.'
        return len(self.member_pids)

    @property
    def member_pids(self):
        'List of the objects of the hasMember relations in RELS-EXT.'
        related = self.rels_ext.content.objects(self.uriref,
                                                relsext.hasMember)
        return list(related)

    @property
    def members(self):
        'List with one generic :class:`DigitalObject` per entry in :attr:`member_pids`.'
        objects = []
        for member_pid in self.member_pids:
            objects.append(DigitalObject(self.api, pid=member_pid))
        return objects

    @models.permalink
    def get_absolute_url(self):
        'Absolute url to view this object within the site'
        url_args = [str(self.pid)]
        return (self.NEW_OBJECT_VIEW, url_args)

    @property
    def content_md5(self):
        'Always None for simple collections.'
        return None
Exemplo n.º 8
0
class DiskImage(DigitalObject):
    '''Fedora object for Disk Images.  Extends :class:`~keep.common.fedora.DigitalObject`.

    Defines MODS, content, Rights and provenanceMetadata datastreams, plus
    relations to the owning collection and to original/migrated disk images.
    '''

    # NOTE about datastream naming conventions
    # Where a corresponding datastream id already exists within the Keep
    # (i.e. MODS for mods metadata), the same datastream id is used.
    # Where a Keep datastream id does not already exist (e.g., Premis), the id
    # follows Hydra content model conventions, based on the generic simple
    # Hydra content model.
    # For documentation on Hydra content models, see:
    #   https://wiki.duraspace.org/display/hydra/Hydra+objects%2C+content+models+%28cModels%29+and+disseminators
    #   https://wiki.duraspace.org/display/hydra/Hydra+objects%2C+content+models+%28cModels%29+and+disseminators#Hydraobjects%2Ccontentmodels%28cModels%29anddisseminators-genericContent

    #: content model for disk image objects
    DISKIMAGE_CONTENT_MODEL = 'info:fedora/emory-control:DiskImage-1.0'
    CONTENT_MODELS = [DISKIMAGE_CONTENT_MODEL]
    #: name of the view used by :meth:`get_absolute_url`
    NEW_OBJECT_VIEW = 'file:view'

    #: mimetypes that are treated as disk image content
    diskimage_mimetypes = [
        'application/x-aff',  # AFF, advanced forensic format
        'application/x-ad1',  # AD1, proprietary disk image format
        'application/x-iso9660-image',  # ISO
        'application/x-ewf',  # E01 Expert Witness Format
        'application/x-tar',  # tar file
        'application/mbox'  # mbox (? may require extra magic file entries)
    ]

    # mapping of mimetype to format label to insert in Premis
    mimetype_format = {
        'application/x-aff': 'AFF',
        'application/x-ad1': 'AD1',
        'application/x-iso9660-image': 'ISO',
        'application/x-ewf': 'E01',
        'application/x-tar': 'TAR',
        'application/mbox': 'MBOX'
    }

    #: disk image mimetypes plus the generic values allowed for upload
    allowed_mimetypes = ['', 'application/octet-stream'] + diskimage_mimetypes
    # NOTE: empty type and application/octet-stream are required for javascript upload,
    # because the browser does not detect any mimetype at all for AFF and AD1 files
    # and detects ISO as the generic application/octet-stream
    # NOTE: Mimetypes for AD1 and AFF are custom mimetypes and must be configured
    # in your local magic files.  See the deploy notes for more information.

    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject that this object belongs to,
    via `isMemberOfCollection` relation.
    '''

    #: original DiskImage object that this DiskImage is related to, if
    #: this is a migrated object; related via fedora-rels-ext isDerivationOf
    original = Relation(relsext.isDerivationOf, type='self')
    #: migrated DiskImage object that supersedes this object, if a
    #: migration has occurred; related via fedora-rels-ext hasDerivation
    migrated = Relation(relsext.hasDerivation, type='self')

    mods = XmlDatastream("MODS",
                         "MODS Metadata",
                         DiskImageMods,
                         defaults={
                             'control_group': 'M',
                             'format': mods.MODS_NAMESPACE,
                             'versionable': True,
                         })
    '''descriptive metadata as MODS - :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`LocalMods`'''
    # note: the datastream content class passed in above is DiskImageMods
    # (the attribute docstring still refers to LocalMods)

    content = FileDatastream("content",
                             "Master disk image file",
                             defaults={
                                 'versionable': False,
                             })
    'master disk image binary content as :class:`~eulfedora.models.FileDatastream`'
    # NOTE: could be one of a few allowed mimetypes

    rights = XmlDatastream("Rights",
                           "Usage rights and access control metadata",
                           Rights,
                           defaults={
                               'control_group': 'M',
                               'versionable': True,
                           })
    '''access control metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`Rights`'''

    provenance = XmlDatastream('provenanceMetadata',
                               'Provenance metadata',
                               DiskImagePremis,
                               defaults={'versionable': False})
    '''``provenanceMetadata`` datastream for PREMIS object metadata; datastream
    XML content will be an instance of :class:`eulxml.xmlmap.premis.Premis`.'''

    # map datastream IDs to human-readable names for inherited history_events method
    component_key = {
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT':
        'collection membership or last fixity check',  # TODO: revise as we add more relations
        'provenanceMetadata': 'provenance metadata',
    }

    def get_default_pid(self):
        '''Extend the common default pid logic to also store the ARK
        identifier from MODS as the id (type ``ark``) of the premis object.

        :returns: the pid returned by the parent implementation
        '''
        pid = super(DiskImage, self).get_default_pid()

        ark = self.mods.content.ark
        if ark:
            premis_object = self.provenance.content.object
            premis_object.id = ark
            premis_object.id_type = 'ark'

        return pid

    @property
    def has_supplemental_content(self):
        '''True when any datastream id on this object starts with
        ``supplement``, i.e. the object has supplemental file datastreams.

        .. Note:: only works on saved objects
        '''
        for dsid in self.ds_list.keys():
            if dsid.startswith('supplement'):
                return True
        return False

    @property
    def supplemental_content(self):
        '''Generator of datastream objects for every datastream whose id
        starts with ``supplement``.'''
        supplement_ids = (key for key in self.ds_list.keys()
                          if key.startswith('supplement'))
        for supplement_id in supplement_ids:
            yield self.getDatastreamObject(supplement_id)

    # Locally stored content checksum; defaults to None.  When set to a
    # non-empty value, the content_md5 property returns it in preference
    # to the checksum of the content datastream.
    _content_checksum = None
    '''Used as a workaround for Fedora 3.4 issue with file URIs and checksums
    and to support duplicate detection based on checksums, store
    content checksum without sending it to Fedora.'''

    @property
    def content_md5(self):
        '''Checksum for the disk image content: the locally stored
        ``_content_checksum`` when it is set, otherwise the checksum of
        the content datastream.'''
        local_checksum = self._content_checksum
        if local_checksum:
            return local_checksum
        return self.content.checksum

    # NOTE: auto-calculated information such as checksums stored in premis
    # will need to be updated anytime the master disk image datastream is updated
    # (will probably need to extend the save method for this)

    def save(self, logMessage=None):
        '''Save the object.  If this is a new object, or if the MODS,
        RELS-EXT or Rights datastream has been modified, the derivative
        Dublin Core metadata is regenerated first so it is saved as well.

        :param logMessage: optional log message
        :returns: the result of the parent class save method
        '''
        # DC is derivative metadata: refresh it for a new item (one that
        # does not yet exist in Fedora) OR when any of the datastreams it
        # is generated from has changed.
        dc_is_stale = (not self.exists
                       or self.mods.isModified()
                       or self.rels_ext.isModified()
                       or self.rights.isModified())
        if dc_is_stale:
            self._update_dc()

        return super(DiskImage, self).save(logMessage)

    def _update_dc(self):
        '''Regenerate the Dublin Core (derivative) metadata from the master
        metadata so the two stay synchronized and the searchable fields are
        accessible to the Fedora findObjects API method.

        Title, type, coverage and description are copied from MODS;
        dc:rights is set from the access status in the rights metadata.
        The object label is also kept in sync with the MODS title.
        '''
        # NOTE: borrowed almost completely from audio, with minor modifications
        # TODO: move to common code somewhere?

        dc = self.dc.content

        # identifiers: clear out any existing values (none are re-added here)
        del dc.identifier_list

        mods = self.mods.content

        # title
        title = mods.title
        if title:
            # not strictly DC, but also keep object label in sync with MODS title
            self.label = title
            dc.title = title
        resource_type = mods.resource_type
        if resource_type:
            dc.type = resource_type

        # coverage: replace with the covering date range, which is only
        # set when both a start and an end date are present
        del dc.coverage_list
        if mods.coveringdate_start and mods.coveringdate_end:
            # FIXME: not sure the best way to indicate date range here
            date_range = '%s:%s' % (mods.coveringdate_start,
                                    mods.coveringdate_end)
            dc.coverage_list.append(date_range)

        # description: replace with the text of the MODS abstract
        del dc.description_list
        abstract = mods.abstract
        if abstract and abstract.text:
            dc.description_list.append(abstract.text)

        # rights: replace with the text of the rights access status
        del dc.rights_list
        access_status = self.rights.content.access_status
        if access_status:
            dc.rights_list.append(access_status.text)

    @staticmethod
    def init_from_file(filename,
                       initial_label=None,
                       request=None,
                       checksum=None,
                       mimetype=None,
                       content_location=None,
                       sha1_checksum=None):
        '''Static method to create a new :class:`DiskImage` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.  Also initializes the PREMIS
        provenance metadata with a placeholder identifier, MD5 and SHA-1
        checksums, and a format name.

        :param filename: full path to the disk image file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used.  A label
            ending in ``.aff``, ``.ad1`` or ``.iso`` (any case) has that
            extension removed.
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param checksum: the MD5 checksum of the file being sent to fedora;
            calculated from the file if not passed in.
        :param mimetype: the mimetype for the main disk image content;
            detected from the file if not passed in.
        :param content_location: optional file URI for file-based Fedora ingest;
            when not specified, the file itself is opened and set as the
            datastream content
        :param sha1_checksum: the SHA1 checksum of the file being sent to fedora,
            for storage in the PREMIS technical metadata. Note that SHA-1 will
            be calculated if not passed in (slow for large files).
        :returns: :class:`DiskImage` initialized from the file
        '''

        # if no checksum was passed in, calculate one
        if checksum is None:
            checksum = md5sum(filename)

        basename, ext = os.path.splitext(os.path.basename(filename))

        if initial_label is None:
            initial_label = basename
        # ajax upload passes original filename as initial label
        elif initial_label.lower().endswith(('.aff', '.ad1', '.iso')):
            # if initial label looks like a file, strip off the extension
            # for the object name/title
            # NOTE: also using extension from original filename
            # here because in some cases (under apache?) uploaded file
            # names do not have the original extension
            initial_label, ext = os.path.splitext(initial_label)

        repo = Repository(request=request)
        obj = repo.get_object(type=DiskImage)
        # set initial object label from the base filename
        obj.label = initial_label
        obj.mods.content.title = obj.label
        obj.dc.content.title = obj.label
        # set initial mods:typeOfResource - same for all Disk Images
        obj.mods.content.resource_type = 'software, multimedia'
        # set genre as born digital
        obj.mods.content.genres.append(
            mods.Genre(authority='aat', text='born digital'))

        # Set the file checksum
        obj.content.checksum = checksum
        # set mimetype
        if mimetype is None:
            # if no mimetype was passed in, determine from file;
            # discard any options that follow a ';' in the detected type
            m = magic.Magic(mime=True)
            mimetype = m.from_file(filename).partition(';')[0]
        obj.content.mimetype = mimetype

        # Set disk image datastream label to filename
        obj.content.label = initial_label

        # premis data
        obj.provenance.content.create_object()
        # NOTE: premis object id will be same as short-form ARK stored in MODS
        # It cannot be set until pid is minted, which will happen in get_default_pid,
        # but premis is order dependent so add a place-holder here
        obj.provenance.content.object.id_type = 'ark'
        obj.provenance.content.object.id = ''

        # object type required to be schema valid, must be in premis namespace
        obj.provenance.content.object.type = 'p:file'

        # composition level required for object characteristics; probably should be 0 (?)
        obj.provenance.content.object.composition_level = 0
        # store checksums in premis: MD5 (already calculated) and SHA-1
        # picky about order here too: force algorithm to be added first
        obj.provenance.content.object.checksums.append(
            PremisFixity(algorithm='MD5'))
        obj.provenance.content.object.checksums[0].digest = checksum
        # add sha-1 to checksums in premis; calculate if not passed in
        if sha1_checksum is None:
            sha1_checksum = sha1sum(filename)
        obj.provenance.content.object.checksums.append(
            PremisFixity(algorithm='SHA-1'))
        obj.provenance.content.object.checksums[1].digest = sha1_checksum

        obj.provenance.content.object.create_format()
        # set format based on mimetype
        if mimetype in DiskImage.mimetype_format:
            obj_format = DiskImage.mimetype_format[mimetype]
        else:
            # as a fallback, use the file extension for format
            obj_format = ext.upper().strip('.')
        obj.provenance.content.object.format.name = obj_format

        # if a content URI is specified (e.g. for large files), use that
        if content_location is not None:
            obj.content.ds_location = content_location
        # otherwise set the file as content to be posted
        else:
            # Disk images are binary data, so open in binary mode to avoid
            # any newline translation or decoding.  The handle is
            # deliberately left open: the content is read when the object
            # is saved.
            # FIXME: at what point does/should this file get closed?
            obj.content.content = open(filename, 'rb')

        # descriptive/technical metadata todo

        return obj

    @staticmethod
    def init_from_bagit(path, request=None, file_uri=True):
        '''Static method to create a new :class:`DiskImage` instance from
        a BagIt.  Sets the object label and metadata title based on the
        name of the bag, and looks for a supported disk image file type
        (e.g. AFF or AD1) to use as the content datastream for the object.
        Content checksum is pulled from the BagIt metadata, and repository
        ingest will be done via file URIs based on configured
        **LARGE_FILE_STAGING_DIR** and **LARGE_FILE_STAGING_FEDORA_DIR**
        to better support ingesting large files (unless file_uri
        is False).

        Every payload file that is not a supported disk image is added as
        a supplemental datastream (``supplement0``, ``supplement1``, ...)
        with its file name as label and its MD5 from the bag as checksum.

        Raises an exception if BagIt is not valid or if it does not
        contain a supported disk image data file.  (Note: using fast validation
        without checksum calculation, to minimize the time required to ingest
        large files.)

        :param path: full path to the BagIt directory that contains
            a disk image file
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param file_uri: ingest BagIt data via file uris based on
            configured staging directories (default behavior)
            instead of uploading the content to Fedora

        :returns: :class:`DiskImage` initialized from the BagIt contents
        :raises bagit.BagValidationError: if the bag is not valid
        :raises Exception: if the bag has no supported disk image file, or
            if the MD5 or SHA-1 checksum for the disk image is missing
        '''

        def staged_file_uri(local_path):
            # Build the file URI used for ingest of a local file.
            location = 'file://%s' % urllib.quote(local_path)
            # if Fedora base path is different from locally mounted staging
            # directory, convert from local path to fedora server path
            if getattr(settings, 'LARGE_FILE_STAGING_FEDORA_DIR',
                       None) is not None:
                location = location.replace(
                    settings.LARGE_FILE_STAGING_DIR,
                    settings.LARGE_FILE_STAGING_FEDORA_DIR)
            return location

        bag = bagit.Bag(path)
        # NOTE: using fast validation here to avoid recalculating checksums
        # for very large files; only checksum compare will be done by fedora
        bag.validate(fast=True)  # raises bagit.BagValidationError if not valid

        # use the base name of the BagIt as initial object label; normalize
        # first so a trailing slash on the path does not give an empty name
        bag_name = os.path.basename(os.path.normpath(path))

        # identify disk image content file within the bag
        content_file = None
        diskimage_mimetype = None
        md5_checksum = None
        sha1_checksum = None
        # (bag-relative path, full path, mimetype) for each supplemental file;
        # the bag-relative path is kept for the BagIt metadata lookup and the
        # mimetype so it doesn't have to be recalculated
        supplements = []
        m = magic.Magic(mime=True)
        # loop through bag content looking for a supported disk image file
        for data_path in bag.payload_files():
            # path is relative to bag root dir
            filename = os.path.join(path, data_path)
            mimetype = m.from_file(filename).partition(';')[0]
            if mimetype in DiskImage.diskimage_mimetypes:
                checksum_err_msg = '%%s checksum not found for disk image %s' \
                    % os.path.basename(data_path)
                # require both MD5 and SHA-1 for disk image to ingest
                try:
                    md5_checksum = bag.entries[data_path]['md5']
                except KeyError:
                    raise Exception(checksum_err_msg % 'MD5')
                try:
                    sha1_checksum = bag.entries[data_path]['sha1']
                except KeyError:
                    raise Exception(checksum_err_msg % 'SHA-1')

                # this is the disk image content file
                # store file and mimetype for further initialization
                content_file = filename
                diskimage_mimetype = mimetype

            # any data file that is not a disk image should be assumed
            # to be a supplemental file
            else:
                supplements.append((data_path, filename, mimetype))

        # no disk image data found
        if content_file is None:
            raise Exception('No disk image content found in %s' % bag_name)

        optional_args = {}
        if file_uri:
            optional_args['content_location'] = staged_file_uri(content_file)

        img = DiskImage.init_from_file(content_file,
                                       initial_label=bag_name,
                                       checksum=md5_checksum,
                                       mimetype=diskimage_mimetype,
                                       request=request,
                                       sha1_checksum=sha1_checksum,
                                       **optional_args)

        for i, (data_path, sfile, mimetype) in enumerate(supplements):
            dsid = 'supplement%d' % i
            dsobj = img.getDatastreamObject(dsid,
                                            dsobj_type=FileDatastreamObject)
            dsobj.label = os.path.basename(sfile)
            dsobj.mimetype = mimetype
            # look up the checksum by the path *within* the bag, as yielded
            # by the bag itself, rather than reconstructing it from the
            # full path
            dsobj.checksum = bag.entries[data_path]['md5']
            logger.debug(
                'Adding supplemental dastream %s label=%s mimetype=%s checksum=%s',
                dsid, dsobj.label, dsobj.mimetype, dsobj.checksum)

            if file_uri:
                dsobj.ds_location = staged_file_uri(sfile)
            else:
                # will probably only work for small/test content;
                # read as binary and close the file as soon as it is read
                with open(sfile, 'rb') as supplement:
                    dsobj.content = supplement.read()

        return img

    @models.permalink
    def get_absolute_url(self):
        '''Absolute url to view this disk image within the site.

        Hands the ``models.permalink`` decorator a ``(view name, args)``
        pair; the single argument is this object's pid as a string.
        '''
        view_args = [str(self.pid)]
        return (DiskImage.NEW_OBJECT_VIEW, view_args)

    def index_data(self):
        '''Build the index data for this disk image.

        Starts from the dictionary returned by
        :meth:`eulfedora.models.DigitalObject.index_data` and adds the
        fields specific to Keep disk images: object type, collection
        information, ARK, content checksum, latest fixity check,
        content format and size, and the pid of the original object
        when there is one.

        :returns: dictionary of index fields
        '''
        # NOTE: index data must not depend on other objects already being
        # indexed in Solr, so related object info is read straight from
        # the related objects rather than looked up in Solr

        index = super(DiskImage, self).index_data()
        # FIXME: is born-digital type still needed for anything? perms?
        # (object_type was previously 'born-digital'); eventually we will
        # need to distinguish between kinds of born digital content
        index['object_type'] = 'disk image'

        coll = self.collection
        if coll and coll.exists:
            # 0 is an allowable source id, so compare against None
            # instead of relying on truthiness
            source_id = coll.mods.content.source_id
            if source_id is not None:
                index['collection_source_id'] = source_id

            index['collection_id'] = coll.pid
            index['collection_label'] = coll.label

        # include resolvable ARK if available
        ark_uri = self.mods.content.ark_uri
        if ark_uri:
            index['ark_uri'] = ark_uri

        content_checksum = self.content.checksum
        if content_checksum:
            index['content_md5'] = content_checksum

        # TODO: index rights metadata (access code, copyright date,
        # ip note) the same way audio does, once rights editing is
        # available for disk images

        premis = self.provenance.content
        fixity_checks = premis.fixity_checks
        if fixity_checks:
            # only the most recent fixity check is indexed
            latest_check = fixity_checks[-1]
            index['last_fixity_check'] = latest_check.date
            index['last_fixity_result'] = latest_check.outcome

        # store disk image format and size
        # - some disk images (i.e., objects migrated from AD1/AFF)
        #   have two sets of object characteristics; the format wanted
        #   is the last one listed
        premis_object = premis.object
        if premis_object and premis_object.latest_format:
            index['content_format'] = premis_object.latest_format.name

        index['content_size'] = self.content.size

        original = self.original
        if original:
            index['original_pid'] = original.pid

        return index
Exemplo n.º 9
0
class Video(DigitalObject):
    '''Fedora Video Object.  Extends :class:`~eulfedora.models.DigitalObject`.

    Holds a master video file and a compressed access copy as file
    datastreams, alongside descriptive (MODS), digital and source
    technical, provenance (PREMIS) and rights metadata as XML
    datastreams.
    '''
    # content model URI identifying video objects
    VIDEO_CONTENT_MODEL = 'info:fedora/emory-control:Video-1.0'
    CONTENT_MODELS = [VIDEO_CONTENT_MODEL]
    # view name; same value get_absolute_url returns for url resolution
    NEW_OBJECT_VIEW = 'video:view'

    # Mimetypes accepted for the master file, mapped to a file extension.
    # There are several mimetypes for MPEG files
    allowed_master_mimetypes = {
        'video/quicktime': 'mov',
        'video/x-dv': 'dv',
        'video/mpeg': 'mpg',
        'video/x-m4v': 'm4v',
        'video/x-msvideo': 'avi'
    }
    # Mimetypes accepted for the access copy, mapped to a file extension
    allowed_access_mimetypes = {'video/mp4': 'mp4'}

    # descriptive metadata; ``mods`` in the defaults below still refers
    # to the module-level ``mods`` name, since the class attribute is
    # only bound once this statement completes
    mods = XmlDatastream("MODS",
                         "MODS Metadata",
                         VideoMods,
                         defaults={
                             'control_group': 'M',
                             'format': mods.MODS_NAMESPACE,
                             'versionable': True,
                         })

    digitaltech = XmlDatastream("DigitalTech",
                                "Technical Metadata - Digital",
                                VideoDigitalTech,
                                defaults={
                                    'control_group': 'M',
                                    'versionable': True,
                                })
    '''digital technical metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`DigitalTech`'''

    # NOTE(review): the string below describes the ``mods`` datastream but
    # sits after ``digitaltech``; it looks misplaced and probably belongs
    # directly after the ``mods`` definition above
    'MODS :class:`~eulfedora.models.XmlDatastream` with content as :class:`VideoMods`'

    content = FileDatastream("VIDEO",
                             "Video datastream",
                             defaults={
                                 'versionable': True,
                             })
    'master video :class:`~eulfedora.models.FileDatastream`'

    provenance = XmlDatastream('provenanceMetadata',
                               'Provenance metadata',
                               VideoPremis,
                               defaults={'versionable': False})
    '''``provenanceMetadata`` datastream for PREMIS object metadata; datastream
    XML content will be an instance of :class:`eulxml.xmlmap.premis.Premis`.'''

    access_copy = FileDatastream("CompressedVideo",
                                 "Compressed video datastream",
                                 defaults={
                                     'mimetype': 'video/mp4',
                                     'versionable': True,
                                 })
    'access copy of video :class:`~eulfedora.models.FileDatastream`'

    sourcetech = XmlDatastream("SourceTech",
                               "Technical Metadata - Source",
                               VideoSourceTech,
                               defaults={
                                   'control_group': 'M',
                                   'versionable': True,
                               })
    '''source technical metadata :class:`~eulfedora.models.XmlDatastream` with content as
    :class:`SourceTech`'''

    rights = XmlDatastream("Rights",
                           "Usage rights and access control metadata",
                           Rights,
                           defaults={
                               'control_group': 'M',
                               'versionable': True,
                           })
    '''access control metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`Rights`'''

    # map datastream IDs to human-readable names for inherited history_events method
    # NOTE(review): the master datastream is declared above with id "VIDEO",
    # but the key here is 'Video'; if lookups are by exact datastream id the
    # master entry would never match -- confirm against history_events
    component_key = {
        'Video': 'video (master)',
        'CompressedVideo': 'video (access version)',
        'SourceTech': 'source technical metadata',
        'DigitalTech': 'digital technical metadata',
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT': 'collection membership',
    }

    # collection membership, stored in RELS-EXT as isMemberOfCollection
    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject that this object is a member of,
    via `isMemberOfCollection` relation.
    '''
    # locally held checksum; read by the content_md5 property, which
    # falls back to the content datastream checksum when this is unset
    _content_checksum = None
    '''Used as a workaround for Fedora 3.4 issue with file URIs and checksums
    and to support duplicate detection based on checksums, store
    content checksum without sending it to Fedora.'''
    @property
    def content_md5(self):
        '''MD5 checksum for the master content: the locally stored
        checksum when one is set, otherwise the checksum of the content
        datastream.'''
        local_checksum = self._content_checksum
        if local_checksum:
            return local_checksum
        return self.content.checksum

    def get_default_pid(self):
        '''Extend the inherited default pid logic so that, when the MODS
        has an ARK, the same ARK is also recorded as the identifier of
        the PREMIS object.

        :returns: the pid returned by the parent implementation
        '''
        new_pid = super(Video, self).get_default_pid()

        ark = self.mods.content.ark
        if ark:
            premis = self.provenance.content
            premis.create_object()
            premis.object.id = ark
            premis.object.id_type = 'ark'

        return new_pid

    def save(self, logMessage=None):
        '''Save the object.  The DC datastream is regenerated first when
        the object is new or when any of the :class:`~Video.mods`,
        :class:`Video.rels_ext`, :class:`Video.digitaltech` or
        :class:`Video.rights` datastreams has been modified.

        :param logMessage: optional log message
        :returns: whatever the parent ``save`` implementation returns
        '''
        # DC is derivative metadata: refresh it for a new item (not yet
        # in Fedora) or whenever one of its source datastreams changed.
        # Datastreams are looked up lazily, in order, so checking stops
        # at the first one that reports a modification.
        dc_sources = ('mods', 'rels_ext', 'digitaltech', 'rights')
        if not self.exists or any(
                getattr(self, dsname).isModified() for dsname in dc_sources):
            self._update_dc()

        # for now, keep object label in sync with MODS title
        if self.mods.isModified():
            mods_title = self.mods.content.title
            if mods_title:
                self.label = mods_title

        return super(Video, self).save(logMessage)

    #
    @models.permalink
    def get_absolute_url(self):
        '''Absolute url to view this video within the site.

        Hands the ``models.permalink`` decorator a ``(view name, args)``
        pair; the single argument is this object's pid as a string.
        '''
        view_args = [str(self.pid)]
        return ('video:view', view_args)

    def get_access_url(self):
        '''Url for downloading this object's access version (the
        compressed video).  Returns None when the object has no access
        copy datastream.'''
        if not self.access_copy.exists:
            return None
        pid_args = [str(self.pid)]
        return reverse('video:download-compressed-video', args=pid_args)

    def access_file_extension(self):
        '''File extension expected for this object's compressed video
        datastream, chosen by the datastream mimetype.

        Mimetypes not listed in ``allowed_access_mimetypes`` fall back to
        ``mp4``.  Returns None when the object has no access copy.
        '''
        access_ds = self.access_copy
        if not access_ds.exists:
            return None
        known_extensions = self.allowed_access_mimetypes
        return known_extensions.get(access_ds.mimetype, 'mp4')

    @property
    def researcher_access(self):
        '''Result of ``allow_researcher_access`` for this object's rights
        metadata.'''
        rights_metadata = self.rights.content
        return allow_researcher_access(rights_metadata)

    def _update_dc(self):
        '''Update Dublin Core (derivative metadata) based on master metadata
        from MODS, RELS-EXT, and digital tech metadata in order to keep data
        synchronized and make fields that need to be searchable accessible to
        Fedora findObjects API method.
         '''
        # identifiers
        del self.dc.content.identifier_list  # clear out any existing names

        # title
        if self.mods.content.title:
            self.label = self.mods.content.title
            self.dc.content.title = self.mods.content.title
        if self.mods.content.resource_type:
            self.dc.content.type = self.mods.content.resource_type

        # creator names
        del self.dc.content.creator_list  # clear out any existing names
        for name in self.mods.content.names:
            # for now, use unicode conversion as defined in mods.Name
            self.dc.content.creator_list.append(unicode(name))

        # clear out any dates previously in DC
        del self.dc.content.date_list
        if self.mods.content.origin_info and \
           len(self.mods.content.origin_info.created) and \
           self.mods.content.origin_info.created[0].date:
            self.dc.content.date_list.append(
                self.mods.content.origin_info.created[0].date)
        if self.mods.content.origin_info and \
           len(self.mods.content.origin_info.issued) and \
           self.mods.content.origin_info.issued[0].date:
            self.dc.content.date_list.append(
                self.mods.content.origin_info.issued[0].date)

        # clear out any descriptions previously in DC and set from MODS/digitaltech
        del self.dc.content.description_list
        if self.mods.content.general_note and \
           self.mods.content.general_note.text:
            self.dc.content.description_list.append(
                self.mods.content.general_note.text)

        # clear out any rights previously in DC and set contents from Rights accessStatus
        del self.dc.content.rights_list
        if self.rights.content.access_status:
            # access code no longer needs to be included, since we will not be searching
            self.dc.content.rights_list.append(
                self.rights.content.access_status.text)

    def index_data(self):
        '''Build the index data for this video.

        Starts from the dictionary returned by
        :meth:`eulfedora.models.DigitalObject.index_data` and adds the
        fields specific to Keep Video objects: collection information,
        identifiers, technical and rights metadata, datastream
        availability flags, dates, and master format and size.

        :returns: dictionary of index fields
        '''
        # NOTE: index data must not depend on other objects already being
        # indexed in Solr, so related object info is read straight from
        # Fedora rather than looked up in Solr

        index = super(Video, self).index_data()
        index['object_type'] = 'video'

        coll = self.collection
        if coll and coll.exists:
            # 0 is an allowable source id, so compare against None
            # instead of relying on truthiness
            source_id = coll.mods.content.source_id
            if source_id is not None:
                index['collection_source_id'] = source_id
            index['collection_id'] = coll.pid
            try:
                # load the collection object directly from fedora to
                # get its label
                parent = CollectionObject(self.api, coll.uri)
                index['collection_label'] = parent.label
            except RequestFailed as err:
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s'
                    % err)

        mods_xml = self.mods.content

        # include resolvable ARK if available
        ark_uri = mods_xml.ark_uri
        if ark_uri:
            index['ark_uri'] = ark_uri

        # TODO: may need more sections here if more metadata is added
        # old identifiers from previous digital masters
        dm1_ids = [old_id
                   for old_id in (mods_xml.dm1_id, mods_xml.dm1_other_id)
                   if old_id]
        if dm1_ids:
            index['dm1_id'] = dm1_ids

        # digitization purpose, if not empty; copied into a plain list
        # so it can be serialized as json
        digitaltech_xml = self.digitaltech.content
        purposes = digitaltech_xml.digitization_purpose_list
        if purposes:
            index['digitization_purpose'] = list(purposes)

        # sublocation
        sublocation = self.sourcetech.content.sublocation
        if sublocation:
            index['sublocation'] = sublocation

        # rights metadata: access status code, copyright date, ip note
        rights_xml = self.rights.content
        if rights_xml.access_status:
            index['access_code'] = rights_xml.access_status.code
        if rights_xml.copyright_date:
            index['copyright_date'] = rights_xml.copyright_date
        if rights_xml.ip_note:
            index['ip_note'] = rights_xml.ip_note

        # boolean values that should always be available:
        # researcher accessibility, then datastream availability flags
        index['researcher_access'] = bool(self.researcher_access)
        index['has_access_copy'] = self.access_copy.exists
        index['has_original'] = self.content.exists

        if self.access_copy.exists:
            index['access_copy_size'] = self.access_copy.info.size
            index['access_copy_mimetype'] = self.access_copy.mimetype

        duration = digitaltech_xml.duration
        if duration:
            index['duration'] = duration

        # issued and created dates, each only when present and non-empty
        origin_info = mods_xml.origin_info
        if origin_info:
            for date_field, index_key in (('issued', 'date_issued'),
                                          ('created', 'date_created')):
                dates = getattr(origin_info, date_field)
                if dates and not dates.is_empty():
                    index[index_key] = [unicode(d) for d in dates]

        # store master video format and size
        premis_object = self.provenance.content.object
        if premis_object and premis_object.format:
            index['content_format'] = premis_object.format.name
        index['content_size'] = self.content.size

        return index

    @staticmethod
    def init_from_file(master_filename,
                       initial_label=None,
                       request=None,
                       master_md5_checksum=None,
                       master_sha1_checksum=None,
                       master_location=None,
                       master_mimetype=None,
                       access_filename=None,
                       access_location=None,
                       access_md5_checksum=None,
                       access_mimetype=None):
        '''Static method to create a new :class:`Video` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.  Calculates and stores the duration
        based on the file. Also sets the following default metadata values:

            * mods:typeOfResource = "moving image"

        The access copy is optional: when neither ``access_filename`` nor
        ``access_location`` is given, the access copy datastream is left
        untouched.

        :param master_filename: full path to the master file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param master_md5_checksum: the MD5 checksum of the master file being sent to fedora.
        :param master_sha1_checksum: the sha-1 checksum of the master file being
            sent to fedora; calculated from the file when not specified
        :param master_location: optional file URI for file-based Fedora ingest of master file
        :param master_mimetype: the master_mimetype of the master file being sent to fedora
        :param access_filename: full path to the access file, as a string
        :param access_location: optional file URI for file-based Fedora ingest
            of the access file; takes precedence over ``access_filename``
        :param access_md5_checksum: the MD5 checksum of the access file being sent to fedora.
        :param access_mimetype: the mimetype of the access file being sent to fedora
        :returns: :class:`Video` initialized from the file
        :raises Exception: if the duration cannot be determined from the
            master file
        '''

        if initial_label is None:
            initial_label = os.path.basename(master_filename)
        repo = Repository(request=request)
        obj = repo.get_object(type=Video)
        # object label, DC title and MODS title all start out the same
        obj.label = initial_label
        obj.dc.content.title = obj.mods.content.title = obj.label
        # master file checksum (None when not supplied)
        obj.content.checksum = master_md5_checksum
        # set content datastream mimetype only if passed in
        if master_mimetype is not None:
            obj.content.mimetype = master_mimetype
        # datastream label is the object label minus the extension
        # (the mimetype already conveys the file type)
        obj.content.label = initial_label.rsplit('.', 1)[0]
        # initial mods:typeOfResource - all videos default to moving image
        obj.mods.content.resource_type = 'moving image'

        # get duration and store in digital tech metadata; the track
        # duration is divided by 1000 before storing (presumably
        # milliseconds to seconds -- confirm against MediaInfo)
        try:
            info = MediaInfo.parse(master_filename)
            duration = info.tracks[0].duration / 1000
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not converted into an ingest error
            raise Exception('Error getting video duration')

        obj.digitaltech.content.duration = '%d' % round(duration)

        # premis data
        premis = obj.provenance.content
        premis.create_object()
        premis.object.id_type = 'ark'
        premis.object.id = ''

        premis.object.type = 'p:file'
        premis.object.checksums.append(PremisFixity(algorithm='MD5'))
        premis.object.checksums[0].digest = master_md5_checksum

        if master_sha1_checksum is None:
            master_sha1_checksum = sha1sum(master_filename)
        premis.object.checksums.append(PremisFixity(algorithm='SHA-1'))
        premis.object.checksums[1].digest = master_sha1_checksum

        premis.object.create_format()
        # format name is the upper-cased file extension; splitext only
        # looks at the final path component, so a dot in a directory name
        # is not mistaken for an extension.  A file with no extension
        # falls back to the extension registered for its mimetype.
        extension = os.path.splitext(master_filename)[1].lstrip('.')
        if not extension:
            extension = Video.allowed_master_mimetypes.get(master_mimetype, '')
        premis.object.format.name = extension.upper()

        # if a content URI is specified (e.g. for large files), use that
        if master_location is not None:
            obj.content.ds_location = master_location
        # otherwise set the file as content to be posted; opened in
        # binary mode because video content is not text, and left open
        # because the handle itself is the datastream content
        else:
            obj.content.content = open(master_filename, 'rb')

        # Access copy data -- only when an access copy was supplied, so
        # that a master-only ingest does not fail on open(None)
        if access_location is not None or access_filename is not None:
            # if an access URI is specified (e.g. for large files), use that
            if access_location is not None:
                obj.access_copy.ds_location = access_location
            # otherwise set the access file as content to be posted
            else:
                obj.access_copy.content = open(access_filename, 'rb')

            # keep the datastream's default mimetype unless one is given
            if access_mimetype is not None:
                obj.access_copy.mimetype = access_mimetype
            obj.access_copy.checksum = access_md5_checksum
            obj.access_copy.label = initial_label

        return obj

    @staticmethod
    def init_from_bagit(path, request=None, file_uri=True):
        '''Static method to create a new :class:`Video` instance from
        a BagIt.  Sets the object label and metadata title based on the
        name of the bag, and looks for a supported video file type
        to use as the content datastream for the object; a supported
        access file type, if present, is used as the access copy.
        Content checksum is pulled from the BagIt metadata, and repository
        ingest will be done via file URIs based on configured
        **LARGE_FILE_STAGING_DIR** and **LARGE_FILE_STAGING_FEDORA_DIR**
        to better support ingesting large files (unless file_uri
        is False).

        Raises an exception if BagIt is not valid or if it does not
        contain a supported video data file.  (Note: using fast validation
        without checksum calculation, to minimize the time required to ingest
        large files.)

        :param path: full path to the BagIt directory that contains
            a video file
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param file_uri: ingest BagIt data via file uris based on
            configured staging directories (default behavior)
            instead of uploading the content to Fedora

        :returns: :class:`Video` initialized from the BagIt contents
        :raises Exception: if a payload file has no MD5 or SHA-1 checksum
            in the bag, or if no supported master video file is found
        '''

        bag = bagit.Bag(path)
        # NOTE: using fast validation here to avoid recalculating checksums
        # for very large files; only checksum compare will be done by fedora
        bag.validate(fast=True)  # raises bagit.BagValidationError if not valid

        # use the base name of the BagIt as initial object label;
        # normalize first so a trailing slash does not give an empty name
        bag_name = os.path.basename(os.path.normpath(path))

        def staged_location(filename):
            # file URI for file-based ingest of a payload file
            location = 'file://%s' % urllib.quote(filename)
            # if Fedora base path is different from locally mounted
            # staging directory, convert from local path to fedora
            # server path
            if getattr(settings, 'LARGE_FILE_STAGING_FEDORA_DIR',
                       None) is not None:
                location = location.replace(
                    settings.LARGE_FILE_STAGING_DIR,
                    settings.LARGE_FILE_STAGING_FEDORA_DIR)
            return location

        # identify video content files within the bag by mimetype
        m = magic.Magic(mime=True)

        # keyword arguments for init_from_file, filled in as supported
        # files are found
        opts = {'request': request, 'initial_label': bag_name}

        # every payload file is inspected; if more than one file matches
        # a role (master or access), the last one found is used
        for data_path in bag.payload_files():
            # path is relative to bag root dir
            filename = os.path.join(path, data_path)
            # drop any ";options" suffix from the detected type
            mimetype = m.from_file(filename).partition(';')[0]

            # require both MD5 and SHA-1 for every payload file; a
            # KeyError here means the bag has no checksum recorded for
            # the file (nothing is compared at this point)
            checksums = {}
            for algorithm, label in (('md5', 'MD5'), ('sha1', 'SHA-1')):
                try:
                    checksums[algorithm] = bag.entries[data_path][algorithm]
                except KeyError:
                    raise Exception('No %s checksum found in bag for file %s'
                                    % (label, data_path))

            if mimetype in Video.allowed_master_mimetypes:
                opts['master_filename'] = filename
                opts['master_md5_checksum'] = checksums['md5']
                opts['master_sha1_checksum'] = checksums['sha1']
                opts['master_mimetype'] = mimetype
                if file_uri:
                    opts['master_location'] = staged_location(filename)

            elif mimetype in Video.allowed_access_mimetypes:
                opts['access_filename'] = filename
                opts['access_md5_checksum'] = checksums['md5']
                opts['access_mimetype'] = mimetype
                if file_uri:
                    opts['access_location'] = staged_location(filename)

        # no Video found
        if 'master_filename' not in opts:
            raise Exception('No Video content found in %s' % bag_name)

        return Video.init_from_file(**opts)

    def old_dm_media_path(self):
        '''Media path for this item under the old digital masters
        layout, built from the collection's old media path and this
        item's old identifier (``dm1_other_id`` is preferred over
        ``dm1_id``).

        Returns None when the item has no old identifier, no collection
        object, or the collection has no old media path.
        '''
        mods_xml = self.mods.content
        old_id = mods_xml.dm1_other_id or mods_xml.dm1_id
        if not old_id:
            return None

        coll_obj = self._collection_object()
        if not coll_obj:
            return None

        coll_path = coll_obj.old_dm_media_path()
        if not coll_path:
            return None

        # NOTE(review): .m4a under video/ looks like it may have been
        # carried over from an audio equivalent -- confirm the extension
        return '%svideo/%s.m4a' % (coll_path, old_id)

    def _collection_object(self):
        return self.collection