def view(request, pid):
    '''View a single :class:`~keep.video.models.Video`.

    User must either have general view video permissions, or if they have
    view researcher view, the object must be researcher accessible (based
    on rights codes).
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid=pid, type=Video)
    # user either needs view video permissions OR
    # view researcher video on a researcher-accessible object
    viewable = request.user.has_perm('video.view_video') or \
        (request.user.has_perm('video.view_researcher_video') and
         bool(obj.researcher_access))
    if not viewable:
        return prompt_login_or_403(request)
    try:
        if not obj.has_requisite_content_models:
            raise Http404
    except Http404:
        raise
    except Exception:
        # treat fedora access errors as not-found; narrowed from a bare
        # except so SystemExit/KeyboardInterrupt are not swallowed
        raise Http404
    return render(request, 'video/view.html', {"resource": obj})
def find_by_collection_number(num, parent=None):
    '''Find a CollectionObject in Fedora by collection number (or source
    id), optionally limited by parent collection (owning archive).

    :param num: collection number to search for (aka source id)
    :param parent: optional; archive that the collection must belong to
    :return: generator of any matching items, as instances of
        :class:`CollectionObject`
    '''
    solr = solr_interface()
    solrquery = solr.query(
        content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
        pid='%s:*' % settings.FEDORA_PIDSPACE,
        source_id=int(num))
    # if parent is specified, restrict by archive id (parent should be a pid)
    if parent is not None:
        # strip the info:fedora/ uri prefix, if present, to get a bare pid
        prefix = 'info:fedora/'
        if parent.startswith(prefix):
            # use len(prefix) rather than a hard-coded 12 so the slice
            # stays correct if the prefix ever changes
            parent = parent[len(prefix):]
        solrquery = solrquery.query(archive_id=parent)
    # by default, only returns 10; get everything
    # - solr response is a list of dictionary with collection info
    # use dictsort in template for sorting where appropriate
    collections = solrquery.paginate(start=0, rows=1000).execute()
    # return a generator of matching items, as instances of CollectionObject
    repo = Repository()
    for coll in collections:
        yield repo.get_object(coll['pid'], type=CollectionObject)
def archives(format=None):
    """Find Archives objects, to which CollectionObjects belong.

    :returns: list of :class:`CollectionObject`
    :rtype: list
    """
    # NOTE: formerly called top-level collections or Repository /
    # Owning Repository; should now be called archive and labeled
    # as such anywhere user-facing
    # TODO: search logic very similar to item_collections and
    # subcollections methods; consider refactoring search logic
    # into a common search method.
    if CollectionObject._archives is None:
        # archives are collection-1.1 objects with NO parent collection
        # NOTE: not filtering on pidspace, since top-level objects are
        # loaded as fixtures and may not match the configured pidspace
        # in a dev environment
        solr = solr_interface()
        query = solr.query(
            content_model=CollectionObject.COLLECTION_CONTENT_MODEL)
        response = query.exclude(archive_id__any=True) \
                        .sort_by('title_exact').execute()
        # cache the raw solr response format
        CollectionObject._archives = response

    if format == dict:
        return CollectionObject._archives

    # otherwise, initialize as instances of CollectionObject
    repo = Repository()
    return [repo.get_object(arch['pid'], type=CollectionObject)
            for arch in CollectionObject._archives]
def handle(self, *args, **options):
    '''Repair ARK records for the collection/audio pids named on the
    command line, or for all collections when no pids are given.'''
    self.options = options
    self.repaired_count = 0
    self.unrepaired_count = 0
    repo = Repository()
    self.pidman = DjangoPidmanRestClient()

    # resolve any pids on the command line to Collection or Audio objects
    objects = []
    for pid in args:
        try:
            # try collection first, then fall back to audio
            collection = repo.get_object(pid=pid, type=CollectionObject)
            if collection.has_requisite_content_models:
                objects.append(collection)
            else:
                audio = repo.get_object(pid=pid, type=AudioObject)
                if audio.has_requisite_content_models:
                    objects.append(audio)
        except Exception:
            self.log(message="Could not find Collection or Audio object for: %s" % pid)

    # with no pids specified, process every collection in the repository,
    # limited to the COLLECTION_CONTENT_MODEL and returned as Keep-specific
    # collection objects
    if not args:
        objects = repo.get_objects_with_cmodel(
            CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)
        if not objects:
            self.log(message="No Collections were found.")

    for obj in objects:
        self.repair_ark(obj)

    self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired" %
             (self.repaired_count, self.unrepaired_count), no_label=True)
def setUp(self):
    '''Create a repository connection and a test EmailMessage fixture.'''
    self.repo = Repository()
    self.pids = []

    # test EmailMessage with sender, recipient, and subject populated
    self.email = self.repo.get_object(type=EmailMessage)
    cerp = self.email.cerp.content
    cerp.from_list = ['*****@*****.**']
    cerp.to_list = ['*****@*****.**']
    cerp.subject_list = ['Interesting Subject']
def archives(format=None):
    '''Return archive fixtures for the configured PID aliases.

    When ``format`` is the :class:`dict` type, return a list of
    ``{'title': ..., 'pid': ...}`` dictionaries; otherwise return cached
    :class:`CollectionObject` instances.
    '''
    if format == dict:
        return [{'title': nick, 'pid': pid}
                for nick, pid in settings.PID_ALIASES.iteritems()]

    # lazily initialize and cache CollectionObject instances
    if not hasattr(FedoraFixtures, '_archives'):
        repo = Repository()
        FedoraFixtures._archives = [
            repo.get_object(pid, type=CollectionObject)
            for pid in settings.PID_ALIASES.itervalues()]
    return FedoraFixtures._archives
def download(request, pid):
    'Download disk image datastream contents'
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    # suggest a download filename from the object noid plus its
    # most recent format name from provenance metadata
    filename = "%s.%s" % (obj.noid,
                          obj.provenance.content.object.latest_format.name)
    extra_headers = {'Content-Disposition': "attachment; filename=%s" % filename}
    return raw_datastream(request, pid, DiskImage.content.id,
                          repo=repo, headers=extra_headers)
def englishdocs_collection():
    '''Build an unsaved English documents collection fixture
    (source id 309, dated 1509-1805).'''
    repo = Repository()
    coll = repo.get_object(type=CollectionObject)
    title = 'English documents collection'
    coll.label = title
    coll.mods.content.title = title
    coll.mods.content.source_id = '309'
    coll.collection = repo.get_object(FedoraFixtures.archives()[1].uri)
    coll.mods.content.create_origin_info()
    origin = coll.mods.content.origin_info
    origin.created.append(mods.DateCreated(date=1509, point='start'))
    origin.created.append(mods.DateCreated(date=1805, point='end'))
    return coll
def simple_collection(label=None, status=None, pid=None):
    '''Build an unsaved :class:`SimpleCollection` fixture, optionally
    setting its label, access status, and pid.'''
    coll = Repository().get_object(type=SimpleCollection)
    if label is not None:
        coll.label = label
    # the restriction node must exist before a status can be stored on it
    coll.mods.content.create_restrictions_on_access()
    if status is not None:
        coll.mods.content.restrictions_on_access.text = status
    if pid is not None:
        coll.pid = pid
    return coll
def all():
    'Find all Audio objects by content model within the configured pidspace.'
    repo = Repository()
    # restrict to the configured pidspace and to objects whose dc:format
    # records the audio content model
    return repo.find_objects(
        type=AudioObject,
        pid__contains='%s:*' % settings.FEDORA_PIDSPACE,
        format__contains=AudioObject.AUDIO_CONTENT_MODEL)
def englishdocs_collection():
    '''Unsaved CollectionObject fixture for the English documents
    collection (source id 309, dated 1509-1805).'''
    repo = Repository()
    obj = repo.get_object(type=CollectionObject)
    obj.label = 'English documents collection'
    obj.mods.content.title = obj.label
    obj.mods.content.source_id = '309'
    obj.collection = repo.get_object(FedoraFixtures.archives()[1].uri)
    obj.mods.content.create_origin_info()
    # start/end dates of the collection's coverage
    for year, point in ((1509, 'start'), (1805, 'end')):
        obj.mods.content.origin_info.created.append(
            mods.DateCreated(date=year, point=point))
    return obj
def esterbrook_collection():
    '''Unsaved CollectionObject fixture for the Thomas Esterbrook letter
    books (source id 123, dated 1855-1861).'''
    repo = Repository()
    obj = repo.get_object(type=CollectionObject)
    obj.label = 'Thomas Esterbrook letter books'
    obj.mods.content.title = obj.label
    obj.mods.content.source_id = '123'
    obj.collection = repo.get_object(FedoraFixtures.archives()[2].uri)
    obj.mods.content.create_origin_info()
    # start/end dates of the collection's coverage
    for year, point in ((1855, 'start'), (1861, 'end')):
        obj.mods.content.origin_info.created.append(
            mods.DateCreated(date=year, point=point))
    obj.mods.content.create_name()
    obj.mods.content.name.name_parts.append(
        mods.NamePart(text='Thomas Esterbrook'))
    return obj
def rushdie_collection():
    '''Unsaved CollectionObject fixture for the Salman Rushdie Collection
    (source id 1000, dated 1947-2008).'''
    repo = Repository()
    obj = repo.get_object(type=CollectionObject)
    obj.label = 'Salman Rushdie Collection'
    obj.mods.content.title = obj.label
    obj.mods.content.source_id = '1000'
    obj.collection = repo.get_object(FedoraFixtures.archives()[1].uri)
    obj.mods.content.create_origin_info()
    # start/end dates of the collection's coverage
    for year, point in ((1947, 'start'), (2008, 'end')):
        obj.mods.content.origin_info.created.append(
            mods.DateCreated(date=year, point=point))
    obj.mods.content.create_name()
    obj.mods.content.name.name_parts.append(
        mods.NamePart(text='Salman Rushdie'))
    return obj
def _objects_by_type(type_uri, type=None):
    """Yield repository objects carrying the given RDF type.

    :param type_uri: The uri of the type being searched
    :param type: The type of object that should be returned
        (NOTE: parameter name shadows the builtin, kept for
        backward compatibility with existing callers)
    """
    repo = Repository()
    # query the resource index for all subjects with this RDF type;
    # materialize the result before iterating, as the original did
    for pid in list(repo.risearch.get_subjects(RDF.type, type_uri)):
        yield repo.get_object(pid=pid, type=type)
def archives(format=None):
    '''Return the configured archive fixtures, either as title/pid dicts
    (when ``format`` is :class:`dict`) or as cached
    :class:`CollectionObject` instances.'''
    if format == dict:
        return [dict(title=nick, pid=pid)
                for nick, pid in settings.PID_ALIASES.iteritems()]
    if not hasattr(FedoraFixtures, '_archives'):
        # populate the class-level cache on first use
        repo = Repository()
        FedoraFixtures._archives = [repo.get_object(p, type=CollectionObject)
                                    for p in settings.PID_ALIASES.itervalues()]
    return FedoraFixtures._archives
def tasks(request, pid):
    '''Manage tasks associated with an
    :class:`~keep.audio.models.AudioObject`.

    Currently, the only supported functionality is to queue access copy
    conversion; this should be done by POSTing the type of task to be
    queued, i.e. **generate access copy**.

    Supported tasks:
      * **generate access copy** - queue access copy conversion for an
        audio item by pid.

    Returns a status message as the body of a plain/text response

    :param pid: the pid of the object for which tasks should be queued
    '''
    if request.method == 'POST':
        status = "queued"
        task_type = request.POST.get('task', None)
        # TODO May want to prevent queuing of more than one at a time or within a time period.
        # TODO For now javascript disables the link until the page is refreshed.

        # currently the only supported task is
        if task_type == 'generate access copy':
            try:
                repo = Repository(request=request)
                obj = repo.get_object(pid, type=AudioObject)
                # if object doesn't exist or isn't an audio item, 404
                if not obj.exists or not obj.has_requisite_content_models:
                    raise Http404
                queue_access_copy(obj)
                status = 'Successfully queued access copy conversion'
            # re-raise 404s directly instead of checking isinstance inside
            # a broad except clause (behavior unchanged, clearer intent)
            except Http404:
                raise
            except Exception as err:
                logger.error('Error queueing access copy conversion for %s : %s' % \
                             (pid, err))
                status = 'Error queueing access copy conversion (%s)' % err
            return HttpResponse(status, content_type='text/plain')
        # unsupported task
        else:
            return HttpResponse('Task "%s" is not supported' % task_type,
                                content_type='text/plain', status=500)
def create_from_findingaid(request):
    '''Create a new :class:`CollectionObject` from an EAD finding aid,
    based on the archive and collection number POSTed via the
    :class:`FindCollection` form.  If a matching collection already
    exists, redirects to its view page instead of creating a duplicate;
    on success, redirects to the new collection's edit page; on any
    error, falls through to the admin dashboard with messages set.
    '''
    form = FindCollection(request.POST)
    if not form.is_valid():
        messages.error(request, 'Form is not valid; please try again.')
    else:
        data = form.cleaned_data
        q = CollectionObject.item_collection_query()
        # submitted value is pid alias; lookup pid for solr query
        archive_id = settings.PID_ALIASES[data['archive']]
        q = q.query(archive_id=archive_id, source_id=data['collection'])
        # if collection is found, redirect to collection view with message
        if q.count():
            messages.info(request, 'Found %d collection%s for %s %s.' %
                          (q.count(), 's' if q.count() != 1 else '',
                           data['archive'].upper(), data['collection']))
            return HttpResponseSeeOtherRedirect(
                reverse('collection:view', kwargs={'pid': q[0]['pid']}))
        else:
            # otherwise, create the new record and redirect to new
            # collection edit page
            repo = Repository(request=request)
            coll_id = data['collection']
            coll = None
            try:
                archive = repo.get_object(archive_id, type=CollectionObject)
                # look up the finding aid by unit id within this archive
                fa = FindingAid.find_by_unitid(unicode(coll_id),
                                               archive.mods.content.title)
                coll = fa.generate_collection()
                coll.collection = archive
                coll.save()
                messages.info(request, 'Added %s for collection %s: %s' %
                              (coll, coll_id, coll.mods.content.title))
                return HttpResponseSeeOtherRedirect(
                    reverse('collection:edit', kwargs={'pid': coll.pid}))
            except DoesNotExist:
                # no finding aid matched the unit id in this archive
                messages.error(request, 'No EAD found for %s in %s' %
                               (coll_id, data['archive'].upper()))
            except ReturnedMultiple:
                messages.error(request, 'Multiple EADs found for %s in %s' %
                               (coll_id, data['archive'].upper()))
            except RequestFailed as err:
                # TODO(review): debugging print should probably be a logger call
                print err
                messages.error(request, 'Failed to save new collection')
    return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))
def setUp(self):
    '''Create a test collection and an arrangement object linked to it.'''
    self.repo = Repository()
    self.pids = []

    # test collection with a known pid and source id
    collection = self.repo.get_object(type=CollectionObject)
    collection.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
    collection.mods.content.source_id = '12345'
    collection.save()
    self.pids.append(collection.pid)

    # test arrangement object that belongs to the collection above
    self.arr = self.repo.get_object(type=ArrangementObject)
    self.arr.pid = 'foo:1'
    self.arr.collection = collection
class Command(BaseCommand):
    '''Generate access copies for PIDs specified on the command line.'''
    help = __doc__

    def handle(self, *args, **options):
        # one repository connection is shared across all pids
        self.verbosity = options['verbosity']
        self.repo = Repository()
        for pid in args:
            self.process_pid(pid)

    def process_pid(self, pid):
        '''Process a single PID by looking it up in the repository, figuring
        out what kind of processing it needs based on its object type, and
        doing that.
        '''
        # infer_object_subtype presumably returns the most specific object
        # class for the pid so the isinstance check below works -- TODO confirm
        obj = self.repo.get_object(pid=pid, type=self.repo.infer_object_subtype)
        if not obj.exists:
            if self.verbosity >= 1:
                print "No such PID; skipped:", pid
            return
        if isinstance(obj, AudioObject):
            if self.verbosity >= 2:
                print "Generating audio access copy:", pid
            queue_access_copy(obj)
        else:
            # only audio objects are currently handled
            if self.verbosity >= 1:
                print "Unhandled object type; skipped:", pid
def rushdie_collection():
    '''Fixture: Salman Rushdie Collection (unsaved), source id 1000.'''
    repo = Repository()
    coll = repo.get_object(type=CollectionObject)
    content = coll.mods.content
    coll.label = 'Salman Rushdie Collection'
    content.title = 'Salman Rushdie Collection'
    content.source_id = '1000'
    # belongs to the second configured archive
    coll.collection = repo.get_object(FedoraFixtures.archives()[1].uri)
    content.create_origin_info()
    content.origin_info.created.append(
        mods.DateCreated(date=1947, point='start'))
    content.origin_info.created.append(
        mods.DateCreated(date=2008, point='end'))
    content.create_name()
    content.name.name_parts.append(mods.NamePart(text='Salman Rushdie'))
    return coll
def esterbrook_collection():
    '''Fixture: Thomas Esterbrook letter books collection (unsaved),
    source id 123.'''
    repo = Repository()
    coll = repo.get_object(type=CollectionObject)
    content = coll.mods.content
    coll.label = 'Thomas Esterbrook letter books'
    content.title = 'Thomas Esterbrook letter books'
    content.source_id = '123'
    # belongs to the third configured archive
    coll.collection = repo.get_object(FedoraFixtures.archives()[2].uri)
    content.create_origin_info()
    content.origin_info.created.append(
        mods.DateCreated(date=1855, point='start'))
    content.origin_info.created.append(
        mods.DateCreated(date=1861, point='end'))
    content.create_name()
    content.name.name_parts.append(mods.NamePart(text='Thomas Esterbrook'))
    return coll
def init_from_file(filename, initial_label=None, request=None, checksum=None,
                   mimetype=None):
    '''Static method to create a new :class:`AudioObject` instance from
    a file.  Sets the object label and metadata title based on the initial
    label specified, or file basename.  Calculates and stores the duration
    based on the file.  Also sets the following default metadata values:

        * mods:typeOfResource = "sound recording"
        * dt:codecQuality = "lossless"

    :param filename: full path to the audio file, as a string
    :param initial_label: optional initial label to use; if not specified,
        the base name of the specified file will be used
    :param request: :class:`django.http.HttpRequest` passed into a view method;
        must be passed in order to connect to Fedora as the currently-logged
        in user
    :param checksum: the checksum of the file being sent to fedora.
    :param mimetype: optional mimetype for the audio content datastream
    :returns: :class:`AudioObject` initialized from the file
    '''
    if initial_label is None:
        initial_label = os.path.basename(filename)
    repo = Repository(request=request)
    obj = repo.get_object(type=AudioObject)
    # set initial object label from the base filename
    obj.label = initial_label
    obj.dc.content.title = obj.mods.content.title = obj.label
    obj.audio.content = open(filename)  # FIXME: at what point does/should this get closed?
    # Set the file checksum, if set.
    obj.audio.checksum = checksum
    # set content datastream mimetype if passed in
    if mimetype is not None:
        obj.audio.mimetype = mimetype
    # Get the label, minus the ".wav" (mimetype indicates that)
    # NOTE(review): slicing off 4 chars assumes a .wav (or other 4-char)
    # suffix -- confirm callers always pass wav filenames
    obj.audio.label = initial_label[:-4]
    # set initial mods:typeOfResource - all AudioObjects default to sound recording
    obj.mods.content.resource_type = 'sound recording'
    # set codec quality to lossless in digital tech metadata
    # - default for AudioObjects, should only accept lossless audio for master file
    obj.digitaltech.content.codec_quality = 'lossless'
    # get wav duration and store in digital tech metadata
    obj.digitaltech.content.duration = '%d' % round(wav_duration(filename))
    return obj
def disk_images(self):
    '''Write pids for a representative disk-image sample (the ten smallest
    of each format) to stdout, skipping restricted collections.'''
    self.stderr.write('Disk images')
    ### disk images
    # representative sample of aff and ad1
    # DO NOT include anything in these collections:
    # Trethewey (ghsdj), Rushdie (94k9k), Mackey (g1btw),
    # Clifton (94kf4), and Grennan (9k0st)
    solr = solr_interface()
    repo = Repository()
    query = solr.query(content_model=DiskImage.DISKIMAGE_CONTENT_MODEL)
    for name in ('trethewey', 'rushdie', 'mackey', 'clifton', 'grennan'):
        query = query.exclude(collection_id=self.collections[name])
    query = query.field_limit('pid')
    if self.verbosity >= self.v_normal:
        self.stderr.write(
            'Found %d disk images not in restricted collections' % query.count())
    # format and size are not filterable in solr or the fedora risearch,
    # so inspect each object individually and bucket by format
    by_format = defaultdict(list)
    for result in query:
        image = repo.get_object(result['pid'], type=DiskImage)
        if not image.exists:
            if self.verbosity >= self.v_normal:
                self.stderr.write('Referenced disk image %s does not exist or is inaccessible' \
                                  % result['pid'])
            continue
        by_format[image.provenance.content.object.format.name].append(image)
    for fmt, images in by_format.iteritems():
        if self.verbosity >= self.v_normal:
            self.stderr.write('Selecting %s disk images' % fmt)
        # smallest binaries first, so we sync the least data; keep 10 per format
        for image in sorted(images, key=lambda im: im.content.size)[:10]:
            self.stdout.write(image.pid)
def view_audit_trail(request, pid):
    'Access XML audit trail for an audio object'
    # connect to fedora with the logged-in user's credentials, then
    # delegate to the eulfedora audit trail view
    # FIXME: redundant across collection/arrangement/audio apps; consolidate?
    user_repo = Repository(request=request)
    return raw_audit_trail(request, pid, type=AudioObject, repo=user_repo)
def setUp(self):
    '''Create simple/master collection and file-object fixtures, and
    configure a migrate_rushdie command instance to use them.'''
    self.repo = Repository()
    self.pids = []

    # simple collection fixture
    self.sc = self.repo.get_object(type=SimpleCollection)
    self.sc.label = "SimpleCollection For Test"
    self.sc.save()
    self.pids.append(self.sc.pid)

    # master collection fixture
    self.mc = self.repo.get_object(type=CollectionObject)
    self.mc.label = "MasterCollection For Test"
    self.mc.save()
    self.pids.append(self.mc.pid)

    # digital object fixture with two technical metadata datastreams
    self.digObj = self.repo.get_object(type=RushdieArrangementFile)
    self.digObj.label = "Object For Test"
    self.digObj.save()
    self.pids.append(self.digObj.pid)
    for dsid, fixture in (("MARBL-MACTECH", self.MM_FIXTURE),
                          ("MARBL-ANALYSIS", self.MA_FIXTURE)):
        self.digObj.api.addDatastream(self.digObj.pid, dsid, dsid,
                                      mimeType="application/xml",
                                      content=fixture)

    # remove the Arrangement model so the migration can add it later
    relation = (self.digObj.uriref, modelns.hasModel,
                "info:fedora/emory-control:Arrangement-1.0")
    self.digObj.rels_ext.content.remove(relation)
    self.digObj.save()

    # configure the command under test
    self.cmd = migrate_rushdie.Command()
    self.cmd.verbosity = 1
    self.cmd.v_normal = 1
    self.cmd.v_none = 0
    self.cmd.simple_collection = self.sc
    self.cmd.stdout = sys.stdout
    self.cmd.CONTENT_MODELS = CONTENT_MODELS
    self.cmd.repo = self.repo
def find_by_field(field, value, repo=None):
    '''
    Static method to find a single :class:`EmailMessage` by an indexed
    value.  Searches Solr on the requested field and value and, when
    exactly one match is found, returns an :class:`EmailMessage`
    initialized from the repository.

    :raises: :class:`django.core.exceptions.MultipleObjectsReturned` when
        more than one match is found;
        :class:`django.core.exceptions.ObjectDoesNotExist` when none are.
    :param field: solr field to search
    :param value: value to search on in the specified field
    :param repo: optional :class:`eulfedora.server.Repository` to use an
        existing connection with specific credentials
    :returns: :class:`EmailMessage`
    '''
    solr = solr_interface()
    search_terms = {
        field: value,
        'content_model': ArrangementObject.ARRANGEMENT_CONTENT_MODEL
    }
    matches = solr.query(**search_terms).field_limit('pid')
    found = len(matches)
    # borrowing custom django exceptions for not found / too many matches
    if found > 1:
        raise MultipleObjectsReturned('Found %d records with %s %s' % \
                                      (found, field, value))
    if not found:
        raise ObjectDoesNotExist('No record found with %s %s' % (field, value))

    if repo is None:
        repo = Repository()
    return repo.get_object(matches[0]['pid'], type=EmailMessage)
def view(request, pid):
    '''View a single :class:`~keep.collection.models.CollectionObject`,
    with a paginated list of all items in that collection.
    '''
    repo = Repository(request=request)
    collection = repo.get_object(pid, type=CollectionObject)
    # 404 when the pid does not exist or is not a collection
    if not collection.exists or not collection.has_requisite_content_models:
        raise Http404

    # find every item belonging to this collection, ordered by date/title
    items = collection.solr_items_query() \
        .sort_by('date_created') \
        .sort_by('date_issued') \
        .sort_by('title_exact')
    # apply the logged-in user's permission filters (includes
    # researcher-accessible content filter when appropriate)
    items = filter_by_perms(items, request.user)

    # a researcher-only user with no visible items has no permission
    # to view this collection at all
    if not request.user.has_perm('collection.view_collection') and \
            request.user.has_perm('collection.view_researcher_collection') and \
            items.count() == 0:
        return prompt_login_or_403(request)

    # paginate the solr result set; fall back to page 1 on a bad page
    # value and to the last page when out of range
    paginator = Paginator(items, 30)
    try:
        page_num = int(request.GET.get('page', '1'))
    except ValueError:
        page_num = 1
    try:
        results = paginator.page(page_num)
    except (EmptyPage, InvalidPage):
        results = paginator.page(paginator.num_pages)

    # url parameters for pagination links, minus the current page
    url_params = request.GET.copy()
    if 'page' in url_params:
        del url_params['page']

    return TemplateResponse(request, 'collection/view.html',
                            {'collection': collection, 'items': results,
                             'url_params': urlencode(url_params)})
def playlist(request, pid):
    '''Return a JSON playlist of the collection's accessible audio items
    that have access copies.'''
    # FIXME: this needs last-modified so browser can cache!!!
    # NOTE: preliminary logic duplicated from view above
    repo = Repository(request=request)
    collection = repo.get_object(pid, type=CollectionObject)
    # 404 when the pid does not exist or is not a collection
    if not collection.exists or not collection.has_requisite_content_models:
        raise Http404
    items = collection.solr_items_query() \
        .sort_by('date_created') \
        .sort_by('date_issued') \
        .sort_by('title_exact')
    # filter by logged-in user permissions (includes researcher-accessible
    # content filter when appropriate)
    items = filter_by_perms(items, request.user)
    # a researcher-only user with no visible items has no permission
    # to view this collection at all
    if not request.user.has_perm('collection.view_collection') and \
            request.user.has_perm('collection.view_researcher_collection') and \
            items.count() == 0:
        return prompt_login_or_403(request)

    entries = []
    for result in items:
        # skip non-audio items and audio without access copies
        if result['object_type'] != 'audio' or not result['has_access_copy']:
            continue
        audio_type = 'm4a' if result['access_copy_mimetype'] == 'audio/mp4' \
            else 'mp3'
        entries.append({
            'title': result['title'],
            'free': False,  # explicitly mark as not downloadable
            audio_type: reverse('audio:download-compressed-audio',
                                kwargs={'pid': result['pid'],
                                        'extension': audio_type}),
        })
    return HttpResponse(json.dumps(entries), content_type='application/json')
def disk_images(self):
    '''Emit pids for a sample of disk images -- the ten smallest of each
    format -- excluding restricted collections.'''
    self.stderr.write('Disk images')
    ### disk images
    # representative sample of aff and ad1
    # DO NOT include anything in these collections:
    # Trethewey (ghsdj), Rushdie (94k9k), Mackey (g1btw),
    # Clifton (94kf4), and Grennan (9k0st)
    solr = solr_interface()
    repo = Repository()
    restricted = ('trethewey', 'rushdie', 'mackey', 'clifton', 'grennan')
    q = solr.query(content_model=DiskImage.DISKIMAGE_CONTENT_MODEL)
    for key in restricted:
        q = q.exclude(collection_id=self.collections[key])
    q = q.field_limit('pid')
    if self.verbosity >= self.v_normal:
        self.stderr.write('Found %d disk images not in restricted collections'
                          % q.count())
    # neither solr nor the fedora risearch can filter on format or size,
    # so fetch each object, bucket by format, then take the smallest ones
    buckets = defaultdict(list)
    for hit in q:
        image = repo.get_object(hit['pid'], type=DiskImage)
        if not image.exists:
            if self.verbosity >= self.v_normal:
                self.stderr.write('Referenced disk image %s does not exist or is inaccessible' \
                                  % hit['pid'])
            continue
        buckets[image.provenance.content.object.format.name].append(image)
    for fmt, images in buckets.iteritems():
        if self.verbosity >= self.v_normal:
            self.stderr.write('Selecting %s disk images' % fmt)
        # sort ascending on binary size so the smallest ten are synced
        for image in sorted(images, key=lambda im: im.content.size)[:10]:
            self.stdout.write(image.pid)
def handle(self, batch_id=None, folder_path=None, verbosity=1, noact=False, max_ingest=None, skip_purge=False, purge_only=False, *args, **options): # check batch object if batch_id is None: raise CommandError('Processing batch id is required') self.verbosity = int(verbosity) # ensure we compare int to int if max_ingest is not None: self.max_ingest = int(max_ingest) # check folder path if folder_path is None: raise CommandError('Eudora folder base path is required') if not os.path.isdir(folder_path): raise CommandError('Eudora folder path "%s" is not a directory' % folder_path) self.noact = noact # check for any specified fedora credentials fedora_opts = {} if 'username' in options: fedora_opts['username'] = options['username'] if 'password' in options: fedora_opts['password'] = options['password'] self.repo = Repository(**fedora_opts) batch = self.repo.get_object(batch_id, type=ProcessingBatch) if not batch.exists: raise CommandError('Processing batch %s not found' % batch_id) print 'Looking for email messages in processing batch "%s"' \ % batch.label try: pidman = DjangoPidmanRestClient() except: raise CommandError('Error initializing PID manager client; ' + 'please check settings.') self.stats = defaultdict(int) # purge old metadata email 'arrangement' objects that belong to this batch if not skip_purge: self.remove_arrangement_emails(batch) # ingest new objects for email mailboxes & messages if not purge_only: self.ingest_email(folder_path)
def simple_edit(request, pid=None):
    '''
    Edit an existing Fedora :class:`~keep.collection.models.SimpleCollection`.
    If a pid is specified, attempts to retrieve an existing object.

    On POST with a changed status, queues a batch status update task;
    on GET, displays the edit form initialized from the object's
    current access restriction status.
    '''
    repo = Repository(request=request)
    try:
        obj = repo.get_object(pid=pid, type=SimpleCollection)
        if request.method == 'POST':
            form = SimpleCollectionEditForm(request.POST)
            if form.is_valid():
                status = form.cleaned_data['status']
                if status == obj.mods.content.restrictions_on_access.text:
                    # don't queue job if there is no change
                    messages.info(request, 'Status is unchanged')
                else:
                    # queue celery task to update items in this batch
                    queue_batch_status_update(obj, status)
                    messages.info(
                        request,
                        'Batch status update has been queued; ' +
                        'please check later via <a href="%s">recent tasks</a> page'
                        % reverse('tasks:recent')
                    )
        else:
            # Just Display the form
            form = SimpleCollectionEditForm(
                initial={'status': obj.mods.content.restrictions_on_access.text})
    except RequestFailed, e:
        # if there was a 404 accessing objects, raise http404
        # NOTE: this probably doesn't distinguish between object exists with
        # no MODS and object does not exist at all
        if e.code == 404:
            raise Http404
        # otherwise, re-raise and handle as a common fedora connection error
        else:
            raise
    # NOTE(review): no response is returned on the success path here --
    # presumably a render/TemplateResponse using `form` follows in the
    # full module; confirm against the original source.
def by_arrangement_id(id, repo=None):
    '''
    Static method to find an :class:`ArrangementObject` by its local or
    arrangement id.  Searches Solr and, when exactly one match is found,
    returns an :class:`ArrangementObject` initialized from the repository.

    :raises: :class:`django.core.exceptions.MultipleObjectsReturned` when
        more than one match is found;
        :class:`django.core.exceptions.ObjectDoesNotExist` when none are.
    :param id: arrangement id or local id
    :param repo: optional :class:`eulfedora.server.Repository` to use an
        existing connection with specific credentials
    :returns: :class:`ArrangementObject`
    '''
    matches = solr_interface().query(
        arrangement_id=id,
        content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL) \
        .field_limit('pid')
    found = len(matches)
    # borrowing custom django exceptions for not found / too many matches
    if found > 1:
        raise MultipleObjectsReturned('Found %d records with arrangement id %s' % \
                                      (found, id))
    if not found:
        raise ObjectDoesNotExist('No record found with arrangement id %s' % id)

    if repo is None:
        repo = Repository()
    return repo.get_object(matches[0]['pid'], type=ArrangementObject)
def handle(self, *args, **options):
    '''Repair the ARK records for the collection/audio pids given on the
    command line (or for every collection, when none are given).'''
    self.options = options
    self.repaired_count = 0
    self.unrepaired_count = 0
    repo = Repository()
    self.pidman = DjangoPidmanRestClient()

    # resolve each requested pid to a Collection or Audio object
    objects = []
    for pid in args:
        try:
            # collection takes precedence; fall back to audio
            for obj_type in (CollectionObject, AudioObject):
                candidate = repo.get_object(pid=pid, type=obj_type)
                if candidate.has_requisite_content_models:
                    objects.append(candidate)
                    break
        except Exception:
            self.log(
                message="Could not find Collection or Audio object for: %s" % pid)

    # no pids given: process every collection in the repository, limited
    # to the COLLECTION_CONTENT_MODEL and returned as Keep-specific objects
    if not args:
        objects = repo.get_objects_with_cmodel(
            CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)
        if not objects:
            self.log(message="No Collections were found.")

    for obj in objects:
        self.repair_ark(obj)

    self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired"
             % (self.repaired_count, self.unrepaired_count), no_label=True)
def view(request, pid):
    '''View a single :class:`~keep.audio.models.AudioObject`.

    User must either have general view audio permissions, or if they have
    view researcher audio, the object must be researcher accessible (based
    on rights codes).
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=AudioObject)
    # user either needs view audio permissions OR
    # view researcher audio on a researcher-accessible object
    if not request.user.has_perm('audio.view_audio') and \
       not (request.user.has_perm('audio.view_researcher_audio') and
            bool(obj.researcher_access)):
        return prompt_login_or_403(request)
    try:
        if not obj.has_requisite_content_models:
            raise Http404
    except Http404:
        raise
    except Exception:
        # treat fedora access errors as not-found; narrowed from a bare
        # except so SystemExit/KeyboardInterrupt are not swallowed
        raise Http404
    return TemplateResponse(request, 'audio/view.html', {'resource': obj})
def setUp(self):
    '''Build collection and file-object fixtures and wire up a
    migrate_rushdie command instance against them.'''
    self.repo = Repository()
    self.pids = []

    # a SimpleCollection to migrate into
    self.sc = self.repo.get_object(type=SimpleCollection)
    self.sc.label = "SimpleCollection For Test"
    self.sc.save()
    self.pids.append(self.sc.pid)

    # a master CollectionObject
    self.mc = self.repo.get_object(type=CollectionObject)
    self.mc.label = "MasterCollection For Test"
    self.mc.save()
    self.pids.append(self.mc.pid)

    # a digital object carrying the two MARBL technical metadata streams
    self.digObj = self.repo.get_object(type=RushdieArrangementFile)
    self.digObj.label = "Object For Test"
    self.digObj.save()
    self.pids.append(self.digObj.pid)
    self.digObj.api.addDatastream(self.digObj.pid, "MARBL-MACTECH",
                                  "MARBL-MACTECH",
                                  mimeType="application/xml",
                                  content=self.MM_FIXTURE)
    self.digObj.api.addDatastream(self.digObj.pid, "MARBL-ANALYSIS",
                                  "MARBL-ANALYSIS",
                                  mimeType="application/xml",
                                  content=self.MA_FIXTURE)

    # strip the Arrangement content model so the migration can re-add it
    arrangement_rel = (self.digObj.uriref, modelns.hasModel,
                       "info:fedora/emory-control:Arrangement-1.0")
    self.digObj.rels_ext.content.remove(arrangement_rel)
    self.digObj.save()

    # configure the command under test
    self.cmd = migrate_rushdie.Command()
    self.cmd.verbosity = 1
    self.cmd.v_normal = 1
    self.cmd.v_none = 0
    self.cmd.simple_collection = self.sc
    self.cmd.stdout = sys.stdout
    self.cmd.CONTENT_MODELS = CONTENT_MODELS
    self.cmd.repo = self.repo
def manage_supplements(request, pid):
    '''Manage supplemental file datastreams associated with a
    :class:`~keep.file.models.DiskImage`.

    On GET (or any non-POST request), displays a formset populated from the
    object's existing supplemental datastreams.  On POST, validates the
    formset and adds new supplemental files and/or updates the label and
    content of existing ones, then redirects back to the edit page.

    :param request: http request
    :param pid: pid of the :class:`~keep.file.models.DiskImage` to update
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    # 404 if the object does not exist or is not actually a disk image
    if not obj.exists or not obj.has_requisite_content_models:
        raise Http404

    # generate initial data from any existing supplemental datastreams
    initial_data = []
    for s in obj.supplemental_content:
        initial_data.append({'dsid': s.id, 'label': s.label,
                             'file': DatastreamFile(obj.pid, s.id, s.label)})

    # on post, process the form and any updates/additions
    if request.method == 'POST':
        formset = SupplementalFileFormSet(request.POST, request.FILES,
                                          initial=initial_data)
        if formset.is_valid():
            m = magic.Magic(mime=True)
            # NOTE: because we currently don't support re-ordering
            # or deletion, simply counting to keep track of datastream ids
            s_id = 0
            modified = 0
            added = 0
            for file_info in formset.cleaned_data:
                # skip empty formset entries
                if not file_info:
                    continue
                if file_info.get('dsid', None):
                    # existing supplement: look up datastream by its id
                    ds = obj.getDatastreamObject(file_info['dsid'],
                                                 dsobj_type=FileDatastreamObject)
                else:
                    # no dsid means a newly added supplemental file
                    added += 1
                    ds = obj.getDatastreamObject('supplement%d' % s_id,
                                                 dsobj_type=FileDatastreamObject)

                # only set if changed so datastream isModified is accurate
                if file_info['label'] != ds.label:
                    ds.label = file_info['label']

                # if this is an uploaded file, replace content and
                # calculate mimetype, checksum
                if isinstance(file_info['file'], UploadedFile):
                    filename = file_info['file'].temporary_file_path()
                    mimetype = m.from_file(filename)
                    # magic may append options after ';'; keep only the type
                    mimetype, separator, options = mimetype.partition(';')
                    ds.mimetype = mimetype
                    ds.checksum = md5sum(filename)
                    ds.content = file_info['file']

                if ds.exists and ds.isModified():
                    modified += 1
                s_id += 1

            try:
                obj.save('updating supplemental files')
                # summarize number of changes, if any
                if added or modified:
                    msg_add = 'added %d' % added if added else ''
                    msg_update = 'updated %d' % modified if modified else ''
                    msg = 'Successfully %s%s%s supplemental file%s' % \
                        (msg_add, ' and ' if added and modified else '',
                         msg_update, 's' if (added + modified) != 1 else '')
                    messages.success(request, msg)
                else:
                    # possible for the form to be valid but not make any changes
                    messages.info(request, 'No changes made to supplemental content')
                return HttpResponseSeeOtherRedirect(reverse('file:edit', args=[pid]))

            except Exception as e:
                logger.error('Error on supplemental file update: %s' % e)
                logger.debug("Error details:\n" + traceback.format_exc())
                messages.error(request, unicode(e))
                # for now, just redisplay the form with error message

    else:
        # FIX: previously only request.method == 'GET' bound the formset,
        # so any other non-POST method (e.g. HEAD) raised NameError at
        # render time; treat every non-POST request as a display request
        formset = SupplementalFileFormSet(initial=initial_data)

    return TemplateResponse(request, 'file/supplemental_content.html',
                            {'obj': obj, 'formset': formset})
def edit(request, pid):
    '''Edit the metadata for a single :class:`~keep.file.models.DiskImage`.

    On GET, displays the edit form pre-populated from the object; on POST,
    validates and saves the submitted metadata, redirecting to the dashboard
    (or redisplaying the form for "save and continue").

    :param request: http request
    :param pid: pid of the :class:`~keep.file.models.DiskImage` to edit
    '''
    # FIXME: should be generic file (?) or possibly one of several supported files
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    try:
        # if this is not actually a disk image, then 404
        # (object is not available at this url)
        if not obj.has_requisite_content_models:
            raise Http404

        if request.method == 'POST':
            # if data has been submitted, initialize form with request data and object mods
            form = DiskImageEditForm(request.POST, instance=obj)
            if form.is_valid():     # includes schema validation
                # update foxml object with data from the form
                form.update_instance()
                if 'comment' in form.cleaned_data \
                        and form.cleaned_data['comment']:
                    comment = form.cleaned_data['comment']
                else:
                    comment = "update metadata"
                obj.save(comment)
                messages.success(request, 'Successfully updated <a href="%s">%s</a>' %
                                 (reverse('file:edit', args=[pid]), pid))
                # save & continue functionality - same as collection edit
                if '_save_continue' not in request.POST:
                    return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))
                # otherwise - fall through to display edit form again

            # form was posted but not valid
            else:
                # if we attempted to save and failed, add a message since the error
                # may not be obvious or visible in the first screenful of the form
                messages.error(request,
                               '''Your changes were not saved due to a validation error.
                               Please correct any required or invalid fields indicated below and save again.''')
        else:
            # GET - display the form for editing, pre-populated with content from the object
            form = DiskImageEditForm(instance=obj)

        class AdminOpts(object):
            # options for generating admin link to edit/add file application db info
            app_label = 'file'
            model_name = 'application'

        admin_fileapp = AdminOpts()

        return TemplateResponse(request, 'file/edit.html',
                                {'obj': obj, 'form': form, 'admin_fileapp': admin_fileapp})

    except PermissionDenied:
        # Fedora may return a PermissionDenied error when accessing a datastream
        # where the datastream does not exist, object does not exist, or user
        # does not have permission to access the datastream

        # check that the object exists - if not, 404
        if not obj.exists:
            raise Http404
        # for now, assuming that if object exists and has correct content models,
        # it will have all the datastreams required for this view
        return HttpResponseForbidden('Permission Denied to access %s' % pid,
                                     content_type='text/plain')

    except RequestFailed as rf:
        # if fedora actually returned a 404, propagate it
        if rf.code == 404:
            raise Http404
        # FIX: error message previously said "audio data" — copy-pasted from
        # the audio edit view; this view serves disk image / file objects
        msg = 'There was an error contacting the digital repository. ' + \
              'This prevented us from accessing file data. If this ' + \
              'problem persists, please alert the repository ' + \
              'administrator.'
        return HttpResponse(msg, content_type='text/plain', status=500)
class TestMigrateRushdie(TestCase):
    """Tests for the migrate_rushdie management command's datastream
    conversion logic, run against a live test repository."""

    # MARBL-MACTECH fixture: Mac filesystem technical metadata for one file
    MM_FIXTURE = '''<macfs:document xmlns:macfs="info:fedora/emory-control:Rushdie-MacFsData-1.0"> <macfs:md5>ffcf48e5df673fc7de985e1b859eeeec</macfs:md5> <macfs:file> <macfs:computer>Performa 5400</macfs:computer> <macfs:path>/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles</macfs:path> <macfs:rawpath>L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=</macfs:rawpath> <macfs:attributes>avbstclInmedz</macfs:attributes> <macfs:created>1997-01-19T19:29:32</macfs:created> <macfs:modified>1997-01-19T19:29:32</macfs:modified> <macfs:type>TEXT</macfs:type> <macfs:creator>ttxt</macfs:creator> </macfs:file> </macfs:document>'''
    # MARBL-ANALYSIS fixture: archivist series/subseries verdict metadata
    MA_FIXTURE = '''<marbl:analysis xmlns:marbl="info:fedora/emory-control:Rushdie-MarblAnalysis-1.0"> <marbl:series>Writings by Rushdie</marbl:series> <marbl:subseries>Fiction</marbl:subseries> <marbl:verdict>As is</marbl:verdict> </marbl:analysis>'''
    # finding-aid series lookup table, keyed by series title, with nested
    # subseries info keyed by subseries title
    SERIES_FIXTURE = {
        'Writings by Rushdie': {
            'series_info': {
                'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                'id': 'rushdie1000_series2',
                'short_id': 'series2',
                'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2'},
            'subseries_info': {
                'Fiction': {
                    'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                    'id': 'rushdie1000_subseries2.1',
                    'short_id': 'subseries2.1',
                    'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2/subseries2.1'}}}}

    def setUp(self):
        # Build fixtures: a SimpleCollection, a master CollectionObject, and
        # one arrangement file object carrying the MARBL datastreams above.
        self.repo = Repository()
        self.pids = []
        # Create a simple Collection
        self.sc = self.repo.get_object(type=SimpleCollection)
        self.sc.label = "SimpleCollection For Test"
        self.sc.save()
        self.pids.append(self.sc.pid)
        # Create a Master Collection
        self.mc = self.repo.get_object(type=CollectionObject)
        self.mc.label = "MasterCollection For Test"
        self.mc.save()
        self.pids.append(self.mc.pid)
        # Create a DigitalObject to be migrated
        self.digObj = self.repo.get_object(type=RushdieArrangementFile)
        self.digObj.label = "Object For Test"
        self.digObj.save()
        self.pids.append(self.digObj.pid)
        self.digObj.api.addDatastream(self.digObj.pid, "MARBL-MACTECH",
                                      "MARBL-MACTECH", mimeType="application/xml",
                                      content=self.MM_FIXTURE)
        self.digObj.api.addDatastream(self.digObj.pid, "MARBL-ANALYSIS",
                                      "MARBL-ANALYSIS", mimeType="application/xml",
                                      content=self.MA_FIXTURE)
        # Remove Arrangement model so it can be added later by the migration
        relation = (self.digObj.uriref, modelns.hasModel,
                    "info:fedora/emory-control:Arrangement-1.0")
        self.digObj.rels_ext.content.remove(relation)
        self.digObj.save()
        # Setup Command under test with quiet verbosity settings
        self.cmd = migrate_rushdie.Command()
        self.cmd.verbosity = 1
        self.cmd.v_normal = 1
        self.cmd.v_none = 0
        self.cmd.simple_collection = self.sc
        self.cmd.stdout = sys.stdout
        self.cmd.CONTENT_MODELS = CONTENT_MODELS
        self.cmd.repo = self.repo

    def tearDown(self):
        # remove every object created during the test run
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test__add_to_simple_collection(self):
        # object should gain a hasMember relation on the simple collection
        self.cmd._add_to_simple_collection(self.digObj)
        self.assertTrue((self.sc.uriref, relsextns.hasMember, self.digObj.uriref) in self.sc.rels_ext.content,
                        "%s shold be a member of the Simplecollection" % self.digObj.pid)

    def test__get_unique_objects(self):
        # duplicate pids are processed only once
        objs = self.cmd._get_unique_objects([self.digObj.pid, self.digObj.pid])
        self.assertEqual(len(objs), 1, "No dup pids should be processed")

    def test__convert_ds(self):
        # full conversion: every MARBL field should be copied into the
        # new filetech / MODS / rights / RELS-EXT content
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE, False)
        # Check all fields are moved over correctly
        # filetech
        self.assertEqual(obj.filetech.content.file[0].md5, "ffcf48e5df673fc7de985e1b859eeeec")
        self.assertEqual(obj.filetech.content.file[0].computer, "Performa 5400")
        self.assertEqual(obj.filetech.content.file[0].path,
                         "/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles")
        self.assertEqual(obj.filetech.content.file[0].rawpath,
                         "L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=")
        self.assertEqual(obj.filetech.content.file[0].attributes, "avbstclInmedz")
        self.assertEqual(obj.filetech.content.file[0].created, "1997-01-19T19:29:32")
        self.assertEqual(obj.filetech.content.file[0].modified, "1997-01-19T19:29:32")
        self.assertEqual(obj.filetech.content.file[0].type, "TEXT")
        self.assertEqual(obj.filetech.content.file[0].creator, "ttxt")
        # MODS - subseries info mapped from the SERIES_FIXTURE lookup
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.uri,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["uri"])
        self.assertEqual(obj.mods.content.series.base_ark,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["base_ark"])
        self.assertEqual(obj.mods.content.series.full_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["id"])
        self.assertEqual(obj.mods.content.series.short_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["short_id"])
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
        self.assertEqual(obj.mods.content.series.series.uri,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["uri"])
        self.assertEqual(obj.mods.content.series.series.base_ark,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["base_ark"])
        self.assertEqual(obj.mods.content.series.series.full_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["id"])
        self.assertEqual(obj.mods.content.series.series.short_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["short_id"])
        # Rights - "As is" verdict maps to access status code 2
        self.assertEqual(obj.rights.content.access_status.code, "2")
        # RELS-EXT
        self.assertTrue((obj.uriref, relsextns.isMemberOf, self.mc.uriref) in obj.rels_ext.content,
                        "Object should have isMember relation to master collection")
        self.assertTrue((obj.uriref, modelns.hasModel,
                         URIRef("info:fedora/emory-control:ArrangementAccessAllowed-1.0")) in obj.rels_ext.content,
                        "Object should have Allowed Content Model")
        # Label and DS
        self.assertEqual(obj.label, "x - the roles", "Label should be set to last part of path")
        self.assertEqual(obj.owner, "thekeep-project", "owner should be set to 'thekeep-project'")
        self.assertEqual(obj.dc.content.title, "x - the roles", "DC title should be set to last part of path")
        # DataStreams
        # have to reload obj from repository to get DS update
        obj = self.repo.get_object(pid=obj.pid, type=ArrangementObject)
        self.assertFalse("MARBL-MACTECH" in obj.ds_list, "MARBL-MACTECH should have been removed")
        self.assertFalse("MARBL-ANALYSIS" in obj.ds_list, "MARBL-ANALYSIS should have been removed")

    def test_missing_series_info(self):
        # Remove subseries info from lookup
        # NOTE(review): dict.copy() is shallow, so this del mutates the shared
        # class-level SERIES_FIXTURE, and the unmodified name (not `series`)
        # is what gets passed to _convert_ds below — looks unintentional;
        # confirm before relying on this test's isolation
        series = self.SERIES_FIXTURE.copy()
        del series["Writings by Rushdie"]["subseries_info"]
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE, False)
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
class EmailMessageTest(KeepTestCase):
    """Tests for :class:`EmailMessage` label construction, header access,
    index data, and solr-backed lookup helpers."""

    def setUp(self):
        self.repo = Repository()
        self.pids = []
        # test EmailMessage with minimal CERP from/to/subject content
        self.email = self.repo.get_object(type=EmailMessage)
        self.email.cerp.content.from_list = ['*****@*****.**']
        self.email.cerp.content.to_list = ['*****@*****.**']
        self.email.cerp.content.subject_list = ['Interesting Subject']

    def tearDown(self):
        # remove any objects saved to the repository during the test
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test_headers(self):
        # headers property should expose CERP headers as a name->value mapping
        h1 = cerp.Header()
        h1.name = "HEADER 1"
        h1.value = "value for header 1"
        h2 = cerp.Header()
        h2.name = "HEADER 2"
        h2.value = "value for header 2"
        self.email.cerp.content.headers.append(h1)
        self.email.cerp.content.headers.append(h2)
        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')

    def test_email_label(self):
        # no object label and one person in to field
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] Interesting Subject',
                         label,
                         'Should construct label when it does not exist')

        # more then one person in to list
        self.email.cerp.content.to_list.append('*****@*****.**')
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] et al. Interesting Subject',
                         label,
                         'only show first to email address when there are more than one')

        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual('Email from [email protected] to [email protected] et al.',
                         self.email.email_label(),
                         'Display message without subject when no subject is present')

        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] et al. on Friday 13 200 13:00',
                         label,
                         'only show first to email address when there are more than one')

        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label, 'label should be preserved when it exists')

    def test_index_data(self):
        # NOTE: logic for creating the label is in the label test
        # test to make sure label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data.keys())

        # mime_data does not exist, so no checksum in index data
        self.assert_('content_md5' not in data,
                     'content_md5 should not be set when mime data does not exist')

        # patch mime data to test checksum is indexed when mime data exists
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'
            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum, data['content_md5'])

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        # no match raises ObjectDoesNotExist
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum, 42)
        solr = mocksolr.return_value
        # solr lookup should filter by checksum and arrangement content model
        solr.query.assert_called_with(content_md5=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches raises MultipleObjectsReturned
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum, 42)

        # one match returns an EmailMessage instance
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        self.assert_(isinstance(em, EmailMessage))

        # custom repo object is used for object init when supplied
        mockrepo = Mock()
        em = EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        # no match raises ObjectDoesNotExist
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          '<*****@*****.**>')
        solr = mocksolr.return_value
        # solr lookup should filter by message id and arrangement content model
        solr.query.assert_called_with(arrangement_id='<*****@*****.**>',
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
def upload(request):
    '''Upload file(s) and create new fedora
    :class:`~keep.audio.models.AudioObject` (s).  Only accepts audio/x-wav
    currently.

    There are two distinct ways to upload file. The first case is
    kicked off when "fileManualUpload" exists in the posted form. If
    it does, then this was not a HTML5 browser, and the file upload
    occurs as is usual for a single file upload.

    In the other approach, the file was uploaded via a HTML5 ajax
    upload already. In this case, we are reading in various hidden
    generated form fields that indicate what was uploaded from the
    javascript code.

    :param request: http request; POST processes an upload, GET displays
        the upload form
    '''
    repo = Repository(request=request)

    ctx_dict = {
        # list of allowed file types, in a format suited for passing to javascript
        'js_allowed_types': mark_safe(json.dumps(allowed_upload_types(request.user)))
    }

    if request.method == 'POST':
        content_type = request.META.get('CONTENT_TYPE', 'application/octet-stream')
        media_type, sep, options = content_type.partition(';')
        # content type is technically case-insensitive; lower-case before comparing
        media_type = media_type.strip().lower()

        # if form has been posted, process & ingest files
        if media_type == 'multipart/form-data':
            # check for a single file upload
            form = UploadForm(request.POST, request.FILES)

            # If form is not valid (i.e., no collection specified, no
            # or mismatched files uploaded), bail out and redisplay
            # form with any error messages.
            if not form.is_valid():
                ctx_dict['form'] = form
                return TemplateResponse(request, 'file/upload.html', ctx_dict)

            # Form is valid. Get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            # get dictionary of file path -> filename, based on form data
            files_to_ingest = form.files_to_ingest()
            # process all files submitted for ingest (single or batch mode)
            if files_to_ingest:
                results = ingest_files(files_to_ingest, collection, comment, request)
                # add per-file ingest result status to template context
                ctx_dict['ingest_results'] = results
                # after processing files, fall through to display upload template

        else:
            # POST but not form data - handle ajax file upload
            return ajax_upload(request)

    # on GET or non-ajax POST, display the upload form
    ctx_dict['form'] = UploadForm()
    # convert list of allowed types for passing to javascript
    return TemplateResponse(request, 'file/upload.html', ctx_dict)
class ArrangementObjectTest(KeepTestCase):
    """Tests for :class:`ArrangementObject` lookup, status handling, index
    data, ARK label updates, and PREMIS provenance generation."""

    def setUp(self):
        self.repo = Repository()
        self.pids = []

        # create test collection with a known source id for index-data checks
        coll = self.repo.get_object(type=CollectionObject)
        coll.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        coll.mods.content.source_id = '12345'
        coll.save()
        self.pids.append(coll.pid)

        # create test arrangement object belonging to that collection
        self.arr = self.repo.get_object(type=ArrangementObject)
        self.arr.pid = 'foo:1'
        self.arr.collection = coll

    def tearDown(self):
        # remove objects created in the repository during setUp
        for pid in self.pids:
            self.repo.purge_object(pid)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        # no match raises ObjectDoesNotExist
        self.assertRaises(ObjectDoesNotExist,
                          ArrangementObject.by_arrangement_id, 42)
        solr = mocksolr.return_value
        # lookup should filter on arrangement id and content model
        solr.query.assert_called_with(arrangement_id=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches raises MultipleObjectsReturned
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned,
                          ArrangementObject.by_arrangement_id, 42)

        # one match returns an ArrangementObject instance
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        self.assert_(isinstance(ao, ArrangementObject))

        # custom repo object is used to init the result when supplied
        mockrepo = Mock()
        ao = ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        # status property maps to/from fedora object state
        obj = ArrangementObject(Mock())
        obj.arrangement_status = 'processed'
        self.assertEqual('A', obj.state)
        self.assertEqual('processed', obj.arrangement_status)

        obj.arrangement_status = 'accessioned'
        self.assertEqual('I', obj.state)
        self.assertEqual('accessioned', obj.arrangement_status)

        # unknown status should raise a ValueError
        value_error = None
        try:
            obj.arrangement_status = 'bogus'
        except ValueError:
            value_error = True
        self.assertTrue(value_error,
                        'attempting to assign an unknown status should raise a ValueError')

    def test_update_access_cmodel(self):
        obj = ArrangementObject(Mock())
        # no status set - should be set to restricted
        obj._update_access_cmodel()
        self.assert_((obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_RESTRICTED_CMODEL)) in obj.rels_ext.content)
        self.assert_((obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_ALLOWED_CMODEL)) not in obj.rels_ext.content)

        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'
        obj._update_access_cmodel()
        self.assert_((obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_RESTRICTED_CMODEL)) not in obj.rels_ext.content)
        self.assert_((obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_ALLOWED_CMODEL)) in obj.rels_ext.content)

    def test_index_data(self):
        # index data should carry object type, pid, owner and collection info
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        self.assertEquals(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEquals(self.arr.collection.mods.content.source_id,
                          idx_data['collection_source_id'])

    # Test the update_ark_label method in the keep.common.fedora
    # Note that this test is a simplified version of keep.common.fedora:ArkPidDigitalObject.test_update_ark_label
    # The udpate_ark_label here is an overriden method that is more specifc, and is used on Arrangement objects
    @patch('keep.arrangement.models.pidman')  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):
        # Create a ArrangementObject
        arrangement_object = ArrangementObject(Mock())
        # Set a pid on the object so that it could internally generate a noid etc.
        arrangement_object.pid = "test:1234"

        # Simulate when the object doesn't exist (or hasn't been saved)
        # By default it appears as if it doesn't exist
        arrangement_object.update_ark_label()
        # What we should expect is that the update_ark_label is not called on pidman
        # Also there shouldn't be any errors
        # Use the mock assertFalse to check if a method is called or not
        self.assertFalse(mockpidman.get_ark.called)

        # Mock when the object exists (returns True)
        # Note: Need to set the Mock on the class and not the object because
        # this (exists) is a property method
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            arrangement_object.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # Set the label before the object exists so we don't trigger API calls
        arrangement_object.dc.content.title = "testpid"
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            # label matches pidman: get_ark called, but no update needed
            mockpidman.get_ark.return_value = {"name": arrangement_object.dc.content.title}
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)  # assert that it is called with a noid too
            self.assertFalse(mockpidman.update_ark.called)

            # When the label is different from that in Pidman
            mockpidman.get_ark.return_value = {"name": "another pid"}
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)  # assert that it is called with a noid too
            mockpidman.update_ark.assert_called_with(noid=arrangement_object.noid,
                                                     name=arrangement_object.dc.content.title)

    def test_set_premis_object(self):
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = "test:1234"
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        self.assert_(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789', premis.object.checksums[0].digest)
        # sha1 for an empty file (mock datastream content was empty)
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1, premis.object.checksums[1].digest)
        # object format should be original mietype
        self.assertEqual('text/plain', premis.object.format.name)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        mockapi = Mock()
        mockapi.username = '******'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # set object premis so we can validate
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set, we don't care what it is exactly
        self.assert_(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__, event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)
        # generated premis should be valid
        self.assertTrue(premis.is_valid())
def batch_set_status(pid, status):
    '''Update the status of every member of a
    :class:`SimpleCollection`, and then of the collection itself.

    :param pid: pid of the SimpleCollection whose members should be updated
    :param status: target status label; must be one of ``Processed`` or
        ``Accessioned``
    :raises Exception: if the status is unknown, if any member object fails
        to update, or if the collection itself fails to save
    :returns: summary message of the number of items updated
    '''
    repo = Repository()
    batch = repo.get_object(pid, type=SimpleCollection)
    # keep track of totals for success and failure
    success = 0
    error = 0
    # translate form status codes to fedora state code
    # TODO: shift this logic to arrangement object for re-use ?
    codes = {'Processed': 'A', 'Accessioned': 'I'}
    # target state for every object in the collection
    if status not in codes:
        err_msg = 'Status %s unknown' % status
        logger.error(err_msg)
        raise Exception(err_msg)
    else:
        state = codes[status]

    # find all pids associated with this object
    # (loop variable renamed so it no longer shadows the `pid` parameter)
    member_pids = list(
        batch.rels_ext.content.objects(batch.uriref, relsextns.hasMember))

    for member_pid in member_pids:
        try:
            # pass in api from batch object to retain user credentials
            obj = ArrangementObject(batch.api, member_pid)
            obj.state = state
            obj.save('Marking as %s via SimpleCollection %s' % (status, batch.pid))
            success += 1
        except Exception as e:
            logger.error('Failed to update %s : %s' % (member_pid, e))
            error += 1

    info = {
        'success': success,
        'error': error,
        'success_plural': '' if success == 1 else 's',
        'error_plural': '' if error == 1 else 's',
        'status': status
    }
    summary_msg = "Successfully updated %(success)s item%(success_plural)s; error updating %(error)s" % info

    # if not all objects were updated correctly, exit with error
    if error > 0:
        raise Exception(summary_msg)

    # FIXME: this is based on the current form logic, but could leave
    # some member items stranded in a different status than the parent object

    batch.mods.content.create_restrictions_on_access()
    batch.mods.content.restrictions_on_access.text = status  # Change collection status
    try:
        batch.save(
            'Marking as %(status)s; updated %(success)s member item%(success_plural)s' % info)
    except Exception as e:
        # FIX: formerly formatted obj.pid here — the pid of the *last member*
        # processed (and unbound when the batch had no members) — in a message
        # about the SimpleCollection; report the collection pid instead
        save_err = "Error updating SimpleCollection %s - %s" % (batch.pid, e)
        logger.error(save_err)
        raise Exception('%s; %s' % (save_err, summary_msg))

    # success
    return 'Successfully updated %(success)s item%(success_plural)s' % info
def migrate_aff_diskimage(self, pid):
    '''Migrate an AFF :class:`DiskImage` to E01 format.

    Downloads the original AFF content to a temp directory, converts it to
    E01 with the ``ftkimager`` command-line tool, verifies both files'
    checksums match, ingests the E01 as a new :class:`DiskImage` (with the
    ftkimager console and summary output attached as supplemental
    datastreams and descriptive/rights metadata copied from the original),
    and finally updates the original object's PREMIS with the migration
    event and relationship.

    :param pid: pid of the AFF :class:`DiskImage` to migrate
    :returns: summary message with original and migrated pids
    :raises Exception: if the object is missing, not an AFF DiskImage,
        already migrated, or if any conversion/verification/ingest step fails
    '''
    # constants describing the conversion tool, used in PREMIS below
    creating_application = 'AccessData FTK Imager'
    application_version = 'v3.1.1 CLI'
    migration_event_detail = 'program="%s"; version="%s"' % \
        (creating_application, application_version)
    migration_event_outcome = 'AFF reformatted as E01 using command line ' + \
        'FTK program with settings: --e01 --compress 0 --frag 100T --quiet'

    # use the configured ingest staging area as the base tmp dir
    # for all temporary files
    staging_dir = getattr(settings, 'LARGE_FILE_STAGING_DIR', None)
    # create a tempdir within the large file staging area
    tmpdir = tempfile.mkdtemp(suffix='-aff-migration', dir=staging_dir)
    logger.debug('Using tmpdir %s', tmpdir)

    # Retrieve the object to be migrated
    repo = Repository()
    original = repo.get_object(pid, type=DiskImage)

    # check object before migrating
    # - exists in fedora
    if not original.exists:
        # raise Exception
        raise Exception('%s not found in Fedora' % original.pid)
    # - is a disk image
    if not original.has_requisite_content_models:
        raise Exception('%s is not a DiskImage object' % original.pid)
    # - is an AFF disk image
    if original.provenance.content.object.format.name != 'AFF':
        raise Exception('%s DiskImage format is not AFF' % original.pid)
    # - has not already been migrated
    if original.migrated is not None:
        raise Exception('%s has already been migrated' % original.pid)

    # download the aff disk image to a tempfile
    # (delete=False because the path is reused after the handle is closed;
    # removed explicitly near the end of the migration)
    aff_file = tempfile.NamedTemporaryFile(suffix='.aff',
        prefix='keep-%s_' % original.noid, dir=tmpdir, delete=False)
    logger.debug('Saving AFF as %s for conversion (datastream size: %s)' \
        % (aff_file.name, filesizeformat(original.content.size)))
    try:
        # stream content in chunks to avoid loading the image into memory
        for chunk in original.content.get_chunked_content():
            aff_file.write(chunk)
    except Exception as err:
        raise Exception('Error downloading %s AFF for conversion' % original.pid)
    # close the file handle in case of weird interactions with ftkimager
    aff_file.close()
    aff_size = os.path.getsize(aff_file.name)
    logger.debug('Downloaded %s' % filesizeformat(aff_size))

    # run ftkimager to generate the E01 version
    logger.debug('Running ftkimager to generate E01')
    e01_file = tempfile.NamedTemporaryFile(suffix='.E01',
        prefix='keep-%s_' % original.noid, dir=tmpdir, delete=False)
    # close the file handle in case of weird interactions with ftkimager
    e01_file.close()
    # file handle to capture console output from ftkimager
    ftk_output = tempfile.NamedTemporaryFile(suffix='.txt',
        prefix='keep-%s-ftkimager_' % original.noid, dir=tmpdir)
    logger.debug('E01 temp file is %s' % e01_file.name)
    logger.debug('ftkimager output temp file is %s' % ftk_output.name)
    # ftkimager adds .E01 to the specified filename, so pass in filename without
    e01_file_basename, ext = os.path.splitext(e01_file.name)
    convert_command = ['ftkimager', aff_file.name, e01_file_basename,
        '--e01', '--compress', '0', '--frag', '100T', '--quiet']
    # quiet simply suppresses progress output, which is not meaningful
    # in a captured text file
    logger.debug('conversion command is %s' % ' '.join(convert_command))
    return_val = subprocess.call(convert_command, stdout=ftk_output,
        stderr=subprocess.STDOUT)
    logger.debug('ftkimager return value is %s' % return_val)
    # ftkimager also writes a summary text file next to the E01 output
    ftk_detail_output = '%s.txt' % e01_file.name

    # sanity check: conversion must have produced non-empty output
    e01_size = os.path.getsize(e01_file.name)
    if e01_size == 0:
        raise Exception('Generated E01 file is 0 size')

    logger.info('Generated E01 (%s) from %s AFF (%s)' % \
        (filesizeformat(e01_size), original.pid, filesizeformat(aff_size)))

    # use ftkimager to verify aff and e01 and compare checksums
    aff_checksums = ftkimager_verify(aff_file.name)
    if not aff_checksums:
        raise Exception('Error running ftkimager verify on AFF for %s' % original.pid)
    e01_checksums = ftkimager_verify(e01_file.name)
    if not e01_checksums:
        raise Exception('Error running ftkimager verify on E01 for %s' % original.pid)
    logger.debug('AFF verify checksums: %s' % \
        ', '.join('%s: %s' % (k, v) for k, v in aff_checksums.iteritems()))
    logger.debug('E01 verify checksums: %s' % \
        ', '.join('%s: %s' % (k, v) for k, v in e01_checksums.iteritems()))
    if aff_checksums != e01_checksums:
        raise Exception('AFF and E01 ftkimager verify checksums do not match')

    # create a new diskimage object from the file
    # - calculate file uri for content location
    e01_file_uri = fedora_file_uri(e01_file.name)
    logger.debug('E01 fedora file URI is %s', e01_file_uri)
    # change permissions on tmpdir + files to ensure fedora can access them
    os.chmod(tmpdir, 0775)
    os.chmod(e01_file.name, 0666)
    os.chmod(ftk_output.name, 0666)
    os.chmod(ftk_detail_output, 0666)

    migrated = DiskImage.init_from_file(e01_file.name,
        initial_label=original.label, content_location=e01_file_uri)

    # add ftkimager text output & details as supplemental files
    # - console output captured from subprocess call
    dsobj = migrated.getDatastreamObject('supplement0',
        dsobj_type=FileDatastreamObject)
    dsobj.label = 'ftkimager_output.txt'
    dsobj.mimetype = 'text/plain'
    dsobj.checksum = md5sum(ftk_output.name)
    logger.debug('Adding ftkimager console output as supplemental dastream %s label=%s mimetype=%s checksum=%s' % \
        (dsobj.id, dsobj.label, dsobj.mimetype, dsobj.checksum))
    dsobj.content = open(ftk_output.name).read()
    # - text file generated by ftkimager alongside the E01
    dsobj2 = migrated.getDatastreamObject('supplement1',
        dsobj_type=FileDatastreamObject)
    dsobj2.label = 'ftkimager_summary.txt'
    dsobj2.mimetype = 'text/plain'
    dsobj2.checksum = md5sum(ftk_detail_output)
    logger.debug('Adding ftkimager summary as supplemental dastream %s label=%s mimetype=%s checksum=%s' % \
        (dsobj2.id, dsobj2.label, dsobj2.mimetype, dsobj2.checksum))
    dsobj2.content = open(ftk_detail_output).read()

    # set metadata based on original disk image
    # - associate with original
    migrated.original = original
    # copy over descriptive & rights metadata
    # - collection membership
    migrated.collection = original.collection
    # - mods title, covering dates, abstract
    migrated.mods.content.title = original.mods.content.title
    migrated.mods.content.abstract = original.mods.content.abstract
    migrated.mods.content.coveringdate_start = original.mods.content.coveringdate_start
    migrated.mods.content.coveringdate_end = original.mods.content.coveringdate_end
    # - entire rights datastream
    migrated.rights.content = original.rights.content

    ### Update generated premis to describe migration.
    premis_ds = migrated.provenance.content
    premis_ds.object.composition_level = 0
    # these values are the same for all migrated AFFs
    premis_ds.object.create_creating_application()
    premis_ds.object.creating_application.name = creating_application
    premis_ds.object.creating_application.version = application_version
    premis_ds.object.creating_application.date = date.today()

    # add relationship to the original object
    rel = PremisRelationship(type='derivation')
    rel.subtype = 'has source'
    rel.related_object_type = 'ark'
    rel.related_object_id = original.mods.content.ark
    # relationship must also reference the migration event on the
    # original, which doesn't exist yet. Generate a migration event
    # id now to use for both
    migration_event_id = uuid.uuid1()
    rel.related_event_type = 'UUID'
    rel.related_event_id = migration_event_id
    premis_ds.object.relationships.append(rel)

    ## NOTE: Due to a Fedora bug with checksums and file uri ingest,
    ## content datastream checksum must be cleared out before ingest
    ## and manually checked after.

    # store datastream checksum that would be sent to fedora
    e01_checksum = migrated.content.checksum
    # clear it out so Fedora can ingest without erroring
    migrated.content.checksum = None

    # ingest
    try:
        migrated.save('Ingest migrated version of %s' % original.pid)
        logger.debug('Migrated object ingested as %s' % migrated.pid)
    except DuplicateContent as err:
        raise Exception('Duplicate content detected for %s: %s %s',
            original.pid, err, ', '.join(err.pids))
    # would probably be good to catch other fedora errors

    # remove temporary files
    for tmpfilename in [aff_file.name, e01_file.name, ftk_output.name,
                        ftk_detail_output]:
        os.remove(tmpfilename)

    # reinitialize migrated object, just to avoid any issues
    # with accessing ark uri for use in original object premis
    migrated = repo.get_object(migrated.pid, type=DiskImage)

    # verify checksum (manual post-ingest check required by the Fedora
    # bug workaround noted above)
    if migrated.content.checksum != e01_checksum:
        raise Exception('Checksum mismatch detected on E01 for %s',
            migrated.pid)

    # once migrated object has been ingested,
    # update original object with migration information
    # - add rels-ext reference to migrated object
    original.migrated = migrated
    # - update premis with migration event and relationship
    migration_event = PremisEvent()
    migration_event.id_type = 'UUID'
    migration_event.id = migration_event_id
    migration_event.type = 'migration'
    migration_event.date = datetime.now().isoformat()
    migration_event.detail = migration_event_detail
    migration_event.outcome = 'Pass'
    migration_event.outcome_detail = migration_event_outcome
    migration_event.agent_type = 'fedora user'
    migration_event.agent_id = repo.username
    # premis wants both source and outcome objects linked in the event
    link_source = PremisLinkingObject(id_type='ark')
    link_source.id = original.mods.content.ark
    link_source.role = 'source'
    link_outcome = PremisLinkingObject(id_type='ark')
    link_outcome.id = migrated.mods.content.ark
    link_outcome.role = 'outcome'
    migration_event.linked_objects.extend([link_source, link_outcome])
    original.provenance.content.events.append(migration_event)
    # add relation to migrated object in to premis object
    rel = PremisRelationship(type='derivation')
    rel.subtype = 'is source of'
    rel.related_object_type = 'ark'
    rel.related_object_id = migrated.mods.content.ark
    rel.related_event_type = 'UUID'
    rel.related_event_id = migration_event.id
    original.provenance.content.object.relationships.append(rel)
    original.save()
    logger.debug('Original disk image updated with migration data')

    # remove aff migration temp dir and any remaining contents
    try:
        shutil.rmtree(tmpdir)
    except OSError:
        # tempdir removal could fail due to nfs files
        # wait a few seconds and try again
        time.sleep(3)
        try:
            shutil.rmtree(tmpdir)
        except OSError as os_err:
            # non-fatal: migration already succeeded, so just warn
            logger.warning('Failed to remove tmpdir %s : %s',
                tmpdir, os_err)

    logger.info('Migrated %s AFF to %s E01' % (original.pid, migrated.pid))
    return 'Migrated %s to %s' % (original.pid, migrated.pid)
def batch_set_status(pid, status):
    '''Update the status of every member item of a SimpleCollection, then
    update the status of the collection object itself.

    :param pid: pid of the :class:`SimpleCollection` whose members should
        be updated
    :param status: new status label; one of ``Processed`` or ``Accessioned``
    :returns: summary message string on success
    :raises Exception: if the status is not recognized, if any member item
        failed to update, or if the collection object could not be saved
    '''
    repo = Repository()
    batch = repo.get_object(pid, type=SimpleCollection)
    # keep track of totals for success and failure
    success = 0
    error = 0
    # translate form status codes to fedora state code
    # TODO: shift this logic to arrangement object for re-use ?
    codes = {'Processed': 'A', 'Accessioned': 'I'}
    # target state for every object in the collection
    if status not in codes:
        err_msg = 'Status %s unknown' % status
        logger.error(err_msg)
        raise Exception(err_msg)
    state = codes[status]

    # find all pids associated with this object
    # NOTE: loop variable renamed so it does not shadow the `pid` parameter
    pids = list(batch.rels_ext.content.objects(batch.uriref,
                                               relsextns.hasMember))
    for member_pid in pids:
        try:
            # pass in api from batch object to retain user credentials
            obj = ArrangementObject(batch.api, member_pid)
            obj.state = state
            obj.save('Marking as %s via SimpleCollection %s' %
                     (status, batch.pid))
            success += 1
        except Exception as e:
            # best-effort: log and continue so remaining members still update
            logger.error('Failed to update %s : %s' % (member_pid, e))
            error += 1

    info = {
        'success': success,
        'error': error,
        'success_plural': '' if success == 1 else 's',
        'error_plural': '' if error == 1 else 's',
        'status': status
    }
    # include the (previously unused) error pluralization in the summary
    summary_msg = ('Successfully updated %(success)s item%(success_plural)s; '
                   'error updating %(error)s item%(error_plural)s') % info

    # if not all objects were updated correctly, exit with error
    if error > 0:
        raise Exception(summary_msg)

    # FIXME: this is based on the current form logic, but could leave
    # some member items stranded in a different status than the parent object
    batch.mods.content.create_restrictions_on_access()
    # Change collection status
    batch.mods.content.restrictions_on_access.text = status
    try:
        batch.save(
            'Marking as %(status)s; updated %(success)s member item%(success_plural)s' % info)
    except Exception as e:
        # report the collection pid here (the original used the last member
        # item's pid, which was wrong and unbound when the batch was empty)
        save_err = "Error updating SimpleCollection %s - %s" % (batch.pid, e)
        logger.error(save_err)
        raise Exception('%s; %s' % (save_err, summary_msg))

    # success
    return 'Successfully updated %(success)s item%(success_plural)s' % info