class TestMigrateRushdie(TestCase):
    """Tests for the migrate_rushdie management command helpers
    (:class:`migrate_rushdie.Command`)."""

    # MARBL-MACTECH datastream fixture: Mac file-system technical metadata
    # for a single file.
    MM_FIXTURE = '''<macfs:document xmlns:macfs="info:fedora/emory-control:Rushdie-MacFsData-1.0"> <macfs:md5>ffcf48e5df673fc7de985e1b859eeeec</macfs:md5> <macfs:file> <macfs:computer>Performa 5400</macfs:computer> <macfs:path>/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles</macfs:path> <macfs:rawpath>L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=</macfs:rawpath> <macfs:attributes>avbstclInmedz</macfs:attributes> <macfs:created>1997-01-19T19:29:32</macfs:created> <macfs:modified>1997-01-19T19:29:32</macfs:modified> <macfs:type>TEXT</macfs:type> <macfs:creator>ttxt</macfs:creator> </macfs:file> </macfs:document>'''
    # MARBL-ANALYSIS datastream fixture: archivist series/subseries verdict.
    MA_FIXTURE = '''<marbl:analysis xmlns:marbl="info:fedora/emory-control:Rushdie-MarblAnalysis-1.0"> <marbl:series>Writings by Rushdie</marbl:series> <marbl:subseries>Fiction</marbl:subseries> <marbl:verdict>As is</marbl:verdict> </marbl:analysis>'''
    # Finding-aid series/subseries lookup in the shape produced by
    # Command._create_series_lookup.
    SERIES_FIXTURE = {'Writings by Rushdie': {
        'series_info': {
            'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
            'id': 'rushdie1000_series2',
            'short_id': 'series2',
            'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2'},
        'subseries_info': {
            'Fiction': {
                'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                'id': 'rushdie1000_subseries2.1',
                'short_id': 'subseries2.1',
                'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2/subseries2.1'}}}}

    def setUp(self):
        """Create the Fedora fixture objects and a configured Command."""
        self.repo = Repository()
        self.pids = []

        #Create a simple Collection
        self.sc = self.repo.get_object(type=SimpleCollection)
        self.sc.label = "SimpleCollection For Test"
        self.sc.save()
        self.pids.append(self.sc.pid)

        #Create a Master Collection
        self.mc = self.repo.get_object(type=CollectionObject)
        self.mc.label = "MasterCollection For Test"
        self.mc.save()
        self.pids.append(self.mc.pid)

        #Create a a DigitalObject
        self.digObj = self.repo.get_object(type=RushdieArrangementFile)
        self.digObj.label = "Object For Test"
        self.digObj.save()
        self.pids.append(self.digObj.pid)
        self.digObj.api.addDatastream(self.digObj.pid, "MARBL-MACTECH",
                                      "MARBL-MACTECH",
                                      mimeType="application/xml",
                                      content=self.MM_FIXTURE)
        self.digObj.api.addDatastream(self.digObj.pid, "MARBL-ANALYSIS",
                                      "MARBL-ANALYSIS",
                                      mimeType="application/xml",
                                      content=self.MA_FIXTURE)

        #Remove Arrangement model so it can be added later
        relation = (self.digObj.uriref, modelns.hasModel,
                    "info:fedora/emory-control:Arrangement-1.0")
        self.digObj.rels_ext.content.remove(relation)
        self.digObj.save()

        #Setup Command
        self.cmd = migrate_rushdie.Command()
        self.cmd.verbosity = 1
        self.cmd.v_normal = 1
        self.cmd.v_none = 0
        self.cmd.simple_collection = self.sc
        self.cmd.stdout = sys.stdout
        self.cmd.CONTENT_MODELS = CONTENT_MODELS
        self.cmd.repo = self.repo

    def tearDown(self):
        # purge every object created during setUp / the tests
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test__add_to_simple_collection(self):
        self.cmd._add_to_simple_collection(self.digObj)
        self.assertTrue((self.sc.uriref, relsextns.hasMember, self.digObj.uriref)
                        in self.sc.rels_ext.content,
                        "%s should be a member of the Simplecollection" % self.digObj.pid)

    def test__get_unique_objects(self):
        #duplicate pids are processed only once
        objs = self.cmd._get_unique_objects([self.digObj.pid, self.digObj.pid])
        self.assertEqual(len(objs), 1, "No dup pids should be processed")

    def test__convert_ds(self):
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE, False)
        #Check all fields are moved over correctly
        #filetech
        self.assertEqual(obj.filetech.content.file[0].md5,
                         "ffcf48e5df673fc7de985e1b859eeeec")
        self.assertEqual(obj.filetech.content.file[0].computer, "Performa 5400")
        self.assertEqual(obj.filetech.content.file[0].path,
                         "/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles")
        self.assertEqual(obj.filetech.content.file[0].rawpath,
                         "L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=")
        self.assertEqual(obj.filetech.content.file[0].attributes, "avbstclInmedz")
        self.assertEqual(obj.filetech.content.file[0].created, "1997-01-19T19:29:32")
        self.assertEqual(obj.filetech.content.file[0].modified, "1997-01-19T19:29:32")
        self.assertEqual(obj.filetech.content.file[0].type, "TEXT")
        self.assertEqual(obj.filetech.content.file[0].creator, "ttxt")
        #MODS
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.uri,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["uri"])
        self.assertEqual(obj.mods.content.series.base_ark,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["base_ark"])
        self.assertEqual(obj.mods.content.series.full_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["id"])
        self.assertEqual(obj.mods.content.series.short_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["short_id"])
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
        self.assertEqual(obj.mods.content.series.series.uri,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["uri"])
        self.assertEqual(obj.mods.content.series.series.base_ark,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["base_ark"])
        self.assertEqual(obj.mods.content.series.series.full_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["id"])
        self.assertEqual(obj.mods.content.series.series.short_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["short_id"])
        #Rights
        self.assertEqual(obj.rights.content.access_status.code, "2")
        #RELS-EXT
        self.assertTrue((obj.uriref, relsextns.isMemberOf, self.mc.uriref)
                        in obj.rels_ext.content,
                        "Object should have isMember relation to master collection")
        self.assertTrue((obj.uriref, modelns.hasModel,
                         URIRef("info:fedora/emory-control:ArrangementAccessAllowed-1.0"))
                        in obj.rels_ext.content,
                        "Object should have Allowed Content Model")
        #Label and DS
        self.assertEqual(obj.label, "x - the roles",
                         "Label should be set to last part of path")
        self.assertEqual(obj.owner, "thekeep-project",
                         "owner should be set to 'thekeep-project'")
        self.assertEqual(obj.dc.content.title, "x - the roles",
                         "DC title should be set to last part of path")
        #DataStreams
        #have to reload obj from repository to get DS update
        obj = self.repo.get_object(pid=obj.pid, type=ArrangementObject)
        self.assertFalse("MARBL-MACTECH" in obj.ds_list,
                         "MARBL-MACTECH should have been removed")
        self.assertFalse("MARBL-ANALYSIS" in obj.ds_list,
                         "MARBL-ANALYSIS should have been removed")

    def test_missing_series_info(self):
        import copy
        # Use a deep copy before deleting the subseries info: a shallow
        # dict.copy() aliases the nested dicts, so the del would mutate the
        # shared class-level SERIES_FIXTURE and leak into other tests.
        series = copy.deepcopy(self.SERIES_FIXTURE)
        #Remove subseries info from lookup
        del series["Writings by Rushdie"]["subseries_info"]
        # pass the modified copy (the original code passed the untouched
        # fixture, relying on the aliasing bug above)
        obj = self.cmd._convert_ds(self.digObj, self.mc, series, False)
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
class EmailMessageTest(KeepTestCase):
    """Tests for :class:`EmailMessage`: header access, label construction,
    index data, and the solr-backed by_checksum / by_message_id lookups."""

    def setUp(self):
        # build an unsaved EmailMessage with minimal CERP metadata;
        # pids collects anything that actually gets saved, for tearDown
        self.repo = Repository()
        self.pids = []
        # test EmailMessage
        self.email = self.repo.get_object(type=EmailMessage)
        self.email.cerp.content.from_list = ['*****@*****.**']
        self.email.cerp.content.to_list = ['*****@*****.**']
        self.email.cerp.content.subject_list = ['Interesting Subject']

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test_headers(self):
        # headers property should expose CERP headers as a name -> value mapping
        h1 = cerp.Header()
        h1.name = "HEADER 1"
        h1.value = "value for header 1"
        h2 = cerp.Header()
        h2.name = "HEADER 2"
        h2.value = "value for header 2"
        self.email.cerp.content.headers.append(h1)
        self.email.cerp.content.headers.append(h2)
        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')

    def test_email_label(self):
        """email_label should build a label from from/to/subject/date, but
        preserve an existing object label."""
        # no object label and one person in to field
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] Interesting Subject',
                         label,
                         'Should construct label when it does not exist')
        # more then one person in to list
        self.email.cerp.content.to_list.append('*****@*****.**')
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] et al. Interesting Subject',
                         label,
                         'only show first to email address when there are more than one')
        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual('Email from [email protected] to [email protected] et al.',
                         self.email.email_label(),
                         'Display message without subject when no subject is present')
        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] et al. on Friday 13 200 13:00',
                         label,
                         'only show first to email address when there are more than one')
        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label,
                         'label should be preserved when it exists')

    def test_index_data(self):
        # NOTE: logic for creating the label is in the label test
        # test to make sure label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data.keys())
        # mime_data does not exist, so no c
        self.assert_('content_md5' not in data,
                     'content_md5 should not be set when mime data does not exist')
        # patch mime data to test exists /cchecksum
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'
            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum, data['content_md5'])

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        """by_checksum: 0 matches raises, 2+ matches raises, 1 match
        resolves to an EmailMessage via the (optionally supplied) repo."""
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(content_md5=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum, 42)
        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        self.assert_(isinstance(em, EmailMessage))
        # custom repo object
        mockrepo = Mock()
        em = EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        # no match -> ObjectDoesNotExist; verify the solr query shape
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          '<*****@*****.**>')
        solr = mocksolr.return_value
        solr.query.assert_called_with(arrangement_id='<*****@*****.**>',
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
class Command(BaseCommand):
    '''Read CSV file and creates (or adds to) a Simple Collection and associated ArrangementObjects with the SimpleCollection and the Master collection'''

    def get_password_option(option, opt, value, parser):
        # optparse callback: prompt for the password interactively so it
        # never appears on the command line / in shell history
        setattr(parser.values, option.dest, getpass())

    #Set up additional options
    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    dest='no-act',
                    default=False,
                    help='Does not create PIDs or ingest anything into Fedora. Only parses file and outputs results'),
        make_option('--add', '-a',
                    action='store',
                    dest='add',
                    help='adds to the SimpleCollection specified by pid, does not create a new SimpleCollection'),
        make_option('--username', '-u',
                    dest='username',
                    action='store',
                    help='''Username to connect to fedora'''),
        make_option('--password',
                    dest='password',
                    action='callback',
                    callback=get_password_option,
                    help='''Prompt for password required when username used'''),
    )

    args = '<CSV file> <master collection pid> <new simple collection name>'
    help = __doc__

    def _create_series_lookup(self):
        """Build ``self.series``: series title -> series_info / subseries_info
        dicts (id, short_id, base_ark, uri) from the rushdie1000 finding aid."""
        #series / subseries info
        self.series = {}
        #exist query params
        return_fields = ['eadid']
        search_fields = {'eadid': 'rushdie1000'}
        queryset = Series.objects.also(*return_fields).filter(**search_fields)
        for s in queryset:
            #series info
            self.series[s.title] = {}
            self.series[s.title]['series_info'] = {}
            self.series[s.title]['series_info']['id'] = s.id
            self.series[s.title]['series_info']['short_id'] = s.short_id
            self.series[s.title]['series_info']['base_ark'] = s.eadid.url
            self.series[s.title]['series_info']['uri'] = "https://findingaids.library.emory.edu/documents/%s/%s" % \
                (s.eadid.value, s.short_id)
            #subseries info
            if s.subseries:
                self.series[s.title]['subseries_info'] = {}
                for sub in s.subseries:
                    self.series[s.title]['subseries_info'][sub.title] = {}
                    self.series[s.title]['subseries_info'][sub.title]['id'] = sub.id
                    self.series[s.title]['subseries_info'][sub.title]['short_id'] = sub.short_id
                    self.series[s.title]['subseries_info'][sub.title]['base_ark'] = s.eadid.url
                    self.series[s.title]['subseries_info'][sub.title]['uri'] = "https://findingaids.library.emory.edu/documents/%s/%s/%s" % \
                        (s.eadid.value, s.short_id, sub.short_id)

    def _create_arrangement(self, row):
        """Build (but do not save) an ArrangementObject from one CSV row:
        filetech, DC title, default rights, MODS series, master-collection
        relation, and inactive state."""
        #Account for unicode characters
        #Preserve unicode characters for raw path,
        #but remove unicode character for other mappings
        rawpath = base64.encodestring(row["filename"])
        path = row["filename"]
        path = unicode(path, 'utf8')
        creator = row["creator"]
        creator = unicode(creator, 'utf8')

        # set values in filetech DS
        obj = self.repo.get_object(type=ArrangementObject)
        obj.label = path.rpartition('/')[2]
        obj.filetech.content.file.append(FileMasterTech_Base())
        obj.filetech.content.file[0].local_id = row['id']
        obj.filetech.content.file[0].md5 = row['checksum']
        obj.filetech.content.file[0].computer = row['computer']
        obj.filetech.content.file[0].path = path
        obj.filetech.content.file[0].rawpath = rawpath
        obj.filetech.content.file[0].attributes = row['attrib']
        obj.filetech.content.file[0].created = row['created']
        obj.filetech.content.file[0].modified = row['modified']
        obj.filetech.content.file[0].creator = creator

        #map DC title
        obj.dc.content.title = path.rpartition('/')[2]

        #map default verdict of 10 "Undetermined" in rights DS
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = "10"

        #map series in MODS
        #RecordType used to lookup series info
        rec_type = row["rec_type"]
        rec_type = rec_type.strip()
        if rec_type not in self.series:
            rec_type = None
        if rec_type is not None:
            obj.mods.content.create_series()
            obj.mods.content.series.title = rec_type
            obj.mods.content.series.uri = self.series[rec_type]["series_info"]["uri"]
            obj.mods.content.series.base_ark = self.series[rec_type]["series_info"]["base_ark"]
            obj.mods.content.series.full_id = self.series[rec_type]["series_info"]["id"]
            obj.mods.content.series.short_id = self.series[rec_type]["series_info"]["short_id"]
        else:
            if self.verbosity > self.v_none:
                self.stdout.write("Series %s not found\n" % row["rec_type"])

        # set association to master collection
        relation = (obj.uriref, relsextns.isMemberOf, self.master_obj.uriref)
        obj.rels_ext.content.add(relation)
        if self.verbosity > self.v_normal:
            self.stdout.write(
                "Adding %s isMemberOf %s relation on ArrangementObject\n" %
                (obj.label, self.master_obj.pid))

        #set state to inactive by default
        obj.state = "I"
        return obj

    def handle(self, *args, **options):
        """Parse the CSV, create ArrangementObjects, attach them to the
        Simple Collection, and save everything (unless --noact)."""
        #collect arrangement pids here to delete later if SimpleCollection fails to save
        self.arrangement_pids = []

        self._create_series_lookup()

        #0 = none, 1 = normal, 2 = all
        self.v_none = 0
        self.v_normal = 1
        if 'verbosity' in options:
            self.verbosity = int(options['verbosity'])
        else:
            self.verbosity = self.v_normal

        #Create the repo
        repo_args = {}
        if options.get('username') is not None:
            repo_args['username'] = options.get('username')
        if options.get('password') is not None:
            repo_args['password'] = options.get('password')
        self.repo = Repository(**repo_args)

        #Check to make sure all args and options are present
        # NOTE: renamed from `file` to avoid shadowing the builtin
        try:
            csv_file = args[0]
        except IndexError:
            raise CommandError("No CSV file specified")
        try:
            self.master_pid = args[1]
        except IndexError:
            raise CommandError("No master collection pid specified")
        #if -a or --add is used the new SimpleCollection name is ignored
        try:
            if not options["add"]:
                self.simple_collection_name = args[2]
            else:
                self.simple_collection_pid = options["add"]
        except IndexError:
            raise CommandError(
                "An existing SimpleCollection pid must be specified with the -a option or \
a new SimpleCollection name must be specified as an argument")

        #If Master collection does not exist then raise an exception
        self.master_obj = self.repo.get_object(type=CollectionObject, pid=self.master_pid)
        if not self.master_obj.exists:
            raise CommandError("Master Collection %s does not exist" % (self.master_pid))
        else:
            if self.verbosity > self.v_none:
                self.stdout.write("Using Master Collection: %s(%s)\n" %
                                  (self.master_obj.label, self.master_obj.pid))

        #Get or create SimpleColletion object
        if options["add"]:
            # resolving an existing pid can fail (e.g. a 404 from the
            # repository) -- trap only that lookup and report it cleanly.
            # The previous bare `except:` also swallowed errors from the
            # create branch and then crashed referencing the unset
            # simple_collection_pid attribute.
            try:
                simple_collection = self.repo.get_object(
                    type=SimpleCollection, pid=self.simple_collection_pid)
            except Exception:
                raise CommandError("Pid %s does not exist" % self.simple_collection_pid)
        else:
            simple_collection = self.repo.get_object(type=SimpleCollection)
            simple_collection.label = self.simple_collection_name
            simple_collection.dc.content.title = self.simple_collection_name
            simple_collection.mods.content.create_restrictions_on_access()
            simple_collection.mods.content.restrictions_on_access.text = "Accessioned"

        #try to read file into a dict and assign the field names
        # NOTE(review): the file handle is never closed explicitly; it is
        # released at process exit, which is acceptable for a one-shot command
        try:
            reader = csv.DictReader(open(csv_file, 'rb'),
                                    fieldnames=[
                                        "id", "checksum", "filename",
                                        "rec_type", "file_type", "creator",
                                        "attrib", "created", "modified",
                                        "computer", "size"
                                    ])
            if self.verbosity > self.v_none:
                self.stdout.write("Reading CSV: %s\n" % (csv_file))
        except IOError:
            raise CommandError("Could not read file %s" % csv_file)

        # skip the header row in CSV file
        reader.next()

        #read each field
        csv_read = 0
        arrangement_saved = 0
        errors = 0
        for row in reader:
            try:
                csv_read += 1
                arrangement_object = self._create_arrangement(row)

                if not options['no-act']:
                    try:
                        arrangement_object.save()
                        arrangement_saved += 1
                        self.arrangement_pids.append(arrangement_object.pid)
                        if self.verbosity > self.v_none:
                            self.stdout.write(
                                "Saved ArrangementObject %s(%s)\n" %
                                (arrangement_object.label, arrangement_object.pid))
                    except Exception as e:
                        if self.verbosity > self.v_none:
                            # str(e) instead of the deprecated e.message
                            self.stdout.write(
                                "Error saving ArrangementObject %s: %s\n" %
                                (arrangement_object.label, e))
                        errors += 1
                else:
                    if self.verbosity > self.v_none:
                        self.stdout.write("TEST ArrangementObject %s\n" %
                                          (arrangement_object.label))
                        if self.verbosity > self.v_normal:
                            self.stdout.write("===RELS-EXT===\n")
                            for entry in arrangement_object.rels_ext.content:
                                self.stdout.write("%s\n" % list(entry))
                            self.stdout.write("===MODS===\n")
                            self.stdout.write(
                                "%s\n" % arrangement_object.mods.content.serialize())

                #Add each ArrangementObject to the SimpleCollection
                relation = (simple_collection.uriref, relsextns.hasMember,
                            arrangement_object.uriref)
                simple_collection.rels_ext.content.add(relation)
                if self.verbosity > self.v_normal:
                    self.stdout.write(
                        "Adding hasMember %s relation on SimpleCollection\n" %
                        (arrangement_object.pid))
            except Exception as e:
                self.stdout.write("Error in record id %s: %s\n" % (row["id"], e))
                errors += 1

        if not options['no-act']:
            try:
                simple_collection.save()
                self.stdout.write(
                    "Saved SimpleCollection %s(%s)\n" %
                    (simple_collection.label, simple_collection.pid))
            except Exception as e:
                if self.verbosity > self.v_none:
                    self.stdout.write(
                        "Error saving SimpleCollection %s: %s\n" %
                        (simple_collection.label, e))
                    self.stdout.write(
                        "Deleting Arrangement pids so they will not be Orphans\n")
                errors += 1
                # roll back: purge everything that was already ingested
                for pid in self.arrangement_pids:
                    self.repo.purge_object(pid)
                    if self.verbosity > self.v_none:
                        self.stdout.write("Deleting: %s\n" % (pid))
                    arrangement_saved -= 1
        else:
            if self.verbosity > self.v_none:
                self.stdout.write("TEST SimpleCollection %s\n" %
                                  (simple_collection.label))
                if self.verbosity > self.v_normal:
                    self.stdout.write("===RELS-EXT===\n")
                    for entry in simple_collection.rels_ext.content:
                        self.stdout.write("%s\n" % list(entry))
                    self.stdout.write("===DC===\n")
                    self.stdout.write("%s\n" % simple_collection.dc.content.serialize())
                    self.stdout.write("===MODS===\n")
                    self.stdout.write("%s\n" % simple_collection.mods.content.serialize())

        #print Summary
        self.stdout.write("\n\nSUMMARY\n=======\n")
        self.stdout.write("SimpleCollection: %s(%s)\n" %
                          (simple_collection.label, simple_collection.pid))
        self.stdout.write("Master Collection Object: %s(%s)\n" %
                          (self.master_obj.label, self.master_obj.pid))
        self.stdout.write("%s Records read from CSV file\n" % (csv_read))
        self.stdout.write("%s Records created\n" % (arrangement_saved))
        self.stdout.write("%s Errors\n" % (errors))
class ArrangementObjectTest(KeepTestCase):
    """Tests for :class:`ArrangementObject`: solr lookup, status mapping,
    access content models, index data, ARK label sync, and PREMIS."""

    def setUp(self):
        # a saved parent collection plus an unsaved arrangement object
        # linked to it; pids tracks saved objects for tearDown cleanup
        self.repo = Repository()
        self.pids = []
        # create test collection
        coll = self.repo.get_object(type=CollectionObject)
        coll.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        coll.mods.content.source_id = '12345'
        coll.save()
        self.pids.append(coll.pid)
        #create test arrangement object
        self.arr = self.repo.get_object(type=ArrangementObject)
        self.arr.pid = 'foo:1'
        self.arr.collection = coll

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        """by_arrangement_id: 0 matches raises, 2+ matches raises, 1 match
        resolves via the (optionally supplied) repo."""
        # no match
        self.assertRaises(ObjectDoesNotExist,
                          ArrangementObject.by_arrangement_id, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(arrangement_id=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned,
                          ArrangementObject.by_arrangement_id, 42)
        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        self.assert_(isinstance(ao, ArrangementObject))
        # custom repo object
        mockrepo = Mock()
        ao = ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        # status setter maps to object state: processed -> A, accessioned -> I
        obj = ArrangementObject(Mock())
        obj.arrangement_status = 'processed'
        self.assertEqual('A', obj.state)
        self.assertEqual('processed', obj.arrangement_status)
        obj.arrangement_status = 'accessioned'
        self.assertEqual('I', obj.state)
        self.assertEqual('accessioned', obj.arrangement_status)

        value_error = None
        try:
            obj.arrangement_status = 'bogus'
        except ValueError:
            value_error = True
        self.assertTrue(value_error,
                        'attempting to assign an unknown status should raise a ValueError')

    def test_update_access_cmodel(self):
        obj = ArrangementObject(Mock())
        # no status set - should be set to restricted
        obj._update_access_cmodel()
        self.assert_((obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_RESTRICTED_CMODEL)) in obj.rels_ext.content)
        self.assert_((obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_ALLOWED_CMODEL)) not in obj.rels_ext.content)
        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'
        obj._update_access_cmodel()
        self.assert_((obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_RESTRICTED_CMODEL)) not in obj.rels_ext.content)
        self.assert_((obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_ALLOWED_CMODEL)) in obj.rels_ext.content)

    def test_index_data(self):
        # index_data should include type, pid, owner, and collection info
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        self.assertEquals(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEquals(self.arr.collection.mods.content.source_id,
                          idx_data['collection_source_id'])

    # Test the update_ark_label method in the keep.common.fedora
    # Note that this test is a simplified version of keep.common.fedora:ArkPidDigitalObject.test_update_ark_label
    # The udpate_ark_label here is an overriden method that is more specifc, and is used on Arrangement objects
    @patch('keep.arrangement.models.pidman')  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):
        # Create a ArrangementObject
        arrangement_object = ArrangementObject(Mock())
        # Set a pid on the object so that it could internally generate a noid etc.
        arrangement_object.pid = "test:1234"

        # Simulate when the object doesn't exist (or hasn't been saved)
        # By default it appears as if it doesn't exist
        arrangement_object.update_ark_label()
        # What we should expect is that the update_ark_label is not called on pidman
        # Also there shouldn't be any errors
        # Use the mock assertFalse to check if a method is called or not
        self.assertFalse(mockpidman.get_ark.called)

        # Mock when the object exists (returns True)
        # Note: Need to set the Mock on the class and not the object because
        # this (exists) is a property method
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            arrangement_object.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # Set the label before the object exists so we don't trigger API calls
        arrangement_object.dc.content.title = "testpid"
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            # pidman already has the same label: get_ark called, no update
            mockpidman.get_ark.return_value = {"name": arrangement_object.dc.content.title}
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)  # assert that it is called with a noid too
            self.assertFalse(mockpidman.update_ark.called)

            # When the label is different from that in Pidman
            mockpidman.get_ark.return_value = {"name": "another pid"}
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)  # assert that it is called with a noid too
            mockpidman.update_ark.assert_called_with(noid=arrangement_object.noid,
                                                     name=arrangement_object.dc.content.title)

    def test_set_premis_object(self):
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = "test:1234"
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        self.assert_(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789', premis.object.checksums[0].digest)
        # sha1 for an empty file
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1, premis.object.checksums[1].digest)
        # object format should be original mietype
        self.assertEqual('text/plain', premis.object.format.name)
        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        mockapi = Mock()
        mockapi.username = '******'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # set object premis so we can validate
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set, we don't care what it is exactly
        self.assert_(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__, event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)
        # generated premis should be valid
        self.assertTrue(premis.is_valid())
class EmailMessageTest(KeepTestCase):
    """Tests for :class:`EmailMessage` headers, label, index data, and
    solr-backed lookups.

    NOTE(review): this redefines the EmailMessageTest declared earlier in
    this file with the same test methods; the second definition shadows the
    first at import time -- the file looks like a concatenation of two
    versions of the same module. Confirm and deduplicate.
    """

    def setUp(self):
        # build an unsaved EmailMessage with minimal CERP metadata;
        # pids collects anything that actually gets saved, for tearDown
        self.repo = Repository()
        self.pids = []
        # test EmailMessage
        self.email = self.repo.get_object(type=EmailMessage)
        self.email.cerp.content.from_list = ['*****@*****.**']
        self.email.cerp.content.to_list = ['*****@*****.**']
        self.email.cerp.content.subject_list = ['Interesting Subject']

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test_headers(self):
        # headers property should expose CERP headers as a name -> value mapping
        h1 = cerp.Header()
        h1.name = "HEADER 1"
        h1.value = "value for header 1"
        h2 = cerp.Header()
        h2.name = "HEADER 2"
        h2.value = "value for header 2"
        self.email.cerp.content.headers.append(h1)
        self.email.cerp.content.headers.append(h2)
        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')

    def test_email_label(self):
        """email_label should build a label from from/to/subject/date, but
        preserve an existing object label."""
        # no object label and one person in to field
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] Interesting Subject',
            label, 'Should construct label when it does not exist')

        # more then one person in to list
        self.email.cerp.content.to_list.append('*****@*****.**')
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] et al. Interesting Subject',
            label, 'only show first to email address when there are more than one')

        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual(
            'Email from [email protected] to [email protected] et al.',
            self.email.email_label(),
            'Display message without subject when no subject is present')

        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] et al. on Friday 13 200 13:00',
            label, 'only show first to email address when there are more than one')

        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label,
                         'label should be preserved when it exists')

    def test_index_data(self):
        # NOTE: logic for creating the label is in the label test
        # test to make sure label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data.keys())
        # mime_data does not exist, so no c
        self.assert_(
            'content_md5' not in data,
            'content_md5 should not be set when mime data does not exist')
        # patch mime data to test exists /cchecksum
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'
            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum, data['content_md5'])

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        """by_checksum: 0 matches raises, 2+ matches raises, 1 match
        resolves to an EmailMessage via the (optionally supplied) repo."""
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            content_md5=42,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
        # too many matches
        solr.query.return_value.field_limit.return_value = [{
            'pid': 'pid:1'
        }, {
            'pid': 'pid:2'
        }]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum, 42)
        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        self.assert_(isinstance(em, EmailMessage))
        # custom repo object
        mockrepo = Mock()
        em = EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        # no match -> ObjectDoesNotExist; verify the solr query shape
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          '<*****@*****.**>')
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id='<*****@*****.**>',
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
class ArrangementObjectTest(KeepTestCase):
    """Tests for :class:`keep.arrangement.models.ArrangementObject`.

    Uses deprecated-alias-free unittest assertions (``assertTrue`` /
    ``assertEqual`` instead of ``assert_`` / ``assertEquals``).
    """

    def setUp(self):
        self.repo = Repository()
        self.pids = []
        # create test collection for the arrangement object to belong to
        coll = self.repo.get_object(type=CollectionObject)
        coll.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        coll.mods.content.source_id = '12345'
        coll.save()
        self.pids.append(coll.pid)
        # create test arrangement object associated with that collection
        # (not saved to fedora, so its pid is not tracked for cleanup)
        self.arr = self.repo.get_object(type=ArrangementObject)
        self.arr.pid = 'foo:1'
        self.arr.collection = coll

    def tearDown(self):
        # remove any objects created in fedora during the tests
        for pid in self.pids:
            self.repo.purge_object(pid)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        # no match in solr should raise a does-not-exist error
        self.assertRaises(ObjectDoesNotExist,
                          ArrangementObject.by_arrangement_id, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id=42,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches should raise a multiple-objects error
        solr.query.return_value.field_limit.return_value = \
            [{'pid': 'pid:1'}, {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned,
                          ArrangementObject.by_arrangement_id, 42)

        # exactly one match should return an ArrangementObject
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        self.assertTrue(isinstance(ao, ArrangementObject))

        # an optional custom repository object may be passed in
        mockrepo = Mock()
        ao = ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        obj = ArrangementObject(Mock())
        # status strings map to fedora object state codes
        obj.arrangement_status = 'processed'
        self.assertEqual('A', obj.state)
        self.assertEqual('processed', obj.arrangement_status)
        obj.arrangement_status = 'accessioned'
        self.assertEqual('I', obj.state)
        self.assertEqual('accessioned', obj.arrangement_status)
        # an unrecognized status should raise a ValueError
        value_error = None
        try:
            obj.arrangement_status = 'bogus'
        except ValueError:
            value_error = True
        self.assertTrue(
            value_error,
            'attempting to assign an unknown status should raise a ValueError')

    def test_update_access_cmodel(self):
        obj = ArrangementObject(Mock())
        # no status set - should be set to restricted
        obj._update_access_cmodel()
        self.assertTrue(
            (obj.uriref, modelns.hasModel, URIRef(ACCESS_RESTRICTED_CMODEL))
            in obj.rels_ext.content)
        self.assertTrue(
            (obj.uriref, modelns.hasModel, URIRef(ACCESS_ALLOWED_CMODEL))
            not in obj.rels_ext.content)
        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'
        obj._update_access_cmodel()
        self.assertTrue(
            (obj.uriref, modelns.hasModel, URIRef(ACCESS_RESTRICTED_CMODEL))
            not in obj.rels_ext.content)
        self.assertTrue(
            (obj.uriref, modelns.hasModel, URIRef(ACCESS_ALLOWED_CMODEL))
            in obj.rels_ext.content)

    def test_index_data(self):
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        self.assertEqual(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEqual(self.arr.collection.mods.content.source_id,
                         idx_data['collection_source_id'])

    # Test the update_ark_label method in keep.common.fedora.
    # Note that this test is a simplified version of
    # keep.common.fedora:ArkPidDigitalObject.test_update_ark_label; the
    # update_ark_label here is an overridden method that is more specific,
    # and is used on Arrangement objects.
    @patch('keep.arrangement.models.pidman')  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):
        arrangement_object = ArrangementObject(Mock())
        # set a pid on the object so that it can internally generate a noid etc.
        arrangement_object.pid = "test:1234"

        # Simulate when the object doesn't exist (or hasn't been saved);
        # by default it appears as if it doesn't exist.  update_ark_label
        # should not call pidman and there should be no errors.
        arrangement_object.update_ark_label()
        self.assertFalse(mockpidman.get_ark.called)

        # Mock when the object exists (returns True).
        # Note: need to set the Mock on the class and not the instance
        # because `exists` is a property method.
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            arrangement_object.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # set the label before the object exists so we don't trigger API calls
        arrangement_object.dc.content.title = "testpid"
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            # when the label matches what pidman has, no update should happen
            mockpidman.get_ark.return_value = {
                "name": arrangement_object.dc.content.title}
            arrangement_object.update_ark_label()
            # assert that it is called with a noid too
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)
            self.assertFalse(mockpidman.update_ark.called)

            # when the label is different from that in pidman, update is pushed
            mockpidman.get_ark.return_value = {"name": "another pid"}
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)
            mockpidman.update_ark.assert_called_with(
                noid=arrangement_object.noid,
                name=arrangement_object.dc.content.title)

    def test_set_premis_object(self):
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = "test:1234"
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        self.assertTrue(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789', premis.object.checksums[0].digest)
        # sha1 for an empty file
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1, premis.object.checksums[1].digest)
        # object format should be original mimetype
        self.assertEqual('text/plain', premis.object.format.name)
        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        mockapi = Mock()
        mockapi.username = '******'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # set object premis so we can validate the generated premis
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set; we don't care what it is exactly
        self.assertTrue(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__,
                         event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)
        # generated premis should be valid
        self.assertTrue(premis.is_valid())
class TestMigrateRushdie(TestCase):
    """Tests for the ``migrate_rushdie`` management command."""

    # MARBL-MACTECH datastream fixture: legacy Mac file system metadata
    MM_FIXTURE = '''<macfs:document xmlns:macfs="info:fedora/emory-control:Rushdie-MacFsData-1.0">
    <macfs:md5>ffcf48e5df673fc7de985e1b859eeeec</macfs:md5>
    <macfs:file>
        <macfs:computer>Performa 5400</macfs:computer>
        <macfs:path>/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles</macfs:path>
        <macfs:rawpath>L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=</macfs:rawpath>
        <macfs:attributes>avbstclInmedz</macfs:attributes>
        <macfs:created>1997-01-19T19:29:32</macfs:created>
        <macfs:modified>1997-01-19T19:29:32</macfs:modified>
        <macfs:type>TEXT</macfs:type>
        <macfs:creator>ttxt</macfs:creator>
    </macfs:file>
</macfs:document>'''

    # MARBL-ANALYSIS datastream fixture: archivist series/subseries verdict
    MA_FIXTURE = '''<marbl:analysis xmlns:marbl="info:fedora/emory-control:Rushdie-MarblAnalysis-1.0">
    <marbl:series>Writings by Rushdie</marbl:series>
    <marbl:subseries>Fiction</marbl:subseries>
    <marbl:verdict>As is</marbl:verdict>
</marbl:analysis>'''

    # finding-aid series/subseries lookup, matching the shape produced by
    # the csv import command's _create_series_lookup
    SERIES_FIXTURE = {
        'Writings by Rushdie': {
            'series_info': {
                'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                'id': 'rushdie1000_series2',
                'short_id': 'series2',
                'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2'},
            'subseries_info': {
                'Fiction': {
                    'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                    'id': 'rushdie1000_subseries2.1',
                    'short_id': 'subseries2.1',
                    'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2/subseries2.1'}}}}

    def setUp(self):
        self.repo = Repository()
        self.pids = []
        # create a simple collection
        self.sc = self.repo.get_object(type=SimpleCollection)
        self.sc.label = "SimpleCollection For Test"
        self.sc.save()
        self.pids.append(self.sc.pid)
        # create a master collection
        self.mc = self.repo.get_object(type=CollectionObject)
        self.mc.label = "MasterCollection For Test"
        self.mc.save()
        self.pids.append(self.mc.pid)
        # create a digital object carrying the legacy datastreams to migrate
        self.digObj = self.repo.get_object(type=RushdieArrangementFile)
        self.digObj.label = "Object For Test"
        self.digObj.save()
        self.pids.append(self.digObj.pid)
        self.digObj.api.addDatastream(self.digObj.pid, "MARBL-MACTECH",
                                      "MARBL-MACTECH",
                                      mimeType="application/xml",
                                      content=self.MM_FIXTURE)
        self.digObj.api.addDatastream(self.digObj.pid, "MARBL-ANALYSIS",
                                      "MARBL-ANALYSIS",
                                      mimeType="application/xml",
                                      content=self.MA_FIXTURE)
        # remove Arrangement model so it can be added later by the migration
        relation = (self.digObj.uriref, modelns.hasModel,
                    "info:fedora/emory-control:Arrangement-1.0")
        self.digObj.rels_ext.content.remove(relation)
        self.digObj.save()
        # set up the command with a normal-verbosity configuration
        self.cmd = migrate_rushdie.Command()
        self.cmd.verbosity = 1
        self.cmd.v_normal = 1
        self.cmd.v_none = 0
        self.cmd.simple_collection = self.sc
        self.cmd.stdout = sys.stdout
        self.cmd.CONTENT_MODELS = CONTENT_MODELS
        self.cmd.repo = self.repo

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test__add_to_simple_collection(self):
        self.cmd._add_to_simple_collection(self.digObj)
        self.assertTrue(
            (self.sc.uriref, relsextns.hasMember, self.digObj.uriref)
            in self.sc.rels_ext.content,
            "%s should be a member of the Simplecollection" % self.digObj.pid)

    def test__get_unique_objects(self):
        # duplicate pids are processed only once
        objs = self.cmd._get_unique_objects([self.digObj.pid, self.digObj.pid])
        self.assertEqual(len(objs), 1, "No dup pids should be processed")

    def test__convert_ds(self):
        obj = self.cmd._convert_ds(self.digObj, self.mc,
                                   self.SERIES_FIXTURE, False)
        # convenient handles into the series lookup fixture
        series = self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]
        subseries = \
            self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]

        # filetech fields should be copied over from MARBL-MACTECH
        file_info = obj.filetech.content.file[0]
        self.assertEqual(file_info.md5, "ffcf48e5df673fc7de985e1b859eeeec")
        self.assertEqual(file_info.computer, "Performa 5400")
        self.assertEqual(
            file_info.path,
            "/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles")
        self.assertEqual(
            file_info.rawpath,
            "L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=")
        self.assertEqual(file_info.attributes, "avbstclInmedz")
        self.assertEqual(file_info.created, "1997-01-19T19:29:32")
        self.assertEqual(file_info.modified, "1997-01-19T19:29:32")
        self.assertEqual(file_info.type, "TEXT")
        self.assertEqual(file_info.creator, "ttxt")

        # MODS subseries info should come from the series lookup
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.uri, subseries["uri"])
        self.assertEqual(obj.mods.content.series.base_ark, subseries["base_ark"])
        self.assertEqual(obj.mods.content.series.full_id, subseries["id"])
        self.assertEqual(obj.mods.content.series.short_id, subseries["short_id"])
        # MODS parent series info
        self.assertEqual(obj.mods.content.series.series.title,
                         "Writings by Rushdie")
        self.assertEqual(obj.mods.content.series.series.uri, series["uri"])
        self.assertEqual(obj.mods.content.series.series.base_ark,
                         series["base_ark"])
        self.assertEqual(obj.mods.content.series.series.full_id, series["id"])
        self.assertEqual(obj.mods.content.series.series.short_id,
                         series["short_id"])

        # rights: verdict "As is" maps to access status code 2
        self.assertEqual(obj.rights.content.access_status.code, "2")

        # RELS-EXT relations
        self.assertTrue(
            (obj.uriref, relsextns.isMemberOf, self.mc.uriref)
            in obj.rels_ext.content,
            "Object should have isMember relation to master collection")
        self.assertTrue(
            (obj.uriref, modelns.hasModel,
             URIRef("info:fedora/emory-control:ArrangementAccessAllowed-1.0"))
            in obj.rels_ext.content,
            "Object should have Allowed Content Model")

        # label, owner and DC title
        self.assertEqual(obj.label, "x - the roles",
                         "Label should be set to last part of path")
        self.assertEqual(obj.owner, "thekeep-project",
                         "owner should be set to 'thekeep-project'")
        self.assertEqual(obj.dc.content.title, "x - the roles",
                         "DC title should be set to last part of path")

        # datastreams: have to reload obj from repository to get DS update
        obj = self.repo.get_object(pid=obj.pid, type=ArrangementObject)
        self.assertFalse("MARBL-MACTECH" in obj.ds_list,
                         "MARBL-MACTECH should have been removed")
        self.assertFalse("MARBL-ANALYSIS" in obj.ds_list,
                         "MARBL-ANALYSIS should have been removed")

    def test_missing_series_info(self):
        # Remove subseries info from the lookup.
        # NOTE: the previous version used a shallow dict.copy() and then
        # passed the unmodified class fixture to _convert_ds; the shallow
        # copy meant the del mutated the shared SERIES_FIXTURE (polluting
        # any later tests) and the modified mapping was never actually
        # passed in.  Use a deep copy and pass the modified mapping.
        import copy
        series = copy.deepcopy(self.SERIES_FIXTURE)
        del series["Writings by Rushdie"]["subseries_info"]
        obj = self.cmd._convert_ds(self.digObj, self.mc, series, False)
        # series / subseries titles should still be mapped
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.series.title,
                         "Writings by Rushdie")
class Command(BaseCommand): help = '''Remove outdated email message metadata objects from the repository and replace them with email folder and message objects based on 5300c Eudora files. (One-time import for 5300c content) <batch id> pid for the 5300c processing batch object (used to find email records to be removed/replaced) <eudora base path> base path for Eudora folder data and index files ''' args = '<batch id> <eudora base path>' option_list = BaseCommand.option_list + ( make_option('-n', '--noact', action='store_true', default=False, help='''Test run: report what would be done, but do not modify anything in the repository'''), make_option('-m', '--max', metavar='MAX_NUM', dest='max_ingest', type='int', help='''Stop after ingesting MAX_NUM items'''), make_option('--skip-purge', action='store_true', default=False, help='''Skip purging old metadata email records and only ingest email messages (e.g., if purge has already been completed)'''), make_option('--purge-only', action='store_true', default=False, help='''Only purge old metadata email records; do not ingest email messages'''), # optional fedora credentials make_option('--user', metavar='FEDORA_USER', dest='user', help='''Connect to Fedora as the specified user'''), make_option('--password', metavar='FEDORA_PASSWORD', dest='password', help='''Connect to Fedora with the specified password (leave blank for prompt)''', action="callback", callback=get_password_opt, type='string'), ) # default django verbosity levels: 0 = none, 1 = normal, 2 = all v_normal = 1 # email folder names for 5300c # key is fake 'path' in arrangement objects; value is original filename email_folders = { 'In': 'In', 'Out': 'Out', 'Old-In': 'OLD "IN"', 'Old-Out': 'OLD "OUT"', } 'known email folders on the 5300c, for identifying current records' email_path_regex = '^(%s)/' % '|'.join(email_folders.keys()) max_ingest = None def handle(self, batch_id=None, folder_path=None, verbosity=1, noact=False, max_ingest=None, skip_purge=False, 
purge_only=False, *args, **options): # check batch object if batch_id is None: raise CommandError('Processing batch id is required') self.verbosity = int(verbosity) # ensure we compare int to int if max_ingest is not None: self.max_ingest = int(max_ingest) # check folder path if folder_path is None: raise CommandError('Eudora folder base path is required') if not os.path.isdir(folder_path): raise CommandError('Eudora folder path "%s" is not a directory' % folder_path) self.noact = noact # check for any specified fedora credentials fedora_opts = {} if 'username' in options: fedora_opts['username'] = options['username'] if 'password' in options: fedora_opts['password'] = options['password'] self.repo = Repository(**fedora_opts) batch = self.repo.get_object(batch_id, type=ProcessingBatch) if not batch.exists: raise CommandError('Processing batch %s not found' % batch_id) print 'Looking for email messages in processing batch "%s"' \ % batch.label try: pidman = DjangoPidmanRestClient() except: raise CommandError('Error initializing PID manager client; ' + 'please check settings.') self.stats = defaultdict(int) # purge old metadata email 'arrangement' objects that belong to this batch if not skip_purge: self.remove_arrangement_emails(batch) # ingest new objects for email mailboxes & messages if not purge_only: self.ingest_email(folder_path) def remove_arrangement_emails(self, batch): '''Find and iterate over all items that are part of the specified batch. Purge email message objects and update the correspending ARK records for re-use on ingest. 
''' items = list(batch.rels_ext.content.objects(batch.uriref, relsext.hasMember)) for i in items: # for now, init as arrangement objects obj = self.repo.get_object(str(i), type=ArrangementObject) # NOTE: in dev/test, collection currently references all items # but only a handful actually exist in dev/test repo; just skip if not obj.exists: continue # number of objects self.stats['count'] += 1 if not obj.filetech.exists or not obj.filetech.content.file: print 'Error: no file tech for %s; skipping' % obj.pid continue # 5300c email messages should only have one file path. # Identify email messages by file path starting with # email folder name and no checksum file_info = obj.filetech.content.file[0] if not re.match(self.email_path_regex, file_info.path) or \ file_info.md5: # not an email message - skip to next item continue self.stats['email'] += 1 # if in no-act mode, nothing else to do if self.noact: continue # not in no-act mode : update pid, purge object try: # reinit client as a workaround for pidman errors (?) 
pidman = DjangoPidmanRestClient() # update ark name/domain pidman.update_ark(obj.noid, name=UNUSED_PID_NAME, domain=settings.PIDMAN_DOMAIN) # mark default target as inactive pidman.update_ark_target(obj.noid, active=False, target_uri=UNUSED_PID_URL) self.stats['pids'] +=1 if self.verbosity > self.v_normal: print 'Updated ARK for %s' % obj.noid except Exception as e: print 'Error updating ARK for %s: %s' % \ (obj.noid, e) # purge record try: self.repo.purge_object(obj.pid, 'removing metadata arrangement 5300c email record') self.stats['purged'] += 1 if self.verbosity > self.v_normal: print 'Purged %s' % obj.pid except RequestFailed as e: self.stats['purge_error'] += 1 print 'Error purging %s: %s' % (obj.pid, e) # summary if self.verbosity >= self.v_normal: print '''\nChecked %(count)d records, found %(email)d emails''' % self.stats if not self.noact: print 'Updated %(pids)d ARK(s); purged %(purged)d objects, error purging %(purge_error)d objects' \ % self.stats def ingest_email(self, folder_base): for folder_name, folder_file in self.email_folders.iteritems(): self.stats['folder'] += 1 folder_path = os.path.join(folder_base, folder_file) folder_toc = os.path.join(folder_base, folder_file + '.toc') # if either data or index file is not present, bail out if not os.path.isfile(folder_path) or \ not os.path.isfile(folder_toc): print 'Error: folder files %s not found at base path "%s"' % \ (folder_file, folder_base) continue # find the index/data file objects for this folder in fedora # by checksums from the originals; # check if they are associated with an existing mailbox object mailbox = None mbox_obj = self.find_file_object(folder_path) if mbox_obj is None: # these records should be found in production print 'Warning: record not found for folder data file "%s"' % folder_file elif mbox_obj.mailbox: mailbox = mbox_obj.mailbox toc_obj = self.find_file_object(folder_toc) if toc_obj is None: print 'Warning: record not found for folder index file "%s.toc"' % folder_file 
elif toc_obj.mailbox: mailbox = toc_obj.mailbox # mailbox not found via folder file objects, so create it if mailbox is None: if self.verbosity > self.v_normal: print 'Mailbox object for %s not found; creating one' % folder_name mailbox = self.repo.get_object(type=MailboxPidReuse) desc = 'Rushdie\'s email from his PowerBook 5300c: "%s" folder' % \ folder_name mailbox.label = desc mailbox.dc.content.title = desc # mailbox should belong to same collection mailbox files do if mbox_obj.collection: mailbox.collection = mbox_obj.collection elif mbox_obj._deprecated_collection: mailbox.collection = mbox_obj._deprecated_collection # save to get a pid, add mailbox rel to file objects if not self.noact: # TODO: fedora error handling try: mailbox.save('email folder object for %s' % folder_name) self.stats['ingested'] += 1 if self.verbosity >= self.v_normal: print 'Created new mailbox object for %s as %s' % \ (folder_name, mailbox.pid) except RequestFailed as rf: self.stats['ingest_error'] += 1 print 'Failed to create folder object for %s in Fedora: %s' % \ (folder_name, rf) if mbox_obj: mbox_obj.mailbox = mailbox mbox_obj.save('associating with mailbox object') self.stats['updated'] += 1 if toc_obj: toc_obj.mailbox = mailbox toc_obj.save('associating with mailbox object') self.stats['updated'] += 1 # NOTE: should be able to get rushdie collection # object from toc/mbox objects, but they seem to have # isMemberOf rel instead of isMemberOfCollection (?) 
else: # FIXME: boda rel is giving us boda mailbox instead of local # arrangement mailbox; re-init as local mailbox # for access to parent collection mailbox = self.repo.get_object(mailbox.pid, type=MailboxPidReuse) with open(folder_toc) as tocdata: with open(folder_path) as mbox: toc = eudora.Toc(tocdata) # load as eudora toc binfile # eudora Toc returns messages in folder order; # pass order in to store in CERP for sorting/display folder_order = 0 for msg in toc.messages: self.stats['message'] += 1 # get data from mbox file based on msg offset/size mbox.seek(msg.offset) # read message content from mailbox data file msg_data = mbox.read(msg.size) self.ingest_message(msg_data, mailbox, folder_order) folder_order += 1 # max to ingest for testing if self.max_ingest and self.stats['ingested'] >= self.max_ingest: break # summary if self.verbosity >= self.v_normal: print '''\nProcessed %(folder)d mail folders and %(message)d messages; %(previously_ingested)d messages previously ingested''' % self.stats if not self.noact: print '''\nCreated %(ingested)d records, updated %(updated)d''' % self.stats if self.stats['ingest_error']: print '''Error ingesting %(ingest_error)d records''' % self.stats def find_file_object(self, file_path): '''Find a file object by checksum in fedora based on a file path. Returns a file object if one matches the checksum for the file specified, or else None if no match is found. 
:returns: :class:`keep.arrangement.models.RushdieArrangementFile` or None ''' file_md5 = md5sum(file_path) solr = solr_interface() q = solr.query(content_md5=file_md5).field_limit('pid') if len(q): return self.repo.get_object(q[0]['pid'], type=RushdieArrangementFile) def ingest_message(self, msg_data, mailbox, folder_order): # read content and redact IP addresses / email addresses msg_data = redact_email(msg_data) # generate email object from data email_msg = email.message_from_string(msg_data, _class=MacEncodedMessage) # check and warn if email has attachments attachments = self.email_attachments(email_msg) if attachments: print 'Warning! Email has attachments (not yet handled): %s' % \ ','.join(attachments) # get current content type to preserve the original value, # and also to determine how to decode content_type = email_msg.get('Content-Type', '') orig_content_type = email_msg.get_content_type() orig_content_charset = email_msg.get_content_charset() # at least one email in this set has a charset of 'unknown-8bit', # but the \xa0 in the content indicates it is probably latin 1 if 'charset=unknown-8bit' in content_type: latin1_charset = email.charset.Charset('latin_1') email_msg.set_charset(latin1_charset) # otherwise, if charset is not set, assume mac roman elif not email_msg.get_charset(): # tell email that charset should be mac roman, # so it can decode special characters mac_charset = email.charset.Charset('mac_roman') email_msg.set_charset(mac_charset) # decode headers from mac roman charset # (some messages contain improperly formatted # accented characters in a from/to header) email_msg.decode_headers() # create a new object to populate with data msg_obj = self.repo.get_object(type=EmailMessagePidReuse) # generate cerp from mime message # - store folder order as message local id msg_obj.cerp.content = cerp.Message.from_email_message(email_msg, local_id=folder_order) # The generated CERP may have modified mac roman charset headers # which were needed to 
convert instead of the original; # update thex ml to store the original value, NOT the encoding # that was used to decode the content. if content_type: if msg_obj.cerp.content.single_body: msg_obj.cerp.content.single_body.content_type_list[0] = orig_content_type msg_obj.cerp.content.single_body.charset_list[0] = orig_content_charset else: if msg_obj.cerp.content.single_body: del msg_obj.cerp.content.single_body.content_type_list[0] del msg_obj.cerp.content.single_body.charset_list[0] # loop through headers to set/remove content type for h in msg_obj.cerp.content.headers: if h.name == 'Content-Type': if content_type: h.value = content_type else: h.value = None h.name = None break # construct an object label based on from/to/date/subject msg_from = email_msg['From'] # NOTE: it would be nice to suppress redundant redaction email text here; # at least simplify label for rushdie, since that is what we'll see most if 'REDACTED: Salman Rushdie\'s email' in msg_from: msg_from = 'Salman Rushdie' label = u'Email from %s' % msg_from if email_msg.get('To', None): # FIXME: could have multiple recipients # we *should* be able to get split-out version from email.Message ... to = email_msg['To'] label += u' to %s' % email_msg['To'] # date/subject not always present, but add if they are if email_msg.get('Date', None): label += u' on %s' % email_msg['Date'] if email_msg.get('Subject', None): label += u' %s' % email_msg['Subject'] # set as object label and dc:title msg_obj.label = label msg_obj.dc.content.title = label # in verbose noact mode, print label so user can see what is being done if self.verbosity > self.v_normal and self.noact: print label # generate a pristine email Message for saving fedora # (don't save modified charset, content type, etc.) 
msg_obj.mime_data.content = email.message_from_string(msg_data, _class=MacEncodedMessage) # calculate an MD5 of the email content *as it will be serialized* md5 = hashlib.md5() md5.update(str(msg_obj.mime_data.content)) email_md5 = md5.hexdigest() msg_obj.mime_data.checksum = email_md5 # check if this email has already been ingested via checksum; # don't re-ingest if it is already in the repository solr = solr_interface() q = solr.query(content_md5=msg_obj.mime_data.checksum).field_limit('pid') if len(q): if self.verbosity >= self.v_normal: print 'Email message has already been ingested as %s; skipping' \ % q[0]['pid'] self.stats['previously_ingested'] += 1 return # associate with current mailbox object msg_obj.mailbox = mailbox # belongs to same collection as its mailbox if mailbox.collection: msg_obj.collection = mailbox.collection # ingest items as accessioned/unprocessed msg_obj.arrangement_status = 'accessioned' # ingest with a default rights code of 10 "Undetermined" in rights DS msg_obj.rights.content.create_access_status() msg_obj.rights.content.access_status.code = "10" msg_obj.rights.content.access_status.text = rights_access_terms_dict["10"].text if not self.noact: try: msg_obj.save('ingesting email message from rushdie 5300c') if self.verbosity >= self.v_normal: print 'Ingested message %s : %s' % \ (msg_obj.pid, msg_obj.label) self.stats['ingested'] += 1 except RequestFailed as rf: self.stats['ingest_error'] += 1 print 'Error ingesting email message %s: %s' % \ (msg_obj.label, rf) def email_attachments(self, msg): attachments = [] if msg.is_multipart(): payload = msg.get_payload() # NOTE: sub parts could themselves be multipart... for p in payload: if 'attachment' in p.get('Content-Disposition', '') \ or p.get_filename(): attachments.append(p.get_filename()) return attachments
class Command(BaseCommand):
    '''Read CSV file and creates (or adds to) a Simple Collection and
    associated ArrangementObjects with the SimpleCollection and the
    Master collection'''

    # optparse callback: prompt interactively so the password never
    # appears on the command line or in shell history.
    # NOTE(review): defined without self — this works as an optparse
    # callback but is unusual inside a class body; confirm intent.
    def get_password_option(option, opt, value, parser):
        setattr(parser.values, option.dest, getpass())

    #Set up additional options
    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
            action='store_true',
            dest='no-act',
            default=False,
            help='Does not create PIDs or ingest anything into Fedora. Only parses file and outputs results'),
        make_option('--add', '-a',
            action='store',
            dest='add',
            help='adds to the SimpleCollection specified by pid, does not create a new SimpleCollection'),
        make_option('--username', '-u',
            dest='username',
            action='store',
            help='''Username to connect to fedora'''),
        make_option('--password',
            dest='password',
            action='callback',
            callback=get_password_option,
            help='''Prompt for password required when username used'''),
        )

    args = '<CSV file> <master collection pid> <new simple collection name>'
    help = __doc__

    def _create_series_lookup(self):
        '''Build ``self.series``, a nested dict of series/subseries info
        (id, short_id, base_ark, uri) keyed by series title, from the
        eXist finding-aid data for eadid ``rushdie1000``.'''
        #series / subseries info
        self.series = {}
        #exist query params
        return_fields = ['eadid']
        search_fields = {'eadid' : 'rushdie1000'}
        queryset = Series.objects.also(*return_fields).filter(**search_fields)
        for s in queryset:
            #series info
            self.series[s.title] = {}
            self.series[s.title]['series_info'] = {}
            self.series[s.title]['series_info']['id'] = s.id
            self.series[s.title]['series_info']['short_id'] = s.short_id
            self.series[s.title]['series_info']['base_ark'] = s.eadid.url
            self.series[s.title]['series_info']['uri'] = "https://findingaids.library.emory.edu/documents/%s/%s" % \
                (s.eadid.value, s.short_id)
            #subseries info
            if s.subseries:
                self.series[s.title]['subseries_info'] = {}
                for sub in s.subseries:
                    self.series[s.title]['subseries_info'][sub.title] = {}
                    self.series[s.title]['subseries_info'][sub.title]['id'] = sub.id
                    self.series[s.title]['subseries_info'][sub.title]['short_id'] = sub.short_id
                    self.series[s.title]['subseries_info'][sub.title]['base_ark'] = s.eadid.url
                    self.series[s.title]['subseries_info'][sub.title]['uri'] = "https://findingaids.library.emory.edu/documents/%s/%s/%s" % \
                        (s.eadid.value, s.short_id, sub.short_id)

    def _create_arrangement(self, row):
        '''Create (but do not save) an :class:`ArrangementObject` from
        one CSV row: populates the filetech datastream, DC title, a
        default rights code, MODS series info (looked up via
        ``self.series``), and an isMemberOf relation to the master
        collection.  The object is returned with state set to
        inactive ("I").

        :param row: dict for one CSV row (from :class:`csv.DictReader`)
        :returns: unsaved :class:`ArrangementObject`
        '''
        #Account for unicode characters
        #Preserve unicode characters for raw path,
        #but remove unicode character for other mappings
        # NOTE(review): base64.encodestring is deprecated (removed in
        # Python 3); fine under Python 2, but flag for any port.
        rawpath = base64.encodestring(row["filename"])
        path = row["filename"]
        path = unicode(path, 'utf8')
        creator = row["creator"]
        creator = unicode(creator, 'utf8')

        # set values in filetech DS
        obj = self.repo.get_object(type=ArrangementObject)
        # label / title are the final path component (text after last '/')
        obj.label = path.rpartition('/')[2]
        obj.filetech.content.file.append(FileMasterTech_Base())
        obj.filetech.content.file[0].local_id = row['id']
        obj.filetech.content.file[0].md5 = row['checksum']
        obj.filetech.content.file[0].computer = row['computer']
        obj.filetech.content.file[0].path = path
        obj.filetech.content.file[0].rawpath = rawpath
        obj.filetech.content.file[0].attributes = row['attrib']
        obj.filetech.content.file[0].created = row['created']
        obj.filetech.content.file[0].modified = row['modified']
        obj.filetech.content.file[0].creator = creator
        #map DC title
        obj.dc.content.title = path.rpartition('/')[2]
        #map default verdict of 10 "Undetermined" in rights DS
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = "10"
        #map series in MODS
        #RecordType used to lookup series info
        rec_type = row["rec_type"]
        rec_type = rec_type.strip()
        # unknown record types get no series mapping (warned below)
        if rec_type not in self.series:
            rec_type = None
        if rec_type is not None:
            obj.mods.content.create_series()
            obj.mods.content.series.title = rec_type
            obj.mods.content.series.uri = self.series[rec_type]["series_info"]["uri"]
            obj.mods.content.series.base_ark = self.series[rec_type]["series_info"]["base_ark"]
            obj.mods.content.series.full_id = self.series[rec_type]["series_info"]["id"]
            obj.mods.content.series.short_id = self.series[rec_type]["series_info"]["short_id"]
        else:
            if self.verbosity > self.v_none:
                self.stdout.write("Series %s not found\n" % row["rec_type"])
        # set association to master collection
        relation = (obj.uriref, relsextns.isMemberOf, self.master_obj.uriref)
        obj.rels_ext.content.add(relation)
        if self.verbosity > self.v_normal:
            self.stdout.write("Adding %s isMemberOf %s relation on ArrangementObject\n" % (obj.label, self.master_obj.pid))
        #set state to inactive by default
        obj.state = "I"
        return obj

    def handle(self, *args, **options):
        '''Entry point: validate args/options, build the series lookup,
        read the CSV, create an ArrangementObject per row, and attach
        each to a new (or existing, with --add) SimpleCollection.
        With --noact, nothing is ingested and objects are only printed.
        If the final SimpleCollection save fails, previously-saved
        arrangement objects are purged so they are not orphaned.'''
        #collect arrangement pids here to delete later if SimpleCollection fails to save
        self.arrangement_pids = []

        self._create_series_lookup()

        #0 = none, 1 = normal, 2 = all
        self.v_none = 0
        self.v_normal = 1

        if 'verbosity' in options:
            self.verbosity = int(options['verbosity'])
        else:
            self.verbosity = self.v_normal

        #Create the repo
        # only pass credentials that were actually supplied
        repo_args = {}
        if options.get('username') is not None:
            repo_args['username'] = options.get('username')
        if options.get('password') is not None:
            repo_args['password'] = options.get('password')
        self.repo = Repository(**repo_args)

        #Check to make sure all args and options are present
        # NOTE(review): 'file' shadows the builtin; rename to csv_file
        # in a future cleanup.
        try:
            file = args[0]
        except IndexError:
            raise CommandError("No CSV file specified")
        try:
            self.master_pid = args[1]
        except IndexError:
            raise CommandError("No master collection pid specified")
        #if -a or --add is used the new SimpleCollection name is ignored
        try:
            if not options["add"]:
                self.simple_collection_name = args[2]
            else:
                self.simple_collection_pid = options["add"]
        except IndexError:
            raise CommandError("An existing SimpleCollection pid must be specified with the -a option or \
                a new SimpleCollection name must be specified as an argument")

        #If Master collection does not exist then raise an exception
        self.master_obj = self.repo.get_object(type = CollectionObject, pid=self.master_pid)
        if not self.master_obj.exists:
            raise CommandError("Master Collection %s does not exist" % (self.master_pid))
        else:
            if self.verbosity > self.v_none:
                self.stdout.write("Using Master Collection: %s(%s)\n" % (self.master_obj.label, self.master_obj.pid))

        #Get or create SimpleColletion object
        #TODO Not sure why I have to do a try block to prevent a 404 here when I don't in other places
        # NOTE(review): bare except here can mask unrelated errors, and in
        # the create (non --add) branch self.simple_collection_pid was never
        # assigned, so the error message itself would raise AttributeError —
        # narrow the exception and branch the message in a future fix.
        try:
            if options["add"]:
                simple_collection = self.repo.get_object(type=SimpleCollection, pid=self.simple_collection_pid)
            else:
                simple_collection = self.repo.get_object(type=SimpleCollection)
                simple_collection.label = self.simple_collection_name
                simple_collection.dc.content.title = self.simple_collection_name
                simple_collection.mods.content.create_restrictions_on_access()
                simple_collection.mods.content.restrictions_on_access.text = "Accessioned"
        except:
            raise CommandError("Pid %s does not exist" % self.simple_collection_pid)

        #try to read file into a dict and assign the field names
        try:
            reader = csv.DictReader(open(file, 'rb'),
                fieldnames=["id","checksum","filename","rec_type","file_type",
                "creator","attrib","created","modified","computer","size"])
            if self.verbosity > self.v_none:
                self.stdout.write("Reading CSV: %s\n" % (file))
        except IOError:
            raise CommandError("Could not read file %s" % file)

        # skip the header row in CSV file
        # NOTE(review): reader.next() is Python-2-only (next(reader) in py3)
        reader.next()

        #read each field
        csv_read = 0
        arrangement_saved = 0
        errors = 0
        for row in reader:
            try:
                csv_read += 1
                arrangement_object = self._create_arrangement(row)
                if not options['no-act']:
                    try:
                        arrangement_object.save()
                        arrangement_saved += 1
                        # remember pid for rollback if the collection save fails
                        self.arrangement_pids.append(arrangement_object.pid)
                        if self.verbosity > self.v_none:
                            self.stdout.write("Saved ArrangementObject %s(%s)\n" % (arrangement_object.label, arrangement_object.pid))
                    except Exception as e:
                        if self.verbosity > self.v_none:
                            self.stdout.write("Error saving ArrangementObject %s: %s\n" % (arrangement_object.label, e.message))
                        errors += 1
                else:
                    # noact mode: report what would have been ingested
                    if self.verbosity > self.v_none:
                        self.stdout.write("TEST ArrangementObject %s\n" % (arrangement_object.label))
                    if self.verbosity > self.v_normal:
                        self.stdout.write("===RELS-EXT===\n")
                        for entry in arrangement_object.rels_ext.content:
                            self.stdout.write("%s\n" % list(entry))
                        self.stdout.write("===MODS===\n")
                        self.stdout.write("%s\n" % arrangement_object.mods.content.serialize())

                #Add each ArrangementObject to the SimpleCollection
                relation = (simple_collection.uriref, relsextns.hasMember, arrangement_object.uriref)
                simple_collection.rels_ext.content.add(relation)
                if self.verbosity > self.v_normal:
                    self.stdout.write("Adding hasMember %s relation on SimpleCollection\n" % (arrangement_object.pid))
            except Exception as e:
                # per-row failures are reported and counted; processing continues
                self.stdout.write("Error in record id %s: %s\n" % (row["id"], e))
                errors += 1

        if not options['no-act']:
            try:
                simple_collection.save()
                self.stdout.write("Saved SimpleCollection %s(%s)\n" % (simple_collection.label, simple_collection.pid))
            except Exception as e:
                if self.verbosity > self.v_none:
                    self.stdout.write("Error saving SimpleCollection %s: %s\n" % (simple_collection.label, e.message))
                    self.stdout.write("Deleting Arrangement pids so they will not be Orphans\n")
                errors += 1
                # rollback: purge every arrangement object saved above
                for pid in self.arrangement_pids:
                    self.repo.purge_object(pid)
                    if self.verbosity > self.v_none:
                        self.stdout.write("Deleting: %s\n" % (pid))
                    arrangement_saved -= 1
        else:
            # noact mode: show what the SimpleCollection would look like
            if self.verbosity > self.v_none:
                self.stdout.write("TEST SimpleCollection %s\n" % (simple_collection.label))
            if self.verbosity > self.v_normal:
                self.stdout.write("===RELS-EXT===\n")
                for entry in simple_collection.rels_ext.content:
                    self.stdout.write("%s\n" % list(entry))
                self.stdout.write("===DC===\n")
                self.stdout.write("%s\n" % simple_collection.dc.content.serialize())
                self.stdout.write("===MODS===\n")
                self.stdout.write("%s\n" % simple_collection.mods.content.serialize())

        #print Summary
        self.stdout.write("\n\nSUMMARY\n=======\n")
        self.stdout.write("SimpleCollection: %s(%s)\n" % (simple_collection.label, simple_collection.pid))
        self.stdout.write("Master Collection Object: %s(%s)\n" % (self.master_obj.label, self.master_obj.pid))
        self.stdout.write("%s Records read from CSV file\n" % (csv_read))
        self.stdout.write("%s Records created\n" % (arrangement_saved))
        self.stdout.write("%s Errors\n" % (errors))