def handle(self, *pids, **options):
    # bind a handler for interrupt signal
    signal.signal(signal.SIGINT, self.interrupt_handler)

    verbosity = int(options.get('verbosity', self.v_normal))
    repo = Repository()
    try:
        pidman = DjangoPidmanRestClient()
    except Exception as err:
        # error if pid manager config options not in localsettings
        raise CommandError(err)

    old_page_target = '%s/books/pages/' % Site.objects.get_current().domain
    search_args = {'type': 'ark', 'target': old_page_target, 'count': 10}
    # get a small result set to retrieve the total
    results = pidman.search_pids(**search_args)
    total = results['results_count']
    # then set a larger page size for actual processing
    search_args['count'] = 100
    if verbosity >= self.v_normal:
        print 'Found %d total page ARKs with targets to be updated' % total

    pbar = ProgressBar(widgets=[Percentage(), ' (', Counter(), ')', Bar(), ETA()],
                       maxval=total).start()

    self.stats = defaultdict(int)
    self.processed = set()
    for ark in self.get_search_results(pidman, search_args):
        self.processed.add(ark['pid'])
        # get fedora pid from target uri
        target_uri = ark['targets'][0]['target_uri']
        baseurl, pid = target_uri.rstrip('/').rsplit('/', 1)
        try:
            page = repo.get_object(pid, type=Page)
            # this should probably only happen in dev/qa
            if not page.exists:
                if verbosity > self.v_normal:
                    self.stderr.write('Page %s does not exist' % pid)
                self.stats['notfound'] += 1
            else:
                # check if volume exists?
                pidman.update_ark_target(ark['pid'], target_uri=page.absolute_url)
                self.stats['updated'] += 1
        except RequestFailed as rf:
            print 'Error accessing %s: %s' % (pid, rf)
            self.stats['error'] += 1

        pbar.update(len(self.processed))
        if self.interrupted:
            break

    if not self.interrupted:
        pbar.finish()
    # summarize
    self.stderr.write('Updated %(updated)d, %(error)d error(s), %(notfound)d not found'
                      % self.stats)
def update_pid(kdip_pid, ht_url):
    client = DjangoPidmanRestClient()

    # Update the PID in pidman with the HathiTrust URL.
    client.update_target(type="ark", noid=kdip_pid, target_uri=ht_url)

    # Add a new qualifier for HathiTrust.
    client.update_target(type="ark", noid=kdip_pid, qualifier="HT", target_uri=ht_url)
def generate_ark(ead):
    '''Generate an ARK for the specified EAD document.  The ARK will be created
    with a default target of the url for the main page of the specified EAD
    document in this site.

    :param ead: :class:`findingaids.fa.models.FindingAid` instance
    :returns: resolvable URL for generated ARK on success
    '''
    # catch init error and report simplified error to user
    try:
        pidclient = DjangoPidmanRestClient()
    except RuntimeError:
        raise Exception("Error initializing PID Manager client; please check site configuration.")
    # check that domain is set
    if not hasattr(settings, 'PIDMAN_DOMAIN'):
        raise Exception("Unable to generate ARK: PID manager domain is not configured.")
    # generate absolute url for ARK target
    ead_url = settings.SITE_BASE_URL.rstrip('/') + \
        reverse('fa:findingaid', kwargs={'id': ead.eadid.value})
    try:
        # search for an existing ARK first, in case one was already created for this ead;
        # limit search by the configured domain; look for an ARK with the expected target url
        found = pidclient.search_pids(type='ark', target=ead_url,
                                      domain_uri=settings.PIDMAN_DOMAIN)
        # at least one match
        if found and found['results_count']:
            if found['results_count'] > 1:
                # uh-oh - this shouldn't happen; warn the user
                logger.warning("Found %d ARKs when searching for an existing ARK for %s",
                               found['results_count'], ead.eadid.value)
            # use existing pid
            pid = found['results'][0]
            # find the unqualified target and get the access uri - primary resolvable ark url
            for t in pid['targets']:
                if 'qualifier' not in t or not t['qualifier']:
                    ark_url = t['access_uri']
            logger.info("Using existing ARK %s for %s", ark_url, ead.eadid.value)
            # what if no default target is found? (unlikely, but possible...)
            return ark_url

        # if no matches are found, create a new ark and output a flash message to the user about it
        ark_url = pidclient.create_ark(settings.PIDMAN_DOMAIN, ead_url,
                                       name=unicode(ead.unittitle))
        logger.info("Created a new ARK %s for %s", ark_url, ead.eadid.value)
        return ark_url

    # any error in the pidclient is raised as an HTTPError
    except HTTPError as err:
        raise Exception('Error generating ARK: %s' % err)
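A brief sketch of how generate_ark might be called, assuming only the behaviour documented above: it returns a resolvable ARK URL (existing or newly minted) and re-raises configuration and HTTP problems as plain Exceptions. The assign_ark wrapper name is hypothetical.

import logging

logger = logging.getLogger(__name__)

# Hypothetical call site for generate_ark (defined above); everything except
# generate_ark itself and ead.eadid.value is an assumption for illustration.
def assign_ark(ead):
    try:
        ark_url = generate_ark(ead)   # resolvable ARK URL, existing or newly minted
    except Exception as err:
        logger.error('Could not assign ARK for %s: %s', ead.eadid.value, err)
        return None
    return ark_url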
def get_default_pid(self):
    # try to configure a pidman client to get pids.
    try:
        pidman = DjangoPidmanRestClient()
    except:
        raise CommandError("PIDMAN not configured. Please check localsettings.py")

    target = get_pid_target('postcards:card')
    ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, self.label)
    arkbase, slash, noid = ark.rpartition('/')
    pid = '%s:%s' % (self.default_pidspace, noid)
    self.dc.content.identifier_list.append(ark)   # Store local identifiers in DC
    return pid
def handle(self, *args, **options):
    self.options = options
    self.repaired_count = 0
    self.unrepaired_count = 0
    repo = Repository()
    self.pidman = DjangoPidmanRestClient()

    # populate list of objects to be processed
    objects = []
    for pid in args:
        try:
            obj = repo.get_object(pid=pid, type=CollectionObject)
            if obj.has_requisite_content_models:
                objects.append(obj)
            else:
                obj = repo.get_object(pid=pid, type=AudioObject)
                if obj.has_requisite_content_models:
                    objects.append(obj)
        except Exception:
            self.log(message="Could not find Collection or Audio object for: %s" % pid)

    # get list of all collections from the repository,
    # limited to the COLLECTION_CONTENT_MODEL; returns Keep-specific collection objects
    if not args:
        objects = repo.get_objects_with_cmodel(CollectionObject.COLLECTION_CONTENT_MODEL,
                                               type=CollectionObject)
        if not objects:
            self.log(message="No Collections were found.")

    for obj in objects:
        self.repair_ark(obj)

    self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired"
             % (self.repaired_count, self.unrepaired_count), no_label=True)
def get_default_pid(self):
    '''Default pid logic for DigitalObjects in :mod:`readux`.  Mint a new ARK
    via the PID manager, store the ARK in the MODS metadata (if available) or
    Dublin Core, and use the noid portion of the ARK for a Fedora pid in the
    site-configured Fedora pidspace.'''
    global pidman

    if pidman is not None:
        # pidman wants a target for the new pid;
        # generate a pidman-ready target for a named view

        # Use the object absolute url method
        # NOTE: this requires that all values used in a url be set
        # (i.e., page objects must have volume pid configured)
        self.pid = '%s:%s' % (self.default_pidspace, self.PID_TOKEN)
        target = self.get_absolute_url()
        # reverse() encodes the PID_TOKEN and the :, so just unquote the url
        # (shouldn't contain anything else that needs escaping)
        target = urllib.unquote(target)
        # reverse() returns a full path - absolutize so we get scheme & server also
        target = absolutize_url(target)
        # pid name is not required, but helpful for managing pids
        pid_name = self.label
        # ask pidman for a new ark in the configured pidman domain
        try:
            ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, name=pid_name)
        except httplib.BadStatusLine:
            logger.warn('Error creating ARK; re-initializing pidman client and trying again')
            pidman = DjangoPidmanRestClient()
            ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, name=pid_name)
        # pidman returns the full, resolvable ark;
        # parse into dictionary with nma, naan, and noid
        parsed_ark = parse_ark(ark)
        noid = parsed_ark['noid']  # nice opaque identifier

        # Add full uri ARK to dc:identifier
        self.dc.content.identifier_list.append(ark)

        # use the noid to construct a pid in the configured pidspace
        return '%s:%s' % (self.default_pidspace, noid)
    else:
        # if pidmanager is not available, fall back to default pid behavior
        return super(DigitalObject, self).get_default_pid()
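Several of these examples lean on parse_ark to split a resolvable ARK into its pieces; the comments above describe a dictionary with nma, naan, and noid keys. A rough illustrative stand-in for that behaviour follows; the real helper comes from the pidservices library, so the exact regex and return keys here are assumptions.

import re

# Illustrative stand-in for parse_ark; treat the pattern and keys as assumptions.
ARK_PATTERN = re.compile(r'^(?P<nma>https?://[^/]+/)?ark:/(?P<naan>\d+)/(?P<noid>[^/?#]+)')

def parse_ark_sketch(ark_uri):
    match = ARK_PATTERN.match(ark_uri)
    if match is None:
        raise ValueError('Not a recognizable ARK: %r' % ark_uri)
    return match.groupdict()

# e.g. parse_ark_sketch('http://pid.emory.edu/ark:/25593/example')
# -> {'nma': 'http://pid.emory.edu/', 'naan': '25593', 'noid': 'example'}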
def test_constructor(self):
    'Test init from Django settings.'
    client = DjangoPidmanRestClient()
    self.assertEqual(client.baseurl['host'], 'testpidman.library.emory.edu',
                     'Client Base URL %s not expected value.' % client.baseurl)
    # credentials are stored for passing to request
    username, password = client._auth
    self.assertEqual('testuser', username,
                     'Client username %s not the expected value' % username)
    self.assertEqual('testpass', password,
                     'Client password %s is not expected value' % password)
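The constructor test above pulls its host and credentials from Django settings. A sketch of the settings these examples appear to rely on, using the names referenced elsewhere in this listing (PIDMAN_HOST, PIDMAN_USER, PIDMAN_PASSWORD, PIDMAN_DOMAIN); the values are placeholders, not real credentials.

# settings.py / localsettings.py sketch; placeholder values only
PIDMAN_HOST = 'https://testpidman.library.emory.edu/'   # base url of the pid manager
PIDMAN_USER = 'testuser'
PIDMAN_PASSWORD = 'testpass'
# default domain to mint new ARKs in (used by the create_ark calls in these examples);
# the exact domain path is an assumption
PIDMAN_DOMAIN = 'https://testpidman.library.emory.edu/domains/1/'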
def get_default_pid(self):
    # try to configure a pidman client to get pids.
    try:
        pidman = DjangoPidmanRestClient()
    except:
        if getattr(settings, 'DEV_ENV', False):
            logger.warn('Failed to configure PID manager client; default pid logic will be used')
            pidman = None
        else:
            raise CommandError("PID manager is not configured. Please check localsettings.py")

    if pidman:
        target = get_pid_target('postcards:card')
        ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, self.label)
        arkbase, slash, noid = ark.rpartition('/')
        pid = '%s:%s' % (self.default_pidspace, noid)
        # Store local identifiers in DC
        self.dc.content.identifier_list.append(ark)
        return pid
    else:
        # if pidmanager is not available, fall back to default pid behavior
        return super(ImageObject, self).get_default_pid()
def get_pidman(self):
    """Initialize a new Pidman client using the DjangoPidmanRestClient
    wrapper. The credentials are pulled from the application settings.

    :return: a Pidman client to interact with the Pidman APIs
    :rtype: DjangoPidmanRestClient
    """
    # try to configure a pidman client to get pids.
    try:
        return DjangoPidmanRestClient()
    except CommandError as e:
        error_msg = """
        Cannot initialize DjangoPidmanRestClient.
        Please check your configuration for more details.
        """
        sys.stderr.write(error_msg)
        raise CommandError(e)
def handle(self, batch_id=None, folder_path=None, verbosity=1, noact=False,
           max_ingest=None, skip_purge=False, purge_only=False, *args, **options):
    # check batch object
    if batch_id is None:
        raise CommandError('Processing batch id is required')
    self.verbosity = int(verbosity)  # ensure we compare int to int
    if max_ingest is not None:
        self.max_ingest = int(max_ingest)
    # check folder path
    if folder_path is None:
        raise CommandError('Eudora folder base path is required')
    if not os.path.isdir(folder_path):
        raise CommandError('Eudora folder path "%s" is not a directory' % folder_path)

    self.noact = noact

    # check for any specified fedora credentials
    fedora_opts = {}
    if 'username' in options:
        fedora_opts['username'] = options['username']
    if 'password' in options:
        fedora_opts['password'] = options['password']
    self.repo = Repository(**fedora_opts)

    batch = self.repo.get_object(batch_id, type=ProcessingBatch)
    if not batch.exists:
        raise CommandError('Processing batch %s not found' % batch_id)
    print 'Looking for email messages in processing batch "%s"' \
        % batch.label

    try:
        pidman = DjangoPidmanRestClient()
    except:
        raise CommandError('Error initializing PID manager client; ' +
                           'please check settings.')

    self.stats = defaultdict(int)

    # purge old metadata email 'arrangement' objects that belong to this batch
    if not skip_purge:
        self.remove_arrangement_emails(batch)

    # ingest new objects for email mailboxes & messages
    if not purge_only:
        self.ingest_email(folder_path)
def get_default_pid(self):
    if not self._unused_pid_result:
        pidman = DjangoPidmanRestClient()
        result = pidman.search_pids(target=UNUSED_PID_URL)
        # if any were found, use results
        if result and result['results_count']:
            self._unused_pid_result = result['results']

    # if we have any unused pids, pop one off and use it
    if self._unused_pid_result:
        pid_info = self._unused_pid_result.pop()
        ark = pid_info['targets'][0]['access_uri']
        parsed_ark = parse_ark(ark)
        naan = parsed_ark['naan']  # name authority number
        noid = parsed_ark['noid']  # nice opaque identifier

        # use noid as basis for new pid
        pid = '%s:%s' % (self.default_pidspace, noid)
        # calculate target to new object
        target = reverse(self.NEW_OBJECT_VIEW, kwargs={'pid': pid})
        # reverse() returns a full path - absolutize so we get scheme & server also
        target = absolutize_url(target)
        # update pid ark label from object
        pidman.update_ark(noid, name=self.label)
        # update default ark target for new object url
        pidman.update_ark_target(noid, target_uri=target, active=True)

        # if we have a mods datastream, store the ARK as mods:identifier
        if hasattr(self, 'mods'):
            # store full uri and short-form ark
            self.mods.content.identifiers.extend([
                mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
                mods.Identifier(type='uri', text=ark)
            ])
        # always add full uri ARK to dc:identifier
        self.dc.content.identifier_list.append(ark)
        # use the noid to construct a pid in the configured pidspace
        return '%s:%s' % (self.default_pidspace, noid)

    else:
        # if we run out of pids to re-use, fall back to default behavior
        return super(PidReuseDigitalObject, self).get_default_pid()
def upload_for_ht(job, count=1):
    """
    Task to upload files to Box in the background.
    """
    logger = logging.getLogger(__name__)

    kdip_dir = settings.KDIP_DIR

    for kdip in models.KDip.objects.filter(job__id=job.id) \
                                   .exclude(status='uploaded').exclude(status='upload_fail'):

        # Only create a PID if it doesn't already have one
        if job.upload_attempts == 0:
            if not kdip.pid:
                try:
                    pidman_client = DjangoPidmanRestClient()
                    pidman_domain = settings.PIDMAN_DOMAIN
                    pidman_policy = settings.PIDMAN_POLICY

                    ark = pidman_client.create_ark(
                        domain='{}'.format(pidman_domain),
                        target_uri='http://myuri.org',
                        policy='{}'.format(pidman_policy),
                        name='{}'.format(kdip.kdip_id))
                    noid = parse_ark(ark)['noid']

                    kdip.pid = noid
                    kdip.save()

                    logger.info("Ark {} was created for {}".format(ark, kdip.kdip_id))

                except Exception as e:
                    trace = traceback.format_exc()
                    logger.error("Failed creating an ARK for %s: %s" % (kdip.kdip_id, e))
                    reason = "Box upload failed while making an ARK line 161 " + ' ' + trace
                    print 'ERROR: {}'.format(reason)
                    kdip_fail(job, kdip, reason)
            else:
                logger.info("{} already has pid {}".format(kdip.kdip_id, kdip.pid))

        if not os.path.exists(kdip.process_dir):
            os.makedirs(kdip.process_dir)

        # Gather everything and write each file's checksum to a file via the
        # `checksumfile` method, then copy the file to the temp directory.
        # HT does not want sub directories in the package.
        tiffs = glob.glob('{}/{}/TIFF/*.tif'.format(kdip.path, kdip.kdip_id))
        for tiff in tiffs:
            checksumfile(tiff, kdip.process_dir)
            shutil.copy(tiff, kdip.process_dir)

        altos = glob.glob('{}/{}/ALTO/*.xml'.format(kdip.path, kdip.kdip_id))
        for alto in altos:
            checksumfile(alto, kdip.process_dir)
            shutil.copy(alto, kdip.process_dir)
            if 'alto' in alto:
                filename = alto.split('/')
                page, crap, ext = filename[-1].split('.')
                shutil.move(alto, '{}/{}.{}'.format(kdip.process_dir, page, ext))

        ocrs = glob.glob('{}/{}/OCR/*.txt'.format(kdip.path, kdip.kdip_id))
        for ocr in ocrs:
            checksumfile(ocr, kdip.process_dir)
            shutil.copy(ocr, kdip.process_dir)

        checksumfile(kdip.meta_yml, kdip.process_dir)
        checksumfile(kdip.marc_xml, kdip.process_dir)
        checksumfile(kdip.mets_xml, kdip.process_dir)

        shutil.copy(kdip.meta_yml, kdip.process_dir)
        shutil.copy(kdip.marc_xml, kdip.process_dir)
        shutil.copy(kdip.mets_xml, kdip.process_dir)

        # After copying all the files to the tmp directory, verify that each
        # checksum matches the one we made before the move, using `checksumverify()`.
        with open('{}/checksum.md5'.format(kdip.process_dir)) as f:
            content = f.readlines()

        for line in content:
            parts = line.split()
            verify = checksumverify(parts[0], kdip.process_dir, parts[1])
            if verify is not True:
                logger.error('Checksum check fails for %s.' % kdip.process_dir)

        # Make the zip files
        zipf = zipfile.ZipFile('{}.zip'.format(kdip.process_dir), 'w',
                               zipfile.ZIP_DEFLATED, allowZip64=True)
        os.chdir(kdip.process_dir)
        zipdir('.', zipf)
        zipf.close()

        # Delete the process directory to save space,
        # but keep the zip file
        shutil.rmtree(kdip.process_dir)

        attempts = 0
        while attempts < 5:
            try:
                # Don't upload if no pid
                upload_file(job, kdip) if kdip.pid else kdip_fail(
                    job, kdip, '{} has no pid.'.format(kdip.kdip_id))
                break
            except ConnectionError:
                trace = traceback.format_exc()
                attempts += 1
                sleep(5)
                reason = 'Connection Error, failed to upload {}.'.format(kdip.kdip_id)
                print 'ERROR: {}'.format(reason)
                kdip.status = 'retry'
                kdip.save()
                kdip_fail(job, kdip, reason) if attempts == 5 else logger.error(
                    '{} failed to upload on attempt {}: {}'.format(kdip.kdip_id, attempts, trace))
            except SysCallError:
                trace = traceback.format_exc()
                attempts = 5
                reason = "SSL Error while uploading {}: {}".format(kdip.kdip_id, trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)
            except TypeError:
                trace = traceback.format_exc()
                attempts = 5
                reason = "TypeError in upload package for {}: {}".format(kdip.kdip_id, trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)
            except MemoryError:
                trace = traceback.format_exc()
                attempts = 5
                reason = "MemoryError for " + kdip.kdip_id
                logger.error(reason)
                kdip_fail(job, kdip, reason)
            except Exception as e:
                trace = traceback.format_exc()
                attempts = 5
                reason = "Unexpected error for {}: {}, {}".format(kdip.kdip_id, str(e), trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)

    # Check to see if all the KDips uploaded.
    job.upload_attempts = job.upload_attempts + 1
    statuses = job.kdip_set.values_list('status', flat=True)

    if ('retry' in statuses) and (job.upload_attempts < 5):
        # job.upload_attempts = job.upload_attempts + 1
        return upload_for_ht(job, count - 1)

    elif ('upload_fail' in statuses) and (job.upload_attempts == 5):
        job.status = 'failed'
        job.save()

    elif job.upload_attempts == 5:
        job.status = 'being processed'
        job.save()
        recipients = settings.HATHITRUST_CONTACTS + settings.EMORY_MANAGERS
        kdip_list = '\n'.join(job.kdip_set.filter(
            status='uploaded').values_list('kdip_id', flat=True))
        logger.info(kdip_list)
        send_to = settings.HATHITRUST_CONTACTS + settings.EMORY_MANAGERS
        send_from = settings.EMORY_CONTACT
        send_mail('New Volumes from Emory have been uploaded',
                  'The following volumes have been uploaded and are ready:\n\n{}'.format(kdip_list),
                  send_from, send_to, fail_silently=False)

    else:
        return upload_for_ht(job, count - 1)
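checksumfile and checksumverify are called throughout upload_for_ht but are not defined in this excerpt. A rough sketch of what they might look like, consistent with how they are used above (an "md5  filename" entry appended to checksum.md5 in the working directory, then re-checked after the copy); the names below are suffixed _sketch and the exact signatures are assumptions.

import hashlib
import os

def checksumfile_sketch(filepath, process_dir):
    # Hypothetical version of checksumfile: append "<md5>  <filename>" to checksum.md5
    md5 = hashlib.md5()
    with open(filepath, 'rb') as infile:
        for chunk in iter(lambda: infile.read(8192), b''):
            md5.update(chunk)
    with open(os.path.join(process_dir, 'checksum.md5'), 'a') as manifest:
        manifest.write('%s  %s\n' % (md5.hexdigest(), os.path.basename(filepath)))

def checksumverify_sketch(expected_md5, process_dir, filename):
    # Hypothetical version of checksumverify: recompute and compare against the manifest entry
    md5 = hashlib.md5()
    with open(os.path.join(process_dir, filename), 'rb') as infile:
        for chunk in iter(lambda: infile.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest() == expected_md5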
class Command(BaseCommand):
    '''Repair missing ARKs for :class:`~keep.collection.models.CollectionObject`
    objects based on the correct ARK from PIDMAN.
    '''
    args = '[PID [PID...]]'
    help = '''Repair ARKs on Keep Collections or Audio objects.
        Optionally accepts a list of PIDs to be repaired.
        If no pids are specified, will find all collection objects and attempt to repair them.'''

    option_list = BaseCommand.option_list + (
        make_option('--dry-run', dest='dry_run', action='store_true', default=False,
                    help='Report which ARKs would be repaired'),
    )

    def handle(self, *args, **options):
        self.options = options
        self.repaired_count = 0
        self.unrepaired_count = 0
        repo = Repository()
        self.pidman = DjangoPidmanRestClient()

        # populate list of objects to be processed
        objects = []
        for pid in args:
            try:
                obj = repo.get_object(pid=pid, type=CollectionObject)
                if obj.has_requisite_content_models:
                    objects.append(obj)
                else:
                    obj = repo.get_object(pid=pid, type=AudioObject)
                    if obj.has_requisite_content_models:
                        objects.append(obj)
            except Exception:
                self.log(message="Could not find Collection or Audio object for: %s" % pid)

        # get list of all collections from the repository,
        # limited to the COLLECTION_CONTENT_MODEL; returns Keep-specific collection objects
        if not args:
            objects = repo.get_objects_with_cmodel(
                CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)
            if not objects:
                self.log(message="No Collections were found.")

        for obj in objects:
            self.repair_ark(obj)

        self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired"
                 % (self.repaired_count, self.unrepaired_count), no_label=True)

    def repair_ark(self, obj):
        ark_target = None
        try:
            ark_target = self.pidman.get_ark_target(noid=obj.noid, qualifier='')
        except:
            self.unrepaired_count += 1
            self.log(level=WARNING, message="Failed to find ARK target for %s" % (obj.pid))
            return

        parsed_ark = parse_ark(ark_target['access_uri'])
        naan = parsed_ark['naan']
        noid = parsed_ark['noid']

        if hasattr(obj, 'mods'):
            obj.mods.content.identifiers.extend([
                mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
                mods.Identifier(type='uri', text=ark_target['access_uri'])
            ])
        else:
            obj.dc.content.identifier_list.append(ark_target['access_uri'])

        if self.options['dry_run']:
            self.unrepaired_count += 1
            self.log(message='ARK target found for %s' % obj.pid)
            return

        # save the collection object w/ updated ark
        try:
            self.log(level=INFO, message="Attempting to save %s" % obj.pid)
            obj.save(logMessage='Fixing missing ARK')
            self.repaired_count += 1
        except DigitalObjectSaveFailure:
            self.log(message="An error occurred while saving %s" % (obj.pid))

    def log(self, level=INFO, message='', no_label=False):
        '''
        Convenience log function.
        WARNING level is only logged if the --verbosity flag is set to 2.
        INFO level is default and always logged.
        no_label can be set to True if a WARNING or INFO label is not desired.
        '''
        if level == WARNING and not int(self.options['verbosity']) == WARNING:
            return
        output_str = ''
        if not no_label:
            output_str = '%s: ' % LOG_LEVEL[level]
        print "%s%s" % (output_str, message)
def handle(self, *pids, **options):
    dry_run = options.get('dry_run', False)
    verbosity = int(options.get('verbosity', self.v_normal))
    repo = Repository()
    try:
        pidman = DjangoPidmanRestClient()
    except Exception as err:
        # error if pid manager config options not in localsettings
        raise CommandError(err)

    # if pids are specified on command line, only process those objects
    if pids:
        objs = [repo.get_object(pid, type=Volume) for pid in pids]
    # otherwise, look for all volume objects in fedora
    else:
        objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL, type=Volume)

    stats = defaultdict(int)
    for obj in objs:
        if not obj.exists:
            if verbosity >= self.v_normal:
                self.stdout.write('%s does not exist or is not accessible' % obj.pid)
            stats['skipped'] += 1
            continue

        stats['objs'] += 1

        if is_ark(obj.dc.content.identifier):
            parsed_ark = parse_ark(obj.dc.content.identifier)
            noid = parsed_ark['noid']
            try:
                ark_info = pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if verbosity >= self.v_normal:
                    if '404: NOT FOUND' in str(err):
                        msg = 'not found'
                        self.stdout.write('Error retrieving ARK information for %s: Not Found' % obj.pid)
                    else:
                        self.stdout.write('Error retrieving ARK information for %s' % obj.pid)
                stats['skipped'] += 1
                continue

            # update unqualified ark to resolve to readux volume landing page
            if not dry_run:
                pidman.update_ark_target(noid, target_uri=self.volume_url(obj),
                                         active=True)

            # we expect a qualified ARK target for the PDF; update whether
            # it currently exists or not
            qual = 'PDF'
            stats['updated'] += 1  # count as updated in dry run mode (would be updated)
            if not dry_run:
                pidman.update_ark_target(noid, qual, target_uri=self.pdf_url(obj),
                                         active=True)
            # FIXME: catch possible exceptions here?

    # output summary
    if verbosity >= self.v_normal:
        msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
        msg = msg % ('s' if stats['objs'] != 1 else '', ' would have' if dry_run else '')
        self.stdout.write(msg)
def remove_arrangement_emails(self, batch):
    '''Find and iterate over all items that are part of the specified batch.
    Purge email message objects and update the corresponding ARK records
    for re-use on ingest.
    '''
    items = list(batch.rels_ext.content.objects(batch.uriref, relsext.hasMember))
    for i in items:
        # for now, init as arrangement objects
        obj = self.repo.get_object(str(i), type=ArrangementObject)
        # NOTE: in dev/test, collection currently references all items
        # but only a handful actually exist in dev/test repo; just skip
        if not obj.exists:
            continue

        # number of objects
        self.stats['count'] += 1

        if not obj.filetech.exists or not obj.filetech.content.file:
            print 'Error: no file tech for %s; skipping' % obj.pid
            continue

        # 5300c email messages should only have one file path.
        # Identify email messages by file path starting with
        # email folder name and no checksum
        file_info = obj.filetech.content.file[0]
        if not re.match(self.email_path_regex, file_info.path) or \
                file_info.md5:
            # not an email message - skip to next item
            continue

        self.stats['email'] += 1

        # if in no-act mode, nothing else to do
        if self.noact:
            continue

        # not in no-act mode: update pid, purge object
        try:
            # reinit client as a workaround for pidman errors (?)
            pidman = DjangoPidmanRestClient()
            # update ark name/domain
            pidman.update_ark(obj.noid, name=UNUSED_PID_NAME,
                              domain=settings.PIDMAN_DOMAIN)
            # mark default target as inactive
            pidman.update_ark_target(obj.noid, active=False,
                                     target_uri=UNUSED_PID_URL)
            self.stats['pids'] += 1
            if self.verbosity > self.v_normal:
                print 'Updated ARK for %s' % obj.noid
        except Exception as e:
            print 'Error updating ARK for %s: %s' % \
                (obj.noid, e)

        # purge record
        try:
            self.repo.purge_object(obj.pid,
                                   'removing metadata arrangement 5300c email record')
            self.stats['purged'] += 1
            if self.verbosity > self.v_normal:
                print 'Purged %s' % obj.pid
        except RequestFailed as e:
            self.stats['purge_error'] += 1
            print 'Error purging %s: %s' % (obj.pid, e)

    # summary
    if self.verbosity >= self.v_normal:
        print '''\nChecked %(count)d records, found %(email)d emails''' % self.stats
        if not self.noact:
            print 'Updated %(pids)d ARK(s); purged %(purged)d objects, error purging %(purge_error)d objects' \
                % self.stats
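remove_arrangement_emails marks an ARK as released by renaming it and pointing its default target at a sentinel URL, and PidReuseDigitalObject.get_default_pid (earlier in this listing) reclaims those same records by searching on that target. A sketch of the shared module-level constants the two pieces appear to coordinate through; only the names come from the examples, the values are assumptions.

# Assumed module-level constants; names taken from the examples, values illustrative.
UNUSED_PID_NAME = 'unused pid'                           # label applied when an ARK is released
UNUSED_PID_URL = 'http://library.emory.edu/unused-pid'   # sentinel target searched on re-use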
from keep.collection.models import SimpleCollection
from keep.common.models import PremisFixity, PremisObject, PremisEvent
from keep.common.fedora import ArkPidDigitalObject, Repository
from keep.common.utils import solr_interface
from keep.collection.models import CollectionObject
from keep.file.utils import sha1sum

logger = logging.getLogger(__name__)

# content models currently used for xacml access / restriction
ACCESS_ALLOWED_CMODEL = "info:fedora/emory-control:ArrangementAccessAllowed-1.0"
ACCESS_RESTRICTED_CMODEL = "info:fedora/emory-control:ArrangementAccessRestricted-1.0"

# try to configure a pidman client to get pids.
try:
    pidman = DjangoPidmanRestClient()
except:
    # if we're in dev mode then we can fall back on the fedora default
    # pid allocator. in non-dev, though, we really need pidman
    if getattr(settings, 'DEV_ENV', False):
        pidman = None
    else:
        raise


# FIXME: what about this one? emory-control:RushdieResearcherAllowed-1.0

class Arrangement(models.Model):
    'Place-holder DB model to define permissions for "arrangement" objects'
    class Meta:
def handle(self, *pids, **options):
    # testPid
    # settings.PIDMAN_HOST = 'https://testpid.library.emory.edu/'  # the web root where we'll ask for pids
    # settings.PIDMAN_USER = ''
    # settings.PIDMAN_PASSWORD = ''
    # settings.PIDMAN_DOMAIN = 'https://testpid.library.emory.edu/domains/18/'  # default domain (e.g. when minting pids)

    # prodPid
    # PIDMAN_HOST = 'https://pidqas.library.emory.edu/'

    # get a pidman client
    client = DjangoPidmanRestClient()

    # testFedora
    repo = Repository(settings.FEDORA_ROOT, username=settings.FEDORA_MANAGEMENT_USER,
                      password=settings.FEDORA_MANAGEMENT_PASSWORD)

    # prodFedora
    # repo = Repository('https://fedora.library.emory.edu:8443/fedora/', username='******', password='******')

    # constants
    REPOMGMT = Namespace(rdflib.URIRef('info:fedora/fedora-system:def/relations-external#'))

    vol_list = repo.get_objects_with_cmodel('info:fedora/emory-control:ScannedVolume-1.0')
    print "Found " + str(len(vol_list)) + " books."

    # Get a file logger
    filename = "ecds/" + str(datetime.datetime.now().strftime("%I-%M-%S %B-%d-%Y")) + ".csv"
    f = open(filename, 'w+')

    # report all books
    f.write("Found " + str(len(vol_list)) + " books.")
    f.write("\n")

    # report titles
    f.write("TYPE,")
    f.write("PID,")
    f.write("NOID,")
    f.write("O_URI,")
    f.write("N_URI,")
    f.write("PAGE,")
    f.write("POST_URI,")
    # f.write("POST_PDF_URI,")
    f.write("\n")

    # go over all books
    for vol in vol_list:
        volDobj = repo.get_object(vol.pid.rstrip(), type=ScannedVolume)

        # get attributes
        pid = volDobj.pid
        noid = pid.split(":")[1]
        try:
            pidmanObj = client.get_pid("ark", noid)
        except Exception as e:
            f.write(str(pid))
            f.write("\n")
            f.write(str(e))
            continue  # continue to the next item
        oriTargetUri = pidmanObj["targets"][0]["target_uri"]
        newTargetUri = oriTargetUri

        # if it has emory%3A
        if newTargetUri.find("emory%3A") != -1:
            newTargetUri = newTargetUri.replace("emory%3A", "emory:")
        # if it has readux%3A
        if newTargetUri.find("readux%3A") != -1:
            newTargetUri = newTargetUri.replace("readux%3A", "emory:")
        # if it has readux:
        if newTargetUri.find("readux:") != -1:
            newTargetUri = newTargetUri.replace("readux:", "emory:")
        # if it has webprd001.library.emory.edu/readux
        if newTargetUri.find("webprd001.library.emory.edu/readux") != -1:
            newTargetUri = newTargetUri.replace("webprd001.library.emory.edu/readux",
                                                "testreadux.ecds.emory.edu")
        # if it has webprd001.library.emory.edu
        if newTargetUri.find("webprd001.library.emory.edu/") != -1:
            newTargetUri = newTargetUri.replace("webprd001.library.emory.edu/",
                                                "testreadux.ecds.emory.edu/")
        # if it has /readux/
        if newTargetUri.find("/readux/") != -1:
            newTargetUri = newTargetUri.replace("/readux/", "/")

        newTargetUri = unicode(newTargetUri)

        # log attributes
        f.write("BOOK" + ", ")
        f.write(str(pid) + ", ")
        f.write(str(noid) + ", ")
        f.write(str(oriTargetUri) + ", ")
        f.write(str(newTargetUri) + ", ")
        f.write(str(len(volDobj.pageDObjs)) + ", ")
        f.write("\n")

        # report attributes
        print("BOOK - " + str(pid) + " - " + str(len(volDobj.pageDObjs)) + " pages")

        # TODO update target
        # if newTargetUri != oriTargetUri:
        #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri)
        #     updated_target_uri = response["target_uri"]
        #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri, qualifier="PDF")
        #     updated_pdf_target_uri = response["target_uri"]
        #     f.write(str(updated_target_uri) + ", ")
        #     f.write(str(updated_pdf_target_uri) + ", ")

        # update pages
        page_count = 0
        for p in volDobj.get_pages():
            page_count = page_count + 1

            # Get all relevant attributes
            pid = p
            noid = pid.split(":")[1]
            try:
                pidmanObj = client.get_pid("ark", noid)
            except Exception as e:
                f.write(str(pid))
                f.write("\n")
                f.write(str(e))
                continue  # continue to the next item
            oriTargetUri = pidmanObj["targets"][0]["target_uri"]
            newTargetUri = unicode(oriTargetUri)

            # if it has readux%3A%7B%25PID%25%7D
            if newTargetUri.find("readux%3A%7B%25PID%25%7D") != -1:
                newTargetUri = newTargetUri.replace("readux%3A%7B%25PID%25%7D", pid)
            # if it has readux:abc1234
            if newTargetUri.find("readux:") != -1:
                newTargetUri = newTargetUri.replace("readux:", "emory:")
            # if it has readux%3A
            if newTargetUri.find("readux%3A") != -1:
                newTargetUri = newTargetUri.replace("readux%3A", "emory:")
            # if it has /readux/
            if newTargetUri.find("/readux/") != -1:
                newTargetUri = newTargetUri.replace("/readux/", "/")
            # if it has webprd001.library.
            if newTargetUri.find("webprd001.library.emory.") != -1:
                newTargetUri = newTargetUri.replace("webprd001.library.emory.",
                                                    "testreadux.ecds.emory.")

            newTargetUri = unicode(newTargetUri)

            # Log attributes
            f.write("page" + ", ")
            f.write(str(pid) + ", ")
            f.write(str(noid) + ", ")
            f.write(str(oriTargetUri) + ", ")
            f.write(str(newTargetUri) + ", ")
            f.write(str(page_count) + ", ")

            try:
                print(str(page_count) + "/" + str(len(volDobj.pageDObjs)) + " - " + str(noid) + " - page update")
                # TODO update target
                # if newTargetUri != oriTargetUri:
                #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri)
                #     updated_target_uri = response["target_uri"]
                #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri, qualifier="PDF")
                #     updated_pdf_target_uri = response["target_uri"]
                #     f.write(str(noid) + " - page success" + ", ")
                #     f.write(str(noid) + " - page pdf success" + ", ")
            except:
                print(str(page_count) + "/" + str(len(volDobj.pageDObjs)) + " - " + str(noid) + " - page fail")
                f.write(str(noid) + " - page fail" + ", ")

            f.write("\n")

        f.write("\n")

    f.close()