def handle(self, *args, **options):
    verbosity = int(options['verbosity'])   # 1 = normal, 0 = minimal, 2 = all
    v_normal = 1

    if verbosity > v_normal:
        print "Preparing documents from all defined Archives"

    updated = 0
    unchanged = 0
    errored = 0

    if len(args):
        files = args
    else:
        files = set()
        svn = svn_client()
        for archive in Archive.objects.all():
            # update to make sure we have latest version of everything
            svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
            files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

    for file in files:
        try:
            ead = load_xmlobject_from_file(file, FindingAid)
            orig_xml = ead.serializeDocument(pretty=True)
            unitid = unicode(ead.archdesc.unitid)
            match = self.unitid_regex.search(unitid)
            if not match:
                raise Exception('Could not determine collection number for %s - %s' % \
                                (file, unitid))
            collection_num = match.group('number')
            if verbosity > v_normal:
                print "Identifier for %s is %s (%s)" % (file, collection_num, unitid)
            ead.archdesc.unitid.identifier = collection_num

            if orig_xml == ead.serializeDocument(pretty=True):
                if verbosity > v_normal:
                    print "No changes made to %s" % file
                unchanged += 1
            else:
                with open(file, 'w') as f:
                    ead.serializeDocument(f, pretty=True)
                if verbosity > v_normal:
                    print "Updated %s" % file
                updated += 1

        except XMLSyntaxError, e:
            # xml is not well-formed
            print "Error: failed to load %s (document not well-formed XML?)" \
                % file
            errored += 1
        except Exception, e:
            # catch any other exceptions
            print "Error: failed to set identifier for %s : %s" % (file, e)
            errored += 1
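# The command above relies on self.unitid_regex, defined elsewhere on the Command
# class. A minimal illustrative pattern with the required 'number' named group
# might look like the following; the actual pattern is an assumption and is
# likely stricter about the unitid format:
#
#     unitid_regex = re.compile(r'(?P<number>\d+)')
#
# e.g. searching u'Manuscript Collection No. 1000' with that pattern would yield
# match.group('number') == '1000'.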
def archive_svn_checkout(archive, update=False):
    client = svn_client()
    # if this is an update, clear out existing svn checkout
    if update and os.path.isdir(archive.svn_local_path):
        shutil.rmtree(archive.svn_local_path)
        logger.info('removing outdated svn directory %s' % archive.svn_local_path)

    client.checkout(archive.svn, archive.svn_local_path, 'HEAD')
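# archive_svn_checkout is invoked with .delay() in archive_save_hook below, which
# implies it is registered as a Celery task. A minimal sketch of that registration
# (the shared_task decorator and import shown here are assumptions; the project
# may use an older task decorator instead):
#
#     from celery import shared_task
#
#     @shared_task
#     def archive_svn_checkout(archive, update=False):
#         ...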
def archive_save_hook(sender, instance, created, raw, using, update_fields, **kwargs):
    # check if an svn update or checkout is needed before queuing the task
    updated = False
    if not created:
        # if directory doesn't exist, check it out
        if not os.path.isdir(instance.svn_local_path):
            updated = True
        # if path already exists, check if the svn url has changed
        else:
            client = svn_client()
            svninfo = client.info(instance.svn_local_path, depth=0)
            current_svn_url = svninfo[svninfo.keys()[0]].url
            if current_svn_url != instance.svn:
                updated = True

    if created or updated:
        result = archive_svn_checkout.delay(instance, update=updated)
        task = TaskResult(label='SVN checkout',
                          object_id=instance.label,   # will be displayed in task result
                          url=reverse('admin:fa_archive_change', args=[instance.pk]),   # link in task result
                          task_id=result.task_id)
        task.save()
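# A minimal sketch of how archive_save_hook could be wired up; the
# (sender, instance, created, raw, using, update_fields) signature matches
# Django's post_save signal, but the exact registration point in this project
# is an assumption:
from django.db.models.signals import post_save

post_save.connect(archive_save_hook, sender=Archive)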
def files_to_publish(archive):
    # determine local/remote revision to see if an update is needed
    start = time.time()
    svnwc = wc.WorkingCopy(None, archive.svn_local_path)   # NOTE: second arg is path; first arg not documented (?!)
    local_rev = svnwc.entry(archive.svn_local_path).revision
    logger.debug('svn local revision for %s is %d (%f sec)' %
                 (archive.slug, local_rev, time.time() - start))

    remote = svn_remote(archive.svn)
    start = time.time()
    latest_rev = remote.get_latest_revnum()
    logger.debug('svn remote revision for %s is %d (%f sec)' %
                 (archive.slug, latest_rev, time.time() - start))

    # ONLY do an svn update if the revisions don't match
    if local_rev != latest_rev:
        svn = svn_client()
        start = time.time()
        svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
        logger.debug('svn update %s in %f sec' % (archive.slug, time.time() - start))

    # return list of recent xml files from the working copy
    return svn_xml_files(archive)
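# svn_xml_files() is used above but not shown in this excerpt. A hedged sketch of
# what it plausibly does (return the XML files in the archive's svn working copy,
# as the management commands elsewhere in this module do); the real helper may
# filter or sort differently:
def _svn_xml_files_sketch(archive):
    # glob *.xml files directly under the working copy directory
    return glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))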
            # any exception on prep is most likely ark generation
            return HttpResponseServerError('Failed to prep the document: ' + str(e))

    # on GET, display the xml and make available for download
    if request.method == 'GET':
        response = HttpResponse(prepped_xml, content_type='application/xml')
        response['Content-Disposition'] = "attachment; filename=%s" % filename
        return response

    # on POST, save to file and commit to subversion
    if request.method == 'POST':
        file_path = os.path.join(arch.svn_local_path, filename)
        with open(file_path, 'w') as xmlfile:
            xmlfile.write(prepped_xml)

        svn = svn_client()

        # seems to be the only way to set a commit log message via client
        def get_log_message(arg):
            # argument looks something like this:
            # [('foo', 'https://svn.library.emory.edu/svn/dev_ead-eua/trunk/eua0081affirmationvietnam.xml', 6, None, 4)]
            # ignoring since we will only use this function for a single commit
            return 'prepared EAD via FindingAids website admin, saved on behalf of %s' % request.user

        svn.log_msg_func = get_log_message
        saved = svn.commit(str(file_path))   # has to be string and not unicode
        # commit returns something like this on success:
        #   (8, '2013-11-13T18:19:00.191382Z', 'keep')
        #   revision number, date, user
        # returns nothing if there were no changes to commit
        if saved:
def handle(self, *args, **options):
    verbosity = int(options['verbosity'])   # 1 = normal, 0 = minimal, 2 = all
    v_normal = 1
    v_all = 2

    if options['pdf_only'] and options['skip_pdf_reload']:
        raise CommandError("Options -s and -p are not compatible")

    # check for required settings
    if not hasattr(settings, 'EXISTDB_ROOT_COLLECTION') or not settings.EXISTDB_ROOT_COLLECTION:
        raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")

    if len(args):
        files = args
    else:
        # Note: copied from prep_ead manage command; move somewhere common?
        files = set()
        svn = svn_client()
        for archive in Archive.objects.all():
            # update to make sure we have latest version of everything
            svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
            files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

    if verbosity == v_all:
        print 'Documents will be loaded to configured eXist collection: %s' \
            % settings.EXISTDB_ROOT_COLLECTION
        if options['skip_pdf_reload']:
            print "** Skipping PDFs cache reload"

    db = ExistDB()

    loaded = 0
    errored = 0
    pdf_tasks = {}

    start_time = datetime.now()

    if not options['pdf_only']:
        # unless PDF reload only has been specified, load files
        for file in files:
            try:
                # full path location where file will be loaded in exist db collection
                dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file)
                errors = check_ead(file, dbpath)
                if errors:
                    # report errors, don't load
                    errored += 1
                    print "Error: %s does not pass publication checks; not loading to eXist." % file
                    if verbosity >= v_normal:
                        print " Errors found:"
                        for err in errors:
                            print "   %s" % err
                else:
                    with open(file, 'r') as eadfile:
                        success = db.load(eadfile, dbpath, overwrite=True)

                    if success:
                        loaded += 1
                        if verbosity >= v_normal:
                            print "Loaded %s" % file
                        # load the file as a FindingAid object to get the eadid for PDF reload
                        ead = load_xmlobject_from_file(file, FindingAid)
                        # trigger PDF regeneration in the cache and store task result
                        # - unless user has requested PDF reload be skipped
                        if not options['skip_pdf_reload']:
                            pdf_tasks[ead.eadid.value] = reload_cached_pdf.delay(ead.eadid.value)
                            # NOTE: unlike the web admin publish, this does not
                            # generate TaskResult db records; task outcomes will be
                            # checked & reported before the script finishes
                    else:
                        errored += 1
                        print "Error: failed to load %s to eXist" % file
            except ExistDBException, e:
                print "Error: failed to load %s to eXist" % file
                print e.message()
                errored += 1

    # output a summary of what was done
    print "%d document%s loaded" % (loaded, 's' if loaded != 1 else '')
    print "%d document%s with errors" % (errored, 's' if errored != 1 else '')
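# The excerpt above ends before pdf_tasks is consumed; the inline NOTE says task
# outcomes are checked and reported before the script finishes. A hedged sketch of
# such a check using standard Celery AsyncResult methods (the exact reporting in
# the real command is an assumption):
#
#     for eadid, task in pdf_tasks.iteritems():
#         if task.ready() and task.successful():
#             print "PDF cache reloaded for %s" % eadid
#         else:
#             print "PDF cache reload did not complete for %s" % eadid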
def handle(self, *args, **options):
    verbosity = int(options.get('verbosity', self.v_normal))
    svn_commit = options.get('commit', False)
    dry_run = options.get('dryrun', False)

    # check for required settings
    if not hasattr(settings, 'KEEP_SOLR_SERVER_URL') or not settings.KEEP_SOLR_SERVER_URL:
        raise CommandError("KEEP_SOLR_SERVER_URL setting is required for this script")

    solr = solr_interface()

    if verbosity > self.v_normal:
        print "Preparing documents from all defined Archives"
        if dry_run:
            print "Running in dry-run mode; no changes will be made"

    updated = 0
    unchanged = 0
    errored = 0

    if len(args):
        files = args
    else:
        # Note: copied from prep_ead manage command; move somewhere common?
        files = set()
        svn = svn_client()
        for archive in Archive.objects.all():
            # update to make sure we have latest version of everything
            svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
            files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

    for file in files:
        file_items = 0
        daos = 0
        try:
            if verbosity >= self.v_normal and len(files) > 1:
                self.stdout.write('\nProcessing %s' % os.path.basename(file))
            ead = load_xmlobject_from_file(file, FindingAid)
            orig_xml = ead.serializeDocument()   # keep to check if changed

            for c in self.ead_file_items(ead):
                # if item already contains any dao tags, skip it (no further processing needed)
                if c.did.dao_list:
                    continue

                match = self.has_digitized_content(unicode(c.did.unittitle))
                if match:
                    file_items += 1
                    try:
                        id_list = self.id_list(match.groupdict()['ids'])
                    except Exception as e:
                        self.stdout.write('Error parsing ids from "%s" : %s' % \
                                          (unicode(c.did.unittitle), e))
                        continue

                    # if no ids were found even though title seemed to have digitized content,
                    # error and skip to next
                    if not id_list:
                        self.stdout.write('Appears to have digitized content, but no ids found in "%s"' % \
                                          (unicode(c.did.unittitle)))
                        continue

                    # dictionary for any Keep info corresponding to these ids
                    id_info = {}

                    # look up each id in the Keep
                    for i in id_list:
                        q = solr.query(solr.Q(dm1_id="%s" % i) | solr.Q(pid="emory:%s" % i)) \
                                .field_limit(['ark_uri', 'pid'])
                        if q.count() == 1:
                            id_info[i] = q[0]

                    # remove the plain-text digitized ids from unittitle content
                    # (handle as unicode to preserve any special characters)
                    # NOTE: because unittitle could contain nested tags (dates,
                    # titles, names, etc), iterate through the text nodes and
                    # remove the digitized note wherever it occurs
                    # - use lxml smart strings to update based on parent nodes
                    text_nodes = c.did.unittitle.node.xpath('text()')
                    for txt in text_nodes:
                        updated_txt = re.sub(self.digitized_ids, u'', txt)
                        if txt.is_text:
                            txt.getparent().text = updated_txt
                        else:
                            txt.getparent().tail = updated_txt

                    # ensure document has xlink namespace declared at the top
                    # or else it will be repeated for each dao

                    for i in id_list:
                        info = id_info.get(i, None)
                        # append a new dao for each id; audience will always be internal
                        dao_opts = {'audience': 'internal'}
                        href = None

                        if info:
                            # in some cases in production, a record is found but no
                            # ark_uri is indexed in solr (indicates ark_uri not in MODS)
                            try:
                                href = info['ark_uri']
                            except KeyError:
                                self.stdout.write('Warning: Keep record was found for %s but no ARK URI is indexed' \
                                                  % i)

                        # if no record was found, *should* be a digital masters id
                        if href is None:
                            # if id already starts with dm, don't duplicate the prefix
                            if i.startswith('dm'):
                                dao_opts['id'] = i
                            # if it's a digit, add dm prefix
                            elif i.isdigit():
                                dao_opts['id'] = 'dm%s' % i
                            # otherwise, warn and add the id in pid notation
                            else:
                                # only warn if we didn't already warn about info without ark uri
                                if not info:
                                    self.stdout.write('Warning: non-digital masters id %s not found in the Keep' \
                                                      % i)
                                # generate an ark anyway, since pids don't make valid ids
                                href = 'http://pid.emory.edu/ark:/25593/%s' % i

                        c.did.dao_list.append(eadmap.DigitalArchivalObject(**dao_opts))
                        if href is not None:
                            c.did.dao_list[-1].href = href
                        # clean up any extra namespaces (exist-db ns)
                        cleanup_namespaces(c.did.dao_list[-1].node)
                        daos += 1

            # NOTE: could use pretty=True, but not used elsewhere in fa_admin,
            # so leaving off for consistency
            if orig_xml == ead.serializeDocument():
                if verbosity > self.v_normal:
                    self.stdout.write("No changes made to %s" % file)
                unchanged += 1
            else:
                # in dry run, don't actually change the file
                if not dry_run:
                    with open(file, 'w') as f:
                        ead.serializeDocument(f)
                if verbosity >= self.v_normal:
                    self.stdout.write("Updated %s; found %d item%s with digitized content, added %d <dao>%s" \
                        % (file, file_items, 's' if file_items != 1 else '',
                           daos, 's' if daos != 1 else ''))
                updated += 1

        except XMLSyntaxError:
            # xml is not well-formed
            self.stdout.write("Error: failed to load %s (document not well-formed XML?)" \
                              % file)
            errored += 1
        # except Exception, e:
        #     # catch any other exceptions
        #     print "Error: failed to update %s : %s" % (file, e)
        #     errored += 1

    # TODO: might be nice to also report total number of daos added

    # summary of what was done
    self.stdout.write("\n%d document%s updated" % (updated, 's' if updated != 1 else ''))
    self.stdout.write("%d document%s unchanged" % (unchanged, 's' if unchanged != 1 else ''))
    self.stdout.write("%d document%s with errors" % (errored, 's' if errored != 1 else ''))

    if svn_commit:
        svn = svn_client()

        # seems to be the only way to set a commit log message via client
        def get_log_message(arg):
            # argument looks something like this:
            # [('foo', 'https://svn.library.emory.edu/svn/dev_ead-eua/trunk/eua0081affirmationvietnam.xml', 6, None, 4)]
            # ignoring since we will only use this function for a single commit
            return 'converted digitized item ids to <dao> tags'

        svn.log_msg_func = get_log_message
        for archive in Archive.objects.all():
            # commit any changes in this archive's working copy
            svn.commit(str(archive.svn_local_path))
def handle(self, *args, **options):
    verbosity = int(options['verbosity'])
    self._setup_logging(verbosity)

    # check for required settings
    if not hasattr(settings, 'EXISTDB_ROOT_COLLECTION') or not settings.EXISTDB_ROOT_COLLECTION:
        raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")

    if verbosity == self.v_all:
        print "Preparing documents from all defined Archives"

    updated = 0
    unchanged = 0
    errored = 0

    if len(args):
        files = args
    else:
        files = set()
        svn = svn_client()
        for archive in Archive.objects.all():
            # update to make sure we have latest version of everything
            svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
            files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

    for file in files:
        try:
            ead = load_xmlobject_from_file(file, FindingAid)
            orig_xml = ead.serializeDocument(pretty=True)
            ead = utils.prep_ead(ead, file)

            # sanity check before saving
            dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file)
            errors = utils.check_ead(file, dbpath, xml=ead.serializeDocument())
            if errors:
                errored += 1
                print "Prepared EAD for %s does not pass sanity checks, not saving." % file
                if verbosity >= self.v_normal:
                    print "Errors found:"
                    for err in errors:
                        # some errors include a list of error instances - display nicely
                        if isinstance(err, list):
                            for suberr in err:
                                print "  %s" % suberr
                        else:
                            print " %s" % err
            elif orig_xml == ead.serializeDocument(pretty=True):
                if verbosity >= self.v_normal:
                    print "No changes made to %s" % file
                unchanged += 1
            else:
                with open(file, 'w') as f:
                    ead.serializeDocument(f, pretty=True)
                if verbosity >= self.v_normal:
                    print "Updated %s" % file
                updated += 1

        except XMLSyntaxError, e:
            # xml is not well-formed
            print "Error: failed to load %s (document not well-formed XML?)" \
                % file
            errored += 1
        except Exception, e:
            # catch any other exceptions
            print "Error: failed to prep %s : %s" % (file, e)
            errored += 1