def handle(self, *args, **options):
        """Set collection-number identifiers on EAD finding aid documents.

        For each XML file given as an argument (or, when no files are given,
        every ``*.xml`` file in each Archive's svn working copy), parse the
        file as a FindingAid, extract the collection number from the
        ``<unitid>`` text via ``self.unitid_regex``, store it in the
        unitid's ``identifier`` attribute, and rewrite the file only when
        the serialized document actually changed.
        """
        verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        v_normal = 1

        if verbosity > v_normal:
            print "Preparing documents from all defined Archives"

        # counters for files touched, skipped, and failed
        updated = 0
        unchanged = 0
        errored = 0

        if len(args):
            # explicit file paths were given on the command line
            files = args
        else:
            # no arguments: collect all xml files from every archive checkout
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

        for file in files:
            try:
                ead = load_xmlobject_from_file(file, FindingAid)
                # keep the original serialization to detect whether anything changed
                orig_xml = ead.serializeDocument(pretty=True)
                unitid = unicode(ead.archdesc.unitid)

                match = self.unitid_regex.search(unitid)
                if not match:
                    raise Exception('Could not determine collection number for %s - %s' % \
                            (file, unitid))

                collection_num = match.group('number')
                if verbosity > v_normal:
                    print "Identifier for %s is %s (%s)" % (file, collection_num, unitid)
                ead.archdesc.unitid.identifier = collection_num

                # only rewrite the file when the serialized output differs
                if orig_xml == ead.serializeDocument(pretty=True):
                    if verbosity > v_normal:
                        print "No changes made to %s" % file
                    unchanged += 1
                else:
                    with open(file, 'w') as f:
                        ead.serializeDocument(f, pretty=True)
                    if verbosity > v_normal:
                        print "Updated %s" % file
                    updated += 1
            except XMLSyntaxError, e:
                # xml is not well-formed
                print "Error: failed to load %s (document not well-formed XML?)" \
                            % file
                errored += 1
            except Exception, e:
                # catch any other exceptions
                print "Error: failed to set identifier for %s : %s" % (file, e)
                errored += 1
Пример #2
0
def archive_svn_checkout(archive, update=False):
    """Check out an archive's svn repository to its local path.

    :param archive: object with ``svn`` (repository url) and
        ``svn_local_path`` attributes
    :param update: when True, any existing checkout at
        ``archive.svn_local_path`` is removed first so the checkout is fresh
    """
    client = svn_client()

    # if this is an update, clear out existing svn checkout
    if update and os.path.isdir(archive.svn_local_path):
        # log *before* removing, so there is a record of the attempt even
        # if rmtree fails (previously the log ran after the removal)
        logger.info('removing outdated svn directory %s' %
                    archive.svn_local_path)
        shutil.rmtree(archive.svn_local_path)

    client.checkout(archive.svn, archive.svn_local_path, 'HEAD')
Пример #3
0
def archive_save_hook(sender, instance, created, raw, using,
                      update_fields, **kwargs):
    """Post-save signal handler for Archive: queue an svn checkout task
    when the record is new, when the working copy directory is missing,
    or when the configured svn url no longer matches the checkout."""
    # decide whether an existing checkout needs to be replaced
    needs_update = False
    if not created:
        if os.path.isdir(instance.svn_local_path):
            # directory exists: re-checkout only if the svn url changed
            client = svn_client()
            svninfo = client.info(instance.svn_local_path, depth=0)
            current_svn_url = svninfo[svninfo.keys()[0]].url
            needs_update = current_svn_url != instance.svn
        else:
            # working copy directory is missing entirely
            needs_update = True

    if created or needs_update:
        result = archive_svn_checkout.delay(instance, update=needs_update)
        record = TaskResult(label='SVN checkout',
            object_id=instance.label,  # will be displayed in task result
            url=reverse('admin:fa_archive_change', args=[instance.pk]), # link in task result
            task_id=result.task_id)
        record.save()
Пример #4
0
def files_to_publish(archive):
    """Return recent xml files from an archive's svn working copy,
    running ``svn update`` first only when the remote repository has a
    newer revision than the local checkout."""
    # revision of the local working copy
    timer = time.time()
    working_copy = wc.WorkingCopy(None, archive.svn_local_path)
    # NOTE: second arg is path; first arg not documented (?!)
    local_rev = working_copy.entry(archive.svn_local_path).revision
    logger.debug('svn local revision for %s is %d (%f sec)' %
                (archive.slug, local_rev, time.time() - timer))

    # latest revision on the remote repository
    remote = svn_remote(archive.svn)
    timer = time.time()
    latest_rev = remote.get_latest_revnum()
    logger.debug('svn remote revision for %s is %d (%f sec)' %
                (archive.slug, latest_rev, time.time() - timer))

    # ONLY do an svn update if the revisions don't match
    if local_rev != latest_rev:
        client = svn_client()
        timer = time.time()
        client.update(str(archive.svn_local_path))   # apparently can't handle unicode
        logger.debug('svn update %s in %f sec' % (archive.slug, time.time() - timer))

    # return list of recent xml files from the working copy
    return svn_xml_files(archive)
Пример #5
0
                # any exception on prep is most likely ark generation
                return HttpResponseServerError('Failed to prep the document: ' + str(e))

    # on GET, display the xml and make available for download
    if request.method == 'GET':
        response = HttpResponse(prepped_xml, content_type='application/xml')
        response['Content-Disposition'] = "attachment; filename=%s" % filename
        return response

    # on POST, save to file and commit to subversion
    if request.method == 'POST':
        file_path = os.path.join(arch.svn_local_path, filename)
        with open(file_path, 'w') as xmlfile:
            xmlfile.write(prepped_xml)

        svn = svn_client()
        # seems to be the only way to set a commit log message via client
        def get_log_message(arg):
            # argument looks something like this:
            # [('foo', 'https://svn.library.emory.edu/svn/dev_ead-eua/trunk/eua0081affirmationvietnam.xml', 6, None, 4)]
            # ignoring since we will only use this function for a single commit
            return 'prepared EAD via FindingAids website admin, saved on behalf of %s' % request.user

        svn.log_msg_func = get_log_message
        saved = svn.commit(str(file_path))  # has to be string and not unicode
        # commit returns something like this on success:
        # (8, '2013-11-13T18:19:00.191382Z', 'keep')
        # revision number, date, user
        # returns nothing if there were no changes to commit

        if saved:
Пример #6
0
    def handle(self, *args, **options):
        verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        v_normal = 1
        v_all = 2

        if options['pdf_only'] and options['skip_pdf_reload']:
            raise CommandError("Options -s and -p are not compatible")

        # check for required settings
        if not hasattr(settings, 'EXISTDB_ROOT_COLLECTION') or not settings.EXISTDB_ROOT_COLLECTION:
            raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")
            return

        if len(args):
            files = args
        else:
            # Note: copied from prep_ead manage command; move somewhere common?
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

        if verbosity == v_all:
            print 'Documents will be loaded to configured eXist collection: %s' \
                    % settings.EXISTDB_ROOT_COLLECTION
            if options['skip_pdf_reload']:
                print "** Skipping PDFs cache reload"

        db = ExistDB()

        loaded = 0
        errored = 0
        pdf_tasks = {}

        start_time = datetime.now()

        if not options['pdf_only']:
        # unless PDF reload only has been specified, load files

            for file in files:
                try:
                    # full path location where file will be loaded in exist db collection
                    dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file)
                    errors = check_ead(file, dbpath)
                    if errors:
                        # report errors, don't load
                        errored += 1
                        print "Error: %s does not pass publication checks; not loading to eXist." % file
                        if verbosity >= v_normal:
                            print "  Errors found:"
                            for err in errors:
                                print "    %s" % err
                    else:
                        with open(file, 'r') as eadfile:
                            success = db.load(eadfile, dbpath, overwrite=True)

                        if success:
                            loaded += 1
                            if verbosity >= v_normal:
                                print "Loaded %s" % file
                            # load the file as a FindingAid object to get the eadid for PDF reload
                            ead = load_xmlobject_from_file(file, FindingAid)

                            # trigger PDF regeneration in the cache and store task result
                            # - unless user has requested PDF reload be skipped
                            if not options['skip_pdf_reload']:
                                pdf_tasks[ead.eadid.value] = reload_cached_pdf.delay(ead.eadid.value)
                                # NOTE: unlike the web admin publish, this does not
                                # generate TaskResult db records; task outcomes will be
                                # checked & reported before the script finishes
                        else:
                            errored += 1
                            print "Error: failed to load %s to eXist" % file
                except ExistDBException, e:
                    print "Error: failed to load %s to eXist" % file
                    print e.message()
                    errored += 1

            # output a summary of what was done
            print "%d document%s loaded" % (loaded, 's' if loaded != 1 else '')
            print "%d document%s with errors" % (errored, 's' if errored != 1 else '')
Пример #7
0
    def handle(self, *args, **options):
        """Convert plain-text digitized-content ids in EAD unittitles to <dao> tags.

        For each file named on the command line (or every ``*.xml`` file in
        each Archive's svn working copy when no files are given), scan file
        items for unittitles matching ``self.has_digitized_content``, look
        each extracted id up in the Keep via Solr, strip the plain-text id
        note from the unittitle, and append a ``<dao>`` element per id.
        Changed files are rewritten in place (unless ``--dryrun``) and
        optionally committed to subversion (``--commit``).
        """
        verbosity = int(options.get('verbosity', self.v_normal))
        svn_commit = options.get('commit', False)
        dry_run = options.get('dryrun', False)

        # check for required settings
        if not hasattr(settings, 'KEEP_SOLR_SERVER_URL') or not settings.KEEP_SOLR_SERVER_URL:
            raise CommandError("KEEP_SOLR_SERVER_URL setting is required for this script")
            return  # NOTE(review): unreachable - the raise above already exits

        solr = solr_interface()

        if verbosity > self.v_normal:
            print "Preparing documents from all defined Archives"
            if dry_run:
                print "Running in dry-run mode; no changes will be made"

        # counters for the summary at the end
        updated = 0
        unchanged = 0
        errored = 0

        if len(args):
            files = args
        else:
            # Note: copied from prep_ead manage command; move somewhere common?
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))


        for file in files:
            # per-file counts: items with digitized content, and daos added
            file_items = 0
            daos = 0
            try:
                if verbosity >= self.v_normal and len(files) > 1:
                    self.stdout.write('\nProcessing %s' % os.path.basename(file))

                ead = load_xmlobject_from_file(file, FindingAid)
                orig_xml = ead.serializeDocument()  # keep to check if changed

                for c in self.ead_file_items(ead):
                    # if item already contains any dao tags, skip it (no further processing needed)
                    if c.did.dao_list:
                        continue

                    match = self.has_digitized_content(unicode(c.did.unittitle))
                    if match:
                        file_items += 1
                        try:
                            id_list = self.id_list(match.groupdict()['ids'])
                        except Exception as e:
                            self.stdout.write('Error parsing ids from "%s" : %s' % \
                                              (unicode(c.did.unittitle), e))
                            continue

                        # if no ids were found even though title seemed to have digitized content,
                        # error and skip to next
                        if not id_list:
                            self.stdout.write('Appears to have digitized content, but no ids found in "%s"' % \
                                              (unicode(c.did.unittitle)))
                            continue

                        # dictionary for any Keep info corresponding to these ids
                        id_info = {}

                        # look up each id in the Keep
                        for i in id_list:
                            # match either a dm1 id or a pid; only use the
                            # result when exactly one record matches
                            q = solr.query(solr.Q(dm1_id="%s" % i) | solr.Q(pid="emory:%s" % i)) \
                                    .field_limit(['ark_uri', 'pid'])
                            if q.count() == 1:
                                id_info[i] = q[0]

                        # remove the plain-text digitized ids from unittitle content
                        # (handle as unicode to preserve any special characters)
                        # NOTE: because unittitle could contain nested tags (dates,
                        # titles, names, etc), iterate through the text nodes and
                        # remove the digitized note wherever it occurs
                        # - use lxml smart strings to update based on parent nodes
                        text_nodes = c.did.unittitle.node.xpath('text()')
                        for txt in text_nodes:
                            updated_txt = re.sub(self.digitized_ids, u'', txt)
                            if txt.is_text:
                                txt.getparent().text = updated_txt
                            else:
                                txt.getparent().tail = updated_txt

                        # ensure document has xlink namespace declared at the top
                        # or else it will be repeated for each dao

                        for i in id_list:
                            info = id_info.get(i, None)
                            # append a new dao for each id; audience will always be internal
                            dao_opts = {'audience': 'internal'}
                            href = None

                            if info:
                                # in some cases in production, a record is found but no
                                # ark_uri is indexed in solr (indicates ark_uri not in MODS)
                                try:
                                    href = info['ark_uri']
                                except KeyError:
                                    self.stdout.write('Warning: Keep record was found for %s but no ARK URI is indexed' \
                                        % i)

                            # if no record was found, *should* be a digital masters id
                            if href is None:
                                # if id already starts with dm, don't duplicate the prefix
                                if i.startswith('dm'):
                                    dao_opts['id'] = i
                                # if it's a digit, add dm prefix
                                elif i.isdigit():
                                    dao_opts['id'] = 'dm%s' % i
                                # otherwise, warn and add the id in pid notation
                                else:
                                    # only warn if we didn't already warn about info without ark uri
                                    if not info:
                                        self.stdout.write('Warning: non-digital masters id %s not found in the Keep' \
                                                           % i)
                                    # generate an ark anyway, since pids don't make valid ids
                                    href = 'http://pid.emory.edu/ark:/25593/%s' % i

                            c.did.dao_list.append(eadmap.DigitalArchivalObject(**dao_opts))
                            if href is not None:
                                c.did.dao_list[-1].href = href
                            # clean up any extra namespaces (exist-db ns)
                            cleanup_namespaces(c.did.dao_list[-1].node)

                            daos += 1

                # NOTE: could use pretty=True, but not used elsewhere in fa_admin,
                # so leaving off for consistency
                if orig_xml == ead.serializeDocument():
                    if verbosity > self.v_normal:
                        self.stdout.write("No changes made to %s" % file)
                    unchanged += 1
                else:
                    # in dry run, don't actually change the file
                    if not dry_run:
                        with open(file, 'w') as f:
                            ead.serializeDocument(f)
                    if verbosity >= self.v_normal:
                        self.stdout.write("Updated %s; found %d item%s with digitized content, added %d <dao>%s" \
                            % (file, file_items, 's' if file_items != 1 else '',
                               daos, 's' if daos != 1 else ''))
                    updated += 1

            except XMLSyntaxError:
                # xml is not well-formed
                self.stdout.write("Error: failed to load %s (document not well-formed XML?)" \
                                  % file)
                errored += 1
            # except Exception, e:
            #     # catch any other exceptions
            #     print "Error: failed to update %s : %s" % (file, e)
            #     errored += 1

        # TODO: might be nice to also report total number of daos added

        # summary of what was done
        self.stdout.write("\n%d document%s updated" % (updated, 's' if updated != 1 else ''))
        self.stdout.write("%d document%s unchanged" % (unchanged, 's' if unchanged != 1 else ''))
        self.stdout.write("%d document%s with errors" % (errored, 's' if errored != 1 else ''))

        if svn_commit:
            svn = svn_client()
            # seems to be the only way to set a commit log message via client
            def get_log_message(arg):
                # argument looks something like this:
                # [('foo', 'https://svn.library.emory.edu/svn/dev_ead-eua/trunk/eua0081affirmationvietnam.xml', 6, None, 4)]
                # ignoring since we will only use this function for a single commit
                return 'converted digitized item ids to <dao> tags'

            svn.log_msg_func = get_log_message

            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.commit(str(archive.svn_local_path))
Пример #8
0
    def handle(self, *args, **options):
        verbosity = int(options['verbosity'])

        self._setup_logging(verbosity)

        # check for required settings
        if not hasattr(settings, 'EXISTDB_ROOT_COLLECTION') or not settings.EXISTDB_ROOT_COLLECTION:
            raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")
            return


        if verbosity == self.v_all:
            print "Preparing documents from all defined Archives"

        updated = 0
        unchanged = 0
        errored = 0

        if len(args):
            files = args
        else:
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

        for file in files:
            try:
                ead = load_xmlobject_from_file(file, FindingAid)
                orig_xml = ead.serializeDocument(pretty=True)
                ead = utils.prep_ead(ead, file)
                # sanity check before saving
                dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file)
                errors = utils.check_ead(file, dbpath, xml=ead.serializeDocument())
                if errors:
                    errored += 1
                    print "Prepared EAD for %s does not pass sanity checks, not saving." % file
                    if verbosity >= self.v_normal:
                        print "Errors found:"
                        for err in errors:
                            # some errors include a list of error instances - display nicely
                            if isinstance(err, list):
                                for suberr in err:
                                    print "    %s" % suberr
                            else:
                                print "  %s" % err
                elif orig_xml == ead.serializeDocument(pretty=True):
                    if verbosity >= self.v_normal:
                        print "No changes made to %s" % file
                    unchanged += 1
                else:
                    with open(file, 'w') as f:
                        ead.serializeDocument(f, pretty=True)
                    if verbosity >= self.v_normal:
                        print "Updated %s" % file
                    updated += 1
            except XMLSyntaxError, e:
                # xml is not well-formed
                print "Error: failed to load %s (document not well-formed XML?)" \
                            % file
                errored += 1
            except Exception, e:
                # catch any other exceptions
                print "Error: failed to prep %s : %s" % (file, e)
                errored += 1