Example #1
 def test_src_to_dst(self):
     m=Mapper('http://e.org/p','/tmp/q')
     self.assertEqual( m.src_to_dst('http://e.org/p'), '/tmp/q')
     self.assertEqual( m.src_to_dst('http://e.org/p/aa'), '/tmp/q/aa')
     self.assertEqual( m.src_to_dst('http://e.org/p/aa/bb'), '/tmp/q/aa/bb')
     self.assertEqual( m.src_to_dst('http://e.org/p/aa/bb/'), '/tmp/q/aa/bb/')
     self.assertEqual( m.src_to_dst('http://e.org/pa'), '/tmp/qa') # FIXME: should raise an error; 'http://e.org/pa' is not under the mapped prefix
Example #2
 def test01_mapper_src_to_dst(self):
     m=Mapper( ['http://e.org/p/','/tmp/q/'] )
     self.assertEqual( m.src_to_dst('http://e.org/p/'), '/tmp/q/')
     self.assertEqual( m.src_to_dst('http://e.org/p/aa'), '/tmp/q/aa')
     self.assertEqual( m.src_to_dst('http://e.org/p/aa/bb'), '/tmp/q/aa/bb')
     self.assertEqual( m.src_to_dst('http://e.org/p/aa/bb/'), '/tmp/q/aa/bb/')
     self.assertRaises( MapperError, m.src_to_dst, 'http://e.org/p' )
     self.assertRaises( MapperError, m.src_to_dst, 'http://e.org/pa' )
     self.assertRaises( MapperError, m.src_to_dst, 'nomatch' )
Example #3
 def test01_mapper_src_to_dst(self):
     m = Mapper(['http://e.org/p/', '/tmp/q/'])
     self.assertEqual(m.src_to_dst('http://e.org/p/'), '/tmp/q/')
     self.assertEqual(m.src_to_dst('http://e.org/p/aa'), '/tmp/q/aa')
     self.assertEqual(m.src_to_dst('http://e.org/p/aa/bb'), '/tmp/q/aa/bb')
     self.assertEqual(m.src_to_dst('http://e.org/p/aa/bb/'),
                      '/tmp/q/aa/bb/')
     self.assertRaises(MapperError, m.src_to_dst, 'http://e.org/p')
     self.assertRaises(MapperError, m.src_to_dst, 'http://e.org/pa')
     self.assertRaises(MapperError, m.src_to_dst, 'nomatch')
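
Examples #2 and #3 pin down the Mapper contract: src_to_dst() performs a literal prefix rewrite from the source URI base to the destination path base, and raises MapperError for any URI that does not fall under the source prefix. The sketch below is a hypothetical single-pair mapper illustrating that contract (the real resync Mapper accepts a list of mappings and normalizes them); it also shows why the last assertion in Example #1 is flagged as a bug, since without the trailing slash 'http://e.org/pa' matches the prefix 'http://e.org/p'.

class MapperError(Exception):
    pass

class SimplePrefixMapper(object):
    """Hypothetical one-pair mapper showing the prefix-rewrite semantics."""

    def __init__(self, src_uri, dst_path):
        # Trailing slashes on both prefixes prevent partial matches such as
        # 'http://e.org/pa' against 'http://e.org/p'.
        self.src_uri = src_uri
        self.dst_path = dst_path

    def src_to_dst(self, uri):
        if not uri.startswith(self.src_uri):
            raise MapperError("No mapping for %s" % uri)
        return self.dst_path + uri[len(self.src_uri):]

m = SimplePrefixMapper('http://e.org/p/', '/tmp/q/')
assert m.src_to_dst('http://e.org/p/aa/bb') == '/tmp/q/aa/bb'
# 'http://e.org/pa' and 'nomatch' both raise MapperError, as in Example #2.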
Example #4
    def sync_or_audit(self, src_uri, dst_path, allow_deletion=False, 
                      audit_only=False):
        ### 1. Get inventories from both src and dst
        # 1.a source inventory
        ib = InventoryBuilder()
        try:
            src_inventory = ib.get(src_uri)
        except IOError as e:
            raise ClientFatalError("Can't read source inventory (%s)" % str(e))
        if (self.verbose):
            print "Read src inventory from %s, %d resources listed" % (src_uri,len(src_inventory))
        if (len(src_inventory)==0):
            raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_inventory.has_md5()):
            self.checksum=False
            print "Not calculating checksums on destination as not present in source inventory"
        # 1.b destination inventory mapped back to source URIs
        segments = src_uri.split('/')
        segments.pop()
        url_prefix='/'.join(segments)
        ib.do_md5=self.checksum
        dst_inventory = ib.from_disk(dst_path,url_prefix)
        ### 2. Compare these inventories respecting any comparison options
        (num_same,changed,deleted,added)=dst_inventory.compare(src_inventory)   
        ### 3. Report status and planned actions
        status = "  IN SYNC  "
        if (len(changed)>0 or len(deleted)>0 or len(added)>0):
            status = "NOT IN SYNC"
        print "Status: %s (same=%d, changed=%d, deleted=%d, added=%d)" %\
              (status,num_same,len(changed),len(deleted),len(added))

        if (audit_only):
            return
        ### 4. Grab files to do sync
        mapper = Mapper(url_prefix,dst_path)
        for uri in changed:
            file = mapper.src_to_dst(uri)
            if (self.verbose):
                print "changed: %s -> %s" % (uri,file)
            self.update_resource(uri,file,src_inventory.resources[uri].timestamp)
        for uri in added:
            file = mapper.src_to_dst(uri)
            if (self.verbose):
                print "added: %s -> %s" % (uri,file)
            self.update_resource(uri,file,src_inventory.resources[uri].timestamp)
        for uri in deleted:
            if (allow_deletion):
                file = mapper.src_to_dst(uri)
                if (self.verbose):
                    print "deleted: %s -> %s" % (uri,file)
                os.unlink(file)
            else:
                if (self.verbose):
                    print "would delete %s (--delete to enable)" % uri
Example #5
class Client():
    """Implementation of a ResourceSync client"""

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.mapper = None
        self.sitemap_name = 'sitemap.xml'
        self.dump_format = None
        self.allow_multifile = False
        self.max_sitemap_entries = None

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object"""
        if (self.mapper is None):
            raise ClientFatalError("No mappings specified")
        return(self.mapper.mappings)

#    @mappings.setter
    def set_mappings(self,mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings)

    @property
    def sitemap(self):
        """Return the sitemap URI base on maps or explicit settings"""
        if (re.match(r"\w+:",self.sitemap_name)):
            # looks like URI
            return(self.sitemap_name)
        elif (re.match(r"/",self.sitemap_name)):
            # looks like full path
            return(self.sitemap_name)
        else:
            # build from mapping with name appended
            return(self.mappings[0].src_uri + '/' + self.sitemap_name)

    @property
    def inventory(self):
        """Return inventory on disk based on current mappings

        Return inventory. Uses existing self.mapper settings.
        """
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Build from disk
        ib = InventoryBuilder(do_md5=self.checksum,verbose=self.verbose,mapper=self.mapper)
        return( ib.from_disk() )

    def sync_or_audit(self, allow_deletion=False, audit_only=False):
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst 
        # 1.a source inventory
        ib = InventoryBuilder(verbose=self.verbose,mapper=self.mapper)
        try:
            if (self.verbose):
                print "Reading sitemap %s ..." % (self.sitemap)
            src_inventory = ib.get(self.sitemap)
        except IOError as e:
            raise ClientFatalError("Can't read source inventory from %s (%s)" % (self.sitemap,str(e)))
        if (self.verbose):
            print "Read source inventory, %d resources listed" % (len(src_inventory))
        if (len(src_inventory)==0):
            raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_inventory.has_md5()):
            self.checksum=False
            print "Not calculating checksums on destination as not present in source inventory"
        # 1.b destination inventory mapped back to source URIs
        ib.do_md5=self.checksum
        dst_inventory = ib.from_disk()
        ### 2. Compare these inventories respecting any comparison options
        (num_same,updated,deleted,created)=dst_inventory.compare(src_inventory)   
        ### 3. Report status and planned actions
        status = "  IN SYNC  "
        if (len(updated)>0 or len(deleted)>0 or len(created)>0):
            status = "NOT IN SYNC"
        print "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %\
              (status,num_same,len(updated),len(deleted),len(created))

        if (audit_only):
            return
        ### 4. Grab files to do sync
        for uri in updated:
            file = self.mapper.src_to_dst(uri)
            if (self.verbose):
                print "updated: %s -> %s" % (uri,file)
            self.update_resource(uri,file,src_inventory.resources[uri].timestamp)
        for uri in created:
            file = self.mapper.src_to_dst(uri)
            self.update_resource(uri,file,src_inventory.resources[uri].timestamp)
        for uri in deleted:
            if (allow_deletion):
                file = self.mapper.src_to_dst(uri)
                if (self.dryrun):
                    print "dryrun: would delete %s -> %s" % (uri,file)
                else:
                    os.unlink(file)
                    if (self.verbose):
                        print "deleted: %s -> %s" % (uri,file)
            else:
                if (self.verbose):
                    print "nodelete: would delete %s (--delete to enable)" % uri

    def update_resource(self, uri, file, timestamp=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET resources
        2. set mtime to be equal to timestamp (should probably use Last-Modified
        from the GET response instead, and warn if it differs from, or is just
        earlier than, the lastmod we expected from the inventory)
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        if (self.dryrun):
            print "dryrun: would GET %s --> %s" % (uri,file)
        else:
            urllib.urlretrieve(uri,file)
            if (self.verbose):
                print "created: %s -> %s" % (uri,file)
            if (timestamp is not None):
                unixtime=int(timestamp) #get rid of any fractional seconds
                os.utime(file,(unixtime,unixtime))

    def parse_sitemap(self):
        s=Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile)
        if (self.verbose):
            print "Reading sitemap(s) from %s ..." % (sitemap)
        i = s.read(sitemap)
        num_entries = len(i)
        print "Read sitemap with %d entries in %d sitemaps" % (num_entries,s.sitemaps_created)
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries>to_show):
                print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
            n=0
            for r in i.resource_uris():
                print i.resources[r]
                n+=1
                if ( n >= to_show ):
                    break

    def write_sitemap(self,outfile=None,capabilities=None,dump=None):
        # Set up base_path->base_uri mappings, get inventory from disk
        i = self.inventory
        i.capabilities = capabilities
        s=Sitemap(verbose=self.verbose, pretty_xml=True, allow_multifile=self.allow_multifile,
                  mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print s.inventory_as_xml(i)
        else:
            s.write(i,basename=outfile)
        self.write_dump_if_requested(i,dump)

    def changeset_sitemap(self,outfile=None,ref_sitemap=None,capabilities=None,
                          dump=None):
        # 1. Get and parse reference sitemap
        rs = Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile, 
                     mapper=self.mapper)
        if (self.verbose):
            print "Reading sitemap(s) from %s ..." % (ref_sitemap)
        ri = rs.read(ref_sitemap)
        num_entries = len(ri)
        print "Read reference sitemap with %d entries in %d sitemaps" % (num_entries,rs.sitemaps_created)
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries>to_show):
                print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
            n=0
            for r in ri.resource_uris():
                print ri.resources[r]
                n+=1
                if ( n >= to_show ):
                    break
        # 2. Set up base_path->base_uri mappings, get inventory from disk
        disk_inventory = self.inventory
        # 3. Calculate changeset
        (num_same,updated,deleted,created)=ri.compare(disk_inventory)   
        changeset = Inventory()
        changeset.capabilities = capabilities
        changeset.add( disk_inventory.changeset( updated, changetype='updated' ) )
        changeset.add( ri.changeset( deleted, changetype='deleted' ) )
        changeset.add( disk_inventory.changeset( created, changetype='created' ) )
        # 4. Write out changeset
        s = Sitemap(verbose=self.verbose, pretty_xml=True, allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print s.inventory_as_xml(changeset)
        else:
            s.write(changeset,basename=outfile)
        self.write_dump_if_requested(changeset,dump)

    def write_dump_if_requested(self,inventory,dump):
        if (dump is None):
            return
        if (self.verbose):
            print "Writing dump to %s..." % (dump)
        d = Dump(format=self.dump_format)
        d.write(inventory=inventory,dumpfile=dump)
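
A usage sketch for this client (hypothetical URI and path; assumes the resync modules imported by the class are available):

client = Client(checksum=True, verbose=True)
client.set_mappings(['http://example.org/data/', '/tmp/mirror/'])
# client.sitemap is derived from the first mapping plus sitemap_name
client.sync_or_audit(audit_only=True)        # report status only
client.sync_or_audit(allow_deletion=True)    # sync, deleting where needed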
Example #6
    def sync_or_audit(self,
                      src_uri,
                      dst_path,
                      allow_deletion=False,
                      audit_only=False):
        ### 1. Get inventories from both src and dst
        # 1.a source inventory
        ib = InventoryBuilder()
        try:
            src_inventory = ib.get(src_uri)
        except IOError as e:
            raise ClientFatalError("Can't read source inventory (%s)" % str(e))
        if (self.verbose):
            print "Read src inventory from %s, %d resources listed" % (
                src_uri, len(src_inventory))
        if (len(src_inventory) == 0):
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if (self.checksum and not src_inventory.has_md5()):
            self.checksum = False
            print "Not calculating checksums on destination as not present in source inventory"
        # 1.b destination inventory mapped back to source URIs
        segments = src_uri.split('/')
        segments.pop()
        url_prefix = '/'.join(segments)
        ib.do_md5 = self.checksum
        dst_inventory = ib.from_disk(dst_path, url_prefix)
        ### 2. Compare these inventories respecting any comparison options
        (num_same, changed, deleted,
         added) = dst_inventory.compare(src_inventory)
        ### 3. Report status and planned actions
        status = "  IN SYNC  "
        if (len(changed) > 0 or len(deleted) > 0 or len(added) > 0):
            status = "NOT IN SYNC"
        print "Status: %s (same=%d, changed=%d, deleted=%d, added=%d)" %\
              (status,num_same,len(changed),len(deleted),len(added))

        if (audit_only):
            return
        ### 4. Grab files to do sync
        mapper = Mapper(url_prefix, dst_path)
        for uri in changed:
            file = mapper.src_to_dst(uri)
            if (self.verbose):
                print "changed: %s -> %s" % (uri, file)
            self.update_resource(uri, file,
                                 src_inventory.resources[uri].timestamp)
        for uri in added:
            file = mapper.src_to_dst(uri)
            if (self.verbose):
                print "added: %s -> %s" % (uri, file)
            self.update_resource(uri, file,
                                 src_inventory.resources[uri].timestamp)
        for uri in deleted:
            if (allow_deletion):
                file = mapper.src_to_dst(uri)
                if (self.verbose):
                    print "deleted: %s -> %s" % (uri, file)
                os.unlink(file)
            else:
                if (self.verbose):
                    print "would delete %s (--delete to enable)" % uri
Example #7
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """
    def __init__(self, checksum=False, verbose=False, dryrun=False):
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('resync.client')
        self.mapper = Mapper()
        self.resource_list_name = 'resourcelist.xml'
        self.change_list_name = 'changelist.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.sitemap_name = None
        self.allow_multifile = True
        self.noauth = False
        self.strictauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False
        self.pretty_xml = True
        # Default file names
        self.status_file = '.resync-client-status.cfg'
        self.default_resource_dump = 'resourcedump.zip'
        self.default_change_dump = 'changedump.zip'

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings, use_default_path=True)

    def sitemap_uri(self, basename):
        """Get full URI (filepath) for sitemap based on basename"""
        if (re.match(r"\w+:", basename)):
            # looks like URI
            return (basename)
        elif (re.match(r"/", basename)):
            # looks like full path
            return (basename)
        else:
            # build from mapping with name appended
            return (self.mapper.default_src_uri() + '/' + basename)
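    # Illustration of the three branches above (hypothetical values, assuming
    # default_src_uri() returns 'http://example.org/data'):
    #   sitemap_uri('http://a.org/sm.xml') -> 'http://a.org/sm.xml'   (URI)
    #   sitemap_uri('/var/sm.xml')         -> '/var/sm.xml'           (full path)
    #   sitemap_uri('resourcelist.xml')    -> 'http://example.org/data/resourcelist.xml'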

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        if (self.sitemap_name is not None):
            return (self.sitemap_name)
        return (self.sitemap_uri(self.resource_list_name))

    def build_resource_list(self, paths=None, set_path=False):
        """Return a resource list for files on local disk

        The set of files is taken by disk scan from the paths specified or
        else defaults to the paths specified in the current mappings

        paths - override paths from mappings if specified

        set_path - set true to set the path information for each resource 
            included. This is used to build a resource list as the basis
            for creating a dump.

        Return ResourceList. Uses existing self.mapper settings.
        """
        # 0. Sanity checks, parse paths if specified
        if (len(self.mapper) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        if (paths is not None):
            # Expect comma separated list of paths
            paths = paths.split(',')
        # 1. Build from disk
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        rlb.set_path = set_path
        rlb.add_exclude_files(self.exclude_patterns)
        rl = rlb.from_disk(paths=paths)
        # 2. Set defaults and overrides
        rl.allow_multifile = self.allow_multifile
        rl.pretty_xml = self.pretty_xml
        rl.mapper = self.mapper
        if (self.max_sitemap_entries is not None):
            rl.max_sitemap_entries = self.max_sitemap_entries
        return (rl)
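    # Example call (hypothetical paths): build_resource_list(paths='/data/a,/data/b')
    # scans only those two trees; with paths=None the mapping paths are used.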

    def log_event(self, change):
        """Log a Resource object as an event for automated analysis"""
        self.logger.debug("Event: " + repr(change))

    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchonization or audit

	Both functions implemented in this routine because audit is a prerequisite
	for a baseline sync. In the case of baseline sync the last timestamp seen
        is recorded as client state.
	"""
        action = ('audit' if (audit_only) else 'baseline sync')
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if (len(self.mapper) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        if (not audit_only and self.mapper.unsafe()):
            raise ClientFatalError(
                "Source to destination mappings unsafe: %s" % str(self.mapper))
        ### 1. Get inventories from both src and dst
        # 1.a source resource list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(
                allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source resource list from %s (%s)" %
                (self.sitemap, str(e)))
        self.logger.info("Read source resource list, %d resources listed" %
                         (len(src_resource_list)))
        if (len(src_resource_list) == 0):
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if (self.checksum and not src_resource_list.has_md5()):
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source resource list"
            )
        # 1.b destination resource list mapped back to source URIs
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource lists respecting any comparison options
        (same, updated, deleted,
         created) = dst_resource_list.compare(src_resource_list)
        ### 3. Report status and planned actions
        self.log_status(in_sync=(len(updated) + len(deleted) +
                                 len(created) == 0),
                        audit=True,
                        same=len(same),
                        created=len(created),
                        updated=len(updated),
                        deleted=len(deleted))
        if (audit_only or len(created) + len(updated) + len(deleted) == 0):
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        if (not self.noauth):
            uauth = UrlAuthority(self.sitemap, strict=self.strictauth)
            for resource in src_resource_list:
                if (not uauth.has_authority_over(resource.uri)):
                    raise ClientFatalError(
                        "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (self.sitemap, resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" %
                      len(deleted)) if (allow_deletion) else ''
        self.logger.warning("Will GET %d resources%s" %
                            (len(created) + len(updated), delete_msg))
        self.last_timestamp = 0
        num_created = 0
        num_updated = 0
        num_deleted = 0
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, file))
            num_created += self.update_resource(resource, file, 'created')
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, file))
            num_updated += self.update_resource(resource, file, 'updated')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            num_deleted += self.delete_resource(resource, file, allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if (not audit_only and self.last_timestamp > 0):
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(len(updated) + len(deleted) +
                                 len(created) == 0),
                        same=len(same),
                        created=num_created,
                        updated=num_updated,
                        deleted=num_deleted,
                        to_delete=len(deleted))
        self.logger.debug("Completed %s" % (action))

    def incremental(self,
                    allow_deletion=False,
                    change_list_uri=None,
                    from_datetime=None):
        """Incremental synchronization

        Use Change List to do incremental sync
        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if (len(self.mapper) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        if (self.mapper.unsafe()):
            raise ClientFatalError(
                "Source to destination mappings unsafe: %s" % str(self.mapper))
        from_timestamp = None
        if (from_datetime is not None):
            try:
                from_timestamp = str_to_datetime(from_datetime)
            except ValueError:
                raise ClientFatalError("Bad datetime in --from (%s)" %
                                       from_datetime)
        ### 1. Work out where to start from
        if (from_timestamp is None):
            from_timestamp = ClientState().get_state(self.sitemap)
            if (from_timestamp is None):
                raise ClientFatalError(
                    "Cannot do incremental sync. No stored timestamp for this site, and no explicit --from."
                )
        ### 2. Get URI of change list, from sitemap or explicit
        if (change_list_uri):
            # Translate as necessary using maps
            change_list = self.sitemap_uri(change_list_uri)
        else:
            # Try default name
            change_list = self.sitemap_uri(self.change_list_name)
        ### 3. Read change list from source
        try:
            self.logger.info("Reading change list %s" % (change_list))
            src_change_list = ChangeList()
            src_change_list.read(uri=change_list)
            self.logger.debug("Finished reading change list")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source change list from %s (%s)" %
                (change_list, str(e)))
        self.logger.info("Read source change list, %d changes listed" %
                         (len(src_change_list)))
        #if (len(src_change_list)==0):
        #    raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_change_list.has_md5()):
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source change list"
            )
        # Check all changes have timestamp and record last
        self.last_timestamp = 0
        for resource in src_change_list:
            if (resource.timestamp is None):
                raise ClientFatalError(
                    "Aborting - missing timestamp for change in %s" % (resource.uri))
            if (resource.timestamp > self.last_timestamp):
                self.last_timestamp = resource.timestamp
        ### 4. Check that the change list has authority over URIs listed
        # FIXME - What does authority mean for change list? Here use both the
        # change list URI and, if we used it, the sitemap URI
        if (not self.noauth):
            uauth_cs = UrlAuthority(change_list, self.strictauth)
            if (not change_list_uri):
                uauth_sm = UrlAuthority(self.sitemap)
                for resource in src_change_list:
                    if (not uauth_cs.has_authority_over(resource.uri) and
                        (change_list_uri
                         or not uauth_sm.has_authority_over(resource.uri))):
                        raise ClientFatalError(
                            "Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                            % (change_list, resource.uri))
        ### 5. Prune entries before starting timestamp and dupe changes for a resource
        num_skipped = src_change_list.prune_before(from_timestamp)
        if (num_skipped > 0):
            self.logger.info("Skipped %d changes before %s" %
                             (num_skipped, datetime_to_str(from_timestamp)))
        num_dupes = src_change_list.prune_dupes()
        if (num_dupes > 0):
            self.logger.info("Removed %d prior changes" % (num_dupes))
        # Review and log status before
        # FIXME - should at this stage prune the change list to pick out
        # only the last change for each resource
        to_update = 0
        to_create = 0
        to_delete = 0
        for resource in src_change_list:
            if (resource.change == 'updated'):
                to_update += 1
            elif (resource.change == 'created'):
                to_create += 1
            elif (resource.change == 'deleted'):
                to_delete += 1
            else:
                raise ClientError("Unknown change type %s" % (resource.change))
        # Log status based on what we know from the Change List. Exit if
        # either there are no changes or if there are only deletions and
        # we don't allow deletion
        in_sync = ((to_update + to_delete + to_create) == 0)
        self.log_status(in_sync=in_sync,
                        incremental=True,
                        created=to_create,
                        updated=to_update,
                        deleted=to_delete)
        if (in_sync or ((to_update + to_create) == 0 and not allow_deletion)):
            self.logger.debug("Completed incremental")
            return
        ### 6. Apply changes at same time or after from_timestamp
        delete_msg = (", and delete %d resources" %
                      to_delete) if (allow_deletion) else ''
        self.logger.warning("Will apply %d changes%s" %
                            (len(src_change_list), delete_msg))
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_change_list:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if (resource.change == 'updated'):
                self.logger.info("updated: %s -> %s" % (uri, file))
                self.update_resource(resource, file, 'updated')
                num_updated += 1
            elif (resource.change == 'created'):
                self.logger.info("created: %s -> %s" % (uri, file))
                self.update_resource(resource, file, 'created')
                num_created += 1
            elif (resource.change == 'deleted'):
                num_deleted += self.delete_resource(resource, file,
                                                    allow_deletion)
            else:
                raise ClientError("Unknown change type %s" % (resource.change))
        ### 7. Report status and planned actions
        self.log_status(incremental=True,
                        created=num_created,
                        updated=num_updated,
                        deleted=num_deleted,
                        to_delete=to_delete)
        ### 8. Record last timestamp we have seen
        if (self.last_timestamp > 0):
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))

        ### 9. Done
        self.logger.debug("Completed incremental sync")

    def update_resource(self, resource, file, change=None):
        """Update resource from uri to file on local system

        Update means three things:
        1. GET resource
        2. set mtime in local time to be equal to timestamp in UTC (should
        perhaps use Last-Modified from the GET response instead, or at least
        warn if it differs from, or is just earlier than, the lastmod we
        expected from the resource list)
        3. check that resource matches expected information

        Also update self.last_timestamp if the timestamp (in source frame) of this
        resource is later than the current value.

        Returns the number of resources updated/created (0 or 1)
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        num_updated = 0
        if (self.dryrun):
            self.logger.info("dryrun: would GET %s --> %s" %
                             (resource.uri, file))
        else:
            # 1. GET
            try:
                urllib.urlretrieve(resource.uri, file)
                num_updated += 1
            except IOError as e:
                msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
                if (self.ignore_failures):
                    self.logger.warning(msg)
                    return
                else:
                    raise ClientFatalError(msg)
            # 2. set timestamp if we have one
            if (resource.timestamp is not None):
                unixtime = int(resource.timestamp)  #no fractional
                os.utime(file, (unixtime, unixtime))
                if (resource.timestamp > self.last_timestamp):
                    self.last_timestamp = resource.timestamp
            self.log_event(Resource(resource=resource, change=change))
            # 3. sanity check
            length = os.stat(file).st_size
            if (resource.length != length):
                self.logger.info(
                    "Downloaded size for %s of %d bytes does not match expected %d bytes"
                    % (resource.uri, length, resource.length))
            if (self.checksum and resource.md5 is not None):
                file_md5 = compute_md5_for_file(file)
                if (resource.md5 != file_md5):
                    self.logger.info(
                        "MD5 mismatch for %s, got %s but expected %s" %
                        (resource.uri, file_md5, resource.md5))
        return (num_updated)

    def delete_resource(self, resource, file, allow_deletion=False):
        """Delete copy of resource in file on local system

        Will only actually do the deletion if allow_deletion is True. Regardless 
        of whether the deletion occurs, self.last_timestamp will be updated 
        if the resource.timestamp is later than the current value.

        Returns the number of files actually deleted (0 or 1).
        """
        num_deleted = 0
        uri = resource.uri
        if (resource.timestamp is not None
                and resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
        if (allow_deletion):
            if (self.dryrun):
                self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
            else:
                try:
                    os.unlink(file)
                    num_deleted += 1
                except OSError as e:
                    msg = "Failed to DELETE %s -> %s : %s" % (uri, file,
                                                              str(e))
                    #if (self.ignore_failures):
                    self.logger.warning(msg)
                    #    return
                    #else:
                    #    raise ClientFatalError(msg)
                self.logger.info("deleted: %s -> %s" % (uri, file))
                self.log_event(Resource(resource=resource, change="deleted"))
        else:
            self.logger.info("nodelete: would delete %s (--delete to enable)" %
                             uri)
        return (num_deleted)

    def parse_document(self):
        """Parse any ResourceSync document and show information
        
        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied.
        """
        s = Sitemap()
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        try:
            list = s.parse_xml(urllib.urlopen(self.sitemap))
        except IOError as e:
            raise ClientFatalError("Cannot read document (%s)" % str(e))
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if ('capability' in list.md):
            capability = list.md['capability']
        print "Parsed %s document with %d entries" % (capability, num_entries)
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries > to_show):
                print "Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str)
            n = 0
            for resource in list:
                print '[%d] %s' % (n, str(resource))
                n += 1
                if (n >= to_show):
                    break

    def explore(self):
        """Explore capabilities of a server interactvely
        
        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied.
        """
        uri = None
        if (self.sitemap_name is not None):
            uri = self.sitemap
            print "Taking location from --sitemap option"
            acceptable_capabilities = None  #ie. any
        elif (len(self.mapper) > 0):
            pu = urlparse.urlparse(self.mapper.default_src_uri())
            uri = urlparse.urlunparse(
                [pu[0], pu[1], '/.well-known/resourcesync', '', '', ''])
            print "Will look for discovery information based on mappings"
            acceptable_capabilities = ['capabilitylist', 'capabilitylistindex']
        else:
            raise ClientFatalError(
                "Neither explicit sitemap nor mapping specified")
        history = []
        inp = None
        checks = None
        while (inp != 'q'):
            print
            if (inp == 'b'):
                if (len(history) < 2):
                    break  #can't do this, exit
                history.pop()  #throw away current
                uri = history.pop()
                acceptable_capabilities = None
            history.append(uri)
            (uri, checks, acceptable_capabilities,
             inp) = self.explore_uri(uri, checks, acceptable_capabilities,
                                     len(history) > 1)
        print "--explore done, bye..."

    def explore_uri(self, uri, checks, caps, show_back=True):
        """Interactive exploration of document at uri

        Will flag warnings if the document is not of type listed in caps
        """
        s = Sitemap()
        print "Reading %s" % (uri)
        options = {}
        capability = None
        try:
            if (caps == 'resource'):
                self.explore_show_head(uri, check_headers=checks)
            else:
                list = s.parse_xml(urllib.urlopen(uri))
                (options, capability) = self.explore_show_summary(
                    list, s.parsed_index, caps)
        except IOError as e:
            print "Cannot read %s (%s)\nGoing back" % (uri, str(e))
            return ('', '', '', 'b')
        except Exception as e:
            print "Cannot parse %s (%s)\nGoing back" % (uri, str(e))
            return ('', '', '', 'b')
        while (True):
            # don't offer number option for no resources/capabilities
            num_prompt = '' if (len(options) == 0) else 'number, '
            up_prompt = 'b(ack), ' if (show_back) else ''
            inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
            if (inp in options.keys()):
                break
            if (inp == 'q' or inp == 'b'):
                return ('', '', '', inp)
        checks = {}
        if (options[inp].capability is None):
            if (capability == 'capabilitylistindex'):
                # all links should be to capabilitylist documents
                caps = ['capabilitylist']
            elif (capability in [
                    'resourcelist', 'changelist', 'resourcedump', 'changedump'
            ]):
                caps = 'resource'
        else:
            r = options[inp]
            caps = [r.capability]
            if (r.length is not None):
                checks['content-length'] = r.length
            if (r.lastmod is not None):
                checks['last-modified'] = r.lastmod
            # FIXME - could do sanity check here and issue warnings if odd
        return (options[inp].uri, checks, caps, inp)

    def explore_show_summary(self, list, parsed_index, caps):
        """Show summary of one capability document

        Used as part of --explore.
        FIXME - should look for <rs:ln rel="up"...> link and show that
        """
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if ('capability' in list.md):
            capability = list.md['capability']
        if (parsed_index):
            capability += 'index'
        print "Parsed %s document with %d entries:" % (capability, num_entries)
        if (caps is not None and capability not in caps):
            print "WARNING - expected a %s document" % (','.join(caps))
        to_show = num_entries
        if (num_entries > 21):
            to_show = 20
        # What entries are allowed?
        # FIXME - not complete
        entry_caps = []
        if (capability == 'capabilitylistindex'):
            entry_caps = ['capabilitylist']
        elif (capability == 'capabilitylist'):
            entry_caps = [
                'resourcelist', 'changelist', 'resourcedump', 'changedump',
                'changelistindex'
            ]
        elif (capability == 'changelistindex'):
            entry_caps = ['changelist']
        options = {}
        n = 0
        if ('up' in list.ln):
            options['up'] = list.ln['up']
            print "[%s] %s" % ('up', list.ln['up'].uri)
        for r in list.resources:
            if (n >= to_show):
                print "(not showing remaining %d entries)" % (num_entries - n)
                break
            n += 1
            options[str(n)] = r
            print "[%d] %s" % (n, r.uri)
            if (r.capability is not None):
                warning = ''
                if (r.capability not in entry_caps):
                    warning = " (EXPECTED %s)" % (' or '.join(entry_caps))
                print "  %s%s" % (r.capability, warning)
            elif (len(entry_caps) == 1):
                r.capability = entry_caps[0]
                print "  capability not specified, should be %s" % (
                    r.capability)
        return (options, capability)

    def explore_show_head(self, uri, check_headers=None):
        """Do HEAD on uri and show infomation

        Will also check headers against any values specified in 
        check_headers.
        """
        print "HEAD %s" % (uri)
        response = requests.head(uri)
        print "  status: %s" % (response.status_code)
        # generate normalized lastmod
        #        if ('last-modified' in response.headers):
        #            response.headers.add['lastmod'] = datetime_to_str(str_to_datetime(response.headers['last-modified']))
        # print some of the headers
        for header in [
                'content-length', 'last-modified', 'lastmod', 'content-type',
                'etag'
        ]:
            if header in response.headers:
                check_str = ''
                if (check_headers is not None and header in check_headers):
                    if (response.headers[header] == check_headers[header]):
                        check_str = ' MATCHES EXPECTED VALUE'
                    else:
                        check_str = ' EXPECTED %s' % (check_headers[header])
                print "  %s: %s%s" % (header, response.headers[header],
                                      check_str)

    def write_resource_list(self,
                            paths=None,
                            outfile=None,
                            links=None,
                            dump=None):
        """Write a Resource List or a Resource Dump for files on local disk

        Set of resources included is based on paths setting or else the mappings. 
        Optionally links can be added. Output will be to stdout unless outfile
        is specified.
        
        If dump is true then a Resource Dump is written instead of a Resource
        List. If outfile is not set then self.default_resource_dump will be used.
        """
        rl = self.build_resource_list(paths=paths, set_path=dump)
        if (links is not None):
            rl.ln = links
        if (dump):
            if (outfile is None):
                outfile = self.default_resource_dump
            self.logger.info("Writing resource dump to %s..." % (dump))
            d = Dump(format=self.dump_format)
            d.write(resource_list=rl, dumpfile=outfile)
        else:
            if (outfile is None):
                try:
                    print rl.as_xml()
                except ListBaseIndexError as e:
                    raise ClientFatalError(
                        "%s. Use --output option to specify base name for output files."
                        % str(e))
            else:
                rl.write(basename=outfile)

    def write_change_list(self,
                          paths=None,
                          outfile=None,
                          ref_sitemap=None,
                          newref_sitemap=None,
                          empty=None,
                          links=None,
                          dump=None):
        """Write a change list
        
        Unless both ref_sitemap and newref_sitemap are specified, the Change
        List is calculated between the reference and the current state of files
        on disk. The files on disk are scanned based either on the paths setting
        or else on the mappings.
        """
        cl = ChangeList(ln=links)
        if (not empty):
            # 1. Get and parse reference sitemap
            old_rl = self.read_reference_resource_list(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either read that
            # or build resource list from files on disk
            if (newref_sitemap is None):
                # Get resource list from disk
                new_rl = self.build_resource_list(paths=paths, set_path=dump)
            else:
                new_rl = self.read_reference_resource_list(
                    newref_sitemap, name='new reference')
            # 3. Calculate change list
            (same, updated, deleted, created) = old_rl.compare(new_rl)
            cl.add_changed_resources(updated, change='updated')
            cl.add_changed_resources(deleted, change='deleted')
            cl.add_changed_resources(created, change='created')
        # 4. Write out change list
        cl.mapper = self.mapper
        cl.pretty_xml = self.pretty_xml
        if (self.max_sitemap_entries is not None):
            cl.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print cl.as_xml()
        else:
            cl.write(basename=outfile)
        self.write_dump_if_requested(cl, dump)

    def write_capability_list(self,
                              capabilities=None,
                              outfile=None,
                              links=None):
        """Write a Capability List to outfile or STDOUT"""
        capl = CapabilityList(ln=links)
        capl.pretty_xml = self.pretty_xml
        if (capabilities is not None):
            for name in capabilities.keys():
                capl.add_capability(name=name, uri=capabilities[name])
        if (outfile is None):
            print capl.as_xml()
        else:
            capl.write(basename=outfile)

    def write_source_description(self,
                                 capability_lists=None,
                                 outfile=None,
                                 links=None):
        """Write a ResourceSync Description document to outfile or STDOUT"""
        rsd = SourceDescription(ln=links)
        rsd.pretty_xml = self.pretty_xml
        if (capability_lists is not None):
            for uri in capability_lists:
                rsd.add_capability_list(uri)
        if (outfile is None):
            print rsd.as_xml()
        else:
            rsd.write(basename=outfile)

    def write_dump_if_requested(self, resource_list, dump):
        """Write a dump to the file dump"""
        if (dump is None):
            return

    def read_reference_resource_list(self, ref_sitemap, name='reference'):
        """Read reference resource list and return the ResourceList object

        The name parameter is used only in output messages to say what type
        of resource list is being read.
        """
        rl = ResourceList()
        self.logger.info("Reading reference %s resource list from %s ..." %
                         (name, ref_sitemap))
        rl.mapper = self.mapper
        rl.read(uri=ref_sitemap, index_only=(not self.allow_multifile))
        num_entries = len(rl.resources)
        self.logger.info(
            "Read %s resource list with %d entries in %d sitemaps" %
            (name, num_entries, rl.num_files))
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries > to_show):
                print "Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str)
            n = 0
            for r in rl.resources:
                print r
                n += 1
                if (n >= to_show):
                    break
        return (rl)

    def log_status(self,
                   in_sync=True,
                   incremental=False,
                   audit=False,
                   same=None,
                   created=0,
                   updated=0,
                   deleted=0,
                   to_delete=0):
        """Write log message regarding status in standard form
        
        Split this off so all messages from baseline/audit/incremental
        are written in a consistent form.
        """
        if (audit):
            words = {
                'created': 'to create',
                'updated': 'to update',
                'deleted': 'to delete'
            }
        else:
            words = {
                'created': 'created',
                'updated': 'updated',
                'deleted': 'deleted'
            }
        if in_sync:
            # status rather than action
            status = "NO CHANGES" if incremental else "IN SYNC"
        else:
            if audit:
                status = "NOT IN SYNC"
            elif (to_delete > deleted):
                #will need --delete
                status = "PART APPLIED" if incremental else "PART SYNCED"
                words['deleted'] = 'to delete (--delete)'
                deleted = to_delete
            else:
                status = "CHANGES APPLIED" if incremental else "SYNCED"
        same = "" if (same is None) else ("same=%d, " % same)
        self.logger.warning("Status: %15s (%s%s=%d, %s=%d, %s=%d)" %\
             (status, same, words['created'], created,
              words['updated'], updated, words['deleted'], deleted))
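
Tying this client's entry points together, a sketch with hypothetical URIs and paths (dryrun=True so nothing is fetched or deleted):

client = Client(checksum=False, verbose=True, dryrun=True)
client.set_mappings(['http://example.org/site/', '/tmp/mirror/'])
client.baseline_or_audit(audit_only=True)       # audit: report differences only
client.baseline_or_audit(allow_deletion=True)   # baseline sync, records last timestamp
client.incremental(from_datetime='2013-01-01T00:00:00Z')  # changes since a datetime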
Example #8
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger("client")
        self.mapper = None
        self.sitemap_name = "sitemap.xml"
        self.dump_format = None
        self.exclude_patterns = []
        self.allow_multifile = True
        self.noauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object"""
        if self.mapper is None:
            raise ClientFatalError("No mappings specified")
        return self.mapper.mappings

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings)

    def sitemap_changeset_uri(self, basename):
        """Get full URI (filepath) for sitemap/changeset based on basename"""
        if re.match(r"\w+:", basename):
            # looks like URI
            return basename
        elif re.match(r"/", basename):
            # looks like full path
            return basename
        else:
            # build from mapping with name appended
            return self.mappings[0].src_uri + "/" + basename

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        return self.sitemap_changeset_uri(self.sitemap_name)

    @property
    def inventory(self):
        """Return inventory on disk based on current mappings

        Return inventory. Uses existing self.mapper settings.
        """
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Build from disk
        ib = InventoryBuilder(do_md5=self.checksum, mapper=self.mapper)
        ib.add_exclude_files(self.exclude_patterns)
        return ib.from_disk()

    def log_event(self, change):
        """Log a ResourceChange object as an event for automated analysis"""
        self.logger.debug("Event: " + repr(change))

    def sync_or_audit(self, allow_deletion=False, audit_only=False):
        action = "audit" if (audit_only) else "sync"
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst
        # 1.a source inventory
        ib = InventoryBuilder(mapper=self.mapper)
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_inventory = src_sitemap.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source inventory from %s (%s)" % (self.sitemap, str(e)))
        self.logger.info("Read source inventory, %d resources listed" % (len(src_inventory)))
        if len(src_inventory) == 0:
            raise ClientFatalError("Aborting as there are no resources to sync")
        if self.checksum and not src_inventory.has_md5():
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source inventory")
        # 1.b destination inventory mapped back to source URIs
        ib.do_md5 = self.checksum
        dst_inventory = ib.from_disk()
        ### 2. Compare these inventories respecting any comparison options
        (same, updated, deleted, created) = dst_inventory.compare(src_inventory)
        ### 3. Report status and planned actions
        status = "  IN SYNC  "
        if len(updated) > 0 or len(deleted) > 0 or len(created) > 0:
            status = "NOT IN SYNC"
        self.logger.warning(
            "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)"
            % (status, len(same), len(updated), len(deleted), len(created))
        )
        if audit_only:
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_inventory:
            if not uauth.has_authority_over(resource.uri):
                if self.noauth:
                    self.logger.info(
                        "Sitemap (%s) mentions resource at a location it does not have authority over (%s)"
                        % (self.sitemap, resource.uri)
                    )
                else:
                    raise ClientFatalError(
                        "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (self.sitemap, resource.uri)
                    )
        ### 5. Grab files to do sync
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, file))
            self.update_resource(resource, file, "UPDATED")
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, file))
            self.update_resource(resource, file, "CREATED")
        for resource in deleted:
            uri = resource.uri
            if allow_deletion:
                file = self.mapper.src_to_dst(uri)
                if self.dryrun:
                    self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
                else:
                    os.unlink(file)
                    self.logger.info("deleted: %s -> %s" % (uri, file))
                    self.log_event(ResourceChange(resource=resource, changetype="DELETED"))
            else:
                self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
        self.logger.debug("Completed " + action)

    def incremental(self, allow_deletion=False, changeset_uri=None):
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get URI of changeset, from sitemap or explicit
        if changeset_uri:
            # Translate as necessary using maps
            changeset = self.sitemap_changeset_uri(changeset_uri)
        else:
            # Get sitemap
            try:
                self.logger.info("Reading sitemap %s" % (self.sitemap))
                src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
                src_inventory = src_sitemap.read(uri=self.sitemap, index_only=True)
                self.logger.debug("Finished reading sitemap/sitemapindex")
            except Exception as e:
                raise ClientFatalError("Can't read source sitemap from %s (%s)" % (self.sitemap, str(e)))
            # Extract changeset location
            # FIXME - need to completely rework the way we handle/store capabilities
            links = self.extract_links(src_inventory)
            if "current" not in links:
                raise ClientFatalError("Failed to extract changeset location from sitemap %s" % (self.sitemap))
            changeset = links["current"]
        ### 2. Read changeset from source
        ib = InventoryBuilder(mapper=self.mapper)
        try:
            self.logger.info("Reading changeset %s" % (changeset))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_changeset = src_sitemap.read(uri=changeset, changeset=True)
            self.logger.debug("Finished reading changeset")
        except Exception as e:
            raise ClientFatalError("Can't read source changeset from %s (%s)" % (changeset, str(e)))
        self.logger.info("Read source changeset, %d resources listed" % (len(src_changeset)))
        if len(src_changeset) == 0:
            raise ClientFatalError("Aborting as there are no resources to sync")
        if self.checksum and not src_changeset.has_md5():
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source inventory")
        ### 3. Check that sitemap has authority over URIs listed
        # FIXME - What does authority mean for changeset? Here use both the
        # changeset URI and, if we used it, the sitemap URI
        uauth_cs = UrlAuthority(changeset)
        if not changeset_uri:
            uauth_sm = UrlAuthority(self.sitemap)
        for resource in src_changeset:
            if not uauth_cs.has_authority_over(resource.uri) and (
                changeset_uri or not uauth_sm.has_authority_over(resource.uri)
            ):
                if self.noauth:
                    self.logger.warning(
                        "Changeset (%s) mentions resource at a location it does not have authority over (%s)"
                        % (changeset, resource.uri)
                    )
                else:
                    raise ClientFatalError(
                        "Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (changeset, resource.uri)
                    )
        ### 4. Apply changes
        for resource in src_changeset:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if resource.changetype == "UPDATED":
                self.logger.info("updated: %s -> %s" % (uri, file))
                self.update_resource(resource, file, "UPDATED")
            elif resource.changetype == "CREATED":
                self.logger.info("created: %s -> %s" % (uri, file))
                self.update_resource(resource, file, "CREATED")
            elif resource.changetype == "DELETED":
                if allow_deletion:
                    file = self.mapper.src_to_dst(uri)
                    if self.dryrun:
                        self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
                    else:
                        os.unlink(file)
                        self.logger.info("deleted: %s -> %s" % (uri, file))
                        self.log_event(ResourceChange(resource=resource, changetype="DELETED"))
                else:
                    self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
            else:
                raise ClientError("Unknown change type %s" % (resource.changetype))
        self.logger.debug("Completed incremental stuff")

    def update_resource(self, resource, file, changetype=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET the resource
        2. set the local mtime to the UTC timestamp from the inventory (we
        should perhaps also warn if this differs from, or is earlier than,
        the Last-Modified header in the GET response)
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        if self.dryrun:
            self.logger.info("dryrun: would GET %s --> %s" % (resource.uri, file))
        else:
            try:
                urllib.urlretrieve(resource.uri, file)
            except IOError as e:
                msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
                if self.ignore_failures:
                    self.logger.warning(msg)
                    return
                else:
                    raise ClientFatalError(msg)
            if resource.timestamp is not None:
                unixtime = int(resource.timestamp)  # no fractional
                os.utime(file, (unixtime, unixtime))
            self.log_event(ResourceChange(resource=resource, changetype=changetype))
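
    # Note (illustration, not from the source): int(resource.timestamp) drops
    # any fractional seconds, and os.utime() sets atime and mtime together,
    # e.g. a timestamp of 1234567890.75 becomes utime(file, (1234567890, 1234567890)).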

    def parse_sitemap(self):
        s = Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        i = s.read(self.sitemap)
        num_entries = len(i)
        self.logger.warning("Read sitemap with %d entries in %d sitemaps" % (num_entries, s.sitemaps_created))
        if self.verbose:
            to_show = 100
            override_str = " (override with --max-sitemap-entries)"
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ""
            if num_entries > to_show:
                print "Showing first %d entries sorted by URI%s..." % (to_show, override_str)
            n = 0
            for r in i:
                print r
                n += 1
                if n >= to_show:
                    break

    def explore_links(self):
        """Explore links from sitemap and between changesets"""
        seen = dict()
        is_changeset, links = self.explore_links_get(self.sitemap, seen=seen)
        starting_changeset = self.sitemap
        if not is_changeset:
            if "current" in links:
                starting_changeset = links["current"]
                is_changeset, links = self.explore_links_get(links["current"], seen=seen)
        # Can we go backward?
        if "prev" in links and not links["prev"] in seen:
            self.logger.warning("Will follow links backwards...")
            while "prev" in links and not links["prev"] in seen:
                self.logger.warning('Following "prev" link')
                is_changeset, links = self.explore_links_get(links["prev"], seen=seen)
        else:
            self.logger.warning("No links backwards")
        # Can we go forward?
        links = seen[starting_changeset]
        if "next" in links and not links["next"] in seen:
            self.logger.warning("Will follow links forwards...")
            while "next" in links and not links["next"] in seen:
                self.logger.warning('Following "next" link')
                is_changeset, links = self.explore_links_get(links["next"], seen=seen)
        else:
            self.logger.warning("No links forwards")

    def explore_links_get(self, uri, seen=None):
        # Default to a fresh dict per call: a mutable default ([] in the
        # original) would be shared across calls, and seen is used as a dict
        if seen is None:
            seen = dict()
        # Check we haven't been here before
        if uri in seen:
            self.logger.warning("Already seen %s, skipping" % (uri))
        s = Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap from %s ..." % (uri))
        i = s.read(uri, index_only=True)
        self.logger.warning("Read %s from %s" % (s.read_type, uri))
        links = self.extract_links(i, verbose=True)
        if "next" in links and links["next"] == uri:
            self.logger.warning('- self reference "next" link')
        seen[uri] = links
        return (s.changeset_read, links)

    def write_sitemap(self, outfile=None, capabilities=None, dump=None):
        # Set up base_path->base_uri mappings, get inventory from disk
        i = self.inventory
        i.capabilities = capabilities
        s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper)
        if self.max_sitemap_entries is not None:
            s.max_sitemap_entries = self.max_sitemap_entries
        if outfile is None:
            print s.resources_as_xml(i, capabilities=i.capabilities)
        else:
            s.write(i, basename=outfile)
        self.write_dump_if_requested(i, dump)

    def changeset_sitemap(
        self, outfile=None, ref_sitemap=None, newref_sitemap=None, empty=None, capabilities=None, dump=None
    ):
        changeset = ChangeSet()
        changeset.capabilities = capabilities
        if not empty:
            # 1. Get and parse reference sitemap
            old_inv = self.read_reference_sitemap(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either read that
            # or build inventory from files on disk
            if newref_sitemap is None:
                # Get inventory from disk
                new_inv = self.inventory
            else:
                new_inv = self.read_reference_sitemap(newref_sitemap, name="new reference")
            # 3. Calculate changeset
            (same, updated, deleted, created) = old_inv.compare(new_inv)
            changeset.add_changed_resources(updated, changetype="UPDATED")
            changeset.add_changed_resources(deleted, changetype="DELETED")
            changeset.add_changed_resources(created, changetype="CREATED")
        # 4. Write out changeset
        s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper)
        if self.max_sitemap_entries is not None:
            s.max_sitemap_entries = self.max_sitemap_entries
        if outfile is None:
            print s.resources_as_xml(changeset, changeset=True)
        else:
            s.write(changeset, basename=outfile, changeset=True)
        self.write_dump_if_requested(changeset, dump)
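
    # Usage sketch (illustration, not from the source): build a changeset
    # between a stored reference sitemap and the files currently on disk.
    # The keyword arguments are from the signature above; the paths are
    # hypothetical.
    #
    #   client.changeset_sitemap(ref_sitemap="/tmp/ref-sitemap.xml",
    #                            outfile="/tmp/changeset.xml")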

    def write_dump_if_requested(self, inventory, dump):
        if dump is None:
            return
        self.logger.info("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(inventory=inventory, dumpfile=dump)

    def read_reference_sitemap(self, ref_sitemap, name="reference"):
        """Read reference sitemap and return the inventory

        The name parameter is only used in output messages, to say what type
        of sitemap is being read.
        """
        sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
        self.logger.info("Reading %s sitemap(s) from %s ..." % (name, ref_sitemap))
        i = sitemap.read(ref_sitemap)
        num_entries = len(i)
        self.logger.warning(
            "Read %s sitemap with %d entries in %d sitemaps" % (name, num_entries, sitemap.sitemaps_created)
        )
        if self.verbose:
            to_show = 100
            override_str = " (override with --max-sitemap-entries)"
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ""
            if num_entries > to_show:
                print "Showing first %d entries sorted by URI%s..." % (to_show, override_str)
            n = 0
            for r in i:
                print r
                n += 1
                if n >= to_show:
                    break
        return i

    def extract_links(self, rc, verbose=False):
        """Extract links from capabilities inventory or changeset

        FIXME - when we finalize the form of links this should probably
        go along with other capabilities functions somewhere general.
        """
        links = dict()
        for href in rc.capabilities.keys():
            atts = rc.capabilities[href].get("attributes")
            self.logger.debug("Capability: %s" % (str(rc.capabilities[href])))
            if atts is not None:
                # split on spaces, check for changeset rel and link direction
                if "http://www.openarchives.org/rs/changeset" in atts:
                    for linktype in ["next", "prev", "current"]:
                        if linktype in atts:
                            if linktype in links:
                                raise ClientFatalError(
                                    "Duplicate link type %s, links to %s and %s" % (linktype, links[linktype], href)
                                )
                            links[linktype] = href
                            if verbose:
                                self.logger.warning('- got "%s" link to %s' % (linktype, href))
        return links
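
A sketch of the capabilities structure the extract_links() loop above expects:
a dict mapping href to a dict whose "attributes" list mixes the changeset rel
URI with the link type. All URIs here are hypothetical:

capabilities = {
    "http://example.org/changeset0002.xml":
        {"attributes": ["http://www.openarchives.org/rs/changeset", "next"]},
    "http://example.org/changeset0001.xml":
        {"attributes": ["http://www.openarchives.org/rs/changeset", "current"]},
}
# For an rc carrying these capabilities, extract_links(rc) would return
# {"next": "http://example.org/changeset0002.xml",
#  "current": "http://example.org/changeset0001.xml"}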
Exemplo n.º 9
0
 def test03_mapper2_src_to_dst(self):
     m = Mapper(['http://e.org/p=/tmp/q', 'http://e.org/r=/tmp/s'])
     self.assertEqual(m.src_to_dst('http://e.org/p/'), '/tmp/q/')
     self.assertEqual(m.src_to_dst('http://e.org/p/aa'), '/tmp/q/aa')
     self.assertEqual(m.src_to_dst('http://e.org/r/'), '/tmp/s/')
     self.assertEqual(m.src_to_dst('http://e.org/r/aa'), '/tmp/s/aa')
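
The test above exercises a Mapper built from 'src=dst' pairs. A minimal
first-match prefix rewriter consistent with these assertions (a sketch, not
resync's Mapper implementation):

class MiniMapper(object):
    """Rewrite URIs by the first matching 'src=dst' prefix pair (sketch)."""

    def __init__(self, specs):
        # each spec is a 'src=dst' string
        self.maps = [tuple(s.split('=', 1)) for s in specs]

    def src_to_dst(self, uri):
        for (src, dst) in self.maps:
            if uri.startswith(src):
                return dst + uri[len(src):]
        raise ValueError("no mapping matches %s" % uri)

m = MiniMapper(['http://e.org/p=/tmp/q', 'http://e.org/r=/tmp/s'])
assert m.src_to_dst('http://e.org/p/aa') == '/tmp/q/aa'
assert m.src_to_dst('http://e.org/r/') == '/tmp/s/'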
Exemplo n.º 11
0
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('client')
        self.mapper = None
        self.sitemap_name = 'sitemap.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.allow_multifile = True
        self.noauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False
        self.status_file = '.resync-client-status.cfg'

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object"""
        if (self.mapper is None):
            raise ClientFatalError("No mappings specified")
        return(self.mapper.mappings)

    def set_mappings(self,mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings)

    def sitemap_changelist_uri(self,basename):
        """Get full URI (filepath) for sitemap/changelist based on basename"""
        if (re.match(r"\w+:",basename)):
            # looks like URI
            return(basename)
        elif (re.match(r"/",basename)):
            # looks like full path
            return(basename)
        else:
            # build from mapping with name appended
            return(self.mappings[0].src_uri + '/' + basename)

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        return(self.sitemap_changelist_uri(self.sitemap_name))

    @property
    def resourcelist(self):
        """Return resourcelist on disk based on current mappings

        Return resourcelist. Uses existing self.mapper settings.
        """
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Build from disk
        ib = ResourceListBuilder(do_md5=self.checksum,mapper=self.mapper)
        ib.add_exclude_files(self.exclude_patterns)
        return( ib.from_disk() )

    def log_event(self, change):
        """Log a Resource object as an event for automated analysis"""
        self.logger.debug( "Event: "+repr(change) )

    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchonization or audit

	Both functions implemented in this routine because audit is a prerequisite
	for a baseline sync.
	"""
        action = ( 'audit' if (audit_only) else 'baseline sync' ) 
        self.logger.debug("Starting "+action)
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst 
        # 1.a source resourcelist
        ib = ResourceListBuilder(mapper=self.mapper)
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resourcelist = src_sitemap.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source resourcelist from %s (%s)" % (self.sitemap,str(e)))
        self.logger.info("Read source resourcelist, %d resources listed" % (len(src_resourcelist)))
        if (len(src_resourcelist)==0):
            raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_resourcelist.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source resourcelist")
        # 1.b destination resourcelist mapped back to source URIs
        ib.do_md5=self.checksum
        dst_resourcelist = ib.from_disk()
        ### 2. Compare these resourcelists respecting any comparison options
        (same,updated,deleted,created)=dst_resourcelist.compare(src_resourcelist)   
        ### 3. Report status and planned actions
        status = "  IN SYNC  "
        if (len(updated)>0 or len(deleted)>0 or len(created)>0):
            status = "NOT IN SYNC"
        self.logger.warning("Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %\
              (status,len(same),len(updated),len(deleted),len(created)))
        if (audit_only):
            self.logger.debug("Completed "+action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_resourcelist:
            if (not uauth.has_authority_over(resource.uri)):
                if (self.noauth):
                    #self.logger.info("Sitemap (%s) mentions resource at a location it does not have authority over (%s)" % (self.sitemap,resource.uri))
                    pass
                else:
                    raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap,resource.uri))
        ### 5. Grab files to do sync
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri,file))
            self.update_resource(resource,file,'updated')
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri,file))
            self.update_resource(resource,file,'created')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.delete_resource(resource,file,allow_deletion)
        ### 6. For sync reset any incremental status for site
        if (not audit_only):
            links = self.extract_links(src_resourcelist)
            if ('next' in links):
                self.write_incremental_status(self.sitemap,links['next'])
                self.logger.info("Written config with next incremental at %s" % (links['next']))
            else:
                self.write_incremental_status(self.sitemap)
        self.logger.debug("Completed "+action)

    def incremental(self, allow_deletion=False, changelist_uri=None):
	"""Incremental synchronization"""
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        # Get current config
        inc_config_next=self.read_incremental_status(self.sitemap)
        ### 1. Get URI of changelist, from sitemap or explicit
        if (inc_config_next is not None):
            # We have config from last run for this site
            changelist = inc_config_next
            self.logger.info("ChangeList location from last incremental run %s" % (changelist))
        elif (changelist_uri):
            # Translate as necessary using maps
            changelist = self.sitemap_changelist_uri(changelist_uri)
        else:
            # Get sitemap
            try:
                self.logger.info("Reading sitemap %s" % (self.sitemap))
                src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
                src_resourcelist = src_sitemap.read(uri=self.sitemap, index_only=True)
                self.logger.debug("Finished reading sitemap/sitemapindex")
            except Exception as e:
                raise ClientFatalError("Can't read source sitemap from %s (%s)" % (self.sitemap,str(e)))
            # Extract changelist location
            # FIXME - need to completely rework the way we handle/store capabilities
            links = self.extract_links(src_resourcelist)
            if ('current' not in links):
                raise ClientFatalError("Failed to extract changelist location from sitemap %s" % (self.sitemap))
            changelist = links['current']
        ### 2. Read changelist from source
        ib = ResourceListBuilder(mapper=self.mapper)
        try:
            self.logger.info("Reading changelist %s" % (changelist))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_changelist = src_sitemap.read(uri=changelist, changelist=True)
            self.logger.debug("Finished reading changelist")
        except Exception as e:
            raise ClientFatalError("Can't read source changelist from %s (%s)" % (changelist,str(e)))
        self.logger.info("Read source changelist, %d resources listed" % (len(src_changelist)))
        #if (len(src_changelist)==0):
        #    raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_changelist.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source resourcelist")
        ### 3. Check that sitemap has authority over URIs listed
        # FIXME - What does authority mean for changelist? Here use both the
        # changelist URI and, if we used it, the sitemap URI
        uauth_cs = UrlAuthority(changelist)
        if (not changelist_uri):
            uauth_sm = UrlAuthority(self.sitemap)
        for resource in src_changelist:
            if (not uauth_cs.has_authority_over(resource.uri) and 
                (changelist_uri or not uauth_sm.has_authority_over(resource.uri))):
                if (self.noauth):
                    #self.logger.info("ChangeList (%s) mentions resource at a location it does not have authority over (%s)" % (changelist,resource.uri))
                    pass
                else:
                    raise ClientFatalError("Aborting as changelist (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (changelist,resource.uri))
        ### 4. Apply changes
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_changelist:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if (resource.change == 'updated'):
                self.logger.info("updated: %s -> %s" % (uri,file))
                self.update_resource(resource,file,'updated')
                num_updated+=1
            elif (resource.change == 'created'):
                self.logger.info("created: %s -> %s" % (uri,file))
                self.update_resource(resource,file,'created')
                num_created+=1
            elif (resource.change == 'deleted'):
                self.delete_resource(resource,file,allow_deletion)
                num_deleted+=1
            else:
                raise ClientError("Unknown change type %s" % (resource.change) )
        # 5. Report status and planned actions
        status = "NO CHANGES"
        if ((num_updated+num_deleted+num_created)>0):
            status = " CHANGES  "
        self.logger.warning("Status: %s (updated=%d, deleted=%d, created=%d)" %\
              (status,num_updated,num_deleted,num_created))
        # 6. Store next link if available
        if ((num_updated+num_deleted+num_created)>0):
            links = self.extract_links(src_changelist)
            if ('next' in links):
                self.write_incremental_status(self.sitemap,links['next'])
                self.logger.info("Written config with next incremental at %s" % (links['next']))
            else:
                self.logger.warning("Failed to extract next changelist location from changelist %s" % (changelist))
        # 7. Done
        self.logger.debug("Completed incremental sync")

    def update_resource(self, resource, file, change=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET the resource
        2. set the local mtime to the UTC timestamp from the resourcelist (we
        should perhaps also warn if this differs from, or is earlier than,
        the Last-Modified header in the GET response)
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        if (self.dryrun):
            self.logger.info("dryrun: would GET %s --> %s" % (resource.uri,file))
        else:
            try:
                urllib.urlretrieve(resource.uri,file)
            except IOError as e:
                msg = "Failed to GET %s -- %s" % (resource.uri,str(e))
                if (self.ignore_failures):
                    self.logger.warning(msg)
                    return
                else:
                    raise ClientFatalError(msg)
            # sanity check
            size = os.stat(file).st_size
            if (resource.size != size):
                self.logger.info("Downloaded size for %s of %d bytes does not match expected %d bytes" % (resource.uri,size,resource.size))
            # set timestamp if we have one
            if (resource.timestamp is not None):
                unixtime = int(resource.timestamp) #no fractional
                os.utime(file,(unixtime,unixtime))
            self.log_event(Resource(resource=resource, change=change))

    def delete_resource(self, resource, file, allow_deletion=False):
        """Delete copy of resource in file on local system
        """
        uri = resource.uri
        if (allow_deletion):
            if (self.dryrun):
                self.logger.info("dryrun: would delete %s -> %s" % (uri,file))
            else:
                try:
                    os.unlink(file)
                except OSError as e:
                    msg = "Failed to DELETE %s -> %s : %s" % (uri,file,str(e))
                    if (self.ignore_failures):
                        self.logger.warning(msg)
                        return
                    else:
                        raise ClientFatalError(msg)
                self.logger.info("deleted: %s -> %s" % (uri,file))
                self.log_event(Resource(resource=resource, change="deleted"))
        else:
            self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)

    def parse_sitemap(self):
        s=Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        i = s.read(self.sitemap)
        num_entries = len(i)
        self.logger.warning("Read sitemap with %d entries in %d sitemaps" % (num_entries,s.sitemaps_created))
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries>to_show):
                print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
            n=0
            for r in i:
                print r
                n+=1
                if ( n >= to_show ):
                    break

    def explore_links(self):
        """Explore links from sitemap and between changelists"""
        seen = dict()
        is_changelist,links = self.explore_links_get(self.sitemap, seen=seen)
        starting_changelist = self.sitemap
        if (not is_changelist):
            if ('current' in links):
                starting_changelist = links['current']
                is_changelist,links = self.explore_links_get(links['current'], seen=seen)
        # Can we go backward?
        if ('prev' in links and not links['prev'] in seen):
            self.logger.warning("Will follow links backwards...")
            while ('prev' in links and not links['prev'] in seen):
                self.logger.warning("Following \"prev\" link")
                is_changelist,links = self.explore_links_get(links['prev'], seen=seen)
        else:
            self.logger.warning("No links backwards")
        # Can we go forward?
        links = seen[starting_changelist]
        if ('next' in links and not links['next'] in seen):
            self.logger.warning("Will follow links forwards...")
            while ('next' in links and not links['next'] in seen):
                self.logger.warning("Following \"next\" link")
                is_changelist,links = self.explore_links_get(links['next'], seen=seen)
        else:
            self.logger.warning("No links forwards")

    def explore_links_get(self, uri, seen=None):
        # Default to a fresh dict per call: a mutable default ([] in the
        # original) would be shared across calls, and seen is used as a dict
        if (seen is None):
            seen = dict()
        # Check we haven't been here before
        if (uri in seen):
            self.logger.warning("Already seen %s, skipping" % (uri))
        s=Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap from %s ..." % (uri))
        i = s.read(uri, index_only=True)
        self.logger.warning("Read %s from %s" % (s.read_type,uri))
        links = self.extract_links(i, verbose=True)
        if ('next' in links and links['next']==uri):
            self.logger.warning("- self reference \"next\" link")
        seen[uri]=links
        return(s.changelist_read,links)

    def write_sitemap(self,outfile=None,capabilities=None,dump=None):
        # Set up base_path->base_uri mappings, get resourcelist from disk
        i = self.resourcelist
        i.capabilities = capabilities
        s=Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print s.resources_as_xml(i,capabilities=i.capabilities)
        else:
            s.write(i,basename=outfile)
        self.write_dump_if_requested(i,dump)

    def changelist_sitemap(self,outfile=None,ref_sitemap=None,newref_sitemap=None,
                          empty=None,capabilities=None,dump=None):
        changelist = ChangeList()
        changelist.capabilities = capabilities
        if (not empty):
            # 1. Get and parse reference sitemap
            old_inv = self.read_reference_sitemap(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either read that 
            # or build resourcelist from files on disk
            if (newref_sitemap is None):
                # Get resourcelist from disk
                new_inv = self.resourcelist
            else:
                new_inv = self.read_reference_sitemap(newref_sitemap,name='new reference')
            # 3. Calculate changelist
            (same,updated,deleted,created)=old_inv.compare(new_inv)   
            changelist.add_changed_resources( updated, change='updated' )
            changelist.add_changed_resources( deleted, change='deleted' )
            changelist.add_changed_resources( created, change='created' )
        # 4. Write out changelist
        s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print s.resources_as_xml(changelist,changelist=True)
        else:
            s.write(changelist,basename=outfile,changelist=True)
        self.write_dump_if_requested(changelist,dump)

    def write_dump_if_requested(self,resourcelist,dump):
        if (dump is None):
            return
        self.logger.info("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(resourcelist=resourcelist,dumpfile=dump)

    def read_reference_sitemap(self,ref_sitemap,name='reference'):
        """Read reference sitemap and return the resourcelist

        The name parameter is only used in output messages, to say what type
        of sitemap is being read.
        """
        sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
        self.logger.info("Reading %s sitemap(s) from %s ..." % (name,ref_sitemap))
        i = sitemap.read(ref_sitemap)
        num_entries = len(i)
        self.logger.warning("Read %s sitemap with %d entries in %d sitemaps" % (name,num_entries,sitemap.sitemaps_created))
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries>to_show):
                print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
            n=0
            for r in i:
                print r
                n+=1
                if ( n >= to_show ):
                    break
        return(i)

    def extract_links(self, rc, verbose=False):
        """Extract links from capabilities resourcelist or changelist

        FIXME - when we finalize the form of links this should probably
        go along with other capabilities functions somewhere general.
        """
        links = dict()
        for href in rc.capabilities.keys():
            atts = rc.capabilities[href].get('attributes')
            self.logger.debug("Capability: %s" % (str(rc.capabilities[href])))
            if (atts is not None):
                # split on spaces, check for changelist rel and link direction
                if ('http://www.openarchives.org/rs/changelist' in atts):
                    for linktype in ['next','prev','current']:
                        if (linktype in atts):
                            if (linktype in links):
                                raise ClientFatalError("Duplicate link type %s, links to %s and %s" % (linktype,links[linktype],href))
                            links[linktype] = href
                            if (verbose):
                                self.logger.warning("- got \"%s\" link to %s" % (linktype,href))
        return(links) 

    def write_incremental_status(self,site,next=None):
        """Write status dict to client status file
        
        FIXME - should have some file lock to avoid race
        """
        parser = ConfigParser.SafeConfigParser()
        parser.read(self.status_file)
        status_section = 'incremental'
        if (not parser.has_section(status_section)):
            parser.add_section(status_section)
        if (next is None):
            parser.remove_option(status_section, self.config_site_to_name(site))
        else:
            parser.set(status_section, self.config_site_to_name(site), next)
        with open(self.status_file, 'wb') as configfile:
            parser.write(configfile)

    def read_incremental_status(self,site):
        """Read client status file and return dict"""
        parser = ConfigParser.SafeConfigParser()
        status_section = 'incremental'
        parser.read(self.status_file)
        next = None
        try:
            next = parser.get(status_section,self.config_site_to_name(site))
        except ConfigParser.NoSectionError as e:
            pass
        except ConfigParser.NoOptionError as e:
            pass
        return(next)

    def config_site_to_name(self, name):
        return( re.sub(r"[^\w]",'_',name) )
Exemplo n.º 12
0
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """
    def __init__(self, checksum=False, verbose=False, dryrun=False):
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('client')
        self.mapper = None
        self.sitemap_name = 'sitemap.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.allow_multifile = True
        self.noauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object"""
        if (self.mapper is None):
            raise ClientFatalError("No mappings specified")
        return (self.mapper.mappings)

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings)

    def sitemap_changeset_uri(self, basename):
        """Get full URI (filepath) for sitemap/changeset based on basename"""
        if (re.match(r"\w+:", basename)):
            # looks like URI
            return (basename)
        elif (re.match(r"/", basename)):
            # looks like full path
            return (basename)
        else:
            # build from mapping with name appended
            return (self.mappings[0].src_uri + '/' + basename)

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        return (self.sitemap_changeset_uri(self.sitemap_name))

    @property
    def inventory(self):
        """Return inventory on disk based on current mappings

        Return inventory. Uses existing self.mapper settings.
        """
        ### 0. Sanity checks
        if (len(self.mappings) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        ### 1. Build from disk
        ib = InventoryBuilder(do_md5=self.checksum, mapper=self.mapper)
        ib.add_exclude_files(self.exclude_patterns)
        return (ib.from_disk())

    def log_event(self, change):
        """Log a ResourceChange object as an event for automated analysis"""
        self.logger.debug("Event: " + repr(change))

    def sync_or_audit(self, allow_deletion=False, audit_only=False):
        action = ('audit' if (audit_only) else 'sync')
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if (len(self.mappings) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        ### 1. Get inventories from both src and dst
        # 1.a source inventory
        ib = InventoryBuilder(mapper=self.mapper)
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
            src_inventory = src_sitemap.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source inventory from %s (%s)" %
                                   (self.sitemap, str(e)))
        self.logger.info("Read source inventory, %d resources listed" %
                         (len(src_inventory)))
        if (len(src_inventory) == 0):
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if (self.checksum and not src_inventory.has_md5()):
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source inventory"
            )
        # 1.b destination inventory mapped back to source URIs
        ib.do_md5 = self.checksum
        dst_inventory = ib.from_disk()
        ### 2. Compare these inventories respecting any comparison options
        (same, updated, deleted,
         created) = dst_inventory.compare(src_inventory)
        ### 3. Report status and planned actions
        status = "  IN SYNC  "
        if (len(updated) > 0 or len(deleted) > 0 or len(created) > 0):
            status = "NOT IN SYNC"
        self.logger.warning("Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %\
              (status,len(same),len(updated),len(deleted),len(created)))
        if (audit_only):
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_inventory:
            if (not uauth.has_authority_over(resource.uri)):
                if (self.noauth):
                    self.logger.info(
                        "Sitemap (%s) mentions resource at a location it does not have authority over (%s)"
                        % (self.sitemap, resource.uri))
                else:
                    raise ClientFatalError(
                        "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (self.sitemap, resource.uri))
        ### 5. Grab files to do sync
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'UPDATED')
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'CREATED')
        for resource in deleted:
            uri = resource.uri
            if (allow_deletion):
                file = self.mapper.src_to_dst(uri)
                if (self.dryrun):
                    self.logger.info("dryrun: would delete %s -> %s" %
                                     (uri, file))
                else:
                    os.unlink(file)
                    self.logger.info("deleted: %s -> %s" % (uri, file))
                    self.log_event(
                        ResourceChange(resource=resource,
                                       changetype="DELETED"))
            else:
                self.logger.info(
                    "nodelete: would delete %s (--delete to enable)" % uri)
        self.logger.debug("Completed " + action)

    def incremental(self, allow_deletion=False, changeset_uri=None):
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if (len(self.mappings) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        ### 1. Get URI of changeset, from sitemap or explicit
        if (changeset_uri):
            # Translate as necessary using maps
            changeset = self.sitemap_changeset_uri(changeset_uri)
        else:
            # Get sitemap
            try:
                self.logger.info("Reading sitemap %s" % (self.sitemap))
                src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                      mapper=self.mapper)
                src_inventory = src_sitemap.read(uri=self.sitemap,
                                                 index_only=True)
                self.logger.debug("Finished reading sitemap/sitemapindex")
            except Exception as e:
                raise ClientFatalError(
                    "Can't read source sitemap from %s (%s)" %
                    (self.sitemap, str(e)))
            # Extract changeset location
            # FIXME - need to completely rework the way we handle/store capabilities
            links = self.extract_links(src_inventory)
            if ('current' not in links):
                raise ClientFatalError(
                    "Failed to extract changeset location from sitemap %s" %
                    (self.sitemap))
            changeset = links['current']
        ### 2. Read changeset from source
        ib = InventoryBuilder(mapper=self.mapper)
        try:
            self.logger.info("Reading changeset %s" % (changeset))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
            src_changeset = src_sitemap.read(uri=changeset, changeset=True)
            self.logger.debug("Finished reading changeset")
        except Exception as e:
            raise ClientFatalError("Can't read source changeset from %s (%s)" %
                                   (changeset, str(e)))
        self.logger.info("Read source changeset, %d resources listed" %
                         (len(src_changeset)))
        if (len(src_changeset) == 0):
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if (self.checksum and not src_changeset.has_md5()):
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source inventory"
            )
        ### 3. Check that sitemap has authority over URIs listed
        # FIXME - What does authority mean for changeset? Here use both the
        # changeset URI and, if we used it, the sitemap URI
        uauth_cs = UrlAuthority(changeset)
        if (not changeset_uri):
            uauth_sm = UrlAuthority(self.sitemap)
        for resource in src_changeset:
            if (not uauth_cs.has_authority_over(resource.uri)
                    and (changeset_uri
                         or not uauth_sm.has_authority_over(resource.uri))):
                if (self.noauth):
                    self.logger.warning(
                        "Changeset (%s) mentions resource at a location it does not have authority over (%s)"
                        % (changeset, resource.uri))
                else:
                    raise ClientFatalError(
                        "Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (changeset, resource.uri))
        ### 4. Apply changes
        for resource in src_changeset:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if (resource.changetype == 'UPDATED'):
                self.logger.info("updated: %s -> %s" % (uri, file))
                self.update_resource(resource, file, 'UPDATED')
            elif (resource.changetype == 'CREATED'):
                self.logger.info("created: %s -> %s" % (uri, file))
                self.update_resource(resource, file, 'CREATED')
            elif (resource.changetype == 'DELETED'):
                if (allow_deletion):
                    file = self.mapper.src_to_dst(uri)
                    if (self.dryrun):
                        self.logger.info("dryrun: would delete %s -> %s" %
                                         (uri, file))
                    else:
                        os.unlink(file)
                        self.logger.info("deleted: %s -> %s" % (uri, file))
                        self.log_event(
                            ResourceChange(resource=resource,
                                           changetype="DELETED"))
                else:
                    self.logger.info(
                        "nodelete: would delete %s (--delete to enable)" % uri)
            else:
                raise ClientError("Unknown change type %s" %
                                  (resource.changetype))
        self.logger.debug("Completed incremental stuff")

    def update_resource(self, resource, file, changetype=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET the resource
        2. set the local mtime to the UTC timestamp from the inventory (we
        should perhaps also warn if this differs from, or is earlier than,
        the Last-Modified header in the GET response)
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        if (self.dryrun):
            self.logger.info("dryrun: would GET %s --> %s" %
                             (resource.uri, file))
        else:
            try:
                urllib.urlretrieve(resource.uri, file)
            except IOError as e:
                msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
                if (self.ignore_failures):
                    self.logger.warning(msg)
                    return
                else:
                    raise ClientFatalError(msg)
            if (resource.timestamp is not None):
                unixtime = int(resource.timestamp)  #no fractional
                os.utime(file, (unixtime, unixtime))
            self.log_event(
                ResourceChange(resource=resource, changetype=changetype))

    def parse_sitemap(self):
        s = Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        i = s.read(self.sitemap)
        num_entries = len(i)
        self.logger.warning("Read sitemap with %d entries in %d sitemaps" %
                            (num_entries, s.sitemaps_created))
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries > to_show):
                print "Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str)
            n = 0
            for r in i:
                print r
                n += 1
                if (n >= to_show):
                    break

    def explore_links(self):
        """Explore links from sitemap and between changesets"""
        seen = dict()
        is_changeset, links = self.explore_links_get(self.sitemap, seen=seen)
        starting_changeset = self.sitemap
        if (not is_changeset):
            if ('current' in links):
                starting_changeset = links['current']
                is_changeset, links = self.explore_links_get(links['current'],
                                                             seen=seen)
        # Can we go backward?
        if ('prev' in links and not links['prev'] in seen):
            self.logger.warning("Will follow links backwards...")
            while ('prev' in links and not links['prev'] in seen):
                self.logger.warning("Following \"prev\" link")
                is_changeset, links = self.explore_links_get(links['prev'],
                                                             seen=seen)
        else:
            self.logger.warning("No links backwards")
        # Can we go forward?
        links = seen[starting_changeset]
        if ('next' in links and not links['next'] in seen):
            self.logger.warning("Will follow links forwards...")
            while ('next' in links and not links['next'] in seen):
                self.logger.warning("Following \"next\" link")
                is_changeset, links = self.explore_links_get(links['next'],
                                                             seen=seen)
        else:
            self.logger.warning("No links forwards")

    def explore_links_get(self, uri, seen=None):
        # Default to a fresh dict per call: a mutable default ([] in the
        # original) would be shared across calls, and seen is used as a dict
        if (seen is None):
            seen = dict()
        # Check we haven't been here before
        if (uri in seen):
            self.logger.warning("Already seen %s, skipping" % (uri))
        s = Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap from %s ..." % (uri))
        i = s.read(uri, index_only=True)
        self.logger.warning("Read %s from %s" % (s.read_type, uri))
        links = self.extract_links(i, verbose=True)
        if ('next' in links and links['next'] == uri):
            self.logger.warning("- self reference \"next\" link")
        seen[uri] = links
        return (s.changeset_read, links)

    def write_sitemap(self, outfile=None, capabilities=None, dump=None):
        # Set up base_path->base_uri mappings, get inventory from disk
        i = self.inventory
        i.capabilities = capabilities
        s = Sitemap(pretty_xml=True,
                    allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print s.resources_as_xml(i, capabilities=i.capabilities)
        else:
            s.write(i, basename=outfile)
        self.write_dump_if_requested(i, dump)

    def changeset_sitemap(self,
                          outfile=None,
                          ref_sitemap=None,
                          newref_sitemap=None,
                          empty=None,
                          capabilities=None,
                          dump=None):
        changeset = ChangeSet()
        changeset.capabilities = capabilities
        if (not empty):
            # 1. Get and parse reference sitemap
            old_inv = self.read_reference_sitemap(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either read that
            # or build inventory from files on disk
            if (newref_sitemap is None):
                # Get inventory from disk
                new_inv = self.inventory
            else:
                new_inv = self.read_reference_sitemap(newref_sitemap,
                                                      name='new reference')
            # 3. Calculate changeset
            (same, updated, deleted, created) = old_inv.compare(new_inv)
            changeset.add_changed_resources(updated, changetype='UPDATED')
            changeset.add_changed_resources(deleted, changetype='DELETED')
            changeset.add_changed_resources(created, changetype='CREATED')
        # 4. Write out changeset
        s = Sitemap(pretty_xml=True,
                    allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print s.resources_as_xml(changeset, changeset=True)
        else:
            s.write(changeset, basename=outfile, changeset=True)
        self.write_dump_if_requested(changeset, dump)

    def write_dump_if_requested(self, inventory, dump):
        if (dump is None):
            return
        self.logger.info("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(inventory=inventory, dumpfile=dump)

    def read_reference_sitemap(self, ref_sitemap, name='reference'):
        """Read reference sitemap and return the inventory

        The name parameter is used only in output messages to say what type
        of sitemap is being read.
        """
        sitemap = Sitemap(allow_multifile=self.allow_multifile,
                          mapper=self.mapper)
        self.logger.info("Reading %s sitemap(s) from %s ..." %
                         (name, ref_sitemap))
        i = sitemap.read(ref_sitemap)
        num_entries = len(i)
        self.logger.warning("Read %s sitemap with %d entries in %d sitemaps" %
                            (name, num_entries, sitemap.sitemaps_created))
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries > to_show):
                print "Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str)
            n = 0
            for r in i:
                print r
                n += 1
                if (n >= to_show):
                    break
        return (i)

    def extract_links(self, rc, verbose=False):
        """Extract links from capabilities inventory or changeset

        FIXME - when we finalize the form of links this should probably
        go along with other capabilities functions somewhere general.
        """
        links = dict()
        for href in rc.capabilities.keys():
            atts = rc.capabilities[href].get('attributes')
            self.logger.debug("Capability: %s" % (str(rc.capabilities[href])))
            if (atts is not None):
                # split on spaces, check for changeset rel and direction
                if ('http://www.openarchives.org/rs/changeset' in atts):
                    for linktype in ['next', 'prev', 'current']:
                        if (linktype in atts):
                            if (linktype in links):
                                raise ClientFatalError(
                                    "Duplicate link type %s, links to %s and %s"
                                    % (linktype, links[linktype], href))
                            links[linktype] = href
                            if (verbose):
                                self.logger.warning("- got \"%s\" link to %s" %
                                                    (linktype, href))
        return (links)
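A brief aside, not part of the example above: extract_links expects each
capability href to map to a dict whose optional 'attributes' list carries the
changeset rel URI plus a direction token. A minimal sketch with hypothetical
URIs:

# Hypothetical input shape for extract_links; the attribute values are
# assumed from the membership tests in the method above.
rc_capabilities = {
    'http://example.org/cs2.xml':
        {'attributes': ['http://www.openarchives.org/rs/changeset', 'next']},
    'http://example.org/cs0.xml':
        {'attributes': ['http://www.openarchives.org/rs/changeset', 'prev']},
}
# extract_links(rc) would then return:
#   {'next': 'http://example.org/cs2.xml', 'prev': 'http://example.org/cs0.xml'}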
Example No. 13
0
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('client')
        self.mapper = None
        self.resource_list_name = 'resourcelist.xml'
        self.change_list_name = 'changelist.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.sitemap_name = None
        self.allow_multifile = True
        self.noauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False
        self.status_file = '.resync-client-status.cfg'

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object"""
        if (self.mapper is None):
            raise ClientFatalError("No mappings specified")
        return(self.mapper.mappings)

    def set_mappings(self,mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings, use_default_path=True)

    def sitemap_uri(self,basename):
        """Get full URI (filepath) for sitemap based on basename"""
        if (re.match(r"\w+:",basename)):
            # looks like URI
            return(basename)
        elif (re.match(r"/",basename)):
            # looks like full path
            return(basename)
        else:
            # build from mapping with name appended
            return(self.mappings[0].src_uri + '/' + basename)
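    # Illustration with a hypothetical mapping http://e.org/p -> /tmp/q:
    #   sitemap_uri('resourcelist.xml')    -> 'http://e.org/p/resourcelist.xml'
    #   sitemap_uri('http://x.org/sm.xml') -> 'http://x.org/sm.xml' (URI kept)
    #   sitemap_uri('/var/sm.xml')         -> '/var/sm.xml' (absolute path kept)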

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        if (self.sitemap_name is not None):
            return(self.sitemap_name)
        return(self.sitemap_uri(self.resource_list_name))

    @property
    def resource_list(self):
        """Return resource_list on disk based on current mappings

        Return resource_list. Uses existing self.mapper settings.
        """
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Build from disk
        rlb = ResourceListBuilder(do_md5=self.checksum,mapper=self.mapper)
        rlb.add_exclude_files(self.exclude_patterns)
        return( rlb.from_disk() )

    def log_event(self, change):
        """Log a Resource object as an event for automated analysis"""
        self.logger.debug( "Event: "+repr(change) )

    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchonization or audit

	Both functions implemented in this routine because audit is a prerequisite
	for a baseline sync. In the case of baseline sync the last timestamp seen
        is recorded as client state.
	"""
        action = ( 'audit' if (audit_only) else 'baseline sync' ) 
        self.logger.debug("Starting "+action)
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst 
        # 1.a source resource_list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source resource_list from %s (%s)" % (self.sitemap,str(e)))
        self.logger.info("Read source resource_list, %d resources listed" % (len(src_resource_list)))
        if (len(src_resource_list)==0):
            raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_resource_list.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source resource_list")
        # 1.b destination resource_list mapped back to source URIs
        rlb = ResourceListBuilder(mapper=self.mapper)
        rlb.do_md5=self.checksum
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource_lists respecting any comparison options
        (same,updated,deleted,created)=dst_resource_list.compare(src_resource_list)   
        ### 3. Report status and planned actions
        self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                        audit=True,same=len(same),created=len(created),
                        updated=len(updated),deleted=len(deleted))
        if (audit_only or len(created)+len(updated)+len(deleted)==0):
            self.logger.debug("Completed "+action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_resource_list:
            if (not uauth.has_authority_over(resource.uri)):
                if (self.noauth):
                    #self.logger.info("Sitemap (%s) mentions resource at a location it does not have authority over (%s)" % (self.sitemap,resource.uri))
                    pass
                else:
                    raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap,resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" % len(deleted)) if (allow_deletion) else ''
        self.logger.warning("Will GET %d resources%s" % (len(created)+len(updated),delete_msg))
        self.last_timestamp = 0
        num_created=0
        num_updated=0
        num_deleted=0
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri,file))
            num_created+=self.update_resource(resource,file,'created')
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri,file))
            num_updated+=self.update_resource(resource,file,'updated')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            num_deleted+=self.delete_resource(resource,file,allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if (not audit_only and self.last_timestamp>0):
            ClientState().set_state(self.sitemap,self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                        same=len(same),created=num_created,
                        updated=num_updated,deleted=num_deleted)
        self.logger.debug("Completed %s" % (action))

    def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
	"""Incremental synchronization

        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        from_timestamp = None
        if (from_datetime is not None):
            try:
                from_timestamp = str_to_datetime(from_datetime)
            except ValueError:
                raise ClientFatalError("Bad datetime in --from (%s)" % from_datetime)
        ### 1. Work out where to start from
        if (from_timestamp is None):
            from_timestamp=ClientState().get_state(self.sitemap)
            if (from_timestamp is None):
                raise ClientFatalError("No stored timestamp for this site, and no explicit --from")
        ### 2. Get URI of change list, from sitemap or explicit
        if (change_list_uri):
            # Translate as necessary using maps
            change_list = self.sitemap_uri(change_list_uri)
        else:
            # Try default name
            change_list = self.sitemap_uri(self.change_list_name)
        ### 3. Read change list from source
        try:
            self.logger.info("Reading change list %s" % (change_list))
            src_change_list = ChangeList()
            src_change_list.read(uri=change_list)
            self.logger.debug("Finished reading change list")
        except Exception as e:
            raise ClientFatalError("Can't read source change list from %s (%s)" % (change_list,str(e)))
        self.logger.info("Read source change list, %d changes listed" % (len(src_change_list)))
        #if (len(src_change_list)==0):
        #    raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_change_list.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source change list")
        # Check all changes have timestamp and record last
        self.last_timestamp = 0
        for resource in src_change_list:
            if (resource.timestamp is None):
                raise ClientFatalError("Aborting - missing timestamp for change in %s" % (uri))
            if (resource.timestamp > self.last_timestamp):
                self.last_timestamp = resource.timestamp
        ### 4. Check that the change list has authority over URIs listed
        # FIXME - What does authority mean for change list? Here use both the
        # change list URI and, if we used it, the sitemap URI
        uauth_cs = UrlAuthority(change_list)
        if (not change_list_uri):
            uauth_sm = UrlAuthority(self.sitemap)
        for resource in src_change_list:
            if (not uauth_cs.has_authority_over(resource.uri) and 
                (change_list_uri or not uauth_sm.has_authority_over(resource.uri))):
                if (self.noauth):
                    #self.logger.info("Change list (%s) mentions resource at a location it does not have authority over (%s)" % (change_list,resource.uri))
                    pass
                else:
                    raise ClientFatalError("Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (change_list,resource.uri))
        ### 5. Prune entries before starting timestamp and dupe changes for a resource
        num_skipped = src_change_list.prune_before(from_timestamp)
        if (num_skipped>0):
            self.logger.info("Skipped %d changes before %s" % (num_skipped,datetime_to_str(from_timestamp)))
        num_dupes = src_change_list.prune_dupes()
        if (num_dupes>0):
            self.logger.info("Removed %d prior changes" % (num_dupes))
        ### 6. Apply changes at same time or after from_timestamp
        self.logger.info("Applying %d changes" % (len(src_change_list)))
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_change_list:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if (resource.change == 'updated'):
                self.logger.info("updated: %s -> %s" % (uri,file))
                self.update_resource(resource,file,'updated')
                num_updated+=1
            elif (resource.change == 'created'):
                self.logger.info("created: %s -> %s" % (uri,file))
                self.update_resource(resource,file,'created')
                num_created+=1
            elif (resource.change == 'deleted'):
                self.delete_resource(resource,file,allow_deletion)
                num_deleted+=1
            else:
                raise ClientError("Unknown change type %s" % (resource.change) )
        ### 7. Report status and planned actions
        self.log_status(in_sync=((num_updated+num_deleted+num_created)==0),
                        incremental=True,created=num_created, updated=num_updated, 
                        deleted=num_deleted)
        ### 8. Record last timestamp we have seen
        if (self.last_timestamp>0):
            ClientState().set_state(self.sitemap,self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 9. Done
        self.logger.debug("Completed incremental sync")

    def update_resource(self, resource, file, change=None):
        """Update resource from uri to file on local system

        Update means three things:
        1. GET resources
        2. set mtime in local time to be equal to timestamp in UTC (should perhaps
        or at least warn if different from LastModified from the GET response instead 
        but maybe warn if different (or just earlier than) the lastmod we expected 
        from the resource_list
        3. check that resource matches expected information

        Also update self.last_timestamp if the timestamp (in source frame) of this
        resource is later and the current value.

        Returns the number of resources updated/created (0 or 1)
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        num_updated=0
        if (self.dryrun):
            self.logger.info("dryrun: would GET %s --> %s" % (resource.uri,file))
        else:
            # 1. GET
            try:
                urllib.urlretrieve(resource.uri,file)
                num_updated+=1
            except IOError as e:
                msg = "Failed to GET %s -- %s" % (resource.uri,str(e))
                if (self.ignore_failures):
                    self.logger.warning(msg)
                    return(num_updated)
                else:
                    raise ClientFatalError(msg)
            # 2. set timestamp if we have one
            if (resource.timestamp is not None):
                unixtime = int(resource.timestamp) #no fractional
                os.utime(file,(unixtime,unixtime))
                if (resource.timestamp > self.last_timestamp):
                    self.last_timestamp = resource.timestamp
            self.log_event(Resource(resource=resource, change=change))
            # 3. sanity check
            length = os.stat(file).st_size
            if (resource.length != length):
                self.logger.info("Downloaded size for %s of %d bytes does not match expected %d bytes" % (resource.uri,length,resource.length))
            if (self.checksum and resource.md5 is not None):
                file_md5 = compute_md5_for_file(file)
                if (resource.md5 != file_md5):
                    self.logger.info("MD5 mismatch for %s, got %s but expected %s bytes" % (resource.uri,file_md5,resource.md5))
        return(num_updated)
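    # Note: timestamps are truncated to whole seconds before being applied,
    # e.g. a resource timestamp of 1234567890.5 sets mtime and atime to
    # 1234567890 via os.utime(file, (unixtime, unixtime)).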

    def delete_resource(self, resource, file, allow_deletion=False):
        """Delete copy of resource in file on local system

        Will only actually do the deletion if allow_deletion is True. Regardless 
        of whether the deletion occurs, self.last_timestamp will be updated 
        if the resource.timestamp is later than the current value.

        Returns the number of files actually deleted (0 or 1).
        """
        num_deleted=0
        uri = resource.uri
        if (resource.timestamp is not None and
            resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
        if (allow_deletion):
            if (self.dryrun):
                self.logger.info("dryrun: would delete %s -> %s" % (uri,file))
            else:
                try:
                    os.unlink(file)
                    num_deleted+=1
                except OSError as e:
                    msg = "Failed to DELETE %s -> %s : %s" % (uri,file,str(e))
                    #if (self.ignore_failures):
                    self.logger.warning(msg)
                    #    return
                    #else:
                    #    raise ClientFatalError(msg)
                self.logger.info("deleted: %s -> %s" % (uri,file))
                self.log_event(Resource(resource=resource, change="deleted"))
        else:
            self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
        return(num_deleted)

    def parse_document(self):
        """Parse any ResourceSync document and show information
        
        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied.
        """
        s=Sitemap()
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        try:
            list = s.parse_xml(urllib.urlopen(self.sitemap))
        except IOError as e:
            raise ClientFatalError("Cannot read document (%s)" % str(e))
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if ('capability' in list.md):
            capability = list.md['capability']
        print "Parsed %s document with %d entries" % (capability,num_entries)
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries>to_show):
                print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
            n=0
            for resource in list:
                print '[%d] %s' % (n,str(resource))
                n+=1
                if ( n >= to_show ):
                    break

    def explore(self):
        """Explore capabilities of a server interactvely
        
        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied.
        """
        uri = None
        if (self.sitemap_name is not None):
            uri = self.sitemap
            print "Taking location from --sitemap option"
            acceptable_capabilities = None  # i.e. any
        elif (len(self.mappings)>0):
            pu = urlparse.urlparse(self.mappings[0].src_uri)
            uri = urlparse.urlunparse( [ pu[0], pu[1], '/.well-known/resourcesync', '', '', '' ] )
            print "Will look for discovery information based on mappings"
            acceptable_capabilities = [ 'capabilitylist', 'capabilitylistindex' ]
        else:
            raise ClientFatalError("Neither explicit sitemap nor mapping specified")
        inp = None
        while (inp!='q'):
            print
            (uri, acceptable_capabilities, inp) = self.explore_uri(uri,acceptable_capabilities)

    def explore_uri(self, uri, caps):
        """Interactive exploration of document at uri

        Will flag warnings if the document is not of type listed in caps
        """
        s=Sitemap()
        print "Reading %s" % (uri)
        try:
            list = s.parse_xml(urllib.urlopen(uri))
        except IOError as e:
            raise ClientFatalError("Cannot read %s (%s)" % (uri,str(e)))
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if ('capability' in list.md):
            capability = list.md['capability']
        if (s.parsed_index):
            capability += 'index'
        print "Parsed %s document with %d entries:" % (capability,num_entries)
        if (caps is not None and capability not in caps):
            print "WARNING - expected a %s document" % (','.join(caps))
        to_show = num_entries
        if (num_entries>21):
            to_show = 20
        # What entries are allowed? 
        # FIXME - not complete
        entry_caps = []
        if (capability == 'capabilitylistindex'):
            entry_caps = ['capabilitylist']
        elif (capability == 'capabilitylist'):
            entry_caps = ['resourcelist','changelist','resourcedump','changedump','changelistindex']
        elif (capability == 'changelistindex'):
            entry_caps = ['changelist']
        n = 0
        options = {}
        for r in list.resources:
            if (n>=to_show):
                print "(not showing remaining %d entries)" % (num_entries-n)
                break
            n+=1
            options[str(n)]=r
            print "[%d] %s" % (n,r.uri)
            if (r.capability is not None):
                warning = ''
                if (r.capability not in entry_caps):
                    warning = " (EXPECTED %s)" % (' or '.join(entry_caps))
                print "  %s%s" % (r.capability,warning)
            elif (len(entry_caps)==1):
                r.capability=entry_caps[0]
                print "  capability not specified, should be %s" % (r.capability)
        while (True):
            inp = raw_input( "Follow [number or q(uit)]?" )
            if (inp in options.keys()):
                break
            if (inp == 'q'):
                return('','',inp)
        caps = [ options[inp].capability ]
        if (capability == 'capabilitylistindex' and options[inp].capability is None):
            # all links should be to capabilitylist documents
            caps = ['capabilitylist']
        return( options[inp].uri, caps, inp )

    def write_resource_list(self,outfile=None,links=None,dump=None):
        """Write a resource list sitemap for files on local disk
        based on the base_path->base_uri mappings.
        """
        rl = self.resource_list
        rl.ln = links
        kwargs = { 'pretty_xml': True,
                   'allow_multifile': self.allow_multifile,
                   'mapper' : self.mapper }
        if (self.max_sitemap_entries is not None):
            kwargs['max_sitemap_entries'] = self.max_sitemap_entries
        if (outfile is None):
            print rl.as_xml(**kwargs)
        else:
            rl.write(basename=outfile,**kwargs)
        self.write_dump_if_requested(rl,dump)

    def write_change_list(self,outfile=None,ref_sitemap=None,newref_sitemap=None,
                          empty=None,links=None,dump=None):
        cl = ChangeList(ln=links)
        if (not empty):
            # 1. Get and parse reference sitemap
            old_rl = self.read_reference_resource_list(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either read that 
            # or build resource_list from files on disk
            if (newref_sitemap is None):
                # Get resource list from disk
                new_rl = self.resource_list
            else:
                new_rl = self.read_reference_resource_list(newref_sitemap,name='new reference')
            # 3. Calculate change list
            (same,updated,deleted,created)=old_rl.compare(new_rl)   
            cl.add_changed_resources( updated, change='updated' )
            cl.add_changed_resources( deleted, change='deleted' )
            cl.add_changed_resources( created, change='created' )
        # 4. Write out change list
        kwargs = { 'pretty_xml': True,
                   'mapper' : self.mapper }
        if (self.max_sitemap_entries is not None):
            kwargs['max_sitemap_entries'] = self.max_sitemap_entries
        if (outfile is None):
            print cl.as_xml(**kwargs)
        else:
            cl.write(basename=outfile,**kwargs)
        self.write_dump_if_requested(cl,dump)

    def write_capability_list(self,capabilities=None,outfile=None,links=None):
        """Write a Capability List to outfile or STDOUT"""
        capl = CapabilityList(ln=links)
        if (capabilities is not None):
            for name in capabilities.keys():
                capl.add_capability(name=name, uri=capabilities[name])
        kwargs = { 'pretty_xml': True }
        if (outfile is None):
            print capl.as_xml(**kwargs)
        else:
            capl.write(basename=outfile,**kwargs)
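    # The capabilities argument is a name -> URI dict, e.g. (hypothetical):
    #   {'resourcelist': 'http://example.org/resourcelist.xml',
    #    'changelist':   'http://example.org/changelist.xml'}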

    def write_capability_list_index(self,capability_lists=None,outfile=None,links=None):
        """Write a Capability List to outfile or STDOUT"""
        capli = CapabilityListIndex(ln=links)
        if (capability_lists is not None):
            for uri in capability_lists:
                capli.add_capability_list(uri)
        kwargs = { 'pretty_xml': True }
        if (outfile is None):
            print capli.as_xml(**kwargs)
        else:
            capli.write(basename=outfile,**kwargs)

    def write_dump_if_requested(self,resource_list,dump):
        if (dump is None):
            return
        self.logger.info("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(resource_list=resource_list,dumpfile=dump)

    def read_reference_resource_list(self,ref_sitemap,name='reference'):
        """Read reference resource list and return the ResourceList object

        The name parameter is used only in output messages to say what type
        of resource list is being read.
        """
        rl = ResourceList()
        self.logger.info("Reading reference %s resource list from %s ..." % (name,ref_sitemap))
        rl.mapper=self.mapper
        rl.read(uri=ref_sitemap,index_only=(not self.allow_multifile))
        num_entries = len(rl.resources)
        self.logger.info("Read %s resource list with %d entries in %d sitemaps" % (name,num_entries,rl.num_files))
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries>to_show):
                print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
            n=0
            for r in rl.resources:
                print r
                n+=1
                if ( n >= to_show ):
                    break
        return(rl)

    def log_status(self, in_sync=True, incremental=False, audit=False,
                   same=None, created=0, updated=0, deleted=0):
        """Write log message regarding status in standard form
        
        Split this off so that messages from baseline/audit/incremental
        are written in a consistent form.
        """
        if (incremental):     
            status = "NO CHANGES" if in_sync else "CHANGES"
        else:
            status = "IN SYNC" if in_sync else ("NOT IN SYNC" if (audit) else "SYNCED")
        if (audit):
            words = { 'created': 'to create',
                      'updated': 'to update',
                      'deleted': 'to delete' }
        else:
            words = { 'created': 'created',
                      'updated': 'updated',
                      'deleted': 'deleted' }
        same =  "" if (same is None) else ("same=%d, " % same)
        self.logger.warning("Status: %11s (%s%s=%d, %s=%d, %s=%d)" %\
             (status, same, words['created'], created, 
              words['updated'], updated, words['deleted'], deleted))
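Before the next example, a hedged usage sketch of the Client above (URLs,
paths, and ordering are hypothetical, not from the source):

# Hypothetical driver for the Client class above.
client = Client(checksum=True, verbose=True)
client.set_mappings(['http://example.org/data/', '/tmp/mirror/'])
client.baseline_or_audit(audit_only=True)       # audit: report differences only
client.baseline_or_audit(allow_deletion=True)   # baseline sync, allowing deletes
# log_status() output then takes the form:
#   Status: NOT IN SYNC (same=10, to create=2, to update=1, to delete=0)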
Example No. 14
0
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('resync.client')
        self.mapper = Mapper()
        self.resource_list_name = 'resourcelist.xml'
        self.change_list_name = 'changelist.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.sitemap_name = None
        self.allow_multifile = True
        self.noauth = False
        self.strictauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False
        self.pretty_xml = True
        # Default file names
        self.status_file = '.resync-client-status.cfg'
        self.default_resource_dump = 'resourcedump.zip'
        self.default_change_dump = 'changedump.zip'


    def set_mappings(self,mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings, use_default_path=True)

    def sitemap_uri(self,basename):
        """Get full URI (filepath) for sitemap based on basename"""
        if (re.match(r"\w+:",basename)):
            # looks like URI
            return(basename)
        elif (re.match(r"/",basename)):
            # looks like full path
            return(basename)
        else:
            # build from mapping with name appended
            return(self.mapper.default_src_uri() + '/' + basename)

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        if (self.sitemap_name is not None):
            return(self.sitemap_name)
        return(self.sitemap_uri(self.resource_list_name))

    def build_resource_list(self, paths=None, set_path=False):
        """Return a resource list for files on local disk

        The set of files is taken by disk scan from the paths specified or
        else defaults to the paths specified in the current mappings

        paths - override paths from mappings if specified

        set_path - set true to set the path information for each resource 
            included. This is used to build a resource list as the basis
            for creating a dump.

        Return ResourceList. Uses existing self.mapper settings.
        """
        # 0. Sanity checks, parse paths if specified
        if (len(self.mapper)<1):
            raise ClientFatalError("No source to destination mapping specified")
        if (paths is not None):
            # Expect comma separated list of paths
            paths=paths.split(',')
        # 1. Build from disk
        rlb = ResourceListBuilder(set_md5=self.checksum,mapper=self.mapper)
        rlb.set_path=set_path
        rlb.add_exclude_files(self.exclude_patterns)
        rl = rlb.from_disk(paths=paths)
        # 2. Set defaults and overrides
        rl.allow_multifile = self.allow_multifile
        rl.pretty_xml = self.pretty_xml
        rl.mapper = self.mapper
        if (self.max_sitemap_entries is not None):
            rl.max_sitemap_entries = self.max_sitemap_entries
        return(rl)
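    # Example (hypothetical paths): build_resource_list(paths='/data/a,/data/b')
    # scans just those two directories; with paths=None the paths from the
    # current mappings are scanned instead.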

    def log_event(self, change):
        """Log a Resource object as an event for automated analysis"""
        self.logger.debug( "Event: "+repr(change) )

    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchonization or audit

	Both functions implemented in this routine because audit is a prerequisite
	for a baseline sync. In the case of baseline sync the last timestamp seen
        is recorded as client state.
	"""
        action = ( 'audit' if (audit_only) else 'baseline sync' ) 
        self.logger.debug("Starting "+action)
        ### 0. Sanity checks
        if (len(self.mapper)<1):
            raise ClientFatalError("No source to destination mapping specified")
        if (not audit_only and self.mapper.unsafe()):
            raise ClientFatalError("Source to destination mappings unsafe: %s" % str(self.mapper))
        ### 1. Get inventories from both src and dst 
        # 1.a source resource list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source resource list from %s (%s)" % (self.sitemap,str(e)))
        self.logger.info("Read source resource list, %d resources listed" % (len(src_resource_list)))
        if (len(src_resource_list)==0):
            raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_resource_list.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source resource list")
        # 1.b destination resource list mapped back to source URIs
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource lists respecting any comparison options
        (same,updated,deleted,created)=dst_resource_list.compare(src_resource_list)   
        ### 3. Report status and planned actions
        self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                        audit=True,same=len(same),created=len(created),
                        updated=len(updated),deleted=len(deleted))
        if (audit_only or len(created)+len(updated)+len(deleted)==0):
            self.logger.debug("Completed "+action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        if (not self.noauth):
            uauth = UrlAuthority(self.sitemap, strict=self.strictauth)
            for resource in src_resource_list:
                if (not uauth.has_authority_over(resource.uri)):
                    raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap,resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" % len(deleted)) if (allow_deletion) else ''
        self.logger.warning("Will GET %d resources%s" % (len(created)+len(updated),delete_msg))
        self.last_timestamp = 0
        num_created=0
        num_updated=0
        num_deleted=0
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri,file))
            num_created+=self.update_resource(resource,file,'created')
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri,file))
            num_updated+=self.update_resource(resource,file,'updated')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            num_deleted+=self.delete_resource(resource,file,allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if (not audit_only and self.last_timestamp>0):
            ClientState().set_state(self.sitemap,self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                        same=len(same),created=num_created,
                        updated=num_updated,deleted=num_deleted,to_delete=len(deleted))
        self.logger.debug("Completed %s" % (action))

    def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
	"""Incremental synchronization

        Use Change List to do incremental sync
        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if (len(self.mapper)<1):
            raise ClientFatalError("No source to destination mapping specified")
        if (self.mapper.unsafe()):
            raise ClientFatalError("Source to destination mappings unsafe: %s" % str(self.mapper))
        from_timestamp = None
        if (from_datetime is not None):
            try:
                from_timestamp = str_to_datetime(from_datetime)
            except ValueError:
                raise ClientFatalError("Bad datetime in --from (%s)" % from_datetime)
        ### 1. Work out where to start from
        if (from_timestamp is None):
            from_timestamp=ClientState().get_state(self.sitemap)
            if (from_timestamp is None):
                raise ClientFatalError("Cannot do incremental sync. No stored timestamp for this site, and no explicit --from.")
        ### 2. Get URI of change list, from sitemap or explicit
        if (change_list_uri):
            # Translate as necessary using maps
            change_list = self.sitemap_uri(change_list_uri)
        else:
            # Try default name
            change_list = self.sitemap_uri(self.change_list_name)
        ### 3. Read change list from source
        try:
            self.logger.info("Reading change list %s" % (change_list))
            src_change_list = ChangeList()
            src_change_list.read(uri=change_list)
            self.logger.debug("Finished reading change list")
        except Exception as e:
            raise ClientFatalError("Can't read source change list from %s (%s)" % (change_list,str(e)))
        self.logger.info("Read source change list, %d changes listed" % (len(src_change_list)))
        #if (len(src_change_list)==0):
        #    raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_change_list.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source change list")
        # Check all changes have timestamp and record last
        self.last_timestamp = 0
        for resource in src_change_list:
            if (resource.timestamp is None):
                raise ClientFatalError("Aborting - missing timestamp for change in %s" % (uri))
            if (resource.timestamp > self.last_timestamp):
                self.last_timestamp = resource.timestamp
        ### 4. Check that the change list has authority over URIs listed
        # FIXME - What does authority mean for change list? Here use both the
        # change list URI and, if we used it, the sitemap URI
        if (not self.noauth):
            uauth_cs = UrlAuthority(change_list, self.strictauth)
            uauth_sm = None
            if (not change_list_uri):
                uauth_sm = UrlAuthority(self.sitemap)
            for resource in src_change_list:
                if (not uauth_cs.has_authority_over(resource.uri) and
                    (change_list_uri or not uauth_sm.has_authority_over(resource.uri))):
                    raise ClientFatalError("Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (change_list,resource.uri))
        ### 5. Prune entries before starting timestamp and dupe changes for a resource
        num_skipped = src_change_list.prune_before(from_timestamp)
        if (num_skipped>0):
            self.logger.info("Skipped %d changes before %s" % (num_skipped,datetime_to_str(from_timestamp)))
        num_dupes = src_change_list.prune_dupes()
        if (num_dupes>0):
            self.logger.info("Removed %d prior changes" % (num_dupes))
        # Review and log status before
        # FIXME - should at this stage prune the change list to pick out
        # only the last change for each resource
        to_update = 0
        to_create = 0
        to_delete = 0
        for resource in src_change_list:
            if (resource.change == 'updated'):
                to_update+=1
            elif (resource.change == 'created'):
                to_create+=1
            elif (resource.change == 'deleted'):
                to_delete+=1
            else:
                raise ClientError("Unknown change type %s" % (resource.change) )
        # Log status based on what we know from the Change List. Exit if
        # either there are no changes or if there are only deletions and
        # we don't allow deletion
        in_sync = ((to_update+to_delete+to_create)==0)
        self.log_status(in_sync=in_sync, incremental=True, created=to_create, 
                        updated=to_update, deleted=to_delete)
        if (in_sync or ((to_update+to_create)==0 and not allow_deletion)):
            self.logger.debug("Completed incremental")
            return
        ### 6. Apply changes at same time or after from_timestamp
        delete_msg = (", and delete %d resources" % to_delete) if (allow_deletion) else ''
        self.logger.warning("Will apply %d changes%s" % (len(src_change_list),delete_msg))
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_change_list:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if (resource.change == 'updated'):
                self.logger.info("updated: %s -> %s" % (uri,file))
                self.update_resource(resource,file,'updated')
                num_updated+=1
            elif (resource.change == 'created'):
                self.logger.info("created: %s -> %s" % (uri,file))
                self.update_resource(resource,file,'created')
                num_created+=1
            elif (resource.change == 'deleted'):
                num_deleted+=self.delete_resource(resource,file,allow_deletion)
            else:
                raise ClientError("Unknown change type %s" % (resource.change) )
        ### 7. Report status and planned actions
        self.log_status(incremental=True,created=num_created, updated=num_updated, 
                        deleted=num_deleted,to_delete=to_delete)
        ### 8. Record last timestamp we have seen
        if (self.last_timestamp>0):
            ClientState().set_state(self.sitemap,self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 9. Done
        self.logger.debug("Completed incremental sync")

    def update_resource(self, resource, file, change=None):
        """Update resource from uri to file on local system

        Update means three things:
        1. GET resources
        2. set mtime in local time to be equal to timestamp in UTC (should perhaps
        or at least warn if different from LastModified from the GET response instead 
        but maybe warn if different (or just earlier than) the lastmod we expected 
        from the resource list
        3. check that resource matches expected information

        Also update self.last_timestamp if the timestamp (in source frame) of this
        resource is later and the current value.

        Returns the number of resources updated/created (0 or 1)
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        num_updated=0
        if (self.dryrun):
            self.logger.info("dryrun: would GET %s --> %s" % (resource.uri,file))
        else:
            # 1. GET
            try:
                urllib.urlretrieve(resource.uri,file)
                num_updated+=1
            except IOError as e:
                msg = "Failed to GET %s -- %s" % (resource.uri,str(e))
                if (self.ignore_failures):
                    self.logger.warning(msg)
                    return(num_updated)
                else:
                    raise ClientFatalError(msg)
            # 2. set timestamp if we have one
            if (resource.timestamp is not None):
                unixtime = int(resource.timestamp) #no fractional
                os.utime(file,(unixtime,unixtime))
                if (resource.timestamp > self.last_timestamp):
                    self.last_timestamp = resource.timestamp
            self.log_event(Resource(resource=resource, change=change))
            # 3. sanity check
            length = os.stat(file).st_size
            if (resource.length != length):
                self.logger.info("Downloaded size for %s of %d bytes does not match expected %d bytes" % (resource.uri,length,resource.length))
            if (self.checksum and resource.md5 is not None):
                file_md5 = compute_md5_for_file(file)
                if (resource.md5 != file_md5):
                    self.logger.info("MD5 mismatch for %s, got %s but expected %s bytes" % (resource.uri,file_md5,resource.md5))
        return(num_updated)

    def delete_resource(self, resource, file, allow_deletion=False):
        """Delete copy of resource in file on local system

        Will only actually do the deletion if allow_deletion is True. Regardless 
        of whether the deletion occurs, self.last_timestamp will be updated 
        if the resource.timestamp is later than the current value.

        Returns the number of files actually deleted (0 or 1).
        """
        num_deleted=0
        uri = resource.uri
        if (resource.timestamp is not None and
            resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
        if (allow_deletion):
            if (self.dryrun):
                self.logger.info("dryrun: would delete %s -> %s" % (uri,file))
            else:
                try:
                    os.unlink(file)
                    num_deleted+=1
                except OSError as e:
                    msg = "Failed to DELETE %s -> %s : %s" % (uri,file,str(e))
                    #if (self.ignore_failures):
                    self.logger.warning(msg)
                    #    return
                    #else:
                    #    raise ClientFatalError(msg)
                self.logger.info("deleted: %s -> %s" % (uri,file))
                self.log_event(Resource(resource=resource, change="deleted"))
        else:
            self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
        return(num_deleted)

    def parse_document(self):
        """Parse any ResourceSync document and show information
        
        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied.
        """
        s=Sitemap()
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        try:
            list = s.parse_xml(urllib.urlopen(self.sitemap))
        except IOError as e:
            raise ClientFatalError("Cannot read document (%s)" % str(e))
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if ('capability' in list.md):
            capability = list.md['capability']
        print "Parsed %s document with %d entries" % (capability,num_entries)
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries>to_show):
                print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
            n=0
            for resource in list:
                print '[%d] %s' % (n,str(resource))
                n+=1
                if ( n >= to_show ):
                    break

    def explore(self):
        """Explore capabilities of a server interactvely
        
        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied.
        """
        uri = None
        if (self.sitemap_name is not None):
            uri = self.sitemap
            print "Taking location from --sitemap option"
            acceptable_capabilities = None  # i.e. any
        elif (len(self.mapper)>0):
            pu = urlparse.urlparse(self.mapper.default_src_uri())
            uri = urlparse.urlunparse( [ pu[0], pu[1], '/.well-known/resourcesync', '', '', '' ] )
            print "Will look for discovery information based on mappings"
            acceptable_capabilities = [ 'capabilitylist', 'capabilitylistindex' ]
        else:
            raise ClientFatalError("Neither explicit sitemap nor mapping specified")
        history = []
        inp = None
        checks = None
        while (inp!='q'):
            print
            if (inp=='b'):
                if (len(history)<2):
                    break #can't do this, exit
                history.pop() #throw away current
                uri=history.pop()
                acceptable_capabilities=None
            history.append(uri)
            (uri,checks,acceptable_capabilities,inp) = self.explore_uri(uri,checks,acceptable_capabilities,len(history)>1)
        print "--explore done, bye..."

    def explore_uri(self, uri, checks, caps, show_back=True):
        """Interactive exploration of document at uri

        Will flag warnings if the document is not of type listed in caps
        """
        s=Sitemap()
        print "Reading %s" % (uri)
        options={}
        capability=None
        try:
            if (caps=='resource'):
                self.explore_show_head(uri,check_headers=checks)
            else: 
                list = s.parse_xml(urllib.urlopen(uri))
                (options,capability)=self.explore_show_summary(list,s.parsed_index,caps)
        except IOError as e:
            print "Cannot read %s (%s)\nGoing back" % (uri,str(e))
            return('','','','b')
        except Exception as e:
            print "Cannot parse %s (%s)\nGoing back" % (uri,str(e))
            return('','','','b')
        while (True):
            # don't offer number option for no resources/capabilities
            num_prompt = '' if (len(options)==0) else 'number, '
            up_prompt = 'b(ack), ' if (show_back) else ''
            inp = raw_input( "Follow [%s%sq(uit)]?" % (num_prompt,up_prompt) )
            if (inp in options.keys()):
                break
            if (inp == 'q' or inp == 'b'):
                return('','','',inp)
        checks = {}
        if ( options[inp].capability is None ):
            if (capability == 'capabilitylistindex'):
                # all links should be to capabilitylist documents
                caps = ['capabilitylist']
            elif (capability in ['resourcelist','changelist',
                                 'resourcedump','changedump']):
                caps = 'resource'
        else:
            r = options[inp]
            caps = [r.capability]
            if (r.length is not None):
                checks['content-length']=r.length
            if (r.lastmod is not None):
                checks['last-modified']=r.lastmod
            # FIXME - could do sanity check here and issue warnings if odd
        return( options[inp].uri, checks, caps, inp )

    def explore_show_summary(self,list,parsed_index,caps):
        """Show summary of one capability document

        Used as part of --explore.
        FIXME - should look for <rs:ln rel="up"...> link and show that
        """
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if ('capability' in list.md):
            capability = list.md['capability']
        if (parsed_index):
            capability += 'index'
        print "Parsed %s document with %d entries:" % (capability,num_entries)
        if (caps is not None and capability not in caps):
            print "WARNING - expected a %s document" % (','.join(caps))
        to_show = num_entries
        if (num_entries>21):
            to_show = 20
        # What entries are allowed? 
        # FIXME - not complete
        entry_caps = []
        if (capability == 'capabilitylistindex'):
            entry_caps = ['capabilitylist']
        elif (capability == 'capabilitylist'):
            entry_caps = ['resourcelist','changelist','resourcedump','changedump','changelistindex']
        elif (capability == 'changelistindex'):
            entry_caps = ['changelist']
        options = {}
        n=0
        if ('up' in list.ln):
            options['up']=list.ln['up']
            print "[%s] %s" % ('up',list.ln['up'].uri)
        for r in list.resources:
            if (n>=to_show):
                print "(not showing remaining %d entries)" % (num_entries-n)
                break
            n+=1
            options[str(n)]=r
            print "[%d] %s" % (n,r.uri)
            if (r.capability is not None):
                warning = ''
                if (r.capability not in entry_caps):
                    warning = " (EXPECTED %s)" % (' or '.join(entry_caps))
                print "  %s%s" % (r.capability,warning)
            elif (len(entry_caps)==1):
                r.capability=entry_caps[0]
                print "  capability not specified, should be %s" % (r.capability)
        return(options,capability)

    def explore_show_head(self,uri,check_headers=None):
        """Do HEAD on uri and show infomation

        Will also check headers against any values specified in 
        check_headers.
        """
        print "HEAD %s" % (uri)
        response = requests.head(uri)
        print "  status: %s" % (response.status_code)
        # generate normalized lastmod
#        if ('last-modified' in response.headers):
#            response.headers.add['lastmod'] = datetime_to_str(str_to_datetime(response.headers['last-modified']))
        # print some of the headers
        for header in ['content-length','last-modified','lastmod','content-type','etag']:
            if header in response.headers:
                check_str=''
                if (check_headers is not None and
                    header in check_headers):
                    if (response.headers[header] == check_headers[header]):
                        check_str=' MATCHES EXPECTED VALUE'
                    else:
                        check_str=' EXPECTED %s' % (check_headers[header])
                print "  %s: %s%s" % (header, response.headers[header], check_str)

    def write_resource_list(self,paths=None,outfile=None,links=None,dump=None):
        """Write a Resource List or a Resource Dump for files on local disk

        Set of resources included is based on paths setting or else the mappings. 
        Optionally links can be added. Output will be to stdout unless outfile
        is specified.
        
        If dump is true then a Resource Dump is written instead of a Resource
        List. If outfile is not set then self.default_resource_dump will be used.
        """
        rl = self.build_resource_list(paths=paths,set_path=dump)
        if (links is not None):
            rl.ln = links
        if (dump):
            if (outfile is None):
                outfile = self.default_resource_dump
            self.logger.info("Writing resource dump to %s..." % (dump))
            d = Dump(format=self.dump_format)
            d.write(resource_list=rl,dumpfile=outfile)
        else:
            if (outfile is None):
                try:
                    print rl.as_xml()
                except ListBaseIndexError as e:
                    raise ClientFatalError("%s. Use --output option to specify base name for output files." % str(e))
            else:
                rl.write(basename=outfile)
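
# Hedged usage sketch for write_resource_list(): the mapping pair is
# hypothetical. Files under /tmp/data are scanned, listed under the
# http://example.org/data URI space, and written to resourcelist.xml.
c = Client()
c.set_mappings(['http://example.org/data', '/tmp/data'])
c.write_resource_list(outfile='resourcelist.xml')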

    def write_change_list(self,paths=None,outfile=None,ref_sitemap=None,newref_sitemap=None,
                          empty=None,links=None,dump=None):
        """Write a change list
        
        Unless both ref_sitemap and newref_sitemap are specified, the Change
        List is calculated between the reference and the current state of
        files on disk. The files on disk are scanned based either on the
        paths setting or else on the mappings.
        """
        cl = ChangeList(ln=links)
        if (not empty):
            # 1. Get and parse reference sitemap
            old_rl = self.read_reference_resource_list(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either read that 
            # or build resource list from files on disk
            if (newref_sitemap is None):
                # Get resource list from disk
                new_rl = self.build_resource_list(paths=paths,set_path=dump)
            else:
                new_rl = self.read_reference_resource_list(newref_sitemap,name='new reference')
            # 3. Calculate change list
            (same,updated,deleted,created)=old_rl.compare(new_rl)   
            cl.add_changed_resources( updated, change='updated' )
            cl.add_changed_resources( deleted, change='deleted' )
            cl.add_changed_resources( created, change='created' )
        # 4. Write out change list
        cl.mapper = self.mapper
        cl.pretty_xml = self.pretty_xml
        if (self.max_sitemap_entries is not None):
            cl.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print cl.as_xml()
        else:
            cl.write(basename=outfile)
        self.write_dump_if_requested(cl,dump)
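
# Hedged sketch for write_change_list(): compare a previously written
# reference list (old_resourcelist.xml, hypothetical name) against the
# current files on disk and write the differences to changelist.xml.
c = Client()
c.set_mappings(['http://example.org/data', '/tmp/data'])
c.write_change_list(ref_sitemap='old_resourcelist.xml',
                    outfile='changelist.xml')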

    def write_capability_list(self,capabilities=None,outfile=None,links=None):
        """Write a Capability List to outfile or STDOUT"""
        capl = CapabilityList(ln=links)
        capl.pretty_xml = self.pretty_xml
        if (capabilities is not None):
            for name in capabilities.keys():
                capl.add_capability(name=name, uri=capabilities[name])
        if (outfile is None):
            print capl.as_xml()
        else:
            capl.write(basename=outfile)
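
# Hedged sketch for write_capability_list(): the name->URI dict drives
# add_capability(); both URIs are hypothetical. Assumes c is a Client
# instance; with outfile=None the XML would go to STDOUT instead.
c.write_capability_list(
    capabilities={'resourcelist': 'http://example.org/resourcelist.xml',
                  'changelist': 'http://example.org/changelist.xml'},
    outfile='capabilitylist.xml')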

    def write_source_description(self,capability_lists=None,outfile=None,links=None):
        """Write a ResourceSync Description document to outfile or STDOUT"""
        rsd = SourceDescription(ln=links)
        rsd.pretty_xml = self.pretty_xml
        if (capability_lists is not None):
            for uri in capability_lists:
                rsd.add_capability_list(uri)
        if (outfile is None):
            print rsd.as_xml()
        else:
            rsd.write(basename=outfile)
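
# Hedged sketch for write_source_description(): one hypothetical Capability
# List URI; omitting outfile prints the XML to STDOUT.
c.write_source_description(
    capability_lists=['http://example.org/capabilitylist.xml'])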

    def write_dump_if_requested(self,resource_list,dump):
        """Write resource_list as a dump to the file dump, if requested"""
        if (dump is None):
            return
        self.logger.info("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(resource_list=resource_list,dumpfile=dump)

    def read_reference_resource_list(self,ref_sitemap,name='reference'):
        """Read reference resource list and return the ResourceList object

        The name parameter is used only in output messages to say what type
        of resource list is being read.
        """
        rl = ResourceList()
        self.logger.info("Reading reference %s resource list from %s ..." % (name,ref_sitemap))
        rl.mapper=self.mapper
        rl.read(uri=ref_sitemap,index_only=(not self.allow_multifile))
        num_entries = len(rl.resources)
        self.logger.info("Read %s resource list with %d entries in %d sitemaps" % (name,num_entries,rl.num_files))
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries>to_show):
                print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
            n=0
            for r in rl.resources:
                print r
                n+=1
                if ( n >= to_show ):
                    break
        return(rl)
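
# Hedged sketch for read_reference_resource_list(): assumes c already has
# mappings set (rl.mapper needs them); the filename is hypothetical.
old_rl = c.read_reference_resource_list('old_resourcelist.xml',
                                        name='old reference')
print "reference list has %d entries" % (len(old_rl.resources))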

    def log_status(self, in_sync=True, incremental=False, audit=False,
                   same=None, created=0, updated=0, deleted=0, to_delete=0):
        """Write log message regarding status in standard form
        
        Split this off so all messages from baseline/audit/incremental
        are written in a consistent form.
        """
        if (audit):
            words = { 'created': 'to create',
                      'updated': 'to update',
                      'deleted': 'to delete' }
        else:
            words = { 'created': 'created',
                      'updated': 'updated',
                      'deleted': 'deleted' }
        if in_sync:
            # status rather than action
            status = "NO CHANGES" if incremental else "IN SYNC" 
        else:
            if audit:
                status = "NOT IN SYNC"
            elif (to_delete>deleted):
                # will need --delete
                status = "PART APPLIED" if incremental else "PART SYNCED"
                words['deleted']='to delete (--delete)'
                deleted=to_delete
            else: 
                status = "CHANGES APPLIED" if incremental else "SYNCED"
        same =  "" if (same is None) else ("same=%d, " % same)
        self.logger.warning("Status: %15s (%s%s=%d, %s=%d, %s=%d)" %\
             (status, same, words['created'], created, 
              words['updated'], updated, words['deleted'], deleted))
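
# Hedged sketch for log_status(): an incremental run that applied 2 creates
# and 1 update but still has 3 deletions pending --delete would log roughly:
#   Status:    PART APPLIED (created=2, updated=1, to delete (--delete)=3)
c.log_status(in_sync=False, incremental=True,
             created=2, updated=1, deleted=0, to_delete=3)
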
Exemplo n.º 15
0
class Client():
    """Implementation of a ResourceSync client"""
    def __init__(self, checksum=False, verbose=False, dryrun=False):
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.mapper = None
        self.sitemap_name = 'sitemap.xml'
        self.dump_format = None
        self.allow_multifile = False
        self.max_sitemap_entries = None

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object"""
        if (self.mapper is None):
            raise ClientFatalError("No mappings specified")
        return (self.mapper.mappings)


#    @mappings.setter

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings)

    @property
    def sitemap(self):
        """Return the sitemap URI base on maps or explicit settings"""
        if (re.match(r"\w+:", self.sitemap_name)):
            # looks like URI
            return (self.sitemap_name)
        elif (re.match(r"/", self.sitemap_name)):
            # looks like full path
            return (self.sitemap_name)
        else:
            # build from mapping with name appended
            return (self.mappings[0].src_uri + '/' + self.sitemap_name)
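
# Hedged sketch of the three resolution cases in the sitemap property;
# example values are hypothetical and assume the Mapper preserves the
# source URI as given.
c = Client()
c.set_mappings(['http://example.org/data', '/tmp/data'])
print c.sitemap    # http://example.org/data/sitemap.xml (built from mapping)
c.sitemap_name = '/var/www/sitemap.xml'
print c.sitemap    # /var/www/sitemap.xml (full path, used as-is)
c.sitemap_name = 'http://example.org/other.xml'
print c.sitemap    # http://example.org/other.xml (looks like a URI, as-is)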

    @property
    def inventory(self):
        """Return inventory on disk based on current mappings

        Return inventory. Uses existing self.mapper settings.
        """
        ### 0. Sanity checks
        if (len(self.mappings) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        ### 1. Build from disk
        ib = InventoryBuilder(do_md5=self.checksum,
                              verbose=self.verbose,
                              mapper=self.mapper)
        return (ib.from_disk())
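
# Hedged sketch for the inventory property: build an inventory of the files
# on disk under a hypothetical mapping, with MD5 checksums enabled.
c = Client(checksum=True)
c.set_mappings(['http://example.org/data', '/tmp/data'])
i = c.inventory
print "%d resources on disk" % (len(i))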

    def sync_or_audit(self, allow_deletion=False, audit_only=False):
        ### 0. Sanity checks
        if (len(self.mappings) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        ### 1. Get inventories from both src and dst
        # 1.a source inventory
        ib = InventoryBuilder(verbose=self.verbose, mapper=self.mapper)
        try:
            if (self.verbose):
                print "Reading sitemap %s ..." % (self.sitemap)
            src_inventory = ib.get(self.sitemap)
        except IOError as e:
            raise ClientFatalError("Can't read source inventory from %s (%s)" %
                                   (self.sitemap, str(e)))
        if (self.verbose):
            print "Read source inventory, %d resources listed" % (
                len(src_inventory))
        if (len(src_inventory) == 0):
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if (self.checksum and not src_inventory.has_md5()):
            self.checksum = False
            print "Not calculating checksums on destination as not present in source inventory"
        # 1.b destination inventory mapped back to source URIs
        ib.do_md5 = self.checksum
        dst_inventory = ib.from_disk()
        ### 2. Compare these inventories respecting any comparison options
        (num_same, updated, deleted,
         created) = dst_inventory.compare(src_inventory)
        ### 3. Report status and planned actions
        status = "  IN SYNC  "
        if (len(updated) > 0 or len(deleted) > 0 or len(created) > 0):
            status = "NOT IN SYNC"
        print "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %\
              (status,num_same,len(updated),len(deleted),len(created))

        if (audit_only):
            return
        ### 4. Grab files to do sync
        for uri in updated:
            file = self.mapper.src_to_dst(uri)
            if (self.verbose):
                print "updated: %s -> %s" % (uri, file)
            self.update_resource(uri, file,
                                 src_inventory.resources[uri].timestamp)
        for uri in created:
            file = self.mapper.src_to_dst(uri)
            self.update_resource(uri, file,
                                 src_inventory.resources[uri].timestamp)
        for uri in deleted:
            if (allow_deletion):
                file = self.mapper.src_to_dst(uri)
                if (self.dryrun):
                    print "dryrun: would delete %s -> %s" % (uri, file)
                else:
                    os.unlink(file)
                    if (self.verbose):
                        print "deleted: %s -> %s" % (uri, file)
            else:
                if (self.verbose):
                    print "nodelete: would delete %s (--delete to enable)" % uri

    def update_resource(self, uri, file, timestamp=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET the resource
        2. set mtime equal to timestamp (should probably use Last-Modified
        from the GET response instead, but maybe warn if it differs from,
        or is earlier than, the lastmod we expected from the inventory)
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        if (self.dryrun):
            print "dryrun: would GET %s --> %s" % (uri, file)
        else:
            urllib.urlretrieve(uri, file)
            if (self.verbose):
                print "created: %s -> %s" % (uri, file)
            if (timestamp is not None):
                unixtime = int(timestamp)  #get rid of any fractional seconds
                os.utime(file, (unixtime, unixtime))
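
# Hedged sketch for update_resource(): with dryrun=True nothing is fetched,
# the planned GET is just printed; the URI and file path are hypothetical.
c = Client(dryrun=True)
c.update_resource('http://example.org/data/a.txt', '/tmp/data/a.txt')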

    def parse_sitemap(self):
        s = Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile)
        if (self.verbose):
            print "Reading sitemap(s) from %s ..." % (sitemap)
        i = s.read(sitemap)
        num_entries = len(i)
        print "Read sitemap with %d entries in %d sitemaps" % (
            num_entries, s.sitemaps_created)
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries > to_show):
                print "Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str)
            n = 0
            for r in i.resource_uris():
                print i.resources[r]
                n += 1
                if (n >= to_show):
                    break

    def write_sitemap(self, outfile=None, capabilities=None, dump=None):
        # Set up base_path->base_uri mappings, get inventory from disk
        i = self.inventory
        i.capabilities = capabilities
        s = Sitemap(verbose=self.verbose,
                    pretty_xml=True,
                    allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print s.inventory_as_xml(i)
        else:
            s.write(i, basename=outfile)
        self.write_dump_if_requested(i, dump)
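
# Hedged sketch for write_sitemap() in this older API: scan files under the
# hypothetical mapping and write them as a sitemap to sitemap.xml.
c = Client()
c.set_mappings(['http://example.org/data', '/tmp/data'])
c.write_sitemap(outfile='sitemap.xml')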

    def changeset_sitemap(self,
                          outfile=None,
                          ref_sitemap=None,
                          capabilities=None,
                          dump=None):
        # 1. Get and parse reference sitemap
        rs = Sitemap(verbose=self.verbose,
                     allow_multifile=self.allow_multifile,
                     mapper=self.mapper)
        if (self.verbose):
            print "Reading sitemap(s) from %s ..." % (ref_sitemap)
        ri = rs.read(ref_sitemap)
        num_entries = len(ri)
        print "Read reference sitemap with %d entries in %d sitemaps" % (
            num_entries, rs.sitemaps_created)
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries > to_show):
                print "Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str)
            n = 0
            for r in ri.resource_uris():
                print ri.resources[r]
                n += 1
                if (n >= to_show):
                    break
        # 2. Set up base_path->base_uri mappings, get inventory from disk
        disk_inventory = self.inventory
        # 3. Calculate changeset
        (num_same, updated, deleted, created) = ri.compare(disk_inventory)
        changeset = Inventory()
        changeset.capabilities = capabilities
        changeset.add(disk_inventory.changeset(updated, changetype='updated'))
        changeset.add(ri.changeset(deleted, changetype='deleted'))
        changeset.add(disk_inventory.changeset(created, changetype='created'))
        # 4. Write out changeset
        s = Sitemap(verbose=self.verbose,
                    pretty_xml=True,
                    allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print s.inventory_as_xml(changeset)
        else:
            s.write(changeset, basename=outfile)
        self.write_dump_if_requested(changeset, dump)
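
# Hedged sketch for changeset_sitemap(): compare a hypothetical reference
# sitemap with the files on disk and print the changeset XML (no outfile).
c = Client()
c.set_mappings(['http://example.org/data', '/tmp/data'])
c.changeset_sitemap(ref_sitemap='old_sitemap.xml')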

    def write_dump_if_requested(self, inventory, dump):
        if (dump is None):
            return
        if (self.verbose):
            print "Writing dump to %s..." % (dump)
        d = Dump(format=self.dump_format)
        d.write(inventory=inventory, dumpfile=dump)