def from_disk(self, resource_list=None, paths=None):
    """Create or extend resource_list with resources from disk scan

    Assumes very simple disk path to URL mapping (in self.mapping): chop
    path and replace with url_path. Returns the new or extended
    ResourceList object.

    If a resource_list is specified then items are added to that rather
    than creating a new one.

    If paths is specified then these are used instead of the set of local
    paths in self.mapping.

    Example usage with mapping start paths:

        mapper=Mapper('http://example.org/path','/path/to/files')
        rlb = ResourceListBuilder(mapper=mapper)
        m = rlb.from_disk()

    Example usage with explicit paths:

        mapper=Mapper('http://example.org/path','/path/to/files')
        rlb = ResourceListBuilder(mapper=mapper)
        m = rlb.from_disk(paths=['/path/to/files/a','/path/to/files/b'])
    """
    # Either use resource_list passed in or make a new one
    # (removed unused `num` counter that was never read)
    if (resource_list is None):
        resource_list = ResourceList()
    # Compile exclude pattern matches
    self.compile_excludes()
    # Work out start paths from mappings if not explicitly specified
    # (renamed loop variable so it no longer shadows the builtin `map`)
    if (paths is None):
        paths = [mapping.dst_path for mapping in self.mapper.mappings]
    # Set start time unless already set (perhaps because building in chunks)
    if (resource_list.md_at is None):
        resource_list.md_at = datetime_to_str()
    # Run for each map in the mappings
    for path in paths:
        self.logger.info("Scanning disk from %s" % (path))
        self.from_disk_add_path(path=path, resource_list=resource_list)
    # Set end time
    resource_list.md_completed = datetime_to_str()
    return (resource_list)
def from_disk(self, resource_list=None, paths=None):
    """Create or extend resource_list with resources from disk scan

    Assumes very simple disk path to URL mapping (in self.mapping): chop
    path and replace with url_path. Returns the new or extended
    ResourceList object.

    If a resource_list is specified then items are added to that rather
    than creating a new one.

    If paths is specified then these are used instead of the set of local
    paths in self.mapping.

    Example usage with mapping start paths:

        mapper=Mapper('http://example.org/path','/path/to/files')
        rlb = ResourceListBuilder(mapper=mapper)
        m = rlb.from_disk()

    Example usage with explicit paths:

        mapper=Mapper('http://example.org/path','/path/to/files')
        rlb = ResourceListBuilder(mapper=mapper)
        m = rlb.from_disk(paths=['/path/to/files/a','/path/to/files/b'])
    """
    # Either use resource_list passed in or make a new one
    # (removed unused `num` counter that was never read)
    if (resource_list is None):
        resource_list = ResourceList()
    # Compile exclude pattern matches
    self.compile_excludes()
    # Work out start paths from mappings if not explicitly specified
    # (renamed loop variable so it no longer shadows the builtin `map`)
    if (paths is None):
        paths = [mapping.dst_path for mapping in self.mapper.mappings]
    # Set start time unless already set (perhaps because building in chunks)
    if (resource_list.md_at is None):
        resource_list.md_at = datetime_to_str()
    # Run for each map in the mappings
    for path in paths:
        self.logger.info("Scanning disk from %s" % (path))
        self.from_disk_add_path(path=path, resource_list=resource_list)
    # Set end time
    resource_list.md_completed = datetime_to_str()
    return (resource_list)
def _str_datetime_now(self, x=None): """Return datetime string for use with time attributes Handling depends on input: 'now' - returns datetime for now number - assume datetime values, generate string other - no change, return same value """ if (x == 'now'): # Now, this is wht datetime_to_str() with no arg gives return( datetime_to_str() ) try: # Test for number junk = x + 0.0 return datetime_to_str(x) except TypeError: # Didn't look like a number, treat as string return x
def _str_datetime_now(self, x=None): """Return datetime string for use with time attributes Handling depends on input: 'now' - returns datetime for now number - assume datetime values, generate string other - no change, return same value """ if (x == 'now'): # Now, this is wht datetime_to_str() with no arg gives return (datetime_to_str()) try: # Test for number junk = x + 0.0 return datetime_to_str(x) except TypeError: # Didn't look like a number, treat as string return x
def default_capability_and_modified(self):
    """Set capability name and modified time in md

    Every ResourceSync document should have these two top-level
    metadata attributes.
    """
    md = self.md
    # Only set capability if absent and a default is configured
    if ('capability' not in md and self.capability_md is not None):
        md['capability'] = self.capability_md
    # Only set modified if absent; whole-second resolution
    if ('modified' not in md):
        md['modified'] = datetime_to_str(no_fractions=True)
def head_on_file(self, file):
    """Mock up requests.head(..) response on local file """
    response = HeadResponse()
    if (os.path.isfile(file)):
        # File exists: report 200 plus the headers a HEAD would carry
        # NOTE: status codes are strings here, matching the original mock
        response.status_code = '200'
        response.headers['last-modified'] = datetime_to_str(os.path.getmtime(file))
        response.headers['content-length'] = os.path.getsize(file)
    else:
        response.status_code = '404'
    return (response)
def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
    """Incremental synchronization

    Use Change List to do incremental sync.

    Args:
        allow_deletion: if True, deletions listed in the change list are
            applied locally; otherwise they are only counted/reported
        change_list_uri: explicit change list URI; if None the default
            change list name is resolved relative to the sitemap
        from_datetime: explicit starting datetime string; if None the
            stored client state timestamp for this sitemap is used

    Raises:
        ClientFatalError: on missing/unsafe mappings, bad --from value,
            no starting timestamp, unreadable change list, missing change
            timestamps, or authority violations (unless --noauth)
    """
    self.logger.debug("Starting incremental sync")
    ### 0. Sanity checks
    if (len(self.mapper) < 1):
        raise ClientFatalError(
            "No source to destination mapping specified")
    if (self.mapper.unsafe()):
        raise ClientFatalError(
            "Source to destination mappings unsafe: %s" % str(self.mapper))
    from_timestamp = None
    if (from_datetime is not None):
        try:
            from_timestamp = str_to_datetime(from_datetime)
        except ValueError:
            raise ClientFatalError("Bad datetime in --from (%s)" %
                                   from_datetime)
    ### 1. Work out where to start from
    if (from_timestamp is None):
        from_timestamp = ClientState().get_state(self.sitemap)
        if (from_timestamp is None):
            raise ClientFatalError(
                "Cannot do incremental sync. No stored timestamp for this site, and no explicit --from.")
    ### 2. Get URI of change list, from sitemap or explicit
    if (change_list_uri):
        # Translate as necessary using maps
        change_list = self.sitemap_uri(change_list_uri)
    else:
        # Try default name
        change_list = self.sitemap_uri(self.change_list_name)
    ### 3. Read change list from source
    try:
        self.logger.info("Reading change list %s" % (change_list))
        src_change_list = ChangeList()
        src_change_list.read(uri=change_list)
        self.logger.debug("Finished reading change list")
    except Exception as e:
        raise ClientFatalError(
            "Can't read source change list from %s (%s)" %
            (change_list, str(e)))
    self.logger.info("Read source change list, %d changes listed" %
                     (len(src_change_list)))
    #if (len(src_change_list)==0):
    #    raise ClientFatalError("Aborting as there are no resources to sync")
    if (self.checksum and not src_change_list.has_md5()):
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source change list")
    # Check all changes have timestamp and record last
    self.last_timestamp = 0
    for resource in src_change_list:
        if (resource.timestamp is None):
            # FIX: was `% (uri)` which is undefined at this point and
            # would raise NameError instead of the intended error
            raise ClientFatalError(
                "Aborting - missing timestamp for change in %s" %
                (resource.uri))
        if (resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
    ### 4. Check that the change list has authority over URIs listed
    # FIXME - What does authority mean for change list? Here use both the
    # change list URI and, if we used it, the sitemap URI
    if (not self.noauth):
        uauth_cs = UrlAuthority(change_list, self.strictauth)
        if (not change_list_uri):
            uauth_sm = UrlAuthority(self.sitemap)
        for resource in src_change_list:
            if (not uauth_cs.has_authority_over(resource.uri) and
                    (change_list_uri or
                     not uauth_sm.has_authority_over(resource.uri))):
                raise ClientFatalError(
                    "Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" %
                    (change_list, resource.uri))
    ### 5. Prune entries before starting timestamp and dupe changes for a resource
    num_skipped = src_change_list.prune_before(from_timestamp)
    if (num_skipped > 0):
        self.logger.info("Skipped %d changes before %s" %
                         (num_skipped, datetime_to_str(from_timestamp)))
    num_dupes = src_change_list.prune_dupes()
    if (num_dupes > 0):
        self.logger.info("Removed %d prior changes" % (num_dupes))
    # Review and log status before
    # FIXME - should at this stage prune the change list to pick out
    # only the last change for each resource
    to_update = 0
    to_create = 0
    to_delete = 0
    for resource in src_change_list:
        if (resource.change == 'updated'):
            to_update += 1
        elif (resource.change == 'created'):
            to_create += 1
        elif (resource.change == 'deleted'):
            to_delete += 1
        else:
            raise ClientError("Unknown change type %s" % (resource.change))
    # Log status based on what we know from the Change List. Exit if
    # either there are no changes or if there are only deletions and
    # we don't allow deletion
    in_sync = ((to_update + to_delete + to_create) == 0)
    self.log_status(in_sync=in_sync, incremental=True, created=to_create,
                    updated=to_update, deleted=to_delete)
    if (in_sync or ((to_update + to_create) == 0 and not allow_deletion)):
        self.logger.debug("Completed incremental")
        return
    ### 6. Apply changes at same time or after from_timestamp
    delete_msg = (", and delete %d resources" %
                  to_delete) if (allow_deletion) else ''
    self.logger.warning("Will apply %d changes%s" %
                        (len(src_change_list), delete_msg))
    num_updated = 0
    num_deleted = 0
    num_created = 0
    for resource in src_change_list:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        if (resource.change == 'updated'):
            self.logger.info("updated: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'updated')
            num_updated += 1
        elif (resource.change == 'created'):
            self.logger.info("created: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'created')
            num_created += 1
        elif (resource.change == 'deleted'):
            # delete_resource returns the number actually deleted
            num_deleted += self.delete_resource(resource, file,
                                                allow_deletion)
        else:
            raise ClientError("Unknown change type %s" % (resource.change))
    ### 7. Report status and planned actions
    self.log_status(incremental=True, created=num_created,
                    updated=num_updated, deleted=num_deleted,
                    to_delete=to_delete)
    ### 8. Record last timestamp we have seen
    if (self.last_timestamp > 0):
        ClientState().set_state(self.sitemap, self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" %
                         (datetime_to_str(self.last_timestamp)))
    ### 9. Done
    self.logger.debug("Completed incremental sync")
def baseline_or_audit(self, allow_deletion=False, audit_only=False):
    """Baseline synchronization or audit

    Both functions implemented in this routine because audit is a
    prerequisite for a baseline sync. In the case of baseline sync the
    last timestamp seen is recorded as client state.

    Args:
        allow_deletion: if True, resources present locally but absent
            from the source list are deleted; otherwise only counted
        audit_only: if True, stop after comparing source and destination
            lists (no files transferred, no state written)

    Raises:
        ClientFatalError: on missing/unsafe mappings, unreadable or
            empty source resource list, or (unless self.noauth) when the
            sitemap lacks authority over a listed resource URI
    """
    action = ('audit' if (audit_only) else 'baseline sync')
    self.logger.debug("Starting " + action)
    ### 0. Sanity checks
    if (len(self.mapper) < 1):
        raise ClientFatalError(
            "No source to destination mapping specified")
    # Unsafe mappings matter only when we would write to disk
    if (not audit_only and self.mapper.unsafe()):
        raise ClientFatalError(
            "Source to destination mappings unsafe: %s" % str(self.mapper))
    ### 1. Get inventories from both src and dst
    # 1.a source resource list
    try:
        self.logger.info("Reading sitemap %s" % (self.sitemap))
        src_resource_list = ResourceList(
            allow_multifile=self.allow_multifile, mapper=self.mapper)
        src_resource_list.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as e:
        raise ClientFatalError(
            "Can't read source resource list from %s (%s)" %
            (self.sitemap, str(e)))
    self.logger.info("Read source resource list, %d resources listed" %
                     (len(src_resource_list)))
    if (len(src_resource_list) == 0):
        raise ClientFatalError(
            "Aborting as there are no resources to sync")
    # Skip destination checksums if the source doesn't provide them
    if (self.checksum and not src_resource_list.has_md5()):
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source resource list")
    # 1.b destination resource list mapped back to source URIs
    rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
    dst_resource_list = rlb.from_disk()
    ### 2. Compare these resource lists respecting any comparison options
    (same, updated, deleted, created) = dst_resource_list.compare(src_resource_list)
    ### 3. Report status and planned actions
    self.log_status(in_sync=(len(updated) + len(deleted) + len(created) == 0),
                    audit=True, same=len(same), created=len(created),
                    updated=len(updated), deleted=len(deleted))
    # Audit-only, or nothing to do: stop here
    if (audit_only or len(created) + len(updated) + len(deleted) == 0):
        self.logger.debug("Completed " + action)
        return
    ### 4. Check that sitemap has authority over URIs listed
    if (not self.noauth):
        uauth = UrlAuthority(self.sitemap, strict=self.strictauth)
        for resource in src_resource_list:
            if (not uauth.has_authority_over(resource.uri)):
                raise ClientFatalError(
                    "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" %
                    (self.sitemap, resource.uri))
    ### 5. Grab files to do sync
    delete_msg = (", and delete %d resources" %
                  len(deleted)) if (allow_deletion) else ''
    self.logger.warning("Will GET %d resources%s" %
                        (len(created) + len(updated), delete_msg))
    # last_timestamp is updated as a side effect of update_resource
    self.last_timestamp = 0
    num_created = 0
    num_updated = 0
    num_deleted = 0
    for resource in created:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        self.logger.info("created: %s -> %s" % (uri, file))
        num_created += self.update_resource(resource, file, 'created')
    for resource in updated:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        self.logger.info("updated: %s -> %s" % (uri, file))
        num_updated += self.update_resource(resource, file, 'updated')
    for resource in deleted:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        num_deleted += self.delete_resource(resource, file, allow_deletion)
    ### 6. Store last timestamp to allow incremental sync
    if (not audit_only and self.last_timestamp > 0):
        ClientState().set_state(self.sitemap, self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" %
                         (datetime_to_str(self.last_timestamp)))
    ### 7. Done
    self.log_status(in_sync=(len(updated) + len(deleted) + len(created) == 0),
                    same=len(same), created=num_created,
                    updated=num_updated, deleted=num_deleted,
                    to_delete=len(deleted))
    self.logger.debug("Completed %s" % (action))
def lastmod(self):
    """The Last-Modified data in W3C Datetime syntax, Z notation"""
    ts = self.timestamp
    # No timestamp recorded -> no lastmod value
    return None if ts is None else datetime_to_str(ts)
def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
    """Incremental synchronization

    Reads a Change List from the source and applies the changes listed
    (after the stored or --from timestamp) to the local destination.

    Args:
        allow_deletion: passed through to delete_resource for 'deleted'
            changes
        change_list_uri: explicit change list URI; if None the default
            change list name is resolved relative to the sitemap
        from_datetime: explicit starting datetime string; if None the
            stored client state timestamp for this sitemap is used
    """
    self.logger.debug("Starting incremental sync")
    ### 0. Sanity checks
    if (len(self.mappings)<1):
        raise ClientFatalError("No source to destination mapping specified")
    from_timestamp = None
    if (from_datetime is not None):
        try:
            from_timestamp = str_to_datetime(from_datetime)
        except ValueError:
            raise ClientFatalError("Bad datetime in --from (%s)" % from_datetime)
    ### 1. Work out where to start from
    if (from_timestamp is None):
        from_timestamp=ClientState().get_state(self.sitemap)
        if (from_timestamp is None):
            raise ClientFatalError("No stored timestamp for this site, and no explicit --from")
    ### 2. Get URI of change list, from sitemap or explicit
    if (change_list_uri):
        # Translate as necessary using maps
        change_list = self.sitemap_uri(change_list_uri)
    else:
        # Try default name
        change_list = self.sitemap_uri(self.change_list_name)
    ### 3. Read change list from source
    try:
        self.logger.info("Reading change list %s" % (change_list))
        src_change_list = ChangeList()
        src_change_list.read(uri=change_list)
        self.logger.debug("Finished reading change list")
    except Exception as e:
        raise ClientFatalError("Can't read source change list from %s (%s)" % (change_list,str(e)))
    self.logger.info("Read source change list, %d changes listed" % (len(src_change_list)))
    #if (len(src_change_list)==0):
    #    raise ClientFatalError("Aborting as there are no resources to sync")
    # Skip destination checksums if the source doesn't provide them
    if (self.checksum and not src_change_list.has_md5()):
        self.checksum=False
        self.logger.info("Not calculating checksums on destination as not present in source change list")
    # Check all changes have timestamp and record last
    self.last_timestamp = 0
    for resource in src_change_list:
        if (resource.timestamp is None):
            # NOTE(review): `uri` is not defined at this point (it is only
            # assigned in the apply loop below), so hitting this branch
            # would raise NameError rather than the intended error.
            raise ClientFatalError("Aborting - missing timestamp for change in %s" % (uri))
        if (resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
    ### 4. Check that the change list has authority over URIs listed
    # FIXME - What does authority mean for change list? Here use both the
    # change list URI and, if we used it, the sitemap URI
    uauth_cs = UrlAuthority(change_list)
    if (not change_list_uri):
        uauth_sm = UrlAuthority(self.sitemap)
    for resource in src_change_list:
        if (not uauth_cs.has_authority_over(resource.uri) and
            (change_list_uri or not uauth_sm.has_authority_over(resource.uri))):
            if (self.noauth):
                #self.logger.info("Change list (%s) mentions resource at a location it does not have authority over (%s)" % (change_list,resource.uri))
                pass
            else:
                raise ClientFatalError("Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (change_list,resource.uri))
    ### 5. Prune entries before starting timestamp and dupe changes for a resource
    num_skipped = src_change_list.prune_before(from_timestamp)
    if (num_skipped>0):
        self.logger.info("Skipped %d changes before %s" % (num_skipped,datetime_to_str(from_timestamp)))
    num_dupes = src_change_list.prune_dupes()
    if (num_dupes>0):
        self.logger.info("Removed %d prior changes" % (num_dupes))
    ### 6. Apply changes at same time or after from_timestamp
    self.logger.info("Applying %d changes" % (len(src_change_list)))
    num_updated = 0
    num_deleted = 0
    num_created = 0
    for resource in src_change_list:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        if (resource.change == 'updated'):
            self.logger.info("updated: %s -> %s" % (uri,file))
            self.update_resource(resource,file,'updated')
            num_updated+=1
        elif (resource.change == 'created'):
            self.logger.info("created: %s -> %s" % (uri,file))
            self.update_resource(resource,file,'created')
            num_created+=1
        elif (resource.change == 'deleted'):
            # NOTE(review): num_deleted is incremented even when
            # allow_deletion is False and nothing was actually deleted
            self.delete_resource(resource,file,allow_deletion)
            num_deleted+=1
        else:
            raise ClientError("Unknown change type %s" % (resource.change) )
    ### 7. Report status and planned actions
    self.log_status(in_sync=((num_updated+num_deleted+num_created)==0),
                    incremental=True,created=num_created, updated=num_updated,
                    deleted=num_deleted)
    ### 8. Record last timestamp we have seen
    if (self.last_timestamp>0):
        ClientState().set_state(self.sitemap,self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
    ### 9. Done
    self.logger.debug("Completed incremental sync")
def baseline_or_audit(self, allow_deletion=False, audit_only=False):
    """Baseline synchronization or audit

    Both functions implemented in this routine because audit is a
    prerequisite for a baseline sync. In the case of baseline sync the
    last timestamp seen is recorded as client state.

    Args:
        allow_deletion: passed through to delete_resource for resources
            present locally but absent from the source list
        audit_only: if True, stop after comparing source and destination
            lists (no files transferred, no state written)
    """
    action = ( 'audit' if (audit_only) else 'baseline sync' )
    self.logger.debug("Starting "+action)
    ### 0. Sanity checks
    if (len(self.mappings)<1):
        raise ClientFatalError("No source to destination mapping specified")
    ### 1. Get inventories from both src and dst
    # 1.a source resource_list
    try:
        self.logger.info("Reading sitemap %s" % (self.sitemap))
        src_resource_list = ResourceList(allow_multifile=self.allow_multifile, mapper=self.mapper)
        src_resource_list.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as e:
        raise ClientFatalError("Can't read source resource_list from %s (%s)" % (self.sitemap,str(e)))
    self.logger.info("Read source resource_list, %d resources listed" % (len(src_resource_list)))
    if (len(src_resource_list)==0):
        raise ClientFatalError("Aborting as there are no resources to sync")
    # Skip destination checksums if the source doesn't provide them
    if (self.checksum and not src_resource_list.has_md5()):
        self.checksum=False
        self.logger.info("Not calculating checksums on destination as not present in source resource_list")
    # 1.b destination resource_list mapped back to source URIs
    rlb = ResourceListBuilder(mapper=self.mapper)
    rlb.do_md5=self.checksum
    dst_resource_list = rlb.from_disk()
    ### 2. Compare these resource_lists respecting any comparison options
    (same,updated,deleted,created)=dst_resource_list.compare(src_resource_list)
    ### 3. Report status and planned actions
    self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                    audit=True,same=len(same),created=len(created),
                    updated=len(updated),deleted=len(deleted))
    # Audit-only, or nothing to do: stop here
    if (audit_only or len(created)+len(updated)+len(deleted)==0):
        self.logger.debug("Completed "+action)
        return
    ### 4. Check that sitemap has authority over URIs listed
    uauth = UrlAuthority(self.sitemap)
    for resource in src_resource_list:
        if (not uauth.has_authority_over(resource.uri)):
            if (self.noauth):
                #self.logger.info("Sitemap (%s) mentions resource at a location it does not have authority over (%s)" % (self.sitemap,resource.uri))
                pass
            else:
                raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap,resource.uri))
    ### 5. Grab files to do sync
    delete_msg = (", and delete %d resources" % len(deleted)) if (allow_deletion) else ''
    self.logger.warning("Will GET %d resources%s" % (len(created)+len(updated),delete_msg))
    # last_timestamp is updated as a side effect of update_resource
    self.last_timestamp = 0
    num_created=0
    num_updated=0
    num_deleted=0
    for resource in created:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        self.logger.info("created: %s -> %s" % (uri,file))
        num_created+=self.update_resource(resource,file,'created')
    for resource in updated:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        self.logger.info("updated: %s -> %s" % (uri,file))
        num_updated+=self.update_resource(resource,file,'updated')
    for resource in deleted:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        num_deleted+=self.delete_resource(resource,file,allow_deletion)
    ### 6. Store last timestamp to allow incremental sync
    if (not audit_only and self.last_timestamp>0):
        ClientState().set_state(self.sitemap,self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
    ### 7. Done
    self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                    same=len(same),created=num_created,
                    updated=num_updated,deleted=num_deleted)
    self.logger.debug("Completed %s" % (action))
def md_until(self):
    """md_until value in W3C Datetime syntax, Z notation"""
    ts = self._get_extra('ts_until')
    return datetime_to_str(ts)
def md_from(self):
    """md_from value in W3C Datetime syntax, Z notation"""
    ts = self._get_extra('ts_from')
    return datetime_to_str(ts)
def md_completed(self):
    """md_completed value in W3C Datetime syntax, Z notation"""
    ts = self._get_extra('ts_completed')
    return datetime_to_str(ts)
def md_at(self):
    """md_at values in W3C Datetime syntax, Z notation"""
    ts = self._get_extra('ts_at')
    return datetime_to_str(ts)
def lastmod(self):
    """The Last-Modified data in W3C Datetime syntax, Z notation

    Returns None when no timestamp is set, matching the guarded
    lastmod implementation elsewhere in this file.
    """
    if self.timestamp is None:
        return None
    return datetime_to_str(self.timestamp)