def list_patch_files(self, resourcelist, max_files=-1):
    """Append resources with the name pattern 'rdf_out_*' to a resourcelist.

    All matching resources in self.resource_dir are included except for the
    last one in alphabetical sort order (presumably the file still being
    written to — TODO confirm against the producer). If max_files is set to
    a value greater than 0, at most max_files resources are appended.

    :param resourcelist: the resourcelist to append to
    :param max_files: the maximum number of resources to append to the list
        (<= 0 means no limit)
    :return: True if the list includes the one-but-last rdf_out_* file in
        resource_dir, False otherwise
    """
    rdf_out_files = sorted(
        glob(os.path.join(self.resource_dir, PATTERN_RDF_OUT + "*")))
    if len(rdf_out_files) > 0:
        rdf_out_files.pop()  # exclude the last file in sort order
    n = 0
    # renamed loop variable: 'file' shadows the builtin
    for file_path in rdf_out_files:
        filename = os.path.basename(file_path)
        timestamp = self.extract_timestamp(file_path)
        length = os.stat(file_path).st_size
        md5 = compute_md5_for_file(file_path)
        resourcelist.add(
            Resource(self.publish_url + filename, md5=md5, length=length,
                     lastmod=timestamp, path=file_path))
        n += 1
        if 0 < max_files == n:
            break
    # True only when every remaining candidate file was appended
    exhausted = len(rdf_out_files) == n
    return exhausted
def list_patch_files(self, resourcelist, max_files=-1):
    """Append resources with the name pattern 'rdf_out_*' to a resourcelist.

    All matching resources in resource_dir are included except for the last
    one in alphabetical sort order. If max_files is greater than 0, at most
    max_files resources are appended.

    :param resourcelist: the resourcelist to append to
    :param max_files: the maximum number of resources to append to the list
    :return: True if the list includes the one-but-last rdf_out_* file in
        resource_dir, False otherwise
    """
    pattern = os.path.join(self.resource_dir, PATTERN_RDF_OUT + "*")
    # slice off the last file in sort order; [:-1] on an empty list is
    # still an empty list, matching the guarded pop() behavior
    candidates = sorted(glob(pattern))[:-1]
    appended = 0
    for path in candidates:
        resourcelist.add(
            Resource(self.publish_url + os.path.basename(path),
                     md5=compute_md5_for_file(path),
                     length=os.stat(path).st_size,
                     lastmod=self.extract_timestamp(path),
                     path=path))
        appended += 1
        if 0 < max_files == appended:
            break
    return appended == len(candidates)
def update_resource(self, resource, file, change=None):
    """Update resource from uri to file on local system.

    Update means three things:
    1. GET the resource
    2. set mtime in local time to be equal to timestamp in UTC (should
       perhaps at least warn if different from Last-Modified of the GET
       response, or if different from / earlier than the lastmod expected
       from the resource list)
    3. check that the downloaded resource matches expected information

    Also updates self.last_timestamp if the timestamp (in source frame)
    of this resource is later than the current value.

    :param resource: the Resource to fetch (provides uri, timestamp,
        length, md5)
    :param file: local filesystem path to write the resource to
    :param change: optional change type recorded in the logged event
    :return: the number of resources updated/created (0 or 1)
    :raises ClientFatalError: if the GET fails and ignore_failures is off
    """
    path = os.path.dirname(file)
    distutils.dir_util.mkpath(path)  # ensure the target directory exists
    num_updated = 0
    if (self.dryrun):
        self.logger.info("dryrun: would GET %s --> %s" % (resource.uri, file))
    else:
        # 1. GET
        try:
            urllib.urlretrieve(resource.uri, file)
            num_updated += 1
        except IOError as e:
            msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
            if (self.ignore_failures):
                self.logger.warning(msg)
                # bug fix: previously a bare 'return' yielded None, breaking
                # the documented contract of returning a count (0 or 1)
                return num_updated
            else:
                raise ClientFatalError(msg)
        # 2. set timestamp if we have one
        if (resource.timestamp is not None):
            unixtime = int(resource.timestamp)  # no fractional seconds
            os.utime(file, (unixtime, unixtime))
            if (resource.timestamp > self.last_timestamp):
                self.last_timestamp = resource.timestamp
        self.log_event(Resource(resource=resource, change=change))
        # 3. sanity check: size and (optionally) checksum of the download
        length = os.stat(file).st_size
        if (resource.length != length):
            self.logger.info(
                "Downloaded size for %s of %d bytes does not match expected %d bytes"
                % (resource.uri, length, resource.length))
        if (self.checksum and resource.md5 is not None):
            file_md5 = compute_md5_for_file(file)
            if (resource.md5 != file_md5):
                self.logger.info(
                    "MD5 mismatch for %s, got %s but expected %s bytes"
                    % (resource.uri, file_md5, resource.md5))
    return num_updated
def update_resource(self, resource, file, change=None):
    """Update resource from uri to file on local system.

    Update means three things:
    1. GET the resource
    2. set mtime in local time to be equal to timestamp in UTC (should
       perhaps at least warn if different from Last-Modified of the GET
       response, or if different from / earlier than the lastmod expected
       from the resource_list)
    3. check that the downloaded resource matches expected information

    Also updates self.last_timestamp if the timestamp (in source frame)
    of this resource is later than the current value.

    Returns the number of resources updated/created (0 or 1).
    """
    # ensure the target directory for 'file' exists
    path = os.path.dirname(file)
    distutils.dir_util.mkpath(path)
    num_updated=0
    if (self.dryrun):
        self.logger.info("dryrun: would GET %s --> %s" % (resource.uri,file))
    else:
        # 1. GET
        try:
            urllib.urlretrieve(resource.uri,file)
            num_updated+=1
        except IOError as e:
            msg = "Failed to GET %s -- %s" % (resource.uri,str(e))
            if (self.ignore_failures):
                self.logger.warning(msg)
                # NOTE(review): bare return yields None, not the documented
                # 0/1 count — confirm callers tolerate None here
                return
            else:
                raise ClientFatalError(msg)
        # 2. set timestamp if we have one
        if (resource.timestamp is not None):
            unixtime = int(resource.timestamp) #no fractional
            os.utime(file,(unixtime,unixtime))
            # track the latest source-frame timestamp seen so far
            if (resource.timestamp > self.last_timestamp):
                self.last_timestamp = resource.timestamp
        self.log_event(Resource(resource=resource, change=change))
        # 3. sanity check: compare actual size (and optionally md5)
        # against what the resource metadata promised
        length = os.stat(file).st_size
        if (resource.length != length):
            self.logger.info("Downloaded size for %s of %d bytes does not match expected %d bytes" % (resource.uri,length,resource.length))
        if (self.checksum and resource.md5 is not None):
            file_md5 = compute_md5_for_file(file)
            if (resource.md5 != file_md5):
                self.logger.info("MD5 mismatch for %s, got %s but expected %s bytes" % (resource.uri,file_md5,resource.md5))
    return(num_updated)
def _create_resource(self, basename=None, file_path=None, notify_observers=True):
    """Create a new resource, add it to the source, notify observers.

    :param basename: key under which the resource is stored in the
        repository
    :param file_path: path of the local file backing the resource
    :param notify_observers: if True, notify observers with a "created"
        change event
    """
    # bug fix: open().read() leaked the file handle; close it promptly
    with open(file_path) as payload_file:
        payload = payload_file.read()
    md5 = compute_md5_for_file(file_path)
    self._repository[basename] = {
        'timestamp': os.path.getmtime(file_path),
        'length': len(payload),
        'md5': md5
    }
    if notify_observers:
        change = Resource(resource=self.resource(basename), change="created")
        self.notify_observers(change)
def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
    """Dump local resources in resourcelist to a zip file with the specified prefix.

    The index in the zip file name will be 1 higher than the last zip file
    index with the same prefix. A manifest.xml will be included in the zip.
    -- The resync.Dump.write_zip method used in this method has the side
    effect of changing local paths in resourcelist into paths relative in zip.

    :param resourcelist: resources to zip
    :param prefix: prefix of the zip file
    :param write_list: True if resourcelist should be written to local disc.
        Default: False
    :param write_manifest: True if a separate manifest file should be
        written to disc, False otherwise. Default: True
    :return: the created zip as a resync.Resource.
    """
    # md_at attribute gets lost in a read > write cycle with the resync
    # library, so it is left unset (would be w3cdt.datetime_to_str(no_fractions=True))
    md_at = None
    index = -1
    zipfiles = sorted(glob(os.path.join(self.publish_dir, prefix + "*.zip")))
    if len(zipfiles) > 0:
        last_zip_file = zipfiles[-1]
        basename = os.path.basename(last_zip_file)
        # bug fix: raw string — '\d' is an invalid escape in a plain literal
        index = int(re.findall(r'\d+', basename)[0])
    zip_name = "%s%05d" % (prefix, index + 1)
    if (write_list):
        # this is the given resourcelist with local paths. As such it is
        # *not* the resourcedump_manifest.
        # with-block ensures the handle is closed even if write() raises
        with open(os.path.join(self.publish_dir, zip_name + ".xml"), "w") as rl_file:
            rl_file.write(resourcelist.as_xml())
    zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
    dump = Dump()
    dump.path_prefix = self.resource_dir
    dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.
    # md_completed also gets lost in a read > write cycle with the resync library
    md_completed = None
    loc = self.publish_url + zip_name + ".zip"  # mandatory
    lastmod = self.last_modified(resourcelist)  # optional
    md_type = "application/zip"  # recommended
    md_length = os.stat(zip_path).st_size
    md5 = compute_md5_for_file(zip_path)
    zip_resource = Resource(uri=loc, lastmod=lastmod,
                            length=md_length, md5=md5, mime_type=md_type,
                            md_at=md_at, md_completed=md_completed)
    if write_manifest:
        rdm = ResourceDumpManifest(resources=resourcelist.resources)
        rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
        with open(os.path.join(self.publish_dir,
                               PREFIX_MANIFEST + zip_name + ".xml"), "w") as rdm_file:
            rdm_file.write(rdm.as_xml())
        zip_resource.link_set(rel="content", href=rdm_url)
    return zip_resource
def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
    """Dump local resources in resourcelist to a zip file with the specified prefix.

    The index in the zip file name will be 1 higher than the last zip file
    index with the same prefix. A manifest.xml will be included in the zip.
    -- The resync.Dump.write_zip method used in this method has the side
    effect of changing local paths in resourcelist into paths relative in zip.

    :param resourcelist: resources to zip
    :param prefix: prefix of the zip file
    :param write_list: True if resourcelist should be written to local disc.
        Default: False
    :param write_manifest: True if a separate manifest file should be
        written to disc, False otherwise. Default: True
    :return: the created zip as a resync.Resource.
    """
    # md_at gets lost in a read > write cycle with the resync library,
    # so it is left unset here
    md_at = None
    index = -1
    zipfiles = sorted(glob(os.path.join(self.publish_dir, prefix + "*.zip")))
    if len(zipfiles) > 0:
        last_zip_file = zipfiles[-1]
        basename = os.path.basename(last_zip_file)
        # bug fix: use a raw string — "\d" is an invalid escape sequence
        index = int(re.findall(r"\d+", basename)[0])
    zip_name = "%s%05d" % (prefix, index + 1)
    if write_list:
        # this is the given resourcelist with local paths. As such it is
        # *not* the resourcedump_manifest.
        # with-block guarantees the file is closed even on a write error
        with open(os.path.join(self.publish_dir, zip_name + ".xml"), "w") as rl_file:
            rl_file.write(resourcelist.as_xml())
    zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
    dump = Dump()
    dump.path_prefix = self.resource_dir
    dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.
    # md_completed also gets lost in a read > write cycle with the resync library
    md_completed = None
    loc = self.publish_url + zip_name + ".zip"  # mandatory
    lastmod = self.last_modified(resourcelist)  # optional
    md_type = "application/zip"  # recommended
    md_length = os.stat(zip_path).st_size
    md5 = compute_md5_for_file(zip_path)
    zip_resource = Resource(
        uri=loc,
        lastmod=lastmod,
        length=md_length,
        md5=md5,
        mime_type=md_type,
        md_at=md_at,
        md_completed=md_completed,
    )
    if write_manifest:
        rdm = ResourceDumpManifest(resources=resourcelist.resources)
        rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
        with open(os.path.join(self.publish_dir,
                               PREFIX_MANIFEST + zip_name + ".xml"), "w") as rdm_file:
            rdm_file.write(rdm.as_xml())
        zip_resource.link_set(rel="content", href=rdm_url)
    return zip_resource