def add_file(self, resource_list=None, dir=None, file=None):
    """Add a single file to resource_list.

    Follows the object settings set_path, set_md5 and set_length to decide
    which optional attributes are recorded on the new Resource.

    Parameters:
        resource_list: container whose add() method receives the Resource.
        dir: optional directory that is joined in front of file.
        file: file name (or path when dir is None) of the file to add.

    Returns None in all cases. The file is silently skipped when it is
    excluded by self.exclude_file, is not a regular file, or is a symlink
    while self.include_symlinks is false. An OSError raised while examining
    the file is reported on stderr and the file is ignored.

    Raises Exception if the mapper returns None for the file.
    """
    # Name reported in the error message; replaced by the joined path
    # once that has been built.
    filepath = file
    try:
        if self.exclude_file(file):
            self.logger.debug("Excluding file %s" % (file))
            return
        # get abs filename and also URL
        if dir is not None:
            filepath = os.path.join(dir, file)
        if not os.path.isfile(filepath):
            return
        # islink is only consulted when symlinks are not wanted
        if not self.include_symlinks and os.path.islink(filepath):
            return
        uri = self.mapper.dst_to_src(filepath)
        if uri is None:
            raise Exception("Internal error, mapping failed")
        file_stat = os.stat(filepath)
    except OSError as e:
        # Terminate the line so that successive messages do not run together
        sys.stderr.write("Ignoring file %s (error: %s)\n" % (filepath, str(e)))
        return
    r = Resource(uri=uri, timestamp=file_stat.st_mtime)
    if self.set_path:
        # add full local path
        r.path = filepath
    if self.set_md5:
        # add md5
        r.md5 = compute_md5_for_file(filepath)
    if self.set_length:
        # add length
        r.length = file_stat.st_size
    resource_list.add(r)
def add_file(self, resource_list=None, dir=None, file=None):
    """Add a single file to resource_list.

    Follows the object settings set_path, set_md5 and set_length to decide
    which optional attributes are recorded on the new Resource.

    Parameters:
        resource_list: container whose add() method receives the Resource.
        dir: optional directory that is joined in front of file.
        file: file name (or path when dir is None) of the file to add.

    Returns None in all cases. The file is silently skipped when it is
    excluded by self.exclude_file, is not a regular file, or is a symlink
    while self.include_symlinks is false. An OSError raised while examining
    the file is reported on stderr and the file is ignored.

    Raises Exception if the mapper returns None for the file.
    """
    # Name reported in the error message; replaced by the joined path
    # once that has been built.
    filepath = file
    try:
        if self.exclude_file(file):
            self.logger.debug("Excluding file %s" % (file))
            return
        # get abs filename and also URL
        if dir is not None:
            filepath = os.path.join(dir, file)
        if not os.path.isfile(filepath):
            return
        # islink is only consulted when symlinks are not wanted
        if not self.include_symlinks and os.path.islink(filepath):
            return
        uri = self.mapper.dst_to_src(filepath)
        if uri is None:
            raise Exception("Internal error, mapping failed")
        file_stat = os.stat(filepath)
    except OSError as e:
        # Terminate the line so that successive messages do not run together
        sys.stderr.write("Ignoring file %s (error: %s)\n" % (filepath, str(e)))
        return
    r = Resource(uri=uri, timestamp=file_stat.st_mtime)
    if self.set_path:
        # add full local path
        r.path = filepath
    if self.set_md5:
        # add md5
        r.md5 = compute_md5_for_file(filepath)
    if self.set_length:
        # add length
        r.length = file_stat.st_size
    resource_list.add(r)
def write(self, basename='/tmp/sitemap.xml', **kwargs):
    """Write one or a set of sitemap files to disk.

    Resources are taken from self.resources and are accessed through an
    iterator only, so self.resources may be a generator; its length is
    then only known once it has been consumed.

    basename is used as the name of the single sitemap file, or of the
    sitemapindex when a set of sitemap files is written. Any additional
    keyword arguments are passed to the Sitemap constructor.

    If the first call to self.get_resources_chunk reports that more
    resources remain, a set of sitemap files plus a sitemapindex is
    written. The individual sitemaps are named by inserting a five digit
    counter before a trailing ".xml" of basename, or by appending the
    counter and ".xml" when basename does not end in ".xml".

    Raises ListBaseIndexError if more than one sitemap is required but
    self.allow_multifile is not set.
    """
    # Access resources through iterator only
    resources_iter = iter(self.resources)
    chunk, pending = self.get_resources_chunk(resources_iter)
    s = Sitemap(**kwargs)
    if pending is None:
        # Everything fits in a single sitemap. The with block guarantees
        # the file is closed even if serialization fails.
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemap %s..." % (basename))
            s.resources_as_xml(chunk, fh=fh)
        self.logger.info("Wrote sitemap %s" % (basename))
        return
    # More resources remain after the first chunk => sitemapindex
    if not self.allow_multifile:
        raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
    # Work out how to name the sitemaps, attempt to add %05d before
    # ".xml$", else append
    sitemap_suffix = '.xml'
    if basename.endswith('.xml'):
        sitemap_prefix = basename[:-4]
    else:
        sitemap_prefix = basename
    # Use iterator over all resources and count off one chunk per
    # sitemap, storing the names of the sitemaps as we go
    sitemaps = ListBase()
    while len(chunk) > 0:
        file = sitemap_prefix + ("%05d" % (len(sitemaps))) + sitemap_suffix
        self.logger.info("Writing sitemap %s..." % (file))
        with open(file, 'w') as fh:
            s.resources_as_xml(chunk, fh=fh)
        # Record information about this sitemap for index; the file is
        # closed at this point so the stat and md5 see its full content
        r = Resource(uri=self.mapper.dst_to_src(file),
                     path=file,
                     timestamp=os.stat(file).st_mtime,
                     md5=compute_md5_for_file(file))
        sitemaps.add(r)
        # Get next chunk
        chunk, pending = self.get_resources_chunk(resources_iter, pending)
    self.logger.info("Wrote %d sitemaps" % (len(sitemaps)))
    with open(basename, 'w') as fh:
        self.logger.info("Writing sitemapindex %s..." % (basename))
        s.resources_as_xml(resources=sitemaps, sitemapindex=True, fh=fh)
    self.logger.info("Wrote sitemapindex %s" % (basename))
def from_disk_add_map(self, resource_list=None, map=None, set_path=False):
    """Add to resource_list with resources from a disk scan based on map.

    Walks the directory tree rooted at map.dst_path and adds one Resource
    per regular file found. Files for which self.exclude_file is true are
    skipped, as are symlinks unless self.include_symlinks is set, and
    directories named in self.exclude_dirs are not descended into.

    If set_path is True then the path attribute will be set with the local
    path for each Resource. The md5 and length attributes are added when
    self.do_md5 and self.do_length are set, respectively.

    An OSError raised while examining a file is reported on stderr and
    that file is ignored.

    Raises ValueError if resource_list or map is not given, and Exception
    if the map returns None for a file.
    """
    # sanity
    if resource_list is None or map is None:
        raise ValueError("Must specify resource_list and map")
    root = map.dst_path
    # for each file: create Resource object, add, increment counter
    num_files = 0
    for dirpath, dirs, files in os.walk(root, topdown=True):
        for file_in_dirpath in files:
            num_files += 1
            if num_files % 50000 == 0:
                self.logger.info("ResourceListBuilder.from_disk_add_map: %d files..." % (num_files))
            # Build the full path before the try block so that the error
            # handler always reports the file currently being examined.
            filepath = os.path.join(dirpath, file_in_dirpath)
            try:
                if self.exclude_file(file_in_dirpath):
                    self.logger.debug("Excluding file %s" % (file_in_dirpath))
                    continue
                if not os.path.isfile(filepath):
                    continue
                # islink is only consulted when symlinks are not wanted
                if not self.include_symlinks and os.path.islink(filepath):
                    continue
                uri = map.dst_to_src(filepath)
                if uri is None:
                    raise Exception("Internal error, mapping failed")
                file_stat = os.stat(filepath)
            except OSError as e:
                # Terminate the line so successive messages stay separate
                sys.stderr.write("Ignoring file %s (error: %s)\n" % (filepath, str(e)))
                continue
            r = Resource(uri=uri, timestamp=file_stat.st_mtime)
            if set_path:
                r.path = filepath
            if self.do_md5:
                # add md5
                r.md5 = compute_md5_for_file(filepath)
            if self.do_length:
                # add length
                r.length = file_stat.st_size
            resource_list.add(r)
        # prune list of dirs based on self.exclude_dirs; the list is
        # modified in place so that the top-down walk skips those dirs
        for exclude in self.exclude_dirs:
            if exclude in dirs:
                self.logger.debug("Excluding dir %s" % (exclude))
                dirs.remove(exclude)
def write(self, basename='/tmp/sitemap.xml'):
    """Write one or a set of sitemap files to disk.

    Resources are taken from self.resources and are accessed through an
    iterator only, so self.resources may be a generator; its length is
    then only known once it has been consumed.

    basename is used as the name of the single sitemap file, or of the
    sitemapindex when a set of sitemap files is written.

    If the first call to self.get_resources_chunk reports that more
    resources remain, a set of sitemap files named by self.part_name is
    written together with a sitemapindex at basename. Each component
    sitemap is linked up to the URI of the sitemapindex.

    Raises ListBaseIndexError if more than one sitemap is required but
    self.allow_multifile is not set, or if the file name of the
    sitemapindex or of a component sitemap cannot be mapped to a URI.
    """
    # Access resources through iterator only
    resources_iter = iter(self.resources)
    chunk, pending = self.get_resources_chunk(resources_iter)
    s = self.new_sitemap()
    if pending is None:
        # Everything fits in a single sitemap. The with block guarantees
        # the file is closed even if serialization fails.
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemap %s..." % (basename))
            s.resources_as_xml(chunk, fh=fh)
        self.logger.info("Wrote sitemap %s" % (basename))
        return
    # More resources remain after the first chunk => sitemapindex
    if not self.allow_multifile:
        raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
    # Work out URI of sitemapindex so that we can link up to
    # it from the individual sitemap files
    try:
        index_uri = self.mapper.dst_to_src(basename)
    except MapperError as e:
        raise ListBaseIndexError("Cannot map sitemapindex filename to URI (%s)" % str(e))
    # Use iterator over all resources and count off one chunk per
    # sitemap, recording each sitemap in the index as we go. Copy md
    # from self into the index and use this for all chunks also
    index = ListBase(md=self.md.copy(), ln=list(self.ln))
    index.capability_name = self.capability_name
    index.default_capability()
    while len(chunk) > 0:
        file = self.part_name(basename, len(index))
        # Check that we can map the filename of this sitemap into
        # URI space for the sitemapindex
        try:
            uri = self.mapper.dst_to_src(file)
        except MapperError as e:
            raise ListBaseIndexError("Cannot map sitemap filename to URI (%s)" % str(e))
        self.logger.info("Writing sitemap %s..." % (file))
        with open(file, 'w') as fh:
            chunk.index = index_uri
            chunk.md = index.md
            s.resources_as_xml(chunk, fh=fh)
        # Record information about this sitemap for index; the file is
        # closed at this point so the stat and md5 see its full content
        r = Resource(uri=uri,
                     timestamp=os.stat(file).st_mtime,
                     md5=compute_md5_for_file(file))
        index.add(r)
        # Get next chunk
        chunk, pending = self.get_resources_chunk(resources_iter, pending)
    self.logger.info("Wrote %d sitemaps" % (len(index)))
    with open(basename, 'w') as fh:
        self.logger.info("Writing sitemapindex %s..." % (basename))
        s.resources_as_xml(index, sitemapindex=True, fh=fh)
    self.logger.info("Wrote sitemapindex %s" % (basename))
def write(self, basename='/tmp/sitemap.xml'):
    """Write one or a set of sitemap files to disk.

    Resources are taken from self.resources and are accessed through an
    iterator only, so self.resources may be a generator; its length is
    then only known once it has been consumed.

    basename is used as the name of the single sitemap file, or of the
    sitemapindex when a set of sitemap files is written.

    If the first call to self.get_resources_chunk reports that more
    resources remain, a set of sitemap files named by self.part_name is
    written together with a sitemapindex at basename. Each component
    sitemap is linked up to the URI of the sitemapindex.

    Raises ListBaseIndexError if more than one sitemap is required but
    self.allow_multifile is not set, or if the file name of the
    sitemapindex or of a component sitemap cannot be mapped to a URI.
    """
    # Access resources through iterator only
    resources_iter = iter(self.resources)
    chunk, pending = self.get_resources_chunk(resources_iter)
    s = self.new_sitemap()
    if pending is None:
        # Everything fits in a single sitemap. The with block guarantees
        # the file is closed even if serialization fails.
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemap %s..." % (basename))
            s.resources_as_xml(chunk, fh=fh)
        self.logger.info("Wrote sitemap %s" % (basename))
        return
    # More resources remain after the first chunk => sitemapindex
    if not self.allow_multifile:
        raise ListBaseIndexError(
            "Too many entries for a single sitemap but multifile disabled")
    # Work out URI of sitemapindex so that we can link up to
    # it from the individual sitemap files
    try:
        index_uri = self.mapper.dst_to_src(basename)
    except MapperError as e:
        raise ListBaseIndexError(
            "Cannot map sitemapindex filename to URI (%s)" % str(e))
    # Use iterator over all resources and count off one chunk per
    # sitemap, recording each sitemap in the index as we go. Copy md
    # from self into the index and use this for all chunks also
    index = ListBase(md=self.md.copy(), ln=list(self.ln))
    index.capability_name = self.capability_name
    index.default_capability()
    while len(chunk) > 0:
        file = self.part_name(basename, len(index))
        # Check that we can map the filename of this sitemap into
        # URI space for the sitemapindex
        try:
            uri = self.mapper.dst_to_src(file)
        except MapperError as e:
            raise ListBaseIndexError(
                "Cannot map sitemap filename to URI (%s)" % str(e))
        self.logger.info("Writing sitemap %s..." % (file))
        with open(file, 'w') as fh:
            chunk.index = index_uri
            chunk.md = index.md
            s.resources_as_xml(chunk, fh=fh)
        # Record information about this sitemap for index; the file is
        # closed at this point so the stat and md5 see its full content
        r = Resource(uri=uri,
                     timestamp=os.stat(file).st_mtime,
                     md5=compute_md5_for_file(file))
        index.add(r)
        # Get next chunk
        chunk, pending = self.get_resources_chunk(resources_iter, pending)
    self.logger.info("Wrote %d sitemaps" % (len(index)))
    with open(basename, 'w') as fh:
        self.logger.info("Writing sitemapindex %s..." % (basename))
        s.resources_as_xml(index, sitemapindex=True, fh=fh)
    self.logger.info("Wrote sitemapindex %s" % (basename))