def parse(self, uri=None, fh=None, **kwargs):
    """Parse a single XML document for this list.

    Does not handle the case of sitemapindex+sitemaps ResourceList.

    The document is read either from ``uri`` (opened with ``URLopener``)
    or from the already-open file-like object ``fh``. If both are given,
    ``uri`` takes precedence and the supplied ``fh`` is ignored. Any extra
    keyword arguments are passed through to the ``Sitemap`` constructor.

    This object is handed to ``Sitemap.parse_xml`` as the ``resources``
    container, and ``self.parsed_index`` is set from the parser's
    ``parsed_index`` attribute afterwards.

    Raises:
        Exception: if ``uri`` cannot be opened, or if neither ``uri`` nor
            ``fh`` is supplied ("Nothing to parse").
    """
    opened_here = False
    if uri is not None:
        try:
            fh = URLopener().open(uri)
        except IOError as e:
            raise Exception(
                "Failed to load sitemap/sitemapindex from %s (%s)"
                % (uri, str(e)))
        opened_here = True
    if fh is None:
        raise Exception("Nothing to parse")
    try:
        s = Sitemap(**kwargs)
        s.parse_xml(fh=fh, resources=self,
                    capability=self.capability_md, sitemapindex=False)
    finally:
        # Only close a handle this method opened itself; a caller-supplied
        # fh remains the caller's responsibility.
        if opened_here:
            fh.close()
    self.parsed_index = s.parsed_index
def read(self, uri=None, resources=None, capability=None, index_only=False):
    """Read sitemap from a URI including handling sitemapindexes.

    If index_only is True then individual sitemaps referenced in a
    sitemapindex will not be read. This will result in no resources being
    returned and is useful only to read the metadata and links listed in
    the sitemapindex.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then
    these are mapped to the filesystem also.

    Side effects visible in this method: increments ``self.num_files``,
    updates ``self.content_length`` and ``self.bytes_read`` when a usable
    Content-Length is available, and sets ``self.sitemapindex = True``
    when a sitemapindex is read with ``index_only``.

    NOTE(review): the ``resources`` and ``capability`` parameters are not
    referenced in this method; parsing always targets ``self`` with the
    capability 'resourcelist'. Confirm whether that is intentional.

    Raises:
        IOError: if ``uri`` cannot be opened.
        ListBaseIndexError: if a sitemapindex is read while
            ``self.allow_multifile`` is false.
    """
    try:
        fh = URLopener().open(uri)
    except IOError as e:
        raise IOError("Failed to load sitemap/sitemapindex from %s (%s)" % (uri,str(e)))
    self.num_files += 1
    try:
        # Get the Content-Length if we can (works fine for local files).
        # A missing header shows up as KeyError on some message classes
        # and as a None value (TypeError from int()) on others; a
        # malformed value gives ValueError. None of these is fatal.
        try:
            self.content_length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            # If we don't get a length then c'est la vie
            self.logger.debug("Read ????? bytes from %s" % (uri))
        else:
            self.bytes_read += self.content_length
            self.logger.debug("Read %d bytes from %s" % (self.content_length,uri))
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        s = Sitemap()
        s.parse_xml(fh=fh, resources=self, capability='resourcelist')
    finally:
        # The handle was opened here and is not used after parsing.
        fh.close()
    # what did we read? sitemap or sitemapindex?
    if not s.parsed_index:
        # sitemap
        self.logger.info("Parsed as sitemap, %d resources" % (len(self.resources)))
        return
    # sitemapindex
    if not self.allow_multifile:
        raise ListBaseIndexError("Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
    self.logger.info("Parsed as sitemapindex, %d sitemaps" % (len(self.resources)))
    sitemapindex_is_file = self.is_file_uri(uri)
    if index_only:
        # don't read the component sitemaps
        self.sitemapindex = True
        return
    # now loop over all entries to read each sitemap and add to resources;
    # the index entries are set aside and self.resources is replaced with
    # a fresh container before the component sitemaps are read.
    sitemaps = self.resources
    self.resources = self.resources_class()
    sitemap_uris = sitemaps.uris()
    self.logger.info("Now reading %d sitemaps" % len(sitemap_uris))
    for sitemap_uri in sorted(sitemap_uris):
        self.read_component_sitemap(uri, sitemap_uri, s, sitemapindex_is_file)