def read_component_sitemap(self, sitemapindex_uri, sitemap_uri, sitemap, sitemapindex_is_file):
    """Read one component sitemap listed in a sitemapindex

    The resources parsed from the component are added to self.resources.

    Parameters:
      sitemapindex_uri - URI of the sitemapindex that listed this sitemap,
          used for the authority check and in error messages
      sitemap_uri - URI of the component sitemap to read
      sitemap - object whose parse_xml(fh=..., sitemapindex=False) returns
          an iterable of resources
      sitemapindex_is_file - if true, a non-file sitemap_uri is mapped to a
          local file via self.mapper and no authority check is made

    Raises ListBaseIndexError if the authority check fails (only done when
    self.check_url_authority is set) or if the sitemap cannot be opened.

    self.content_length and self.bytes_read are updated only when a usable
    Content-Length is available.
    """
    if sitemapindex_is_file:
        if not self.is_file_uri(sitemap_uri):
            # Attempt to map URI to local file
            remote_uri = sitemap_uri
            sitemap_uri = self.mapper.src_to_dst(remote_uri)
            self.logger.info("Mapped %s to local file %s" % (remote_uri, sitemap_uri))
    else:
        # The individual sitemaps should be at a URL (scheme/server/path)
        # that the sitemapindex URL can speak authoritatively about
        if (self.check_url_authority and
                not UrlAuthority(sitemapindex_uri).has_authority_over(sitemap_uri)):
            raise ListBaseIndexError("The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" % (sitemapindex_uri, sitemap_uri))
    try:
        fh = URLopener().open(sitemap_uri)
        self.num_files += 1
    except IOError as e:
        raise ListBaseIndexError("Failed to load sitemap from %s listed in sitemap index %s (%s)" % (sitemap_uri, sitemapindex_uri, str(e)))
    try:
        # Get the Content-Length if we can (works fine for local files).
        # A missing header may show up as KeyError or as a None value
        # (TypeError from int()); a malformed one gives ValueError.
        try:
            length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            # If we don't get a length then c'est la vie
            length = None
        if length is None:
            # Don't report a stale length left over from an earlier read
            shown = "?????"
        else:
            self.content_length = length
            self.bytes_read += length
            shown = "%d" % length
        self.logger.info("Reading sitemap from %s (%s bytes)" % (sitemap_uri, shown))
        component = sitemap.parse_xml(fh=fh, sitemapindex=False)
        # Copy resources into self, check any metadata
        for r in component:
            self.resources.add(r)
    finally:
        # Closed only after the resources have been copied, in case the
        # parser reads lazily from the handle
        fh.close()
def read(self, uri=None, resources=None, index_only=False):
    """Read sitemap from a URI including handling sitemapindexes

    The parsed content is stored on self (parse_xml is called with
    resources=self); nothing is returned. The resources parameter is
    accepted but not used by this implementation.

    If index_only is True then individual sitemaps references in a
    sitemapindex will not be read. This will result in no resources
    being returned and is useful only to read the metadata and links
    listed in the sitemapindex. In that case self.sitemapindex is set True.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then
    these are mapped to the filesystem also.

    Raises IOError if the URI cannot be opened, and ListBaseIndexError if
    a sitemapindex is read while self.allow_multifile is false.
    """
    try:
        fh = URLopener().open(uri)
        self.num_files += 1
    except IOError as e:
        raise IOError("Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    try:
        # Get the Content-Length if we can (works fine for local files).
        # A missing header may show up as KeyError or as a None value
        # (TypeError from int()); a malformed one gives ValueError.
        try:
            length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            length = None
        if length is None:
            # If we don't get a length then c'est la vie
            self.logger.debug("Read ????? bytes from %s" % (uri))
        else:
            self.content_length = length
            self.bytes_read += length
            self.logger.debug("Read %d bytes from %s" % (self.content_length, uri))
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        s = self.new_sitemap()
        s.parse_xml(fh=fh, resources=self, capability=self.capability_name)
    finally:
        # The document has been parsed into self by now (or parsing failed)
        fh.close()
    # what did we read? sitemap or sitemapindex?
    if s.parsed_index:
        # sitemapindex
        if not self.allow_multifile:
            raise ListBaseIndexError(
                "Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
        self.logger.info("Parsed as sitemapindex, %d sitemaps" % (len(self.resources)))
        sitemapindex_is_file = self.is_file_uri(uri)
        if index_only:
            # don't read the component sitemaps
            self.sitemapindex = True
            return
        # now loop over all entries to read each sitemap and add to resources;
        # the index entries are moved aside and self.resources starts empty
        sitemaps = self.resources
        self.resources = self.resources_class()
        self.logger.info("Now reading %d sitemaps" % len(sitemaps.uris()))
        for sitemap_uri in sorted(sitemaps.uris()):
            self.read_component_sitemap(uri, sitemap_uri, s, sitemapindex_is_file)
    else:
        # sitemap
        self.logger.info("Parsed as sitemap, %d resources" % (len(self.resources)))
def read(self, uri=None, resources=None, index_only=False):
    """Read sitemap from a URI including handling sitemapindexes

    Results are accumulated on self (parse_xml is given resources=self)
    and the method returns None. The resources argument is not used here.

    If index_only is True then individual sitemaps references in a
    sitemapindex will not be read. This will result in no resources
    being returned and is useful only to read the metadata and links
    listed in the sitemapindex; self.sitemapindex is set True in that case.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then
    these are mapped to the filesystem also.

    Raises IOError when the URI cannot be opened and ListBaseIndexError
    when a sitemapindex is found but self.allow_multifile is false.
    """
    try:
        fh = URLopener().open(uri)
        self.num_files += 1
    except IOError as e:
        raise IOError("Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    try:
        # Get the Content-Length if we can (works fine for local files).
        # Depending on the header object a missing header raises KeyError
        # or yields None (int() then raises TypeError); a non-numeric
        # value raises ValueError. None of these should abort the read.
        content_length = None
        try:
            content_length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            # If we don't get a length then c'est la vie
            self.logger.debug("Read ????? bytes from %s" % (uri))
        if content_length is not None:
            self.content_length = content_length
            self.bytes_read += content_length
            self.logger.debug("Read %d bytes from %s" % (self.content_length, uri))
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        s = self.new_sitemap()
        s.parse_xml(fh=fh, resources=self, capability=self.capability_name)
    finally:
        # Release the handle whether or not parsing succeeded
        fh.close()
    # what did we read? sitemap or sitemapindex?
    if not s.parsed_index:
        # sitemap
        self.logger.info("Parsed as sitemap, %d resources" % (len(self.resources)))
        return
    # sitemapindex
    if not self.allow_multifile:
        raise ListBaseIndexError("Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
    self.logger.info("Parsed as sitemapindex, %d sitemaps" % (len(self.resources)))
    sitemapindex_is_file = self.is_file_uri(uri)
    if index_only:
        # don't read the component sitemaps
        self.sitemapindex = True
        return
    # now loop over all entries to read each sitemap and add to resources;
    # the index entries are set aside and self.resources is replaced by a
    # fresh container for the component resources
    sitemaps = self.resources
    self.resources = self.resources_class()
    self.logger.info("Now reading %d sitemaps" % len(sitemaps.uris()))
    for sitemap_uri in sorted(sitemaps.uris()):
        self.read_component_sitemap(uri, sitemap_uri, s, sitemapindex_is_file)
def read_component_sitemap(self, sitemapindex_uri, sitemap_uri, sitemap, sitemapindex_is_file):
    """Read a component sitemap of a Resource List with index

    Each resource yielded by sitemap.parse_xml(fh=..., sitemapindex=False)
    is added to self.resources.

    Parameters:
      sitemapindex_uri - URI of the sitemapindex that referenced the sitemap
      sitemap_uri - URI of the component sitemap itself
      sitemap - parser object providing parse_xml()
      sitemapindex_is_file - true if the sitemapindex came from a local
          file; component URIs that are not file URIs are then mapped to
          local files with self.mapper.src_to_dst() and the authority
          check is skipped

    Raises ListBaseIndexError if self.check_url_authority is set and the
    sitemapindex has no authority over sitemap_uri, or if the sitemap
    cannot be opened.
    """
    if sitemapindex_is_file:
        if not self.is_file_uri(sitemap_uri):
            # Attempt to map URI to local file
            remote_uri = sitemap_uri
            sitemap_uri = self.mapper.src_to_dst(remote_uri)
            self.logger.info("Mapped %s to local file %s" %
                             (remote_uri, sitemap_uri))
    else:
        # The individual sitemaps should be at a URL (scheme/server/path)
        # that the sitemapindex URL can speak authoritatively about
        if (self.check_url_authority and
                not UrlAuthority(sitemapindex_uri).has_authority_over(sitemap_uri)):
            raise ListBaseIndexError(
                "The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" %
                (sitemapindex_uri, sitemap_uri))
    try:
        fh = URLopener().open(sitemap_uri)
        self.num_files += 1
    except IOError as e:
        raise ListBaseIndexError(
            "Failed to load sitemap from %s listed in sitemap index %s (%s)" %
            (sitemap_uri, sitemapindex_uri, str(e)))
    try:
        # Get the Content-Length if we can (works fine for local files).
        # KeyError, TypeError (header value None) and ValueError (header
        # not an integer) all just mean that the length is unknown.
        try:
            length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            # If we don't get a length then c'est la vie
            length = None
        if length is not None:
            self.content_length = length
            self.bytes_read += length
            size_text = "%d" % length
        else:
            # Avoid logging self.content_length from some earlier file
            size_text = "?????"
        self.logger.info("Reading sitemap from %s (%s bytes)" %
                         (sitemap_uri, size_text))
        component = sitemap.parse_xml(fh=fh, sitemapindex=False)
        # Copy resources into self, check any metadata
        for r in component:
            self.resources.add(r)
    finally:
        # Close once the resources are copied so the handle is not leaked
        fh.close()
def read(self, uri=None, resources=None, changeset=None, index_only=False):
    """Read sitemap from a URI including handling sitemapindexes

    Returns the inventory or changeset: the resources object passed in,
    or a new instance of self.inventory_class / self.changeset_class if
    resources is None.

    The document is treated as a ChangeSet if the root element has
    rs:type="changeset". If the root has no rs:type attribute then
    changeset=True selects a ChangeSet; otherwise an Inventory is read.
    Any other rs:type value is logged and ignored.

    If index_only is True then individual sitemaps references in a
    sitemapindex will not be read. This will result in no resources
    being returned and is useful only to read the capabilities and
    metadata listed in the sitemapindex.

    Will set self.read_type to a string value
    sitemap/sitemapindex/changeset/changesetindex depending on the type
    of the file expected/read.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then
    these are mapped to the filesystem also.

    Raises Exception if a URI cannot be opened, if a sitemapindex is read
    while self.allow_multifile is false, or if the sitemapindex lacks
    authority over a component sitemap; raises ValueError if the root
    element is neither urlset nor sitemapindex.
    """
    try:
        fh = URLopener().open(uri)
    except IOError as e:
        raise Exception(
            "Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    try:
        # Get the Content-Length if we can (works fine for local files).
        # A missing header may raise KeyError or give None (TypeError from
        # int()); a malformed header gives ValueError.
        try:
            length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            length = None
        if length is None:
            # If we don't get a length then c'est la vie
            self.logger.debug("Read ????? bytes from %s" % (uri))
        else:
            self.content_length = length
            self.bytes_read += length
            self.logger.debug("Read %d bytes from %s" % (self.content_length, uri))
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        etree = parse(fh)
    finally:
        # Everything below works from the parsed tree, not the handle
        fh.close()
    # check root element: urlset (for sitemap), sitemapindex or bad
    self.sitemaps_created = 0
    root = etree.getroot()
    # assume inventory but look to see whether this is a changeset
    # as indicated with rs:type="changeset" on the root
    resources_class = self.inventory_class
    sitemap_xml_parser = self.inventory_parse_xml
    self.changeset_read = False
    self.read_type = 'sitemap'
    root_type = root.attrib.get('{' + RS_NS + '}type', None)
    if root_type is not None:
        if root_type == 'changeset':
            self.changeset_read = True
        else:
            self.logger.info(
                "Bad value of rs:type on root element (%s), ignoring" % (root_type))
    elif changeset is True:
        self.changeset_read = True
    if self.changeset_read:
        self.read_type = 'changeset'
        resources_class = self.changeset_class
        sitemap_xml_parser = self.changeset_parse_xml
    # now make sure we have a place to put the data we read
    if resources is None:
        resources = resources_class()
    # sitemap or sitemapindex?
    if root.tag == '{' + SITEMAP_NS + "}urlset":
        self.logger.info("Parsing as sitemap")
        sitemap_xml_parser(etree=etree, resources=resources)
        self.sitemaps_created += 1
    elif root.tag == '{' + SITEMAP_NS + "}sitemapindex":
        self.read_type += 'index'
        if not self.allow_multifile:
            raise Exception(
                "Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
        self.logger.info("Parsing as sitemapindex")
        sitemaps = self.sitemapindex_parse_xml(etree=etree)
        sitemapindex_is_file = self.is_file_uri(uri)
        if index_only:
            return resources
        # now loop over all entries to read each sitemap and add to resources
        self.logger.info("Now reading %d sitemaps" % len(sitemaps))
        for sitemap_uri in sorted(sitemaps.resources.keys()):
            if sitemapindex_is_file:
                if not self.is_file_uri(sitemap_uri):
                    # Attempt to map URI to local file
                    remote_uri = sitemap_uri
                    sitemap_uri = self.mapper.src_to_dst(remote_uri)
            else:
                # The individual sitemaps should be at a URL (scheme/server/path)
                # that the sitemapindex URL can speak authoritatively about
                if not UrlAuthority(uri).has_authority_over(sitemap_uri):
                    raise Exception(
                        "The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" %
                        (uri, sitemap_uri))
            try:
                fh = URLopener().open(sitemap_uri)
            except IOError as e:
                raise Exception(
                    "Failed to load sitemap from %s listed in sitemap index %s (%s)" %
                    (sitemap_uri, uri, str(e)))
            try:
                # Get the Content-Length if we can (works fine for local files)
                try:
                    length = int(fh.info()['Content-Length'])
                except (KeyError, TypeError, ValueError):
                    # If we don't get a length then c'est la vie
                    length = None
                if length is None:
                    # Don't log the length of a previously read file
                    shown = "?????"
                else:
                    self.content_length = length
                    self.bytes_read += length
                    shown = "%d" % length
                self.logger.info("Read sitemap from %s (%s)" % (sitemap_uri, shown))
                sitemap_xml_parser(fh=fh, resources=resources)
            finally:
                fh.close()
            self.sitemaps_created += 1
    else:
        raise ValueError(
            "XML read from %s is not a sitemap or sitemapindex" % (uri))
    return resources
def read(self, uri=None, resources=None):
    """Read sitemap from a URI including handling sitemapindexes

    Returns the inventory or changeset: the resources object passed in,
    or, if resources is None, a new instance of self.inventory_class (or
    of self.changeset_class when the root element has
    rs:type="changeset"). The parser is chosen from the root rs:type in
    the same way; any other rs:type value is logged and ignored.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then
    these are mapped to the filesystem also.

    Raises Exception if a URI cannot be opened, if a sitemapindex is read
    while self.allow_multifile is false, or if the sitemapindex lacks
    authority over a component sitemap; raises ValueError if the root
    element is neither urlset nor sitemapindex.
    """
    try:
        fh = URLopener().open(uri)
    except IOError as e:
        raise Exception("Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    try:
        # Get the Content-Length if we can (works fine for local files).
        # A missing header may raise KeyError or give None (TypeError from
        # int()); a malformed header gives ValueError.
        try:
            length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            # If we don't get a length then c'est la vie
            length = None
        if length is not None:
            self.content_length = length
            self.bytes_read += length
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        etree = parse(fh)
    finally:
        # Everything below works from the parsed tree, not the handle
        fh.close()
    # check root element: urlset (for sitemap), sitemapindex or bad
    self.sitemaps_created = 0
    root = etree.getroot()
    # assume inventory but look to see whether this is a changeset
    # as indicated with rs:type="changeset" on the root
    resources_class = self.inventory_class
    sitemap_xml_parser = self.inventory_parse_xml
    self.changeset_read = False
    root_type = root.attrib.get('{' + RS_NS + '}type', None)
    if root_type is not None:
        if root_type == 'changeset':
            resources_class = self.changeset_class
            sitemap_xml_parser = self.changeset_parse_xml
            self.changeset_read = True
        else:
            self.logger.info("Bad value of rs:type on root element (%s), ignoring" % (root_type))
    # now make sure we have a place to put the data we read
    if resources is None:
        resources = resources_class()
    # sitemap or sitemapindex?
    if root.tag == '{' + SITEMAP_NS + "}urlset":
        self.logger.info("Parsing as sitemap")
        sitemap_xml_parser(etree=etree, resources=resources)
        self.sitemaps_created += 1
    elif root.tag == '{' + SITEMAP_NS + "}sitemapindex":
        if not self.allow_multifile:
            raise Exception("Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
        self.logger.info("Parsing as sitemapindex")
        sitemaps = self.sitemapindex_parse_xml(etree=etree)
        sitemapindex_is_file = self.is_file_uri(uri)
        # now loop over all entries to read each sitemap and add to resources
        self.logger.info("Now reading %d sitemaps" % len(sitemaps))
        for sitemap_uri in sorted(sitemaps.resources.keys()):
            if sitemapindex_is_file:
                if not self.is_file_uri(sitemap_uri):
                    # Attempt to map URI to local file
                    remote_uri = sitemap_uri
                    sitemap_uri = self.mapper.src_to_dst(remote_uri)
            else:
                # The individual sitemaps should be at a URL (scheme/server/path)
                # that the sitemapindex URL can speak authoritatively about
                if not UrlAuthority(uri).has_authority_over(sitemap_uri):
                    raise Exception("The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" % (uri, sitemap_uri))
            try:
                fh = URLopener().open(sitemap_uri)
            except IOError as e:
                raise Exception("Failed to load sitemap from %s listed in sitemap index %s (%s)" % (sitemap_uri, uri, str(e)))
            try:
                # Get the Content-Length if we can (works fine for local files)
                try:
                    length = int(fh.info()['Content-Length'])
                except (KeyError, TypeError, ValueError):
                    # If we don't get a length then c'est la vie
                    length = None
                if length is None:
                    # Don't log the length of a previously read file
                    shown = "?????"
                else:
                    self.content_length = length
                    self.bytes_read += length
                    shown = "%d" % length
                self.logger.info("Read sitemap from %s (%s)" % (sitemap_uri, shown))
                sitemap_xml_parser(fh=fh, resources=resources)
            finally:
                fh.close()
            self.sitemaps_created += 1
    else:
        # Report the URI that was read; sitemap_uri is not bound in this
        # branch so using it here would raise NameError instead
        raise ValueError("XML read from %s is not a sitemap or sitemapindex" % (uri))
    return resources
def read(self, uri=None, resources=None, changelist=None, index_only=False):
    """Read sitemap from a URI including handling sitemapindexes

    Returns the resourcelist or changelist: the resources object passed
    in, or a new instance of self.resourcelist_class /
    self.changelist_class if resources is None.

    The document is treated as a Changelist if the root element has
    rs:type="changelist". If the root has no rs:type attribute then
    changelist=True selects a Changelist; otherwise a ResourceList is
    read. Any other rs:type value is logged and ignored.

    If index_only is True then individual sitemaps references in a
    sitemapindex will not be read. This will result in no resources
    being returned and is useful only to read the capabilities and
    metadata listed in the sitemapindex.

    Will set self.read_type to a string value
    sitemap/sitemapindex/changelist/changelistindex depending on the type
    of the file expected/read.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then
    these are mapped to the filesystem also.

    Raises Exception if a URI cannot be opened, if a sitemapindex is read
    while self.allow_multifile is false, or if the sitemapindex lacks
    authority over a component sitemap; raises ValueError if the root
    element is neither urlset nor sitemapindex.
    """
    try:
        fh = URLopener().open(uri)
    except IOError as e:
        raise Exception("Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    try:
        # Get the Content-Length if we can (works fine for local files).
        # A missing header may raise KeyError or give None (TypeError from
        # int()); a malformed header gives ValueError.
        try:
            length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            length = None
        if length is None:
            # If we don't get a length then c'est la vie
            self.logger.debug("Read ????? bytes from %s" % (uri))
        else:
            self.content_length = length
            self.bytes_read += length
            self.logger.debug("Read %d bytes from %s" % (self.content_length, uri))
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        etree = parse(fh)
    finally:
        # Everything below works from the parsed tree, not the handle
        fh.close()
    # check root element: urlset (for sitemap), sitemapindex or bad
    self.sitemaps_created = 0
    root = etree.getroot()
    # assume resourcelist but look to see whether this is a changelist
    # as indicated with rs:type="changelist" on the root
    resources_class = self.resourcelist_class
    sitemap_xml_parser = self.resourcelist_parse_xml
    self.changelist_read = False
    self.read_type = 'sitemap'
    root_type = root.attrib.get('{' + RS_NS + '}type', None)
    if root_type is not None:
        if root_type == 'changelist':
            self.changelist_read = True
        else:
            self.logger.info("Bad value of rs:type on root element (%s), ignoring" % (root_type))
    elif changelist is True:
        self.changelist_read = True
    if self.changelist_read:
        self.read_type = 'changelist'
        resources_class = self.changelist_class
        sitemap_xml_parser = self.changelist_parse_xml
    # now make sure we have a place to put the data we read
    if resources is None:
        resources = resources_class()
    # sitemap or sitemapindex?
    if root.tag == '{' + SITEMAP_NS + "}urlset":
        self.logger.info("Parsing as sitemap")
        sitemap_xml_parser(etree=etree, resources=resources)
        self.sitemaps_created += 1
    elif root.tag == '{' + SITEMAP_NS + "}sitemapindex":
        self.read_type += 'index'
        if not self.allow_multifile:
            raise Exception("Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
        self.logger.info("Parsing as sitemapindex")
        sitemaps = self.sitemapindex_parse_xml(etree=etree)
        sitemapindex_is_file = self.is_file_uri(uri)
        if index_only:
            return resources
        # now loop over all entries to read each sitemap and add to resources
        self.logger.info("Now reading %d sitemaps" % len(sitemaps))
        for sitemap_uri in sorted(sitemaps.resources.keys()):
            if sitemapindex_is_file:
                if not self.is_file_uri(sitemap_uri):
                    # Attempt to map URI to local file
                    remote_uri = sitemap_uri
                    sitemap_uri = self.mapper.src_to_dst(remote_uri)
            else:
                # The individual sitemaps should be at a URL (scheme/server/path)
                # that the sitemapindex URL can speak authoritatively about
                if not UrlAuthority(uri).has_authority_over(sitemap_uri):
                    raise Exception("The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" % (uri, sitemap_uri))
            try:
                fh = URLopener().open(sitemap_uri)
            except IOError as e:
                raise Exception("Failed to load sitemap from %s listed in sitemap index %s (%s)" % (sitemap_uri, uri, str(e)))
            try:
                # Get the Content-Length if we can (works fine for local files)
                try:
                    length = int(fh.info()['Content-Length'])
                except (KeyError, TypeError, ValueError):
                    # If we don't get a length then c'est la vie
                    length = None
                if length is None:
                    # Don't log the length of a previously read file
                    shown = "?????"
                else:
                    self.content_length = length
                    self.bytes_read += length
                    shown = "%d" % length
                self.logger.info("Read sitemap from %s (%s)" % (sitemap_uri, shown))
                sitemap_xml_parser(fh=fh, resources=resources)
            finally:
                fh.close()
            self.sitemaps_created += 1
    else:
        raise ValueError("XML read from %s is not a sitemap or sitemapindex" % (uri))
    return resources