def changelist_sitemap(self,outfile=None,ref_sitemap=None,newref_sitemap=None, empty=None,capabilities=None,dump=None): changelist = ChangeList() changelist.capabilities = capabilities if (not empty): # 1. Get and parse reference sitemap old_inv = self.read_reference_sitemap(ref_sitemap) # 2. Depending on whether a newref_sitemap was specified, either read that # or build resourcelist from files on disk if (newref_sitemap is None): # Get resourcelist from disk new_inv = self.resourcelist else: new_inv = self.read_reference_sitemap(newref_sitemap,name='new reference') # 3. Calculate changelist (same,updated,deleted,created)=old_inv.compare(new_inv) changelist.add_changed_resources( updated, change='updated' ) changelist.add_changed_resources( deleted, change='deleted' ) changelist.add_changed_resources( created, change='created' ) # 4. Write out changelist s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper) if (self.max_sitemap_entries is not None): s.max_sitemap_entries = self.max_sitemap_entries if (outfile is None): print s.resources_as_xml(changelist,changelist=True) else: s.write(changelist,basename=outfile,changelist=True) self.write_dump_if_requested(changelist,dump)
def changeset_sitemap( self, outfile=None, ref_sitemap=None, newref_sitemap=None, empty=None, capabilities=None, dump=None ): changeset = ChangeSet() changeset.capabilities = capabilities if not empty: # 1. Get and parse reference sitemap old_inv = self.read_reference_sitemap(ref_sitemap) # 2. Depending on whether a newref_sitemap was specified, either read that # or build inventory from files on disk if newref_sitemap is None: # Get inventory from disk new_inv = self.inventory else: new_inv = self.read_reference_sitemap(newref_sitemap, name="new reference") # 3. Calculate changeset (same, updated, deleted, created) = old_inv.compare(new_inv) changeset.add_changed_resources(updated, changetype="UPDATED") changeset.add_changed_resources(deleted, changetype="DELETED") changeset.add_changed_resources(created, changetype="CREATED") # 4. Write out changeset s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper) if self.max_sitemap_entries is not None: s.max_sitemap_entries = self.max_sitemap_entries if outfile is None: print s.resources_as_xml(changeset, changeset=True) else: s.write(changeset, basename=outfile, changeset=True) self.write_dump_if_requested(changeset, dump)
def save_sitemap(self, sitemap, path):
    """Serialize *sitemap* as XML to the file at *path*.

    The XML is streamed through a binary file handle rather than written
    as a string: writing the string sitemap.as_xml() to disk results in
    encoding=ASCII on some systems, due to
    https://docs.python.org/3.4/library/xml.etree.elementtree.html#write
    """
    sitemap.default_capability()
    with open(path, "wb") as out:
        writer = Sitemap(pretty_xml=self.para.is_saving_pretty_xml)
        writer.resources_as_xml(sitemap, sitemapindex=sitemap.sitemapindex, fh=out)
def write_sitemap(self, outfile=None, capabilities=None, dump=None): # Set up base_path->base_uri mappings, get inventory from disk i = self.inventory i.capabilities = capabilities s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper) if self.max_sitemap_entries is not None: s.max_sitemap_entries = self.max_sitemap_entries if outfile is None: print s.resources_as_xml(i, capabilities=i.capabilities) else: s.write(i, basename=outfile) self.write_dump_if_requested(i, dump)
def write_sitemap(self, outfile=None, capabilities=None, dump=None): # Set up base_path->base_uri mappings, get inventory from disk i = self.inventory i.capabilities = capabilities s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper) if (self.max_sitemap_entries is not None): s.max_sitemap_entries = self.max_sitemap_entries if (outfile is None): print s.resources_as_xml(i, capabilities=i.capabilities) else: s.write(i, basename=outfile) self.write_dump_if_requested(i, dump)
def test2_pretty_output(self):
    """Pretty-printed sitemap XML puts each <url> element on its own line."""
    builder = InventoryBuilder()
    builder.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    sitemap = Sitemap()
    sitemap.pretty_xml = True
    expected = ('<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
                ' xmlns:rs="http://www.openarchives.org/rs/terms/">\n'
                '<url><loc>http://example.org/t/file_a</loc>'
                '<lastmod>2012-07-25T17:13:46Z</lastmod><rs:size>20</rs:size></url>\n'
                '<url><loc>http://example.org/t/file_b</loc>'
                '<lastmod>2001-09-09T01:46:40Z</lastmod><rs:size>45</rs:size></url>\n'
                '</urlset>')
    self.assertEqual(sitemap.resources_as_xml(inventory), expected)
def test3_with_md5(self):
    """With do_md5 enabled each resource carries an <rs:fixity type="md5"> element."""
    builder = InventoryBuilder(do_md5=True)
    builder.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    xml = Sitemap().resources_as_xml(inventory)
    # must escape + in md5
    pattern_a = '<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>'
    pattern_b = '<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>'
    self.assertNotEqual(None, re.search(pattern_a, xml))
    self.assertNotEqual(None, re.search(pattern_b, xml))
def changeset_sitemap(self,outfile=None,ref_sitemap=None,capabilities=None, dump=None): # 1. Get and parse reference sitemap rs = Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile, mapper=self.mapper) if (self.verbose): print "Reading sitemap(s) from %s ..." % (ref_sitemap) ri = rs.read(ref_sitemap) num_entries = len(ri) print "Read reference sitemap with %d entries in %d sitemaps" % (num_entries,rs.sitemaps_created) if (self.verbose): to_show = 100 override_str = ' (override with --max-sitemap-entries)' if (self.max_sitemap_entries): to_show = self.max_sitemap_entries override_str = '' if (num_entries>to_show): print "Showing first %d entries sorted by URI%s..." % (to_show,override_str) n=0 for r in i: print r n+=1 if ( n >= to_show ): break # 2. Set up base_path->base_uri mappings, get inventory from disk disk_inventory = self.inventory # 3. Calculate changeset (same,updated,deleted,created)=ri.compare(disk_inventory) changeset = ChangeSet() changeset.capabilities = capabilities changeset.add_changed_resources( updated, changetype='updated' ) changeset.add_changed_resources( deleted, changetype='deleted' ) changeset.add_changed_resources( created, changetype='created' ) # 4. Write out changeset s = Sitemap(verbose=self.verbose, pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper) if (self.max_sitemap_entries is not None): s.max_sitemap_entries = self.max_sitemap_entries if (outfile is None): print s.resources_as_xml(changeset) else: s.write(changeset,basename=outfile) self.write_dump_if_requested(changeset,dump)
def test2_pretty_output(self):
    """Check pretty-printed sitemap XML for the two-file test directory."""
    ib = InventoryBuilder()
    ib.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    got = None
    s = Sitemap()
    s.pretty_xml = True
    got = s.resources_as_xml(ib.from_disk())
    self.assertEqual(
        got,
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:size>45</rs:size></url>\n</urlset>')
def changeset_sitemap(self, outfile=None, ref_sitemap=None, newref_sitemap=None, empty=None, capabilities=None, dump=None): changeset = ChangeSet() changeset.capabilities = capabilities if (not empty): # 1. Get and parse reference sitemap old_inv = self.read_reference_sitemap(ref_sitemap) # 2. Depending on whether a newref_sitemap was specified, either read that # or build inventory from files on disk if (newref_sitemap is None): # Get inventory from disk new_inv = self.inventory else: new_inv = self.read_reference_sitemap(newref_sitemap, name='new reference') # 3. Calculate changeset (same, updated, deleted, created) = old_inv.compare(new_inv) changeset.add_changed_resources(updated, changetype='UPDATED') changeset.add_changed_resources(deleted, changetype='DELETED') changeset.add_changed_resources(created, changetype='CREATED') # 4. Write out changeset s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper) if (self.max_sitemap_entries is not None): s.max_sitemap_entries = self.max_sitemap_entries if (outfile is None): print s.resources_as_xml(changeset, changeset=True) else: s.write(changeset, basename=outfile, changeset=True) self.write_dump_if_requested(changeset, dump)
def convert_to_xml(self, resources, sitemap_index=False, fh=None):
    """Write or return XML for a set of resources in sitemap format.

    Arguments:
    - resources - either an iterable or iterator of Resource objects;
                  if there is an md attribute this will go to <rs:md>;
                  if there is an ln attribute this will go to <rs:ln>
    - sitemap_index - set True to write a sitemapindex instead of a sitemap
    - fh - write to filehandle fh instead of returning a string

    Returns None (and writes nothing) when *resources* is empty.
    """
    sitemap = Sitemap()
    # Keep a reference to the resources being serialized on the instance.
    self.res_container = resources
    # Nothing to serialize: bail out rather than emit an empty document.
    # NOTE(review): len() requires *resources* to be sized — a plain
    # iterator would raise TypeError here despite the docstring; confirm
    # against callers.
    if len(self.res_container) == 0:
        return
    return sitemap.resources_as_xml(self.res_container,
                                    sitemapindex=sitemap_index, fh=fh)
def test3_with_md5(self): ib = InventoryBuilder(do_md5=True) ib.mapper = Mapper( ['http://example.org/t', 'resync/test/testdata/dir1']) i = ib.from_disk() s = Sitemap() xml = s.resources_as_xml(i) self.assertNotEqual( None, re.search( '<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>', xml)) #must escape + in md5 self.assertNotEqual( None, re.search( '<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>', xml))