def test_02_resource_created(self):
    # A ResourceChange with changetype CREATED is serialized on its own,
    # then inside an inventory, and finally parsed back from the XML.
    change = ResourceChange('http://example.org/r/1', 1234, 9999,
                            'Q2hlY2sgSW50ZWdyaXR5IQ==',
                            changetype='CREATED')
    expected_url = "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><lastmod rs:type=\"created\">1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"
    self.assertEqual(Sitemap().resource_as_xml(change), expected_url)

    # Wrap the change in an inventory and serialize the whole <urlset>
    inventory = Inventory()
    inventory.add(change)
    inv_xml = Sitemap().resources_as_xml(inventory)
    expected_urlset = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><url><loc>http://example.org/r/1</loc><lastmod rs:type=\"created\">1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url></urlset>"
    self.assertEqual(inv_xml, expected_urlset)

    # Round trip: parse the XML back into ResourceChange objects
    parser = Sitemap()
    parser.resource_class = ResourceChange
    parsed = parser.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
    self.assertEqual(len(parsed), 1)
    resource = next(iter(parsed))
    self.assertEqual(resource.uri, 'http://example.org/r/1')
    self.assertEqual(resource.timestamp, 1234)
    self.assertEqual(resource.changetype, 'CREATED')
def test5_add_changed_resources(self):
    # Feed two inventories into a ChangeSet (first as 'created', then as
    # 'updated') and check that folding the change set into an Inventory
    # with replace=True leaves one entry per URI.
    created = Inventory()
    created.add(Resource('a', timestamp=1))
    created.add(Resource('d', timestamp=4))
    self.assertEqual(len(created), 2, "2 things in added inventory")

    changes = ChangeSet()
    changes.add_changed_resources(created, changetype='created')
    self.assertEqual(len(changes), 2, "2 things added")
    change_iter = iter(changes)
    first = next(change_iter)
    self.assertEqual(first.uri, 'a', "changes[0].uri=a")
    self.assertEqual(first.timestamp, 1, "changes[0].timestamp=1")
    self.assertEqual(first.changetype, 'created',
                     "changes[0].changetype=created")
    second = next(change_iter)
    self.assertEqual(second.timestamp, 4, "changes[1].timestamp=4")
    self.assertEqual(second.changetype, 'created',
                     "changes[1].changetype=created")

    # Second batch: 'a' is already in the change set, 'b' is new
    modified = Inventory()
    modified.add(Resource('a', timestamp=5))
    modified.add(Resource('b', timestamp=6))
    self.assertEqual(len(modified), 2, "2 things in updated inventory")
    changes.add_changed_resources(modified, changetype='updated')
    self.assertEqual(len(changes), 4, "4 = 2 old + 2 things updated")

    # Make new inventory from the changes which should not have dupes
    merged = Inventory()
    merged.add(changes, replace=True)
    self.assertEqual(len(merged), 3, "3 unique resources")
    # For 'a' the later entry (timestamp 5) is expected to win over the
    # earlier one (timestamp 1)
    self.assertEqual(merged.resources['a'].timestamp, 5)
    self.assertEqual(merged.resources['a'].changetype, 'updated')
    self.assertEqual(merged.resources['b'].timestamp, 6)
    self.assertEqual(merged.resources['b'].changetype, 'updated')
    self.assertEqual(merged.resources['d'].timestamp, 4)
    self.assertEqual(merged.resources['d'].changetype, 'created')
def test_01_print(self):
    # An inventory holding one resource and one capability entry is
    # expected to serialize with an atom:link ahead of the <url> element.
    inventory = Inventory()
    inventory.add(Resource(uri='a', lastmod='2001-01-01', size=1234))
    inventory.capabilities['http://example.org/changeset1'] = {
        "type": "changeset",
        "attributes": ["self next"],
    }
    self.assertEqual(len(inventory.capabilities), 1)
    expected = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:rs="http://resourcesync.org/change/0.1"><atom:link href="http://example.org/changeset1" rel="self next" type="changeset" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00</lastmod><rs:size>1234</rs:size></url></urlset>'
    self.assertEqual(Sitemap().inventory_as_xml(inventory), expected)
def test_01_print(self):
    # Serialize one resource together with explicitly passed capabilities;
    # the capability is expected to appear as an xhtml:link element.
    inventory = Inventory()
    inventory.add(Resource(uri='a', lastmod='2001-01-01', size=1234))
    inventory.capabilities['http://example.org/changeset1'] = {
        "type": "changeset",
        "attributes": ["self next"],
    }
    self.assertEqual(len(inventory.capabilities), 1)
    expected = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/" xmlns:xhtml="http://www.w3.org/1999/xhtml_DEFANGED"><xhtml:link href="http://example.org/changeset1" rel="self next" type="changeset" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:size>1234</rs:size></url></urlset>'
    actual = Sitemap().resources_as_xml(
        inventory, capabilities=inventory.capabilities)
    self.assertEqual(actual, expected)
def test00_dump_creation(self):
    # Dump.check_files() over two file-backed resources should report a
    # total_size of 28.
    # NOTE(review): the declared sizes are 1 and 2, so the total presumably
    # comes from the files on disk - confirm against Dump.check_files().
    file_a = ResourceFile('http://ex.org/a', size=1,
                          file='resync/test/testdata/a')
    inventory = Inventory()
    inventory.add(file_a)
    file_b = ResourceFile('http://ex.org/b', size=2,
                          file='resync/test/testdata/b')
    inventory.add(file_b)
    dump = Dump()
    dump.check_files(inventory=inventory)
    self.assertEqual(dump.total_size, 28)
def test5_add(self):
    # Adding a resource that is already in the inventory must raise
    # ValueError.
    resources = (Resource(uri='a'), Resource(uri='b'))
    inventory = Inventory()
    for resource in resources:
        inventory.add(resource)
        # Second add of the same resource is the duplicate
        self.assertRaises(ValueError, inventory.add, resource)
def test5_add(self):
    # Each resource can be added once; a repeated add must raise ValueError.
    first = Resource(uri="a")
    second = Resource(uri="b")
    inventory = Inventory()

    inventory.add(first)
    self.assertRaises(ValueError, inventory.add, first)

    inventory.add(second)
    self.assertRaises(ValueError, inventory.add, second)
def test6_has_md5(self):
    # has_md5() is expected to be False for an empty inventory and for
    # resources without md5, and True once a contained resource gets one.
    with_md5_later = Resource(uri="a")
    without_md5 = Resource(uri="b")
    inventory = Inventory()
    self.assertFalse(inventory.has_md5())

    inventory.add(with_md5_later)
    inventory.add(without_md5)
    self.assertFalse(inventory.has_md5())

    # Set md5 on the resource object after it was added
    with_md5_later.md5 = "aabbcc"
    self.assertTrue(inventory.has_md5())
def test_09_print_subset(self):
    # Serializing with entries=[...] should emit only the named resources,
    # in the order requested ('d' before 'b').
    #
    # The fourth resource used to be bound to r3 as well, which silently
    # discarded 'c' before it was ever added. All four resources are now
    # in the inventory, so the subset request must exclude both 'a' and
    # 'c' for the expected XML to match.
    r1 = Resource(uri='a', lastmod='2001-01-01', size=1234)
    r2 = Resource(uri='b', lastmod='2002-02-02', size=56789)
    r3 = Resource(uri='c', lastmod='2003-03-03', size=0)
    r4 = Resource(uri='d', lastmod='2003-03-04', size=444)
    m = Inventory()
    for resource in (r1, r2, r3, r4):
        m.add(resource)
    self.assertEqual(
        Sitemap().inventory_as_xml(m, entries=['d', 'b']),
        "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://resourcesync.org/change/0.1\"><url><loc>d</loc><lastmod>2003-03-04T00:00:00</lastmod><rs:size>444</rs:size></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00</lastmod><rs:size>56789</rs:size></url></urlset>")
def test6_has_md5(self):
    # Inventory.has_md5() should only turn True after one of the contained
    # resources has an md5 value.
    res_a = Resource(uri='a')
    res_b = Resource(uri='b')
    inv = Inventory()
    self.assertFalse(inv.has_md5())  # empty inventory
    inv.add(res_a)
    inv.add(res_b)
    self.assertFalse(inv.has_md5())  # two resources, neither with md5
    res_a.md5 = "aabbcc"
    self.assertTrue(inv.has_md5())
def test6_has_md5(self):
    # Check has_md5() at three stages: empty inventory, resources without
    # md5, and after md5 is assigned to one already-added resource.
    hashed = Resource(uri='a')
    plain = Resource(uri='b')
    inventory = Inventory()
    self.assertFalse(inventory.has_md5())

    inventory.add(hashed)
    inventory.add(plain)
    self.assertFalse(inventory.has_md5())

    hashed.md5 = "aabbcc"
    self.assertTrue(inventory.has_md5())
def test_01_print(self):
    # One resource plus one capability: the XML is expected to carry the
    # capability as an atom:link element before the <url> entry.
    inv = Inventory()
    resource = Resource(uri='a', lastmod='2001-01-01', size=1234)
    inv.add(resource)
    capability = {"type": "changeset", "attributes": ["self next"]}
    inv.capabilities['http://example.org/changeset1'] = capability
    self.assertEqual(len(inv.capabilities), 1)
    xml = Sitemap().inventory_as_xml(inv)
    self.assertEqual(
        xml,
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:rs="http://resourcesync.org/change/0.1"><atom:link href="http://example.org/changeset1" rel="self next" type="changeset" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00</lastmod><rs:size>1234</rs:size></url></urlset>')
def test5_add_iterable(self):
    # add() accepts a list of resources; duplicates raise
    # InventoryDupeError unless replace=True is given.
    res_a = Resource(uri='a', size=1)
    res_b = Resource(uri='b', size=2)
    inventory = Inventory()
    inventory.add([res_a, res_b])
    for duplicate in (res_a, res_b):
        self.assertRaises(InventoryDupeError, inventory.add, duplicate)

    # With replace=True the new 'a' takes the place of the old one
    replacement = Resource(uri='a', size=10)
    inventory.add([replacement], replace=True)
    self.assertEqual(len(inventory), 2)
    self.assertEqual(inventory.resources['a'].size, 10)
def test1_same(self):
    # Comparing two inventories with identical entries should report both
    # as unchanged and nothing changed, deleted or added.
    source = Inventory()
    source.add(Resource('a', timestamp=1))
    source.add(Resource('b', timestamp=2))

    destination = Inventory()
    destination.add(Resource('a', timestamp=1))
    destination.add(Resource('b', timestamp=2))

    num_same, changed, deleted, added = destination.compare(source)
    self.assertEqual(num_same, 2, "2 things unchanged")
    self.assertEqual(changed, [], "nothing changed")
    self.assertEqual(deleted, [], "nothing deleted")
    self.assertEqual(added, [], "nothing added")
def test8_iter(self):
    # Iterating an inventory should yield all four resources, with 'a'
    # first and 'd' last.
    inventory = Inventory()
    inventory.add(Resource('a', timestamp=1))
    inventory.add(Resource('b', timestamp=2))
    inventory.add(Resource('c', timestamp=3))
    inventory.add(Resource('d', timestamp=4))
    # Collect through plain iteration, as a for loop would
    seen = [resource for resource in inventory]
    self.assertEqual(len(seen), 4)
    self.assertEqual(seen[0].uri, 'a')
    self.assertEqual(seen[3].uri, 'd')
def test1_same(self):
    # Two inventories built from the same (uri, timestamp) pairs must
    # compare as fully unchanged.
    entries = (('a', 1), ('b', 2))
    src = Inventory()
    for uri, stamp in entries:
        src.add(Resource(uri, timestamp=stamp))
    dst = Inventory()
    for uri, stamp in entries:
        dst.add(Resource(uri, timestamp=stamp))

    result = dst.compare(src)
    (num_same, changed, deleted, added) = result
    self.assertEqual(num_same, 2, "2 things unchanged")
    self.assertEqual(changed, [], "nothing changed")
    self.assertEqual(deleted, [], "nothing deleted")
    self.assertEqual(added, [], "nothing added")
def test7_iter(self):
    # An inventory must be iterable; four resources go in and the first
    # and last ones yielded are expected to be 'a' and 'd'.
    inv = Inventory()
    for uri, stamp in (('a', 1), ('b', 2), ('c', 3), ('d', 4)):
        inv.add(Resource(uri, timestamp=stamp))
    collected = []
    for resource in inv:
        collected.append(resource)
    self.assertEqual(len(collected), 4)
    self.assertEqual(collected[0].uri, 'a')
    self.assertEqual(collected[3].uri, 'd')
def generate(self):
    """Build an inventory snapshot of the source's resources.

    When the source has a change memory, the URI of its next changeset is
    registered as a capability of type "changeset". Resources are added
    to the inventory one by one, skipping None entries, rather than being
    passed to the Inventory constructor.

    TODO: remove as soon as resource container _len_ is fixed
    """
    caps = {}
    if self.source.has_changememory:
        changeset_uri = self.source.changememory.next_changeset_uri()
        caps[changeset_uri] = {"type": "changeset"}

    snapshot = Inventory(resources=None, capabilities=caps)
    for candidate in self.source.resources:
        if candidate is None:
            continue
        snapshot.add(candidate)
    return snapshot
def test_08_print(self):
    # Three resources should serialize as three <url> entries in the
    # order a, b, c.
    inventory = Inventory()
    inventory.add(Resource(uri="a", lastmod="2001-01-01", size=1234))
    inventory.add(Resource(uri="b", lastmod="2002-02-02", size=56789))
    inventory.add(Resource(uri="c", lastmod="2003-03-03", size=0))
    expected = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://resourcesync.org/change/0.1\"><url><loc>a</loc><lastmod>2001-01-01T00:00:00</lastmod><rs:size>1234</rs:size></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00</lastmod><rs:size>56789</rs:size></url><url><loc>c</loc><lastmod>2003-03-03T00:00:00</lastmod><rs:size>0</rs:size></url></urlset>"
    self.assertEqual(Sitemap().inventory_as_xml(inventory), expected)
def test00_dump_creation(self):
    # check_files() on an inventory of two ResourceFile entries is
    # expected to leave Dump.total_size at 28.
    # NOTE(review): 28 does not match the declared sizes (1 + 2), so it is
    # presumably measured from the test data files - confirm.
    inv = Inventory()
    inv.add(ResourceFile('http://ex.org/a',
                         size=1,
                         file='resync/test/testdata/a'))
    inv.add(ResourceFile('http://ex.org/b',
                         size=2,
                         file='resync/test/testdata/b'))
    dumper = Dump()
    dumper.check_files(inventory=inv)
    self.assertEqual(dumper.total_size, 28)
def test1_same(self):
    # compare() on identical inventories: the 'same' collection should
    # hold 'a' then 'b', and the other three collections should be empty.
    source = Inventory()
    source.add(Resource('a', timestamp=1))
    source.add(Resource('b', timestamp=2))
    destination = Inventory()
    destination.add(Resource('a', timestamp=1))
    destination.add(Resource('b', timestamp=2))

    same, changed, deleted, added = destination.compare(source)
    self.assertEqual(len(same), 2, "2 things unchanged")
    unchanged = iter(same)
    self.assertEqual(next(unchanged).uri, 'a', "first was a")
    self.assertEqual(next(unchanged).uri, 'b', "second was b")
    self.assertEqual(len(changed), 0, "nothing changed")
    self.assertEqual(len(deleted), 0, "nothing deleted")
    self.assertEqual(len(added), 0, "nothing added")
def test_02_resource_deleted(self):
    # A ResourceChange with changetype DELETED is expected to serialize
    # its timestamp as <expires>, and to survive a round trip through an
    # inventory's XML.
    change = ResourceChange('http://example.org/r/1', 1234, 9999,
                            'Q2hlY2sgSW50ZWdyaXR5IQ==',
                            changetype='DELETED')
    expected_url = "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><expires>1970-01-01T00:20:34Z</expires><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"
    self.assertEqual(Sitemap().resource_as_xml(change), expected_url)

    # Now make inventory
    inventory = Inventory()
    inventory.add(change)
    inv_xml = Sitemap().resources_as_xml(inventory)

    # and try parsing back
    parser = Sitemap()
    parser.resource_class = ResourceChange
    parsed = parser.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
    self.assertEqual(len(parsed), 1)
    resource = next(iter(parsed))
    self.assertEqual(resource.uri, 'http://example.org/r/1')
    self.assertEqual(resource.timestamp, 1234)
    self.assertEqual(resource.changetype, 'DELETED')
def test_02_resource_created(self):
    # Checks three things for a CREATED ResourceChange: its standalone
    # <url> XML, the <urlset> XML of an inventory holding it, and that the
    # inventory XML parses back to an equivalent resource.
    created = ResourceChange('http://example.org/r/1', 1234, 9999,
                             'Q2hlY2sgSW50ZWdyaXR5IQ==',
                             changetype='CREATED')
    xml = Sitemap().resource_as_xml(created)
    self.assertEqual(
        xml,
        "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><lastmod rs:type=\"created\">1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>")

    # Now make inventory
    inv = Inventory()
    inv.add(created)
    inv_xml = Sitemap().resources_as_xml(inv)
    self.assertEqual(
        inv_xml,
        "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><url><loc>http://example.org/r/1</loc><lastmod rs:type=\"created\">1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url></urlset>")

    # and try parsing back
    sitemap = Sitemap()
    sitemap.resource_class = ResourceChange
    reparsed = sitemap.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
    self.assertEqual(len(reparsed), 1)
    only = next(iter(reparsed))
    self.assertEqual(only.uri, 'http://example.org/r/1')
    self.assertEqual(only.timestamp, 1234)
    self.assertEqual(only.changetype, 'CREATED')
def test7_changeset(self):
    # changeset() should extract just the requested URIs, tagged with the
    # given changetype, and the result should be addable to an Inventory.
    source = Inventory()
    source.add(Resource('a', timestamp=1))
    source.add(Resource('b', timestamp=2))
    source.add(Resource('c', timestamp=3))
    source.add(Resource('d', timestamp=4))
    source.add(Resource('e', timestamp=5))
    self.assertEqual(len(source), 5, "5 things in src")

    changes = source.changeset(['a', 'd'], changetype='X')
    self.assertEqual(len(changes), 2, "2 things extracted")
    first = changes[0]
    self.assertEqual(first.uri, 'a', "changes[0].uri=a")
    self.assertEqual(first.timestamp, 1, "changes[0].timestamp=1")
    self.assertEqual(first.changetype, 'X', "changes[0].changetype=X")
    second = changes[1]
    self.assertEqual(second.timestamp, 4, "changes[1].timestamp=4")
    self.assertEqual(second.changetype, 'X', "changes[1].changetype=X")

    # Make new inventory from the changes
    rebuilt = Inventory()
    rebuilt.add(changes)
    self.assertEqual(rebuilt.resources['a'].timestamp, 1)
    self.assertEqual(rebuilt.resources['a'].changetype, 'X')
    self.assertEqual(rebuilt.resources['d'].timestamp, 4)
    self.assertEqual(rebuilt.resources['d'].changetype, 'X')
def test_02_resource_deleted(self):
    # Serialize a DELETED ResourceChange (timestamp expected in an
    # <expires> element), then write it via an inventory and parse the
    # XML back.
    deleted = ResourceChange('http://example.org/r/1', 1234, 9999,
                             'Q2hlY2sgSW50ZWdyaXR5IQ==',
                             changetype='DELETED')
    single_xml = Sitemap().resource_as_xml(deleted)
    self.assertEqual(
        single_xml,
        "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><expires>1970-01-01T00:20:34Z</expires><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>")

    # Now make inventory
    inv = Inventory()
    inv.add(deleted)
    inv_xml = Sitemap().resources_as_xml(inv)

    # and try parsing back
    sitemap = Sitemap()
    sitemap.resource_class = ResourceChange
    reparsed = sitemap.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
    self.assertEqual(len(reparsed), 1)
    only = next(iter(reparsed))
    self.assertEqual(only.uri, 'http://example.org/r/1')
    self.assertEqual(only.timestamp, 1234)
    self.assertEqual(only.changetype, 'DELETED')
def changeset_sitemap(self, outfile=None, ref_sitemap=None,
                      capabilities=None, dump=None):
    """Write the changeset between a reference sitemap and self.inventory.

    Reads the sitemap(s) named by ref_sitemap, compares the result with
    self.inventory, and collects the updated, deleted and created
    resources into a new Inventory whose capabilities are set to
    `capabilities`. The changeset is printed as XML when outfile is None,
    otherwise written with outfile as the basename. Finally
    self.write_dump_if_requested() is called with the changeset and
    `dump`.
    """
    # 1. Get and parse reference sitemap
    reader = Sitemap(verbose=self.verbose,
                     allow_multifile=self.allow_multifile,
                     mapper=self.mapper)
    if self.verbose:
        print("Reading sitemap(s) from %s ..." % (ref_sitemap))
    reference = reader.read(ref_sitemap)
    num_entries = len(reference)
    print("Read reference sitemap with %d entries in %d sitemaps" % (
        num_entries, reader.sitemaps_created))
    if self.verbose:
        # Preview limit: 100 unless max_sitemap_entries is set
        if self.max_sitemap_entries:
            to_show = self.max_sitemap_entries
            override_str = ''
        else:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
        if num_entries > to_show:
            print("Showing first %d entries sorted by URI%s..." % (
                to_show, override_str))
        for shown, uri in enumerate(reference.resource_uris(), 1):
            print(reference.resources[uri])
            if shown >= to_show:
                break

    # 2. Set up base_path->base_uri mappings, get inventory from disk
    disk_inventory = self.inventory

    # 3. Calculate changeset
    (num_same, updated, deleted, created) = reference.compare(disk_inventory)
    changeset = Inventory()
    changeset.capabilities = capabilities
    # Deleted entries are taken from the reference inventory; updated and
    # created entries from the disk inventory.
    for origin, uris, label in ((disk_inventory, updated, 'updated'),
                                (reference, deleted, 'deleted'),
                                (disk_inventory, created, 'created')):
        changeset.add(origin.changeset(uris, changetype=label))

    # 4. Write out changeset
    writer = Sitemap(verbose=self.verbose, pretty_xml=True,
                     allow_multifile=self.allow_multifile,
                     mapper=self.mapper)
    if self.max_sitemap_entries is not None:
        writer.max_sitemap_entries = self.max_sitemap_entries
    if outfile is None:
        print(writer.inventory_as_xml(changeset))
    else:
        writer.write(changeset, basename=outfile)
    self.write_dump_if_requested(changeset, dump)
def test7_changeset(self):
    # Extract a changeset for 'a' and 'd' out of five resources and check
    # both the extracted entries and an inventory rebuilt from them.
    src = Inventory()
    for uri, stamp in (('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)):
        src.add(Resource(uri, timestamp=stamp))
    self.assertEqual(len(src), 5, "5 things in src")

    extracted = src.changeset(['a', 'd'], changetype='X')
    self.assertEqual(len(extracted), 2, "2 things extracted")
    self.assertEqual(extracted[0].uri, 'a', "changes[0].uri=a")
    self.assertEqual(extracted[0].timestamp, 1, "changes[0].timestamp=1")
    self.assertEqual(extracted[0].changetype, 'X',
                     "changes[0].changetype=X")
    self.assertEqual(extracted[1].timestamp, 4, "changes[1].timestamp=4")
    self.assertEqual(extracted[1].changetype, 'X',
                     "changes[1].changetype=X")

    # Make new inventory from the changes
    dst = Inventory()
    dst.add(extracted)
    res_a = dst.resources['a']
    self.assertEqual(res_a.timestamp, 1)
    self.assertEqual(res_a.changetype, 'X')
    res_d = dst.resources['d']
    self.assertEqual(res_d.timestamp, 4)
    self.assertEqual(res_d.changetype, 'X')
def changeset_sitemap(self, outfile=None, ref_sitemap=None, capabilities=None,
                      dump=None):
    """Compare a reference sitemap with self.inventory and output the changes.

    The reference sitemap(s) are read from ref_sitemap and compared with
    self.inventory. Updated, deleted and created resources are gathered
    into a changeset Inventory (capabilities set from `capabilities`),
    which is printed as XML if outfile is None or written using outfile as
    the basename otherwise. self.write_dump_if_requested() is then called
    with the changeset and `dump`.
    """
    verbose = self.verbose

    # 1. Get and parse reference sitemap
    ref_reader = Sitemap(verbose=verbose,
                         allow_multifile=self.allow_multifile,
                         mapper=self.mapper)
    if verbose:
        print("Reading sitemap(s) from %s ..." % (ref_sitemap))
    ref_inventory = ref_reader.read(ref_sitemap)
    num_entries = len(ref_inventory)
    summary = "Read reference sitemap with %d entries in %d sitemaps" % (
        num_entries, ref_reader.sitemaps_created)
    print(summary)
    if verbose:
        to_show = 100
        override_str = ' (override with --max-sitemap-entries)'
        if self.max_sitemap_entries:
            to_show = self.max_sitemap_entries
            override_str = ''
        if num_entries > to_show:
            print("Showing first %d entries sorted by URI%s..." % (
                to_show, override_str))
        printed = 0
        for uri in ref_inventory.resource_uris():
            print(ref_inventory.resources[uri])
            printed += 1
            if printed >= to_show:
                break

    # 2. Set up base_path->base_uri mappings, get inventory from disk
    disk_inventory = self.inventory

    # 3. Calculate changeset
    comparison = ref_inventory.compare(disk_inventory)
    (num_same, updated, deleted, created) = comparison
    changeset = Inventory()
    changeset.capabilities = capabilities
    changeset.add(
        disk_inventory.changeset(updated, changetype='updated'))
    # Deleted resources only exist in the reference inventory
    changeset.add(
        ref_inventory.changeset(deleted, changetype='deleted'))
    changeset.add(
        disk_inventory.changeset(created, changetype='created'))

    # 4. Write out changeset
    out = Sitemap(verbose=verbose, pretty_xml=True,
                  allow_multifile=self.allow_multifile,
                  mapper=self.mapper)
    if self.max_sitemap_entries is not None:
        out.max_sitemap_entries = self.max_sitemap_entries
    if outfile is not None:
        out.write(changeset, basename=outfile)
    else:
        print(out.inventory_as_xml(changeset))
    self.write_dump_if_requested(changeset, dump)