def test04_dump_multi_file_max_size(self):
    rl = ResourceList()
    for letter in map(chr, range(ord('a'), ord('l') + 1)):
        uri = 'http://ex.org/%s' % (letter)
        fname = 'resync/test/testdata/a_to_z/%s' % (letter)
        rl.add(Resource(uri, path=fname))
    self.assertEqual(len(rl), 12)
    d2 = Dump(rl)
    tmpbase = os.path.join(self.tmpdir, 'test0f_')
    d2.max_size = 2000
    n = d2.write(tmpbase)
    self.assertEqual(n, 2, 'expect to write 2 dump files')
    self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
    # Look at the first file in detail
    zipf = tmpbase + '00000.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(),
                     ['manifest.xml', 'a', 'b', 'c', 'd', 'e', 'f'])
    # self.assertEqual(zo.getinfo('manifest.xml').file_size, 470)
    self.assertEqual(zo.getinfo('a').file_size, 9)
    self.assertEqual(zo.getinfo('b').file_size, 1116)
    self.assertEqual(zo.getinfo('c').file_size, 32)
    self.assertEqual(zo.getinfo('d').file_size, 13)
    self.assertEqual(zo.getinfo('e').file_size, 20)
    self.assertEqual(zo.getinfo('f').file_size, 1625)
    zo.close()
    os.unlink(zipf)
    # Check the second file has the expected contents
    zipf = tmpbase + '00001.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(),
                     ['manifest.xml', 'g', 'h', 'i', 'j', 'k', 'l'])
    zo.close()
    os.unlink(zipf)
def test01_no_links(self):
    xml = run_resync(['--resourcelist',
                      'http://example.org/t',
                      'tests/testdata/dir1'])
    rl = ResourceList()
    rl.parse(fh=io.BytesIO(xml))
    self.assertEqual(len(rl), 2)
    self.assertEqual(rl.link('describedby'), None)
def read_reference_resource_list(self, ref_sitemap, name='reference'):
    """Read reference resource list and return the ResourceList object.

    The name parameter is used only in output messages to say what type
    of resource list is being read.
    """
    rl = ResourceList()
    self.logger.info("Reading reference %s resource list from %s ..." %
                     (name, ref_sitemap))
    rl.mapper = self.mapper
    rl.read(uri=ref_sitemap, index_only=(not self.allow_multifile))
    num_entries = len(rl.resources)
    self.logger.info(
        "Read %s resource list with %d entries in %d sitemaps" %
        (name, num_entries, rl.num_files))
    if (self.verbose):
        to_show = 100
        override_str = ' (override with --max-sitemap-entries)'
        if (self.max_sitemap_entries):
            to_show = self.max_sitemap_entries
            override_str = ''
        if (num_entries > to_show):
            print("Showing first %d entries sorted by URI%s..." %
                  (to_show, override_str))
        n = 0
        for r in rl.resources:
            print(r)
            n += 1
            if (n >= to_show):
                break
    return rl
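# Hedged usage sketch for read_reference_resource_list() above. It is a
# method on a client-style object that already has mapper, logger,
# allow_multifile, verbose and max_sitemap_entries configured; the
# `client` name here is an assumption, not part of the code above.
def load_reference(client, ref_sitemap):
    """Read a reference list via the method above and return sorted URIs."""
    rl = client.read_reference_resource_list(ref_sitemap, name='reference')
    return sorted(rl.uris())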
def test01_no_links(self):
    xml = run_resync(['--write-resourcelist',
                      'http://example.org/t',
                      'tests/testdata/dir1'])
    rl = ResourceList()
    rl.parse(fh=io.BytesIO(xml))
    self.assertEqual(len(rl), 2)
    self.assertEqual(rl.link('describedby'), None)
def test20_as_xml(self):
    rl = ResourceList()
    rl.add(Resource('a', timestamp=1))
    rl.add(Resource('b', timestamp=2))
    xml = rl.as_xml()
    self.assertTrue(re.search(r'<rs:md .*capability="resourcelist"', xml),
                    'XML has capability')
    self.assertTrue(re.search(
        r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
        xml), 'XML has resource a')
def test04_dump_multi_file_max_size(self):
    rl = ResourceList()
    for letter in map(chr, range(ord('a'), ord('l') + 1)):
        uri = 'http://ex.org/%s' % (letter)
        fname = 'tests/testdata/a_to_z/%s' % (letter)
        rl.add(Resource(uri, path=fname))
    self.assertEqual(len(rl), 12)
    d2 = Dump(rl)
    tmpbase = os.path.join(self.tmpdir, 'test0f_')
    d2.max_size = 2000
    n = d2.write(tmpbase)
    self.assertEqual(n, 2, 'expect to write 2 dump files')
    self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
    # Look at the first file in detail
    zipf = tmpbase + '00000.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(),
                     ['manifest.xml', 'a', 'b', 'c', 'd', 'e', 'f'])
    # self.assertEqual(zo.getinfo('manifest.xml').file_size, 470)
    self.assertEqual(zo.getinfo('a').file_size, 9)
    self.assertEqual(zo.getinfo('b').file_size, 1116)
    self.assertEqual(zo.getinfo('c').file_size, 32)
    self.assertEqual(zo.getinfo('d').file_size, 13)
    self.assertEqual(zo.getinfo('e').file_size, 20)
    self.assertEqual(zo.getinfo('f').file_size, 1625)
    zo.close()
    os.unlink(zipf)
    # Check the second file has the expected contents
    zipf = tmpbase + '00001.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(),
                     ['manifest.xml', 'g', 'h', 'i', 'j', 'k', 'l'])
    zo.close()
    os.unlink(zipf)
def test00_dump_creation(self):
    i = ResourceList()
    i.add(Resource("http://ex.org/a", length=1,
                   path="resync/test/testdata/a"))
    i.add(Resource("http://ex.org/b", length=2,
                   path="resync/test/testdata/b"))
    d = Dump()
    d.check_files(resource_list=i)
    self.assertEqual(d.total_size, 28)
def test11_bad_size(self):
    rl = ResourceList()
    rl.add(Resource('http://ex.org/a', length=9999,
                    path='tests/testdata/a'))
    d = Dump(rl)
    self.assertTrue(d.check_files(check_length=False))
    self.assertRaises(DumpError, d.check_files)
def test20_as_xml(self):
    rl = ResourceList()
    rl.add(Resource('a', timestamp=1))
    rl.add(Resource('b', timestamp=2))
    xml = rl.as_xml()
    print(xml)
    self.assertTrue(re.search(r'<rs:md .*capability="resourcelist"', xml),
                    'XML has capability')
    self.assertTrue(re.search(
        r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
        xml), 'XML has resource a')
def test_09_print_from_iter(self):
    r1 = Resource(uri='a', lastmod='2001-01-01', length=1234)
    r2 = Resource(uri='b', lastmod='2002-02-02', length=56789)
    m = ResourceList()
    m.add(r1)
    m.add(r2)
    i = iter(m)
    self.assertEqual(
        Sitemap().resources_as_xml(i),
        "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url></urlset>")
def test_08_print_non_ascii_uri(self):
    """Verify that valid Unicode uri values give good XML out."""
    m = ResourceList(md={'capability': 'resourcelist', 'modified': None})
    m.add(Resource(uri=u'a_\u00c3_b'))
    m.add(Resource(uri=u'c_\u1234_d'))
    xml = Sitemap().resources_as_xml(m)
    self.assertTrue(re.search(u'<loc>a_.*_b</loc>', xml))
    self.assertTrue(re.search(u'<loc>a_\u00c3_b</loc>', xml))
    self.assertTrue(re.search(u'<loc>c_\u1234_d</loc>', xml))
def test20_as_xml(self):
    rl = ResourceList()
    rl.add(Resource('a', timestamp=1))
    rl.add(Resource('b', timestamp=2))
    xml = rl.as_xml()
    print(xml)
    self.assertTrue(re.search(r'<rs:md .*capability="resourcelist"', xml),
                    'XML has capability')
    self.assertTrue(re.search(
        r'<rs:md .*modified="\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ"', xml),
        'XML has modified to seconds precision (and not more)')
    self.assertTrue(re.search(
        r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
        xml), 'XML has resource a')
def test_ex_01(self):
    """resourcesync_ex_1 is a simple resource_list with 2 resources, no metadata."""
    rl = ResourceList()
    rl.parse(uri='tests/testdata/examples_from_spec/resourcesync_ex_1.xml')
    self.assertEqual(rl.capability, 'resourcelist')
    self.assertEqual(len(rl.resources), 2, '2 resources')
    sms = sorted(rl.uris())
    self.assertEqual(sms, ['http://example.com/res1',
                           'http://example.com/res2'])
    self.assertEqual(rl.resources['http://example.com/res1'].lastmod, None)
def test31_parse_no_capability(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\
<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod></url>\
</urlset>'
    rl = ResourceList()
    rl.parse(fh=io.StringIO(xml))
    self.assertEqual(len(rl.resources), 1, 'got 1 resource')
    self.assertEqual(rl.md['capability'], 'resourcelist',
                     'capability set by reading routine')
    self.assertFalse('from' in rl.md)
def test_08_print(self):
    r1 = Resource(uri='a', lastmod='2001-01-01', length=1234)
    r2 = Resource(uri='b', lastmod='2002-02-02', length=56789)
    r3 = Resource(uri='c', lastmod='2003-03-03', length=0)
    m = ResourceList(md={'capability': 'resourcelist', 'modified': None})
    m.add(r1)
    m.add(r2)
    m.add(r3)
    # print m
    self.assertEqual(
        Sitemap().resources_as_xml(m),
        "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><rs:md capability=\"resourcelist\" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url><url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length=\"0\" /></url></urlset>")
def test31_parse_no_capability(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\
<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod></url>\
</urlset>'
    rl = ResourceList()
    rl.parse(fh=io.StringIO(xml))
    self.assertEqual(len(rl.resources), 1, 'got 1 resource')
    self.assertEqual(rl.md['capability'], 'resourcelist',
                     'capability set by reading routine')
    self.assertFalse('from' in rl.md)
def test_ex_08(self):
    """resourcesync_ex_8 is a simple Resource List Index with 2 Resource Lists."""
    rl = ResourceList()
    rl.read(uri='tests/testdata/examples_from_spec/resourcesync_ex_8.xml',
            index_only=True)
    self.assertEqual(rl.capability, 'resourcelist')
    self.assertEqual(rl.md_at, '2013-01-03T09:00:00Z')
    self.assertEqual(len(rl.resources), 2, '2 resources')
    sms = sorted(rl.uris())
    self.assertEqual(sms, ['http://example.com/resourcelist-part1.xml',
                           'http://example.com/resourcelist-part2.xml'])
def test_01_read_local_filenames(self):
    rl = ResourceList()
    rl.read('tests/testdata/sitemapindex2/sitemap.xml')
    self.assertEqual(len(rl.resources), 17,
                     '17 resources from 3 sitemaps listed')
    sr = sorted(rl.uris())
    self.assertEqual(sr[0], 'http://localhost:8888/resources/1')
    self.assertEqual(sr[1], 'http://localhost:8888/resources/10')
    self.assertEqual(sr[2], 'http://localhost:8888/resources/100')
    self.assertEqual(sr[3], 'http://localhost:8888/resources/1000')
    self.assertEqual(sr[16], 'http://localhost:8888/resources/826')
def test06_add_changed_resources(self):
    added = ResourceList()
    added.add(Resource('a', timestamp=1, change='created'))
    added.add(Resource('d', timestamp=4, change='created'))
    self.assertEqual(len(added), 2, "2 things in added resource_list")
    changes = ChangeList()
    changes.add_changed_resources(added, change='created')
    self.assertEqual(len(changes), 2, "2 things added")
    i = iter(changes)
    first = next(i)
    self.assertEqual(first.uri, 'a', "changes[0].uri=a")
    self.assertEqual(first.timestamp, 1, "changes[0].timestamp=1")
    self.assertEqual(first.change, 'created', "changes[0].change=created")
    second = next(i)
    self.assertEqual(second.timestamp, 4, "changes[1].timestamp=4")
    self.assertEqual(second.change, 'created', "changes[1].change=created")
    # Now add some with updated (one same, one diff)
    updated = ResourceList()
    updated.add(Resource('a', timestamp=5, change='created'))
    updated.add(Resource('b', timestamp=6, change='created'))
    self.assertEqual(len(updated), 2, "2 things in updated resource_list")
    changes.add_changed_resources(updated, change='updated')
    self.assertEqual(len(changes), 4, "4 = 2 old + 2 things updated")
    # Make new resource_list from the changes which should not have dupes
    dst = ResourceList()
    dst.add(changes, replace=True)
    self.assertEqual(len(dst), 3, "3 unique resources")
    self.assertEqual(dst.resources['a'].timestamp, 5)  # 5 is later than 1
    self.assertEqual(dst.resources['a'].change, 'updated')
    self.assertEqual(dst.resources['b'].timestamp, 6)
    self.assertEqual(dst.resources['b'].change, 'updated')
    self.assertEqual(dst.resources['d'].timestamp, 4)
    self.assertEqual(dst.resources['d'].change, 'created')
def test_ex_02(self):
    """resourcesync_ex_2 is a simple resource_list with 2 resources, some metadata."""
    rl = ResourceList()
    rl.parse(uri='tests/testdata/examples_from_spec/resourcesync_ex_2.xml')
    self.assertEqual(len(rl.resources), 2, '2 resources')
    sms = sorted(rl.uris())
    self.assertEqual(sms, ['http://example.com/res1',
                           'http://example.com/res2'])
    self.assertEqual(rl.resources['http://example.com/res1'].lastmod,
                     '2013-01-02T13:00:00Z')
    self.assertEqual(rl.resources['http://example.com/res2'].lastmod,
                     '2013-01-02T14:00:00Z')
    self.assertEqual(rl.resources['http://example.com/res1'].md5,
                     '1584abdf8ebdc9802ac0c6a7402c03b6')
    self.assertEqual(rl.resources['http://example.com/res2'].md5,
                     '1e0d5cb8ef6ba40c99b14c0237be735e')
def test_02_read_with_mapper(self):
    rl = ResourceList()
    rl.mapper = Mapper(['http://localhost/=tests/testdata/sitemapindex2/'])
    rl.read('tests/testdata/sitemapindex2/sitemap_mapper.xml')
    self.assertEqual(len(rl.resources), 17,
                     '17 resources from 3 sitemaps listed')
    sr = sorted(rl.uris())
    self.assertEqual(sr[0], 'http://localhost:8888/resources/1')
    self.assertEqual(sr[1], 'http://localhost:8888/resources/10')
    self.assertEqual(sr[2], 'http://localhost:8888/resources/100')
    self.assertEqual(sr[3], 'http://localhost:8888/resources/1000')
    self.assertEqual(sr[16], 'http://localhost:8888/resources/826')
def test06_add_iterable(self):
    r1 = Resource(uri='a', length=1)
    r2 = Resource(uri='b', length=2)
    i = ResourceList()
    i.add([r1, r2])
    self.assertRaises(ResourceListDupeError, i.add, r1)
    self.assertRaises(ResourceListDupeError, i.add, r2)
    # allow dupes
    r1d = Resource(uri='a', length=10)
    i.add([r1d], replace=True)
    self.assertEqual(len(i), 2)
    self.assertEqual(i.resources['a'].length, 10)
def test30_parse(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist" modified="2013-01-01"/>\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>\
</urlset>'
    rl = ResourceList()
    rl.parse(fh=io.StringIO(xml))
    self.assertEqual(len(rl.resources), 2, 'got 2 resources')
    self.assertEqual(rl.md['capability'], 'resourcelist', 'capability set')
    self.assertEqual(rl.md['modified'], '2013-01-01')
def test08_iter(self):
    i = ResourceList()
    i.add(Resource('a', timestamp=1))
    i.add(Resource('b', timestamp=2))
    i.add(Resource('c', timestamp=3))
    i.add(Resource('d', timestamp=4))
    resources = []
    for r in i:
        resources.append(r)
    self.assertEqual(len(resources), 4)
    self.assertEqual(resources[0].uri, 'a')
    self.assertEqual(resources[3].uri, 'd')
def test30_parse(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md at="2013-08-07" capability="resourcelist" completed="2013-08-08" />\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>\
</urlset>'
    rl = ResourceList()
    rl.parse(fh=io.StringIO(xml))
    self.assertEqual(len(rl.resources), 2, 'got 2 resources')
    self.assertEqual(rl.md['capability'], 'resourcelist', 'capability set')
    self.assertEqual(rl.md_at, '2013-08-07')
    self.assertEqual(rl.md_completed, '2013-08-08')
def test02_resourcelist(self):
    rl = ResourceList()
    caps = CapabilityList()
    caps.add_capability(rl, "http://example.org/resourcelist.xml")
    caps.md['from'] = "2013-02-07T22:39:00"
    self.assertEqual(len(caps), 1)
    self.assertEqual(
        caps.as_xml(),
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/"><rs:md capability="capabilitylist" from="2013-02-07T22:39:00" /><url><loc>http://example.org/resourcelist.xml</loc><rs:md capability="resourcelist" /></url></urlset>')
def get_state_published(self):
    """See if publish_dir has a zip end file.

    If so, return the path of the zip end file and the resourcelist
    (with local paths) of resources published in the zip end file.

    :return: - the path to the zip end file, or None if there is no
               zip end file.
             - the resourcelist of resources published in the zip end
               file, or an empty ResourceList if there is no zip end file.
    """
    path_zip_end_old = None
    rl_end_old = ResourceList()
    zip_end_files = glob(
        os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
    if len(zip_end_files) > 1:
        raise RuntimeError(
            "Found more than one %s*.zip file. Inconsistent structure of %s."
            % (PREFIX_END_PART, self.publish_dir))
    elif len(zip_end_files) == 1:
        path_zip_end_old = zip_end_files[0]

    if path_zip_end_old:
        rl_file = open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r")
        sm = Sitemap()
        sm.parse_xml(rl_file, resources=rl_end_old)
        rl_file.close()

    return path_zip_end_old, rl_end_old
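# Hedged usage sketch for get_state_published() above: `publisher` is an
# assumed instance of the defining class; only the (path-or-None,
# ResourceList) return contract comes from the code above.
def describe_published_state(publisher):
    """Report whether a previous run left a zip end file behind."""
    path_zip_end, rl_end = publisher.get_state_published()
    if path_zip_end is None:
        return "no zip end file: nothing published yet"
    return "zip end file %s holds %d published resources" % (
        path_zip_end, len(rl_end.resources))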
def test_build_ex_16(self):
    rl = ResourceList()
    rl.up = 'http://example.com/dataset1/capabilitylist.xml'
    rl.index = 'http://example.com/dataset1/resourcelist-index.xml'
    rl.md_at = "2013-01-03T09:00:00Z"
    rl.add(Resource(uri='http://example.com/res3',
                    lastmod='2013-01-02T13:00:00Z',
                    md5='1584abdf8ebdc9802ac0c6a7402c8753',
                    length=4385,
                    mime_type="application/pdf"))
    rl.add(Resource(uri='http://example.com/res4',
                    lastmod='2013-01-02T14:00:00Z',
                    md5='4556abdf8ebdc9802ac0c6a7402c9881',
                    length=883,
                    mime_type="image/png"))
    ex_xml = self._open_ex('resourcesync_ex_16').read()
    self._assert_xml_equal(rl.as_xml(), ex_xml)
def generate(self):
    """Generate a resource_list (snapshot from the source)."""
    then = time.time()
    resource_list = ResourceList(resources=self.source.resources,
                                 count=self.source.resource_count)
    now = time.time()
    self.logger.info("Generated resource_list: %f" % (now - then))
    return resource_list
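# Hedged usage sketch for generate() above. The owner object is assumed
# to carry a `source` (anything exposing `resources` and `resource_count`)
# and a configured `logger`; StubSource and `owner` are illustrative
# assumptions, not names from the code above.
class StubSource(object):
    """Stand-in source with the two attributes generate() reads."""

    def __init__(self, resources):
        self.resources = resources
        self.resource_count = len(resources)

# owner.source = StubSource([Resource('http://ex.org/a', timestamp=1)])
# snapshot = owner.generate()  # returns a ResourceList, logs elapsed time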
def test03_dump_multi_file_max_size(self):
    rl = ResourceList()
    for letter in map(chr, range(ord('a'), ord('l') + 1)):
        uri = 'http://ex.org/%s' % (letter)
        fname = 'resync/test/testdata/a_to_z/%s' % (letter)
        rl.add(Resource(uri, path=fname))
    self.assertEqual(len(rl), 12)
    # d = Dump(rl)
    # tmpdir = tempfile.mkdtemp()
    # tmpbase = os.path.join(tmpdir, 'base')
    # d.max_size = 2000  # start new zip after size exceeds 2000 bytes
    # n = d.write(tmpbase)
    # self.assertEqual(n, 2, 'expect to write 2 dump files')
    #
    # Now repeat with large size limit but small number of files limit
    d2 = Dump(rl)
    tmpbase = os.path.join(self.tmpdir, 'test03_')
    d2.max_files = 4
    n = d2.write(tmpbase)
    self.assertEqual(n, 3, 'expect to write 3 dump files')
    self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00002.zip'))
    # Look at the first file in detail
    zipf = tmpbase + '00000.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'a', 'b', 'c', 'd'])
    # self.assertEqual(zo.getinfo('manifest.xml').file_size, 470)
    self.assertEqual(zo.getinfo('a').file_size, 9)
    self.assertEqual(zo.getinfo('b').file_size, 1116)
    self.assertEqual(zo.getinfo('c').file_size, 32)
    self.assertEqual(zo.getinfo('d').file_size, 13)
    zo.close()
    os.unlink(zipf)
    # Check second and third files have expected contents
    zipf = tmpbase + '00001.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'e', 'f', 'g', 'h'])
    zo.close()
    os.unlink(zipf)
    zipf = tmpbase + '00002.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'i', 'j', 'k', 'l'])
    zo.close()
    os.unlink(zipf)
def test03_dump_multi_file_max_size(self):
    rl = ResourceList()
    for letter in map(chr, range(ord('a'), ord('l') + 1)):
        uri = 'http://ex.org/%s' % (letter)
        fname = 'tests/testdata/a_to_z/%s' % (letter)
        rl.add(Resource(uri, path=fname))
    self.assertEqual(len(rl), 12)
    # d = Dump(rl)
    # tmpdir = tempfile.mkdtemp()
    # tmpbase = os.path.join(tmpdir, 'base')
    # d.max_size = 2000  # start new zip after size exceeds 2000 bytes
    # n = d.write(tmpbase)
    # self.assertEqual(n, 2, 'expect to write 2 dump files')
    #
    # Now repeat with large size limit but small number of files limit
    d2 = Dump(rl)
    tmpbase = os.path.join(self.tmpdir, 'test03_')
    d2.max_files = 4
    n = d2.write(tmpbase)
    self.assertEqual(n, 3, 'expect to write 3 dump files')
    self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00002.zip'))
    # Look at the first file in detail
    zipf = tmpbase + '00000.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'a', 'b', 'c', 'd'])
    # self.assertEqual(zo.getinfo('manifest.xml').file_size, 470)
    self.assertEqual(zo.getinfo('a').file_size, 9)
    self.assertEqual(zo.getinfo('b').file_size, 1116)
    self.assertEqual(zo.getinfo('c').file_size, 32)
    self.assertEqual(zo.getinfo('d').file_size, 13)
    zo.close()
    os.unlink(zipf)
    # Check second and third files have expected contents
    zipf = tmpbase + '00001.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'e', 'f', 'g', 'h'])
    zo.close()
    os.unlink(zipf)
    zipf = tmpbase + '00002.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'i', 'j', 'k', 'l'])
    zo.close()
    os.unlink(zipf)
def test32_parse_bad_capability(self):
    # the <rs:md capability="bad_capability".. should give error
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="bad_capability" from="2013-01-01"/>\
<url><loc>http://example.com/bad_res_1</loc><lastmod>2012-03-14T18:37:36Z</lastmod></url>\
</urlset>'
    rl = ResourceList()
    self.assertRaises(SitemapParseError, rl.parse, fh=io.StringIO(xml))
def test02_changed(self):
    src = ResourceList()
    src.add(Resource('a', timestamp=1))
    src.add(Resource('b', timestamp=2))
    dst = ResourceList()
    dst.add(Resource('a', timestamp=3))
    dst.add(Resource('b', timestamp=4))
    (same, changed, deleted, added) = dst.compare(src)
    self.assertEqual(len(same), 0, "0 things unchanged")
    self.assertEqual(len(changed), 2, "2 things changed")
    i = iter(changed)
    self.assertEqual(next(i).uri, 'a', "first was a")
    self.assertEqual(next(i).uri, 'b', "second was b")
    self.assertEqual(len(deleted), 0, "nothing deleted")
    self.assertEqual(len(added), 0, "nothing added")
def test03_multiple(self):
    caps = CapabilityList()
    rl = ResourceList()
    caps.add_capability(rl, "rl.xml")
    cl = ChangeList()
    caps.add_capability(cl, "cl.xml")
    self.assertEqual(len(caps), 2)
    xml = caps.as_xml()
    self.assertTrue(re.search(
        r'<loc>rl.xml</loc><rs:md capability="resourcelist" />', xml))
    self.assertTrue(re.search(
        r'<loc>cl.xml</loc><rs:md capability="changelist" />', xml))
def list_resources_chunk(self):
    """Fill a resource list with at most max_files_compressed resources.

    Fills a resource list up to max_files_compressed, or with as many
    rdf files as are left in resource_dir. A boolean indicates whether
    resource_dir was exhausted.

    :return: the ResourceList, exhausted
    """
    resourcelist = ResourceList()
    exhausted = self.list_patch_files(
        resourcelist, max_files=self.max_files_compressed)
    return resourcelist, exhausted
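# Hedged sketch of consuming list_resources_chunk() above: loop until
# resource_dir is exhausted. `publisher` (an instance of the defining
# class) and `handle_chunk` are illustrative assumptions; only the
# (ResourceList, exhausted) return contract comes from the code above.
def drain_chunks(publisher, handle_chunk):
    """Process chunk after chunk until the resource dir is exhausted."""
    exhausted = False
    while not exhausted:
        resourcelist, exhausted = publisher.list_resources_chunk()
        handle_chunk(resourcelist)  # e.g. zip and publish this chunk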
def test_build_ex_14(self):
    """Resource List with 2 entries and some metadata."""
    rl = ResourceList()
    rl.up = 'http://example.com/dataset1/capabilitylist.xml'
    rl.md_at = "2013-01-03T09:00:00Z"
    rl.md_completed = "2013-01-03T09:01:00Z"
    rl.add(Resource(uri='http://example.com/res1',
                    lastmod='2013-01-02T13:00:00Z',
                    md5='1584abdf8ebdc9802ac0c6a7402c03b6',
                    length=8876,
                    mime_type="text/html"))
    rl.add(Resource(uri='http://example.com/res2',
                    lastmod='2013-01-02T14:00:00Z',
                    md5='1e0d5cb8ef6ba40c99b14c0237be735e',
                    sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
                    length=14599,
                    mime_type="application/pdf"))
    ex_xml = self._open_ex('resourcesync_ex_14').read()
    self._assert_xml_equal(rl.as_xml(), ex_xml)
def test07_hashes(self):
    r1 = Resource(uri='a')
    r2 = Resource(uri='b')
    i = ResourceList()
    self.assertEqual(i.hashes(), set())
    i.add(r1)
    i.add(r2)
    self.assertEqual(i.hashes(), set())
    r1.md5 = "aabbcc"
    self.assertEqual(i.hashes(), set(['md5']))
    r2.sha1 = "ddeeff"
    self.assertEqual(i.hashes(), set(['md5', 'sha-1']))
def test_build_ex_13(self):
    """Capability List document with 4 entries."""
    cl = CapabilityList()
    cl.describedby = 'http://example.com/info_about_set1_of_resources.xml'
    cl.up = 'http://example.com/resourcesync_description.xml'
    cl.add_capability(capability=ResourceList(
        uri='http://example.com/dataset1/resourcelist.xml'))
    cl.add_capability(capability=ResourceDump(
        uri='http://example.com/dataset1/resourcedump.xml'))
    cl.add_capability(capability=ChangeList(
        uri='http://example.com/dataset1/changelist.xml'))
    cl.add_capability(capability=ChangeDump(
        uri='http://example.com/dataset1/changedump.xml'))
    ex_xml = self._open_ex('resourcesync_ex_13').read()
    self._assert_xml_equal(cl.as_xml(), ex_xml)
def test05_add(self):
    r1 = Resource(uri='a', length=1)
    r2 = Resource(uri='b', length=2)
    i = ResourceList()
    i.add(r1)
    self.assertRaises(ResourceListDupeError, i.add, r1)
    i.add(r2)
    self.assertRaises(ResourceListDupeError, i.add, r2)
    # allow dupes
    r1d = Resource(uri='a', length=10)
    i.add(r1d, replace=True)
    self.assertEqual(len(i), 2)
    self.assertEqual(i.resources['a'].length, 10)
def test04_added(self):
    src = ResourceList()
    src.add(Resource('a', timestamp=1))
    src.add(Resource('b', timestamp=2))
    src.add(Resource('c', timestamp=3))
    src.add(Resource('d', timestamp=4))
    dst = ResourceList()
    dst.add(Resource('a', timestamp=1))
    dst.add(Resource('c', timestamp=3))
    (same, changed, deleted, added) = dst.compare(src)
    self.assertEqual(len(same), 2, "2 things unchanged")
    self.assertEqual(len(changed), 0, "nothing changed")
    self.assertEqual(len(deleted), 0, "nothing deleted")
    self.assertEqual(len(added), 2, "b and d added")
    i = iter(added)
    self.assertEqual(next(i).uri, 'b', "first was b")
    self.assertEqual(next(i).uri, 'd', "second was d")
def test07_has_md5(self):
    r1 = Resource(uri='a')
    r2 = Resource(uri='b')
    i = ResourceList()
    self.assertFalse(i.has_md5())
    i.add(r1)
    i.add(r2)
    self.assertFalse(i.has_md5())
    r1.md5 = "aabbcc"
    self.assertTrue(i.has_md5())
def test_build_ex_02(self):
    """Slightly more complex Resource List document."""
    rl = ResourceList()
    rl.md_at = '2013-01-03T09:00:00Z'
    rl.add(Resource(uri='http://example.com/res1',
                    lastmod='2013-01-02T13:00:00Z',
                    md5='1584abdf8ebdc9802ac0c6a7402c03b6'))
    r2 = Resource(uri='http://example.com/res2',
                  lastmod='2013-01-02T14:00:00Z',
                  md5='1e0d5cb8ef6ba40c99b14c0237be735e')
    r2.link_set(rel="duplicate", href="http://mirror.example.com/res2")
    rl.add(r2)
    ex_xml = self._open_ex('resourcesync_ex_2').read()
    self._assert_xml_equal(rl.as_xml(), ex_xml)
def test_build_ex_01(self):
    """Simple Resource List document."""
    rl = ResourceList()
    rl.md_at = '2013-01-03T09:00:00Z'
    rl.add(Resource('http://example.com/res1'))
    rl.add(Resource('http://example.com/res2'))
    ex_xml = self._open_ex('resourcesync_ex_1').read()
    self._assert_xml_equal(rl.as_xml(), ex_xml)
def test02_resource_list_links(self):
    xml = run_resync(['--resourcelist',
                      '--describedby-link=a',
                      '--sourcedescription-link=b',  # will be ignored
                      '--capabilitylist-link=c',
                      'http://example.org/t',
                      'resync/test/testdata/dir1'])
    rl = ResourceList()
    rl.parse(fh=io.BytesIO(xml))
    self.assertEqual(len(rl), 2)
    self.assertNotEqual(rl.link('describedby'), None)
    self.assertEqual(rl.link('describedby')['href'], 'a')
    self.assertNotEqual(rl.link('up'), None)
    self.assertEqual(rl.link('up')['href'], 'c')
def test02_resource_list_links(self):
    xml = run_resync(['--write-resourcelist',
                      '--describedby-link=a',
                      '--sourcedescription-link=b',  # will be ignored
                      '--capabilitylist-link=c',
                      'http://example.org/t',
                      'tests/testdata/dir1'])
    rl = ResourceList()
    rl.parse(fh=io.BytesIO(xml))
    self.assertEqual(len(rl), 2)
    self.assertNotEqual(rl.link('describedby'), None)
    self.assertEqual(rl.link('describedby')['href'], 'a')
    self.assertNotEqual(rl.link('up'), None)
    self.assertEqual(rl.link('up')['href'], 'c')
def test01_same(self):
    src = ResourceList()
    src.add(Resource('a', timestamp=1))
    src.add(Resource('b', timestamp=2))
    dst = ResourceList()
    dst.add(Resource('a', timestamp=1))
    dst.add(Resource('b', timestamp=2))
    (same, changed, deleted, added) = dst.compare(src)
    self.assertEqual(len(same), 2, "2 things unchanged")
    i = iter(same)
    self.assertEqual(next(i).uri, 'a', "first was a")
    self.assertEqual(next(i).uri, 'b', "second was b")
    self.assertEqual(len(changed), 0, "nothing changed")
    self.assertEqual(len(deleted), 0, "nothing deleted")
    self.assertEqual(len(added), 0, "nothing added")
def test_build_ex_08(self):
    """Simple Resource List Index document.

    This is not something that would usually be created directly;
    instead it would be created as part of the process of writing a
    large Resource List in multiple files. However, it is possible
    to create one manually.
    """
    rli = ResourceList()
    rli.sitemapindex = True
    rli.md_at = '2013-01-03T09:00:00Z'
    rli.add(Resource(uri='http://example.com/resourcelist-part1.xml'))
    rli.add(Resource(uri='http://example.com/resourcelist-part2.xml'))
    ex_xml = self._open_ex('resourcesync_ex_8').read()
    self._assert_xml_equal(rli.as_xml(), ex_xml)
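# Hedged sketch of how an index like the one above normally arises: give
# a ResourceList more entries than max_sitemap_entries and let write()
# split it into part files plus a sitemapindex. The mapping, output path,
# and entry limit below are illustrative assumptions, not values from the
# test above; treat the exact splitting behavior as version-dependent.
rl = ResourceList()
rl.allow_multifile = True
rl.max_sitemap_entries = 2  # force more than one file so an index is written
rl.mapper = Mapper(['http://example.com/=/tmp/rl_out/'])
for n in range(1, 6):
    rl.add(Resource('http://example.com/res%d' % n))
rl.write(basename='/tmp/rl_out/resourcelist.xml')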