Пример #1
0
 def test04_dump_multi_file_max_size(self):
     """Write 12 resources with max_size=2000 and check the 2 zip files.

     The first dump is expected to hold resources a-f and the second
     g-l, each preceded by a manifest.xml entry. Both zip files are
     removed after they have been inspected.
     """
     rl = ResourceList()
     # range() instead of xrange() so the loop runs on Python 2 and 3
     for letter in map(chr, range(ord('a'), ord('l') + 1)):
         uri = 'http://ex.org/%s' % (letter)
         fname = 'resync/test/testdata/a_to_z/%s' % (letter)
         rl.add(Resource(uri, path=fname))
     self.assertEqual(len(rl), 12)
     d2 = Dump(rl)
     tmpbase = os.path.join(self.tmpdir, 'test0f_')
     d2.max_size = 2000
     n = d2.write(tmpbase)
     self.assertEqual(n, 2, 'expect to write 2 dump files')
     self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
     self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
     # Look at the first file in detail
     zipf = tmpbase + '00000.zip'
     zo = zipfile.ZipFile(zipf, 'r')
     try:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'a', 'b', 'c', 'd', 'e', 'f'])
         # The size of manifest.xml is not checked here
         self.assertEqual(zo.getinfo('a').file_size, 9)
         self.assertEqual(zo.getinfo('b').file_size, 1116)
         self.assertEqual(zo.getinfo('c').file_size, 32)
         self.assertEqual(zo.getinfo('d').file_size, 13)
         self.assertEqual(zo.getinfo('e').file_size, 20)
         self.assertEqual(zo.getinfo('f').file_size, 1625)
     finally:
         # Close the archive even when an assertion above fails
         zo.close()
     os.unlink(zipf)
     # Check the second file has the expected contents
     zipf = tmpbase + '00001.zip'
     zo = zipfile.ZipFile(zipf, 'r')
     try:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'g', 'h', 'i', 'j', 'k', 'l'])
     finally:
         zo.close()
     os.unlink(zipf)
 def test01_no_links(self):
     """Run resync with --resourcelist on dir1 and parse its output.

     The parsed list must hold 2 entries and have no 'describedby' link.
     """
     args = ['--resourcelist', 'http://example.org/t', 'tests/testdata/dir1']
     output = run_resync(args)
     parsed = ResourceList()
     parsed.parse(fh=io.BytesIO(output))
     self.assertEqual(len(parsed), 2)
     self.assertEqual(parsed.link('describedby'), None)
Пример #3
0
    def read_reference_resource_list(self, ref_sitemap, name='reference'):
        """Read reference resource list and return the ResourceList object.

        ref_sitemap is passed as the uri argument of ResourceList.read().
        Only the index is read unless self.allow_multifile is set.

        The name parameter is only used in output messages to say what
        type of resource list is being read.

        When self.verbose is set the entries are also printed, limited
        to the first 100 or to self.max_sitemap_entries if that is set.
        """
        rl = ResourceList()
        self.logger.info("Reading reference %s resource list from %s ..." %
                         (name, ref_sitemap))
        rl.mapper = self.mapper
        rl.read(uri=ref_sitemap, index_only=(not self.allow_multifile))
        num_entries = len(rl.resources)
        self.logger.info(
            "Read %s resource list with %d entries in %d sitemaps" %
            (name, num_entries, rl.num_files))
        if self.verbose:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ''
            if num_entries > to_show:
                # print(...) with a single argument behaves the same on
                # Python 2 and Python 3
                print("Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str))
            for n, r in enumerate(rl.resources, 1):
                print(r)
                if n >= to_show:
                    break
        return rl
Пример #4
0
 def test01_no_links(self):
     """Run resync with --write-resourcelist on dir1 and parse its output.

     Expects 2 entries and no 'describedby' link in the parsed list.
     """
     command_args = [
         '--write-resourcelist',
         'http://example.org/t',
         'tests/testdata/dir1',
     ]
     raw_xml = run_resync(command_args)
     result = ResourceList()
     result.parse(fh=io.BytesIO(raw_xml))
     self.assertEqual(len(result), 2)
     self.assertEqual(result.link('describedby'), None)
Пример #5
0
 def test20_as_xml(self):
     """as_xml() output has the resourcelist capability and the entry for 'a'."""
     resources = ResourceList()
     for uri, stamp in (('a', 1), ('b', 2)):
         resources.add(Resource(uri, timestamp=stamp))
     doc = resources.as_xml()
     capability_re = r'<rs:md .*capability="resourcelist"'
     entry_a_re = r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>'
     self.assertTrue(re.search(capability_re, doc), 'XML has capability')
     self.assertTrue(re.search(entry_a_re, doc), 'XML has resource a')
Пример #6
0
 def test04_dump_multi_file_max_size(self):
     """Dump 12 resources with max_size=2000 and inspect both zip files.

     Expects two dumps: the first with resources a-f (sizes checked
     individually), the second with g-l, each starting with a
     manifest.xml entry. The zip files are deleted once checked.
     """
     rl = ResourceList()
     for letter in map(chr, range(ord('a'), ord('l') + 1)):
         uri = 'http://ex.org/%s' % (letter)
         fname = 'tests/testdata/a_to_z/%s' % (letter)
         rl.add(Resource(uri, path=fname))
     self.assertEqual(len(rl), 12)
     d2 = Dump(rl)
     tmpbase = os.path.join(self.tmpdir, 'test0f_')
     d2.max_size = 2000
     n = d2.write(tmpbase)
     self.assertEqual(n, 2, 'expect to write 2 dump files')
     self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
     self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
     # Look at the first file in detail; the context manager closes the
     # archive even if one of the assertions fails
     zipf = tmpbase + '00000.zip'
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'a', 'b', 'c', 'd', 'e', 'f'])
         # The size of manifest.xml is not checked here
         self.assertEqual(zo.getinfo('a').file_size, 9)
         self.assertEqual(zo.getinfo('b').file_size, 1116)
         self.assertEqual(zo.getinfo('c').file_size, 32)
         self.assertEqual(zo.getinfo('d').file_size, 13)
         self.assertEqual(zo.getinfo('e').file_size, 20)
         self.assertEqual(zo.getinfo('f').file_size, 1625)
     os.unlink(zipf)
     # Check the second file has the expected contents
     zipf = tmpbase + '00001.zip'
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'g', 'h', 'i', 'j', 'k', 'l'])
     os.unlink(zipf)
Пример #7
0
 def test00_dump_creation(self):
     """check_files() over two resources with local paths gives total_size 28."""
     resources = ResourceList()
     resources.add(Resource("http://ex.org/a",
                            length=1,
                            path="resync/test/testdata/a"))
     resources.add(Resource("http://ex.org/b",
                            length=2,
                            path="resync/test/testdata/b"))
     dump = Dump()
     dump.check_files(resource_list=resources)
     self.assertEqual(dump.total_size, 28)
Пример #8
0
 def test11_bad_size(self):
     """check_files() passes with check_length=False but raises DumpError by default.

     The resource declares length=9999 for tests/testdata/a (presumably
     not the real file size).
     """
     oversized = Resource('http://ex.org/a',
                          length=9999,
                          path='tests/testdata/a')
     resources = ResourceList()
     resources.add(oversized)
     dump = Dump(resources)
     self.assertTrue(dump.check_files(check_length=False))
     with self.assertRaises(DumpError):
         dump.check_files()
Пример #9
0
    def read_reference_resource_list(self, ref_sitemap, name='reference'):
        """Read reference resource list and return the ResourceList object.

        ref_sitemap is given to ResourceList.read() as the uri; only the
        index is read when self.allow_multifile is false.

        The name parameter is used solely in output messages to say what
        type of resource list is being read.

        If self.verbose is set, entries are printed as well, stopping
        after 100 entries or after self.max_sitemap_entries if set.
        """
        rl = ResourceList()
        self.logger.info("Reading reference %s resource list from %s ..." % (name, ref_sitemap))
        rl.mapper = self.mapper
        rl.read(uri=ref_sitemap, index_only=(not self.allow_multifile))
        num_entries = len(rl.resources)
        self.logger.info("Read %s resource list with %d entries in %d sitemaps" % (name, num_entries, rl.num_files))
        if self.verbose:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ''
            if num_entries > to_show:
                # Parenthesised single-argument print works identically
                # as a Python 2 statement and a Python 3 function call
                print("Showing first %d entries sorted by URI%s..." % (to_show, override_str))
            n = 0
            for r in rl.resources:
                print(r)
                n += 1
                if n >= to_show:
                    break
        return rl
Пример #10
0
 def test20_as_xml(self):
     """as_xml() output includes the resourcelist capability and resource 'a'.

     The leftover debugging print of the XML was dropped: it cluttered
     test output and was a Python-2-only print statement.
     """
     rl = ResourceList()
     rl.add(Resource('a', timestamp=1))
     rl.add(Resource('b', timestamp=2))
     xml = rl.as_xml()
     self.assertTrue(re.search(r'<rs:md .*capability="resourcelist"', xml), 'XML has capability')
     self.assertTrue(re.search(r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>', xml), 'XML has resource a')
Пример #11
0
 def test_09_print_from_iter(self):
     """resources_as_xml() given an iterator over a ResourceList yields the expected XML."""
     entries = [
         Resource(uri='a', lastmod='2001-01-01', length=1234),
         Resource(uri='b', lastmod='2002-02-02', length=56789),
     ]
     rl = ResourceList()
     for entry in entries:
         rl.add(entry)
     expected = (
         "<?xml version='1.0' encoding='UTF-8'?>\n"
         "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" "
         "xmlns:rs=\"http://www.openarchives.org/rs/terms/\">"
         "<url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod>"
         "<rs:md length=\"1234\" /></url>"
         "<url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod>"
         "<rs:md length=\"56789\" /></url>"
         "</urlset>"
     )
     self.assertEqual(Sitemap().resources_as_xml(iter(rl)), expected)
Пример #12
0
 def test_08_print_non_ascii_uri(self):
     """Check that non-ASCII Unicode uri values appear in the XML output."""
     rl = ResourceList(md={'capability': 'resourcelist', 'modified': None})
     for uri in (u'a_\u00c3_b', u'c_\u1234_d'):
         rl.add(Resource(uri=uri))
     xml = Sitemap().resources_as_xml(rl)
     expected_patterns = (
         u'<loc>a_.*_b</loc>',
         u'<loc>a_\u00c3_b</loc>',
         u'<loc>c_\u1234_d</loc>',
     )
     for pattern in expected_patterns:
         self.assertTrue(re.search(pattern, xml))
Пример #13
0
 def test_08_print_non_ascii_uri(self):
     """Unicode uri values must come through into the generated XML."""
     metadata = {'capability': 'resourcelist', 'modified': None}
     resources = ResourceList(md=metadata)
     first = Resource(uri=u'a_\u00c3_b')
     resources.add(first)
     second = Resource(uri=u'c_\u1234_d')
     resources.add(second)
     output = Sitemap().resources_as_xml(resources)
     # Loose match first, then the exact non-ASCII characters
     loose_match = re.search(u'<loc>a_.*_b</loc>', output)
     self.assertTrue(loose_match)
     exact_first = re.search(u'<loc>a_\u00c3_b</loc>', output)
     self.assertTrue(exact_first)
     exact_second = re.search(u'<loc>c_\u1234_d</loc>', output)
     self.assertTrue(exact_second)
Пример #14
0
 def test20_as_xml(self):
     """as_xml() output has capability, a seconds-precision modified time and resource 'a'.

     The stray debugging print of the XML was removed: it added noise
     to test runs and was a Python-2-only print statement.
     """
     rl = ResourceList()
     rl.add(Resource('a', timestamp=1))
     rl.add(Resource('b', timestamp=2))
     xml = rl.as_xml()
     self.assertTrue(re.search(r'<rs:md .*capability="resourcelist"', xml), 'XML has capability')
     self.assertTrue(re.search(r'<rs:md .*modified="\d\d\d\d\-\d\d\-\d\dT\d\d:\d\d:\d\dZ"', xml), 'XML has modified to seconds precision (and not more)')
     self.assertTrue(re.search(r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>', xml), 'XML has resource a')
Пример #15
0
 def test_ex_01(self):
     """resourcesync_ex_1 is a simple resource_list with 2 resources, no metadata"""
     parsed = ResourceList()
     parsed.parse(uri='tests/testdata/examples_from_spec/resourcesync_ex_1.xml')
     self.assertEqual(parsed.capability, 'resourcelist')
     self.assertEqual(len(parsed.resources), 2, '2 resources')
     expected_uris = ['http://example.com/res1', 'http://example.com/res2']
     self.assertEqual(sorted(parsed.uris()), expected_uris)
     res1 = parsed.resources['http://example.com/res1']
     self.assertEqual(res1.lastmod, None)
Пример #16
0
    def test31_parse_no_capability(self):
        """Parsing a sitemap with no rs:md still gives capability 'resourcelist' and no 'from'."""
        document = (
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
            '<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod></url>'
            '</urlset>'
        )
        parsed = ResourceList()
        parsed.parse(fh=StringIO.StringIO(document))
        self.assertEqual(len(parsed.resources), 1, 'got 1 resource')
        self.assertEqual(parsed.md['capability'], 'resourcelist',
                         'capability set by reading routine')
        self.assertFalse('from' in parsed.md)
Пример #17
0
 def test_08_print(self):
     """resources_as_xml() for three resources produces the expected XML, including length 0."""
     entries = [
         Resource(uri='a', lastmod='2001-01-01', length=1234),
         Resource(uri='b', lastmod='2002-02-02', length=56789),
         Resource(uri='c', lastmod='2003-03-03', length=0),
     ]
     rl = ResourceList(md={'capability': 'resourcelist', 'modified': None})
     for entry in entries:
         rl.add(entry)
     expected = (
         "<?xml version='1.0' encoding='UTF-8'?>\n"
         "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" "
         "xmlns:rs=\"http://www.openarchives.org/rs/terms/\">"
         "<rs:md capability=\"resourcelist\" />"
         "<url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url>"
         "<url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url>"
         "<url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length=\"0\" /></url>"
         "</urlset>"
     )
     self.assertEqual(Sitemap().resources_as_xml(rl), expected)
Пример #18
0
    def test31_parse_no_capability(self):
        """A sitemap without rs:md parses with capability 'resourcelist' and no 'from' key."""
        header = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
        body = ('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                '<url><loc>http://example.com/res1</loc>'
                '<lastmod>2012-03-14T18:37:36Z</lastmod></url>'
                '</urlset>')
        source = io.StringIO(header + body)
        result = ResourceList()
        result.parse(fh=source)
        self.assertEqual(len(result.resources), 1, 'got 1 resource')
        self.assertEqual(result.md['capability'], 'resourcelist',
                         'capability set by reading routine')
        self.assertFalse('from' in result.md)
Пример #19
0
 def test_ex_08(self):
     """resourcesync_ex_8 is a simple Resource List Index with 2 Resource Lists"""
     index = ResourceList()
     index.read(uri='tests/testdata/examples_from_spec/resourcesync_ex_8.xml',
                index_only=True)
     self.assertEqual(index.capability, 'resourcelist')
     self.assertEqual(index.md_at, '2013-01-03T09:00:00Z')
     self.assertEqual(len(index.resources), 2, '2 resources')
     expected_parts = [
         'http://example.com/resourcelist-part1.xml',
         'http://example.com/resourcelist-part2.xml',
     ]
     self.assertEqual(sorted(index.uris()), expected_parts)
Пример #20
0
 def test_01_read_local_filenames(self):
     """Reading sitemapindex2/sitemap.xml gives 17 resources; spot-check sorted URIs."""
     rl = ResourceList()
     rl.read('tests/testdata/sitemapindex2/sitemap.xml')
     self.assertEqual(len(rl.resources), 17,
                      '17 resources from 3 sitemaps listed')
     uris = sorted(rl.uris())
     # (position in sorted list, expected URI)
     spot_checks = (
         (0, 'http://localhost:8888/resources/1'),
         (1, 'http://localhost:8888/resources/10'),
         (2, 'http://localhost:8888/resources/100'),
         (3, 'http://localhost:8888/resources/1000'),
         (16, 'http://localhost:8888/resources/826'),
     )
     for position, expected in spot_checks:
         self.assertEqual(uris[position], expected)
Пример #21
0
 def test06_add_changed_resources(self):
     """Exercise ChangeList.add_changed_resources() and de-duplication on re-add.

     Two 'created' entries then two 'updated' entries are added to a
     ChangeList (4 entries in total); adding that ChangeList to a fresh
     ResourceList with replace=True must leave 3 unique resources.
     """
     created = ResourceList()
     created.add(Resource('a', timestamp=1, change='created'))
     created.add(Resource('d', timestamp=4, change='created'))
     self.assertEqual(len(created), 2, "2 things in added resource_list")
     change_list = ChangeList()
     change_list.add_changed_resources(created, change='created')
     self.assertEqual(len(change_list), 2, "2 things added")
     entries = iter(change_list)
     entry = next(entries)
     self.assertEqual(entry.uri, 'a', "changes[0].uri=a")
     self.assertEqual(entry.timestamp, 1, "changes[0].timestamp=1")
     self.assertEqual(entry.change, 'created')
     entry = next(entries)
     self.assertEqual(entry.timestamp, 4, "changes[1].timestamp=4")
     self.assertEqual(entry.change, 'created', "changes[1].change=createdd")
     # Second batch: 'a' repeats an existing uri, 'b' is new
     modified = ResourceList()
     modified.add(Resource('a', timestamp=5, change='created'))
     modified.add(Resource('b', timestamp=6, change='created'))
     self.assertEqual(len(modified), 2, "2 things in updated resource_list")
     change_list.add_changed_resources(modified, change='updated')
     self.assertEqual(len(change_list), 4, "4 = 2 old + 2 things updated")
     # Build a new resource_list from the changes; it should have no dupes
     merged = ResourceList()
     merged.add(change_list, replace=True)
     self.assertEqual(len(merged), 3, "3 unique resources")
     res_a = merged.resources['a']
     self.assertEqual(res_a.timestamp, 5)  # the later entry (5) replaced 1
     self.assertEqual(res_a.change, 'updated')
     res_b = merged.resources['b']
     self.assertEqual(res_b.timestamp, 6)
     self.assertEqual(res_b.change, 'updated')
     res_d = merged.resources['d']
     self.assertEqual(res_d.timestamp, 4)
     self.assertEqual(res_d.change, 'created')
Пример #22
0
 def test_ex_02(self):
     """resourcesync_ex_2 is a simple resource_list with 2 resources, some metadata"""
     parsed = ResourceList()
     parsed.parse(uri='tests/testdata/examples_from_spec/resourcesync_ex_2.xml')
     self.assertEqual(len(parsed.resources), 2, '2 resources')
     self.assertEqual(sorted(parsed.uris()),
                      ['http://example.com/res1', 'http://example.com/res2'])
     res1 = parsed.resources['http://example.com/res1']
     res2 = parsed.resources['http://example.com/res2']
     self.assertEqual(res1.lastmod, '2013-01-02T13:00:00Z')
     self.assertEqual(res2.lastmod, '2013-01-02T14:00:00Z')
     self.assertEqual(res1.md5, '1584abdf8ebdc9802ac0c6a7402c03b6')
     self.assertEqual(res2.md5, '1e0d5cb8ef6ba40c99b14c0237be735e')
 def test_01_read_local_filenames(self):
     """Read sitemapindex2/sitemap.xml and check the count and selected sorted URIs."""
     resource_list = ResourceList()
     resource_list.read('tests/testdata/sitemapindex2/sitemap.xml')
     total = len(resource_list.resources)
     self.assertEqual(total, 17, '17 resources from 3 sitemaps listed')
     ordered = sorted(resource_list.uris())
     prefix_checks = [
         'http://localhost:8888/resources/1',
         'http://localhost:8888/resources/10',
         'http://localhost:8888/resources/100',
         'http://localhost:8888/resources/1000',
     ]
     # The first four sorted URIs, then the last of the 17
     for index, expected in enumerate(prefix_checks):
         self.assertEqual(ordered[index], expected)
     self.assertEqual(ordered[16], 'http://localhost:8888/resources/826')
Пример #24
0
 def test_09_print_from_iter(self):
     """Feed resources_as_xml() an iterator rather than the ResourceList itself."""
     first = Resource(uri='a', lastmod='2001-01-01', length=1234)
     second = Resource(uri='b', lastmod='2002-02-02', length=56789)
     resources = ResourceList()
     resources.add(first)
     resources.add(second)
     generated = Sitemap().resources_as_xml(iter(resources))
     wanted = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url></urlset>"
     self.assertEqual(generated, wanted)
Пример #25
0
 def test_02_read_with_mapper(self):
     """Read sitemap_mapper.xml with a Mapper set; expect 17 resources and known sorted URIs."""
     rl = ResourceList()
     rl.mapper = Mapper(['http://localhost/=tests/testdata/sitemapindex2/'])
     rl.read('tests/testdata/sitemapindex2/sitemap_mapper.xml')
     self.assertEqual(len(rl.resources), 17,
                      '17 resources from 3 sitemaps listed')
     uris = sorted(rl.uris())
     # (position in sorted list, expected URI)
     spot_checks = (
         (0, 'http://localhost:8888/resources/1'),
         (1, 'http://localhost:8888/resources/10'),
         (2, 'http://localhost:8888/resources/100'),
         (3, 'http://localhost:8888/resources/1000'),
         (16, 'http://localhost:8888/resources/826'),
     )
     for position, expected in spot_checks:
         self.assertEqual(uris[position], expected)
Пример #26
0
 def test06_add_iterable(self):
     """add() accepts a list; duplicates raise unless replace=True is given."""
     first = Resource(uri='a', length=1)
     second = Resource(uri='b', length=2)
     rl = ResourceList()
     rl.add([first, second])
     for duplicate in (first, second):
         with self.assertRaises(ResourceListDupeError):
             rl.add(duplicate)
     # With replace=True the same uri overwrites the existing entry
     replacement = Resource(uri='a', length=10)
     rl.add([replacement], replace=True)
     self.assertEqual(len(rl), 2)
     self.assertEqual(rl.resources['a'].length, 10)
Пример #27
0
    def test30_parse(self):
        """Parse a 2-entry resource list and check capability and modified in md."""
        document = (
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
            '<rs:md capability="resourcelist" modified="2013-01-01"/>'
            '<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>'
            '<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>'
            '</urlset>'
        )
        parsed = ResourceList()
        parsed.parse(fh=StringIO.StringIO(document))
        self.assertEqual(len(parsed.resources), 2, 'got 2 resources')
        self.assertEqual(parsed.md['capability'], 'resourcelist', 'capability set')
        self.assertEqual(parsed.md['modified'], '2013-01-01')
Пример #28
0
 def test08_iter(self):
     """Iterating a ResourceList yields all 4 resources, 'a' first and 'd' last."""
     rl = ResourceList()
     for stamp, uri in enumerate(('a', 'b', 'c', 'd'), 1):
         rl.add(Resource(uri, timestamp=stamp))
     seen = [resource for resource in rl]
     self.assertEqual(len(seen), 4)
     self.assertEqual(seen[0].uri, 'a')
     self.assertEqual(seen[3].uri, 'd')
Пример #29
0
    def test30_parse(self):
        """Parse a 2-entry resource list and check capability, md_at and md_completed."""
        lines = [
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n',
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">',
            '<rs:md at="2013-08-07" capability="resourcelist" completed="2013-08-08" />',
            '<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>',
            '<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>',
            '</urlset>',
        ]
        source = StringIO.StringIO(''.join(lines))
        result = ResourceList()
        result.parse(fh=source)
        self.assertEqual(len(result.resources), 2, 'got 2 resources')
        self.assertEqual(result.md['capability'], 'resourcelist', 'capability set')
        self.assertEqual(result.md_at, '2013-08-07')
        self.assertEqual(result.md_completed, '2013-08-08')
Пример #30
0
 def test02_resourcelist(self):
     """A CapabilityList with one ResourceList capability serialises to the expected XML."""
     resource_list = ResourceList()
     capabilities = CapabilityList()
     capabilities.add_capability(resource_list,
                                 "http://example.org/resourcelist.xml")
     capabilities.md['from'] = "2013-02-07T22:39:00"
     self.assertEqual(len(capabilities), 1)
     expected = (
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
         '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
         'xmlns:rs="http://www.openarchives.org/rs/terms/">'
         '<rs:md capability="capabilitylist" from="2013-02-07T22:39:00" />'
         '<url><loc>http://example.org/resourcelist.xml</loc>'
         '<rs:md capability="resourcelist" /></url>'
         '</urlset>'
     )
     self.assertEqual(capabilities.as_xml(), expected)
Пример #31
0
    def get_state_published(self):
        """
        See if publish_dir has a zip end file. If so, return the path of the zip end file and the resourcelist
        (with local paths) of resources published in the zip end file.

        The resourcelist is parsed from the .xml file that has the same base name as the zip end file.

        :return:    - the path to the zip end file or None if there is no zip end file.
                    - the resourcelist of resources published in zip end file or an empty ResourceList
                      if there is no zip end file.
        :raises RuntimeError: if more than one zip end file is found in publish_dir.
        """
        path_zip_end_old = None
        rl_end_old = ResourceList()

        zip_end_files = glob(
            os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
        if len(zip_end_files) > 1:
            raise RuntimeError(
                "Found more than one %s*.zip files. Inconsistent structure of %s."
                % (PREFIX_END_PART, self.publish_dir))
        elif len(zip_end_files) == 1:
            path_zip_end_old = zip_end_files[0]

        if path_zip_end_old:
            rl_path = os.path.splitext(path_zip_end_old)[0] + ".xml"
            # The context manager closes the file even if parsing raises
            with open(rl_path, "r") as rl_file:
                sm = Sitemap()
                sm.parse_xml(rl_file, resources=rl_end_old)

        return path_zip_end_old, rl_end_old
Пример #32
0
 def test_build_ex_16(self):
     """Build a Resource List with up and index links and compare with resourcesync_ex_16."""
     built = ResourceList()
     built.up = 'http://example.com/dataset1/capabilitylist.xml'
     built.index = 'http://example.com/dataset1/resourcelist-index.xml'
     built.md_at = "2013-01-03T09:00:00Z"
     res3 = Resource(uri='http://example.com/res3',
                     lastmod='2013-01-02T13:00:00Z',
                     md5='1584abdf8ebdc9802ac0c6a7402c8753',
                     length=4385,
                     mime_type="application/pdf")
     built.add(res3)
     res4 = Resource(uri='http://example.com/res4',
                     lastmod='2013-01-02T14:00:00Z',
                     md5='4556abdf8ebdc9802ac0c6a7402c9881',
                     length=883,
                     mime_type="image/png")
     built.add(res4)
     expected_xml = self._open_ex('resourcesync_ex_16').read()
     self._assert_xml_equal(built.as_xml(), expected_xml)
Пример #33
0
 def generate(self):
     """Build and return a ResourceList snapshot of the source.

     The list is created from self.source.resources and
     self.source.resource_count; the elapsed time is logged at info level.
     """
     started = time.time()
     snapshot = ResourceList(
         resources=self.source.resources,
         count=self.source.resource_count,
     )
     elapsed = time.time() - started
     self.logger.info("Generated resource_list: %f" % elapsed)
     return snapshot
Пример #34
0
 def test03_dump_multi_file_max_size(self):
     """Write 12 resources with max_files=4 and check the 3 zip files.

     No size limit is set here; the number of files per dump is what
     splits the output. Each dump is expected to hold a manifest.xml
     followed by four resources (a-d, e-h, i-l). The zip files are
     removed after they have been inspected.
     """
     rl = ResourceList()
     # range() instead of xrange() so the loop runs on Python 2 and 3
     for letter in map(chr, range(ord('a'), ord('l') + 1)):
         uri = 'http://ex.org/%s' % (letter)
         fname = 'resync/test/testdata/a_to_z/%s' % (letter)
         rl.add(Resource(uri, path=fname))
     self.assertEqual(len(rl), 12)
     d2 = Dump(rl)
     tmpbase = os.path.join(self.tmpdir, 'test03_')
     d2.max_files = 4
     n = d2.write(tmpbase)
     self.assertEqual(n, 3, 'expect to write 3 dump files')
     self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
     self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
     self.assertTrue(os.path.isfile(tmpbase + '00002.zip'))
     # Look at the first file in detail
     zipf = tmpbase + '00000.zip'
     zo = zipfile.ZipFile(zipf, 'r')
     try:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'a', 'b', 'c', 'd'])
         # The size of manifest.xml is not checked here
         self.assertEqual(zo.getinfo('a').file_size, 9)
         self.assertEqual(zo.getinfo('b').file_size, 1116)
         self.assertEqual(zo.getinfo('c').file_size, 32)
         self.assertEqual(zo.getinfo('d').file_size, 13)
     finally:
         # Close the archive even when an assertion above fails
         zo.close()
     os.unlink(zipf)
     # Check second and third files have expected contents
     zipf = tmpbase + '00001.zip'
     zo = zipfile.ZipFile(zipf, 'r')
     try:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'e', 'f', 'g', 'h'])
     finally:
         zo.close()
     os.unlink(zipf)
     zipf = tmpbase + '00002.zip'
     zo = zipfile.ZipFile(zipf, 'r')
     try:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'i', 'j', 'k', 'l'])
     finally:
         zo.close()
     os.unlink(zipf)
Пример #35
0
 def test03_dump_multi_file_max_size(self):
     """Dump 12 resources with max_files=4 and inspect the three zip files.

     Only the files-per-dump limit is set. Expected contents are a
     manifest.xml plus a-d, e-h and i-l respectively; sizes are checked
     for the first dump only. Each zip file is deleted once checked.
     """
     rl = ResourceList()
     for letter in map(chr, range(ord('a'), ord('l') + 1)):
         uri = 'http://ex.org/%s' % (letter)
         fname = 'tests/testdata/a_to_z/%s' % (letter)
         rl.add(Resource(uri, path=fname))
     self.assertEqual(len(rl), 12)
     d2 = Dump(rl)
     tmpbase = os.path.join(self.tmpdir, 'test03_')
     d2.max_files = 4
     n = d2.write(tmpbase)
     self.assertEqual(n, 3, 'expect to write 3 dump files')
     self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
     self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
     self.assertTrue(os.path.isfile(tmpbase + '00002.zip'))
     # Look at the first file in detail; the context manager closes the
     # archive even if one of the assertions fails
     zipf = tmpbase + '00000.zip'
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'a', 'b', 'c', 'd'])
         # The size of manifest.xml is not checked here
         self.assertEqual(zo.getinfo('a').file_size, 9)
         self.assertEqual(zo.getinfo('b').file_size, 1116)
         self.assertEqual(zo.getinfo('c').file_size, 32)
         self.assertEqual(zo.getinfo('d').file_size, 13)
     os.unlink(zipf)
     # Check second and third files have expected contents
     zipf = tmpbase + '00001.zip'
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'e', 'f', 'g', 'h'])
     os.unlink(zipf)
     zipf = tmpbase + '00002.zip'
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(zo.namelist(),
                          ['manifest.xml', 'i', 'j', 'k', 'l'])
     os.unlink(zipf)
Пример #36
0
    def test32_parse_bad_capability(self):
        """Parsing a document whose rs:md has capability="bad_capability" raises SitemapParseError."""
        document = (
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
            '<rs:md capability="bad_capability" from="2013-01-01"/>'
            '<url><loc>http://example.com/bad_res_1</loc><lastmod>2012-03-14T18:37:36Z</lastmod></url>'
            '</urlset>'
        )
        parser = ResourceList()
        source = StringIO.StringIO(document)
        with self.assertRaises(SitemapParseError):
            parser.parse(fh=source)
Пример #37
0
 def test02_changed(self):
     """compare() reports both resources as changed when only timestamps differ."""
     source = ResourceList()
     for uri, stamp in (('a', 1), ('b', 2)):
         source.add(Resource(uri, timestamp=stamp))
     destination = ResourceList()
     for uri, stamp in (('a', 3), ('b', 4)):
         destination.add(Resource(uri, timestamp=stamp))
     result = destination.compare(source)
     (same, changed, deleted, added) = result
     self.assertEqual(len(same), 0, "0 things unchanged")
     self.assertEqual(len(changed), 2, "2 things changed")
     changed_iter = iter(changed)
     first_changed = next(changed_iter)
     self.assertEqual(first_changed.uri, 'a', "first was a")
     second_changed = next(changed_iter)
     self.assertEqual(second_changed.uri, 'b', "second was b")
     self.assertEqual(len(deleted), 0, "nothing deleted")
     self.assertEqual(len(added), 0, "nothing added")
Пример #38
0
 def test03_multiple(self):
     """A CapabilityList with a ResourceList and a ChangeList lists both in its XML."""
     capabilities = CapabilityList()
     resource_list = ResourceList()
     capabilities.add_capability(resource_list, "rl.xml")
     change_list = ChangeList()
     capabilities.add_capability(change_list, "cl.xml")
     self.assertEqual(len(capabilities), 2)
     doc = capabilities.as_xml()
     rl_entry = r'<loc>rl.xml</loc><rs:md capability="resourcelist" />'
     cl_entry = r'<loc>cl.xml</loc><rs:md capability="changelist" />'
     self.assertTrue(re.search(rl_entry, doc))
     self.assertTrue(re.search(cl_entry, doc))
Пример #39
0
 def list_resources_chunk(self):
     """
     Create a new ResourceList and hand it to list_patch_files, limited to max_files_compressed files.
     The value returned by list_patch_files is passed back as the second item
     (presumably a boolean telling whether resource_dir was exhausted; confirm against list_patch_files).
     :return: the ResourceList, exhausted
     """
     chunk = ResourceList()
     limit = self.max_files_compressed
     exhausted = self.list_patch_files(chunk, max_files=limit)
     return chunk, exhausted
Пример #40
0
 def test_build_ex_14(self):
     """Resource List with 2 entries and some metadata"""
     built = ResourceList()
     built.up = 'http://example.com/dataset1/capabilitylist.xml'
     built.md_at = "2013-01-03T09:00:00Z"
     built.md_completed = "2013-01-03T09:01:00Z"
     res1 = Resource(uri='http://example.com/res1',
                     lastmod='2013-01-02T13:00:00Z',
                     md5='1584abdf8ebdc9802ac0c6a7402c03b6',
                     length=8876,
                     mime_type="text/html")
     built.add(res1)
     res2 = Resource(uri='http://example.com/res2',
                     lastmod='2013-01-02T14:00:00Z',
                     md5='1e0d5cb8ef6ba40c99b14c0237be735e',
                     sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
                     length=14599,
                     mime_type="application/pdf")
     built.add(res2)
     expected_xml = self._open_ex('resourcesync_ex_14').read()
     self._assert_xml_equal(built.as_xml(), expected_xml)
Пример #41
0
 def test07_hashes(self):
     """hashes() returns the set of hash names present on the listed resources."""
     first = Resource(uri='a')
     second = Resource(uri='b')
     rl = ResourceList()
     # Empty list, then resources without any hash: no hash types
     self.assertEqual(rl.hashes(), set())
     for resource in (first, second):
         rl.add(resource)
     self.assertEqual(rl.hashes(), set())
     # Hashes set on resources already in the list are picked up
     first.md5 = "aabbcc"
     self.assertEqual(rl.hashes(), {'md5'})
     second.sha1 = "ddeeff"
     self.assertEqual(rl.hashes(), {'md5', 'sha-1'})
Пример #42
0
 def test_build_ex_13(self):
     """Capability List document with 4 entries"""
     built = CapabilityList()
     built.describedby = 'http://example.com/info_about_set1_of_resources.xml'
     built.up = 'http://example.com/resourcesync_description.xml'
     # (capability class, uri) in the order the entries are added
     entries = (
         (ResourceList, 'http://example.com/dataset1/resourcelist.xml'),
         (ResourceDump, 'http://example.com/dataset1/resourcedump.xml'),
         (ChangeList, 'http://example.com/dataset1/changelist.xml'),
         (ChangeDump, 'http://example.com/dataset1/changedump.xml'),
     )
     for capability_class, uri in entries:
         built.add_capability(capability=capability_class(uri=uri))
     expected_xml = self._open_ex('resourcesync_ex_13').read()
     self._assert_xml_equal(built.as_xml(), expected_xml)
Пример #43
0
 def test05_add(self):
     """add() rejects a duplicate uri with ResourceListDupeError unless replace=True."""
     first = Resource(uri='a', length=1)
     second = Resource(uri='b', length=2)
     rl = ResourceList()
     rl.add(first)
     with self.assertRaises(ResourceListDupeError):
         rl.add(first)
     rl.add(second)
     with self.assertRaises(ResourceListDupeError):
         rl.add(second)
     # replace=True lets a resource with an existing uri overwrite it
     replacement = Resource(uri='a', length=10)
     rl.add(replacement, replace=True)
     self.assertEqual(len(rl), 2)
     self.assertEqual(rl.resources['a'].length, 10)
Пример #44
0
 def test04_added(self):
     """compare() reports resources present only in the source as added."""
     src = ResourceList()
     for uri, ts in (('a', 1), ('b', 2), ('c', 3), ('d', 4)):
         src.add(Resource(uri, timestamp=ts))
     dst = ResourceList()
     for uri, ts in (('a', 1), ('c', 3)):
         dst.add(Resource(uri, timestamp=ts))
     same, changed, deleted, added = dst.compare(src)
     self.assertEqual(len(same), 2, "2 things unchanged")
     self.assertEqual(len(changed), 0, "nothing changed")
     self.assertEqual(len(deleted), 0, "nothing deleted")
     self.assertEqual(len(added), 2, "b and d added")
     # The added resources are expected in this order
     added_iter = iter(added)
     for expected_uri, msg in (('b', "first was b"), ('d', "second was d")):
         self.assertEqual(next(added_iter).uri, expected_uri, msg)
Пример #45
0
 def test07_has_md5(self):
     """has_md5() turns true once a resource in the list has an md5."""
     first = Resource(uri='a')
     second = Resource(uri='b')
     rl = ResourceList()
     # Empty list
     self.assertFalse(rl.has_md5())
     for res in (first, second):
         rl.add(res)
     # Populated, but no resource carries an md5 yet
     self.assertFalse(rl.has_md5())
     first.md5 = "aabbcc"
     self.assertTrue(rl.has_md5())
Пример #46
0
 def test_build_ex_02(self):
     """Build a slightly more complex Resource List and compare with example 2."""
     rl = ResourceList()
     rl.md_at = '2013-01-03T09:00:00Z'
     res1 = Resource(
         uri='http://example.com/res1',
         lastmod='2013-01-02T13:00:00Z',
         md5='1584abdf8ebdc9802ac0c6a7402c03b6',
     )
     rl.add(res1)
     res2 = Resource(
         uri='http://example.com/res2',
         lastmod='2013-01-02T14:00:00Z',
         md5='1e0d5cb8ef6ba40c99b14c0237be735e',
     )
     # The second resource also carries a "duplicate" link
     res2.link_set(rel="duplicate", href="http://mirror.example.com/res2")
     rl.add(res2)
     expected_xml = self._open_ex('resourcesync_ex_2').read()
     self._assert_xml_equal(rl.as_xml(), expected_xml)
Пример #47
0
 def test_build_ex_01(self):
     """Build a simple two-resource Resource List and compare with example 1."""
     rl = ResourceList()
     rl.md_at = '2013-01-03T09:00:00Z'
     for uri in ('http://example.com/res1', 'http://example.com/res2'):
         rl.add(Resource(uri))
     expected_xml = self._open_ex('resourcesync_ex_1').read()
     self._assert_xml_equal(rl.as_xml(), expected_xml)
Пример #48
0
 def test07_hashes(self):
     """hashes() reflects which digest types the listed resources carry."""
     res_a = Resource(uri='a')
     res_b = Resource(uri='b')
     rl = ResourceList()
     no_hashes = set()
     # Nothing reported for an empty list
     self.assertEqual(rl.hashes(), no_hashes)
     rl.add(res_a)
     rl.add(res_b)
     # Nor for resources that have no digests
     self.assertEqual(rl.hashes(), no_hashes)
     res_a.md5 = "aabbcc"
     self.assertEqual(rl.hashes(), {'md5'})
     # The sha1 attribute is reported under the name 'sha-1'
     res_b.sha1 = "ddeeff"
     self.assertEqual(rl.hashes(), {'md5', 'sha-1'})
Пример #49
0
 def test02_resource_list_links(self):
     """Link options passed to resync show up as links in the Resource List."""
     args = [
         '--resourcelist',
         '--describedby-link=a',
         '--sourcedescription-link=b',  # will be ignored
         '--capabilitylist-link=c',
         'http://example.org/t',
         'resync/test/testdata/dir1',
     ]
     xml = run_resync(args)
     rl = ResourceList()
     rl.parse(fh=StringIO.StringIO(xml))
     self.assertEqual(len(rl), 2)
     # The describedby option maps to the 'describedby' link and the
     # capabilitylist option to the 'up' link
     for rel, expected_href in (('describedby', 'a'), ('up', 'c')):
         self.assertNotEqual(rl.link(rel), None)
         self.assertEqual(rl.link(rel)['href'], expected_href)
Пример #50
0
 def test02_resource_list_links(self):
     """Link options passed to resync show up as links in the Resource List."""
     args = [
         '--write-resourcelist',
         '--describedby-link=a',
         '--sourcedescription-link=b',  # will be ignored
         '--capabilitylist-link=c',
         'http://example.org/t',
         'tests/testdata/dir1',
     ]
     xml = run_resync(args)
     rl = ResourceList()
     rl.parse(fh=io.BytesIO(xml))
     self.assertEqual(len(rl), 2)
     # The describedby option maps to the 'describedby' link and the
     # capabilitylist option to the 'up' link
     for rel, expected_href in (('describedby', 'a'), ('up', 'c')):
         self.assertNotEqual(rl.link(rel), None)
         self.assertEqual(rl.link(rel)['href'], expected_href)
Пример #51
0
 def test07_has_md5(self):
     """has_md5() is false until some resource in the list has an md5 set."""
     res_a = Resource(uri='a')
     res_b = Resource(uri='b')
     rl = ResourceList()
     self.assertFalse(rl.has_md5())  # empty list
     rl.add(res_a)
     rl.add(res_b)
     self.assertFalse(rl.has_md5())  # resources present, none with md5
     # md5 assigned after the resource was added to the list
     res_a.md5 = "aabbcc"
     self.assertTrue(rl.has_md5())
Пример #52
0
 def test01_same(self):
     """compare() of two lists holding equal resources reports all as same.

     Both lists contain resources 'a' and 'b' with matching timestamps,
     so nothing should be reported as changed, deleted or added.
     """
     src = ResourceList()
     src.add( Resource('a',timestamp=1) )
     src.add( Resource('b',timestamp=2) )
     dst = ResourceList()
     dst.add( Resource('a',timestamp=1) )
     dst.add( Resource('b',timestamp=2) )
     ( same, changed, deleted, added ) = dst.compare(src)
     self.assertEqual( len(same), 2, "2 things unchanged" )
     i = iter(same)
     # Use the built-in next(): the iterator .next() method exists only
     # on Python 2, whereas next(i) works on Python 2.6+ and Python 3
     self.assertEqual( next(i).uri, 'a', "first was a" )
     self.assertEqual( next(i).uri, 'b', "second was b" )
     self.assertEqual( len(changed), 0, "nothing changed" )
     self.assertEqual( len(deleted), 0, "nothing deleted" )
     self.assertEqual( len(added), 0, "nothing added" )
Пример #53
0
 def test_build_ex_08(self):
     """Build a simple Resource List Index and compare with example 8.

     An index like this would normally be produced as a by-product of
     writing a large Resource List across multiple files, but it can
     also be assembled by hand as done here.
     """
     index = ResourceList()
     # Mark the list as a sitemapindex rather than a plain list
     index.sitemapindex = True
     index.md_at = '2013-01-03T09:00:00Z'
     part_uris = (
         'http://example.com/resourcelist-part1.xml',
         'http://example.com/resourcelist-part2.xml',
     )
     for part_uri in part_uris:
         index.add(Resource(uri=part_uri))
     expected_xml = self._open_ex('resourcesync_ex_8').read()
     self._assert_xml_equal(index.as_xml(), expected_xml)