def test04_parse(self):
    """Parse a source description document and check what was read back.

    The fixture is a sitemap-style ``urlset`` carrying one ``describedby``
    link, the ``description`` capability and three ``url`` entries, each
    marked with the ``capabilitylist`` capability.  All three parsed
    entries are verified, not only the first one.
    """
    # Adjacent literals are concatenated by the compiler, so the resulting
    # string is identical to a single-line literal; it is only split here
    # to keep the fixture readable.
    xml = (
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
        ' xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<rs:ln href="http://example.org/about" rel="describedby" />'
        '<rs:md capability="description" />'
        '<url><loc>http://example.org/ds1/cl.xml</loc>'
        '<rs:md capability="capabilitylist" /></url>'
        '<url><loc>http://example.org/ds2/cl.xml</loc>'
        '<rs:md capability="capabilitylist" /></url>'
        '<url><loc>http://example.org/ds3/cl.xml</loc>'
        '<rs:md capability="capabilitylist" /></url>'
        '</urlset>'
    )
    sd = SourceDescription()
    sd.parse(str=xml)

    # Document-level metadata.
    self.assertEqual(sd.link_href('describedby'),
                     'http://example.org/about',
                     'describedby link')
    self.assertEqual(sd.capability, 'description')
    self.assertEqual(len(sd.resources), 3, 'got 3 capability lists')

    # Unpacking doubles as a check that iteration yields exactly three items.
    r1, r2, r3 = sd.resources
    expected_uris = (
        'http://example.org/ds1/cl.xml',
        'http://example.org/ds2/cl.xml',
        'http://example.org/ds3/cl.xml',
    )
    # NOTE(review): assumes sd.resources iterates in the ds1, ds2, ds3 order
    # of the fixture -- the pre-existing r1 check already relied on that.
    for resource, expected_uri in zip((r1, r2, r3), expected_uris):
        self.assertEqual(resource.uri, expected_uri)
        self.assertEqual(resource.capability, 'capabilitylist')
# NOTE(review): the next statements are the tail of a routine whose beginning
# lies above this excerpt; restore the enclosing indentation when merging.
# Format the "time" of the last entry of time_sorted_resources (presumably the
# newest one -- confirm against the sorting code above) as a UTC-style stamp.
new_lasttime = time_sorted_resources[-1]["time"].strftime(
    "%Y-%m-%dT%H:%M:%SZ")
# write this time to the timefile; the context manager closes the handle even
# when the write itself fails
with open(args.time_file, "w") as timefile_out:
    timefile_out.write(new_lasttime)


# Downloads all resource lists
def get_resource_lists(resources):
    """Download and process every resource list referenced by *resources*.

    Each value of the *resources* mapping must expose a ``uri`` attribute.
    The document behind that URI is fetched, parsed into a ``ResourceList``
    and its entries are handed to ``get_resources``.

    Raises ``requests.exceptions.HTTPError`` when a download answers with
    an HTTP error status, so an error page is never parsed as a resource
    list.
    """
    # Only the values are needed; values() exists on Python 2 and 3 alike,
    # whereas iteritems() is Python 2 only.
    for resource_list_resource in resources.values():
        resource_list_response = requests.get(resource_list_resource.uri)
        resource_list_response.raise_for_status()
        resource_list = ResourceList()
        resource_list.parse(str=resource_list_response.text)
        get_resources(resource_list.resources)


# Download URI of the source description XML from
# --> should actually be either via robots.txt or in .well-known
source_desc_response = requests.get(args.source_description_uri)
source_desc_response.raise_for_status()
source_desc = SourceDescription()
source_desc.parse(str=source_desc_response.text)
# The single-element unpacking requires exactly one entry in the source
# description and raises ValueError otherwise.
[capabilitylist_resource] = source_desc.resources

# Download capability list obtained from source description
capabilitylist_response = requests.get(capabilitylist_resource.uri)
capabilitylist_response.raise_for_status()
capabilitylist = CapabilityList()
capabilitylist.parse(str=capabilitylist_response.text)

# Download resource lists obtained from capability list
get_resource_lists(capabilitylist.resources)
# NOTE(review): the next statements are the tail of a routine whose beginning
# lies above this excerpt; restore the enclosing indentation when merging.
# store newest modified time of newest resource
new_lasttime = time_sorted_resources[-1]["time"].strftime("%Y-%m-%dT%H:%M:%SZ")
# write this time to the timefile; using a context manager guarantees the
# handle is closed even if the write raises
with open(args.time_file, "w") as timefile_out:
    timefile_out.write(new_lasttime)


# Downloads all resource lists
def get_resource_lists(resources):
    """Fetch, parse and process each resource list named in *resources*.

    *resources* is a mapping whose values carry a ``uri`` attribute.  For
    every value the referenced document is downloaded, parsed into a
    ``ResourceList`` and its ``resources`` are passed to ``get_resources``.

    Raises ``requests.exceptions.HTTPError`` if a download returns an HTTP
    error status, instead of feeding the error body to the parser.
    """
    # The keys were never used; values() also works on Python 3, where
    # iteritems() no longer exists.
    for resource_list_resource in resources.values():
        resource_list_response = requests.get(resource_list_resource.uri)
        resource_list_response.raise_for_status()
        resource_list = ResourceList()
        resource_list.parse(str=resource_list_response.text)
        get_resources(resource_list.resources)


# Download URI of the source description XML from
# --> should actually be either via robots.txt or in .well-known
source_desc_response = requests.get(args.source_description_uri)
source_desc_response.raise_for_status()
source_desc = SourceDescription()
source_desc.parse(str=source_desc_response.text)
# Exactly one entry is expected here: the single-element unpacking raises
# ValueError when the source description holds none or several.
[capabilitylist_resource] = source_desc.resources

# Download capability list obtained from source description
capabilitylist_response = requests.get(capabilitylist_resource.uri)
capabilitylist_response.raise_for_status()
capabilitylist = CapabilityList()
capabilitylist.parse(str=capabilitylist_response.text)

# Download resource lists obtained from capability list
get_resource_lists(capabilitylist.resources)