示例#1
0
 def test04_parse(self):
     """Parse a source description and check its link, capability and entries.

     The document carries one ``describedby`` link, the ``description``
     capability and three ``<url>`` entries, each marked as a
     ``capabilitylist``. Every parsed entry is verified, not just the first.
     """
     xml = ('<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
            'xmlns:rs="http://www.openarchives.org/rs/terms/">'
            '<rs:ln href="http://example.org/about" rel="describedby" />'
            '<rs:md capability="description" />'
            '<url><loc>http://example.org/ds1/cl.xml</loc>'
            '<rs:md capability="capabilitylist" /></url>'
            '<url><loc>http://example.org/ds2/cl.xml</loc>'
            '<rs:md capability="capabilitylist" /></url>'
            '<url><loc>http://example.org/ds3/cl.xml</loc>'
            '<rs:md capability="capabilitylist" /></url>'
            '</urlset>')
     expected_uris = [
         'http://example.org/ds1/cl.xml',
         'http://example.org/ds2/cl.xml',
         'http://example.org/ds3/cl.xml',
     ]
     sd = SourceDescription()
     sd.parse(str=xml)
     self.assertEqual(sd.link_href('describedby'),
                      'http://example.org/about',
                      'describedby link')
     self.assertEqual(sd.capability, 'description')
     # Check the count first so the pairwise comparison below cannot
     # silently skip entries when fewer resources were parsed.
     self.assertEqual(len(sd.resources), len(expected_uris),
                      'got 3 capability lists')
     # Entries are expected in the same order as in the document
     # (ds1, ds2, ds3), as the first-entry check already assumed.
     for resource, expected_uri in zip(sd.resources, expected_uris):
         self.assertEqual(resource.uri, expected_uri)
         self.assertEqual(resource.capability, 'capabilitylist')
示例#2
0
    new_lasttime = time_sorted_resources[-1]["time"].strftime(
        "%Y-%m-%dT%H:%M:%SZ")
    # write this time to the timefile
    timefile_out = open(args.time_file, "w")
    timefile_out.write(new_lasttime)
    timefile_out.close()


# Downloads all resource lists
def get_resource_lists(resources):
    """Download and parse every resource list, then fetch its resources.

    For each value in ``resources`` the ``uri`` attribute is requested over
    HTTP, the response text is parsed into a ``ResourceList`` and the parsed
    list's ``resources`` are handed to ``get_resources``.

    :param resources: mapping whose values expose a ``uri`` attribute; the
        keys are not used.
    :raises requests.HTTPError: if a request returns an HTTP error status.
    """
    # Only the values are needed; ``values()`` is available on both
    # Python 2 and Python 3 mappings, unlike ``iteritems()``.
    for resource_list_resource in resources.values():
        resource_list_response = requests.get(resource_list_resource.uri)
        # Fail on an HTTP error status instead of handing an error page
        # to the XML parser.
        resource_list_response.raise_for_status()
        resource_list = ResourceList()
        resource_list.parse(str=resource_list_response.text)
        get_resources(resource_list.resources)


# Download the source description XML from args.source_description_uri
# --> the URI should actually be discovered via robots.txt or .well-known
source_desc_response = requests.get(args.source_description_uri)
# Stop on an HTTP error status instead of parsing an error page
source_desc_response.raise_for_status()
source_desc = SourceDescription()
source_desc.parse(str=source_desc_response.text)
# Single-element unpacking: raises ValueError unless the source description
# lists exactly one resource.
[capabilitylist_resource] = source_desc.resources

# Download the capability list obtained from the source description
capabilitylist_response = requests.get(capabilitylist_resource.uri)
capabilitylist_response.raise_for_status()
capabilitylist = CapabilityList()
capabilitylist.parse(str=capabilitylist_response.text)

# Download the resource lists obtained from the capability list
get_resource_lists(capabilitylist.resources)
示例#3
0
	# store newest modified time of newest resource
	new_lasttime = time_sorted_resources[-1]["time"].strftime("%Y-%m-%dT%H:%M:%SZ")
	# write this time to the timefile
	timefile_out = open(args.time_file, "w")
	timefile_out.write(new_lasttime)
	timefile_out.close()

# Downloads all resource lists
def get_resource_lists(resources):
	"""Download and parse every resource list, then fetch its resources.

	For each value in ``resources`` the ``uri`` attribute is requested over
	HTTP, the response text is parsed into a ``ResourceList`` and the parsed
	list's ``resources`` are handed to ``get_resources``.

	:param resources: mapping whose values expose a ``uri`` attribute; the
		keys are not used.
	:raises requests.HTTPError: if a request returns an HTTP error status.
	"""
	# Only the values are needed; ``values()`` is available on both
	# Python 2 and Python 3 mappings, unlike ``iteritems()``.
	for resource_list_resource in resources.values():
		resource_list_response = requests.get(resource_list_resource.uri)
		# Fail on an HTTP error status instead of handing an error page
		# to the XML parser.
		resource_list_response.raise_for_status()
		resource_list = ResourceList()
		resource_list.parse(str=resource_list_response.text)
		get_resources(resource_list.resources)



# Download the source description XML from args.source_description_uri
# --> the URI should actually be discovered via robots.txt or .well-known
source_desc_response = requests.get(args.source_description_uri)
# Stop on an HTTP error status instead of parsing an error page
source_desc_response.raise_for_status()
source_desc = SourceDescription()
source_desc.parse(str=source_desc_response.text)
# Single-element unpacking: raises ValueError unless the source description
# lists exactly one resource.
[capabilitylist_resource] = source_desc.resources

# Download the capability list obtained from the source description
capabilitylist_response = requests.get(capabilitylist_resource.uri)
capabilitylist_response.raise_for_status()
capabilitylist = CapabilityList()
capabilitylist.parse(str=capabilitylist_response.text)

# Download the resource lists obtained from the capability list
get_resource_lists(capabilitylist.resources)