def all_resources(self):
    all_resources = {}

    # search for resourcelists
    resourcelist_files = sorted(
        glob(self.paras.abs_metadata_path("resourcelist_*.xml")))
    for rl_file_name in resourcelist_files:
        resourcelist = ResourceList()
        with open(rl_file_name, "r", encoding="utf-8") as rl_file:
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=resourcelist)

        all_resources.update({
            resource.uri: resource
            for resource in resourcelist.resources
        })

    # search for changelists
    changelist_files = sorted(
        glob(self.paras.abs_metadata_path("changelist_*.xml")))
    for cl_file_name in changelist_files:
        changelist = ChangeList()
        with open(cl_file_name, "r", encoding="utf-8") as cl_file:
            sm = Sitemap()
            sm.parse_xml(cl_file, resources=changelist)

        for resource in changelist.resources:
            if resource.change == "created" or resource.change == "updated":
                all_resources.update({resource.uri: resource})
            elif resource.change == "deleted" and resource.uri in all_resources:
                del all_resources[resource.uri]
    return all_resources
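# A self-contained sketch of the merge logic above, assuming only the resync
# library: a baseline resourcelist plus a changelist with one deletion and one
# creation. The glob/file discovery is replaced by in-memory XML strings, and
# all URIs are invented for illustration.
import io

from resync import ChangeList, ResourceList
from resync.sitemap import Sitemap

RESOURCELIST_XML = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:rs="http://www.openarchives.org/rs/terms/">
  <rs:md capability="resourcelist"/>
  <url><loc>http://example.com/res1</loc></url>
  <url><loc>http://example.com/res2</loc></url>
</urlset>"""

CHANGELIST_XML = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:rs="http://www.openarchives.org/rs/terms/">
  <rs:md capability="changelist"/>
  <url><loc>http://example.com/res2</loc><rs:md change="deleted"/></url>
  <url><loc>http://example.com/res3</loc><rs:md change="created"/></url>
</urlset>"""

resourcelist = ResourceList()
Sitemap().parse_xml(io.StringIO(RESOURCELIST_XML), resources=resourcelist)
state = {resource.uri: resource for resource in resourcelist.resources}

changelist = ChangeList()
Sitemap().parse_xml(io.StringIO(CHANGELIST_XML), resources=changelist)
for resource in changelist.resources:
    if resource.change in ("created", "updated"):
        state[resource.uri] = resource
    elif resource.change == "deleted" and resource.uri in state:
        del state[resource.uri]

print(sorted(state))  # ['http://example.com/res1', 'http://example.com/res3']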
def test_features():
    # Read Capability List and show supported capabilities
    cl = CapabilityList()
    capability_url = 'http://localhost:8888/capabilitylist.xml'
    cl.read(capability_url)
    print("\nAbout to show all capabilities...")
    for resource in cl:
        print(f"supports {resource.capability} (at {resource.uri})")

    # Read Resource List and print it
    rl = ResourceList()
    resource_url = 'http://localhost:8888/resourcelist.xml'  # this url is one of the capabilities
    rl.read(resource_url)
    for resource in rl:
        print(resource)

    # Attempting to download resources, but getting an error related to one resource
    d = dump.Dump(resources=rl)
    d.write(basename='./dump_test/test.xml')
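# The Dump error mentioned above is most likely a resource without a local
# file to package: resync's Dump builds a ZIP from each Resource's path.
# A minimal sketch, assuming a local copy exists at ./files/res1.txt; the
# URL and file name are made up.
from resync import Resource, ResourceList
from resync.dump import Dump

rl = ResourceList()
resource = Resource('http://localhost:8888/res1.txt', lastmod='2013-01-01')
resource.path = './files/res1.txt'  # local file to be packaged into the dump
rl.add(resource)

d = Dump(resources=rl)
d.write(basename='./dump_test/test')  # writes the ZIP dump under ./dump_test/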
def get_resource_list_xml(self, from_date=None, to_date=None):
    """
    Get content of resource list.

    :return: (xml) resource list content
    """
    if not self._validation():
        return None
    r = get_items_by_index_tree(self.repository_id)

    rl = ResourceList()
    rl.up = INVENIO_CAPABILITY_URL.format(request.url_root)
    for item in r:
        if item:
            resource_date = str_to_datetime(
                item.get('_source').get('_updated'))
            if from_date and str_to_datetime(from_date) > resource_date:
                continue
            if to_date and str_to_datetime(to_date) < resource_date:
                continue
            id_item = item.get('_source').get('control_number')
            # url = '{}records/{}'.format(request.url_root, str(id_item))
            url = '{}resync/{}/records/{}'.format(request.url_root,
                                                  str(self.repository_id),
                                                  str(id_item))
            rl.add(
                Resource(url,
                         lastmod=item.get('_source').get('_updated')))
    return rl.as_xml()
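# A reduced sketch of the date-window filter above, with the Invenio search
# hits replaced by plain dicts and str_to_datetime by datetime.fromisoformat.
# All names, dates, and the url_root/repository_id defaults are made up.
from datetime import datetime

from resync import Resource, ResourceList

items = [
    {'control_number': '1', '_updated': '2021-01-05'},
    {'control_number': '2', '_updated': '2021-03-10'},
]

def resource_list_xml(items, from_date=None, to_date=None,
                      url_root='http://localhost/', repository_id=7):
    rl = ResourceList()
    for item in items:
        updated = datetime.fromisoformat(item['_updated'])
        if from_date and datetime.fromisoformat(from_date) > updated:
            continue  # updated before the requested window
        if to_date and datetime.fromisoformat(to_date) < updated:
            continue  # updated after the requested window
        url = '{}resync/{}/records/{}'.format(url_root, repository_id,
                                              item['control_number'])
        rl.add(Resource(url, lastmod=item['_updated']))
    return rl.as_xml()

# only the second record falls inside the window
print(resource_list_xml(items, from_date='2021-02-01'))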
def sitemap(offset=0):
    if offset > 0:
        offset = offset - 1
    instances = __get_instances__(offset)
    resource_list = ResourceList()
    dedups = 0
    for i, row in enumerate(instances):
        instance = row.get('instance')
        if "date" in row:
            last_mod = row.get("date").get("value")[0:10]
        else:
            last_mod = datetime.datetime.utcnow().strftime(W3C_DATE)
        try:
            resource_list.add(
                Resource("{}.json".format(instance.get("value")),
                         lastmod=last_mod))
        except ResourceListDupeError:
            dedups += 1
            continue
    xml = resource_list.as_xml()
    return Response(xml, mimetype="text/xml")
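# ResourceList.add refuses duplicate URIs, which is what the dedups counter
# above relies on. A minimal demonstration with invented URIs;
# ResourceListDupeError lives in resync.resource_list.
from resync import Resource, ResourceList
from resync.resource_list import ResourceListDupeError

rl = ResourceList()
rl.add(Resource('http://example.com/res1.json', lastmod='2013-01-01'))
try:
    rl.add(Resource('http://example.com/res1.json', lastmod='2013-01-02'))
except ResourceListDupeError:
    print('duplicate skipped')  # the first entry is kept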
def create_index(self, sitemap_data_iter: iter):
    if len(sitemap_data_iter) > 1:
        resourcelist_index = ResourceList()
        resourcelist_index.sitemapindex = True
        resourcelist_index.md_at = self.date_start_processing
        resourcelist_index.md_completed = self.date_end_processing
        index_path = self.para.abs_metadata_path("resourcelist-index.xml")
        rel_index_path = os.path.relpath(index_path, self.para.resource_dir)
        index_url = self.para.url_prefix + defaults.sanitize_url_path(
            rel_index_path)
        resourcelist_index.link_set(rel="up",
                                    href=self.para.capabilitylist_url())

        for sitemap_data in sitemap_data_iter:
            resourcelist_index.add(
                Resource(uri=sitemap_data.uri,
                         md_at=sitemap_data.doc_start,
                         md_completed=sitemap_data.doc_end))
            if sitemap_data.document_saved:
                self.update_rel_index(index_url, sitemap_data.path,
                                      ResourceList())

        self.finish_sitemap(-1, resourcelist_index)
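# A minimal sketch of the index built above: a ResourceList flagged as
# sitemapindex whose entries point at the member resourcelists, with an "up"
# link to the capability list. All URLs are invented; finish_sitemap and the
# rspub parameters are omitted.
from resync import Resource, ResourceList

index = ResourceList()
index.sitemapindex = True
index.link_set(rel="up", href="http://example.com/capabilitylist.xml")
for ordinal in (1, 2):
    index.add(
        Resource(uri="http://example.com/resourcelist_{:04d}.xml".format(ordinal)))
print(index.as_xml())  # serializes as a <sitemapindex> document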
def update_previous_state(self):
    if self.previous_resources is None:
        self.previous_resources = {}

        # search for resourcelists
        self.resourcelist_files = sorted(
            glob(self.param.abs_metadata_path("resourcelist_*.xml")))
        for rl_file_name in self.resourcelist_files:
            resourcelist = ResourceList()
            with open(rl_file_name, "r", encoding="utf-8") as rl_file:
                sm = Sitemap()
                sm.parse_xml(rl_file, resources=resourcelist)

            self.date_resourcelist_completed = resourcelist.md_completed
            if self.date_resourcelist_completed is None:
                self.date_resourcelist_completed = resourcelist.md_at

            self.previous_resources.update({
                resource.uri: resource
                for resource in resourcelist.resources
            })

        # search for changedumps
        self.changedump_files = sorted(
            glob(self.param.abs_metadata_path("changedump_*.xml")))
        for cl_file_name in self.changedump_files:
            changedump = ChangeDump()
            with open(cl_file_name, "r", encoding="utf-8") as cl_file:
                sm = Sitemap()
                sm.parse_xml(cl_file, resources=changedump)

            for resource in changedump.resources:
                if resource.change == "created" or resource.change == "updated":
                    self.previous_resources.update({resource.uri: resource})
                elif resource.change == "deleted" and resource.uri in self.previous_resources:
                    del self.previous_resources[resource.uri]
def generator() -> [SitemapData, ResourceList]:
    resourcelist = None
    ordinal = self.find_ordinal(Capability.resourcelist.name)
    resource_count = 0
    doc_start = None
    resource_generator = self.resource_generator()
    for resource_count, resource in resource_generator(resource_metadata):
        # stuff resource into resourcelist
        if resourcelist is None:
            resourcelist = ResourceList()
            doc_start = defaults.w3c_now()
            resourcelist.md_at = doc_start
        resourcelist.add(resource)

        # under conditions: yield the current resourcelist
        if resource_count % self.param.max_items_in_list == 0:
            ordinal += 1
            doc_end = defaults.w3c_now()
            resourcelist.md_completed = doc_end
            sitemap_data = self.finish_sitemap(ordinal, resourcelist,
                                               doc_start=doc_start,
                                               doc_end=doc_end)
            yield sitemap_data, resourcelist
            resourcelist = None

    # under conditions: yield the current and last resourcelist
    if resourcelist:
        ordinal += 1
        doc_end = defaults.w3c_now()
        resourcelist.md_completed = doc_end
        sitemap_data = self.finish_sitemap(ordinal, resourcelist,
                                           doc_start=doc_start,
                                           doc_end=doc_end)
        yield sitemap_data, resourcelist
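# A stripped-down sketch of the chunking pattern above: resources are split
# into resourcelists of at most max_items entries, with md_at/md_completed
# bracketing each chunk. rspub-core's SitemapData, finish_sitemap, and
# defaults.w3c_now are replaced by plain datetime strings; URIs are invented.
from datetime import datetime, timezone

from resync import Resource, ResourceList

def now():
    return datetime.now(timezone.utc).isoformat()

def chunked_resourcelists(uris, max_items=3):
    resourcelist = None
    for count, uri in enumerate(uris, start=1):
        if resourcelist is None:
            resourcelist = ResourceList()
            resourcelist.md_at = now()
        resourcelist.add(Resource(uri))
        if count % max_items == 0:  # chunk is full: close and yield it
            resourcelist.md_completed = now()
            yield resourcelist
            resourcelist = None
    if resourcelist:  # close and yield the last, partially filled chunk
        resourcelist.md_completed = now()
        yield resourcelist

uris = ['http://example.com/res{}'.format(i) for i in range(1, 8)]
for chunk in chunked_resourcelists(uris):
    print(len(chunk.resources), chunk.md_at, chunk.md_completed)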
#!/usr/bin/env python
if (True):  # keep indentation of README
    from resync import Resource, ResourceList

    rl = ResourceList()
    rl.add(Resource('http://example.com/res1', lastmod='2013-01-01'))
    rl.add(Resource('http://example.com/res2', lastmod='2013-01-02'))
    print(rl.as_xml())