def test01_as_xml(self):
    rd = ResourceDump()
    rd.add(Resource('a.zip', timestamp=1))
    rd.add(Resource('b.zip', timestamp=2))
    xml = rd.as_xml()
    self.assertTrue(re.search(r'<rs:md .*capability="resourcedump"', xml),
                    'XML has capability')
    self.assertTrue(re.search(r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>', xml),
                    'XML has resource a')
def publish_metadata(self, new_zips, exluded_zip=None):
    """
    (Re)publish metadata with addition of new_zips. An excluded zip will be
    removed from previously published metadata.

    :param new_zips: a resourcelist with newly created zip resources
    :param exluded_zip: local path to zip file that will be removed from
        previously published metadata.
    """
    rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
    rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
    capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
    capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

    rs_dump = ResourceDump()

    # Load existing resource-dump, if any. Else set start time.
    if os.path.isfile(rs_dump_path):
        with open(rs_dump_path, "r") as rs_dump_file:
            sm = Sitemap()
            sm.parse_xml(rs_dump_file, resources=rs_dump)
    else:
        rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
        rs_dump.link_set(rel="up", href=capa_list_url)

    # Remove excluded zip, if any
    if exluded_zip:
        loc = self.publish_url + os.path.basename(exluded_zip)
        if loc in rs_dump.resources:
            del rs_dump.resources[loc]
        else:
            raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

    # Add new zips
    for resource in new_zips:
        rs_dump.add(resource)

    # Write resource-dump.xml
    rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
    with open(rs_dump_path, "w") as rs_dump_file:
        rs_dump_file.write(rs_dump.as_xml())

    # There are several ways to decode base64, among them
    # iri = base64.b64decode(os.path.basename(self.publish_dir)).rstrip('\n')
    # iri = base64.b64decode(os.path.basename(self.publish_dir), '-_').rstrip('\n')
    iri = base64.urlsafe_b64decode(os.path.basename(self.publish_dir)).rstrip('\n')

    print "New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri)
    print "See %s" % rs_dump_url

    # Write capability-list.xml
    if not os.path.isfile(capa_list_path):
        capa_list = CapabilityList()
        capa_list.link_set(rel="up", href=self.src_desc_url)
        capa_list.add_capability(rs_dump, rs_dump_url)
        with open(capa_list_path, "w") as capa_list_file:
            capa_list_file.write(capa_list.as_xml())
        print "New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url)
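# A minimal, standalone sketch of the resync calls used in publish_metadata
# above: build a ResourceDump, record md_at/md_completed, link it "up" to a
# capability list, serialize it, and register it in a CapabilityList.
# The URLs, file names and timestamps are invented for illustration, and the
# import paths assume the standard resync package layout.
from resync.resource import Resource
from resync.resource_dump import ResourceDump
from resync.capability_list import CapabilityList

rs_dump = ResourceDump()
rs_dump.md_at = '2013-01-03T09:00:00Z'          # start of dump creation
rs_dump.link_set(rel="up", href="http://example.com/capability-list.xml")
rs_dump.add(Resource('http://example.com/resourcedump-part1.zip', length=4765))
rs_dump.md_completed = '2013-01-03T09:04:00Z'   # end of dump creation

with open("resource-dump.xml", "w") as f:
    f.write(rs_dump.as_xml())

capa_list = CapabilityList()
capa_list.link_set(rel="up", href="http://example.com/description.xml")
capa_list.add_capability(rs_dump, "http://example.com/resource-dump.xml")
with open("capability-list.xml", "w") as f:
    f.write(capa_list.as_xml())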
def test_build_ex_04(self):
    """Simple Resource Dump document"""
    rd = ResourceDump()
    rd.md_at = '2013-01-03T09:00:00Z'
    rd.add(Resource(uri='http://example.com/resourcedump.zip',
                    lastmod='2013-01-03T09:00:00Z'))
    ex_xml = self._open_ex('resourcesync_ex_4').read()
    self._assert_xml_equal(rd.as_xml(), ex_xml)
def test05_write(self):
    rd = ResourceDump()
    rd.add(Resource('aa.zip', timestamp=1))
    rd.add(Resource('bb.zip', timestamp=2))
    dumpf = os.path.join(self.tmpdir, "test05_dump.xml")
    rd.write(basename=dumpf)
    self.assertTrue(os.path.exists(dumpf))
    # Now read that back
    rd2 = ResourceDump()
    rd2.parse(dumpf)
    self.assertEqual(len(rd2), 2)
    self.assertEqual(rd2.uris(), ['aa.zip', 'bb.zip'])
def get_resource_dump_xml(self, from_date=None, to_date=None):
    """
    Get content of resource dump.

    :return: (xml) resource dump content
    """
    if not self._validation():
        return None
    from .utils import parse_date
    if from_date:
        from_date = parse_date(from_date)
    if to_date:
        to_date = parse_date(to_date)
    r = get_items_by_index_tree(self.repository_id)

    rd = ResourceDump()
    rd.up = INVENIO_CAPABILITY_URL.format(request.url_root)
    for item in r:
        if item:
            resource_date = parse_date(item.get('_source').get('_updated'))
            if from_date and from_date > resource_date:
                continue
            if to_date and to_date < resource_date:
                continue
            id_item = item.get('_source').get('control_number')
            url = '{}resync/{}/{}/file_content.zip'.format(
                request.url_root,
                self.repository_id,
                str(id_item))
            rs = Resource(url,
                          lastmod=item.get('_source').get('_updated'),
                          ln=[])
            if self.resource_dump_manifest:
                href = '{}resync/{}/{}/resourcedump_manifest.xml'.format(
                    request.url_root,
                    self.repository_id,
                    str(id_item))
                rs.ln.append({
                    'rel': 'contents',
                    'href': href,
                    'type': 'application/xml'
                })
            rd.add(rs)
    return rd.as_xml()
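# A small sketch of the per-item link pattern used in get_resource_dump_xml
# above: each zip entry in the dump carries a rel="contents" link pointing at
# its resourcedump manifest. The URLs and timestamp are invented for
# illustration; import paths assume the standard resync package layout.
from resync.resource import Resource
from resync.resource_dump import ResourceDump

rd = ResourceDump()
rd.up = 'http://example.com/capabilitylist.xml'
rs = Resource('http://example.com/resync/123/file_content.zip',
              lastmod='2013-01-03T09:00:00Z',
              ln=[])
rs.ln.append({
    'rel': 'contents',
    'href': 'http://example.com/resync/123/resourcedump_manifest.xml',
    'type': 'application/xml',
})
rd.add(rs)
print(rd.as_xml())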
def test_build_ex_17(self):
    """Resource Dump with 3 entries and some metadata"""
    rd = ResourceDump()
    rd.up = 'http://example.com/dataset1/capabilitylist.xml'
    rd.md_at = "2013-01-03T09:00:00Z"
    rd.md_completed = "2013-01-03T09:04:00Z"
    z1 = Resource(uri='http://example.com/resourcedump-part1.zip',
                  mime_type="application/zip",
                  length=4765,
                  md_at="2013-01-03T09:00:00Z",
                  md_completed="2013-01-03T09:02:00Z")
    z1.link_set(rel="contents",
                href="http://example.com/resourcedump_manifest-part1.xml",
                mime_type="application/xml")
    rd.add(z1)
    z2 = Resource(uri='http://example.com/resourcedump-part2.zip',
                  mime_type="application/zip",
                  length=9875,
                  md_at="2013-01-03T09:01:00Z",
                  md_completed="2013-01-03T09:03:00Z")
    z2.link_set(rel="contents",
                href="http://example.com/resourcedump_manifest-part2.xml",
                mime_type="application/xml")
    rd.add(z2)
    z3 = Resource(uri='http://example.com/resourcedump-part3.zip',
                  mime_type="application/zip",
                  length=2298,
                  md_at="2013-01-03T09:03:00Z",
                  md_completed="2013-01-03T09:04:00Z")
    z3.link_set(rel="contents",
                href="http://example.com/resourcedump_manifest-part3.xml",
                mime_type="application/xml")
    rd.add(z3)
    ex_xml = self._open_ex('resourcesync_ex_17').read()
    self._assert_xml_equal(rd.as_xml(), ex_xml)
def do_publish(self):
    """
    Publish resources found in resource_dir in accordance with the
    Resource Sync Framework. Resources will be packaged in ZIP file format.
    The amount of resources that will be packaged in one zip file is bound
    to max_files_in_zip. Successive packages will be created if more than
    max_files_in_zip resources have to be published. Packages that reach
    the limit of max_files_in_zip are marked as complete. Any remainder of
    resources are packaged in a zip file marked as zip end.

    WARNING: This method removes resources that are published in packages
    marked as complete from resource_dir.

    :return: (boolean indicating if change in sink directory or
        subdirectories, amount of resources definitively packaged,
        the difference of resources provisionally packaged)
    """
    count_def_resources = 0
    diff_end_resources = 0

    path_zip_end_old, rl_end_old = self.get_state_published()
    new_zips = ResourceDump()
    state_changed = False
    exhausted = False

    while not exhausted:
        resourcelist, exhausted = self.list_resources_chunk()
        if len(resourcelist) == self.max_files_compressed:  # complete zip
            state_changed = True
            count_def_resources += len(resourcelist)
            zip_resource = self.create_zip(resourcelist, PREFIX_COMPLETED_PART,
                                           False, self.write_separate_manifest)
            new_zips.add(zip_resource)
            # move resources from resource_dir
            for resource in resourcelist:
                r_path = os.path.join(self.resource_dir, resource.path)
                if self.move_resources:
                    shutil.move(r_path, self.publish_dir)
                else:
                    os.remove(r_path)
        elif not self.is_same(resourcelist, rl_end_old):
            assert exhausted
            state_changed = True
            if len(resourcelist) > 0:
                diff_end_resources += len(resourcelist)
                zip_resource = self.create_zip(resourcelist, PREFIX_END_PART,
                                               True, self.write_separate_manifest)
                new_zips.add(zip_resource)

    # publish new metadata. Exclude zip_end_old
    if state_changed:
        self.publish_metadata(new_zips, path_zip_end_old)

    # remove old zip end file, resource list and manifest;
    # account for difference of resources provisionally packaged.
    if state_changed and path_zip_end_old:
        diff_end_resources -= len(rl_end_old)
        os.remove(path_zip_end_old)
        os.remove(os.path.splitext(path_zip_end_old)[0] + ".xml")
        manifest = PREFIX_MANIFEST + os.path.splitext(
            os.path.basename(path_zip_end_old))[0] + ".xml"
        manifest_file = os.path.join(self.publish_dir, manifest)
        if os.path.isfile(manifest_file):
            os.remove(manifest_file)

    return state_changed, count_def_resources, diff_end_resources
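# The packaging rule described in do_publish's docstring, reduced to plain
# Python as an illustrative sketch: groups that reach the maximum size become
# "complete" parts, and a trailing partial group becomes the "end" part. It
# does not use the class's own helpers (list_resources_chunk, create_zip);
# the names split_into_parts, resources and max_files are made up here.
def split_into_parts(resources, max_files):
    complete_parts = []
    end_part = []
    for i in range(0, len(resources), max_files):
        chunk = resources[i:i + max_files]
        if len(chunk) == max_files:
            complete_parts.append(chunk)   # would be zipped and marked complete
        else:
            end_part = chunk               # remainder, zipped as the "end" part
    return complete_parts, end_part

# Example: 7 resources with at most 3 per zip -> two complete parts + one end part
complete, end = split_into_parts(list("abcdefg"), 3)
assert len(complete) == 2 and end == ["g"]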