def publish(self): """ Try and publish or remove zip end if something went wrong. :return: ( boolean indicating if change in sink directory or subdirectories, amount of resources definitively packaged, the difference of resources provisionally packaged) """ if not os.path.isdir(self.resource_dir): os.makedirs(self.resource_dir) #print "Created %s" % self.resource_dir if not os.path.isdir(self.publish_dir): os.makedirs(self.publish_dir) #print "Created %s" % self.publish_dir try: return self.do_publish() except: # Something went wrong. Best we can do is clean up end of zip chain. zip_end_files = glob( os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip")) for ze_file in zip_end_files: os.remove(ze_file) print "error recovery: removed %s" % ze_file zip_end_xmls = glob( os.path.join(self.publish_dir, PREFIX_END_PART + "*.xml")) for ze_xml in zip_end_xmls: os.remove(ze_xml) print "error recovery: removed %s" % ze_xml zip_end_manis = glob( os.path.join(self.publish_dir, PREFIX_MANIFEST + PREFIX_END_PART + "*.xml")) for ze_mani in zip_end_manis: os.remove(ze_mani) print "error recovery: removed %s" % ze_mani # remove zip-end entries from resource-dump.xml rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML) rs_dump = ResourceDump() if os.path.isfile(rs_dump_path): with open(rs_dump_path, "r") as rs_dump_file: sm = Sitemap() sm.parse_xml(rs_dump_file, resources=rs_dump) prefix = self.publish_url + PREFIX_END_PART for uri in rs_dump.resources.keys(): if uri.startswith(prefix): del rs_dump.resources[uri] print "error recovery: removed %s from %s" % (uri, rs_dump_path) with open(rs_dump_path, "w") as rs_dump_file: rs_dump_file.write(rs_dump.as_xml()) print "error recovery: walk through error recovery completed. Now raising ..." raise
def test_build_ex_17(self):
    """Resource Dump with 3 entries and some metadata"""
    rd = ResourceDump()
    rd.up = 'http://example.com/dataset1/capabilitylist.xml'
    rd.md_at = "2013-01-03T09:00:00Z"
    rd.md_completed = "2013-01-03T09:04:00Z"
    # One (part number, length, md_at, md_completed) tuple per zip entry.
    parts = [
        (1, 4765, "2013-01-03T09:00:00Z", "2013-01-03T09:02:00Z"),
        (2, 9875, "2013-01-03T09:01:00Z", "2013-01-03T09:03:00Z"),
        (3, 2298, "2013-01-03T09:03:00Z", "2013-01-03T09:04:00Z"),
    ]
    for num, length, md_at, md_completed in parts:
        zip_resource = Resource(
            uri='http://example.com/resourcedump-part%d.zip' % num,
            mime_type="application/zip",
            length=length,
            md_at=md_at,
            md_completed=md_completed)
        zip_resource.link_set(
            rel="contents",
            href="http://example.com/resourcedump_manifest-part%d.xml" % num,
            mime_type="application/xml")
        rd.add(zip_resource)
    ex_xml = self._open_ex('resourcesync_ex_17').read()
    self._assert_xml_equal(rd.as_xml(), ex_xml)
def publish_metadata(self, new_zips, exluded_zip=None): """ (Re)publish metadata with addition of new_zips. An excluded zip will be removed from previously published metadata. :param new_zips: a resourcelist with newly created zip resources :param exluded_zip: local path to zip file that will be removed from previously published metadata. """ rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML) capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML) rs_dump = ResourceDump() # Load existing resource-dump, if any. Else set start time. if os.path.isfile(rs_dump_path): with open(rs_dump_path, "r") as rs_dump_file: sm = Sitemap() sm.parse_xml(rs_dump_file, resources=rs_dump) else: rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True) rs_dump.link_set(rel="up", href=capa_list_url) # Remove excluded zip, if any if exluded_zip: loc = self.publish_url + os.path.basename(exluded_zip) if loc in rs_dump.resources: del rs_dump.resources[loc] else: raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path)) # Add new zips for resource in new_zips: rs_dump.add(resource) # Write resource-dump.xml rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True) with open(rs_dump_path, "w") as rs_dump_file: rs_dump_file.write(rs_dump.as_xml()) # There are several ways to decode base64, among them # iri = base64.b64decode(os.path.basename(self.publish_dir)).rstrip('\n') # iri = base64.b64decode(os.path.basename(self.publish_dir), '-_').rstrip('\n') iri = base64.urlsafe_b64decode(os.path.basename( self.publish_dir)).rstrip('\n') print "New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri) print "See %s" % rs_dump_url # Write capability-list.xml if not os.path.isfile(capa_list_path): capa_list = CapabilityList() capa_list.link_set(rel="up", href=self.src_desc_url) capa_list.add_capability(rs_dump, rs_dump_url) with open(capa_list_path, 
"w") as capa_list_file: capa_list_file.write(capa_list.as_xml()) print "New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url)
def test01_as_xml(self):
    """Serialized dump advertises its capability and lists each resource."""
    rd = ResourceDump()
    for name, ts in (('a.zip', 1), ('b.zip', 2)):
        rd.add(Resource(name, timestamp=ts))
    xml = rd.as_xml()
    self.assertTrue(
        re.search(r'<rs:md .*capability="resourcedump"', xml),
        'XML has capability')
    self.assertTrue(
        re.search(
            r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
            xml),
        'XML has resource a')
def test_build_ex_04(self):
    """Simple Resource Dump document """
    rd = ResourceDump()
    rd.md_at = '2013-01-03T09:00:00Z'
    zip_resource = Resource(uri='http://example.com/resourcedump.zip',
                            lastmod='2013-01-03T09:00:00Z')
    rd.add(zip_resource)
    expected = self._open_ex('resourcesync_ex_4').read()
    self._assert_xml_equal(rd.as_xml(), expected)
def publish_metadata(self, new_zips, exluded_zip=None): """ (Re)publish metadata with addition of new_zips. An excluded zip will be removed from previously published metadata. :param new_zips: a resourcelist with newly created zip resources :param exluded_zip: local path to zip file that will be removed from previously published metadata. """ rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML) capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML) rs_dump = ResourceDump() # Load existing resource-dump, if any. Else set start time. if os.path.isfile(rs_dump_path): with open(rs_dump_path, "r") as rs_dump_file: sm = Sitemap() sm.parse_xml(rs_dump_file, resources=rs_dump) else: rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True) rs_dump.link_set(rel="up", href=capa_list_url) # Remove excluded zip, if any if exluded_zip: loc = self.publish_url + os.path.basename(exluded_zip) if loc in rs_dump.resources: del rs_dump.resources[loc] else: raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path)) # Add new zips for resource in new_zips: rs_dump.add(resource) # Write resource-dump.xml rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True) with open(rs_dump_path, "w") as rs_dump_file: rs_dump_file.write(rs_dump.as_xml()) # There are several ways to decode base64, among them # iri = base64.b64decode(os.path.basename(self.publish_dir)).rstrip('\n') # iri = base64.b64decode(os.path.basename(self.publish_dir), '-_').rstrip('\n') iri = base64.urlsafe_b64decode(os.path.basename(self.publish_dir)).rstrip("\n") print "New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri) print "See %s" % rs_dump_url # Write capability-list.xml if not os.path.isfile(capa_list_path): capa_list = CapabilityList() capa_list.link_set(rel="up", href=self.src_desc_url) capa_list.add_capability(rs_dump, rs_dump_url) with open(capa_list_path, 
"w") as capa_list_file: capa_list_file.write(capa_list.as_xml()) print "New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url)
def publish(self): """ Try and publish or remove zip end if something went wrong. :return: ( boolean indicating if change in sink directory or subdirectories, amount of resources definitively packaged, the difference of resources provisionally packaged) """ if not os.path.isdir(self.resource_dir): os.makedirs(self.resource_dir) # print "Created %s" % self.resource_dir if not os.path.isdir(self.publish_dir): os.makedirs(self.publish_dir) # print "Created %s" % self.publish_dir try: return self.do_publish() except: # Something went wrong. Best we can do is clean up end of zip chain. zip_end_files = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip")) for ze_file in zip_end_files: os.remove(ze_file) print "error recovery: removed %s" % ze_file zip_end_xmls = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.xml")) for ze_xml in zip_end_xmls: os.remove(ze_xml) print "error recovery: removed %s" % ze_xml zip_end_manis = glob(os.path.join(self.publish_dir, PREFIX_MANIFEST + PREFIX_END_PART + "*.xml")) for ze_mani in zip_end_manis: os.remove(ze_mani) print "error recovery: removed %s" % ze_mani # remove zip-end entries from resource-dump.xml rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML) rs_dump = ResourceDump() if os.path.isfile(rs_dump_path): with open(rs_dump_path, "r") as rs_dump_file: sm = Sitemap() sm.parse_xml(rs_dump_file, resources=rs_dump) prefix = self.publish_url + PREFIX_END_PART for uri in rs_dump.resources.keys(): if uri.startswith(prefix): del rs_dump.resources[uri] print "error recovery: removed %s from %s" % (uri, rs_dump_path) with open(rs_dump_path, "w") as rs_dump_file: rs_dump_file.write(rs_dump.as_xml()) print "error recovery: walk through error recovery completed. Now raising ..." raise
def test01_as_xml(self):
    """The dump XML carries the resourcedump capability and the resources."""
    rd = ResourceDump()
    rd.add(Resource('a.zip', timestamp=1))
    rd.add(Resource('b.zip', timestamp=2))
    serialized = rd.as_xml()
    capability_match = re.search(
        r'<rs:md .*capability="resourcedump"', serialized)
    self.assertTrue(capability_match, 'XML has capability')
    resource_match = re.search(
        r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
        serialized)
    self.assertTrue(resource_match, 'XML has resource a')
def get_resource_dump_xml(self, from_date=None, to_date=None):
    """
    Get content of resource dump.

    :return: (xml) resource dump content
    """
    if not self._validation():
        return None
    from .utils import parse_date

    # Optional date window; None means the bound is open.
    lower = parse_date(from_date) if from_date else None
    upper = parse_date(to_date) if to_date else None

    items = get_items_by_index_tree(self.repository_id)
    rd = ResourceDump()
    rd.up = INVENIO_CAPABILITY_URL.format(request.url_root)
    for item in items:
        if not item:
            continue
        source = item.get('_source')
        updated = source.get('_updated')
        resource_date = parse_date(updated)
        # Skip items outside the requested window.
        if lower and lower > resource_date:
            continue
        if upper and upper < resource_date:
            continue
        record_id = str(source.get('control_number'))
        url = '{}resync/{}/{}/file_content.zip'.format(
            request.url_root, self.repository_id, record_id)
        rs = Resource(url, lastmod=updated, ln=[])
        if self.resource_dump_manifest:
            href = '{}resync/{}/{}/resourcedump_manifest.xml'.format(
                request.url_root, self.repository_id, record_id)
            rs.ln.append({
                'rel': 'contents',
                'href': href,
                'type': 'application/xml'
            })
        rd.add(rs)
    return rd.as_xml()