def assert_import_equals_export(self, _roundtrip):
    _result = test_utils.import_xml(_roundtrip)
    with open(_roundtrip) as _import_file:
        _import_xml = _import_file.read()

    register_namespace("", SCHEMA_NAMESPACE)
    _import_xml = to_xml_string(fromstring(_import_xml), encoding="utf-8")
    _export_xml = to_xml_string(_result.export_to_elementtree(), encoding="utf-8")

    # cfedermann: uncomment these lines to dump import/export XML to file.
    #
    # with open('/tmp/_import.xml', 'wb') as _out:
    #     _out.write(_import_xml.encode('utf-8'))
    # with open('/tmp/_export.xml', 'wb') as _out:
    #     _out.write(_export_xml.encode('utf-8'))

    diff = "\n".join(unified_diff(_import_xml.split("\n"), _export_xml.split("\n")))
    self.assertEqual(
        _import_xml, _export_xml,
        msg="For file {0}, export differs from import:\n{1}".format(_roundtrip, diff)
    )
def check_metadata(self): """ Checks if the metadata of the resource has changed with respect to the current metadata serialization. If yes, recreates the serialization, updates it in the storage folder and increases the revision (for master copies) Returns a flag indicating if the serialization was updated. """ # flag to indicate if rebuilding of metadata.xml is required update_xml = False # create current version of metadata XML from metashare.xml_utils import to_xml_string try: _metadata = to_xml_string( # pylint: disable-msg=E1101 self.resourceinfotype_model_set.all() [0].export_to_elementtree(), # use ASCII encoding to convert non-ASCII chars to entities encoding="ASCII") except: # pylint: disable-msg=E1101 LOGGER.error('PROBLEMATIC: %s - count: %s', self.identifier, self.resourceinfotype_model_set.count(), exc_info=True) raise if self.metadata != _metadata: self.metadata = _metadata LOGGER.debug(u"\nMETADATA: {0}\n".format(self.metadata)) self.modified = datetime.now() update_xml = True # increase revision for ingested and published resources whenever # the metadata XML changes for master copies if self.publication_status in (INGESTED, PUBLISHED) \ and self.copy_status == MASTER: self.revision += 1 # check if there exists a metadata XML file; this is not the case if # the publication status just changed from internal to ingested # or if the resource was received when syncing if self.publication_status in (INGESTED, PUBLISHED) \ and not os.path.isfile( '{0}/metadata-{1:04d}.xml'.format(self._storage_folder(), self.revision)): update_xml = True if update_xml: # serialize metadata with open( '{0}/metadata-{1:04d}.xml'.format(self._storage_folder(), self.revision), 'wb') as _out: _out.write(unicode(self.metadata).encode('ASCII')) return update_xml
def xml_to_json(obj):
    list_fields = ['distributionInfo', 'licenceInfo', 'corpusTextInfo',
                   'distributionMedium', 'downloadLocation', 'executionLocation',
                   'attributionText', 'iprHolder', 'contactPerson', 'surname']
    # get xml representation
    xml_string = to_xml_string(obj.export_to_elementtree(),
                               encoding="utf-8").encode("utf-8")
    # parse xml to dict
    dict_repr = xmltodict.parse(xml_string, force_list=list_fields)
    return json.dumps(dict_repr, indent=4).encode('utf8')
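
# Usage sketch (assumption, not part of the original sources): xml_to_json()
# only needs an object exposing export_to_elementtree(), e.g. a
# resourceInfoType_model instance as used elsewhere in this code base.
# The resource id and output file name below are hypothetical.
from metashare.repository.models import resourceInfoType_model

res = resourceInfoType_model.objects.get(id=42)
with open('resource-42.json', 'wb') as _out:
    _out.write(xml_to_json(res))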
def check_metadata(self): """ Checks if the metadata of the resource has changed with respect to the current metadata serialization. If yes, recreates the serialization, updates it in the storage folder and increases the revision (for master copies) Returns a flag indicating if the serialization was updated. """ # flag to indicate if rebuilding of metadata.xml is required update_xml = False # create current version of metadata XML from metashare.xml_utils import to_xml_string try: _metadata = to_xml_string( # pylint: disable-msg=E1101 self.resourceinfotype_model_set.all()[0].export_to_elementtree(), # use ASCII encoding to convert non-ASCII chars to entities encoding="ASCII", ) except: # pylint: disable-msg=E1101 LOGGER.error( "PROBLEMATIC: %s - count: %s", self.identifier, self.resourceinfotype_model_set.count(), exc_info=True ) raise if self.metadata != _metadata: self.metadata = _metadata LOGGER.debug(u"\nMETADATA: {0}\n".format(self.metadata)) self.modified = datetime.now() update_xml = True # increase revision for ingested and published resources whenever # the metadata XML changes for master copies if self.publication_status in (INGESTED, PUBLISHED) and self.copy_status == MASTER: self.revision += 1 # check if there exists a metadata XML file; this is not the case if # the publication status just changed from internal to ingested # or if the resource was received when syncing if self.publication_status in (INGESTED, PUBLISHED) and not os.path.isfile( "{0}/metadata-{1:04d}.xml".format(self._storage_folder(), self.revision) ): update_xml = True if update_xml: # serialize metadata with open("{0}/metadata-{1:04d}.xml".format(self._storage_folder(), self.revision), "wb") as _out: _out.write(unicode(self.metadata).encode("ASCII")) return update_xml
def assert_import_equals_export(self, _roundtrip):
    _result = test_utils.import_xml(_roundtrip)
    with open(_roundtrip) as _import_file:
        _import_xml = _import_file.read()

    register_namespace('', SCHEMA_NAMESPACE)
    _import_xml = to_xml_string(fromstring(_import_xml), encoding="utf-8")
    _export_xml = to_xml_string(_result.export_to_elementtree(), encoding="utf-8")

    # cfedermann: uncomment these lines to dump import/export XML to file.
    #
    #with open('/tmp/_import.xml', 'wb') as _out:
    #    _out.write(_import_xml.encode('utf-8'))
    #with open('/tmp/_export.xml', 'wb') as _out:
    #    _out.write(_export_xml.encode('utf-8'))

    diff = '\n'.join(
        unified_diff(_import_xml.split('\n'), _export_xml.split('\n')))
    self.assertEqual(
        _import_xml, _export_xml,
        msg='For file {0}, export differs from import:\n{1}'.format(
            _roundtrip, diff.encode('utf-8')))
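
# Test usage sketch (assumption, not part of the original sources): inside the
# test case that defines assert_import_equals_export(), a roundtrip test would
# pass the path of a metadata XML fixture; the path below is hypothetical.
#
#     def test_roundtrip(self):
#         self.assert_import_equals_export('repository/fixtures/testfixture.xml')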
def extract_source_resource_metadata(res_id):
    res = resourceInfoType_model.objects.get(id=res_id)
    res_owners = res.owners.all()
    view_path = res.get_absolute_url()
    try:
        root_node = res.export_to_elementtree()
        xml_string = to_xml_string(root_node, encoding="utf-8").encode('utf-8')
        return {
            "resource": res,
            "uri": "{}{}".format(DJANGO_URL, view_path),
            "owners": res_owners,
            "metadata": xml_string,
        }
    except:
        print "Could not export metadata for resource with id {}: \"{}\"".format(
            res_id, res)
# Disable verbose debug output for the export process...
settings.DEBUG = False

SUCCESSFUL_EXPORTS = 0
ERRONEOUS_EXPORTS = 0
RESOURCE_NO = 0

from metashare.repository.models import resourceInfoType_model
from metashare.xml_utils import to_xml_string

with ZipFile(sys.argv[1], 'w') as out:
    for resource in resourceInfoType_model.objects.all():
        # skip resources marked as deleted
        if resource.storage_object.deleted:
            continue

        try:
            RESOURCE_NO += 1
            root_node = resource.export_to_elementtree()
            xml_string = to_xml_string(
                root_node, encoding="utf-8").encode('utf-8')
            resource_filename = 'resource-{0}.xml'.format(RESOURCE_NO)
            out.writestr(resource_filename, xml_string)
            SUCCESSFUL_EXPORTS += 1

        except Exception:
            ERRONEOUS_EXPORTS += 1
            print 'Could not export resource id={0}!'.format(resource.id)
            print traceback.format_exc()

print "Done. Successfully exported {0} files from the database, errors " \
    "occurred in {1} cases.".format(SUCCESSFUL_EXPORTS, ERRONEOUS_EXPORTS)
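
# Invocation sketch (assumption; the actual script file name is not shown in
# this snippet). The script reads the target archive path from sys.argv[1],
# so a typical call would look like:
#
#     python export_resources.py /tmp/exported-resources.zip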
def update_storage(self): """ Updates the metadata XML if required and serializes it and this storage object to the storage folder. """ # for internal resources, no serialization is done if self.publication_status is INTERNAL: return # check if the storage folder for this storage object instance exists if self._storage_folder() and not exists(self._storage_folder()): # If not, create the storage folder. mkdir(self._storage_folder()) # update the checksum, if a downloadable file exists if self.master_copy: self._compute_checksum() self.digest_last_checked = datetime.now() # flag to indicate if rebuilding of metadata.xml is required update_xml = False # create current version of metadata XML from metashare.xml_utils import to_xml_string _metadata = to_xml_string( # pylint: disable-msg=E1101 self.resourceinfotype_model_set.all()[0].export_to_elementtree(), # use ASCII encoding to convert non-ASCII chars to entities encoding="ASCII") if self.metadata != _metadata: self.metadata = _metadata self.modified = datetime.now() # increase revision for ingested and published resources whenever # the metadata XML changes if self.publication_status in (INGESTED, PUBLISHED): self.revision += 1 update_xml = True LOGGER.debug(u"\nMETADATA: {0}\n".format(self.metadata)) # check if there exists a metadata XML file; this is not the case if # the publication status just changed from internal to ingested # or if the resource was received when syncing if self.publication_status in (INGESTED, PUBLISHED) \ and not os.path.isfile( '{0}/metadata-{1:04d}.xml'.format(self._storage_folder(), self.revision)): update_xml = True # flag to indicate if rebuilding of resource.zip is required update_zip = False if update_xml: # serialize metadata with open('{0}/metadata-{1:04d}.xml'.format( self._storage_folder(), self.revision), 'wb') as _out: _out.write(unicode(self.metadata).encode('ASCII')) update_zip = True # check if global storage object serialization has changed; if yes, # save it to storage folder _dict_global = { } for item in GLOBAL_STORAGE_ATTS: _dict_global[item] = getattr(self, item) _global_storage = \ dumps(_dict_global, cls=DjangoJSONEncoder, sort_keys=True, separators=(',',':')) if self.global_storage != _global_storage: self.global_storage = _global_storage if self.publication_status in (INGESTED, PUBLISHED): with open('{0}/storage-global.json'.format( self._storage_folder()), 'wb') as _out: _out.write(unicode(self.global_storage).encode('utf-8')) update_zip = True # create new digest zip if required, but only for master and proxy copies if update_zip and self.copy_status in (MASTER, PROXY): _zf_name = '{0}/resource.zip'.format(self._storage_folder()) _zf = zipfile.ZipFile(_zf_name, mode='w', compression=ZIP_DEFLATED) try: _zf.write( '{0}/metadata-{1:04d}.xml'.format(self._storage_folder(), self.revision), arcname='metadata.xml') _zf.write( '{0}/storage-global.json'.format(self._storage_folder()), arcname='storage-global.json') finally: _zf.close() # update zip digest checksum self.digest_checksum = \ compute_digest_checksum(self.metadata, self.global_storage) # update last modified timestamp self.digest_modified = datetime.now() # check if local storage object serialization has changed; if yes, # save it to storage folder _dict_local = { } for item in LOCAL_STORAGE_ATTS: _dict_local[item] = getattr(self, item) _local_storage = \ dumps(_dict_local, cls=DjangoJSONEncoder, sort_keys=True, separators=(',',':')) if self.local_storage != _local_storage: self.local_storage = _local_storage if 
self.publication_status in (INGESTED, PUBLISHED): with open('{0}/storage-local.json'.format( self._storage_folder()), 'wb') as _out: _out.write(unicode(self.local_storage).encode('utf-8')) # save storage object if required; this is always required since at # least self.digest_last_checked has changed self.save()