def test_01_init_statement(self):
    """Build a Statement with every constructor argument and check it reads back."""
    deposited_at = datetime.now()
    deposits = [
        ("http://od1/", deposited_at, "http://package/", "sword", "obo"),
        ("http://od2/", deposited_at, "http://package/", "bob", None)
    ]
    statement = Statement(
        aggregation_uri="http://aggregation/",
        rem_uri="http://rem/",
        original_deposits=deposits,
        aggregates=["http://od1/", "http://od2/", "http://agg1/", "http://agg2/"],
        states=[("http://state/", "everything is groovy")]
    )

    # the two identifying URIs come back untouched
    assert statement.aggregation_uri == "http://aggregation/"
    assert statement.rem_uri == "http://rem/"

    # both original deposits are present, in the order supplied
    assert len(statement.original_deposits) == 2
    assert "http://od1/" in statement.original_deposits[0]
    assert "http://od2/" in statement.original_deposits[1]

    # every aggregated resource is present, and nothing extra
    for expected_uri in ("http://od1/", "http://od2/", "http://agg1/", "http://agg2/"):
        assert expected_uri in statement.aggregates
    assert len(statement.aggregates) == 4

    # exactly one state, carrying the uri/description pair we passed in
    assert len(statement.states) == 1
    uri, description = statement.states[0]
    assert uri == "http://state/"
    assert description == "everything is groovy"
def test_02_modify_statement(self):
    """Exercise set_state and add_state on an already-populated Statement."""
    deposited_at = datetime.now()
    deposits = [
        ("http://od1/", deposited_at, "http://package/", "sword", "obo"),
        ("http://od2/", deposited_at, "http://package/", "bob", None)
    ]
    statement = Statement(
        aggregation_uri="http://aggregation/",
        rem_uri="http://rem/",
        original_deposits=deposits,
        aggregates=["http://od1/", "http://od2/", "http://agg1/", "http://agg2/"],
        states=[("http://state/", "everything is groovy")]
    )

    # after set_state we expect a single state holding the new values
    statement.set_state("http://new/state/", "still good, though")
    assert len(statement.states) == 1
    uri, description = statement.states[0]
    assert uri == "http://new/state/"
    assert description == "still good, though"

    # after add_state we expect one more state than before
    statement.add_state("http://another/state", "also, this")
    assert len(statement.states) == 2
def test_02_modify_statement(self):
    """Exercise set_state and add_state on an already-populated Statement."""
    deposited_at = datetime.now()
    deposits = [
        ("http://od1/", deposited_at, "http://package/", "sword", "obo"),
        ("http://od2/", deposited_at, "http://package/", "bob", None)
    ]
    statement = Statement(
        aggregation_uri="http://aggregation/",
        rem_uri="http://rem/",
        original_deposits=deposits,
        aggregates=["http://od1/", "http://od2/", "http://agg1/", "http://agg2/"],
        states=[("http://state/", "everything is groovy")]
    )

    # after set_state we expect a single state holding the new values
    statement.set_state("http://new/state/", "still good, though")
    assert len(statement.states) == 1
    uri, description = statement.states[0]
    assert uri == "http://new/state/"
    assert description == "still good, though"

    # after add_state we expect one more state than before
    statement.add_state("http://another/state", "also, this")
    assert len(statement.states) == 2
def test_03_rdf_serialise(self):
    """
    Serialise a Statement to RDF/XML, parse it back, and check its content.

    Every rdf:Description element in the output is walked; each recognised
    child element is validated against the values the Statement was built
    with, and counted.  The counts are then compared to the expected totals.
    """
    n = datetime.now()
    ods = [
        ("http://od1/", n, "http://package/", "sword", "obo"),
        ("http://od2/", n, "http://package/", "bob", None)
    ]
    od_uris = ["http://od1/", "http://od2/"]
    s = Statement(aggregation_uri="http://aggregation/", rem_uri="http://rem/",
                  original_deposits=ods,
                  aggregates=["http://od1/", "http://od2/", "http://agg1/", "http://agg2/"],
                  states=[("http://state/", "everything is groovy")])

    rdf_string = s.serialise_rdf()

    # first try the round trip
    rdf = etree.fromstring(rdf_string)

    # here are some counters/switches which will help us test that everything
    # is good within the statement
    descriptions = 0
    states = 0
    state_descriptions = 0
    original_deposits = 0
    aggregated_resources = 0
    packaging = 0
    dep_on = 0
    dep_by = 0
    dep_obo = 0
    has_rem_description = False
    has_agg_description = False

    # now go through the rdf and check that everything is as expected
    for desc in rdf.findall(RDF + "Description"):
        descriptions += 1
        about = desc.get(RDF + "about")
        # iterate the element directly rather than calling the deprecated
        # getchildren(); the children visited are the same
        for element in desc:
            tag = element.tag
            if tag == ORE + "describes":
                resource = element.get(RDF + "resource")
                assert about == s.rem_uri
                assert resource == s.aggregation_uri
                has_rem_description = True
            elif tag == ORE + "isDescribedBy":
                resource = element.get(RDF + "resource")
                assert about == s.aggregation_uri
                assert resource == s.rem_uri
                has_agg_description = True
            elif tag == ORE + "aggregates":
                resource = element.get(RDF + "resource")
                assert resource in s.aggregates or resource in od_uris
                aggregated_resources += 1
            elif tag == SWORD + "originalDeposit":
                resource = element.get(RDF + "resource")
                assert resource in od_uris
                original_deposits += 1
            elif tag == SWORD + "state":
                resource = element.get(RDF + "resource")
                assert resource == "http://state/"
                states += 1
            elif tag == SWORD + "stateDescription":
                assert element.text.strip() == "everything is groovy"
                assert about == "http://state/"
                state_descriptions += 1
            elif tag == SWORD + "packaging":
                resource = element.get(RDF + "resource")
                assert resource == "http://package/"
                assert about in od_uris
                packaging += 1
            elif tag == SWORD + "depositedOn":
                assert about in od_uris
                dep_on += 1
            elif tag == SWORD + "depositedBy":
                assert element.text in ["sword", "bob"]
                assert about in od_uris
                dep_by += 1
            elif tag == SWORD + "depositedOnBehalfOf":
                # only the first original deposit carries an on-behalf-of value
                assert element.text == "obo"
                assert about in od_uris
                dep_obo += 1

    # now check that our counters/switches were flipped appropriately
    assert descriptions == 5
    assert states == 1
    assert state_descriptions == 1
    assert original_deposits == 2
    assert aggregated_resources == 4
    assert packaging == 2
    assert dep_on == 2
    assert dep_by == 2
    assert dep_obo == 1
    assert has_rem_description
    assert has_agg_description
def test_04_rdf_aggregation_uri_exists(self):
    """
    Serialise a Statement on top of an existing RDF document (RDF_DOC).

    The same checks as the plain serialisation test are applied, plus a
    spot-check of one pre-existing element per namespace (oxds, dc, rdf) to
    confirm that the supplied document's content survived the merge.
    """
    n = datetime.now()
    ods = [
        ("http://od1/", n, "http://package/", "sword", "obo"),
        ("http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip", n, "http://package/", "bob", None)
    ]
    od_uris = ["http://od1/", "http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip"]
    s = Statement(aggregation_uri="http://192.168.23.133/asdfasd/datasets/mydataset6",
                  rem_uri="http://rem/",
                  original_deposits=ods,
                  aggregates=["http://od1/",
                              "http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip",
                              "http://agg1/", "http://agg2/"],
                  states=[("http://state/", "everything is groovy")])

    rdf_string = s.serialise_rdf(RDF_DOC)

    # first try the round trip
    rdf = etree.fromstring(rdf_string)

    # here are some counters/switches which will help us test that everything
    # is good within the statement
    descriptions = 0
    states = 0
    state_descriptions = 0
    original_deposits = 0
    aggregated_resources = 0
    packaging = 0
    dep_on = 0
    dep_by = 0
    dep_obo = 0
    has_rem_description = False
    has_agg_description = False
    ox_tag = False
    dc_tag = False
    rdf_tag = False

    # now go through the rdf and check that everything is as expected
    for desc in rdf.findall(RDF + "Description"):
        descriptions += 1
        about = desc.get(RDF + "about")
        # iterate the element directly rather than calling the deprecated
        # getchildren(); the children visited are the same
        for element in desc:
            tag = element.tag
            # we expect all of the same things to be true as in the previous
            # test
            if tag == ORE + "describes":
                resource = element.get(RDF + "resource")
                assert about == s.rem_uri
                assert resource == s.aggregation_uri
                has_rem_description = True
            elif tag == ORE + "isDescribedBy":
                resource = element.get(RDF + "resource")
                assert about == s.aggregation_uri
                assert resource == s.rem_uri
                has_agg_description = True
            elif tag == ORE + "aggregates":
                resource = element.get(RDF + "resource")
                assert resource in s.aggregates or resource in od_uris
                aggregated_resources += 1
            elif tag == SWORD + "originalDeposit":
                resource = element.get(RDF + "resource")
                assert resource in od_uris
                original_deposits += 1
            elif tag == SWORD + "state":
                resource = element.get(RDF + "resource")
                assert resource == "http://state/"
                states += 1
            elif tag == SWORD + "stateDescription":
                assert element.text.strip() == "everything is groovy"
                assert about == "http://state/"
                state_descriptions += 1
            elif tag == SWORD + "packaging":
                resource = element.get(RDF + "resource")
                assert resource == "http://package/"
                assert about in od_uris
                packaging += 1
            elif tag == SWORD + "depositedOn":
                assert about in od_uris
                dep_on += 1
            elif tag == SWORD + "depositedBy":
                assert element.text in ["sword", "bob"]
                assert about in od_uris
                dep_by += 1
            elif tag == SWORD + "depositedOnBehalfOf":
                assert element.text == "obo"
                assert about in od_uris
                dep_obo += 1

            # and we must verify that we didn't overwrite anything in the
            # passed in RDF document (don't check everything, but let's pick
            # one thing from each namespace)
            elif tag == OX + "currentVersion":
                assert element.text == "6"
                ox_tag = True
            elif tag == DC + "identifier":
                assert element.text == "mydataset6"
                dc_tag = True
            elif tag == RDF + "type":
                resource = element.get(RDF + "resource")
                assert resource == "http://vocab.ox.ac.uk/dataset/schema#DataSet"
                rdf_tag = True

    # now check that our counters/switches were flipped appropriately
    assert descriptions == 5
    assert states == 1
    assert state_descriptions == 1
    assert original_deposits == 2
    assert aggregated_resources == 4
    assert packaging == 2
    assert dep_on == 2
    assert dep_by == 2
    assert dep_obo == 1
    assert has_rem_description
    assert has_agg_description
    assert ox_tag
    assert dc_tag
    assert rdf_tag
def replace(self, path, deposit):
    """
    Replace the content of an existing dataset with the supplied deposit.

    Args:
    - path:     the request path; resolved through self.um.interpret_path
                into the silo, the dataset id and the accept parameters
    - deposit:  a DepositRequest object

    Returns a DepositResponse whose receipt is the serialised Deposit Receipt
    and whose location is the receipt's edit URI.

    Raises a SwordError (Errors.bad_request) when the deposit has no filename
    or when the filename matches JAILBREAK.
    """
    silo, dataset_id, accept_parameters = self.um.interpret_path(path)
    rdf_silo = self._get_authorised_rdf_silo(silo)

    # now get the dataset object itself
    dataset = rdf_silo.get_item(dataset_id)

    # deal with possible problems with the filename
    if deposit.filename is None or deposit.filename == "":
        raise SwordError(error_uri=Errors.bad_request,
                         msg="You must supply a filename to unpack")
    if JAILBREAK.search(deposit.filename) is not None:
        raise SwordError(error_uri=Errors.bad_request,
                         msg="'..' cannot be used in the path or as a filename")

    # FIXME: at the moment this metadata operation is not supported by DataBank
    #
    # first figure out what to do about the metadata
    keep_atom = False  # only referenced by the disabled code kept below
    metadata_state = None  # This will be used to store any state information associated
                           # with a metadata update.  It gets tied up with the content state
                           # and any pre-existing states further down
    #if deposit.atom is not None:
    #    ssslog.info("Replace request has ATOM part - updating")
    #    entry_ingester = self.configuration.get_entry_ingester()(self.dao)
    #    entry_ingester.ingest(collection, id, deposit.atom)
    #    keep_atom = True

    content_state = None
    deposit_uri = None
    derived_resource_uris = []
    if deposit.content is not None:
        ssslog.info("Replace request has file content - updating")

        # remove all the old files before adding the new.  We always leave
        # behind the metadata; this will be overwritten later if necessary
        #self.dao.remove_content(collection, id, True, keep_atom)

        #Increment the version, but do not clone the previous version.
        # An update will replace the entire contents of the container (if previously unpacked) with the bagit file
        # NOTE(review): the call below passes clone_previous_version=True,
        # which does not match the comment above - confirm which is intended
        dataset.increment_version_delta(clone_previous_version=True, copy_filenames=['manifest.rdf'])

        # store the content file
        dataset.put_stream(deposit.filename, deposit.content)
        ssslog.debug("New incoming file stored with filename " + deposit.filename)

        # FIXME: unpacking doesn't happen here ... (keeping for the time being for reference)
        # Broadcast to unpack and add sword:state in manifest
        # <sword:state rdf:resource="http://purl.org/net/sword/state/queuedForUnpacking"/>

        # now that we have stored the atom and the content, we can invoke a package ingester over the top to extract
        # all the metadata and any files we want.  Notice that we pass in the metadata_relevant flag, so the
        # packager won't overwrite the existing metadata if it isn't supposed to
        #packager = self.configuration.get_package_ingester(deposit.packaging)(self.dao)
        #derived_resources = packager.ingest(collection, id, fn, deposit.metadata_relevant)
        #ssslog.debug("Resources derived from deposit: " + str(derived_resources))

        # a list of identifiers which will resolve to the derived resources
        #derived_resource_uris = self.get_derived_resource_uris(collection, id, derived_resources)

        # An identifier which will resolve to the package just deposited
        deposit_uri = self.um.file_uri(silo, dataset_id, deposit.filename)
        ssslog.debug("Incoming file has been stored at URI " + deposit_uri)

        # register a new content state to be used
        content_state = DataBankStates.zip_file_added

    # Taken from dataset.py, seems to be the done thing when adding an item.
    # NOTE: confirmed with Anusha that this is correct
    dataset.del_triple(dataset.uri, u"dcterms:modified")
    dataset.add_triple(dataset.uri, u"dcterms:modified", datetime.now())
    dataset.del_triple(dataset.uri, u"oxds:currentVersion")
    dataset.add_triple(dataset.uri, u"oxds:currentVersion", dataset.currentversion)

    # before we do any state management, we have to be sure that the sword namespace
    # is registered
    dataset.get_rdf_manifest().add_namespace("sword", "http://purl.org/net/sword/terms/")
    dataset.sync()

    # sort out the new list of states for the item
    current_states = self._extract_states(dataset)
    new_states = []

    # for each existing state, consider whether to carry it over
    ssslog.info("new content state: " + str(content_state))
    for state_uri, state_desc in current_states:
        keep = True
        if metadata_state is not None and state_uri in DataBankStates.metadata_states:
            # we do not want the state if it is a metadata state and we have been given
            # a new metadata state
            keep = False
        if content_state is not None and state_uri in DataBankStates.content_states:
            ssslog.debug("Removing state: " + state_uri)
            # we do not want the state if it is a content state and we have been given
            # a new content state
            keep = False
        if keep:
            ssslog.debug("carrying over state: " + state_uri)
            new_states.append((state_uri, state_desc))

    # add the new metadata and content states provided from above
    if metadata_state is not None:
        new_states.append(metadata_state)
    if content_state is not None:
        ssslog.debug("adding new content state: " + str(content_state))
        new_states.append(content_state)

    ssslog.debug("New Dataset States: " + str(new_states))

    # FIXME: how safe is this?  What other ore:aggregates might there be?
    # we need to back out some of the triples in preparation to update the
    # statement
    # NOTE AR: I have commented the following lines.
    #          For aggregates this is not needed. put_stream will add the aggregate into the URI.
    #          Why delete other triples in the manifest - ??
    # sword:originalDeposit point to isVersionOf
    aggregates = dataset.list_rdf_objects(dataset.uri, u"ore:aggregates")
    original_deposits = dataset.list_rdf_objects(dataset.uri, u"sword:originalDeposit")
    states = dataset.list_rdf_objects(dataset.uri, u"sword:state")

    for a in aggregates:
        dataset.del_triple(a, "*")
    for od in original_deposits:
        dataset.del_triple(od, "*")
    for state in states:
        dataset.del_triple(state, "*")
    dataset.del_triple(dataset.uri, u"ore:aggregates")
    dataset.del_triple(dataset.uri, u"sword:originalDeposit")
    dataset.del_triple(dataset.uri, u"sword:state")

    # FIXME: also unsafe in the same way as above
    # Write the md5 checksum into the manifest
    # A deposit contains just the new stuff so no harm in deleting all triples
    dataset.del_triple("*", u"oxds:hasMD5")
    #dataset.del_triple(deposit_uri, u"oxds:hasMD5")
    # NOTE(review): deposit_uri is still None here when the request carried
    # an md5 but no content - confirm add_triple tolerates that
    if deposit.content_md5 is not None:
        dataset.add_triple(deposit_uri, u"oxds:hasMD5", deposit.content_md5)

    dataset.sync()

    # the aggregation uri
    agg_uri = self.um.agg_uri(silo, dataset_id)

    # the Edit-URI
    edit_uri = self.um.edit_uri(silo, dataset_id)

    # FIXME: here we also need to keep existing states where relevant.
    #   A state will continue to be relevant if it applies to an area of the
    #   item (i.e. the container or the media resource) for which this operation
    #   has no effect.
    #   for example:
    #   this is a metadata replace, but a status on the item is set to say that
    #   the item's zip file is corrupt and needs replacing.  The new status
    #   should leave this alone (and probably not do anything, tbh), no matter
    #   what else it does
    # create the statement outline
    # FIXME: there is something weird going on with instantiating this object without the original_deposits argument
    # apparently if I don't explicitly say there are no original deposits, then it "remembers" original deposits
    # from previous uses of the object
    s = Statement(aggregation_uri=agg_uri, rem_uri=edit_uri, states=new_states, original_deposits=[])

    # set the original deposit (which sorts out the aggregations for us too)
    by = deposit.auth.username if deposit.auth is not None else None
    obo = deposit.auth.on_behalf_of if deposit.auth is not None else None
    if deposit_uri is not None:
        s.original_deposit(deposit_uri, datetime.now(), deposit.packaging, by, obo)

    # create the new manifest and store it; the existing manifest is read
    # inside a with-block so the handle is closed before manifest.rdf is
    # overwritten below
    manifest = dataset.get_rdf_manifest()
    with open(manifest.filepath, "r") as f:
        rdf_string = f.read()

    new_manifest = s.serialise_rdf(rdf_string)
    dataset.put_stream("manifest.rdf", new_manifest)

    # FIXME: add in proper treatment here
    # now generate a receipt.
    # TODO: Include audit log instead of 'added zip to dataset'
    receipt = self.deposit_receipt(silo, dataset_id, dataset, "added zip to dataset")

    # now augment the receipt with the details of this particular deposit
    # this handles None arguments, and converts the xml receipt into a string
    receipt = self.augmented_receipt(receipt, deposit_uri, derived_resource_uris)

    # finally, assemble the deposit response and return
    dr = DepositResponse()
    dr.receipt = receipt.serialise()
    dr.location = receipt.edit_uri
    return dr
def deposit_new(self, silo, deposit):
    """
    Take the supplied deposit and treat it as a new dataset to be created in
    the specified silo.

    Args:
    - silo:     the ID of the silo to be deposited into
    - deposit:  the DepositRequest object to be processed; when its slug is
                None a uuid4 string is assigned to it

    Returns a DepositResponse whose receipt is the serialised Deposit Receipt
    and whose location is the receipt's edit URI.

    Raises a SwordError with DataBankErrors.dataset_conflict when the slug
    already exists in the silo, or with Errors.bad_request when the slug is
    rejected by allowable_id2.
    """
    # check against the authorised list of silos
    rdf_silo = self._get_authorised_rdf_silo(silo)

    # ensure that we have a slug
    if deposit.slug is None:
        deposit.slug = str(uuid.uuid4())

    # weed out unacceptable deposits
    if rdf_silo.exists(deposit.slug):
        raise SwordError(error_uri=DataBankErrors.dataset_conflict,
                         msg="A Dataset with the name " + deposit.slug + " already exists")
    if not allowable_id2(deposit.slug):
        raise SwordError(error_uri=Errors.bad_request,
                         msg="Dataset name can contain only the following characters - " + ag.naming_rule_humanized + " and has to be more than 1 character")

    # NOTE: we pass in an empty dictionary of metadata on create, and then run
    # _ingest_metadata to augment the item from the deposit
    item = create_new(rdf_silo, deposit.slug, self.auth_credentials.username, {})
    add_dataset(silo, deposit.slug)
    self._ingest_metadata(item, deposit)

    # NOTE: left in for reference for the time being, but deposit_new
    # only support entry only deposits in databank.  This will need to be
    # re-introduced for full sword support
    # store the content file if one exists, and do some processing on it
    #deposit_uri = None
    #derived_resource_uris = []
    #if deposit.content is not None:
    #    if deposit.filename is None:
    #        deposit.filename = "unnamed.file"
    #    fn = self.dao.store_content(collection, id, deposit.content, deposit.filename)

    #    now that we have stored the atom and the content, we can invoke a package ingester over the top to extract
    #    all the metadata and any files we want

    #    FIXME: because the deposit interpreter doesn't deal with multipart properly
    #    we don't get the correct packaging format here if the package is anything
    #    other than Binary
    #    ssslog.info("attempting to load ingest packager for format " + str(deposit.packaging))
    #    packager = self.configuration.get_package_ingester(deposit.packaging)(self.dao)
    #    derived_resources = packager.ingest(collection, id, fn, deposit.metadata_relevant)

    #    An identifier which will resolve to the package just deposited
    #    deposit_uri = self.um.part_uri(collection, id, fn)

    #    a list of identifiers which will resolve to the derived resources
    #    derived_resource_uris = self.get_derived_resource_uris(collection, id, derived_resources)

    # the aggregation uri
    agg_uri = self.um.agg_uri(silo, deposit.slug)

    # the Edit-URI
    edit_uri = self.um.edit_uri(silo, deposit.slug)

    # create the initial statement
    s = Statement(aggregation_uri=agg_uri, rem_uri=edit_uri, states=[DataBankStates.initial_state])

    # FIXME: need to sort out authentication before we can do this ...
    # FIXME: also, it's not relevant unless we take a binary-only deposit, which
    # we currently don't
    # User already authorized to deposit in this silo (_get_authorised_rdf_silo).
    # This is to augment metadata with details like who created, on behalf of, when
    #
    #by = deposit.auth.username if deposit.auth is not None else None
    #obo = deposit.auth.on_behalf_of if deposit.auth is not None else None
    #if deposit_uri is not None:
    #    s.original_deposit(deposit_uri, datetime.now(), deposit.packaging, by, obo)
    #s.aggregates = derived_resource_uris

    # In creating the statement we use the existing manifest.rdf file in the
    # item; it is read inside a with-block so the handle is closed before
    # manifest.rdf is overwritten below
    manifest = item.get_rdf_manifest()
    with open(manifest.filepath, "r") as f:
        rdf_string = f.read()

    # create the new manifest and store it
    #Serialize rdf adds the sword statement - state, depositedOn, by, onBehalfOf, stateDesc
    new_manifest = s.serialise_rdf(rdf_string)
    item.put_stream("manifest.rdf", new_manifest)

    # FIXME: here is where we have to put the correct treatment in
    # now generate a receipt for the deposit
    # TODO: Add audit log from item.manifest in place of "created new item"
    receipt = self.deposit_receipt(silo, deposit.slug, item, "created new item")

    # FIXME: while we don't have full text deposit, we don't need to augment
    # the deposit receipt

    # now augment the receipt with the details of this particular deposit
    # this handles None arguments, and converts the xml receipt into a string
    # receipt = self.augmented_receipt(receipt, deposit_uri, derived_resource_uris)

    # finally, assemble the deposit response and return
    dr = DepositResponse()
    dr.receipt = receipt.serialise()
    dr.location = receipt.edit_uri

    # Broadcast change as message
    ag.b.creation(silo, deposit.slug, ident=self.auth_credentials.username)

    return dr
def replace(self, path, deposit):
    """
    Replace the content of an existing dataset with the supplied deposit.

    Args:
    - path:     the request path; resolved through self.um.interpret_path
                into the silo, the dataset id and the accept parameters
    - deposit:  a DepositRequest object

    Returns a DepositResponse whose receipt is the serialised Deposit Receipt
    and whose location is the receipt's edit URI.

    Raises a SwordError (Errors.bad_request) when the deposit has no filename
    or when the filename matches JAILBREAK.
    """
    silo, dataset_id, accept_parameters = self.um.interpret_path(path)
    rdf_silo = self._get_authorised_rdf_silo(silo)

    # now get the dataset object itself
    dataset = rdf_silo.get_item(dataset_id)

    # deal with possible problems with the filename
    if deposit.filename is None or deposit.filename == "":
        raise SwordError(error_uri=Errors.bad_request,
                         msg="You must supply a filename to unpack")
    if JAILBREAK.search(deposit.filename) is not None:
        raise SwordError(error_uri=Errors.bad_request,
                         msg="'..' cannot be used in the path or as a filename")

    # FIXME: at the moment this metadata operation is not supported by DataBank
    #
    # first figure out what to do about the metadata
    keep_atom = False  # only referenced by the disabled code kept below
    metadata_state = None  # This will be used to store any state information associated
                           # with a metadata update.  It gets tied up with the content state
                           # and any pre-existing states further down
    #if deposit.atom is not None:
    #    ssslog.info("Replace request has ATOM part - updating")
    #    entry_ingester = self.configuration.get_entry_ingester()(self.dao)
    #    entry_ingester.ingest(collection, id, deposit.atom)
    #    keep_atom = True

    content_state = None
    deposit_uri = None
    derived_resource_uris = []
    if deposit.content is not None:
        ssslog.info("Replace request has file content - updating")

        # remove all the old files before adding the new.  We always leave
        # behind the metadata; this will be overwritten later if necessary
        #self.dao.remove_content(collection, id, True, keep_atom)

        #Increment the version, but do not clone the previous version.
        # An update will replace the entire contents of the container (if previously unpacked) with the bagit file
        # NOTE(review): the call below passes clone_previous_version=True,
        # which does not match the comment above - confirm which is intended
        dataset.increment_version_delta(clone_previous_version=True, copy_filenames=['manifest.rdf'])

        # store the content file
        dataset.put_stream(deposit.filename, deposit.content)
        ssslog.debug("New incoming file stored with filename " + deposit.filename)

        # FIXME: unpacking doesn't happen here ... (keeping for the time being for reference)
        # Broadcast to unpack and add sword:state in manifest
        # <sword:state rdf:resource="http://purl.org/net/sword/state/queuedForUnpacking"/>

        # now that we have stored the atom and the content, we can invoke a package ingester over the top to extract
        # all the metadata and any files we want.  Notice that we pass in the metadata_relevant flag, so the
        # packager won't overwrite the existing metadata if it isn't supposed to
        #packager = self.configuration.get_package_ingester(deposit.packaging)(self.dao)
        #derived_resources = packager.ingest(collection, id, fn, deposit.metadata_relevant)
        #ssslog.debug("Resources derived from deposit: " + str(derived_resources))

        # a list of identifiers which will resolve to the derived resources
        #derived_resource_uris = self.get_derived_resource_uris(collection, id, derived_resources)

        # An identifier which will resolve to the package just deposited
        deposit_uri = self.um.file_uri(silo, dataset_id, deposit.filename)
        ssslog.debug("Incoming file has been stored at URI " + deposit_uri)

        # register a new content state to be used
        content_state = DataBankStates.zip_file_added

    # Taken from dataset.py, seems to be the done thing when adding an item.
    # NOTE: confirmed with Anusha that this is correct
    dataset.del_triple(dataset.uri, u"dcterms:modified")
    dataset.add_triple(dataset.uri, u"dcterms:modified", datetime.now())
    dataset.del_triple(dataset.uri, u"oxds:currentVersion")
    dataset.add_triple(dataset.uri, u"oxds:currentVersion", dataset.currentversion)

    # before we do any state management, we have to be sure that the sword namespace
    # is registered
    dataset.get_rdf_manifest().add_namespace("sword", "http://purl.org/net/sword/terms/")
    dataset.sync()

    # sort out the new list of states for the item
    current_states = self._extract_states(dataset)
    new_states = []

    # for each existing state, consider whether to carry it over
    ssslog.info("new content state: " + str(content_state))
    for state_uri, state_desc in current_states:
        keep = True
        if metadata_state is not None and state_uri in DataBankStates.metadata_states:
            # we do not want the state if it is a metadata state and we have been given
            # a new metadata state
            keep = False
        if content_state is not None and state_uri in DataBankStates.content_states:
            ssslog.debug("Removing state: " + state_uri)
            # we do not want the state if it is a content state and we have been given
            # a new content state
            keep = False
        if keep:
            ssslog.debug("carrying over state: " + state_uri)
            new_states.append((state_uri, state_desc))

    # add the new metadata and content states provided from above
    if metadata_state is not None:
        new_states.append(metadata_state)
    if content_state is not None:
        ssslog.debug("adding new content state: " + str(content_state))
        new_states.append(content_state)

    ssslog.debug("New Dataset States: " + str(new_states))

    # FIXME: how safe is this?  What other ore:aggregates might there be?
    # we need to back out some of the triples in preparation to update the
    # statement
    # NOTE AR: I have commented the following lines.
    #          For aggregates this is not needed. put_stream will add the aggregate into the URI.
    #          Why delete other triples in the manifest - ??
    # sword:originalDeposit point to isVersionOf
    aggregates = dataset.list_rdf_objects(dataset.uri, u"ore:aggregates")
    original_deposits = dataset.list_rdf_objects(dataset.uri, u"sword:originalDeposit")
    states = dataset.list_rdf_objects(dataset.uri, u"sword:state")

    for a in aggregates:
        dataset.del_triple(a, "*")
    for od in original_deposits:
        dataset.del_triple(od, "*")
    for state in states:
        dataset.del_triple(state, "*")
    dataset.del_triple(dataset.uri, u"ore:aggregates")
    dataset.del_triple(dataset.uri, u"sword:originalDeposit")
    dataset.del_triple(dataset.uri, u"sword:state")

    # FIXME: also unsafe in the same way as above
    # Write the md5 checksum into the manifest
    # A deposit contains just the new stuff so no harm in deleting all triples
    dataset.del_triple("*", u"oxds:hasMD5")
    #dataset.del_triple(deposit_uri, u"oxds:hasMD5")
    # NOTE(review): deposit_uri is still None here when the request carried
    # an md5 but no content - confirm add_triple tolerates that
    if deposit.content_md5 is not None:
        dataset.add_triple(deposit_uri, u"oxds:hasMD5", deposit.content_md5)

    dataset.sync()

    # the aggregation uri
    agg_uri = self.um.agg_uri(silo, dataset_id)

    # the Edit-URI
    edit_uri = self.um.edit_uri(silo, dataset_id)

    # FIXME: here we also need to keep existing states where relevant.
    #   A state will continue to be relevant if it applies to an area of the
    #   item (i.e. the container or the media resource) for which this operation
    #   has no effect.
    #   for example:
    #   this is a metadata replace, but a status on the item is set to say that
    #   the item's zip file is corrupt and needs replacing.  The new status
    #   should leave this alone (and probably not do anything, tbh), no matter
    #   what else it does
    # create the statement outline
    # FIXME: there is something weird going on with instantiating this object without the original_deposits argument
    # apparently if I don't explicitly say there are no original deposits, then it "remembers" original deposits
    # from previous uses of the object
    s = Statement(aggregation_uri=agg_uri, rem_uri=edit_uri, states=new_states, original_deposits=[])

    # set the original deposit (which sorts out the aggregations for us too)
    by = deposit.auth.username if deposit.auth is not None else None
    obo = deposit.auth.on_behalf_of if deposit.auth is not None else None
    if deposit_uri is not None:
        s.original_deposit(deposit_uri, datetime.now(), deposit.packaging, by, obo)

    # create the new manifest and store it; the existing manifest is read
    # inside a with-block so the handle is closed before manifest.rdf is
    # overwritten below
    manifest = dataset.get_rdf_manifest()
    with open(manifest.filepath, "r") as f:
        rdf_string = f.read()

    new_manifest = s.serialise_rdf(rdf_string)
    dataset.put_stream("manifest.rdf", new_manifest)

    # FIXME: add in proper treatment here
    # now generate a receipt.
    # TODO: Include audit log instead of 'added zip to dataset'
    receipt = self.deposit_receipt(silo, dataset_id, dataset, "added zip to dataset")

    # now augment the receipt with the details of this particular deposit
    # this handles None arguments, and converts the xml receipt into a string
    receipt = self.augmented_receipt(receipt, deposit_uri, derived_resource_uris)

    # finally, assemble the deposit response and return
    dr = DepositResponse()
    dr.receipt = receipt.serialise()
    dr.location = receipt.edit_uri
    return dr
def deposit_new(self, silo, deposit):
    """
    Take the supplied deposit and treat it as a new dataset to be created in
    the specified silo.

    Args:
    - silo:     the ID of the silo to be deposited into
    - deposit:  the DepositRequest object to be processed; when its slug is
                None a uuid4 string is assigned to it

    Returns a DepositResponse whose receipt is the serialised Deposit Receipt
    and whose location is the receipt's edit URI.

    Raises a SwordError with DataBankErrors.dataset_conflict when the slug
    already exists in the silo, or with Errors.bad_request when the slug is
    rejected by allowable_id2.
    """
    # check against the authorised list of silos
    rdf_silo = self._get_authorised_rdf_silo(silo)

    # ensure that we have a slug
    if deposit.slug is None:
        deposit.slug = str(uuid.uuid4())

    # weed out unacceptable deposits
    if rdf_silo.exists(deposit.slug):
        raise SwordError(error_uri=DataBankErrors.dataset_conflict,
                         msg="A Dataset with the name " + deposit.slug + " already exists")
    if not allowable_id2(deposit.slug):
        raise SwordError(error_uri=Errors.bad_request,
                         msg="Dataset name can contain only the following characters - " + ag.naming_rule_humanized + " and has to be more than 1 character")

    # NOTE: we pass in an empty dictionary of metadata on create, and then run
    # _ingest_metadata to augment the item from the deposit
    item = create_new(rdf_silo, deposit.slug, self.auth_credentials.username, {})
    add_dataset(silo, deposit.slug)
    self._ingest_metadata(item, deposit)

    # NOTE: left in for reference for the time being, but deposit_new
    # only support entry only deposits in databank.  This will need to be
    # re-introduced for full sword support
    # store the content file if one exists, and do some processing on it
    #deposit_uri = None
    #derived_resource_uris = []
    #if deposit.content is not None:
    #    if deposit.filename is None:
    #        deposit.filename = "unnamed.file"
    #    fn = self.dao.store_content(collection, id, deposit.content, deposit.filename)

    #    now that we have stored the atom and the content, we can invoke a package ingester over the top to extract
    #    all the metadata and any files we want

    #    FIXME: because the deposit interpreter doesn't deal with multipart properly
    #    we don't get the correct packaging format here if the package is anything
    #    other than Binary
    #    ssslog.info("attempting to load ingest packager for format " + str(deposit.packaging))
    #    packager = self.configuration.get_package_ingester(deposit.packaging)(self.dao)
    #    derived_resources = packager.ingest(collection, id, fn, deposit.metadata_relevant)

    #    An identifier which will resolve to the package just deposited
    #    deposit_uri = self.um.part_uri(collection, id, fn)

    #    a list of identifiers which will resolve to the derived resources
    #    derived_resource_uris = self.get_derived_resource_uris(collection, id, derived_resources)

    # the aggregation uri
    agg_uri = self.um.agg_uri(silo, deposit.slug)

    # the Edit-URI
    edit_uri = self.um.edit_uri(silo, deposit.slug)

    # create the initial statement
    s = Statement(aggregation_uri=agg_uri, rem_uri=edit_uri, states=[DataBankStates.initial_state])

    # FIXME: need to sort out authentication before we can do this ...
    # FIXME: also, it's not relevant unless we take a binary-only deposit, which
    # we currently don't
    # User already authorized to deposit in this silo (_get_authorised_rdf_silo).
    # This is to augment metadata with details like who created, on behalf of, when
    #
    #by = deposit.auth.username if deposit.auth is not None else None
    #obo = deposit.auth.on_behalf_of if deposit.auth is not None else None
    #if deposit_uri is not None:
    #    s.original_deposit(deposit_uri, datetime.now(), deposit.packaging, by, obo)
    #s.aggregates = derived_resource_uris

    # In creating the statement we use the existing manifest.rdf file in the
    # item; it is read inside a with-block so the handle is closed before
    # manifest.rdf is overwritten below
    manifest = item.get_rdf_manifest()
    with open(manifest.filepath, "r") as f:
        rdf_string = f.read()

    # create the new manifest and store it
    #Serialize rdf adds the sword statement - state, depositedOn, by, onBehalfOf, stateDesc
    new_manifest = s.serialise_rdf(rdf_string)
    item.put_stream("manifest.rdf", new_manifest)

    # FIXME: here is where we have to put the correct treatment in
    # now generate a receipt for the deposit
    # TODO: Add audit log from item.manifest in place of "created new item"
    receipt = self.deposit_receipt(silo, deposit.slug, item, "created new item")

    # FIXME: while we don't have full text deposit, we don't need to augment
    # the deposit receipt

    # now augment the receipt with the details of this particular deposit
    # this handles None arguments, and converts the xml receipt into a string
    # receipt = self.augmented_receipt(receipt, deposit_uri, derived_resource_uris)

    # finally, assemble the deposit response and return
    dr = DepositResponse()
    dr.receipt = receipt.serialise()
    dr.location = receipt.edit_uri

    # Broadcast change as message
    ag.b.creation(silo, deposit.slug, ident=self.auth_credentials.username)

    return dr
def test_03_rdf_serialise(self):
    """Serialise a Statement to RDF/XML and check the result element by element.

    A statement with two original deposits, four aggregated resources and a
    single state is serialised, parsed back, and every child of every
    rdf:Description is validated and counted.
    """
    n = datetime.now()
    # tuple layout appears to be (uri, timestamp, packaging, deposited-by,
    # on-behalf-of); only the first deposit carries an on-behalf-of value
    ods = [
        ("http://od1/", n, "http://package/", "sword", "obo"),
        ("http://od2/", n, "http://package/", "bob", None),
    ]
    od_uris = ["http://od1/", "http://od2/"]
    s = Statement(
        aggregation_uri="http://aggregation/",
        rem_uri="http://rem/",
        original_deposits=ods,
        aggregates=[
            "http://od1/",
            "http://od2/",
            "http://agg1/",
            "http://agg2/",
        ],
        states=[("http://state/", "everything is groovy")])

    rdf_string = s.serialise_rdf()

    # first try the round trip
    rdf = etree.fromstring(rdf_string)

    # here are some counters/switches which will help us test that everything
    # is good within the statement
    descriptions = 0
    states = 0
    state_descriptions = 0
    original_deposits = 0
    aggregated_resources = 0
    packaging = 0
    dep_on = 0
    dep_by = 0
    dep_obo = 0
    has_rem_description = False
    has_agg_description = False

    # now go through the rdf and check that everything is as expected
    for desc in rdf.findall(RDF + "Description"):
        descriptions += 1
        about = desc.get(RDF + "about")
        # Iterate the element directly rather than calling getchildren(),
        # which is deprecated in lxml and no longer exists in
        # xml.etree.ElementTree on Python 3.9+.
        for element in desc:
            tag = element.tag
            if tag == ORE + "describes":
                resource = element.get(RDF + "resource")
                assert about == s.rem_uri
                assert resource == s.aggregation_uri
                has_rem_description = True
            elif tag == ORE + "isDescribedBy":
                resource = element.get(RDF + "resource")
                assert about == s.aggregation_uri
                assert resource == s.rem_uri
                has_agg_description = True
            elif tag == ORE + "aggregates":
                resource = element.get(RDF + "resource")
                assert resource in s.aggregates or resource in od_uris
                aggregated_resources += 1
            elif tag == SWORD + "originalDeposit":
                resource = element.get(RDF + "resource")
                assert resource in od_uris
                original_deposits += 1
            elif tag == SWORD + "state":
                resource = element.get(RDF + "resource")
                assert resource == "http://state/"
                states += 1
            elif tag == SWORD + "stateDescription":
                assert element.text.strip() == "everything is groovy"
                assert about == "http://state/"
                state_descriptions += 1
            elif tag == SWORD + "packaging":
                resource = element.get(RDF + "resource")
                assert resource == "http://package/"
                assert about in od_uris
                packaging += 1
            elif tag == SWORD + "depositedOn":
                # only the subject is checked; the timestamp text is not
                assert about in od_uris
                dep_on += 1
            elif tag == SWORD + "depositedBy":
                assert element.text in ["sword", "bob"]
                assert about in od_uris
                dep_by += 1
            elif tag == SWORD + "depositedOnBehalfOf":
                assert element.text == "obo"
                assert about in od_uris
                dep_obo += 1

    # now check that our counters/switches were flipped appropriately
    assert descriptions == 5
    assert states == 1
    assert state_descriptions == 1
    assert original_deposits == 2
    assert aggregated_resources == 4
    assert packaging == 2
    assert dep_on == 2
    assert dep_by == 2
    # one deposit was made on behalf of someone, the other was not
    assert dep_obo == 1
    assert has_rem_description
    assert has_agg_description
def test_04_rdf_aggregation_uri_exists(self):
    """Serialise a Statement into an existing RDF document and check both.

    The statement is serialised on top of RDF_DOC. The same per-element
    checks as the plain serialisation test are applied, plus spot checks
    (one element per namespace) that content already present in the
    supplied document is still there afterwards.
    """
    n = datetime.now()
    # tuple layout appears to be (uri, timestamp, packaging, deposited-by,
    # on-behalf-of); only the first deposit carries an on-behalf-of value
    ods = [
        ("http://od1/", n, "http://package/", "sword", "obo"),
        ("http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip",
         n, "http://package/", "bob", None),
    ]
    od_uris = [
        "http://od1/",
        "http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip",
    ]
    s = Statement(
        aggregation_uri="http://192.168.23.133/asdfasd/datasets/mydataset6",
        rem_uri="http://rem/",
        original_deposits=ods,
        aggregates=[
            "http://od1/",
            "http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip",
            "http://agg1/",
            "http://agg2/",
        ],
        states=[("http://state/", "everything is groovy")])

    rdf_string = s.serialise_rdf(RDF_DOC)

    # first try the round trip
    rdf = etree.fromstring(rdf_string)

    # here are some counters/switches which will help us test that everything
    # is good within the statement
    descriptions = 0
    states = 0
    state_descriptions = 0
    original_deposits = 0
    aggregated_resources = 0
    packaging = 0
    dep_on = 0
    dep_by = 0
    dep_obo = 0
    has_rem_description = False
    has_agg_description = False
    ox_tag = False
    dc_tag = False
    rdf_tag = False

    # now go through the rdf and check that everything is as expected
    for desc in rdf.findall(RDF + "Description"):
        descriptions += 1
        about = desc.get(RDF + "about")
        # Iterate the element directly rather than calling getchildren(),
        # which is deprecated in lxml and no longer exists in
        # xml.etree.ElementTree on Python 3.9+.
        for element in desc:
            tag = element.tag

            # we expect all of the same things to be true as in the previous
            # test
            if tag == ORE + "describes":
                resource = element.get(RDF + "resource")
                assert about == s.rem_uri
                assert resource == s.aggregation_uri
                has_rem_description = True
            elif tag == ORE + "isDescribedBy":
                resource = element.get(RDF + "resource")
                assert about == s.aggregation_uri
                assert resource == s.rem_uri
                has_agg_description = True
            elif tag == ORE + "aggregates":
                resource = element.get(RDF + "resource")
                assert resource in s.aggregates or resource in od_uris
                aggregated_resources += 1
            elif tag == SWORD + "originalDeposit":
                resource = element.get(RDF + "resource")
                assert resource in od_uris
                original_deposits += 1
            elif tag == SWORD + "state":
                resource = element.get(RDF + "resource")
                assert resource == "http://state/"
                states += 1
            elif tag == SWORD + "stateDescription":
                assert element.text.strip() == "everything is groovy"
                assert about == "http://state/"
                state_descriptions += 1
            elif tag == SWORD + "packaging":
                resource = element.get(RDF + "resource")
                assert resource == "http://package/"
                assert about in od_uris
                packaging += 1
            elif tag == SWORD + "depositedOn":
                # only the subject is checked; the timestamp text is not
                assert about in od_uris
                dep_on += 1
            elif tag == SWORD + "depositedBy":
                assert element.text in ["sword", "bob"]
                assert about in od_uris
                dep_by += 1
            elif tag == SWORD + "depositedOnBehalfOf":
                assert element.text == "obo"
                assert about in od_uris
                dep_obo += 1

            # and we must verify that we didn't overwrite anything in the
            # passed in RDF document (don't check everything, but let's pick
            # one thing from each namespace)
            elif tag == OX + "currentVersion":
                assert element.text == "6"
                ox_tag = True
            elif tag == DC + "identifier":
                assert element.text == "mydataset6"
                dc_tag = True
            elif tag == RDF + "type":
                resource = element.get(RDF + "resource")
                assert resource == "http://vocab.ox.ac.uk/dataset/schema#DataSet"
                rdf_tag = True

    # now check that our counters/switches were flipped appropriately
    assert descriptions == 5
    assert states == 1
    assert state_descriptions == 1
    assert original_deposits == 2
    assert aggregated_resources == 4
    assert packaging == 2
    assert dep_on == 2
    assert dep_by == 2
    # one deposit was made on behalf of someone, the other was not
    assert dep_obo == 1
    assert has_rem_description
    assert has_agg_description
    # the pre-existing content of the supplied document survived
    assert ox_tag
    assert dc_tag
    assert rdf_tag