Example No. 1
    def test_01_init_statement(self):
        n = datetime.now()
        ods = [("http://od1/", n, "http://package/", "sword", "obo"),
               ("http://od2/", n, "http://package/", "bob", None)]
        s = Statement(aggregation_uri="http://aggregation/",
                      rem_uri="http://rem/",
                      original_deposits=ods,
                      aggregates=[
                          "http://od1/", "http://od2/", "http://agg1/",
                          "http://agg2/"
                      ],
                      states=[("http://state/", "everything is groovy")])

        # now check that the item is correctly initialised
        assert s.aggregation_uri == "http://aggregation/"
        assert s.rem_uri == "http://rem/"
        assert len(s.original_deposits) == 2
        assert "http://od1/" in s.original_deposits[0]
        assert "http://od2/" in s.original_deposits[1]
        assert "http://od1/" in s.aggregates
        assert "http://od2/" in s.aggregates
        assert "http://agg1/" in s.aggregates
        assert "http://agg2/" in s.aggregates
        assert len(s.aggregates) == 4
        assert len(s.states) == 1

        state_uri, state_description = s.states[0]
        assert state_uri == "http://state/"
        assert state_description == "everything is groovy"
Example No. 3
    def test_02_modify_statement(self):
        n = datetime.now()
        ods = [("http://od1/", n, "http://package/", "sword", "obo"),
               ("http://od2/", n, "http://package/", "bob", None)]
        s = Statement(aggregation_uri="http://aggregation/",
                      rem_uri="http://rem/",
                      original_deposits=ods,
                      aggregates=[
                          "http://od1/", "http://od2/", "http://agg1/",
                          "http://agg2/"
                      ],
                      states=[("http://state/", "everything is groovy")])

        s.set_state("http://new/state/", "still good, though")

        assert len(s.states) == 1
        state_uri, state_description = s.states[0]
        assert state_uri == "http://new/state/"
        assert state_description == "still good, though"

        s.add_state("http://another/state", "also, this")
        assert len(s.states) == 2
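
For reference, a minimal standalone sketch of the API the two tests above exercise. It assumes the same Statement class is importable (the import shown is only a guess at the path) and that original_deposits entries are (uri, deposited_on, packaging, deposited_by, on_behalf_of) tuples, as the fixtures suggest; set_state() appears to replace the whole state list, while add_state() appends to it.

from datetime import datetime
# from sss import Statement   # hypothetical import path; adjust to the project layout

# each original deposit: (uri, deposited_on, packaging, deposited_by, on_behalf_of)
ods = [("http://od1/", datetime.now(), "http://package/", "sword", None)]

s = Statement(aggregation_uri="http://aggregation/",
              rem_uri="http://rem/",
              original_deposits=ods,
              aggregates=["http://od1/"],
              states=[("http://state/", "first state")])

s.set_state("http://new/state/", "replacement state")   # states now holds just this entry
s.add_state("http://another/state", "appended state")   # states now holds two entries
rdf_xml = s.serialise_rdf()                              # RDF/XML string; see the serialisation tests below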
Example No. 8
    def replace(self, path, deposit):
        """
        Replace all the content represented by the supplied path with the supplied deposit
        Args:
        - path:  the URL path identifying the silo and dataset in the store
        - deposit:  a DepositRequest object
        Returns a DepositResponse containing the Deposit Receipt or a SWORD Error
        """
        silo, dataset_id, accept_parameters = self.um.interpret_path(path)
        rdf_silo = self._get_authorised_rdf_silo(silo)

        # now get the dataset object itself
        dataset = rdf_silo.get_item(dataset_id)

        # deal with possible problems with the filename
        if deposit.filename is None or deposit.filename == "":
            raise SwordError(error_uri=Errors.bad_request,
                             msg="You must supply a filename to unpack")
        if JAILBREAK.search(deposit.filename) is not None:
            raise SwordError(
                error_uri=Errors.bad_request,
                msg="'..' cannot be used in the path or as a filename")

        # FIXME: at the moment this metadata operation is not supported by DataBank
        #
        # first figure out what to do about the metadata
        keep_atom = False
        metadata_state = None  # This will be used to store any state information associated
        # with a metadata update.  It gets tied up with the content state
        # and any pre-existing states further down
        #if deposit.atom is not None:
        #    ssslog.info("Replace request has ATOM part - updating")
        #    entry_ingester = self.configuration.get_entry_ingester()(self.dao)
        #    entry_ingester.ingest(collection, id, deposit.atom)
        #    keep_atom = True

        content_state = None
        deposit_uri = None
        derived_resource_uris = []
        if deposit.content is not None:
            ssslog.info("Replace request has file content - updating")

            # remove all the old files before adding the new.  We always leave
            # behind the metadata; this will be overwritten later if necessary
            #self.dao.remove_content(collection, id, True, keep_atom)
            #Increment the version, but do not clone the previous version.
            # An update will replace the entire contents of the container (if previously unpacked) with the bagit file
            dataset.increment_version_delta(clone_previous_version=True,
                                            copy_filenames=['manifest.rdf'])

            # store the content file
            dataset.put_stream(deposit.filename, deposit.content)
            ssslog.debug("New incoming file stored with filename " +
                         deposit.filename)

            # FIXME: unpacking doesn't happen here ... (keeping for the time being for reference)
            # Broadcast to unpack and add sword:state in manifest
            # <sword:state rdf:resource="http://purl.org/net/sword/state/queuedForUnpacking"/>

            # now that we have stored the atom and the content, we can invoke a package ingester over the top to extract
            # all the metadata and any files we want.  Notice that we pass in the metadata_relevant flag, so the
            # packager won't overwrite the existing metadata if it isn't supposed to
            #packager = self.configuration.get_package_ingester(deposit.packaging)(self.dao)
            #derived_resources = packager.ingest(collection, id, fn, deposit.metadata_relevant)
            #ssslog.debug("Resources derived from deposit: " + str(derived_resources))

            # a list of identifiers which will resolve to the derived resources
            #derived_resource_uris = self.get_derived_resource_uris(collection, id, derived_resources)

            # An identifier which will resolve to the package just deposited
            deposit_uri = self.um.file_uri(silo, dataset_id, deposit.filename)
            ssslog.debug("Incoming file has been stored at URI " + deposit_uri)

            # register a new content state to be used
            content_state = DataBankStates.zip_file_added

        # Taken from dataset.py, seems to be the done thing when adding an item.
        # NOTE: confirmed with Anusha that this is correct
        dataset.del_triple(dataset.uri, u"dcterms:modified")
        dataset.add_triple(dataset.uri, u"dcterms:modified", datetime.now())
        dataset.del_triple(dataset.uri, u"oxds:currentVersion")
        dataset.add_triple(dataset.uri, u"oxds:currentVersion",
                           dataset.currentversion)

        # before we do any state management, we have to be sure that the sword namespace
        # is registered
        dataset.get_rdf_manifest().add_namespace(
            "sword", "http://purl.org/net/sword/terms/")
        dataset.sync()

        # sort out the new list of states for the item
        current_states = self._extract_states(dataset)
        new_states = []

        # for each existing state, consider whether to carry it over
        ssslog.info("new content state: " + str(content_state))
        for state_uri, state_desc in current_states:
            keep = True
            if metadata_state is not None and state_uri in DataBankStates.metadata_states:
                # we do not want the state if it is a metadata state and we have been given
                # a new metadata state
                keep = False
            if content_state is not None and state_uri in DataBankStates.content_states:
                ssslog.debug("Removing state: " + state_uri)
                # we do not want the state if it is a content state and we have been given
                # a new content state
                keep = False
            if keep:
                ssslog.debug("carrying over state: " + state_uri)
                new_states.append((state_uri, state_desc))

        # add the new metadata and content states provided from above
        if metadata_state is not None:
            new_states.append(metadata_state)
        if content_state is not None:
            ssslog.debug("adding new content state: " + str(content_state))
            new_states.append(content_state)

        ssslog.debug("New Dataset States: " + str(new_states))

        # FIXME: how safe is this?  What other ore:aggregates might there be?
        # we need to back out some of the triples in preparation to update the
        # statement
        # NOTE AR: I have commented the following lines.
        #       For aggregates this is not needed. put_stream will add the aggregate into the URI.
        #       Why delete other triples in the manifest - ??
        # sword:originalDeposit point to isVersionOf

        aggregates = dataset.list_rdf_objects(dataset.uri, u"ore:aggregates")
        original_deposits = dataset.list_rdf_objects(dataset.uri,
                                                     u"sword:originalDeposit")
        states = dataset.list_rdf_objects(dataset.uri, u"sword:state")

        for a in aggregates:
            dataset.del_triple(a, "*")
        for od in original_deposits:
            dataset.del_triple(od, "*")
        for s in states:
            dataset.del_triple(s, "*")
        dataset.del_triple(dataset.uri, u"ore:aggregates")
        dataset.del_triple(dataset.uri, u"sword:originalDeposit")
        dataset.del_triple(dataset.uri, u"sword:state")

        # FIXME: also unsafe in the same way as above
        # Write the md5 checksum into the manifest
        # A deposit contains just the new stuff so no harm in deleting all triples
        dataset.del_triple("*", u"oxds:hasMD5")
        #dataset.del_triple(deposit_uri, u"oxds:hasMD5")
        if deposit.content_md5 is not None:
            dataset.add_triple(deposit_uri, u"oxds:hasMD5",
                               deposit.content_md5)

        dataset.sync()

        # the aggregation uri
        agg_uri = self.um.agg_uri(silo, dataset_id)

        # the Edit-URI
        edit_uri = self.um.edit_uri(silo, dataset_id)

        # FIXME: here we also need to keep existing states where relevant.
        #   A state will continue to be relevant if it applies to an area of the
        #   item (i.e. the container or the media resource) for which this operation
        #   has no effect.
        #   for example:
        #   this is a metadata replace, but a status on the item is set to say that
        #   the item's zip file is corrupt and needs replacing.  The new status
        #   should leave this alone (and probably not do anything, tbh), no matter
        #   what else it does
        # create the statement outline
        # FIXME: there is something weird going on with instantiating this object without the original_deposits argument
        # apparently if I don't explicitly say there are no original deposits, then it "remembers" original deposits
        # from previous uses of the object
        s = Statement(aggregation_uri=agg_uri,
                      rem_uri=edit_uri,
                      states=new_states,
                      original_deposits=[])

        # set the original deposit (which sorts out the aggregations for us too)
        by = deposit.auth.username if deposit.auth is not None else None
        obo = deposit.auth.on_behalf_of if deposit.auth is not None else None
        if deposit_uri is not None:
            s.original_deposit(deposit_uri, datetime.now(), deposit.packaging,
                               by, obo)

        # create the new manifest and store it
        manifest = dataset.get_rdf_manifest()
        with open(manifest.filepath, "r") as f:
            rdf_string = f.read()

        new_manifest = s.serialise_rdf(rdf_string)
        dataset.put_stream("manifest.rdf", new_manifest)

        # FIXME: add in proper treatment here
        # now generate a receipt.
        # TODO: Include audit log instead of 'added zip to dataset'
        receipt = self.deposit_receipt(silo, dataset_id, dataset,
                                       "added zip to dataset")

        # now augment the receipt with the details of this particular deposit
        # this handles None arguments, and converts the xml receipt into a string
        receipt = self.augmented_receipt(receipt, deposit_uri,
                                         derived_resource_uris)

        # finally, assemble the deposit response and return
        dr = DepositResponse()
        dr.receipt = receipt.serialise()
        dr.location = receipt.edit_uri
        return dr
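
The subtle part of replace() is the state handling in the middle: existing states are carried over unless a new metadata or content state supersedes them, and the replacements are appended afterwards. Here is a sketch of that rule as a standalone helper, with assumed names (the original consults DataBankStates.metadata_states and DataBankStates.content_states for the superseding groups):

def carry_over_states(current_states, metadata_state, content_state,
                      metadata_state_uris, content_state_uris):
    """Build the new (uri, description) state list for a dataset.

    Existing states are kept unless they belong to a group (metadata or
    content) for which a replacement state has been supplied; the
    replacement states themselves are appended at the end.
    """
    new_states = []
    for state_uri, state_desc in current_states:
        superseded = ((metadata_state is not None and state_uri in metadata_state_uris)
                      or (content_state is not None and state_uri in content_state_uris))
        if not superseded:
            new_states.append((state_uri, state_desc))
    if metadata_state is not None:
        new_states.append(metadata_state)
    if content_state is not None:
        new_states.append(content_state)
    return new_states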
Example No. 9
    def deposit_new(self, silo, deposit):
        """
        Take the supplied deposit and treat it as a new container with content to be created in the specified silo
        Args:
        -silo:          the ID of the silo to be deposited into
        -deposit:       the DepositRequest object to be processed
        Returns a DepositResponse object which will contain the Deposit Receipt or a SWORD Error
        """
        # check against the authorised list of silos
        rdf_silo = self._get_authorised_rdf_silo(silo)

        # ensure that we have a slug
        if deposit.slug is None:
            deposit.slug = str(uuid.uuid4())

        # weed out unacceptable deposits
        if rdf_silo.exists(deposit.slug):
            raise SwordError(error_uri=DataBankErrors.dataset_conflict,
                             msg="A Dataset with the name " + deposit.slug +
                             " already exists")
        if not allowable_id2(deposit.slug):
            raise SwordError(
                error_uri=Errors.bad_request,
                msg="Dataset name can contain only the following characters - "
                + ag.naming_rule_humanized +
                " and has to be more than 1 character")

        # NOTE: we pass in an empty dictionary of metadata on create, and then run
        # _ingest_metadata to augment the item from the deposit
        item = create_new(rdf_silo, deposit.slug,
                          self.auth_credentials.username, {})
        add_dataset(silo, deposit.slug)
        self._ingest_metadata(item, deposit)

        # NOTE: left in for reference for the time being, but deposit_new
        # only supports entry-only deposits in databank.  This will need to be
        # re-introduced for full sword support
        # store the content file if one exists, and do some processing on it
        #deposit_uri = None
        #derived_resource_uris = []
        #if deposit.content is not None:

        #     if deposit.filename is None:
        #         deposit.filename = "unnamed.file"
        #     fn = self.dao.store_content(collection, id, deposit.content, deposit.filename)

        # now that we have stored the atom and the content, we can invoke a package ingester over the top to extract
        # all the metadata and any files we want

        # FIXME: because the deposit interpreter doesn't deal with multipart properly
        # we don't get the correct packaging format here if the package is anything
        # other than Binary
        #     ssslog.info("attempting to load ingest packager for format " + str(deposit.packaging))
        #     packager = self.configuration.get_package_ingester(deposit.packaging)(self.dao)
        #     derived_resources = packager.ingest(collection, id, fn, deposit.metadata_relevant)

        # An identifier which will resolve to the package just deposited
        #     deposit_uri = self.um.part_uri(collection, id, fn)

        # a list of identifiers which will resolve to the derived resources
        #     derived_resource_uris = self.get_derived_resource_uris(collection, id, derived_resources)

        # the aggregation uri
        agg_uri = self.um.agg_uri(silo, deposit.slug)

        # the Edit-URI
        edit_uri = self.um.edit_uri(silo, deposit.slug)

        # create the initial statement
        s = Statement(aggregation_uri=agg_uri,
                      rem_uri=edit_uri,
                      states=[DataBankStates.initial_state])

        # FIXME: need to sort out authentication before we can do this ...
        # FIXME: also, it's not relevant unless we take a binary-only deposit, which
        # we currently don't
        # User already authorized to deposit in this silo (_get_authorised_rdf_silo).
        # This is to augment metadata with details like who created, on behalf of, when
        #
        #by = deposit.auth.username if deposit.auth is not None else None
        #obo = deposit.auth.on_behalf_of if deposit.auth is not None else None
        #if deposit_uri is not None:
        #    s.original_deposit(deposit_uri, datetime.now(), deposit.packaging, by, obo)
        #s.aggregates = derived_resource_uris

        # In creating the statement we use the existing manifest.rdf file in the
        # item:
        manifest = item.get_rdf_manifest()
        with open(manifest.filepath, "r") as f:
            rdf_string = f.read()

        # create the new manifest and store it
        #Serialize rdf adds the sword statement - state, depositedOn, by, onBehalfOf, stateDesc
        new_manifest = s.serialise_rdf(rdf_string)
        item.put_stream("manifest.rdf", new_manifest)

        # FIXME: here is where we have to put the correct treatment in
        # now generate a receipt for the deposit
        # TODO: Add audit log from item.manifest in place of  "created new item"
        receipt = self.deposit_receipt(silo, deposit.slug, item,
                                       "created new item")

        # FIXME: while we don't have full text deposit, we don't need to augment
        # the deposit receipt

        # now augment the receipt with the details of this particular deposit
        # this handles None arguments, and converts the xml receipt into a string
        # receipt = self.augmented_receipt(receipt, deposit_uri, derived_resource_uris)

        # finally, assemble the deposit response and return
        dr = DepositResponse()
        dr.receipt = receipt.serialise()
        dr.location = receipt.edit_uri

        # Broadcast change as message
        ag.b.creation(silo, deposit.slug, ident=self.auth_credentials.username)

        return dr
Example No. 10
    def test_03_rdf_serialise(self):
        n = datetime.now()
        ods = [("http://od1/", n, "http://package/", "sword", "obo"),
               ("http://od2/", n, "http://package/", "bob", None)]
        od_uris = ["http://od1/", "http://od2/"]
        s = Statement(aggregation_uri="http://aggregation/",
                      rem_uri="http://rem/",
                      original_deposits=ods,
                      aggregates=[
                          "http://od1/", "http://od2/", "http://agg1/",
                          "http://agg2/"
                      ],
                      states=[("http://state/", "everything is groovy")])

        rdf_string = s.serialise_rdf()

        # first try the round trip
        rdf = etree.fromstring(rdf_string)

        # here are some counters/switches which will help us test that everything
        # is good within the statement
        descriptions = 0
        states = 0
        state_descriptions = 0
        original_deposits = 0
        aggregated_resources = 0
        packaging = 0
        dep_on = 0
        dep_by = 0
        dep_obo = 0

        has_rem_description = False
        has_agg_description = False

        # now go through the rdf and check that everything is as expected
        for desc in rdf.findall(RDF + "Description"):
            descriptions += 1
            about = desc.get(RDF + "about")
            for element in desc:
                if element.tag == ORE + "describes":
                    resource = element.get(RDF + "resource")
                    assert about == s.rem_uri
                    assert resource == s.aggregation_uri
                    has_rem_description = True
                if element.tag == ORE + "isDescribedBy":
                    resource = element.get(RDF + "resource")
                    assert about == s.aggregation_uri
                    assert resource == s.rem_uri
                    has_agg_description = True
                if element.tag == ORE + "aggregates":
                    resource = element.get(RDF + "resource")
                    assert resource in s.aggregates or resource in od_uris
                    aggregated_resources += 1
                if element.tag == SWORD + "originalDeposit":
                    resource = element.get(RDF + "resource")
                    assert resource in od_uris
                    original_deposits += 1
                if element.tag == SWORD + "state":
                    resource = element.get(RDF + "resource")
                    assert resource == "http://state/"
                    states += 1
                if element.tag == SWORD + "stateDescription":
                    assert element.text.strip() == "everything is groovy"
                    assert about == "http://state/"
                    state_descriptions += 1
                if element.tag == SWORD + "packaging":
                    resource = element.get(RDF + "resource")
                    assert resource == "http://package/"
                    assert about in od_uris
                    packaging += 1
                if element.tag == SWORD + "depositedOn":
                    assert about in od_uris
                    dep_on += 1
                if element.tag == SWORD + "depositedBy":
                    assert element.text in ["sword", "bob"]
                    assert about in od_uris
                    dep_by += 1
                if element.tag == SWORD + "depositedOnBehalfOf":
                    assert element.text == "obo"
                    assert about in od_uris
                    dep_obo += 1

        # now check that our counters/switches were flipped appropriately
        assert descriptions == 5
        assert states == 1
        assert state_descriptions == 1
        assert original_deposits == 2
        assert aggregated_resources == 4
        assert packaging == 2
        assert dep_on == 2
        assert dep_by == 2
        assert dep_obo == 1
        assert has_rem_description
        assert has_agg_description
Example No. 11
    def test_04_rdf_aggregation_uri_exists(self):
        n = datetime.now()
        ods = [
            ("http://od1/", n, "http://package/", "sword", "obo"),
            ("http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip",
             n, "http://package/", "bob", None)
        ]
        od_uris = [
            "http://od1/",
            "http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip"
        ]
        s = Statement(
            aggregation_uri="http://192.168.23.133/asdfasd/datasets/mydataset6",
            rem_uri="http://rem/",
            original_deposits=ods,
            aggregates=[
                "http://od1/",
                "http://192.168.23.133/asdfasd/datasets/mydataset6/example.zip",
                "http://agg1/", "http://agg2/"
            ],
            states=[("http://state/", "everything is groovy")])

        rdf_string = s.serialise_rdf(RDF_DOC)

        # first try the round trip
        rdf = etree.fromstring(rdf_string)

        # here are some counters/switches which will help us test that everything
        # is good within the statement
        descriptions = 0
        states = 0
        state_descriptions = 0
        original_deposits = 0
        aggregated_resources = 0
        packaging = 0
        dep_on = 0
        dep_by = 0
        dep_obo = 0

        has_rem_description = False
        has_agg_description = False
        ox_tag = False
        dc_tag = False
        rdf_tag = False

        # now go through the rdf and check that everything is as expected
        for desc in rdf.findall(RDF + "Description"):
            descriptions += 1
            about = desc.get(RDF + "about")
            for element in desc:
                # we expect all of the same things to be true as in the previous
                # test
                if element.tag == ORE + "describes":
                    resource = element.get(RDF + "resource")
                    assert about == s.rem_uri
                    assert resource == s.aggregation_uri
                    has_rem_description = True
                if element.tag == ORE + "isDescribedBy":
                    resource = element.get(RDF + "resource")
                    assert about == s.aggregation_uri
                    assert resource == s.rem_uri
                    has_agg_description = True
                if element.tag == ORE + "aggregates":
                    resource = element.get(RDF + "resource")
                    assert resource in s.aggregates or resource in od_uris
                    aggregated_resources += 1
                if element.tag == SWORD + "originalDeposit":
                    resource = element.get(RDF + "resource")
                    assert resource in od_uris
                    original_deposits += 1
                if element.tag == SWORD + "state":
                    resource = element.get(RDF + "resource")
                    assert resource == "http://state/"
                    states += 1
                if element.tag == SWORD + "stateDescription":
                    assert element.text.strip() == "everything is groovy"
                    assert about == "http://state/"
                    state_descriptions += 1
                if element.tag == SWORD + "packaging":
                    resource = element.get(RDF + "resource")
                    assert resource == "http://package/"
                    assert about in od_uris
                    packaging += 1
                if element.tag == SWORD + "depositedOn":
                    assert about in od_uris
                    dep_on += 1
                if element.tag == SWORD + "depositedBy":
                    assert element.text in ["sword", "bob"]
                    assert about in od_uris
                    dep_by += 1
                if element.tag == SWORD + "depositedOnBehalfOf":
                    assert element.text == "obo"
                    assert about in od_uris
                    dep_obo += 1

                # and we must verify that we didn't overwrite anything in the
                # passed in RDF document (don't check everything, but let's pick
                # one thing from each namespace)
                if element.tag == OX + "currentVersion":
                    assert element.text == "6"
                    ox_tag = True
                if element.tag == DC + "identifier":
                    assert element.text == "mydataset6"
                    dc_tag = True
                if element.tag == RDF + "type":
                    resource = element.get(RDF + "resource")
                    assert resource == "http://vocab.ox.ac.uk/dataset/schema#DataSet"
                    rdf_tag = True

        # now check that our counters/switches were flipped appropriately
        assert descriptions == 5
        assert states == 1
        assert state_descriptions == 1
        assert original_deposits == 2
        assert aggregated_resources == 4
        assert packaging == 2
        assert dep_on == 2
        assert dep_by == 2
        assert dep_obo == 1
        assert has_rem_description
        assert has_agg_description

        assert ox_tag
        assert dc_tag
        assert rdf_tag
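
The RDF, ORE, SWORD, OX and DC names used as element-tag prefixes in the two serialisation tests are Clark-notation namespace strings. Only the SWORD terms namespace is confirmed by these examples (replace() registers it on the manifest); the rest follow the standard vocabularies and should be read as assumptions:

# Clark-notation prefixes assumed by the assertions above (ElementTree/lxml style)
RDF = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}"
ORE = "{http://www.openarchives.org/ore/terms/}"
SWORD = "{http://purl.org/net/sword/terms/}"     # registered as "sword" in replace() above
OX = "{http://vocab.ox.ac.uk/dataset/schema#}"   # inferred from the rdf:type assertion in test_04
DC = "{http://purl.org/dc/terms/}"               # assumption; could also be dc/elements/1.1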