def subitemview(self, silo, id, path, subpath):
    # Serve one entry from inside a zipfile stored in a dataset.
    # TODO: I check that the path is valid and that it is a zip file.
    #       I do not yet deal with subpath: if it is a file - serve it;
    #       if it is a dir - show its contents.
    # tmpl_context variables set for the template:
    #   c.silo_name, c.id, c.path, c.subpath, c.ident
    if not ag.granary.issilo(silo):
        abort(404)
    # A zipfile name (and optionally a path inside it) is mandatory.
    if not (path or subpath):
        abort(400, "You must supply a filename to unpack")
    rdfsilo = ag.granary.get_rdf_silo(silo)
    if not rdfsilo.exists(id):
        abort(404)
    c.silo_name = silo
    c.id = id
    c.path = path
    c.subpath = subpath
    ident = request.environ.get("repoze.who.identity")
    c.ident = ident
    dataset = rdfsilo.get_item(id)
    # Embargoed datasets are only visible to authenticated users with
    # access to this silo. NOTE(review): embargo flag is compared against
    # ["false", 0, False] — anything else (including missing) is treated
    # as embargoed.
    if dataset.metadata.get("embargoed") not in ["false", 0, False]:
        if not ident:
            abort(401, "Not Authorised")
        silos = ag.authz(ident)
        if silo not in silos:
            abort(403, "Forbidden")
    item_real_filepath = dataset.to_dirpath()
    target_filepath = "%s/%s" % (item_real_filepath, path)
    # c.parts = dataset.list_parts(detailed=False)
    # The requested file must exist both in the dataset manifest and on disk,
    # and must actually be a zipfile.
    if not dataset.isfile(path):
        abort(404, "File not found")
    if not os.path.isfile(target_filepath):
        abort(404, "File not found")
    if not check_file_mimetype(target_filepath, "application/zip"):
        abort(415, "File is not of type application/zip")
    # TODO: if subpath is a file - serve it. If subpath is a dir, show the
    # contents of the dir.
    return render("/zipfilesubitemview.html")
def datasetview(self, silo, id):
    """GET: list the zipfiles in dataset 'id' within silo 'silo'.
    POST: unpack one of those zipfiles into a new / existing dataset.
    """
    if not ag.granary.issilo(silo):
        abort(404)
    rdfsilo = ag.granary.get_rdf_silo(silo)
    if not rdfsilo.exists(id):
        abort(404)
    # tmpl_context variables needed: c.silo_name, c.zipfiles, c.ident, c.id, c.path
    c.silo_name = silo
    c.id = id
    ident = request.environ.get("repoze.who.identity")
    c.ident = ident
    dataset = rdfsilo.get_item(id)
    # Dig the creator out of the manifest metadata, guarding every level
    # because any of them may be absent.
    creator = None
    if (
        dataset.manifest
        and dataset.manifest.state
        and "metadata" in dataset.manifest.state
        and dataset.manifest.state["metadata"]
        and "createdby" in dataset.manifest.state["metadata"]
        and dataset.manifest.state["metadata"]["createdby"]
    ):
        creator = dataset.manifest.state["metadata"]["createdby"]
    http_method = request.environ["REQUEST_METHOD"]
    if http_method == "GET":
        # For GET, work out whether the viewer may edit: the creator, or a
        # silo administrator / manager. Authentication is only mandatory
        # when metadata is embargoed.
        c.editor = False
        if ag.metadata_embargoed:
            if not ident:
                abort(401, "Not Authorised")
            silos = ag.authz(ident)
            if silo not in silos:
                abort(403, "Forbidden")
            silos_admin = ag.authz(ident, permission="administrator")
            silos_manager = ag.authz(ident, permission="manager")
            # if ident['repoze.who.userid'] == creator or ident.get('role') in ["admin", "manager"]:
            if ident["repoze.who.userid"] == creator or silo in silos_admin or silo in silos_manager:
                c.editor = True
        elif ident:
            silos = ag.authz(ident)
            if silo in silos:
                silos_admin = ag.authz(ident, permission="administrator")
                silos_manager = ag.authz(ident, permission="manager")
                # if ident['repoze.who.userid'] == creator or ident.get('role') in ["admin", "manager"]:
                if ident["repoze.who.userid"] == creator or silo in silos_admin or silo in silos_manager:
                    c.editor = True
    else:
        # identity management of item: non-GET methods require the creator
        # or a silo administrator / manager.
        if not ident:
            abort(401, "Not Authorised")
        silos = ag.authz(ident)
        if silo not in silos:
            abort(403, "Forbidden")
        silos_admin = ag.authz(ident, permission="administrator")
        silos_manager = ag.authz(ident, permission="manager")
        # if not (ident['repoze.who.userid'] == creator or ident.get('role') in ["admin", "manager"]):
        if not (ident["repoze.who.userid"] == creator or silo in silos_admin or silo in silos_manager):
            abort(403, "Forbidden")
    if http_method == "GET":
        c.zipfiles = get_zipfiles_in_dataset(dataset)
        # conneg return: walk the Accept header, defaulting to text/html.
        accept_list = None
        if "HTTP_ACCEPT" in request.environ:
            try:
                accept_list = conneg_parse(request.environ["HTTP_ACCEPT"])
            except:
                accept_list = [MT("text", "html")]
        if not accept_list:
            accept_list = [MT("text", "html")]
        mimetype = accept_list.pop(0)
        while mimetype:
            if str(mimetype).lower() in ["text/html", "text/xhtml"]:
                return render("/list_of_zipfiles.html")
            elif str(mimetype).lower() in ["text/plain", "application/json"]:
                response.content_type = 'application/json; charset="UTF-8"'
                response.status_int = 200
                response.status = "200 OK"
                # return simplejson.dumps(dict(c.zipfiles))
                return simplejson.dumps(list(c.zipfiles.keys()))
            try:
                mimetype = accept_list.pop(0)
            except IndexError:
                mimetype = None
        # Whoops nothing satisfies - return text/html
        return render("/list_of_zipfiles.html")
    elif http_method == "POST":
        params = request.POST
        if not (params.has_key("filename") and params["filename"]):
            abort(400, "You must supply a filename to unpack")
        item_real_filepath = dataset.to_dirpath()
        target_filepath = "%s/%s" % (item_real_filepath, params["filename"])
        if not os.path.isfile(target_filepath):
            abort(404, "File to unpack not found")
        if not check_file_mimetype(target_filepath, "application/zip"):
            abort(415, "File is not of type application/zip")
        # Unpack target defaults to the source dataset itself unless an
        # explicit 'id' parameter names another dataset.
        if params.has_key("id") and params["id"]:
            target_dataset_name = params["id"]
        else:
            # (head, fn) = os.path.split(params['filename'])
            # (fn, ext) = os.path.splitext(fn)
            # target_dataset_name = "%s-%s"%(id,fn)
            target_dataset_name = id
        # step 1: Create / initialize target dataset
        if not rdfsilo.exists(target_dataset_name):
            if not allowable_id2(target_dataset_name):
                response.content_type = "text/plain"
                response.status_int = 400
                response.status = "400 Bad request. Data package name not valid"
                return (
                    "Data package name can contain only the following characters - %s and has to be more than 1 character"
                    % ag.naming_rule_humanized
                )
            target_dataset = create_new(rdfsilo, target_dataset_name, ident["repoze.who.userid"])
            response.status_int = 201
            response.status = "201 Created"
            response.headers["Content-Location"] = url(
                controller="datasets", action="datasetview", silo=silo, id=target_dataset_name
            )
            response_message = "201 Created"
        else:
            target_dataset = rdfsilo.get_item(target_dataset_name)
            response.status = "204 Updated"
            response.status_int = 204
            response_message = None
        # step 2: Unpack zip item
        try:
            unpack_zip_item(target_dataset, dataset, params["filename"], rdfsilo, ident["repoze.who.userid"])
        except BadZipfile:
            abort(400, "BadZipfile: Couldn't unpack zipfile")
        target_dataset.sync()
        target_dataset.sync()
        target_dataset.sync()
        # Broadcast creation/change; best-effort, failures ignored.
        # NOTE(review): broadcasts use the source dataset 'id', not
        # target_dataset_name — confirm this is intended when unpacking
        # into a different dataset.
        if response.status_int == 201:
            try:
                ag.b.creation(silo, id, ident=ident["repoze.who.userid"])
            except:
                pass
        else:
            try:
                ag.b.change(silo, id, ident=ident["repoze.who.userid"])
            except:
                pass
        # conneg return
        accept_list = None
        if "HTTP_ACCEPT" in request.environ:
            try:
                accept_list = conneg_parse(request.environ["HTTP_ACCEPT"])
            except:
                accept_list = [MT("text", "html")]
        if not accept_list:
            accept_list = [MT("text", "html")]
        mimetype = accept_list.pop(0)
        while mimetype:
            if str(mimetype).lower() in ["text/html", "text/xhtml"]:
                redirect(url(controller="datasets", action="datasetview", silo=silo, id=target_dataset_name))
            elif str(mimetype).lower() in ["text/plain", "application/json"]:
                response.content_type = "text/plain"
                return response_message
            try:
                mimetype = accept_list.pop(0)
            except IndexError:
                mimetype = None
        # Whoops - nothing satisfies - return text/plain
        response.content_type = "text/plain"
        return response_message
def itemview(self, silo, id, path):
    """API call to
    GET  - read the contents of a zip-file (without having to unpack),
    POST - unpack a zip file into a new / existing dataset,
    PUT  - add the zipfile and unpack it onto the existing dataset.

    BUGFIX: the POST branch previously called
    unpack_zip_item(target_dataset_name, ...) — passing the dataset *name*
    string where every other call site in this file (datasetview POST, the
    PUT branch below, sword replace) passes the dataset *object*; the
    following target_dataset.sync() calls confirm the object is intended.
    It now passes target_dataset.
    """
    # tmpl_context variables needed: c.silo_name, c.zipfile_contents,
    # c.ident, c.id, c.path
    if not path:
        abort(400, "You must supply a filename to unpack")
    if not ag.granary.issilo(silo):
        abort(404)
    rdfsilo = ag.granary.get_rdf_silo(silo)
    if not rdfsilo.exists(id):
        abort(404)
    c.silo_name = silo
    c.id = id
    c.path = path
    ident = request.environ.get("repoze.who.identity")
    c.ident = ident
    dataset = rdfsilo.get_item(id)
    # Dig the creator out of the manifest metadata, guarding every level.
    creator = None
    if (
        dataset.manifest
        and dataset.manifest.state
        and "metadata" in dataset.manifest.state
        and dataset.manifest.state["metadata"]
        and "createdby" in dataset.manifest.state["metadata"]
        and dataset.manifest.state["metadata"]["createdby"]
    ):
        creator = dataset.manifest.state["metadata"]["createdby"]
    http_method = request.environ["REQUEST_METHOD"]
    if http_method == "GET":
        # Reads only need authentication when the dataset is embargoed.
        if dataset.metadata.get("embargoed") not in ["false", 0, False]:
            if not ident:
                abort(401, "Not Authorised")
            silos = ag.authz(ident)
            if silo not in silos:
                abort(403, "Forbidden")
    else:
        # Writes (POST/PUT) require the creator or a silo admin/manager.
        if not ident:
            abort(401, "Not Authorised")
        silos = ag.authz(ident)
        if silo not in silos:
            abort(403, "Forbidden")
        silos_admin = ag.authz(ident, permission="administrator")
        silos_manager = ag.authz(ident, permission="manager")
        # if not (ident['repoze.who.userid'] == creator or ident.get('role') in ["admin", "manager"]):
        if not (ident["repoze.who.userid"] == creator or silo in silos_admin or silo in silos_manager):
            abort(403, "Forbidden")
    item_real_filepath = dataset.to_dirpath()
    target_filepath = "%s/%s" % (item_real_filepath, path)
    # c.parts = dataset.list_parts(detailed=False)
    # For GET/POST the named zipfile must already exist; PUT creates it.
    if http_method in ["GET", "POST"]:
        if not dataset.isfile(path):
            abort(404, "File not found")
        if not os.path.isfile(target_filepath):
            abort(404, "File not found")
        if not check_file_mimetype(target_filepath, "application/zip"):
            abort(415, "File is not of type application/zip")
    if http_method == "GET":
        try:
            c.zipfile_contents = read_zipfile(target_filepath)
        except BadZipfile:
            abort(400, "Could not read zipfile")
        # conneg return: walk the Accept header, defaulting to text/html.
        accept_list = None
        if "HTTP_ACCEPT" in request.environ:
            try:
                accept_list = conneg_parse(request.environ["HTTP_ACCEPT"])
            except:
                accept_list = [MT("text", "html")]
        if not accept_list:
            accept_list = [MT("text", "html")]
        mimetype = accept_list.pop(0)
        while mimetype:
            if str(mimetype).lower() in ["text/html", "text/xhtml"]:
                return render("/zipfileview.html")
            elif str(mimetype).lower() in ["text/plain", "application/json"]:
                response.content_type = 'application/json; charset="UTF-8"'
                response.status_int = 200
                response.status = "200 OK"
                return simplejson.dumps(c.zipfile_contents)
            try:
                mimetype = accept_list.pop(0)
            except IndexError:
                mimetype = None
        # Whoops - nothing satisfies - return text/html
        return render("/zipfileview.html")
    elif http_method == "POST":
        params = request.POST
        # if not (params.has_key("filename") and params['filename']):
        #     abort(400, "You must supply a filename to unpack")
        # Unpack target defaults to this dataset unless 'id' names another.
        if params.has_key("id") and params["id"]:
            target_dataset_name = params["id"]
        else:
            # (head, fn) = os.path.split(path)
            # (fn, ext) = os.path.splitext(fn)
            # target_dataset_name = "%s-%s"%(id,fn)
            target_dataset_name = id
        # step 1: Create / initialize target dataset
        if not rdfsilo.exists(target_dataset_name):
            if not allowable_id2(target_dataset_name):
                response.content_type = "text/plain"
                response.status_int = 400
                response.status = "400 Bad request. Data package name not valid"
                return (
                    "Data package name can contain only the following characters - %s and has to be more than 1 character"
                    % ag.naming_rule_humanized
                )
            target_dataset = create_new(rdfsilo, target_dataset_name, ident["repoze.who.userid"])
            response.status_int = 201
            response.status = "201 Created"
            response.headers["Content-Location"] = url(
                controller="datasets", action="datasetview", silo=silo, id=target_dataset_name
            )
            response_message = "201 Created"
        else:
            target_dataset = rdfsilo.get_item(target_dataset_name)
            response.status = "204 Updated"
            response.status_int = 204
            response_message = None
        # step 2: Unpack zip item
        # FIX: pass the dataset object (was the name string target_dataset_name)
        try:
            unpack_zip_item(target_dataset, dataset, path, rdfsilo, ident["repoze.who.userid"])
        except BadZipfile:
            abort(400, "Couldn't unpack zipfile")
        target_dataset.sync()
        target_dataset.sync()
        target_dataset.sync()
        # Broadcast creation/change; best-effort, failures ignored.
        if response.status_int == 201:
            try:
                ag.b.creation(silo, id, ident=ident["repoze.who.userid"])
            except:
                pass
        else:
            try:
                ag.b.change(silo, id, ident=ident["repoze.who.userid"])
            except:
                pass
        # conneg return
        accept_list = None
        if "HTTP_ACCEPT" in request.environ:
            try:
                accept_list = conneg_parse(request.environ["HTTP_ACCEPT"])
            except:
                accept_list = [MT("text", "html")]
        if not accept_list:
            accept_list = [MT("text", "html")]
        mimetype = accept_list.pop(0)
        while mimetype:
            if str(mimetype).lower() in ["text/html", "text/xhtml"]:
                redirect(url(controller="datasets", action="datasetview", silo=silo, id=target_dataset_name))
            elif str(mimetype).lower() in ["text/plain", "application/json"]:
                response.content_type = "text/plain"
                return response_message
            try:
                mimetype = accept_list.pop(0)
            except IndexError:
                mimetype = None
        # Whoops - nothing satisfies - return text/plain
        response.content_type = "text/plain"
        return response_message
    elif http_method == "PUT":
        # Pylons loads the request body into request.body...
        # This is not going to work for large files... ah well
        # POST will handle large files as they are pushed to disc, but this won't
        content = request.body
        if JAILBREAK.search(path) != None:
            abort(400, "'..' cannot be used in the path")
        # Step 1: Put zipfile in dataset
        if dataset.isdir(path):
            response.content_type = "text/plain"
            response.status_int = 403
            response.status = "403 Forbidden"
            return "Cannot PUT a file on to an existing directory"
        # 204 = replacing an existing file, 201 = creating a new one.
        if dataset.isfile(path):
            code = 204
        else:
            code = 201
        if code == 204:
            dataset.increment_version_delta(clone_previous_version=True, copy_filenames=["manifest.rdf", path])
        else:
            dataset.increment_version_delta(clone_previous_version=True, copy_filenames=["manifest.rdf"])
        dataset.put_stream(path, content)
        dataset.del_triple(dataset.uri, u"dcterms:modified")
        dataset.add_triple(dataset.uri, u"dcterms:modified", datetime.now())
        dataset.del_triple(dataset.uri, u"oxds:currentVersion")
        dataset.add_triple(dataset.uri, u"oxds:currentVersion", dataset.currentversion)
        dataset.sync()
        target_dataset = rdfsilo.get_item(id)
        # step 2: Unpack zip item
        if not check_file_mimetype(target_filepath, "application/zip"):
            abort(415, "File is not of type application/zip")
        try:
            unpack_zip_item(target_dataset, dataset, path, rdfsilo, ident["repoze.who.userid"])
        except BadZipfile:
            abort(400, "Couldn't unpack zipfile")
        target_dataset.sync()
        target_dataset.sync()
        target_dataset.sync()
        response.status = "204 Updated"
        response.status_int = 204
        response_message = None
        # Broadcast change; best-effort, failures ignored.
        try:
            ag.b.change(silo, id, path, ident=ident["repoze.who.userid"])
        except:
            pass
        # conneg return
        accept_list = None
        if "HTTP_ACCEPT" in request.environ:
            try:
                accept_list = conneg_parse(request.environ["HTTP_ACCEPT"])
            except:
                accept_list = [MT("text", "html")]
        if not accept_list:
            accept_list = [MT("text", "html")]
        mimetype = accept_list.pop(0)
        while mimetype:
            if str(mimetype).lower() in ["text/html", "text/xhtml"]:
                redirect(url(controller="datasets", action="datasetview", silo=silo, id=id))
            elif str(mimetype).lower() in ["text/plain", "application/json"]:
                response.content_type = "text/plain"
                return response_message
            try:
                mimetype = accept_list.pop(0)
            except IndexError:
                mimetype = None
        # Whoops - nothing satisfies - return text/plain
        response.content_type = "text/plain"
        return response_message
def replace(self, path, deposit):
    """
    Replace all the content represented by the supplied id with the supplied deposit
    Args:
    - path: the request path, interpreted into (silo, dataset id, accept params)
    - deposit: a DepositRequest object
    Return a DepositResponse containing the Deposit Receipt or a SWORD Error

    BUGFIXES:
    - the "not application/zip" debug line applied '%' to a format string
      with no conversion specifier ("Mimetype is not apllication/zip" %
      target_filepath), which raised TypeError at runtime whenever the
      deposited file was not a zip; now logs correctly.
    - the manifest file handle was opened and never closed; now closed via
      a with-block.
    """
    silo, dataset_id, accept_parameters = self.um.interpret_path(path)
    rdf_silo = self._get_authorised_rdf_silo(silo)
    # now get the dataset object itself
    dataset = rdf_silo.get_item(dataset_id)
    # deal with possible problems with the filename
    if deposit.filename is None or deposit.filename == "":
        raise SwordError(error_uri=Errors.bad_request, msg="You must supply a filename to unpack")
    if JAILBREAK.search(deposit.filename) != None:
        raise SwordError(error_uri=Errors.bad_request, msg="'..' cannot be used in the path or as a filename")
    # FIXME: at the moment this metadata operation is not supported by DataBank
    #
    # first figure out what to do about the metadata
    keep_atom = False
    # metadata_state will be used to store any state information associated
    # with a metadata update. It gets tied up with the content state and any
    # pre-existing states further down.
    metadata_state = None
    # if deposit.atom is not None:
    #     ssslog.info("Replace request has ATOM part - updating")
    #     entry_ingester = self.configuration.get_entry_ingester()(self.dao)
    #     entry_ingester.ingest(collection, id, deposit.atom)
    #     keep_atom = True
    content_state = None
    deposit_uri = None
    derived_resource_uris = []
    if deposit.content_file is not None:
        ssslog.info("Replace request has file content - updating")
        # remove all the old files before adding the new. We always leave
        # behind the metadata; this will be overwritten later if necessary
        # self.dao.remove_content(collection, id, True, keep_atom)
        # Increment the version, but do not clone the previous version.
        # An update will replace the entire contents of the container (if
        # previously unpacked) with the bagit file.
        dataset.increment_version_delta(clone_previous_version=True, copy_filenames=['manifest.rdf'])
        # store the content file
        dataset.put_stream(deposit.filename, deposit.content_file)
        ssslog.debug("New incoming file stored with filename " + deposit.filename)
        # FIXME: unpacking doesn't happen here ... (keeping for the time being for reference)
        # Broadcast to unpack and add sword:state in manifest
        # <sword:state rdf:resource="http://purl.org/net/sword/state/queuedForUnpacking"/>
        # now that we have stored the atom and the content, we can invoke a package
        # ingester over the top to extract all the metadata and any files we want.
        # Notice that we pass in the metadata_relevant flag, so the packager won't
        # overwrite the existing metadata if it isn't supposed to
        # packager = self.configuration.get_package_ingester(deposit.packaging)(self.dao)
        # derived_resources = packager.ingest(collection, id, fn, deposit.metadata_relevant)
        # ssslog.debug("Resources derived from deposit: " + str(derived_resources))
        # a list of identifiers which will resolve to the derived resources
        # derived_resource_uris = self.get_derived_resource_uris(collection, id, derived_resources)
        # An identifier which will resolve to the package just deposited
        deposit_uri = self.um.file_uri(silo, dataset_id, deposit.filename)
        ssslog.debug("Incoming file has been stored at URI " + deposit_uri)
        # register a new content state to be used
        content_state = DataBankStates.zip_file_added
    # Taken from dataset.py, seems to be the done thing when adding an item.
    # NOTE: confirmed with Anusha that this is correct
    dataset.del_triple(dataset.uri, u"dcterms:modified")
    dataset.add_triple(dataset.uri, u"dcterms:modified", datetime.now())
    dataset.del_triple(dataset.uri, u"oxds:currentVersion")
    dataset.add_triple(dataset.uri, u"oxds:currentVersion", dataset.currentversion)
    # before we do any state management, we have to be sure that the sword
    # namespace is registered
    dataset.get_rdf_manifest().add_namespace("sword", "http://purl.org/net/sword/terms/")
    dataset.sync()
    # sort out the new list of states for the item
    current_states = self._extract_states(dataset)
    new_states = []
    # for each existing state, consider whether to carry it over
    ssslog.info("new content state: " + str(content_state))
    for state_uri, state_desc in current_states:
        keep = True
        if metadata_state is not None and state_uri in DataBankStates.metadata_states:
            # we do not want the state if it is a metadata state and we have
            # been given a new metadata state
            keep = False
        if content_state is not None and state_uri in DataBankStates.content_states:
            ssslog.debug("Removing state: " + state_uri)
            # we do not want the state if it is a content state and we have
            # been given a new content state
            keep = False
        if keep:
            ssslog.debug("carrying over state: " + state_uri)
            new_states.append((state_uri, state_desc))
    # add the new metadata and content states provided from above
    if metadata_state is not None:
        new_states.append(metadata_state)
    if content_state is not None:
        ssslog.debug("adding new content state: " + str(content_state))
        new_states.append(content_state)
    ssslog.debug("New Dataset States: " + str(new_states))
    # FIXME: how safe is this? What other ore:aggregates might there be?
    # we need to back out some of the triples in preparation to update the
    # statement
    # NOTE AR: I have commented the following lines.
    # For aggregates this is not needed. put_stream will add the aggregate
    # into the URI. Why delete other triples in the manifest - ??
    # sword:originalDeposit point to isVersionOf
    aggregates = dataset.list_rdf_objects(dataset.uri, u"ore:aggregates")
    original_deposits = dataset.list_rdf_objects(dataset.uri, u"sword:originalDeposit")
    states = dataset.list_rdf_objects(dataset.uri, u"sword:state")
    for a in aggregates:
        dataset.del_triple(a, "*")
    for od in original_deposits:
        dataset.del_triple(od, "*")
    for s in states:
        dataset.del_triple(s, "*")
    dataset.del_triple(dataset.uri, u"ore:aggregates")
    dataset.del_triple(dataset.uri, u"sword:originalDeposit")
    dataset.del_triple(dataset.uri, u"sword:state")
    # FIXME: also unsafe in the same way as above
    # Write the md5 checksum into the manifest
    # A deposit contains just the new stuff so no harm in deleting all triples
    dataset.del_triple("*", u"oxds:hasMD5")
    # dataset.del_triple(deposit_uri, u"oxds:hasMD5")
    if deposit.content_md5 is not None:
        dataset.add_triple(deposit_uri, u"oxds:hasMD5", deposit.content_md5)
    dataset.sync()
    # the aggregation uri
    agg_uri = self.um.agg_uri(silo, dataset_id)
    # the Edit-URI
    edit_uri = self.um.edit_uri(silo, dataset_id)
    # FIXME: here we also need to keep existing states where relevant.
    # A state will continue to be relevant if it applies to an area of the
    # item (i.e. the container or the media resource) for which this operation
    # has no effect.
    # for example:
    # this is a metadata replace, but a status on the item is set to say that
    # the item's zip file is corrupt and needs replacing. The new status
    # should leave this alone (and probably not do anything, tbh), no matter
    # what else it does
    # create the statement outline
    # FIXME: there is something weird going on with instantiating this object
    # without the original_deposits argument. Apparently if I don't explicitly
    # say there are no original deposits, then it "remembers" original deposits
    # from previous uses of the object
    s = Statement(aggregation_uri=agg_uri, rem_uri=edit_uri, states=new_states, original_deposits=[])
    # set the original deposit (which sorts out the aggregations for us too)
    by = deposit.auth.username if deposit.auth is not None else None
    obo = deposit.auth.on_behalf_of if deposit.auth is not None else None
    if deposit_uri is not None:
        s.original_deposit(deposit_uri, datetime.now(), deposit.packaging, by, obo)
    # create the new manifest and store it
    # FIX: close the manifest file handle (was left open)
    manifest = dataset.get_rdf_manifest()
    with open(manifest.filepath, "r") as f:
        rdf_string = f.read()
    new_manifest = s.serialise_rdf(rdf_string)
    dataset.put_stream("manifest.rdf", new_manifest)
    dataset.sync()
    # FIXME: add in proper treatment here
    # now generate a receipt.
    # TODO: Include audit log instead of 'added zip to dataset'
    receipt = self.deposit_receipt(silo, dataset_id, dataset, "added zip to dataset")
    # now augment the receipt with the details of this particular deposit
    # this handles None arguments, and converts the xml receipt into a string
    receipt = self.augmented_receipt(receipt, deposit_uri, derived_resource_uris)
    # Unpack the file if it is zip
    item_real_filepath = dataset.to_dirpath()
    target_filepath = "%s/%s" % (item_real_filepath, deposit.filename)
    ssslog.info("The path for the deposit file %s is %s" % (deposit.filename, target_filepath))
    if not os.path.isfile(target_filepath):
        ssslog.debug("File not found %s" % target_filepath)
    else:
        ssslog.debug("File %s found" % target_filepath)
        if check_file_mimetype(target_filepath, 'application/zip'):
            ssslog.debug("Mimetype is application/zip for %s" % target_filepath)
        else:
            # FIX: was "Mimetype is not apllication/zip" % target_filepath,
            # which raised TypeError (no conversion specifier in the string)
            ssslog.debug("Mimetype is not application/zip for %s" % target_filepath)
    if os.path.isfile(target_filepath) and check_file_mimetype(target_filepath, 'application/zip'):
        ssslog.info("Going to unpack zipfile %s" % deposit.filename)
        target_dataset_name = dataset_id
        target_dataset = rdf_silo.get_item(target_dataset_name)
        try:
            unpack_zip_item(target_dataset, dataset, deposit.filename, rdf_silo,
                            self.auth_credentials.identity.get('repoze.who.userid'))
            target_dataset.sync()
        except BadZipfile:
            ssslog.error("Aborting with 400. BadZipfile: Couldn't unpack zipfile %s" % deposit.filename)
            abort(400, "BadZipfile: Couldn't unpack zipfile")
        except Exception as e:
            # best-effort unpack: log and continue so the deposit receipt is
            # still returned
            ssslog.error("Error unpacking. \n %s" % str(e))
    else:
        ssslog.info("Not unpacking file %s" % deposit.filename)
    # finally, assemble the deposit response and return
    dr = DepositResponse()
    dr.receipt = receipt.serialise()
    dr.location = receipt.edit_uri
    return dr