def crds(authenticated_user):
    get_local_bucket(empty=True)

    creds = StorageCreds(user=authenticated_user, service_url="storage")
    drive = Drive(creds=creds, name="test_drive")
    filepath = os.path.join(
        os.path.dirname(__file__),
        "../../../tests/data/proc_test_data/CRDS/bsd.picarro.1minute.248m.dat",
    )

    filemeta = drive.upload(filepath)

    par = PAR(location=filemeta.location(), user=authenticated_user)

    hugs = Service(service_url="hugs")
    par_secret = hugs.encrypt_data(par.secret())

    auth = Authorisation(resource="process", user=authenticated_user)

    args = {
        "authorisation": auth.to_data(),
        "par": {"data": par.to_data()},
        "par_secret": {"data": par_secret},
        "data_type": "CRDS",
        "source_name": "bsd.picarro.1minute.248m",
    }

    hugs.call_function(function="process", args=args)
def run(args): """This file is used to return a list of versions of the specified filenames """ drive_uid = str(args["drive_uid"]) authorisation = Authorisation.from_data(args["authorisation"]) filename = args["filename"] try: include_metadata = args["include_metadata"] except: include_metadata = False if include_metadata: include_metadata = True else: include_metadata = False drive = DriveInfo(drive_uid=drive_uid) versions = drive.list_versions(authorisation=authorisation, filename=filename, include_metadata=include_metadata) return_value = {} return_value["versions"] = list_to_string(versions) return return_value
def process(args):
    """Take a PAR from an uploaded file and process the data

    Args:
        args (dict): Dictionary of JSON-serialised objects to be used
        by the processing functions
    Returns:
        dict: Dictionary of the results of processing
    """
    data_type = args["data_type"]

    data_par = PAR.from_data(args["par"]["data"])
    data_secret = args["par_secret"]["data"]

    authorisation = Authorisation.from_data(args["authorisation"])
    # Verify that this process has authorisation to be called
    authorisation.verify("process")

    hugs = get_this_service(need_private_access=True)

    data_secret = hugs.decrypt_data(data_secret)
    data_filename = data_par.resolve(secret=data_secret)
    # Here we're downloading the data to the tmp directory.
    # It would be good if we could load it directly from the object store.
    data_file = data_filename.download(dir="/tmp")

    if data_type == "GC":
        precision_par = PAR.from_data(args["par"]["precision"])
        precision_secret = hugs.decrypt_data(args["par_secret"]["precision"])
        precision_filename = precision_par.resolve(secret=precision_secret)
        precision_file = precision_filename.download(dir="/tmp")

        site = args["site"]
        instrument = args["instrument"]

        # GC processing needs the data and precision files together
        data_file = (data_file, precision_file)
    else:
        site = None
        instrument = None

    overwrite = args.get("overwrite", False)

    source_name = args["source_name"]

    results = process_data(
        data_file=data_file,
        source_name=source_name,
        data_type=data_type,
        site=site,
        instrument_name=instrument,
        overwrite=overwrite,
    )

    return {"results": results}
def test_process_CRDS(authenticated_user, tempdir):
    creds = StorageCreds(user=authenticated_user, service_url="storage")
    drive = Drive(creds=creds, name="test_drive")
    filepath = os.path.join(
        os.path.dirname(__file__),
        "../../../tests/data/proc_test_data/CRDS/bsd.picarro.1minute.248m.dat",
    )

    filemeta = drive.upload(filepath)

    # Remove any stale copy from an earlier run so it can't mask the download
    Path("/tmp/bsd.picarro.1minute.248m.dat").unlink(missing_ok=True)

    par = PAR(location=filemeta.location(), user=authenticated_user)

    hugs = Service(service_url="hugs")
    par_secret = hugs.encrypt_data(par.secret())

    auth = Authorisation(resource="process", user=authenticated_user)

    args = {
        "authorisation": auth.to_data(),
        "par": {"data": par.to_data()},
        "par_secret": {"data": par_secret},
        "data_type": "CRDS",
        "source_name": "bsd.picarro.1minute.248m",
    }

    response = hugs.call_function(function="process", args=args)

    expected_keys = [
        "bsd.picarro.1minute.248m_ch4",
        "bsd.picarro.1minute.248m_co",
        "bsd.picarro.1minute.248m_co2",
    ]

    results = response["results"]["bsd.picarro.1minute.248m.dat"]

    assert sorted(results.keys()) == expected_keys
def run(args): """Open and return a new ChunkUploader that can be used to upload a file in lots of chunks """ filename = str(args["filename"]) drive_uid = str(args["drive_uid"]) try: aclrules = ACLRules.from_data(args["aclrules"]) except: aclrules = None try: authorisation = Authorisation.from_data(args["authorisation"]) except: authorisation = None try: par_uid = str(args["par_uid"]) except: par_uid = None try: secret = str(args["secret"]) except: secret = None try: pubkey = PublicKey.from_data(args["encryption_key"]) except: pubkey = None if par_uid is not None: registry = PARRegistry() (par, identifiers) = registry.load(par_uid=par_uid, secret=secret) else: par = None identifiers = None drive = DriveInfo(drive_uid=drive_uid) (filemeta, uploader) = drive.open_uploader(filename=filename, aclrules=aclrules, authorisation=authorisation, par=par, identifiers=identifiers) return { "filemeta": filemeta.to_data(), "uploader": uploader.to_data(pubkey=pubkey) }
def run(args): """Create a new PAR based on the information supplied in the passed half-created PAR, and the supplied Authorisation. This will return the URL that will need to be used by the PAR to access the required data. This will be encrypted using the supplied PublicKey """ auth = Authorisation.from_data(args["authorisation"]) par = PAR.from_data(args["par"]) secret = args["secret"] registry = PARRegistry() par_uid = registry.register(par=par, authorisation=auth, secret=secret) result = {"par_uid": par_uid} return result
def job_runner(args):
    """Service function that gets called by the Client job_runner

    Args:
        args (dict): Dictionary of variables used in setting up and running the job
    Returns:
        dict: Dictionary of data detailing the job run status, such as stdout
        and stderr output
    """
    authorisation = Authorisation.from_data(args["authorisation"])
    # Verify that this process has authorisation to be called
    authorisation.verify("job_runner")

    hugs = get_this_service(need_private_access=True)

    job_data = args["requirements"]
    # Pass the PAR through to allow its use in the control script
    job_data["par"] = args["par"]
    # Pass the decrypted PAR secret here as we're already on the server
    job_data["par_secret"] = hugs.decrypt_data(args["par_secret"])

    hostname = job_data["hostname"]
    username = job_data["username"]

    # Have we used this server before?
    known_host = job_data.get("known_host", False)

    # Decrypt the password we use to access the private key
    password = hugs.decrypt_data(args["key_password"])

    results = run_job(
        username=username,
        hostname=hostname,
        password=password,
        job_data=job_data,
        known_host=known_host,
    )

    return results
def run(args): """Admin function used to set the cluster that will be used to actually perform jobs """ try: authorisation = Authorisation.from_data(args["authorisation"]) except: authorisation = None try: passphrase = str(args["passphrase"]) except: passphrase = None cluster = Cluster.from_data(args["cluster"]) Cluster.set_cluster(cluster=cluster, authorisation=authorisation, passphrase=passphrase) return {"cluster": cluster.to_data()}
def test_process_GC(authenticated_user, tempdir):
    creds = StorageCreds(user=authenticated_user, service_url="storage")
    drive = Drive(creds=creds, name="test_drive")

    data_filepath = os.path.join(
        os.path.dirname(__file__),
        "../../../tests/data/proc_test_data/GC/capegrim-medusa.18.C",
    )
    precision_filepath = os.path.join(
        os.path.dirname(__file__),
        "../../../tests/data/proc_test_data/GC/capegrim-medusa.18.precisions.C",
    )

    # Remove any stale copies from an earlier run so they can't mask the downloads
    Path("/tmp/capegrim-medusa.18.C").unlink(missing_ok=True)
    Path("/tmp/capegrim-medusa.18.precisions.C").unlink(missing_ok=True)

    data_meta = drive.upload(data_filepath)
    precision_meta = drive.upload(precision_filepath)

    data_par = PAR(location=data_meta.location(), user=authenticated_user)
    precision_par = PAR(location=precision_meta.location(), user=authenticated_user)

    hugs = Service(service_url="hugs")

    data_secret = hugs.encrypt_data(data_par.secret())
    precision_secret = hugs.encrypt_data(precision_par.secret())

    auth = Authorisation(resource="process", user=authenticated_user)

    args = {
        "authorisation": auth.to_data(),
        "par": {"data": data_par.to_data(), "precision": precision_par.to_data()},
        "par_secret": {"data": data_secret, "precision": precision_secret},
        "data_type": "GCWERKS",
        "source_name": "capegrim-medusa",
        "site": "CGO",
        "instrument": "medusa",
    }

    response = hugs.call_function(function="process", args=args)

    result_keys = sorted(response["results"]["capegrim-medusa.18.C"].keys())[:8]

    expected_keys = [
        "capegrim-medusa.18_C4F10",
        "capegrim-medusa.18_C6F14",
        "capegrim-medusa.18_CCl4",
        "capegrim-medusa.18_CF4",
        "capegrim-medusa.18_CFC-11",
        "capegrim-medusa.18_CFC-112",
        "capegrim-medusa.18_CFC-113",
        "capegrim-medusa.18_CFC-114",
    ]

    assert result_keys == expected_keys
def create_job(
    self,
    auth_user,
    requirements,
    key_password,
    data_files,
    hugs_url=None,
    storage_url=None,
):
    """Create a job

    Args:
        auth_user (Acquire.User): Authenticated Acquire user
        requirements (dict): Dictionary containing job details and requested
        resources. The following keys are required:

            "hostname", "username", "name", "run_command", "partition",
            "n_nodes", "n_tasks_per_node", "n_cpus_per_task", "memory_req",
            "job_duration"

        where partition must be one of:

            "cpu_test", "dcv", "gpu", "gpu_veryshort", "hmem", "serial",
            "test", "veryshort"

        Example:

            requirements = {"hostname": hostname, "username": username,
                            "name": "test_job", "n_nodes": 2,
                            "n_tasks_per_node": 2, "n_cpus_per_task": 2,
                            "memory_req": "128G", ...}

        key_password (str): Password for the private key used to access the HPC

        TODO - having to pass in a password and get it through to Paramiko
        seems long-winded; is there a better way to do this?

        data_files (dict): Data file(s) to be uploaded to the cloud drive to
        run the simulation. Simulation code files should be given in the
        "app" key and data files in the "data" key
        hugs_url (str): URL of the HUGS service
        storage_url (str): URL of the storage service
    Returns:
        dict: Dictionary containing information regarding the job running on
        the resource. This will contain the PAR for access for data upload
        and download.
    """
    from Acquire.Client import (
        Drive,
        Service,
        PAR,
        Authorisation,
        StorageCreds,
        Location,
        ACLRule,
    )
    from Acquire.ObjectStore import create_uuid
    import datetime
    import os

    if self._service is None:
        raise PermissionError("Cannot use a null service")

    if storage_url is None:
        storage_url = self._service_url + "/storage"

    if hugs_url is None:
        hugs_url = self._service_url + "/hugs"

    if not isinstance(data_files["app"], list):
        data_files["app"] = [data_files["app"]]

    try:
        if not isinstance(data_files["data"], list):
            data_files["data"] = [data_files["data"]]
    except KeyError:
        pass

    # Get an authorisation to pass to the service
    hugs = Service(service_url=hugs_url)
    # Credentials to create the cloud storage drive
    creds = StorageCreds(user=auth_user, service_url=storage_url)

    # Append a shortened UUID to the job name to ensure we don't create
    # multiple drives with the same name
    short_uuid = create_uuid(short_uid=True)
    job_name = requirements["name"]
    job_name = f"{job_name.lower()}_{short_uuid}"

    # Create a cloud drive for the input and output data to be written to
    drive = Drive(creds=creds, name=job_name)

    # Now we upload the files to the cloud drive we've created for this job.
    # Files larger than this limit are sent with the chunk uploader.
    chunk_limit = 50 * 1024 * 1024

    # Store the metadata for the uploaded files
    uploaded_files = {"app": {}, "data": {}}

    # These probably won't be very big, so don't check their size
    for f in data_files["app"]:
        file_meta = drive.upload(f, dir="app")
        uploaded_files["app"][f] = file_meta

    # We might not have any data files to upload
    try:
        for f in data_files["data"]:
            filesize = os.path.getsize(f)

            if filesize < chunk_limit:
                file_meta = drive.upload(f, dir="data")
            else:
                file_meta = drive.chunk_upload(f, dir="data")

            uploaded_files["data"][f] = file_meta
    except KeyError:
        pass

    auth = Authorisation(resource="job_runner", user=auth_user)

    # Create a PAR with a long lifetime here and return a version to the user
    # and another to the server to allow writing of result data
    drive_guid = drive.metadata().guid()
    location = Location(drive_guid=drive_guid)

    # Read the duration from the requirements dictionary
    # TODO - add in some reading of the duration
    # try:
    #     duration = requirements["duration"]
    #
    par_lifetime = datetime.datetime.now() + datetime.timedelta(days=1)

    # Create an ACL rule for this PAR so we can read and write to it
    aclrule = ACLRule.owner()

    par = PAR(
        location=location,
        user=auth_user,
        aclrule=aclrule,
        expires_datetime=par_lifetime,
    )

    par_secret = par.secret()
    encrypted_par_secret = hugs.encrypt_data(par_secret)

    # Encrypt the password used to decrypt the private key that gives
    # access to the HPC cluster
    # TODO - is this a sensible way of doing this?
    encrypted_password = hugs.encrypt_data(key_password)

    par_data = par.to_data()

    args = {}
    args["authorisation"] = auth.to_data()
    args["par"] = par_data
    args["par_secret"] = encrypted_par_secret
    args["requirements"] = requirements
    args["key_password"] = encrypted_password

    function_response = self._service.call_function(function="job_runner", args=args)

    response = {}
    response["function_response"] = function_response
    response["par"] = par_data
    response["par_secret"] = par_secret
    response["upload_data"] = uploaded_files

    return response
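# A sketch of calling create_job(). The hostname, username, run command,
# duration format and file paths are illustrative placeholders, and `client`
# stands for whatever object exposes this method; only the requirements keys
# follow the docstring above.
def _example_create_job(client, auth_user, key_password):
    requirements = {
        "hostname": "login.example-hpc.ac.uk",  # assumed placeholder
        "username": "jbloggs",                  # assumed placeholder
        "name": "test_job",
        "run_command": "./run_model.sh",        # assumed placeholder
        "partition": "cpu_test",
        "n_nodes": 2,
        "n_tasks_per_node": 2,
        "n_cpus_per_task": 2,
        "memory_req": "128G",
        "job_duration": "01:00:00",             # assumed format
    }

    # Simulation code goes under "app", input data under "data"
    data_files = {"app": ["run_model.sh"], "data": ["input.nc"]}

    return client.create_job(
        auth_user=auth_user,
        requirements=requirements,
        key_password=key_password,
        data_files=data_files,
    )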
def run(args): """This file is used to return a list of the FileMeta objects of files that are in a specified drive """ drive_uid = str(args["drive_uid"]) try: authorisation = Authorisation.from_data(args["authorisation"]) except: authorisation = None try: par_uid = args["par_uid"] except: par_uid = None try: secret = args["secret"] except: secret = None try: directory = str(args["dir"]) except: directory = None try: filename = str(args["filename"]) except: filename = None try: include_metadata = args["include_metadata"] except: include_metadata = False if include_metadata: include_metadata = True else: include_metadata = False if par_uid is not None: registry = PARRegistry() (par, identifiers) = registry.load(par_uid=par_uid, secret=secret) else: par = None identifiers = None drive = DriveInfo(drive_uid=drive_uid) files = drive.list_files(authorisation=authorisation, include_metadata=include_metadata, par=par, identifiers=identifiers, dir=directory, filename=filename) return_value = {} return_value["files"] = list_to_string(files) return return_value
def process_files(
    self,
    user,
    files,
    data_type,
    source_name=None,
    overwrite=False,
    hugs_url=None,
    storage_url=None,
    datasource=None,
    site=None,
    instrument=None,
):
    """Process the passed file(s)

    Args:
        user (User): Authenticated Acquire User
        files (str, list): Path(s) of the files to be processed
        data_type (str): Type of data to be processed (CRDS, GC etc)
        source_name (str, default=None): Name of the data source. If not
        passed, it is taken from the filename.
        overwrite (bool, default=False): Overwrite any existing data
        hugs_url (str): URL of the HUGS service. Currently used for testing.
        This may be removed in the future.
        storage_url (str): URL of the storage service. Currently used for
        testing. This may be removed in the future.
        datasource (str): Datasource name or UUID
        site (str, default=None): Name of the site, as a three letter code
        or long name
        instrument (str, default=None): If no instrument name is passed we
        will attempt to find it from the filename
    Returns:
        dict: UUIDs of the Datasources storing the data of the processed
        files, keyed by filename
    """
    data_type = data_type.upper()

    if self._service is None:
        raise PermissionError("Cannot use a null service")

    if not isinstance(files, list):
        files = [files]

    if data_type == "GC":
        if not all(isinstance(item, tuple) for item in files):
            raise TypeError(
                "If data type is GC, a list of tuples of data and precision filenames must be passed"
            )
        files = [(Path(f), Path(p)) for f, p in files]
    else:
        files = [Path(f) for f in files]

    if storage_url is None:
        storage_url = self._service_url + "/storage"

    if hugs_url is None:
        hugs_url = self._service_url + "/hugs"

    # # Take the filename without the file extension
    # source_name = [os.path.splitext((filepath.name).split("/")[-1])[0] for filepath in files]

    hugs = Service(service_url=hugs_url)
    creds = StorageCreds(user=user, service_url=storage_url)
    drive = Drive(creds=creds, name="test_drive")
    auth = Authorisation(resource="process", user=user)

    # Here we need special cases for the different data types, as GC requires
    # both the data file and the precision data, and they need to be kept
    # together for processing. We can reconsider the way this is done if there
    # end up being a lot of special cases and this gets clunky.
    results = {}
    for file in files:
        if data_type == "GC":
            if source_name is None:
                source_name = file[0].stem

            if site is None:
                site = source_name.split(".")[0]

            if "-" in site and data_type == "GC":
                site = site.split("-")[0]

            filemeta = drive.upload(file[0])
            par = PAR(location=filemeta.location(), user=user)
            par_secret = hugs.encrypt_data(par.secret())

            prec_meta = drive.upload(file[1])
            prec_par = PAR(location=prec_meta.location(), user=user)
            prec_par_secret = hugs.encrypt_data(prec_par.secret())

            args = {
                "authorisation": auth.to_data(),
                "par": {"data": par.to_data(), "precision": prec_par.to_data()},
                "par_secret": {"data": par_secret, "precision": prec_par_secret},
                "data_type": data_type,
                "datasource": datasource,
                "source_name": source_name,
                "overwrite": overwrite,
                "site": site,
                "instrument": instrument,
            }
        else:
            filemeta = drive.upload(file)
            par = PAR(location=filemeta.location(), user=user)
            par_secret = hugs.encrypt_data(par.secret())

            args = {
                "authorisation": auth.to_data(),
                "par": {"data": par.to_data()},
                "par_secret": {"data": par_secret},
                "data_type": data_type,
                "datasource": datasource,
                "source_name": source_name,
                "overwrite": overwrite,
            }

        # If we try to upload many files we don't want a single file
        # containing overlapping data to fail the whole batch
        try:
            response = self._service.call_function(function="process", args=args)
            results.update(response["results"])
        except ValueError as err:
            # GC files are (data, precision) tuples, so key on the data file
            name = file[0].name if data_type == "GC" else file.name
            results[name] = err

    return results
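# A sketch of calling process_files() for both supported layouts. The `client`
# object and file paths are illustrative assumptions; GC data must be passed
# as (data, precision) tuples, other types as plain paths, as enforced above.
def _example_process_files_usage(client, user):
    # Single CRDS file: a plain path is wrapped into a list internally
    crds_results = client.process_files(
        user=user,
        files="bsd.picarro.1minute.248m.dat",
        data_type="CRDS",
    )

    # GC data: each entry pairs the data file with its precision file
    gc_results = client.process_files(
        user=user,
        files=[("capegrim-medusa.18.C", "capegrim-medusa.18.precisions.C")],
        data_type="GC",
        site="CGO",
        instrument="medusa",
    )

    return crds_results, gc_results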