def upload_request():
    """Create an asynchronous task that uploads a file to the cluster.

    Reads two form fields from the current request:

    * ``targetPath`` -- path where the file is saved in the cluster.
    * ``sourcePath`` -- path of the file in the local filesystem.

    Both values go through ``validate_input``; ``targetPath`` is additionally
    checked with ``is_valid_dir`` on the external transfer machine. On success
    a task is created, marked as queued and ``upload_task`` is started in a
    background thread that is also stored in ``storage_tasks``.

    Returns:
        A response tuple ``(body, status)`` or ``(body, status, headers)``:
        201 with ``task_id`` and ``task_url`` when the task thread was
        started, 400 on any validation or task-creation failure.
    """
    system_addr = EXT_TRANSFER_MACHINE_INTERNAL
    system_name = EXT_TRANSFER_MACHINE_PUBLIC

    targetPath = request.form.get("targetPath", None)  # path to save file in cluster
    v = validate_input(targetPath)
    if v != "":
        return jsonify(description="Failed to upload file", error=f"'targetPath' {v}"), 400

    sourcePath = request.form.get("sourcePath", None)  # path from the local FS
    v = validate_input(sourcePath)
    if v != "":
        return jsonify(description="Failed to upload file", error=f"'sourcePath' {v}"), 400

    [headers, ID] = get_tracing_headers(request)

    # checks if targetPath is a valid path
    check = is_valid_dir(targetPath, headers, system_name, system_addr)
    if not check["result"]:
        # the failing parameter here is targetPath, so report it as such
        return jsonify(description="targetPath error"), 400, check["headers"]

    # obtain new task from Tasks microservice (-1 signals a creation error)
    task_id = create_task(headers, service="storage")
    if task_id == -1:
        return jsonify(error="Error creating task"), 400

    # asynchronous task creation
    try:
        # mark the task as queued before the worker starts, so the worker's
        # own status updates are never overwritten by this one
        update_task(task_id, headers, async_task.QUEUED)

        aTask = threading.Thread(
            target=upload_task,
            name=ID,
            args=(headers, system_name, system_addr, targetPath, sourcePath, task_id),
        )
        storage_tasks[task_id] = aTask
        storage_tasks[task_id].start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_url=task_url, task_id=task_id)
        return data, 201
    except Exception as e:
        # exception objects are not JSON serializable: send the message text
        data = jsonify(error=str(e))
        return data, 400
def acct():
    """Queue an asynchronous task that runs ``sacct`` on the requested machine.

    The machine is selected with the ``X-Machine-Name`` request header and must
    be listed in ``SYSTEMS_PUBLIC``. Optional query parameters:

    * ``starttime`` / ``endtime`` -- passed as ``--starttime`` / ``--endtime``
      only when ``check_sacctTime`` accepts them; otherwise a warning is
      logged and the value is ignored.
    * ``jobs`` -- comma separated job list passed as ``--jobs``.

    Returns:
        A response tuple ``(body, status)`` or ``(body, status, headers)``:
        200 with ``task_id`` and ``task_url`` when the task thread was started,
        400 for a missing/unknown machine, an unavailable machine or a task
        error, 404 when the access test reports a permission problem.
    """
    # local import: only needed to quote the user-supplied job list
    import shlex

    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve account information",
                       error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve account information"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve account information"), 404, header
        # any other non-zero error is not treated as fatal here

    start_time_opt = ""
    end_time_opt = ""

    try:
        # check if starttime (--starttime=) param is set:
        starttime = request.args.get("starttime", "")
        if starttime != "":
            # check if starttime parameter is correctly encoded
            if check_sacctTime(starttime):
                start_time_opt = f" --starttime={starttime} "
            else:
                app.logger.warning("starttime wrongly encoded")

        # check if endtime (--endtime=) param is set:
        endtime = request.args.get("endtime", "")
        if endtime != "":
            # check if endtime parameter is correctly encoded
            if check_sacctTime(endtime):
                end_time_opt = f" --endtime={endtime} "
            else:
                app.logger.warning("endtime wrongly encoded")
    except Exception as e:
        # exception objects are not JSON serializable: send the message text
        data = jsonify(description="Failed to retrieve account information", error=str(e))
        return data, 400

    # check optional parameter jobs=jobidA,jobidB,jobidC
    jobs_opt = ""
    jobs = request.args.get("jobs", "")
    if jobs != "":
        # The value comes straight from the query string and ends up in a
        # command line: quote it so it cannot inject extra shell syntax.
        # shlex.quote leaves plain comma separated job ids untouched.
        jobs_opt = f" --jobs={shlex.quote(jobs)} "

    # sacct
    # -X so no step information is shown (ie: just jobname, not jobname.batch or jobname.0, etc)
    # --starttime={start_time_opt} starts accounting info
    # --endtime={end_time_opt} end accounting info
    # --jobs={job1,job2,job3} list of jobs to be reported
    # format: 0 - jobid  1 - partition  2 - jobname  3 - user  4 - state
    #         5 - start  6 - cputime  7 - end  8 - NNodes  9 - NodeList
    # --parsable2 = fields delimited with | and no trailing delimiter
    action = (
        f"sacct -X {start_time_opt} {end_time_opt} {jobs_opt} "
        "--format='jobid,partition,jobname,user,state,start,cputime,end,NNodes,NodeList' "
        "--noheader --parsable2"
    )

    try:
        # obtain new task from Tasks microservice
        task_id = create_task(auth_header, service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve account information",
                           error='Error creating task'), 400

        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=acct_task,
                                 args=(auth_header, system_name, system_addr, action, task_id))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200
    except Exception as e:
        data = jsonify(description="Failed to retrieve account information", error=str(e))
        return data, 400
def cancel_job(jobid):
    """Queue an asynchronous task that cancels a SLURM job with ``scancel``.

    The machine is selected with the ``X-Machine-Name`` request header and must
    be listed in ``SYSTEMS_PUBLIC``.

    Args:
        jobid: identifier of the job to cancel; it is placed on the
            ``scancel -v`` command line.

    Returns:
        A response tuple ``(body, status)`` or ``(body, status, headers)``:
        200 with ``task_id`` and ``task_url`` when the task thread was started,
        400 for a missing/unknown machine, an unavailable machine or a task
        error, 404 when the access test reports a permission problem.
    """
    # local import: only needed to quote the caller-supplied job id
    import shlex

    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to delete job", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to delete job"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to delete job"), 404, header
        # any other non-zero error is not treated as fatal here

    app.logger.info(f"Cancel SLURM job={jobid} from {system_name} ({system_addr})")

    # scancel with verbose in order to show correctly the error.
    # jobid is caller-controlled and goes into a command line, so it is quoted
    # to prevent injection of extra shell syntax; ordinary job ids are
    # left unchanged by shlex.quote.
    action = f"scancel -v {shlex.quote(str(jobid))}"

    try:
        # obtain new task from TASKS microservice.
        task_id = create_task(auth_header, service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to delete job", error='Error creating task'), 400

        # mark the task as queued BEFORE starting the worker; doing it after
        # could overwrite a status the worker has already written
        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=cancel_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200
    except Exception as e:
        # exception objects are not JSON serializable: send the message text
        data = jsonify(description="Failed to delete job", error=str(e))
        return data, 400
def list_job(jobid):
    """Queue an asynchronous task that queries one SLURM job with ``squeue``.

    The machine is selected with the ``X-Machine-Name`` request header and must
    be listed in ``SYSTEMS_PUBLIC``. The query is restricted to the user
    returned by ``get_username`` for the request's auth header.

    Args:
        jobid: identifier of the job; rejected with 400 unless ``is_jobid``
            accepts it.

    Returns:
        A response tuple ``(body, status)`` or ``(body, status, headers)``:
        200 with ``task_id`` and ``task_url`` when the task thread was started,
        400 for a missing/unknown machine, an invalid job id, an unavailable
        machine or a task error, 404 when the access test reports a
        permission problem.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve job information",
                       error="Machine does not exists"), 400, header

    # check if jobid is a valid jobid for SLURM
    if not is_jobid(jobid):
        return jsonify(description="Failed to retrieve job information",
                       error=f"{jobid} is not a valid job ID"), 400

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve job information"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve job information"), 404, header
        # any other non-zero error is not treated as fatal here

    username = get_username(auth_header)
    app.logger.info(f"Getting SLURM information of job={jobid} from {system_name} ({system_addr})")

    # squeue output is one |-separated record with the format fields, in order:
    # %i | %P | %j | %u | %T | %M | %S | %L | %D | %R
    action = (
        f"squeue -u {username} --format='%i|%P|%j|%u|%T|%M|%S|%L|%D|%R' "
        f"--noheader -j {jobid}"
    )

    try:
        # obtain new task from Tasks microservice
        task_id = create_task(auth_header, service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve job information",
                           error='Error creating task'), 400

        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        # NOTE(review): the trailing 1, 1 presumably are the page size and page
        # number expected by list_job_task -- confirm against its signature
        aTask = threading.Thread(target=list_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id, 1, 1))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200
    except Exception as e:
        # exception objects are not JSON serializable: send the message text
        data = jsonify(description="Failed to retrieve job information", error=str(e))
        return data, 400
def list_jobs():
    """Queue an asynchronous task that lists the user's SLURM jobs with ``squeue``.

    The machine is selected with the ``X-Machine-Name`` request header and must
    be listed in ``SYSTEMS_PUBLIC``. Optional query parameters:

    * ``jobs`` -- comma separated job ids; every element must be accepted by
      ``is_jobid``, otherwise the request is rejected with 400.
    * ``pageSize`` / ``pageNumber`` -- only used when both are given. Values
      that are not integers fall back to page 0 of size 25; a ``pageSize``
      other than 10, 25, 50 or 100 is replaced by 25.

    Returns:
        A response tuple ``(body, status)`` or ``(body, status, headers)``:
        200 with ``task_id`` and ``task_url`` when the task thread was started,
        400 for a missing/unknown machine, a malformed job list, an
        unavailable machine or a task error, 404 when the access test reports
        a permission problem.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve jobs information",
                       error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve jobs information"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve jobs information"), 404, header
        # any other non-zero error is not treated as fatal here

    username = get_username(auth_header)

    app.logger.info(f"Getting SLURM information of jobs from {system_name} ({system_addr})")

    # job list comma separated:
    jobs = request.args.get("jobs", None)
    pageSize = request.args.get("pageSize", None)
    pageNumber = request.args.get("pageNumber", None)

    if pageSize is not None and pageNumber is not None:
        try:
            pageNumber = int(pageNumber)
            pageSize = int(pageSize)

            if pageSize not in [10, 25, 50, 100]:
                pageSize = 25
        except ValueError:
            pageNumber = 0
            pageSize = 25
            app.logger.error("Wrong pageNumber and/or pageSize")
    else:
        # if not set, by default
        pageNumber = 0
        pageSize = 25

    # by default empty
    job_list = ""
    if jobs is not None:
        try:
            # check if input is correct:
            job_aux_list = jobs.split(",")
            if '' in job_aux_list:
                return jsonify(error="Jobs list wrong format",
                               description="Failed to retrieve job information"), 400

            for jobid in job_aux_list:
                if not is_jobid(jobid):
                    return jsonify(error=f"{jobid} is not a valid job ID",
                                   description="Failed to retrieve job information"), 400

            job_list = f"--job={jobs}"
        except Exception:
            # any failure while checking the list is reported as a bad format;
            # a bare ``except:`` would also trap SystemExit/KeyboardInterrupt
            return jsonify(error="Jobs list wrong format",
                           description="Failed to retrieve job information"), 400

    # squeue output is one |-separated record per job with the format fields,
    # in order: %i | %P | %j | %u | %T | %M | %S | %L | %D | %R
    action = f"squeue -u {username} {job_list} --format='%i|%P|%j|%u|%T|%M|%S|%L|%D|%R' --noheader"

    try:
        task_id = create_task(auth_header, service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve job information",
                           error='Error creating task'), 400

        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=list_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id,
                                       pageSize, pageNumber))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200
    except Exception as e:
        # exception objects are not JSON serializable: send the message text
        data = jsonify(description="Failed to retrieve job information", error=str(e))
        return data, 400
def submit_job_path():
    """Queue an asynchronous task that submits a batch file already on the machine.

    The machine is selected with the ``X-Machine-Name`` request header and must
    be listed in ``SYSTEMS_PUBLIC``. The form field ``targetPath`` is the path
    of the batch file; it must be present, non-empty and accepted by
    ``is_valid_file``. The directory part of ``targetPath`` is handed to
    ``submit_job_path_task`` together with the path itself.

    Returns:
        A response tuple ``(body, status)`` or ``(body, status, headers)``:
        201 with ``task_id`` and ``task_url`` when the task thread was started,
        400 for a missing/unknown machine, a missing or invalid
        ``targetPath``, an unavailable machine or a task error, 404 when the
        access test reports a permission problem.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="Failed to submit job", error="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to submit job", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to submit job"), 404, header
        # any other non-zero error is not treated as fatal here

    try:
        targetPath = request.form["targetPath"]
    except KeyError:
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter not set in request")
        return data, 400

    if targetPath is None:
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter not set in request")
        return data, 400

    if targetPath == "":
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter value is empty")
        return data, 400

    # checks if targetPath is a valid path for this user in this machine
    check = is_valid_file(targetPath, auth_header, system_name, system_addr)

    if not check["result"]:
        return jsonify(description="Failed to submit job"), 400, check["headers"]

    # creates the async task related to the job submission
    task_id = create_task(auth_header, service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job", error='Error creating task'), 400

    # if targetPath = "/home/testuser/test/sbatch.sh"
    # split by / and discard last element (the file name): ['', 'home', 'testuser', 'test']
    job_dir_splitted = targetPath.split("/")[:-1]
    # an empty component just before the file name, like in
    # "/home/testuser/test//sbatch.sh" => ['', 'home', 'testuser', 'test', ''],
    # is discarded. The list is empty when targetPath contains no "/", so it
    # must be checked before indexing to avoid an IndexError.
    if job_dir_splitted and job_dir_splitted[-1] == "":
        job_dir_splitted = job_dir_splitted[:-1]

    job_dir = "/".join(job_dir_splitted)

    try:
        # mark the task as queued BEFORE starting the worker; doing it after
        # could overwrite a status the worker has already written
        update_task(task_id, auth_header, async_task.QUEUED, TASKS_URL)

        # asynchronous task creation
        aTask = threading.Thread(target=submit_job_path_task,
                                 args=(auth_header, system_name, system_addr, targetPath, job_dir, task_id))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 201
    except Exception as e:
        # exception objects are not JSON serializable: send the message text
        data = jsonify(description="Failed to submit job", error=str(e))
        return data, 400
def submit_job_upload():
    """Queue an asynchronous task that submits an uploaded batch file.

    The machine is selected with the ``X-Machine-Name`` request header and must
    be listed in ``SYSTEMS_PUBLIC``. The batch script is read from the ``file``
    part of the request; its name is passed through ``secure_filename``. The
    job directory handed to ``submit_job_task`` is
    ``<COMPUTE_BASE_FS entry>/<username>/firecrest/<task_id>``.

    Returns:
        A response tuple ``(body, status)`` or ``(body, status, headers)``:
        201 with ``task_id`` and ``task_url`` when the task thread was started,
        400 for a missing/unknown machine, a missing or unnamed file part, an
        unavailable machine or a task error, 404 when the access test reports
        a permission problem, 413 when reading the upload raises
        ``RequestEntityTooLarge``.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to submit job file", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job file"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to submit job file"), 404, header
        # any other non-zero error is not treated as fatal here

    job_base_fs = COMPUTE_BASE_FS[system_idx]

    try:
        # check if the post request has the file part
        if 'file' not in request.files:
            app.logger.error('No batch file part')
            error = jsonify(description="Failed to submit job file", error='No batch file part')
            return error, 400

        uploaded = request.files['file']
        job_file = {'filename': secure_filename(uploaded.filename),
                    'content': uploaded.read()}

        # if user does not select file, browser also
        # submit an empty part without filename
        if job_file['filename'] == '':
            app.logger.error('No batch file selected')
            error = jsonify(description="Failed to submit job file", error='No batch file selected')
            return error, 400
    except RequestEntityTooLarge as exc:
        # named ``exc`` so the stdlib module name ``re`` is not shadowed
        app.logger.error(exc.description)
        data = jsonify(description="Failed to submit job file",
                       error=f"File is bigger than {MAX_FILE_SIZE} MB")
        return data, 413
    except Exception as e:
        # exception objects are not JSON serializable: send the message text
        data = jsonify(description="Failed to submit job file", error=str(e))
        return data, 400

    task_id = create_task(auth_header, service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job file", error='Error creating task'), 400

    # the job directory is named after the task id returned by Tasks
    tmpdir = f"{task_id}"

    username = get_username(auth_header)

    job_dir = f"{job_base_fs}/{username}/firecrest/{tmpdir}"

    app.logger.info(f"Job dir: {job_dir}")

    try:
        # mark the task as queued BEFORE starting the worker; doing it after
        # could overwrite a status the worker has already written
        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=submit_job_task,
                                 args=(auth_header, system_name, system_addr, job_file, job_dir, task_id))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 201
    except Exception as e:
        data = jsonify(description="Failed to submit job", error=str(e))
        return data, 400
def submit_job_path():
    """Queue an asynchronous task that submits a batch file already on the machine.

    The machine is selected with the ``X-Machine-Name`` request header and must
    be listed in ``SYSTEMS_PUBLIC``. Form fields:

    * ``targetPath`` -- path of the batch file; must pass ``validate_input``
      and ``is_valid_file``.
    * ``account`` -- optional; when given it must pass ``validate_input`` and
      is forwarded to ``submit_job_path_task``.

    The directory part of ``targetPath`` and the machine's
    ``USE_SPANK_PLUGIN`` entry are forwarded to the task as well.

    Returns:
        A response tuple ``(body, status)`` or ``(body, status, headers)``:
        201 with ``task_id`` and ``task_url`` when the task thread was started,
        400 for a missing/unknown machine, invalid input, an unavailable
        machine or a task error, 404 when the access test reports a
        permission problem.
    """
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="Failed to submit job", error="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to submit job", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]
    use_plugin = USE_SPANK_PLUGIN[system_idx]

    targetPath = request.form.get("targetPath", None)
    v = validate_input(targetPath)
    if v != "":
        return jsonify(description="Failed to submit job", error=f"'targetPath' {v}"), 400

    # check "account parameter"
    account = request.form.get("account", None)
    if account is not None:
        v = validate_input(account)
        if v != "":
            return jsonify(description="Invalid account", error=f"'account' {v}"), 400

    [headers, ID] = get_tracing_headers(request)

    # check if machine is accessible by user:
    resp = exec_remote_command(headers, system_name, system_addr, f"ID={ID} true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to submit job"), 404, header
        # any other non-zero error is not treated as fatal here

    # checks if targetPath is a valid path for this user in this machine
    check = is_valid_file(targetPath, headers, system_name, system_addr)

    if not check["result"]:
        return jsonify(description="Failed to submit job"), 400, check["headers"]

    # creates the async task related to the job submission
    task_id = create_task(headers, service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job", error='Error creating task'), 400

    # if targetPath = "/home/testuser/test/sbatch.sh"
    # split by / and discard last element (the file name): ['', 'home', 'testuser', 'test']
    job_dir_splitted = targetPath.split("/")[:-1]
    # an empty component just before the file name, like in
    # "/home/testuser/test//sbatch.sh" => ['', 'home', 'testuser', 'test', ''],
    # is discarded. The list is empty when targetPath contains no "/", so it
    # must be checked before indexing to avoid an IndexError.
    if job_dir_splitted and job_dir_splitted[-1] == "":
        job_dir_splitted = job_dir_splitted[:-1]

    job_dir = "/".join(job_dir_splitted)

    try:
        # mark the task as queued BEFORE starting the worker; doing it after
        # could overwrite a status the worker has already written
        update_task(task_id, headers, async_task.QUEUED, TASKS_URL)

        # asynchronous task creation
        aTask = threading.Thread(target=submit_job_path_task, name=ID,
                                 args=(headers, system_name, system_addr, targetPath, job_dir,
                                       account, use_plugin, task_id))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 201
    except Exception as e:
        # exception objects are not JSON serializable: send the message text
        data = jsonify(description="Failed to submit job", error=str(e))
        return data, 400