def record_worker_metrics(metric_info):
    """Record a metric received from a worker."""
    metric_type = metric_info["metric_type"]
    list_active_nodes = get_current_active_nodes()
    logger.info("Metric type => {0} \n Metric info => {1}".format(metric_type, metric_info))
    logger.debug("Active nodes => {0} \n Nodes to scale_down => {1}".format(
        list_active_nodes, monitoring.list_nodes_to_scale_down))

    data_back = "Metric of type {0} is received and recorded".format(metric_type)

    # Dispatch to the matching monitoring call; job-level calls may return a
    # message that replaces the default reply.
    if metric_type.lower() == "run_job":
        monitoring.run_job(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"])
    elif metric_type.lower() == "terminate_retried_job":
        ret_val = monitoring.terminate_running_job(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"])
        data_back = ret_val if ret_val != "" else data_back
    elif metric_type.lower() == "terminate_job":
        ret_val = monitoring.terminate_job(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["start_time"])
        data_back = ret_val if ret_val != "" else data_back
    elif metric_type.lower() == "job_failed":
        ret_val = monitoring.job_failed(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["fail_time"])
        data_back = ret_val if ret_val != "" else data_back
    elif metric_type.lower() == "run_task":
        monitoring.run_task(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["task_id"])
    elif metric_type.lower() == "terminate_task":
        monitoring.terminate_task(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["task_id"], metric_info["start_time"])
    elif metric_type.lower() == "task_failed":
        monitoring.task_failed(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["task_id"], metric_info["fail_time"])
    else:
        data_back = "The metric of type {} didn't match with any known metric types".format(metric_type)
    return data_back
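# Illustrative only: a minimal sketch of the payload shape this dispatcher expects.
# The field names mirror the lookups in record_worker_metrics above; the concrete
# values (worker/experiment/job/task ids) are hypothetical.
def _example_record_metric():
    example_metric = {
        "metric_type": "terminate_task",
        "qworker_id": "worker-1",      # hypothetical worker id
        "experiment_id": "exp-42",     # hypothetical experiment id
        "job_id": "job-7",
        "task_id": "task-3",
        "start_time": time.time(),     # start times are epoch seconds elsewhere in this code
    }
    return record_worker_metrics(example_metric)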
def add(self, job_id, task_command, job_params):
    job_start_time = time.time()
    worker_id = self.request.hostname.split("@")[1]
    node_id, service_name, container_id = worker_id.split("##")
    monitoring.run_job(node_id, service_name, worker_id, job_id)

    log_file = "./log/" + self.request.hostname + ".log"
    with open(log_file, "a") as myfile:
        myfile.write("node_id: " + node_id + "\n")
        myfile.write("worker_id: " + worker_id + "\n")
        myfile.write("New Job: " + job_id + "\n")
        myfile.write("Command: " + str(task_command) + "\n")
        myfile.write("Parameters: " + str(len(job_params)) + "\n")
        for task_params in job_params:
            myfile.write("Parameters: " + str(task_params) + "\n")
        myfile.write("-------------------------------------\n")

        time.sleep(random.randrange(10, 100))

        # Run each task inside the worker's container and record its lifetime.
        for task_params in job_params:
            task_start_time = time.time()
            task_id = task_params["id"]
            task_data = task_params["data"]
            monitoring.run_task(node_id, service_name, worker_id, job_id, task_id)
            command = ["docker", "exec", container_id] + task_command + task_data
            output = subprocess.check_output(command)
            # Record task completion only after the command has finished.
            monitoring.terminate_task(node_id, service_name, worker_id, job_id, task_id, task_start_time)
            myfile.write("output: " + str(output) + "\n")

    monitoring.terminate_job(node_id, service_name, worker_id, job_id, job_start_time)
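# The task above assumes a Celery hostname of the form "<prefix>@<node>##<service>##<container>".
# A small, hypothetical helper that makes that convention explicit; the delimiters come from
# the split() calls in add(), everything else is an assumption.
def parse_worker_hostname(hostname):
    """Return (worker_id, node_id, service_name, container_id) for a worker hostname."""
    worker_id = hostname.split("@")[1]
    node_id, service_name, container_id = worker_id.split("##")
    return worker_id, node_id, service_name, container_id

# e.g. parse_worker_hostname("celery@node1##qworker##a1b2c3")
#      -> ("node1##qworker##a1b2c3", "node1", "qworker", "a1b2c3")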
def process_list(worker_id, exp_id, job_queue_id, job, job_start_time):
    output = ""
    # A pre-job script might be added here

    # Go through the tasks and execute them sequentially
    for task in job["tasks"]:
        # Fall back to the job-level command/data when the task does not define its own.
        task_command = task.get("command", job["command"])
        task["data"] = task.get("data", job["data"])

        task_start_time = time.time()
        monitoring.run_task(worker_id, exp_id, job["id"], task["id"])
        command = (["docker", "exec", getContainerID(worker_id)]
                   + task_command + task["data"])
        output = subprocess.check_output(command)
        monitoring.terminate_task(worker_id, exp_id, job["id"], task["id"], task_start_time)

    # A post-job script might be added here
    if isinstance(output, bytes):
        output = output.decode()
    return output
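# process_list (and the array variants below) call a getContainerID(worker_id) helper that is
# not shown here. A minimal sketch, assuming the convention used by add() above, where the
# worker id embeds the container id as its last "##"-separated segment; the real helper may
# instead query Docker or a service registry.
def getContainerID(worker_id):
    """Hypothetical: extract the container id from a 'node##service##container' worker id."""
    return worker_id.split("##")[-1]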
def process_array(worker_id, exp_id, job_queue_id, job, job_start_time):
    output = ""
    tasks = job["tasks"]
    # Fall back to the job-level command/data when the array does not define its own.
    tasks["command"] = tasks.get("command", job["command"])
    tasks["data"] = tasks.get("data", job["data"])

    # A pre-job script might be added here

    # Go through the array elements and execute them sequentially
    for x in range(0, tasks["count"]):
        task_start_time = time.time()
        task_id = tasks["id"] + "_" + str(x)
        monitoring.run_task(worker_id, exp_id, job["id"], task_id)
        command = (["docker", "exec", getContainerID(worker_id)]
                   + tasks["command"] + [str(tasks["data"])])
        output = subprocess.check_output(command)
        monitoring.terminate_task(worker_id, exp_id, job["id"], task_id, task_start_time)

    # A post-job script might be added here
    return output
def process_array(worker_id, exp_id, job_queue_id, job, myfile, job_start_time):
    output = ""
    tasks = job['tasks']
    # Fall back to the job-level command/data when the array does not define its own.
    tasks['command'] = tasks.get('command', job['command'])
    tasks['data'] = tasks.get('data', job['data'])

    myfile.write("Task: " + str(tasks) + "\n")
    # try:
    for x in range(0, tasks['count']):
        myfile.write("-------------------------------------\n")
        task_start_time = time.time()
        task_id = tasks["id"] + "_" + str(x)
        monitoring.run_task(getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job['id'], task_id)
        # Pass the task data and the worker id as separate arguments to the container command.
        command = (['docker', 'exec', getContainerID(worker_id)]
                   + tasks['command'] + [str(tasks["data"]), str(worker_id)])
        print(worker_id + " - Running Task : " + str(command))
        output = subprocess.check_output(command)
        monitoring.terminate_task(getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job['id'], task_id, task_start_time)
    '''
    except subprocess.CalledProcessError as e:
        monitoring.job_failed(getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job_queue_id, job_start_time)
        print("///////////////////// Array //////////////////////////")
        print(str(e))
        print("/////////////////////////////////////////////////////")
        raise Reject(e, requeue=True)
        print("I'm killing the process since my container is dead :(")
        sys.exit(0)
    except Exception as e:
        # monitoring.task_failed(getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job_queue_id, task_id, task_start_time)
        monitoring.job_failed(getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job_queue_id, job_start_time)
        print("///////////////////// Array /////////////////////////")
        print(str(e))
        print("/////////////////////////////////////////////////////")
        raise Reject(e, requeue=True)
    '''
    myfile.write("output: " + str(output) + "\n")
    print(worker_id + " - Output: " + str(output))
    return output
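# If the disabled error handling above were re-enabled, one option is to wrap just the
# subprocess call, so a single failing task marks the job as failed and requeues the Celery
# message. A sketch only, assuming Reject comes from celery.exceptions and that
# monitoring.job_failed has the signature used in the commented-out block.
from celery.exceptions import Reject

def run_task_or_fail(command, worker_id, exp_id, job_queue_id, job_start_time):
    """Hypothetical helper: run one task command, flag the job as failed on error."""
    try:
        return subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        monitoring.job_failed(getNodeID(worker_id), exp_id, getServiceName(worker_id),
                              worker_id, job_queue_id, job_start_time)
        # Requeue the message so the job can be retried on another worker.
        raise Reject(e, requeue=True)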