def record_worker_metrics(metric_info):
    """Record a metric received from a worker and dispatch it to the matching monitoring call."""
    metric_type = metric_info["metric_type"]
    metric_key = metric_type.lower()
    list_active_nodes = get_current_active_nodes()
    logger.info("Metric type => {0}\nMetric info => {1}".format(metric_type, metric_info))
    logger.debug("Active nodes => {0}\nNodes to scale down => {1}".format(list_active_nodes, monitoring.list_nodes_to_scale_down))
    data_back = "Metric of type {0} is received and recorded".format(metric_type)
    if metric_key == "run_job":
        monitoring.run_job(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"])
    elif metric_key == "terminate_retried_job":
        ret_val = monitoring.terminate_running_job(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"])
        data_back = ret_val if ret_val != "" else data_back
    elif metric_key == "terminate_job":
        ret_val = monitoring.terminate_job(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["start_time"])
        data_back = ret_val if ret_val != "" else data_back
    elif metric_key == "job_failed":
        ret_val = monitoring.job_failed(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["fail_time"])
        data_back = ret_val if ret_val != "" else data_back
    elif metric_key == "run_task":
        monitoring.run_task(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["task_id"])
    elif metric_key == "terminate_task":
        monitoring.terminate_task(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["task_id"], metric_info["start_time"])
    elif metric_key == "task_failed":
        monitoring.task_failed(metric_info["qworker_id"], metric_info["experiment_id"], metric_info["job_id"], metric_info["task_id"], metric_info["fail_time"])
    else:
        data_back = "The metric of type {0} did not match any known metric type".format(metric_type)
    return data_back
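For reference, a minimal sketch of how this dispatcher might be exercised; the payload below is hypothetical and only contains the keys the "run_task" branch reads:

# Hypothetical payload (values are made up); only the keys used by the "run_task" branch are included.
sample_metric = {
    "metric_type": "run_task",
    "qworker_id": "worker-1",
    "experiment_id": "exp-42",
    "job_id": "job-7",
    "task_id": "task-0",
}
print(record_worker_metrics(sample_metric))  # -> "Metric of type run_task is received and recorded"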
Example #2
def add(self, job_id, task_command, job_params):
    job_start_time = time.time()

    worker_id = self.request.hostname.split("@")[1]
    node_id, service_name, container_id = worker_id.split("##")

    monitoring.run_job(node_id, service_name, worker_id, job_id)

    log_file = "./log/" + self.request.hostname + ".log"

    with open(log_file, "a") as myfile:
        myfile.write("node_id: " + jqw.node_id + "\n")
        myfile.write("worker_id: " + worker_id + "\n")
        myfile.write("New Job: " + job_id + "\n")
        myfile.write("Command: " + str(task_command) + "\n")
        myfile.write("Parameters: " + str(len(job_params)) + "\n")
        for task_params in job_params:
            myfile.write("Parameters: " + str(task_params) + "\n")
        myfile.write("-------------------------------------\n")
        # Artificial pause (10-99 seconds) before the tasks are executed
        time.sleep(random.randrange(10, 100))
        for task_params in job_params:
            task_start_time = time.time()
            task_id = task_params["id"]
            task_data = task_params["data"]
            monitoring.run_task(node_id, service_name, worker_id, job_id,
                                task_id)
            command = ['docker', 'exec', container_id] + task_command + task_data
            output = subprocess.check_output(command)
            monitoring.terminate_task(node_id, service_name, worker_id, job_id,
                                      task_id, task_start_time)
            myfile.write("output: " + str(output) + "\n")
    monitoring.terminate_job(node_id, service_name, worker_id, job_id,
                             job_start_time)
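The two split() calls at the top of add() assume a Celery-style hostname of the form name@node##service##container. A minimal sketch with a made-up hostname:

# Hypothetical hostname; mirrors the parsing done in add() above.
hostname = "celery@node-1##qworker##0123456789ab"
worker_id = hostname.split("@")[1]                            # "node-1##qworker##0123456789ab"
node_id, service_name, container_id = worker_id.split("##")   # ("node-1", "qworker", "0123456789ab")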
Example #3
def process_list(worker_id, exp_id, job_queue_id, job, job_start_time):
    output = ""

    # A pre-job script might be added here

    # Go through the tasks and execute them sequentially
    for task in job["tasks"]:
        # Fall back to the job-level command/data when the task does not define its own
        task_command = task.get("command", job["command"])
        task_data = task.get("data", job["data"])

        task_start_time = time.time()
        monitoring.run_task(worker_id, exp_id, job["id"], task["id"])

        command = (
            ["docker", "exec", getContainerID(worker_id)] + task_command +
            task_data)
        output = subprocess.check_output(command)
        monitoring.terminate_task(worker_id, exp_id, job["id"], task["id"],
                                  task_start_time)

    # A post-job script might be added here

    if isinstance(output, bytes):
        output = output.decode()

    return output
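A hypothetical job document that satisfies the lookups in process_list; task-level command/data fall back to the job-level values when missing:

# Hypothetical structure (names are made up); keys mirror those read by process_list.
job = {
    "id": "job-7",
    "command": ["python", "run.py"],
    "data": ["--input", "a.csv"],
    "tasks": [
        {"id": "t0"},  # inherits the job-level command and data
        {"id": "t1", "command": ["python", "run.py"], "data": ["--input", "b.csv"]},
    ],
}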
Example #4
def process_array(worker_id, exp_id, job_queue_id, job, job_start_time):
    output = ""
    tasks = job["tasks"]
    # Fall back to the job-level command/data when the task array does not define its own
    tasks.setdefault("command", job["command"])
    tasks.setdefault("data", job["data"])

    # A pre-job script might be added here

    # Go through the tasks and execute them sequentially

    for x in range(0, tasks["count"]):
        task_start_time = time.time()
        task_id = tasks["id"] + "_" + str(x)
        monitoring.run_task(worker_id, exp_id, job["id"], task_id)
        command = (
            ["docker", "exec", getContainerID(worker_id)] + tasks["command"] +
            [str(tasks["data"])])
        output = subprocess.check_output(command)
        monitoring.terminate_task(worker_id, exp_id, job["id"], task_id,
                                  task_start_time)

    # A post-job script might be added here

    return output
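For comparison, the array variant expects job["tasks"] to be a single dict with a count; a hypothetical shape:

# Hypothetical structure; task ids become "<id>_<index>" over count iterations,
# and data is stringified into a single command-line argument.
job = {
    "id": "job-8",
    "command": ["python", "run.py"],
    "data": "42",
    "tasks": {"id": "t", "count": 4},  # command/data fall back to the job-level values
}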
Example #5
def process_array(worker_id, exp_id, job_queue_id, job, myfile,
                  job_start_time):
    output = ""
    tasks = job['tasks']
    # Fall back to the job-level command/data when the task array does not define its own
    tasks.setdefault('command', job['command'])
    tasks.setdefault('data', job['data'])

    myfile.write("Task: " + str(tasks) + "\n")
    #	try:
    for x in range(0, tasks['count']):
        myfile.write("-------------------------------------\n")
        task_start_time = time.time()
        task_id = tasks["id"] + "_" + str(x)
        monitoring.run_task(getNodeID(worker_id), exp_id,
                            getServiceName(worker_id), worker_id, job['id'],
                            task_id)
        command = [
            'docker', 'exec', getContainerID(worker_id)
        ] + tasks['command'] + [str(tasks['data']), str(worker_id)]
        print(worker_id + " - Running Task : " + str(command))
        output = subprocess.check_output(command)
        monitoring.terminate_task(getNodeID(worker_id), exp_id,
                                  getServiceName(worker_id), worker_id,
                                  job['id'], task_id, task_start_time)
    '''	
	except subprocess.CalledProcessError as e:
		monitoring.job_failed(getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job_queue_id, job_start_time)
		print("///////////////////// Array //////////////////////////")
		print(str(e))
		print("/////////////////////////////////////////////////////")
		raise Reject(e, requeue=True)
		print("I' killing the process since my container is dead :(")
		sys.exit(0)
	except Exception as e:
#		monitoring.task_failed(getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job_queue_id, task_id, task_start_time)
		monitoring.job_failed(getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job_queue_id, job_start_time)
		print("///////////////////// Array /////////////////////////")
		print(str(e))
		print("/////////////////////////////////////////////////////")
		raise Reject(e, requeue=True)
	'''
    myfile.write("output: " + str(output) + "\n")
    print(worker_id + " - Output: " + str(output))
    return output