def generate_jobs(np_random, timeline, wall_time):
    job_dags = OrderedSet()
    tpch_size = ['2g', '5g', '10g', '20g', '50g', '80g', '100g']
    tpch_num = 22
    t = 0

    for _ in range(config.num_init_dags):
        # generate query
        query_size = tpch_size[np_random.randint(len(tpch_size))]
        query_idx = str(np_random.randint(tpch_num) + 1)
        # generate job
        job_dag = load_job(
            query_size, query_idx, wall_time, np_random)
        # job already arrived, put in job_dags
        job_dag.start_time = t
        job_dag.arrived = True
        job_dags.add(job_dag)

    for _ in range(config.num_stream_dags):
        # poisson process
        t += int(np_random.exponential(config.stream_interval))
        # uniform distribution
        query_size = tpch_size[np_random.randint(len(tpch_size))]
        query_idx = str(np_random.randint(tpch_num) + 1)
        # generate job
        job_dag = load_job(
            query_size, query_idx, wall_time, np_random)
        # push into timeline
        job_dag.start_time = t
        timeline.push(t, job_dag)

    return job_dags
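# --- usage sketch (not part of the source) ------------------------------------
# A minimal illustration of how generate_jobs is driven, assuming the Timeline,
# WallTime and config objects from the simulator modules in this file set.
# Initial jobs are returned directly (arrived=True), while streaming jobs are
# only pushed onto the timeline and surface later as JobDAG arrival events.
def _generate_jobs_example():
    np_random = np.random.RandomState(config.seed)
    wall_time = WallTime()
    timeline = Timeline()
    job_dags = generate_jobs(np_random, timeline, wall_time)
    # the initial DAGs are already marked as arrived
    assert all(job_dag.arrived for job_dag in job_dags)
    # the streaming DAGs are pending on the timeline
    assert len(timeline) == config.num_stream_dags
    return job_dags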
def run(self):
    # set up ipc communication
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    ipc_msg = IPCMessage()
    ipc_reply = IPCReply()
    os.system('rm /tmp/spark_scheduling_java_python_ipc')
    socket.bind("ipc:///tmp/spark_scheduling_java_python_ipc")

    # for reward computation
    num_active_jobs = 0
    prev_time = time.time()

    while not self.exit.is_set():
        msg = socket.recv()
        ipc_msg.ParseFromString(msg)

        if ipc_msg.msg_type == 'register':
            self.dag_db.add_new_app(ipc_msg.app_name, ipc_msg.app_id)
            job_dag = self.env.add_job_dag(ipc_msg.app_id)
            add_job_in_graph(self.graph, job_dag)
            ipc_reply.msg = \
                "external scheduler register app " + str(ipc_msg.app_name)

        elif ipc_msg.msg_type == 'bind':
            self.env.bind_exec_id(ipc_msg.app_id, ipc_msg.exec_id, ipc_msg.track_id)
            ipc_reply.msg = \
                "external scheduler bind app_id " + \
                str(ipc_msg.app_id) + " exec_id " + \
                str(ipc_msg.exec_id) + " on track_id " + \
                str(ipc_msg.track_id)

        elif ipc_msg.msg_type == 'inform':
            self.env.complete_tasks(
                ipc_msg.app_id, ipc_msg.stage_id, ipc_msg.num_tasks_left)
            ipc_reply.msg = \
                "external scheduler updated app_id " + \
                str(ipc_msg.app_id) + \
                " stage_id " + str(ipc_msg.stage_id) + \
                " with " + str(ipc_msg.num_tasks_left) + " tasks left"

        elif ipc_msg.msg_type == 'update':
            frontier_nodes_changed = \
                self.env.complete_stage(ipc_msg.app_id, ipc_msg.stage_id)
            ipc_reply.msg = \
                "external scheduler updated app_id " + \
                str(ipc_msg.app_id) + \
                " stage_id " + str(ipc_msg.stage_id)

        elif ipc_msg.msg_type == 'tracking':
            # master asks which app it should assign the executor to
            ipc_reply.app_id, ipc_reply.num_executors_to_take = \
                self.exec_tracker.pop_executor_flow(ipc_msg.num_available_executors)
            ipc_reply.msg = \
                "external scheduler moves " + \
                str(ipc_reply.num_executors_to_take) + \
                " executor to app " + ipc_reply.app_id

        elif ipc_msg.msg_type == 'consult':
            # convert ipc_msg.app_id and ipc_msg.stage_id to the corresponding
            # executors in the virtual environment and then invoke the
            # scheduling agent

            # 1. translate the raw information into the observation space
            # sort out the exec_map (where the executors are)
            exec_map = {job_dag: 0 for job_dag in self.env.job_dags}
            for app_id in self.dag_db.apps_map:
                if app_id in self.exec_tracker.executor_flow:
                    job_dag = self.dag_db.apps_map[app_id]
                    exec_map[job_dag] = self.exec_tracker.executor_flow[app_id]
            source_job = self.dag_db.apps_map[ipc_msg.app_id]

            frontier_nodes = OrderedSet()
            for job_dag in self.env.job_dags:
                for node in job_dag.frontier_nodes:
                    frontier_nodes.add(node)

            for job_dag in self.env.job_dags:
                for node in job_dag.nodes:
                    feature = np.zeros([6])
                    # number of executors already in the job
                    feature[0] = exec_map[job_dag]
                    # source executor is from the current job (locality)
                    feature[1] = job_dag is source_job
                    # number of source executors
                    feature[2] = 1
                    # remaining number of tasks in the node
                    feature[3] = node.num_tasks - node.next_task_idx
                    # average task duration of the node
                    feature[4] = node.tasks[-1].duration
                    # is the current node valid
                    feature[5] = node in frontier_nodes
                    # update feature in observation
                    self.graph.update_nodes({node: feature})

            # update the mask in the action space
            self.action_space.update_valid_set(frontier_nodes)

            # 2. gather feedback for the previous action
            curr_time = time.time()
            elapsed_time = curr_time - prev_time
            prev_reward = num_active_jobs * elapsed_time
            prev_done = False  # spark can be long running
            prev_info = {'elapsed_time': elapsed_time}
            num_active_jobs = len(self.env.job_dags)
            prev_time = curr_time

            # 3. get the action from the agent
            node = self.agent.get_action(self.graph, prev_reward, prev_done, prev_info)

            # 4. translate the action to the ipc reply
            if node is None:
                # no action was made
                ipc_reply.app_id = 'void'
                ipc_reply.stage_id = -1
            else:
                ipc_reply.app_id, ipc_reply.stage_id = \
                    self.env.spark_inverse_node_map[node]
                if node.idx not in node.job_dag.frontier_nodes:
                    # move (or keep) the executor to the job only
                    ipc_reply.stage_id = -1

            if ipc_msg.app_id != 'void' and \
               ipc_reply.app_id != 'void' and \
               ipc_msg.app_id != ipc_reply.app_id:
                # executor needs to move to another job, keep track of it
                self.exec_tracker.add_executor_flow(ipc_reply.app_id, 1)

            ipc_reply.msg = \
                "external scheduler return app_id " + str(ipc_reply.app_id) + \
                " stage_id " + str(ipc_reply.stage_id) + \
                " for exec_id " + str(ipc_msg.exec_id)

        elif ipc_msg.msg_type == 'deregister':
            job_dag = self.env.remove_job_dag(ipc_msg.app_id)
            remove_job_from_graph(self.graph, job_dag)
            self.dag_db.remove_app(ipc_msg.app_id)
            self.exec_tracker.remove_app(ipc_msg.app_id)
            ipc_reply.msg = \
                "external scheduler deregister app " + ipc_msg.app_id

        print("time:", datetime.now())
        print("msg_type:", ipc_msg.msg_type)
        print("app_name:", ipc_msg.app_name)
        print("app_id:", ipc_msg.app_id)
        print("stage_id:", ipc_msg.stage_id)
        print("executor_id:", ipc_msg.exec_id)
        print("track_id:", ipc_msg.track_id)
        print("num_available_executors:", ipc_msg.num_available_executors)
        print("num_tasks_left:", ipc_msg.num_tasks_left)
        print("reply_msg:", ipc_reply.msg)
        print("")
        sys.stdout.flush()

        socket.send(ipc_reply.SerializeToString())
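# --- usage sketch (not part of the source) ------------------------------------
# A minimal client illustrating the zmq REQ/REP protocol served by run() above,
# assuming the IPCMessage / IPCReply protobufs are importable on the Python side.
# The field values here are placeholders; in the real system the Java-side Spark
# scheduler is the peer that sends these messages.
def consult_example(app_id, exec_id):
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect("ipc:///tmp/spark_scheduling_java_python_ipc")

    ipc_msg = IPCMessage()
    ipc_msg.msg_type = 'consult'
    ipc_msg.app_id = app_id
    ipc_msg.exec_id = exec_id
    socket.send(ipc_msg.SerializeToString())

    ipc_reply = IPCReply()
    ipc_reply.ParseFromString(socket.recv())
    # app_id == 'void' / stage_id == -1 means no placement was made
    return ipc_reply.app_id, ipc_reply.stage_id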
class JobDAG(object):
    def __init__(self, nodes, adj_mat, name):
        # nodes: list of N nodes
        # adj_mat: N by N 0-1 adjacency matrix, e_ij = 1 -> edge from i to j
        assert len(nodes) == adj_mat.shape[0]
        assert adj_mat.shape[0] == adj_mat.shape[1]

        self.name = name
        self.nodes = nodes
        self.adj_mat = adj_mat

        self.num_nodes = len(self.nodes)
        self.num_nodes_done = 0

        # set of executors currently running on the job
        self.executors = OrderedSet()

        # the computation graph needs to be a DAG
        assert is_dag(self.num_nodes, self.adj_mat)

        # get the set of schedulable nodes
        self.frontier_nodes = OrderedSet()
        for node in self.nodes:
            if node.is_schedulable():
                self.frontier_nodes.add(node)

        # assign this job dag to each node
        self.assign_job_dag_to_node()

        # dag has arrived
        self.arrived = False

        # dag is completed
        self.completed = False

        # dag start time
        self.start_time = None

        # dag completion time
        self.completion_time = np.inf

        # map an executor number to an interval
        self.executor_interval_map = \
            self.get_executor_interval_map()

    def assign_job_dag_to_node(self):
        for node in self.nodes:
            node.job_dag = self

    def get_executor_interval_map(self):
        executor_interval_map = {}
        executor_data_point = [5, 10, 20, 40, 50, 60, 80, 100]
        entry_pt = 0

        # get the left-most map
        for e in range(executor_data_point[0] + 1):
            executor_interval_map[e] = \
                (executor_data_point[0], executor_data_point[0])

        # get the center map
        for i in range(len(executor_data_point) - 1):
            for e in range(executor_data_point[i] + 1,
                           executor_data_point[i + 1]):
                executor_interval_map[e] = \
                    (executor_data_point[i], executor_data_point[i + 1])
            # at the data point
            e = executor_data_point[i + 1]
            executor_interval_map[e] = \
                (executor_data_point[i + 1], executor_data_point[i + 1])

        # get the residual map
        if config.exec_cap > executor_data_point[-1]:
            for e in range(executor_data_point[-1] + 1, config.exec_cap + 1):
                executor_interval_map[e] = \
                    (executor_data_point[-1], executor_data_point[-1])

        return executor_interval_map

    def get_nodes_duration(self):
        # Warning: this is slow, O(num_nodes * num_tasks)
        # get the total duration over all nodes
        duration = 0
        for node in self.nodes:
            duration += node.get_node_duration()
        return duration

    def reset(self):
        for node in self.nodes:
            node.reset()
        self.num_nodes_done = 0
        self.executors = OrderedSet()
        self.frontier_nodes = OrderedSet()
        for node in self.nodes:
            if node.is_schedulable():
                self.frontier_nodes.add(node)
        self.arrived = False
        self.completed = False
        self.completion_time = np.inf

    def update_frontier_nodes(self, node):
        frontier_nodes_changed = False
        for child in node.child_nodes:
            if child.is_schedulable():
                if child.idx not in self.frontier_nodes:
                    self.frontier_nodes.add(child)
                    frontier_nodes_changed = True
        return frontier_nodes_changed
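# --- worked example (not part of the source) -----------------------------------
# What get_executor_interval_map() produces for the data points
# [5, 10, 20, 40, 50, 60, 80, 100]: every executor count is mapped to the pair of
# neighboring data points that bracket it, so Node.sample_executor_key can
# interpolate between the two profiled settings.
#
#   executor_interval_map[3]  == (5, 5)     # below the smallest data point
#   executor_interval_map[10] == (10, 10)   # exactly at a data point
#   executor_interval_map[15] == (10, 20)   # strictly between two data points
#   executor_interval_map[config.exec_cap] == (100, 100)  # when exec_cap > 100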
class Node(object):
    def __init__(self, idx, tasks, task_duration, wall_time, np_random):
        self.idx = idx
        self.tasks = tasks
        self.wall_time = wall_time
        self.np_random = np_random

        self.task_duration = task_duration

        self.num_tasks = len(tasks)
        self.num_finished_tasks = 0
        self.next_task_idx = 0
        self.no_more_tasks = False
        self.tasks_all_done = False
        self.node_finish_time = np.inf

        self.executors = OrderedSet()

        # uninitialized until the job dag is constructed
        self.parent_nodes = []
        self.child_nodes = []
        self.descendant_nodes = []
        self.job_dag = None

        self.assign_node_to_tasks()

    def assign_node_to_tasks(self):
        for task in self.tasks:
            task.node = self

    def get_node_duration(self):
        # Warning: this is slow, O(num_tasks)
        # get the total duration over all tasks
        duration = 0
        for task in self.tasks:
            duration += task.get_duration()
        return duration

    def is_schedulable(self):
        if self.no_more_tasks:  # no more tasks
            return False
        if self.tasks_all_done:  # node done
            return False
        for node in self.parent_nodes:
            if not node.tasks_all_done:  # a parent node is not done
                return False
        return True

    def reset(self):
        for task in self.tasks:
            task.reset()
        self.executors.clear()
        self.num_finished_tasks = 0
        self.next_task_idx = 0
        self.no_more_tasks = False
        self.tasks_all_done = False
        self.node_finish_time = np.inf

    def sample_executor_key(self, num_executors):
        (left_exec, right_exec) = \
            self.job_dag.executor_interval_map[num_executors]

        executor_key = None

        if left_exec == right_exec:
            executor_key = left_exec
        else:
            rand_pt = self.np_random.randint(1, right_exec - left_exec + 1)
            if rand_pt <= num_executors - left_exec:
                executor_key = left_exec
            else:
                executor_key = right_exec

        if executor_key not in self.task_duration['first_wave']:
            # more executors than the number of tasks in the job
            largest_key = 0
            for e in self.task_duration['first_wave']:
                if e > largest_key:
                    largest_key = e
            executor_key = largest_key

        return executor_key

    def schedule(self, executor):
        assert self.next_task_idx < self.num_tasks
        task = self.tasks[self.next_task_idx]

        # task duration is determined by the wave
        num_executors = len(self.job_dag.executors)
        assert num_executors > 0

        # sample an executor point in the data
        executor_key = self.sample_executor_key(num_executors)

        if executor.task is None or \
                executor.task.node.job_dag != task.node.job_dag:
            # the executor has never run a task in this job;
            # a fresh executor incurs a warmup delay
            if len(self.task_duration['fresh_durations'][executor_key]) > 0:
                # (1) try to directly retrieve the warmup delay from data
                fresh_durations = \
                    self.task_duration['fresh_durations'][executor_key]
                i = np.random.randint(len(fresh_durations))
                duration = fresh_durations[i]
            else:
                # (2) use the first wave but deliberately add in a warmup delay
                first_wave = \
                    self.task_duration['first_wave'][executor_key]
                i = np.random.randint(len(first_wave))
                duration = first_wave[i] + config.warmup_delay

        elif executor.task is not None and \
                executor.task.node == task.node and \
                len(self.task_duration['rest_wave'][executor_key]) > 0:
            # the executor was already working on this node,
            # so the task duration is retrieved from the rest wave
            rest_wave = self.task_duration['rest_wave'][executor_key]
            i = np.random.randint(len(rest_wave))
            duration = rest_wave[i]

        else:
            # the executor is fresh to this node, use the first wave
            if len(self.task_duration['first_wave'][executor_key]) > 0:
                # (1) try to retrieve the first wave from data
                first_wave = \
                    self.task_duration['first_wave'][executor_key]
                i = np.random.randint(len(first_wave))
                duration = first_wave[i]
            else:
                # (2) the first wave does not exist, use fresh durations instead
                # (should happen very rarely)
                fresh_durations = \
                    self.task_duration['fresh_durations'][executor_key]
                i = np.random.randint(len(fresh_durations))
                duration = fresh_durations[i]

        # detach the executor from the old node;
        # the executor being able to run the task means it is local
        # to the job at this point
        executor.detach_node()

        # schedule the task
        task.schedule(self.wall_time.curr_time, duration, executor)

        # mark the executor as running in the node
        self.executors.add(executor)
        executor.node = self

        self.next_task_idx += 1
        self.no_more_tasks = (self.next_task_idx >= self.num_tasks)

        if self.no_more_tasks:
            if self in self.job_dag.frontier_nodes:
                self.job_dag.frontier_nodes.remove(self)

        return task
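# --- illustration (not part of the source) -------------------------------------
# A standalone restatement of the branch logic in Node.schedule() above, to make
# the "wave" model explicit. It assumes a task_duration dict with the same keys
# ('fresh_durations', 'first_wave', 'rest_wave'), each mapping executor_key to a
# list of profiled durations; the helper name and arguments are illustrative only.
def sample_task_duration(task_duration, executor_key, local_to_job,
                         on_same_node, np_random, warmup_delay):
    if not local_to_job:
        # executor has never run a task in this job: warmup delay applies
        fresh = task_duration['fresh_durations'][executor_key]
        if len(fresh) > 0:
            return fresh[np_random.randint(len(fresh))]
        first = task_duration['first_wave'][executor_key]
        return first[np_random.randint(len(first))] + warmup_delay
    if on_same_node and len(task_duration['rest_wave'][executor_key]) > 0:
        # executor keeps working on the same node: rest-wave durations
        rest = task_duration['rest_wave'][executor_key]
        return rest[np_random.randint(len(rest))]
    # executor is local to the job but new to this node: first-wave durations
    first = task_duration['first_wave'][executor_key]
    if len(first) > 0:
        return first[np_random.randint(len(first))]
    fresh = task_duration['fresh_durations'][executor_key]
    return fresh[np_random.randint(len(fresh))]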
def add_job(self, job):
    self.free_executors[job] = OrderedSet()
class SparkSimEnv(core.Env):
    """
    A trace-driven simulator for the dynamics of the scheduling module in
    Apache Spark. The main intricacies needed to closely simulate the real
    system are (1) the "moving cost" of executors across jobs (due to the
    overhead of starting a new JVM); (2) the wave effect of running tasks of
    the same stage on an executor (overhead of loading data in the first wave
    of tasks); (3) the diminishing speedup in job runtime when assigning more
    executors to a job. See the reference for more details.

    * STATE *
        Graph type of observation. It consists of features associated with each
        node (a tensor of dimension n * m, where n is the number of nodes and m
        is the number of features), and an adjacency matrix (a sparse 0-1 matrix
        of dimension n * n). The features on each node are
        [number_of_executors_currently_in_this_job,
         is_current_executor_local_to_this_job,
         number_of_free_executors,
         total_work_remaining_on_this_node,
         number_of_tasks_remaining_on_this_node]

    * ACTIONS *
        Two-dimensional action:
        [node_idx_to_schedule_next, number_of_executors_to_assign]
        Note: the set of available nodes has to contain node_idx, and the number
        of executors to assign must not exceed the limit. Both the available set
        and the limit are provided in the (auxiliary) state.

    * REWARD *
        Negative time elapsed for each job in the system since the last action.
        For example, suppose the virtual time was 0 at the last action, 4 jobs
        were in the system (either waiting in the queue or being processed),
        job 1 finished at time 1, job 2 finished at time 2.4, and jobs 3 and 4
        are still running at the next action, taken at time 5. Then the reward
        is -(1 * 1 + 1 * 2.4 + 2 * 5). Thus, the sum of the rewards is the
        negative of the total (waiting + processing) time over all jobs.

    * REFERENCE *
        Section 6.2
        Learning Scheduling Algorithms for Data Processing Clusters
        H Mao, M Schwarzkopf, SB Venkatakrishnan, M Alizadeh
        https://arxiv.org/pdf/1810.01963.pdf
    """
    def __init__(self):
        # observation and action space
        self.setup_space()

        # random seed
        self.seed(config.seed)

        # global timer
        self.wall_time = WallTime()

        # uses priority queue
        self.timeline = Timeline()

        # executors
        self.executors = OrderedSet()
        for exec_id in range(config.exec_cap):
            self.executors.add(Executor(exec_id))

        # free executors
        self.free_executors = FreeExecutors(self.executors)

        # moving executors
        self.moving_executors = MovingExecutors()

        # executor commit
        self.exec_commit = ExecutorCommit()

        # prevent the agent from repeatedly selecting the same node
        self.node_selected = set()

        # for computing the reward at each step
        self.reward_calculator = RewardCalculator()

    def add_job(self, job_dag):
        self.moving_executors.add_job(job_dag)
        self.free_executors.add_job(job_dag)
        self.exec_commit.add_job(job_dag)
        add_job_in_graph(self.graph, job_dag)

    def assign_executor(self, executor, frontier_changed):
        if executor.node is not None and not executor.node.no_more_tasks:
            # keep working on the previous node
            task = executor.node.schedule(executor)
            self.timeline.push(task.finish_time, task)
        else:
            # need to move on to other nodes
            if frontier_changed:
                # frontier changed, need to consult all free executors
                # note: executor.job_dag might change after self.schedule()
                source_job = executor.job_dag
                if len(self.exec_commit[executor.node]) > 0:
                    # directly fulfill the commitment
                    self.exec_to_schedule = {executor}
                    self.schedule()
                else:
                    # free up the executor
                    self.free_executors.add(source_job, executor)
                # then consult all free executors
                self.exec_to_schedule = OrderedSet(self.free_executors[source_job])
                self.source_job = source_job
                self.num_source_exec = len(self.free_executors[source_job])
            else:
                # just need to schedule the current executor
                self.exec_to_schedule = {executor}
                # only care about executors on the node
                if len(self.exec_commit[executor.node]) > 0:
                    # directly fulfill the commitment
                    self.schedule()
                else:
                    # need to consult for ALL executors on the node
                    # Note: self.exec_to_schedule is immediate,
                    # self.num_source_exec is for commit, so
                    # len(self.exec_to_schedule) != self.num_source_exec can happen
                    self.source_job = executor.job_dag
                    self.num_source_exec = len(executor.node.executors)

    def backup_schedule(self, executor):
        backup_scheduled = False
        if executor.job_dag is not None:
            # first try to schedule on the current job
            for node in executor.job_dag.frontier_nodes:
                if not self.saturated(node):
                    # greedily schedule a frontier node
                    task = node.schedule(executor)
                    self.timeline.push(task.finish_time, task)
                    backup_scheduled = True
                    break
        # then try to schedule on any available node
        if not backup_scheduled:
            schedulable_nodes = self.get_frontier_nodes()
            if len(schedulable_nodes) > 0:
                node = next(iter(schedulable_nodes))
                self.timeline.push(
                    self.wall_time.curr_time + config.moving_delay, executor)
                # keep track of moving executors
                self.moving_executors.add(executor, node)
                backup_scheduled = True
        # at this point, if nothing is available, leave the executor idle
        if not backup_scheduled:
            self.free_executors.add(executor.job_dag, executor)

    def get_frontier_nodes(self):
        # frontier nodes := unsaturated nodes with all parent nodes saturated
        frontier_nodes = OrderedSet()
        for job_dag in self.job_dags:
            for node in job_dag.nodes:
                if node not in self.node_selected and not self.saturated(node):
                    parents_saturated = True
                    for parent_node in node.parent_nodes:
                        if not self.saturated(parent_node):
                            parents_saturated = False
                            break
                    if parents_saturated:
                        frontier_nodes.add(node)
        return frontier_nodes

    def get_executor_limits(self):
        # "minimum executor limit" for each job
        # executor limit := {job_dag -> int}
        executor_limit = {}
        for job_dag in self.job_dags:
            if self.source_job == job_dag:
                curr_exec = self.num_source_exec
            else:
                curr_exec = 0
            # note: this does not count the committed and moving executors
            executor_limit[job_dag] = len(job_dag.executors) - curr_exec
        return executor_limit

    def observe(self):
        # valid set of nodes
        frontier_nodes = self.get_frontier_nodes()

        # sort out the exec_map (where the executors are)
        exec_map = {}
        for job_dag in self.job_dags:
            exec_map[job_dag] = len(job_dag.executors)
        # count in moving executors
        for node in self.moving_executors.moving_executors.values():
            exec_map[node.job_dag] += 1
        # count in executor commit
        for s in self.exec_commit.commit:
            if isinstance(s, JobDAG):
                j = s
            elif isinstance(s, Node):
                j = s.job_dag
            elif s is None:
                j = None
            else:
                print('source', s, 'unknown')
                exit(1)
            for n in self.exec_commit.commit[s]:
                if n is not None and n.job_dag != j:
                    exec_map[n.job_dag] += self.exec_commit.commit[s][n]

        for job_dag in self.job_dags:
            for node in job_dag.nodes:
                feature = np.zeros([6])
                # number of executors already in the job
                feature[0] = exec_map[job_dag]
                # source executor is from the current job (locality)
                feature[1] = job_dag is self.source_job
                # number of source executors
                feature[2] = self.num_source_exec
                # remaining number of tasks in the node
                feature[3] = node.num_tasks - node.next_task_idx
                # average task duration of the node
                feature[4] = node.tasks[-1].duration
                # is the current node valid
                feature[5] = node in frontier_nodes
                # update feature in observation
                self.graph.update_nodes({node: feature})

        # update the mask in the action space
        self.action_space[0].update_valid_set(frontier_nodes)

        # return the graph as the observation
        obs = self.graph
        assert self.observation_space.contains(obs)
        return obs

    def saturated(self, node):
        # frontier nodes := unsaturated nodes with all parent nodes saturated
        anticipated_task_idx = node.next_task_idx + \
            self.exec_commit.node_commit[node] + \
            self.moving_executors.count(node)
        # note: anticipated_task_idx can be larger than node.num_tasks
        # when the tasks finish very fast before the commitments are fulfilled
        return anticipated_task_idx >= node.num_tasks

    def schedule(self):
        executor = next(iter(self.exec_to_schedule))
        source = executor.job_dag if executor.node is None else executor.node

        # schedule executors from the source until the commitment is fulfilled
        while len(self.exec_commit[source]) > 0 and \
              len(self.exec_to_schedule) > 0:

            # keep fulfilling the commitment using free executors
            node = self.exec_commit.pop(source)
            executor = self.exec_to_schedule.pop()

            # mark the executor as in use if it was previously free
            if self.free_executors.contain_executor(executor.job_dag, executor):
                self.free_executors.remove(executor)

            if node is None:
                # the next node is explicitly silent, make the executor idle
                if executor.job_dag is not None and \
                        any([not n.no_more_tasks for n in
                             executor.job_dag.nodes]):
                    # mark the executor as idle in its original job
                    self.free_executors.add(executor.job_dag, executor)
                else:
                    # nowhere to assign it, put the executor in the null pool
                    self.free_executors.add(None, executor)

            elif not node.no_more_tasks:
                # the node is not currently saturated
                if executor.job_dag == node.job_dag:
                    # the executor is local to the job
                    if node in node.job_dag.frontier_nodes:
                        # node is immediately runnable
                        task = node.schedule(executor)
                        self.timeline.push(task.finish_time, task)
                    else:
                        # put the executor back in the free pool
                        self.free_executors.add(executor.job_dag, executor)
                else:
                    # need to move the executor
                    self.timeline.push(
                        self.wall_time.curr_time + config.moving_delay, executor)
                    # keep track of moving executors
                    self.moving_executors.add(executor, node)

            else:
                # the node is already saturated, use the backup logic
                self.backup_schedule(executor)

    def step(self, action):

        assert self.action_space.contains(action)

        next_node, limit_idx = action

        # index starts from 0 but degree of parallelism starts from 1
        limit = limit_idx + 1

        # mark the node as selected
        assert next_node not in self.node_selected
        self.node_selected.add(next_node)
        # commit the source executor
        executor = next(iter(self.exec_to_schedule))
        source = executor.job_dag if executor.node is None else executor.node

        # compute the number of valid executors to assign
        if next_node is not None:
            use_exec = min(next_node.num_tasks - next_node.next_task_idx -
                           self.exec_commit.node_commit[next_node] -
                           self.moving_executors.count(next_node),
                           limit, self.num_source_exec)
        else:
            use_exec = self.num_source_exec
        assert use_exec > 0

        self.exec_commit.add(source, next_node, use_exec)
        # deduct the executors that know their destination
        self.num_source_exec -= use_exec
        assert self.num_source_exec >= 0

        if self.num_source_exec == 0:
            # now a new scheduling round, clean up node selection
            self.node_selected.clear()
            # all commitments are made, now schedule free executors
            self.schedule()

        # now run to the next event in the virtual timeline
        while len(self.timeline) > 0 and self.num_source_exec == 0:
            # consult the agent by putting executors in source_exec
            new_time, obj = self.timeline.pop()
            self.wall_time.update_time(new_time)

            # case Task: a task completion event frees up an executor
            # case JobDAG: a new job arrives
            # case Executor: an executor arrives at a certain job

            if isinstance(obj, Task):  # task completion event
                finished_task = obj
                node = finished_task.node
                node.num_finished_tasks += 1

                # bookkeeping for node completion
                frontier_changed = False
                if node.num_finished_tasks == node.num_tasks:
                    assert not node.tasks_all_done  # only complete once
                    node.tasks_all_done = True
                    node.job_dag.num_nodes_done += 1
                    node.node_finish_time = self.wall_time.curr_time
                    frontier_changed = node.job_dag.update_frontier_nodes(node)

                # assign a new destination for the executor
                self.assign_executor(finished_task.executor, frontier_changed)

                # bookkeeping for job completion
                if node.job_dag.num_nodes_done == node.job_dag.num_nodes:
                    assert not node.job_dag.completed  # only complete once
                    node.job_dag.completed = True
                    node.job_dag.completion_time = self.wall_time.curr_time
                    self.remove_job(node.job_dag)

            elif isinstance(obj, JobDAG):  # new job arrival event
                job_dag = obj
                # the job should arrive for the first time
                assert not job_dag.arrived
                job_dag.arrived = True
                # inform the agent about the job arrival when streaming is enabled
                self.job_dags.add(job_dag)
                self.add_job(job_dag)
                self.action_map = compute_act_map(self.job_dags)
                # assign free executors (if any) to the new job
                if len(self.free_executors[None]) > 0:
                    self.exec_to_schedule = \
                        OrderedSet(self.free_executors[None])
                    self.source_job = None
                    self.num_source_exec = \
                        len(self.free_executors[None])

            elif isinstance(obj, Executor):  # executor arrival event
                executor = obj
                # pop the destination from the tracking record
                node = self.moving_executors.pop(executor)

                if node is not None:
                    # the job is not yet done when the executor arrives
                    executor.job_dag = node.job_dag
                    node.job_dag.executors.add(executor)

                if node is not None and not node.no_more_tasks:
                    # the node is still schedulable
                    if node in node.job_dag.frontier_nodes:
                        # node is immediately runnable
                        task = node.schedule(executor)
                        self.timeline.push(task.finish_time, task)
                    else:
                        # free up the executor in this job
                        self.free_executors.add(executor.job_dag, executor)
                else:
                    # the node is saturated or the job is done by the time
                    # the executor arrives, use the backup logic
                    self.backup_schedule(executor)

            else:
                print("illegal event type")
                exit(1)

        # compute the reward
        reward = self.reward_calculator.get_reward(
            self.job_dags, self.wall_time.curr_time)

        # no more decisions to make: all jobs are done or time is up
        done = (self.num_source_exec == 0) and \
               ((len(self.timeline) == 0) or
                (self.wall_time.curr_time >= self.max_time))

        if done:
            assert self.wall_time.curr_time >= self.max_time or \
                   len(self.job_dags) == 0

        return self.observe(), reward, done, None

    def remove_job(self, job_dag):
        for executor in list(job_dag.executors):
            executor.detach_job()
        self.exec_commit.remove_job(job_dag)
        self.free_executors.remove_job(job_dag)
        self.moving_executors.remove_job(job_dag)
        self.job_dags.remove(job_dag)
        self.finished_job_dags.add(job_dag)
        remove_job_from_graph(self.graph, job_dag)
        self.action_map = compute_act_map(self.job_dags)

    def reset(self, max_time=np.inf):
        # reset the observation and action space
        self.setup_space()

        self.max_time = max_time
        self.wall_time.reset()
        self.timeline.reset()
        self.exec_commit.reset()
        self.moving_executors.reset()
        self.reward_calculator.reset()
        self.finished_job_dags = OrderedSet()
        self.node_selected.clear()
        for executor in self.executors:
            executor.reset()
        self.free_executors.reset(self.executors)
        # generate a set of new jobs
        self.job_dags = generate_jobs(
            self.np_random, self.timeline, self.wall_time)
        # map action to dag_idx and node_idx
        self.action_map = compute_act_map(self.job_dags)
        # add the initial set of jobs in the system
        for job_dag in self.job_dags:
            self.add_job(job_dag)
        # put all executors as source executors initially
        self.source_job = None
        self.num_source_exec = len(self.executors)
        self.exec_to_schedule = OrderedSet(self.executors)

        return self.observe()

    def seed(self, seed):
        self.np_random = seeding.np_random(seed)

    def setup_space(self):
        # Set up the observation and action space.
        # The boundary of the space may change if the dynamics are changed;
        # a warning message will show up every time, e.g., the observation
        # falls out of the observation space.
        self.graph = DirectedGraph()
        self.obs_node_low = np.array([0] * 6)
        self.obs_node_high = np.array(
            [config.exec_cap, 1, config.exec_cap, 1000, 100000, 1])
        self.obs_edge_low = self.obs_edge_high = np.array([])  # features on nodes only
        self.observation_space = spaces.Graph(
            node_feature_space=spaces.MultiBox(
                low=self.obs_node_low,
                high=self.obs_node_high,
                dtype=np.float32),
            edge_feature_space=spaces.MultiBox(
                low=self.obs_edge_low,
                high=self.obs_edge_high,
                dtype=np.float32))
        self.action_space = spaces.Tuple(
            (spaces.NodeInGraph(self.graph),
             spaces.MaskedDiscrete(config.num_servers)))
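# --- usage sketch (not part of the source) -------------------------------------
# A minimal rollout loop against SparkSimEnv with a trivial hand-written policy:
# always pick the first schedulable frontier node and hand it all source
# executors. It assumes the frontier is non-empty whenever the agent is
# consulted; a real agent (e.g. Decima) would instead derive the action from the
# graph observation returned by observe().
def greedy_rollout(max_time=np.inf):
    env = SparkSimEnv()
    obs = env.reset(max_time)
    total_reward = 0
    done = False
    while not done:
        frontier = env.get_frontier_nodes()
        node = next(iter(frontier))           # first valid node
        limit_idx = env.num_source_exec - 1   # offer every source executor
        obs, reward, done, _ = env.step((node, limit_idx))
        total_reward += reward
    return total_reward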
class Environment(object):
    def __init__(self, dag_db):
        self.dag_db = dag_db

        self.job_dags = OrderedSet()
        self.action_map = {}  # action index -> node
        self.available_executors = {}
        self.last_trigger = None

        # executors
        self.executors = {}
        for exec_id in range(config.exec_cap):
            self.executors[exec_id] = Executor(exec_id)

        # dynamically bind {app_id -> job_dag}
        self.spark_dag_map = {}
        # dynamically bind {job_dag -> app_id}
        self.spark_inverse_dag_map = {}
        # dynamically bind {(app_id, stage_id) -> node}
        self.spark_node_map = {}
        # dynamically bind {node -> (app_id, stage_id)}
        self.spark_inverse_node_map = {}
        # dynamically bind {app_id -> {exec_id -> re-usable track_id}}
        self.exec_id_track_id_map = {}

    def add_job_dag(self, app_id):
        job_dag = self.dag_db.apps_map[app_id]
        job_dag.arrived = True
        self.job_dags.add(job_dag)

        # update map for job_dag
        self.spark_dag_map[app_id] = job_dag
        self.spark_inverse_dag_map[job_dag] = app_id

        # update exec_id -> track_id bind map
        self.exec_id_track_id_map[app_id] = {}

        # update map for nodes
        node_idx_to_stage_id_map = self.dag_db.stage_map[app_id]
        for node in job_dag.nodes:
            stage_id = node_idx_to_stage_id_map[node.idx]
            self.spark_node_map[(app_id, stage_id)] = node
            self.spark_inverse_node_map[node] = (app_id, stage_id)

        # update map for actions
        self.action_map.clear()
        self.action_map.update(self.pre_compute_action_map())

        return job_dag

    def bind_exec_id(self, app_id, exec_id, track_id):
        assert 0 <= track_id < config.exec_cap
        self.exec_id_track_id_map[app_id][exec_id] = track_id

    def complete_stage(self, app_id, stage_id):
        node = self.spark_node_map[(app_id, stage_id)]

        # bookkeeping for node completion
        assert not node.tasks_all_done  # only complete once
        node.tasks_all_done = True
        node.job_dag.update_frontier_nodes(node)
        node.job_dag.num_nodes_done += 1

        # bookkeeping for job completion
        if node.job_dag.num_nodes_done == node.job_dag.num_nodes:
            assert not node.job_dag.completed  # only complete once
            node.job_dag.completed = True

    def complete_tasks(self, app_id, stage_id, num_tasks_left):
        node = self.spark_node_map[(app_id, stage_id)]
        prev_finished_tasks = node.num_finished_tasks
        # update the number of finished tasks for the node
        node.num_finished_tasks = node.num_tasks - num_tasks_left
        # update the next task index of the node
        node.next_task_idx += node.num_finished_tasks - prev_finished_tasks
        # remove the node from the frontier if it is saturated
        node.no_more_tasks = (node.next_task_idx >= node.num_tasks)
        if node.no_more_tasks:
            if node.idx in node.job_dag.frontier_nodes:
                del node.job_dag.frontier_nodes[node.idx]

    def pre_compute_action_map(self):
        # translate action ~ [0, num_nodes_in_all_dags) to a node object
        action_map = {}
        action = 0
        for job_dag in self.job_dags:
            for node in job_dag.nodes:
                action_map[action] = node
                action += 1
        return action_map

    def remove_job_dag(self, app_id):
        job_dag = self.dag_db.apps_map[app_id]
        self.job_dags.remove(job_dag)

        # free up the executors that the job's stages are holding
        for executor in job_dag.executors:
            executor.task = None
            executor.job_dag = None

        # update exec_id -> track_id map
        del self.exec_id_track_id_map[app_id]

        # update map for job_dag
        del self.spark_dag_map[app_id]
        del self.spark_inverse_dag_map[job_dag]

        # update map for nodes
        node_idx_to_stage_id_map = self.dag_db.stage_map[app_id]
        for node in job_dag.nodes:
            stage_id = node_idx_to_stage_id_map[node.idx]
            del self.spark_node_map[(app_id, stage_id)]
            del self.spark_inverse_node_map[node]

        # update map for actions
        self.action_map.clear()
        self.action_map.update(self.pre_compute_action_map())

        return job_dag
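# --- illustration (not part of the source) -------------------------------------
# How the dynamic bindings in Environment are used when translating between the
# agent's node objects and Spark's (app_id, stage_id) identifiers; the identifier
# values below are placeholders.
#
#   node = env.spark_node_map[('app-20180101-0001', 3)]   # Spark -> simulator node
#   app_id, stage_id = env.spark_inverse_node_map[node]   # simulator node -> Spark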
def reset(self, executors):
    self.free_executors = {}
    self.free_executors[None] = OrderedSet()
    for executor in executors:
        self.free_executors[None].add(executor)
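# --- sketch (not part of the source) --------------------------------------------
# The add_job and reset methods above belong to the FreeExecutors pool; the rest
# of the class is not shown in this section. The sketch below is an assumed
# reconstruction of the missing members, consistent with how SparkSimEnv uses the
# pool (indexing by job, add/remove, contain_executor, remove_job); the real
# implementation may differ in detail.
class FreeExecutorsSketch(object):
    def __init__(self, executors):
        self.free_executors = {None: OrderedSet()}
        for executor in executors:
            self.free_executors[None].add(executor)

    def __getitem__(self, job):
        # executors currently idle on this job (None is the global null pool)
        return self.free_executors[job]

    def contain_executor(self, job, executor):
        return executor in self.free_executors[job]

    def add(self, job, executor):
        self.free_executors[job].add(executor)

    def remove(self, executor):
        # the executor is parked under its current job's pool
        self.free_executors[executor.job_dag].remove(executor)

    def add_job(self, job):
        self.free_executors[job] = OrderedSet()

    def remove_job(self, job):
        # move any executors still parked on the job back to the null pool
        for executor in self.free_executors[job]:
            self.free_executors[None].add(executor)
        del self.free_executors[job]

    def reset(self, executors):
        self.free_executors = {None: OrderedSet()}
        for executor in executors:
            self.free_executors[None].add(executor)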