def handle_nodeset(self, data):
    """Apply a node's status report to the coordinator's node, task and repo tables.

    Args:
        data: The report sent by the node. This method reads ``data["name"]``
            (the node's name), ``data["activetasks"]`` (membership-tested
            against task names) and ``data["repositories"]`` (a mapping of
            repository name to a mapping holding ``"type"`` and ``"local"``).
            The whole object is stored as the node's record.
    """
    node_name = data["name"]

    # Step 1: forget the type of every repository this node reported last time.
    # The node may be in the middle of shutting some of them down. Dropping the
    # type is harmless: jobs are not aborted when a repo is lost, and a repo
    # that is still alive gets its type restored in step 4 below.
    if node_name in self.nodes:
        for stale_repo in self.nodes[node_name]["repositories"]:
            if stale_repo in self.repositories:
                self.repositories[stale_repo].pop("type", None)

    # Step 2: any task we think is running on this node but which the node does
    # not list has vanished (the node may have crashed); close it out.
    for task in self.tasks.get_tasks_running_on(node_name):
        if task["name"] in data["activetasks"]:
            continue
        task["status"] = "complete"
        task["success"] = "terminated"
        self.task_finalize_and_write(task)

    # Step 3: keep the full report as the node record. It holds more than we
    # need, but it must include the repository paths so they can be shut down
    # if the node stops responding later.
    self.nodes[node_name] = data

    # Step 4: (re)register every repository the node currently offers.
    for repo_name, reported in data["repositories"].items():
        entry = self.repositories.setdefault(repo_name, {})
        entry["type"] = reported["type"]
        entry["local"] = reported["local"]
        entry["node"] = node_name

    coordinator_db.repositories_save(self.repositories)
def task_child_completion_notify(self, taskname):
    """Release repositories held by a finished task and propagate to its parent.

    Nothing happens unless ``self.task_and_children_complete(taskname)`` is
    true. When it is, every repository whose ``"task"`` entry equals
    ``taskname`` is unlocked, the repository table is saved, and the same
    notification is sent for the task's parent (if it has one).

    Args:
        taskname: Name of the task whose completion state should be checked.
    """
    if not self.task_and_children_complete(taskname):
        # An unfinished subtree means no ancestor can be finished either,
        # so there is nothing to release and nobody to notify.
        return

    # There are few repositories, so a plain scan over all of them is fine.
    for info in self.repositories.values():
        if "task" in info and info["task"] == taskname:
            del info["task"]
            # TODO: clean up repository here
    coordinator_db.repositories_save(self.repositories)

    # Walk up the tree: the parent may have just become complete as well.
    record = self.tasks.get_task_by_name(taskname)
    if "parent" in record and record["parent"]:
        self.task_child_completion_notify(record["parent"])
def update_node(self, node):
    """Find an idle task that ``node`` can run, reserve its repositories and start it.

    Idle tasks are examined in the order returned by
    ``self.tasks.get_tasks_idle()``; the first one whose repository
    requirements can all be met is started. Each entry of
    ``task["repositories"]`` is handled according to its keys:

    * ``"request"`` present: an available, unclaimed repository whose type
      equals the requested value is picked at random. When ``"local"`` is also
      present only repositories registered on ``node`` qualify. A repository
      already picked for another request of the same task is never offered
      again, so one task cannot acquire the same repository twice.
    * only ``"local"`` present: the repository already recorded under
      ``task["environment"]["repo_<name>"]`` must be registered on ``node``.

    On success the chosen repositories are locked with the task's name and
    added to its environment, the task is marked ``"working"`` and handed to
    ``self.tasks.update_task``, and the repository table is saved.

    Args:
        node: Name of the node asking for work; compared against each
            repository's ``"node"`` entry.

    Returns:
        A ``"task-run"`` command dict (``command``, ``name``, ``path``, ``log``
        and, when the task defines them, ``executable_pyscript`` /
        ``executable_pyfile``), or ``None`` when no idle task can be started.
    """
    # TODO: look for tasks to cancel
    # TODO: test to see if we should reboot the node

    taskToStart = None
    taskToStartRepos = None
    for task in self.tasks.get_tasks_idle():
        # This should become a priority test; for now the first task whose
        # repository needs can be satisfied wins.
        chosenRepos = {}
        for requestRepoName, requestRepoData in task["repositories"].items():
            if "request" in requestRepoData:
                # Requesting an entire new repo. Skip repos already picked for
                # an earlier request of this same task so that two requests of
                # the same type can never resolve to one repository.
                alreadyChosen = set(chosenRepos.values())
                repoOptions = [
                    reponame
                    for reponame, repodata in self.repositories.items()
                    if reponame not in alreadyChosen
                    # repo is currently being used by some task
                    and "task" not in repodata
                    # repo is currently not available (no type registered)
                    and "type" in repodata
                    # repo must be of the requested kind
                    and repodata["type"] == requestRepoData["request"]
                    # a local request only accepts repos living on this node
                    and not ("local" in requestRepoData and repodata["node"] != node)
                ]
                if not repoOptions:
                    # no repo available, give up on this task
                    chosenRepos = None
                    break
                chosenRepos[requestRepoName] = random.choice(repoOptions)
            elif "local" in requestRepoData:
                # Merely verifying that an already-claimed repo is local
                envKey = "repo_" + requestRepoName
                if envKey not in task["environment"]:
                    # the task never claimed such a repo; cannot verify it
                    chosenRepos = None
                    break
                reponame = task["environment"][envKey]
                # NOTE(review): a repo name missing from self.repositories
                # passes this check; confirm that is intended.
                if reponame in self.repositories and self.repositories[reponame]["node"] != node:
                    # repo lives on another node, give up on this task
                    chosenRepos = None
                    break

        if chosenRepos is None:
            # couldn't satisfy this task's repositories; try the next one
            continue

        taskToStart = task
        taskToStartRepos = chosenRepos
        # A task has been selected; later idle tasks cannot change the outcome.
        break

    if taskToStart is None:
        return None

    # We are actually starting!
    # Lock repos and expose them to the task through its environment.
    for envname, reponame in taskToStartRepos.items():
        self.repositories[reponame]["task"] = taskToStart["name"]
        taskToStart["environment"]["repo_" + envname] = reponame

    taskToStart["node"] = node
    taskToStart["status"] = "working"
    taskToStart["time-start"] = time.time()
    taskToStart["log"] = logdir + taskToStart["name"] + ".log"
    self.tasks.update_task(taskToStart)
    coordinator_db.repositories_save(self.repositories)

    startCommand = {
        "command": "task-run",
        "name": taskToStart["name"],
        # presumably every startable task has a repository slot named "env";
        # a task without one raises KeyError here - verify against callers
        "path": taskToStart["environment"]["repo_env"],
        "log": taskToStart["log"],
    }
    # Forward whichever executable description the task carries.
    if "executable_pyscript" in taskToStart:
        startCommand["executable_pyscript"] = taskToStart["executable_pyscript"]
    if "executable_pyfile" in taskToStart:
        startCommand["executable_pyfile"] = taskToStart["executable_pyfile"]
    return startCommand