def __init__(self): print("AAA") logger.debug("Executer init") # save/log current jobs, so that it can restart. self.task_info_dir = os.path.join(root_dir, "task_info") self.root_info_dir = os.path.join(root_dir, "root_info") exist_or_mkdir(self.task_info_dir) exist_or_mkdir(self.root_info_dir) # load task info for all active / queued task self.active_task_list = TaskList( os.path.join(self.root_info_dir, "active_task.json"), self.task_info_dir) self.queued_task_list = TaskList( os.path.join(self.root_info_dir, "queued_task.json"), self.task_info_dir) self.info_dict = JsonTiedDict( os.path.join(self.root_info_dir, "info.json")) tpu_info_path = os.path.join(self.root_info_dir, "tpu_info.json") self.tpu_resource = ResourceList(tpu_info_path, ["v2-tf2", "v2-tf2-2"]) self.current_task_handles = {} # task_id -> process object # task_id being in current_task_handle does NOT imply the task is active, we don't delete handles self.task_cache = {} # task_id -> TaskObj self._init_info()
# Stdlib imports this class relies on (assumed to live at the top of the file;
# JsonTiedDict, KDP, lmap, and port are project-level names defined elsewhere).
import os
import pickle
import socketserver
from typing import List, Tuple
from xmlrpc.server import SimpleXMLRPCServer, SimpleXMLRPCRequestHandler


class KDPEvalServer:
    def __init__(self):
        self.request_dir = os.environ["request_dir"]
        info_path = os.path.join(self.request_dir, "req_job_info.json")
        self.json_tied_dict = JsonTiedDict(info_path)
        self.next_job_id = self.json_tied_dict.last_id()

    def start(self):
        class RequestHandler(SimpleXMLRPCRequestHandler):
            rpc_paths = ('/RPC2',)

        class RPCThreading(socketserver.ThreadingMixIn, SimpleXMLRPCServer):
            pass

        print()
        print(" [ KDPEvalServer ]")
        print()
        print("Preparing server")
        server = RPCThreading(("0.0.0.0", port),
                              requestHandler=RequestHandler,
                              allow_none=True)
        server.register_introspection_functions()
        server.register_function(self.eval_job, 'eval_job')
        print("Waiting")
        server.serve_forever()

    def save_request(self, job_id, kdp_list: List[KDP]):
        # Write to a temp file first, then rename, so a watcher never
        # observes a partially written request.
        save_path = os.path.join(self.request_dir, str(job_id))
        temp_save_path = save_path + ".tmp"
        pickle.dump(kdp_list, open(temp_save_path, "wb"))
        os.rename(temp_save_path, save_path)

    def eval_job(self, kdp_list_raw: List[Tuple]):
        kdp_list: List[KDP] = lmap(KDP.from_state, kdp_list_raw)
        job_id = self.next_job_id
        self.save_request(job_id, kdp_list)
        self.next_job_id += 1
        self.json_tied_dict.set('last_task_id', self.next_job_id)
        return job_id
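# A minimal client-side sketch for calling this server over XML-RPC,
# assuming `port` matches the server's. KDP.to_state() is an assumed
# serializer mirroring KDP.from_state; only from_state appears above.
import xmlrpc.client


def request_eval(kdp_list) -> int:
    proxy = xmlrpc.client.ServerProxy("http://localhost:{}/RPC2".format(port))
    states = [kdp.to_state() for kdp in kdp_list]  # hypothetical serializer
    return proxy.eval_job(states)  # returns the assigned job_id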
class TFRecordMaker:
    def __init__(self):
        self.request_dir = os.environ["request_dir"]
        self.tf_record_dir = os.environ["tf_record_dir"]
        info_path = os.path.join(self.request_dir, "info.json")
        self.json_tied_dict = JsonTiedDict(info_path)
        self.next_job_id = self.json_tied_dict.last_id() + 1
        self.qck_generator: QCKGenDynamicKDP = get_qck_gen_dynamic_kdp()
        self.save_dir = os.path.join(output_path, "cppnc_auto")
        score_save_path_format = os.path.join(self.request_dir, "{}")
        self.job_runner = FileWatchingJobRunner(score_save_path_format,
                                                info_path,
                                                self.make_tfrecord,
                                                "tfrecord maker")
        print()
        print(" [ TFRecordMaker ]")
        print()

    def file_watch_daemon(self):
        self.job_runner.start()
        print("TFRecordMaker thread()")

    def make_tfrecord(self, job_id: int):
        save_path = os.path.join(self.request_dir, str(job_id))
        kdp_list = pickle.load(open(save_path, "rb"))
        data_id_manager = DataIDManager(0, 1000 * 1000)
        print("{} kdp".format(len(kdp_list)))
        insts = self.qck_generator.generate(kdp_list, data_id_manager)
        record_save_path = os.path.join(self.tf_record_dir, str(job_id))
        write_records_w_encode_fn(record_save_path,
                                  self.qck_generator.encode_fn,
                                  insts)
        # Save the data_id -> info mapping for backup
        info_save_path = os.path.join(self.tf_record_dir, "{}.info".format(job_id))
        pickle.dump(data_id_manager.id_to_info, open(info_save_path, "wb"))
        # Launch the estimator on the newly written records
        add_estimator_job(job_id)
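# A minimal sketch of wiring the pipeline together in one process, assuming
# job_runner.start() returns after spawning its watcher thread (its semantics
# are not shown above): the server accepts eval_job requests and saves them
# under request_dir, and the maker converts each saved job into tfrecords.
def run_pipeline():
    maker = TFRecordMaker()
    maker.file_watch_daemon()   # watches request_dir for new job files
    KDPEvalServer().start()     # blocks, serving eval_job requests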
import os
import subprocess
import time
from collections import Counter
from subprocess import PIPE
from typing import List, Dict

from cpath import output_path
from galagos.parse import save_queries_to_file, parse_galago_ranked_list, parse_galago_passage_ranked_list
from galagos.types import SimpleRankedListEntry, GalagoPassageRankEntry
from misc_lib import exist_or_mkdir
from taskman_client.sync import JsonTiedDict

dyn_query_dir = os.path.join(output_path, "dyn_query")
exist_or_mkdir(dyn_query_dir)
info_path = os.path.join(dyn_query_dir, "info.json")
task_info = JsonTiedDict(info_path)


class DocQuery(Dict):
    pass


class PassageQuery(Dict):
    pass


def get_new_query_json_path() -> str:
    last_query_file_idx = get_last_query_file_idx()
    new_query_id = last_query_file_idx + 1
    task_info.last_task_id = new_query_id
    return get_json_path_for_idx(new_query_id)
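# get_last_query_file_idx and get_json_path_for_idx are not shown above.
# A minimal sketch of plausible implementations, assuming query files are
# stored as "<idx>.json" under dyn_query_dir and the last issued index is
# the last_task_id tracked in task_info; the file layout is an assumption,
# not the repository's actual code.
def get_json_path_for_idx(idx: int) -> str:
    return os.path.join(dyn_query_dir, "{}.json".format(idx))


def get_last_query_file_idx() -> int:
    return task_info.last_task_id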
class Executer:
    def __init__(self):
        logger.debug("Executer init")
        # Save/log current jobs so that the executer can restart.
        self.task_info_dir = os.path.join(root_dir, "task_info")
        self.root_info_dir = os.path.join(root_dir, "root_info")
        exist_or_mkdir(self.task_info_dir)
        exist_or_mkdir(self.root_info_dir)
        # Load task info for all active / queued tasks
        self.active_task_list = TaskList(
            os.path.join(self.root_info_dir, "active_task.json"),
            self.task_info_dir)
        self.queued_task_list = TaskList(
            os.path.join(self.root_info_dir, "queued_task.json"),
            self.task_info_dir)
        self.info_dict = JsonTiedDict(
            os.path.join(self.root_info_dir, "info.json"))
        tpu_info_path = os.path.join(self.root_info_dir, "tpu_info.json")
        self.tpu_resource = ResourceList(tpu_info_path, ["v2-tf2", "v2-tf2-2"])
        self.current_task_handles = {}  # task_id -> process object
        # A task_id being in current_task_handles does NOT imply the task is
        # active; handles are never deleted.
        self.task_cache = {}  # task_id -> TaskObj
        self._init_info()

    def _get_new_task_id(self):
        new_task_id = self.info_dict.last_task_id + 1
        self.info_dict.set("last_task_id", new_task_id)
        return new_task_id

    def _get_task_info_path(self, task_id):
        return os.path.join(self.task_info_dir, "{}.json".format(task_id))

    def run(self):
        # Start the scheduler loop in a daemon thread.
        t = threading.Thread(target=self._thread)
        t.daemon = True
        t.start()

    def add_task_to_schedule(self, task):
        task.task_id = self._get_new_task_id()
        logger.debug("add_task_to_schedule() task_id={} proc_name={}".format(
            task.task_id, task.process_name))
        new_task = QueuedTask.from_task(task,
                                        self._get_task_info_path(task.task_id))
        new_task.set_status(STATUS_WAIT)
        self.queued_task_list.add(new_task)

    def remove_task(self, task_name):
        # Kill the task if it is active; otherwise drop it from the queue.
        task_obj = self._remove_task_from_active_list(task_name)
        if task_obj is None:
            task_obj = self._remove_task_from_queued_list(task_name)
        return task_obj

    def _remove_task_from_active_list(self, task_name):
        deleted_task_obj = None
        for task_obj in self.active_task_list:
            if task_obj.task_name == task_name:
                self._kill_task(task_obj)
                task_obj.set_status(STATUS_CANCELLED)
                deleted_task_obj = task_obj
                break
        # Only remove when a matching task was actually found.
        if deleted_task_obj is not None:
            self.active_task_list.remove(deleted_task_obj)
        return deleted_task_obj

    def _remove_task_from_queued_list(self, task_name):
        deleted_task_obj = None
        for task_obj in self.queued_task_list:
            if task_obj.task_name == task_name:
                task_obj.set_status(STATUS_CANCELLED)
                deleted_task_obj = task_obj
                break
        if deleted_task_obj is not None:
            self.queued_task_list.remove(deleted_task_obj)
        return deleted_task_obj

    def _kill_task(self, task_obj):
        task_id = task_obj.task_id
        p = self.current_task_handles[task_id]
        p.kill()

    def _init_info(self):
        logger.info("_init_info")
        # Re-acquire process handles for tasks that were active before a
        # restart; tasks whose processes are gone are marked complete.
        task_to_mark_complete = []
        for task_obj in self.active_task_list:
            try:
                logger.debug("Acquiring handle {}".format(task_obj.task_id))
                self.current_task_handles[task_obj.task_id] = psutil.Process(
                    task_obj.pid)
                logger.debug("Found task, task_id={} pid={}".format(
                    task_obj.task_id, task_obj.pid))
            except psutil.NoSuchProcess:
                task_to_mark_complete.append(task_obj)
        self._clean_up_completed_list(task_to_mark_complete)

    # tpu_name should already be acquired before this function is called
    # TODO: handle stdout redirection
    def _execute(self, task: Task, tpu_name=None):
        if tpu_name is not None:
            task.update_argument({"tpu_name": tpu_name})
        # With shell=True the command must be a single string; passing a
        # list would hand the extra items to the shell, not to the program.
        cmd = "{} {}".format(task.process_name, task.get_param_str())
        p = psutil.Popen(cmd, env=task.env, shell=True)
        task.pid = p.pid
        return p

    def _task_sanity_check(self, task: Task):
        # TODO: Check if related gs files are available
        # TODO: Cache information about gs files
        # TODO: Check if necessary parameters are set
        return True

    def _thread(self):
        # 1. Poll the status of active tasks via their process handles.
        # 2. If resources are available, launch the next queued task.
        logger.info("_thread")
        while True:
            self._check_active_tasks()
            self._launch_task_if_possible()
            time.sleep(1)

    def _check_active_tasks(self):
        logger.info("check_active_tasks")
        task_to_mark_complete = []
        for task_obj in self.active_task_list:
            task_process: psutil.Process = self.current_task_handles[
                task_obj.task_id]
            try:
                status = task_process.status()
                logger.info("Task {} active".format(task_obj.task_id))
            except psutil.NoSuchProcess:
                status = "dead"
                logger.info("Task {} dead".format(task_obj.task_id))
            if status == "running":
                pass
            elif status == "dead":
                task_to_mark_complete.append(task_obj)
            # TODO: Check stdout/stderr to see if the process crashed
        self._clean_up_completed_list(task_to_mark_complete)

    def _launch_task_if_possible(self):
        task_that_just_got_executed = []
        for task_obj in self.queued_task_list:
            is_ready = True
            tpu_name = None
            if task_obj.use_tpu:
                tpu_name = self.tpu_resource.assign()
                if tpu_name is None:
                    is_ready = False
            if not self._task_sanity_check(task_obj):
                is_ready = False
            if is_ready:
                p = self._execute(task_obj, tpu_name)
                task_obj.pid = p.pid
                self.current_task_handles[task_obj.task_id] = p
                task_that_just_got_executed.append(task_obj)
            else:
                # Return any resource that was assigned but not used.
                if tpu_name is not None:
                    self.tpu_resource.release(tpu_name)

        for task_obj in task_that_just_got_executed:
            logger.debug("execute() task_id={} proc_name={}".format(
                task_obj.task_id, task_obj.process_name))
            assert task_obj.pid is not None
            self.queued_task_list.remove(task_obj)
            self.active_task_list.add(task_obj)
            task_obj.set_status(STATUS_RUNNING)

    def _clean_up_completed_list(self, task_to_mark_complete):
        for task_obj in task_to_mark_complete:
            self.active_task_list.remove(task_obj)
            self._clean_up_completed(task_obj)

    def _clean_up_completed(self, task):
        logger.debug("_clean_up_completed() task_id={}".format(task.task_id))
        task.set_status(STATUS_COMPLETED)
        if task.use_tpu:
            self.tpu_resource.release(task.tpu_name)
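# A minimal usage sketch for the Executer: start the scheduler thread and
# enqueue a task. make_task() is a hypothetical factory; real Task
# construction is not shown in this file.
import time


def main():
    executer = Executer()
    executer.run()                               # spawns the polling daemon thread
    executer.add_task_to_schedule(make_task())   # hypothetical: builds a Task
    while True:
        time.sleep(60)                           # keep the main thread alive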