def zoo_loop(sgf_dir=None, max_jobs=40):
    """Manages creating and cleaning up match jobs.

    - Load whatever pairs didn't get queued last time, and whatever our most
      recently seen model was.
    - Loop and...
      - If a new model is detected, create and append new pairs to the list
      - Automatically queue models from a list of pairs to keep a cluster busy
      - As jobs finish, delete them from the cluster.
      - If we crash, write out the list of pairs we didn't manage to queue

    sgf_dir -- the directory where sgf eval games should be used for computing
      ratings.  When falsy, running out of pairs just sleeps instead of
      syncing ratings and asking for new suggestions.
    max_jobs -- the maximum number of concurrent jobs.  jobs * completions * 2
      should be around 500 to keep kubernetes from losing track of completions

    This function never returns normally: it loops until an exception
    (KeyboardInterrupt included) propagates, at which point the unqueued pairs
    and the model marker are saved and the exception is re-raised.
    """
    desired_pairs = restore_pairs() or []
    last_model_queued = restore_last_model()
    # The crash handler persists `last_model`; seed it with the restored
    # marker so it is always bound, even if the very first poll below raises.
    last_model = last_model_queued

    if sgf_dir:
        sgf_dir = os.path.abspath(sgf_dir)

    api_instance = get_api()
    try:
        while True:
            latest = fsdb.get_latest_pb()[0]
            if last_model_queued < latest:
                print("Adding models {} to {} to be scheduled".format(
                    last_model_queued + 1, latest))
                # Build the new pairs on the side and commit them in one step:
                # if make_pairs_for_model() fails part-way, neither the pair
                # list nor the model marker has moved, so a restart retries
                # the whole range instead of silently skipping models.
                new_pairs = []
                for m in reversed(range(last_model_queued + 1, latest + 1)):
                    new_pairs += make_pairs_for_model(m)
                desired_pairs += new_pairs
                last_model_queued = last_model = latest
                save_last_model(last_model)
            else:
                last_model = latest

            cleanup(api_instance)
            r = api_instance.list_job_for_all_namespaces()
            if len(r.items) < max_jobs:
                if not desired_pairs:
                    if not sgf_dir:
                        print("Out of pairs! Sleeping")
                        time.sleep(300)
                        continue
                    print("Out of pairs! Syncing new eval games...")
                    ratings.sync(sgf_dir)
                    print("Updating ratings and getting suggestions...")
                    add_uncertain_pairs()
                    desired_pairs = restore_pairs() or []
                    print("Got {} new pairs".format(len(desired_pairs)))
                    print(ratings.top_n())
                    if not desired_pairs:
                        # Nothing was suggested; wait and retry rather than
                        # crashing on pop() from an empty list.
                        time.sleep(300)
                        continue

                next_pair = desired_pairs.pop()  # take our pair off
                print("Enqueuing:", next_pair)
                try:
                    same_run_eval(*next_pair)
                except BaseException:
                    # Put the pair back so the crash handler below saves it.
                    desired_pairs.append(next_pair)
                    raise
                save_pairs(sorted(desired_pairs))
                save_last_model(last_model)
                time.sleep(6)  # brief pause before the next submission
            else:
                print("{}\t{} jobs outstanding. ({} to be scheduled)".format(
                    time.strftime("%I:%M:%S %p"),
                    len(r.items),
                    len(desired_pairs)))
                time.sleep(60)
    except BaseException:
        # Deliberately catches everything (Ctrl-C too): persist state, then
        # let the original exception propagate.
        print("Unfinished pairs:")
        print(sorted(desired_pairs))
        save_pairs(sorted(desired_pairs))
        save_last_model(last_model)
        raise
def zoo_loop(sgf_dir=None, max_jobs=40):
    """Manages creating and cleaning up match jobs.

    - Load whatever pairs didn't get queued last time, and whatever our most
      recently seen model was.
    - Loop and...
      - If a new model is detected, create and append new pairs to the list
      - Automatically queue models from a list of pairs to keep a cluster busy
      - As jobs finish, delete them from the cluster.
      - If we crash, write out the list of pairs we didn't manage to queue

    sgf_dir -- the directory where sgf eval games should be used for computing
      ratings.  When falsy, running out of pairs just sleeps instead of
      syncing ratings and asking for new suggestions.
    max_jobs -- kept in the signature for existing callers, but not read by
      this implementation: throttling compares the total completions requested
      by the listed jobs against the module-level MAX_TASKS / MIN_TASKS.

    When pairs run out and sgf_dir is set, new pairs come alternately from
    add_top_pairs() and add_uncertain_pairs(), starting with the former.

    This function never returns normally: it loops until an exception
    (KeyboardInterrupt included) propagates, at which point the unqueued pairs
    and the model marker are saved and the exception is re-raised.
    """
    desired_pairs = restore_pairs() or []
    random.shuffle(desired_pairs)
    last_model_queued = restore_last_model()
    # The crash handler persists `last_model`; seed it with the restored
    # marker so it is always bound, even if the very first poll below raises.
    last_model = last_model_queued

    if sgf_dir:
        sgf_dir = os.path.abspath(sgf_dir)

    api_instance = get_api()
    toggle = True  # True -> pair the top of the table next; False -> uncertain
    try:
        while True:
            latest = fsdb.get_latest_pb()[0]
            if last_model_queued < latest:
                print("Adding models {} to {} to be scheduled".format(
                    last_model_queued + 1, latest))
                # Build the new pairs on the side and commit them in one step:
                # if make_pairs_for_model() fails part-way, neither the pair
                # list nor the model marker has moved, so a restart retries
                # the whole range instead of silently skipping models.
                new_pairs = []
                for m in reversed(range(last_model_queued + 1, latest + 1)):
                    new_pairs += make_pairs_for_model(m)
                desired_pairs += new_pairs
                last_model_queued = last_model = latest
                save_last_model(last_model)
            else:
                last_model = latest

            cleanup(api_instance)
            random.shuffle(desired_pairs)
            r = api_instance.list_job_for_all_namespaces()
            # NOTE(review): the listing spans all namespaces, so a job with no
            # `completions` set may show up; it is counted as one task here so
            # the sum cannot fail on None -- confirm that is the right weight.
            tasks = sum(
                1 if item.spec.completions is None else item.spec.completions
                for item in (r.items or []))

            if tasks < MAX_TASKS:
                if not desired_pairs:
                    if not sgf_dir:
                        print("Out of pairs! Sleeping")
                        time.sleep(300)
                        continue
                    if tasks > MIN_TASKS:
                        # More than MIN_TASKS still requested: wait before
                        # syncing ratings and asking for new pairs.
                        time.sleep(60)
                        continue
                    print("Out of pairs! Syncing new eval games...")
                    ratings.sync(sgf_dir)
                    print("Updating ratings and getting suggestions...")
                    if toggle:
                        print("Pairing the top of the table.")
                        add_top_pairs()
                    else:
                        print("Pairing the least-known models.")
                        add_uncertain_pairs()
                    toggle = not toggle
                    for modelnum, rate in ratings.top_n():
                        print("{:>30}: {:0.3f} ({:0.3f})".format(
                            modelnum, rate[0], rate[1]))
                    desired_pairs = restore_pairs() or []
                    if not desired_pairs:
                        # Nothing was suggested; wait and retry rather than
                        # crashing on pop() from an empty list.
                        time.sleep(60)
                        continue

                next_pair = desired_pairs.pop()  # take our pair off
                print("Enqueuing:", next_pair)
                try:
                    same_run_eval(*next_pair)
                except BaseException:
                    # Put the pair back so the crash handler below saves it.
                    desired_pairs.append(next_pair)
                    raise
                save_pairs(sorted(desired_pairs))
                save_last_model(last_model)
                time.sleep(1)  # brief pause before the next submission
            else:
                print("{}\t {} finished / {} requested. "
                      "({} jobs, {} pairs to be scheduled)".format(
                          time.strftime("%I:%M:%S %p"),
                          sum([i.status.succeeded or 0 for i in r.items]),
                          tasks, len(r.items), len(desired_pairs)))
                time.sleep(60)
    except BaseException:
        # Deliberately catches everything (Ctrl-C too): persist state, then
        # let the original exception propagate.
        print("Unfinished pairs:")
        print(sorted(desired_pairs))
        save_pairs(sorted(desired_pairs))
        save_last_model(last_model)
        raise