示例#1
0
def zoo_loop(sgf_dir=None, max_jobs=40):
    """Manages creating and cleaning up match jobs.

    - Load whatever pairs didn't get queued last time, and whatever our most
      recently seen model was.
    - Loop and...
        - If a new model is detected, create and append new pairs to the list
        - Automatically queue models from a list of pairs to keep a cluster
          busy
        - As jobs finish, delete them from the cluster.
        - If we crash, write out the list of pairs we didn't manage to queue

    sgf_dir -- the directory where sgf eval games should be used for computing
      ratings.  When unset, the loop just sleeps whenever it runs out of
      pairs instead of asking the ratings module for more.
    max_jobs -- the maximum number of concurrent jobs.  jobs * completions * 2
      should be around 500 to keep kubernetes from losing track of completions

    This function never returns normally.  Any exception (including
    KeyboardInterrupt) is re-raised after the unqueued pairs and the last
    seen model have been saved.
    """
    desired_pairs = restore_pairs() or []
    last_model_queued = restore_last_model()
    # Bind last_model before entering the try block so the crash handler can
    # always persist it, even if the very first fsdb lookup raises.
    last_model = last_model_queued

    if sgf_dir:
        sgf_dir = os.path.abspath(sgf_dir)

    api_instance = get_api()
    try:
        while True:
            last_model = fsdb.get_latest_pb()[0]
            if last_model_queued < last_model:
                print("Adding models {} to {} to be scheduled".format(
                    last_model_queued+1, last_model))
                # Newest model's pairs are appended first; pairs are later
                # taken from the end of the list with pop().
                for m in reversed(range(last_model_queued+1, last_model+1)):
                    desired_pairs += make_pairs_for_model(m)
                last_model_queued = last_model
                save_last_model(last_model)

            cleanup(api_instance)
            r = api_instance.list_job_for_all_namespaces()
            if len(r.items) >= max_jobs:
                # Cluster is full: report and wait for jobs to drain.
                print("{}\t{} jobs outstanding. ({} to be scheduled)".format(
                      time.strftime("%I:%M:%S %p"),
                      len(r.items), len(desired_pairs)))
                time.sleep(60)
                continue

            if not desired_pairs:
                if not sgf_dir:
                    print("Out of pairs!  Sleeping")
                    time.sleep(300)
                    continue

                print("Out of pairs!  Syncing new eval games...")
                ratings.sync(sgf_dir)
                print("Updating ratings and getting suggestions...")
                add_uncertain_pairs()
                desired_pairs = restore_pairs() or []
                print("Got {} new pairs".format(len(desired_pairs)))
                print(ratings.top_n())

                if not desired_pairs:
                    # The refill produced nothing; wait instead of crashing
                    # on pop() from an empty list.
                    print("No new pairs suggested.  Sleeping")
                    time.sleep(300)
                    continue

            next_pair = desired_pairs.pop()  # take our pair off
            print("Enqueuing:", next_pair)
            try:
                same_run_eval(*next_pair)
            except BaseException:
                # Put the pair back so the crash handler saves it too.
                desired_pairs.append(next_pair)
                raise
            save_pairs(sorted(desired_pairs))
            save_last_model(last_model)
            time.sleep(6)  # brief pause between submissions
    except BaseException:
        # Deliberately broad (covers Ctrl-C as well): persist state, then
        # let the original exception propagate.
        print("Unfinished pairs:")
        print(sorted(desired_pairs))
        save_pairs(sorted(desired_pairs))
        save_last_model(last_model)
        raise
示例#2
0
def zoo_loop(sgf_dir=None, max_jobs=40):
    """Manages creating and cleaning up match jobs.

    - Load whatever pairs didn't get queued last time, and whatever our most
      recently seen model was.
    - Loop and...
        - If a new model is detected, create and append new pairs to the list
        - Automatically queue models from a list of pairs to keep a cluster
          busy
        - As jobs finish, delete them from the cluster.
        - If we crash, write out the list of pairs we didn't manage to queue

    sgf_dir -- the directory where sgf eval games should be used for computing
      ratings.  When unset, the loop just sleeps whenever it runs out of
      pairs instead of asking the ratings module for more.
    max_jobs -- kept for backward compatibility; this implementation does not
      read it.  Throttling is driven by the module-level MAX_TASKS and
      MIN_TASKS, compared against the sum of requested completions.

    This function never returns normally.  Any exception (including
    KeyboardInterrupt) is re-raised after the unqueued pairs and the last
    seen model have been saved.
    """
    desired_pairs = restore_pairs() or []
    random.shuffle(desired_pairs)
    last_model_queued = restore_last_model()
    # Bind last_model before entering the try block so the crash handler can
    # always persist it, even if the very first fsdb lookup raises.
    last_model = last_model_queued

    if sgf_dir:
        sgf_dir = os.path.abspath(sgf_dir)

    api_instance = get_api()
    # Alternates the refill strategy: top-of-table pairs on one refill,
    # least-known models on the next.
    toggle = True
    try:
        while True:
            last_model = fsdb.get_latest_pb()[0]
            if last_model_queued < last_model:
                print("Adding models {} to {} to be scheduled".format(
                    last_model_queued + 1, last_model))
                for m in reversed(range(last_model_queued + 1,
                                        last_model + 1)):
                    desired_pairs += make_pairs_for_model(m)
                last_model_queued = last_model
                save_last_model(last_model)

            cleanup(api_instance)
            random.shuffle(desired_pairs)
            r = api_instance.list_job_for_all_namespaces()
            jobs = r.items or []
            # A job may have no completions count set; count it as 0 rather
            # than failing the sum, mirroring the `succeeded or 0` guard used
            # in the status line below.
            tasks = sum(item.spec.completions or 0 for item in jobs)

            if tasks >= MAX_TASKS:
                # Cluster is full: report and wait for jobs to drain.
                print("{}\t {} finished / {} requested. "
                      "({} jobs, {} pairs to be scheduled)".format(
                          time.strftime("%I:%M:%S %p"),
                          sum(i.status.succeeded or 0 for i in jobs),
                          tasks, len(jobs), len(desired_pairs)))
                time.sleep(60)
                continue

            if not desired_pairs:
                if not sgf_dir:
                    print("Out of pairs!  Sleeping")
                    time.sleep(300)
                    continue

                if tasks > MIN_TASKS:
                    # Still more than MIN_TASKS requested; hold off on the
                    # ratings refill until the outstanding work drains.
                    time.sleep(60)
                    continue

                print("Out of pairs!  Syncing new eval games...")
                ratings.sync(sgf_dir)
                print("Updating ratings and getting suggestions...")
                if toggle:
                    print("Pairing the top of the table.")
                    add_top_pairs()
                else:
                    print("Pairing the least-known models.")
                    add_uncertain_pairs()
                toggle = not toggle
                for modelnum, rate in ratings.top_n():
                    print("{:>30}: {:0.3f} ({:0.3f})".format(
                        modelnum, rate[0], rate[1]))
                desired_pairs = restore_pairs() or []

                if not desired_pairs:
                    # The refill produced nothing; wait and try the other
                    # strategy next time instead of crashing on pop() from
                    # an empty list.
                    print("No new pairs suggested.  Sleeping")
                    time.sleep(60)
                    continue

            next_pair = desired_pairs.pop()  # take our pair off
            print("Enqueuing:", next_pair)
            try:
                same_run_eval(*next_pair)
            except BaseException:
                # Put the pair back so the crash handler saves it too.
                desired_pairs.append(next_pair)
                raise
            save_pairs(sorted(desired_pairs))
            save_last_model(last_model)
            time.sleep(1)  # brief pause between submissions
    except BaseException:
        # Deliberately broad (covers Ctrl-C as well): persist state, then
        # let the original exception propagate.
        print("Unfinished pairs:")
        print(sorted(desired_pairs))
        save_pairs(sorted(desired_pairs))
        save_last_model(last_model)
        raise