Example #1
def fill_and_wait_models(bufsize=dual_net.EXAMPLES_PER_GENERATION,
                         write_dir=None,
                         threads=8,
                         model_window=100,
                         skip_first_rsync=False):
    """ Fills a ringbuffer with positions from the most recent games, then
    continually rsyncs and updates the buffer until a new model is promoted.
    Once it detects a new model, it dumps its contents so that training can
    immediately begin on the next model.
    """
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    models = fsdb.get_models()[-model_window:]
    if not skip_first_rsync:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 6)
    files = tqdm(map(files_for_model, models), total=len(models))
    buf.parallel_fill(list(itertools.chain(*files)), threads=threads)

    print("Filled buffer, watching for new games")
    while fsdb.get_latest_model()[0] == models[-1][0]:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 2)
        new_files = tqdm(map(files_for_model, models[-2:]),
                         total=len(models[-2:]))
        buf.update(list(itertools.chain(*new_files)))
        time.sleep(60)
    latest = fsdb.get_latest_model()

    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(os.path.join(write_dir, str(latest[0] + 1) + '.tfrecord.zz'))
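
ExampleBuffer is defined elsewhere in the project. As a rough sketch of the ring-buffer behaviour the loop above relies on (fixed capacity, oldest positions evicted as new games are rsynced in, one flush per promoted model), a hypothetical class like the one below would behave similarly; it only mirrors the fill/update/flush calls and is not the project's implementation:

import collections
import random

class RingBufferSketch:
    """Hypothetical fixed-size buffer loosely mirroring ExampleBuffer's API."""

    def __init__(self, max_size):
        self.buffer = collections.deque(maxlen=max_size)  # oldest items fall off
        self.total_updates = 0

    def update(self, examples):
        """Append newly rsynced examples, evicting the oldest ones."""
        self.buffer.extend(examples)
        self.total_updates += len(examples)

    def flush(self, path):
        """Shuffle and write everything out, then start empty for the next model."""
        data = list(self.buffer)
        random.shuffle(data)
        # write `data` to `path` as a .tfrecord.zz golden chunk (omitted here)
        self.buffer.clear()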
Example #2
def train(working_dir):
    """ Trains the next model generation, initializing from the latest model,
    once enough self-play games and the corresponding golden chunk exist.
    """
    model_num, model_name = fsdb.get_latest_model()

    games = gfile.Glob(os.path.join(fsdb.selfplay_dir(), model_name, '*.zz'))
    if len(games) < MIN_GAMES_PER_GENERATION:
        print("{} doesn't have enough games to train a new model yet ({})".
              format(model_name, len(games)))
        print("Sleeping...")
        time.sleep(10 * 60)
        print("Done...")
        sys.exit(1)

    print("Training on gathered game data, initializing from {}".format(
        model_name))
    new_model_num = model_num + 1
    new_model_name = shipname.generate(new_model_num)
    print("New model will be {}".format(new_model_name))
    training_file = os.path.join(fsdb.golden_chunk_dir(),
                                 str(new_model_num) + '.tfrecord.zz')
    while not gfile.Exists(training_file):
        print("Waiting for", training_file)
        time.sleep(1 * 60)
    print("Using Golden File:", training_file)

    save_file = os.path.join(fsdb.models_dir(), new_model_name)
    try:
        main.train(working_dir, [training_file],
                   save_file,
                   generation_num=model_num + 1)
    except Exception:
        logging.exception("Train error")
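
train() blocks in the `while not gfile.Exists(training_file)` loop until the golden chunk written by fill_and_wait_models appears. If an upper bound on that wait were wanted, a variation along these lines could be used; the helper, its name, and the deadline value are assumptions for illustration, not part of the project:

import time

def wait_for_file(exists_fn, path, poll_secs=60, max_wait_secs=6 * 60 * 60):
    """Poll exists_fn(path) until it returns True or a deadline passes."""
    deadline = time.time() + max_wait_secs
    while not exists_fn(path):
        if time.time() > deadline:
            raise TimeoutError("Gave up waiting for {}".format(path))
        print("Waiting for", path)
        time.sleep(poll_secs)

# e.g., in place of the polling loop above:
# wait_for_file(gfile.Exists, training_file)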
Example #3
def validate(working_dir, model_num=None, validate_name=None):
    """ Runs validate on the directories up to the most recent model, or up to
    (but not including) the model specified by `model_num`
    """
    if model_num is None:
        model_num, model_name = fsdb.get_latest_model()
    else:
        model_num = int(model_num)
        model_name = fsdb.get_model(model_num)

    # Model N was trained on games up through model N-2, so the validation set
    # should only be for models through N-2 as well, thus the (model_num - 1)
    # term.
    models = list(
        filter(lambda num_name: num_name[0] < (model_num - 1),
               fsdb.get_models()))
    # Run on the most recent 50 generations.
    # TODO(brianklee): make this hyperparameter dependency explicit/not hardcoded
    holdout_dirs = [
        os.path.join(fsdb.holdout_dir(), pair[1]) for pair in models[-50:]
    ]

    main.validate(working_dir,
                  *holdout_dirs,
                  checkpoint_name=os.path.join(fsdb.models_dir(), model_name),
                  validate_name=validate_name)
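
To make the model windowing above concrete, here is a small worked illustration of the same filter on invented (number, name) pairs; the data is made up purely for the example:

# Made-up (number, name) pairs purely to illustrate the windowing above.
models = [(98, "000098-aaa"), (99, "000099-bbb"), (100, "000100-ccc")]
model_num = 100

# Only models strictly older than model_num - 1 survive, i.e. games through N-2.
eligible = [pair for pair in models if pair[0] < model_num - 1]
print(eligible)  # -> [(98, '000098-aaa')]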
Example #4
File: rl_loop.py  Project: wtdeng/minigo
def train(working_dir):
    model_num, model_name = fsdb.get_latest_model()

    print("Training on gathered game data, initializing from {}".format(model_name))
    new_model_num = model_num + 1
    new_model_name = shipname.generate(new_model_num)
    print("New model will be {}".format(new_model_name))
    training_file = os.path.join(
        fsdb.golden_chunk_dir(), str(new_model_num) + '.tfrecord.zz')
    while not gfile.Exists(training_file):
        print("Waiting for", training_file)
        time.sleep(1*60)
    print("Using Golden File:", training_file)

    try:
        save_file = os.path.join(fsdb.models_dir(), new_model_name)
        print("Training model")
        dual_net.train(training_file)
        print("Exporting model to ", save_file)
        dual_net.export_model(working_dir, save_file)
    except Exception:
        # logging.exception already records the message plus the traceback.
        logging.exception("Train error")
        sys.exit(1)
Example #5
def fill_and_wait_time(bufsize=dual_net.EXAMPLES_PER_GENERATION,
                       write_dir=None,
                       threads=32,
                       start_from=None):
    """ Like fill_and_wait_models, but rsyncs and reads games by hour
    directory (timestamp) rather than by model number.
    """
    start_from = start_from or dt.datetime.utcnow()
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    chunk_to_make, fast_write = _determine_chunk_to_make(write_dir)

    hours = fsdb.get_hour_dirs()
    with timer("Rsync"):
        time_rsync(
            min(dt.datetime.strptime(hours[-1], "%Y-%m-%d-%H/"), start_from))
        start_from = dt.datetime.utcnow()

    hours = fsdb.get_hour_dirs()
    files = (tf.gfile.Glob(os.path.join(LOCAL_DIR, d, "*.zz"))
             for d in reversed(hours)
             if tf.gfile.Exists(os.path.join(LOCAL_DIR, d)))
    files = itertools.islice(files, get_window_size(chunk_to_make))

    models = fsdb.get_models()
    buf.parallel_fill(list(itertools.chain.from_iterable(files)),
                      threads=threads)
    print("Filled buffer, watching for new games")

    while (fsdb.get_latest_model() == models[-1]
           or buf.total_updates < MINIMUM_NEW_GAMES):
        with timer("Rsync"):
            time_rsync(start_from - dt.timedelta(minutes=60))
        start_from = dt.datetime.utcnow()
        hours = sorted(fsdb.get_hour_dirs(LOCAL_DIR))
        new_files = list(
            map(lambda d: tf.gfile.Glob(os.path.join(LOCAL_DIR, d, '*.zz')),
                hours[-2:]))
        buf.update(list(itertools.chain.from_iterable(new_files)))
        if fast_write:
            break
        time.sleep(30)
        if fsdb.get_latest_model() != models[-1]:
            print("New model!  Waiting for games. Got", buf.total_updates,
                  "new games so far")

    latest = fsdb.get_latest_model()
    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(chunk_to_make)
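
The rsync start point above is derived by parsing the newest hour-directory name with strptime and taking whichever timestamp is older, that directory or the caller-supplied start_from. A small self-contained illustration of that parsing (the directory name below is invented):

import datetime as dt

hour_dir = "2018-06-02-17/"                        # hypothetical hour directory name
parsed = dt.datetime.strptime(hour_dir, "%Y-%m-%d-%H/")
print(parsed)                                      # 2018-06-02 17:00:00
start_from = min(parsed, dt.datetime.utcnow())     # rsync from the older timestamp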
Example #6
def zoo_loop():
    """Manages creating and cleaning up match jobs.

    - Load whatever pairs didn't get queued last time, and whatever our most
      recently seen model was.
    - Loop and...
        - If a new model is detected, create and append new pairs to the list.
        - Automatically queue models from a list of pairs to keep a cluster busy.
        - As jobs finish, delete them from the cluster.
        - If we crash, write out the list of pairs we didn't manage to queue.
    """
    desired_pairs = restore_pairs() or []
    last_model_queued = restore_last_model()

    api_instance = get_api()
    try:
        while True:
            last_model = fsdb.get_latest_model()[0]
            if last_model_queued < last_model:
                print("Adding models {} to {} to be scheduled".format(
                    last_model_queued + 1, last_model))
                for m in reversed(range(last_model_queued + 1,
                                        last_model + 1)):
                    desired_pairs += make_pairs_for_model(m)
                last_model_queued = last_model
                save_last_model(last_model)

            cleanup(api_instance)
            r = api_instance.list_job_for_all_namespaces()
            if len(r.items) < 8:
                if not desired_pairs:
                    time.sleep(60 * 5)
                    continue
                next_pair = desired_pairs.pop()  # take our pair off
                print("Enqueuing:", next_pair)
                try:
                    same_run_eval(*next_pair)
                except:
                    desired_pairs.append(next_pair)
                    raise
                save_pairs(sorted(desired_pairs))
                save_last_model(last_model)

            else:
                print("{}\t{} jobs outstanding.".format(
                    time.strftime("%I:%M:%S %p"), len(r.items)))
            time.sleep(30)
    except:
        print("Unfinished pairs:")
        print(sorted(desired_pairs))
        save_pairs(sorted(desired_pairs))
        save_last_model(last_model)
        raise
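
restore_pairs/save_pairs and restore_last_model/save_last_model are defined elsewhere in the project; the crash-recovery idea they serve is simply to persist the work queue outside the process so an interrupted loop can resume. A minimal sketch under that assumption (the file name and JSON encoding are invented for illustration, not the project's actual storage):

import json
import os

PAIRS_FILE = "desired_pairs.json"   # hypothetical location

def save_pairs_sketch(pairs):
    """Persist the not-yet-queued pairs so a crash or restart can resume."""
    with open(PAIRS_FILE, "w") as f:
        json.dump(pairs, f)

def restore_pairs_sketch():
    """Return the saved pairs, or None on first run."""
    if not os.path.exists(PAIRS_FILE):
        return None
    with open(PAIRS_FILE) as f:
        return json.load(f)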
Example #7
def zoo_loop():
    desired_pairs = restore_pairs() or []
    last_model_queued = restore_last_model()

    api_instance = get_api()
    try:
        while True:
            last_model = fsdb.get_latest_model()[0]
            if last_model_queued < last_model:
                print("Adding models {} to {} to be scheduled".format(
                    last_model_queued + 1, last_model))
                for m in reversed(range(last_model_queued + 1,
                                        last_model + 1)):
                    desired_pairs += make_pairs_for_model(m)
                last_model_queued = last_model
                save_last_model(last_model)

            cleanup(api_instance)
            r = api_instance.list_job_for_all_namespaces()
            if len(r.items) < 10:
                if not desired_pairs:
                    time.sleep(60 * 5)
                    continue
                next_pair = desired_pairs.pop()  # take our pair off
                print("Enqueuing:", next_pair)
                try:
                    launch_eval(*next_pair)
                except:
                    desired_pairs.append(next_pair)
                    raise
                save_pairs(sorted(desired_pairs))
                save_last_model(last_model)

            else:
                print("{}\t{} jobs outstanding.".format(
                    time.strftime("%I:%M:%S %p"), len(r.items)))
            time.sleep(30)
    except:
        print("Unfinished pairs:")
        print(sorted(desired_pairs))
        save_pairs(sorted(desired_pairs))
        save_last_model(last_model)
        raise
Example #8
def selfplay(verbose=2):
    """ Plays one self-play game with the latest model, unless that model
    already has enough games for this generation.
    """
    _, model_name = fsdb.get_latest_model()
    games = gfile.Glob(os.path.join(fsdb.selfplay_dir(), model_name, '*.zz'))
    if len(games) > MAX_GAMES_PER_GENERATION:
        print("{} has enough games ({})".format(model_name, len(games)))
        time.sleep(10 * 60)
        sys.exit(1)
    print("Playing a game with model {}".format(model_name))
    model_save_path = os.path.join(fsdb.models_dir(), model_name)
    game_output_dir = os.path.join(fsdb.selfplay_dir(), model_name)
    game_holdout_dir = os.path.join(fsdb.holdout_dir(), model_name)
    sgf_dir = os.path.join(fsdb.sgf_dir(), model_name)
    main.selfplay(
        load_file=model_save_path,
        output_dir=game_output_dir,
        holdout_dir=game_holdout_dir,
        output_sgf=sgf_dir,
        holdout_pct=HOLDOUT_PCT,
        verbose=verbose,
    )
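
Like train() in Example #2, this worker exits instead of looping, so something external has to restart it. A rough sketch of such a driver follows; the command line is an assumption about how the worker might be invoked, not the project's actual entry point:

import subprocess
import sys
import time

# Keep restarting the self-play worker: each invocation plays at most one game
# and exits, or sleeps and exits non-zero if the generation has enough games.
while True:
    result = subprocess.run([sys.executable, "rl_loop.py", "selfplay"])
    if result.returncode != 0:
        time.sleep(60)   # brief extra back-off when the worker bailed out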