Example #1
def _determine_chunk_to_make(write_dir):
    """
    Returns the full path of the chunk to make (gs://...)
    and a boolean, indicating whether we should wait for a new model
    or if we're 'behind' and should just write out our current chunk immediately
    True == write immediately.
    """
    models = fsdb.get_models()
    # Last model is N.  N+1 (should be) training.  We should gather games for N+2.
    chunk_to_make = os.path.join(write_dir,
                                 str(models[-1][0] + 1) + '.tfrecord.zz')
    if not tf.gfile.Exists(chunk_to_make):
        # N+1 is missing.  Write it out ASAP
        print("Making chunk ASAP:", chunk_to_make)
        return chunk_to_make, True
    chunk_to_make = os.path.join(write_dir,
                                 str(models[-1][0] + 2) + '.tfrecord.zz')
    while tf.gfile.Exists(chunk_to_make):
        print("Chunk for next model ({}) already exists. Sleeping.".format(
            chunk_to_make))
        time.sleep(5 * 60)
        models = fsdb.get_models()
        chunk_to_make = os.path.join(write_dir,
                                     str(models[-1][0] + 2) + '.tfrecord.zz')
    print("Making chunk:", chunk_to_make)

    return chunk_to_make, False
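
A minimal, self-contained sketch of the naming scheme above; the bucket path and model list are hypothetical stand-ins for write_dir and whatever fsdb.get_models() would return:

import os

write_dir = 'gs://my-bucket/data/golden_chunks'          # hypothetical
models = [(38, '000038-ahab'), (39, '000039-beluga')]    # (num, name) pairs

# Last model is N=39, so N+1=40 should be training; gather games for N+2=41.
print(os.path.join(write_dir, str(models[-1][0] + 1) + '.tfrecord.zz'))
# -> gs://my-bucket/data/golden_chunks/40.tfrecord.zz
print(os.path.join(write_dir, str(models[-1][0] + 2) + '.tfrecord.zz'))
# -> gs://my-bucket/data/golden_chunks/41.tfrecord.zz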
Example #2
def validate(working_dir, model_num=None, validate_name=None):
    """ Runs validate on the directories up to the most recent model, or up to
    (but not including) the model specified by `model_num`
    """
    if model_num is None:
        model_num, model_name = fsdb.get_latest_model()
    else:
        model_num = int(model_num)
        model_name = fsdb.get_model(model_num)

    # Model N was trained on games up through model N-2, so the validation set
    # should only be for models through N-2 as well, thus the (model_num - 1)
    # term.
    models = list(
        filter(lambda num_name: num_name[0] < (model_num - 1),
               fsdb.get_models()))
    # Run on the most recent 50 generations.
    # TODO(brianklee): make this hyperparameter dependency explicit/not hardcoded
    holdout_dirs = [
        os.path.join(fsdb.holdout_dir(), pair[1]) for pair in models[-50:]
    ]

    main.validate(working_dir,
                  *holdout_dirs,
                  checkpoint_name=os.path.join(fsdb.models_dir(), model_name),
                  validate_name=validate_name)
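
The off-by-two filter is easiest to see with a toy model list (the names below are made up):

models = [(n, 'model-{:06d}'.format(n)) for n in range(10)]
model_num = 9
# num < model_num - 1 keeps models 0..7, i.e. everything through N-2.
kept = list(filter(lambda num_name: num_name[0] < (model_num - 1), models))
print(kept[-1][0])  # -> 7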
Example #3
def fill_and_wait_models(bufsize=dual_net.EXAMPLES_PER_GENERATION,
                         write_dir=None,
                         threads=8,
                         model_window=100,
                         skip_first_rsync=False):
    """ Fills a ringbuffer with positions from the most recent games, then
    continually rsyncs and updates the buffer until a new model is promoted.
    Once it detects a new model, it dumps its contents so that training can
    immediately begin on the next model.
    """
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    models = fsdb.get_models()[-model_window:]
    if not skip_first_rsync:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 6)
    files = tqdm(map(files_for_model, models), total=len(models))
    buf.parallel_fill(list(itertools.chain(*files)), threads=threads)

    print("Filled buffer, watching for new games")
    while fsdb.get_latest_model()[0] == models[-1][0]:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 2)
        new_files = tqdm(map(files_for_model, models[-2:]),
                         total=len(models[-2:]))
        buf.update(list(itertools.chain(*new_files)))
        time.sleep(60)
    latest = fsdb.get_latest_model()

    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(os.path.join(write_dir, str(latest[0] + 1) + '.tfrecord.zz'))
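
A hypothetical invocation; the directory is a placeholder, and the call blocks until fsdb.get_latest_model() reports a new model:

fill_and_wait_models(write_dir='gs://my-bucket/data/golden_chunks',
                     threads=8,
                     model_window=100)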
Example #4
def smart_rsync(from_model_num=0, source_dir=None, dest_dir=LOCAL_DIR):
    source_dir = source_dir or fsdb.selfplay_dir()
    from_model_num = max(0, from_model_num)
    models = [m for m in fsdb.get_models() if m[0] >= from_model_num]
    for _, model in models:
        _rsync_dir(os.path.join(source_dir, model),
                   os.path.join(dest_dir, model))
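
For instance, to mirror everything from model 40 onward (both paths hypothetical):

smart_rsync(from_model_num=40,
            source_dir='gs://my-bucket/data/selfplay',
            dest_dir='/tmp/selfplay')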
Example #5
def main():
    root = os.path.abspath(
        os.path.join("sgf", fsdb.FLAGS.bucket_name, "sgf/eval"))
    sync(root, True)
    models = fsdb.get_models()
    data = wins_subset(fsdb.models_dir())
    print(len(data))
    r = compute_ratings(data)
    # Print the 20 highest-rated models, best first.
    for v, k in sorted([(v, k) for k, v in r.items()])[-20:][::-1]:
        print(models[model_num_for(k)][1], v)
    db = sqlite3.connect("ratings.db")
    print(db.execute("select count(*) from wins").fetchone()[0], "games")
    # Print ratings for the ten most recent models.
    for m in models[-10:]:
        m_id = model_id(m[0])
        print(m[1], r.get(m_id, "model id not found({})".format(m_id)))
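
The slice in the ratings loop reads "top 20, best first"; with made-up ids and ratings:

r = {'m1': 0.1, 'm2': 0.9, 'm3': 0.5}  # hypothetical ratings
top = sorted([(v, k) for k, v in r.items()])[-20:][::-1]
print(top)  # -> [(0.9, 'm2'), (0.5, 'm3'), (0.1, 'm1')]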
Example #6
def fill_and_wait_time(bufsize=dual_net.EXAMPLES_PER_GENERATION,
                       write_dir=None,
                       threads=32,
                       start_from=None):
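    """ Fills a ringbuffer with positions from the most recent hour
    directories, then keeps rsyncing new games until a new model is promoted
    (or, if we are 'behind', writes out immediately), finally flushing the
    buffer to the chunk chosen by _determine_chunk_to_make.
    """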
    start_from = start_from or dt.datetime.utcnow()
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    chunk_to_make, fast_write = _determine_chunk_to_make(write_dir)

    hours = fsdb.get_hour_dirs()
    with timer("Rsync"):
        time_rsync(
            min(dt.datetime.strptime(hours[-1], "%Y-%m-%d-%H/"), start_from))
        start_from = dt.datetime.utcnow()

    hours = fsdb.get_hour_dirs()
    files = (tf.gfile.Glob(os.path.join(LOCAL_DIR, d, "*.zz"))
             for d in reversed(hours)
             if tf.gfile.Exists(os.path.join(LOCAL_DIR, d)))
    files = itertools.islice(files, get_window_size(chunk_to_make))

    models = fsdb.get_models()
    buf.parallel_fill(list(itertools.chain.from_iterable(files)),
                      threads=threads)
    print("Filled buffer, watching for new games")

    while (fsdb.get_latest_model() == models[-1]
           or buf.total_updates < MINIMUM_NEW_GAMES):
        with timer("Rsync"):
            time_rsync(start_from - dt.timedelta(minutes=60))
        start_from = dt.datetime.utcnow()
        hours = sorted(fsdb.get_hour_dirs(LOCAL_DIR))
        new_files = list(
            map(lambda d: tf.gfile.Glob(os.path.join(LOCAL_DIR, d, '*.zz')),
                hours[-2:]))
        buf.update(list(itertools.chain.from_iterable(new_files)))
        if fast_write:
            break
        time.sleep(30)
        if fsdb.get_latest_model() != models[-1]:
            print("New model!  Waiting for games. Got", buf.total_updates,
                  "new games so far")

    latest = fsdb.get_latest_model()
    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(chunk_to_make)
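
The strptime format above implies hour directories named like "%Y-%m-%d-%H/"; a standalone check with a hypothetical entry:

import datetime as dt

hour_dir = '2018-06-02-14/'  # hypothetical value from fsdb.get_hour_dirs()
print(dt.datetime.strptime(hour_dir, '%Y-%m-%d-%H/'))
# -> 2018-06-02 14:00:00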
Example #7
def backfill():
    models = [m[1] for m in fsdb.get_models()]

    # Deferred imports: only needed when backfilling.
    import dual_net
    import tensorflow as tf
    from tqdm import tqdm
    # Build the inference graph once up front.
    features, labels = dual_net.get_inference_input()
    dual_net.model_fn(features, labels, tf.estimator.ModeKeys.PREDICT,
                      dual_net.get_default_hyperparams())

    for model_name in tqdm(models):
        if model_name.endswith('-upgrade'):
            continue
        try:
            # Load and destination paths are identical, so the checkpoint
            # is converted in place.
            load_file = os.path.join(fsdb.models_dir(), model_name)
            dest_file = os.path.join(fsdb.models_dir(), model_name)
            main.convert(load_file, dest_file)
        except Exception:
            print('failed on', model_name)
            continue
Example #8
def make_chunk_for(output_dir=LOCAL_DIR,
                   local_dir=LOCAL_DIR,
                   game_dir=None,
                   model_num=1,
                   positions=dual_net.EXAMPLES_PER_GENERATION,
                   threads=8,
                   samples_per_game=4):
    """
    Explicitly make a golden chunk for a given model `model_num`
    (not necessarily the most recent one).

      While we haven't yet got enough samples (EXAMPLES_PER_GENERATION)
      Add samples from the games of previous model.
    """
    game_dir = game_dir or fsdb.selfplay_dir()
    ensure_dir_exists(output_dir)
    models = [(num, name) for num, name in fsdb.get_models()
              if num < model_num]
    buf = ExampleBuffer(positions)
    files = []
    for _, model in sorted(models, reverse=True):
        local_model_dir = os.path.join(local_dir, model)
        if not tf.gfile.Exists(local_model_dir):
            print("Rsyncing", model)
            _rsync_dir(os.path.join(game_dir, model), local_model_dir)
        files.extend(tf.gfile.Glob(os.path.join(local_model_dir, '*.zz')))
        if len(files) * samples_per_game > positions:
            break

    print("Filling from {} files".format(len(files)))

    buf.parallel_fill(files,
                      threads=threads,
                      samples_per_game=samples_per_game)
    print(buf)
    output = os.path.join(output_dir, str(model_num) + '.tfrecord.zz')
    print("Writing to", output)
    buf.flush(output)
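
A hypothetical invocation, rebuilding the chunk for model 117 from earlier models' games (all paths are placeholders):

make_chunk_for(output_dir='/tmp/golden_chunks',
               local_dir='/tmp/selfplay',
               model_num=117,
               threads=8,
               samples_per_game=4)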