def _determine_chunk_to_make(write_dir):
    """Decide which golden chunk should be written next.

    Args:
        write_dir: directory (gs://... or local) where golden chunks live.

    Returns:
        (chunk_path, write_immediately): `chunk_path` is the full path of the
        chunk to make, and `write_immediately` is True when we're behind and
        should write our current chunk out immediately instead of waiting for
        a new model to be promoted.
    """
    def chunk_path(model_num):
        # Chunks are named for the model they will train: <num>.tfrecord.zz
        return os.path.join(write_dir, str(model_num) + '.tfrecord.zz')

    models = fsdb.get_models()
    # Last model is N. N+1 (should be) training. We should gather games for N+2.
    chunk_to_make = chunk_path(models[-1][0] + 1)
    if not tf.gfile.Exists(chunk_to_make):
        # N+1 is missing. Write it out ASAP
        print("Making chunk ASAP:", chunk_to_make)
        return chunk_to_make, True

    chunk_to_make = chunk_path(models[-1][0] + 2)
    while tf.gfile.Exists(chunk_to_make):
        # The next chunk already exists; poll until a newer model shows up.
        print("Chunk for next model ({}) already exists. Sleeping.".format(
            chunk_to_make))
        time.sleep(5 * 60)
        models = fsdb.get_models()
        chunk_to_make = chunk_path(models[-1][0] + 2)
    print("Making chunk:", chunk_to_make)

    return chunk_to_make, False
def fill_and_wait_models(bufsize=EXAMPLES_PER_GENERATION,
                         write_dir=None,
                         threads=8,
                         model_window=100,
                         skip_first_rsync=False):
    """ Fills a ringbuffer with positions from the most recent games, then
    continually rsync's and updates the buffer until a new model is promoted.
    Once it detects a new model, it then dumps its contents for training to
    immediately begin on the next model.
    """
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    models = fsdb.get_models()[-model_window:]
    if not skip_first_rsync:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 6)
    files = tqdm(map(files_for_model, models), total=len(models))
    buf.parallel_fill(list(itertools.chain(*files)), threads=threads)

    print("Filled buffer, watching for new games")
    while fsdb.get_latest_model()[0] == models[-1][0]:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 2)
        # Only the two most recent models' games can have changed; refresh
        # just those.  (BUG FIX: total= previously used len(models), which
        # misreported progress for this 2-element slice.)
        recent = models[-2:]
        new_files = tqdm(map(files_for_model, recent), total=len(recent))
        buf.update(list(itertools.chain(*new_files)))
        time.sleep(60)
    latest = fsdb.get_latest_model()

    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(os.path.join(write_dir, str(latest[0] + 1) + '.tfrecord.zz'))
def smart_rsync(from_model_num=0, source_dir=None, dest_dir=LOCAL_DIR):
    """Rsync selfplay game dirs for every model numbered >= from_model_num."""
    source_dir = source_dir or fsdb.selfplay_dir()
    # Negative model numbers are meaningless; clamp to zero.
    from_model_num = max(0, from_model_num)
    for model_num, model_name in fsdb.get_models():
        if model_num < from_model_num:
            continue
        _rsync_dir(os.path.join(source_dir, model_name),
                   os.path.join(dest_dir, model_name))
def main():
    """Sync eval games, print the rating leaderboard, and suggest eval pairs.

    BUG FIX: `r` (the ratings table) was read below (`if m_id in r`) without
    ever being defined, which raised a NameError at runtime.  It is now
    computed from the wins subset before use, matching the sibling ratings
    entry points.
    """
    root = os.path.abspath(
        os.path.join("sgf", fsdb.FLAGS.bucket_name, "sgf/eval"))
    if FLAGS.sync_ratings:
        sync(root)

    for k, v in top_n(20):
        print("Top model {}: {}".format(k, v))

    db = sqlite3.connect("ratings.db")
    print("db has",
          db.execute("select count(*) from wins").fetchone()[0], "games")

    # Ratings keyed by model id; needed for the per-model report below.
    r = compute_ratings(wins_subset(fsdb.models_dir()))

    models = fsdb.get_models()
    for m in models[-10:]:
        m_id = model_id_of(m[0])
        if m_id in r:
            rat, sigma = r[m_id]
            print("{:>30}: {:.2f} ({:.3f})".format(m[1], rat, sigma))
        else:
            print("{}, Model id not found({})".format(m[1], m_id))

    # Suggest some pairs
    random.seed(5)
    print()
    suggest_pairs(5, 2)
def main():
    """Sync eval games, compute ratings, and print a short report."""
    root = os.path.abspath(
        os.path.join("sgf", fsdb.FLAGS.bucket_name, "sgf/eval"))
    sync(root)

    models = fsdb.get_models()
    data = wins_subset(fsdb.models_dir())
    print(len(data))
    ratings = compute_ratings(data)

    # Top 20 models, best rating first.  (rating, key) tuples are unique, so
    # a descending sort is identical to reversing an ascending one.
    for rating, key in sorted(
            ((v, k) for k, v in ratings.items()), reverse=True)[:20]:
        print(models[model_num_for(key)][1], rating)

    db = sqlite3.connect("ratings.db")
    print(db.execute("select count(*) from wins").fetchone()[0], "games")

    for model_num, model_name in models[-10:]:
        m_id = model_id(model_num)
        print(model_name,
              ratings.get(m_id, "model id not found({})".format(m_id)))
def fill_and_wait_time(bufsize=EXAMPLES_PER_GENERATION,
                       write_dir=None,
                       threads=32,
                       start_from=None):
    """Fill the example buffer from recent hourly selfplay dirs, then keep
    syncing new games until a new model appears (or write immediately when
    behind), finally flushing the buffer as the next golden chunk.

    Args:
        bufsize: ring-buffer capacity in positions.
        write_dir: where to write the chunk; defaults to fsdb.golden_chunk_dir().
        threads: parallelism for the initial buffer fill.
        start_from: UTC datetime to rsync from; defaults to now.
    """
    start_from = start_from or dt.datetime.utcnow()
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    # fast_write == True means we're behind and should flush without waiting
    # for a new model (see _determine_chunk_to_make).
    chunk_to_make, fast_write = _determine_chunk_to_make(write_dir)
    hours = fsdb.get_hour_dirs()
    with timer("Rsync"):
        # Sync from whichever is earlier: the newest hour dir we know about,
        # or the caller-provided start time.  Hour dirs are named
        # "%Y-%m-%d-%H/" (note the trailing slash parsed by strptime).
        time_rsync(
            min(dt.datetime.strptime(hours[-1], "%Y-%m-%d-%H/"), start_from))
        start_from = dt.datetime.utcnow()
    hours = fsdb.get_hour_dirs()
    # Newest hour dirs first; islice limits how many hour dirs (not files)
    # feed the initial fill, per get_window_size for this chunk.
    files = (tf.gfile.Glob(os.path.join(LOCAL_DIR, d, "*.zz"))
             for d in reversed(hours)
             if tf.gfile.Exists(os.path.join(LOCAL_DIR, d)))
    files = itertools.islice(files, get_window_size(chunk_to_make))
    models = fsdb.get_models()
    buf.parallel_fill(
        list(itertools.chain.from_iterable(files)), threads=threads)
    print("Filled buffer, watching for new games")
    # Keep updating until a new model is promoted AND we've seen at least
    # MINIMUM_NEW_GAMES fresh games (unless fast_write short-circuits).
    while (fsdb.get_latest_model() == models[-1]
           or buf.total_updates < MINIMUM_NEW_GAMES):
        with timer("Rsync"):
            # Re-sync with a 60-minute overlap to avoid missing games.
            time_rsync(start_from - dt.timedelta(minutes=60))
            start_from = dt.datetime.utcnow()
        hours = sorted(fsdb.get_hour_dirs(LOCAL_DIR))
        # Only the two most recent hour dirs can have new games.
        new_files = list(
            map(lambda d: tf.gfile.Glob(os.path.join(LOCAL_DIR, d, '*.zz')),
                hours[-2:]))
        buf.update(list(itertools.chain.from_iterable(new_files)))
        if fast_write:
            break
        time.sleep(30)
        if fsdb.get_latest_model() != models[-1]:
            print("New model! Waiting for games. Got",
                  buf.total_updates, "new games so far")
    latest = fsdb.get_latest_model()
    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(chunk_to_make)
def loop(unused_argv):
    """Train forever, validating against the pro dataset after each cycle."""
    if not fsdb.get_models():
        # TODO(amj): Do bootstrap here.
        pass

    while True:
        print("=" * 40, flush=True)

        with utils.timer("Train"):
            result = train()
        if result.returncode > 0:
            print("Training failed, aborting.")
            sys.exit(1)

        with utils.timer("Validate"):
            if FLAGS.pro_dataset:
                validate_pro()
            else:
                print("*** --pro_dataset not set, skipping pro validation ***")
def main():
    """Sync eval games, compute ratings, and print a model leaderboard."""
    root = os.path.abspath(
        os.path.join("sgf", fsdb.FLAGS.bucket_name, "sgf/eval"))
    if FLAGS.sync_ratings:
        sync(root)

    models = fsdb.get_models()
    data = wins_subset(fsdb.models_dir())
    print("win subset", len(data), "games")
    ratings = compute_ratings(data)

    # Top 20 by rating, best first.  (rating, key) pairs are unique, so a
    # descending sort equals reversing an ascending one.
    for v, k in sorted(
            ((v, k) for k, v in ratings.items()), reverse=True)[:20]:
        print("Top model({}) {}: {}".format(k, model_num_for(k), v))

    db = sqlite3.connect("ratings.db")
    print("db has",
          db.execute("select count(*) from wins").fetchone()[0], "games")

    for model_num, model_name in models[-10:]:
        m_id = model_id(model_num)
        try:
            rat, sigma = ratings[m_id]
            print("{:>30}: {:.2f} ({:.3f})".format(model_name, rat, sigma))
        except KeyError:
            print("{}, Model id not found({})".format(model_name, m_id))
def make_chunk_for(output_dir=LOCAL_DIR,
                   local_dir=LOCAL_DIR,
                   game_dir=None,
                   model_num=1,
                   positions=EXAMPLES_PER_GENERATION,
                   threads=8,
                   sampling_frac=0.02):
    """ Explicitly make a golden chunk for a given model `model_num`
    (not necessarily the most recent one).

    While we haven't yet got enough samples (EXAMPLES_PER_GENERATION)
    Add samples from the games of previous model.
    """
    game_dir = game_dir or fsdb.selfplay_dir()
    ensure_dir_exists(output_dir)

    # Only games played by models strictly older than model_num are eligible.
    older_models = [m for m in fsdb.get_models() if m[0] < model_num]
    buf = ExampleBuffer(positions, sampling_frac=sampling_frac)

    files = []
    # Walk from the newest eligible model backwards, accumulating game files.
    for _, model_name in sorted(older_models, reverse=True):
        local_model_dir = os.path.join(local_dir, model_name)
        if not tf.gfile.Exists(local_model_dir):
            print("Rsyncing", model_name)
            _rsync_dir(os.path.join(game_dir, model_name), local_model_dir)
        files.extend(tf.gfile.Glob(os.path.join(local_model_dir, '*.zz')))
        print("{}: {} games".format(model_name, len(files)))
        # Rough estimate: ~200 positions per game, scaled by sampling_frac.
        if len(files) * 200 * sampling_frac > positions:
            break

    print("Filling from {} files".format(len(files)))
    buf.parallel_fill(files, threads=threads)
    print(buf)

    output = os.path.join(output_dir, str(model_num) + '.tfrecord.zz')
    print("Writing to", output)
    buf.flush(output)