def train(working_dir):
    """Train a new model generation from gathered selfplay data.

    Waits (and exits) if the latest model does not yet have enough games;
    otherwise blocks until the golden training chunk for the next
    generation appears, then trains and saves the new model.

    Args:
        working_dir: directory holding training state/checkpoints,
            passed through to main.train.
    """
    model_num, model_name = fsdb.get_latest_model()
    games = gfile.Glob(os.path.join(fsdb.selfplay_dir(), model_name, '*.zz'))
    if len(games) < MIN_GAMES_PER_GENERATION:
        print("{} doesn't have enough games to train a new model yet ({})".
              format(model_name, len(games)))
        print("Sleeping...")
        # Back off so an external supervisor restarting this job doesn't
        # hammer the filesystem while selfplay catches up.
        time.sleep(10 * 60)
        print("Done...")
        sys.exit(1)

    print("Training on gathered game data, initializing from {}".format(
        model_name))
    new_model_num = model_num + 1
    new_model_name = shipname.generate(new_model_num)
    print("New model will be {}".format(new_model_name))
    training_file = os.path.join(
        fsdb.golden_chunk_dir(), str(new_model_num) + '.tfrecord.zz')
    # The golden chunk is produced by a separate buffer-filling process;
    # poll until it shows up.
    while not gfile.Exists(training_file):
        print("Waiting for", training_file)
        time.sleep(1 * 60)
    print("Using Golden File:", training_file)

    save_file = os.path.join(fsdb.models_dir(), new_model_name)
    try:
        # Was generation_num=model_num + 1; use the already-computed
        # new_model_num for clarity (same value).
        main.train(working_dir, [training_file], save_file,
                   generation_num=new_model_num)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; logging.exception records the traceback.
        logging.exception("Train error")
def fill_and_wait_models(bufsize=dual_net.EXAMPLES_PER_GENERATION,
                         write_dir=None,
                         threads=8,
                         model_window=100,
                         skip_first_rsync=False):
    """Fills a ringbuffer with positions from the most recent games, then
    continually rsync's and updates the buffer until a new model is promoted.
    Once it detects a new model, it then dumps its contents for training to
    immediately begin on the next model.

    Args:
        bufsize: capacity of the example ringbuffer.
        write_dir: where to flush the golden chunk; defaults to
            fsdb.golden_chunk_dir().
        threads: parallelism for the initial buffer fill.
        model_window: how many of the most recent models to seed the
            buffer from.
        skip_first_rsync: skip the initial (expensive) rsync if the local
            game files are already known to be fresh.
    """
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    models = fsdb.get_models()[-model_window:]

    if not skip_first_rsync:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 6)
    files = tqdm(map(files_for_model, models), total=len(models))
    buf.parallel_fill(list(itertools.chain(*files)), threads=threads)

    print("Filled buffer, watching for new games")
    while fsdb.get_latest_model()[0] == models[-1][0]:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 2)
        # Only the two newest models can have produced new games since the
        # last pass. BUGFIX: total was len(models), which overstated the
        # tqdm progress-bar total by ~model_window.
        new_files = tqdm(map(files_for_model, models[-2:]),
                         total=len(models[-2:]))
        buf.update(list(itertools.chain(*new_files)))
        time.sleep(60)

    latest = fsdb.get_latest_model()
    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(os.path.join(write_dir, str(latest[0] + 1) + '.tfrecord.zz'))
def train(working_dir):
    """Train a new model generation with dual_net and export it.

    Blocks until the golden training chunk for the next generation exists,
    trains on it, and exports the resulting model. Exits with status 1 on
    any training/export failure.

    Args:
        working_dir: directory holding training state/checkpoints,
            passed through to dual_net.export_model.
    """
    model_num, model_name = fsdb.get_latest_model()
    print("Training on gathered game data, initializing from {}".format(
        model_name))
    new_model_num = model_num + 1
    new_model_name = shipname.generate(new_model_num)
    print("New model will be {}".format(new_model_name))
    training_file = os.path.join(
        fsdb.golden_chunk_dir(), str(new_model_num) + '.tfrecord.zz')
    # The golden chunk is produced by a separate buffer-filling process;
    # poll until it shows up.
    while not gfile.Exists(training_file):
        print("Waiting for", training_file)
        time.sleep(1 * 60)
    print("Using Golden File:", training_file)

    try:
        save_file = os.path.join(fsdb.models_dir(), new_model_name)
        print("Training model")
        dual_net.train(training_file)
        print("Exporting model to ", save_file)
        dual_net.export_model(working_dir, save_file)
    except Exception:
        # logging.exception already records the full traceback; the
        # original additionally logged and printed traceback.format_exc(),
        # emitting the same traceback three times.
        logging.exception("Train error")
        sys.exit(1)
def fill_and_wait_time(bufsize=dual_net.EXAMPLES_PER_GENERATION,
                       write_dir=None,
                       threads=32,
                       start_from=None):
    """Fill an example buffer from hourly game directories, then keep it
    fresh until a new model is promoted (and enough new games have arrived),
    finally flushing a golden chunk for training.

    Args:
        bufsize: capacity of the example ringbuffer.
        write_dir: where the golden chunk is flushed; defaults to
            fsdb.golden_chunk_dir().
        threads: parallelism for the initial buffer fill.
        start_from: UTC datetime to rsync from; defaults to "now".
    """
    start_from = start_from or dt.datetime.utcnow()
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    chunk_to_make, fast_write = _determine_chunk_to_make(write_dir)
    hours = fsdb.get_hour_dirs()
    with timer("Rsync"):
        # NOTE(review): the "%Y-%m-%d-%H/" format implies hour-dir names
        # carry a trailing slash — confirm against fsdb.get_hour_dirs().
        time_rsync(min(
            dt.datetime.strptime(hours[-1], "%Y-%m-%d-%H/"),
            start_from))
        start_from = dt.datetime.utcnow()
    # Re-list after the rsync so newly fetched hour dirs are included.
    hours = fsdb.get_hour_dirs()
    # Newest hours first; only directories that actually exist locally.
    files = (tf.gfile.Glob(os.path.join(LOCAL_DIR, d, "*.zz"))
             for d in reversed(hours)
             if tf.gfile.Exists(os.path.join(LOCAL_DIR, d)))
    # Cap the number of hour-dirs consumed to the window for this chunk.
    files = itertools.islice(files, get_window_size(chunk_to_make))
    models = fsdb.get_models()
    buf.parallel_fill(
        list(itertools.chain.from_iterable(files)), threads=threads)

    print("Filled buffer, watching for new games")
    # Keep updating until a new model appears AND at least
    # MINIMUM_NEW_GAMES fresh games have been folded into the buffer.
    while (fsdb.get_latest_model() == models[-1]
           or buf.total_updates < MINIMUM_NEW_GAMES):
        with timer("Rsync"):
            # Overlap by an hour so games straddling the boundary
            # are not missed.
            time_rsync(start_from - dt.timedelta(minutes=60))
            start_from = dt.datetime.utcnow()
        hours = sorted(fsdb.get_hour_dirs(LOCAL_DIR))
        # Only the two most recent hour-dirs can have new games.
        new_files = list(
            map(lambda d: tf.gfile.Glob(os.path.join(LOCAL_DIR, d, '*.zz')),
                hours[-2:]))
        buf.update(list(itertools.chain.from_iterable(new_files)))
        if fast_write:
            # Chunk is overdue: flush immediately without waiting for
            # the new-model / minimum-games condition.
            break
        time.sleep(30)
        if fsdb.get_latest_model() != models[-1]:
            print("New model! Waiting for games. Got",
                  buf.total_updates, "new games so far")

    latest = fsdb.get_latest_model()
    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(chunk_to_make)