def bootstrap(
        working_dir: 'tf.estimator working directory.',
        model_save_path: 'Where to export the first bootstrapped generation'):
    """Create a randomly initialized model and export generation 0.

    Both the estimator working directory and the export target's parent
    directory are created if missing before bootstrapping.
    """
    for required_dir in (working_dir, os.path.dirname(model_save_path)):
        _ensure_dir_exists(required_dir)
    dual_net.bootstrap(working_dir)
    dual_net.export_model(working_dir, model_save_path)
def train(working_dir):
    """Train the next generation from its golden chunk and export it.

    Determines the next model number from the latest model in the file
    system database, blocks until the matching golden training chunk
    exists, trains on it, and exports the new model. On any training
    failure the traceback is logged and the process exits with status 1.

    Args:
        working_dir: tf.estimator working directory used for the export.
    """
    model_num, model_name = fsdb.get_latest_model()
    print("Training on gathered game data, initializing from {}".format(model_name))
    new_model_num = model_num + 1
    new_model_name = shipname.generate(new_model_num)
    print("New model will be {}".format(new_model_name))
    training_file = os.path.join(
        fsdb.golden_chunk_dir(), str(new_model_num) + '.tfrecord.zz')
    # The chunk for this generation is produced by a separate gather step;
    # poll until it appears.
    while not gfile.Exists(training_file):
        print("Waiting for", training_file)
        time.sleep(1 * 60)
    print("Using Golden File:", training_file)
    try:
        save_file = os.path.join(fsdb.models_dir(), new_model_name)
        print("Training model")
        dual_net.train(training_file)
        print("Exporting model to ", save_file)
        dual_net.export_model(working_dir, save_file)
    except Exception:
        import traceback
        # logging.exception records the traceback once; also echo it to
        # stdout so the failure is visible when logs are not collected.
        # (The original logged the same traceback three times.)
        print(traceback.format_exc())
        logging.exception("Train error")
        sys.exit(1)
def bootstrap(
        working_dir: 'tf.estimator working directory.',
        model_save_path: 'Where to export the first bootstrapped generation'):
    """Bootstrap random weights and write out the first generation.

    Creates the working directory and the export destination's parent
    directory, then initializes and exports the model.
    """
    _ensure_dir_exists(working_dir)
    export_parent = os.path.dirname(model_save_path)
    _ensure_dir_exists(export_parent)
    dual_net.bootstrap(working_dir)
    dual_net.export_model(working_dir, model_save_path)
def main(argv): """Train on examples and export the updated model weights.""" tf_records = argv[1:] logging.info("Training on %s records: %s to %s", len(tf_records), tf_records[0], tf_records[-1]) if FLAGS.dist_train: hvd.init() mllogger = mllog.get_mllogger() mllog.config(filename="train.log") mllog.config(default_namespace="worker1", default_stack_offset=1, default_clear_line=False) with utils.logged_timer("Training"): train(*tf_records) if (not FLAGS.dist_train) or hvd.rank() == 0: if FLAGS.export_path: dual_net.export_model(FLAGS.export_path) epoch = int(os.path.basename(FLAGS.export_path)) mllogger.event(key="save_model", value={"Iteration": epoch}) if FLAGS.freeze: dual_net.freeze_graph(FLAGS.export_path, FLAGS.use_trt, FLAGS.trt_max_batch_size, FLAGS.trt_precision, FLAGS.selfplay_precision)
def train(tf_records: 'list of files of tf_records to train on',
          model_save_path: 'Where to export the completed generation.'):
    """Train on tfrecord files, then export and freeze the new model."""
    print("Training on:", tf_records[0], "to", tf_records[-1])
    with utils.logged_timer("Training"):
        dual_net.train(*tf_records)
    print("== Training done. Exporting model to ", model_save_path)
    # Export out of the flag-configured model dir, then freeze for inference.
    dual_net.export_model(flags.FLAGS.model_dir, model_save_path)
    freeze_graph(model_save_path)
def train(working_dir: 'tf.estimator working directory.',
          tf_records: 'list of files of tf_records to train on',
          model_save_path: 'Where to export the completed generation.'):
    """Train from the given records, export the generation, and freeze it."""
    first, last = tf_records[0], tf_records[-1]
    print("Training on:", first, "to", last)
    with utils.logged_timer("Training"):
        dual_net.train(working_dir, tf_records)
    print("== Training done. Exporting model to ", model_save_path)
    dual_net.export_model(working_dir, model_save_path)
    freeze_graph(model_save_path)
def main(argv): """Train on examples and export the updated model weights.""" tf_records = argv[1:] logging.info("Training on %s records: %s to %s", len(tf_records), tf_records[0], tf_records[-1]) with utils.logged_timer("Training"): train(*tf_records) if FLAGS.export_path: dual_net.export_model(FLAGS.export_path)
def train(
        working_dir: 'tf.estimator working directory.',
        tf_records: 'list of files of tf_records to train on',
        model_save_path: 'Where to export the completed generation.',
        generation_num: 'Which generation you are training.'=0):
    """Train one generation on tf_records, then export and freeze it."""
    print("Training on:", tf_records[0], "to", tf_records[-1])
    with timer("Training"):
        dual_net.train(working_dir, tf_records, generation_num)
    dual_net.export_model(working_dir, model_save_path)
    freeze_graph(model_save_path)
def train(working_dir: 'tf.estimator working directory.',
          chunk_dir: 'Directory where gathered training chunks are.',
          model_save_path: 'Where to export the completed generation.',
          generation_num: 'Which generation you are training.' = 0):
    """Train on the most recent window of chunks and export the model.

    Only the newest WINDOW_SIZE // EXAMPLES_PER_RECORD chunk files are
    used, forming a sliding training window over recent self-play data.
    """
    all_chunks = sorted(gfile.Glob(os.path.join(chunk_dir, '*.tfrecord.zz')))
    window = WINDOW_SIZE // EXAMPLES_PER_RECORD
    tf_records = all_chunks[-1 * window:]
    print("Training from:", tf_records[0], "to", tf_records[-1])
    with timer("Training"):
        dual_net.train(working_dir, tf_records, generation_num)
    dual_net.export_model(working_dir, model_save_path)
def train(
        working_dir: 'tf.estimator working directory.',
        chunk_dir: 'Directory where gathered training chunks are.',
        model_save_path: 'Where to export the completed generation.',
        generation_num: 'Which generation you are training.'=0):
    """Train a generation from the newest training chunks and export it.

    Globs all *.tfrecord.zz chunks in chunk_dir and keeps only the most
    recent ones that fit in the example window before training.
    """
    pattern = os.path.join(chunk_dir, '*.tfrecord.zz')
    tf_records = sorted(gfile.Glob(pattern))
    keep = WINDOW_SIZE // EXAMPLES_PER_RECORD
    tf_records = tf_records[-1 * keep:]
    print("Training from:", tf_records[0], "to", tf_records[-1])
    with timer("Training"):
        dual_net.train(working_dir, tf_records, generation_num)
    dual_net.export_model(working_dir, model_save_path)
def bootstrap(
        working_dir: 'tf.estimator working directory. If not set, defaults to a random tmp dir'=None,
        model_save_path: 'Where to export the first bootstrapped generation'=None):
    """Initialize random weights and export the first generation.

    If working_dir is None, a temporary directory is used for the
    estimator state and discarded once the model has been exported.

    Args:
        working_dir: tf.estimator working directory, or None for a tmp dir.
        model_save_path: destination path for the exported model.
    """
    def _bootstrap_into(work_dir):
        # Shared body for both the temp-dir and explicit-dir paths; the
        # original duplicated these four statements in each branch.
        _ensure_dir_exists(work_dir)
        _ensure_dir_exists(os.path.dirname(model_save_path))
        dual_net.bootstrap(work_dir)
        dual_net.export_model(work_dir, model_save_path)

    if working_dir is None:
        with tempfile.TemporaryDirectory() as tmp_dir:
            _bootstrap_into(tmp_dir)
    else:
        _bootstrap_into(working_dir)
def test_inference(self):
    """Bootstrap, export, and run inference with two networks side by side."""
    with tempfile.TemporaryDirectory() as working_dir, \
            tempfile.TemporaryDirectory() as export_dir:
        dual_net.bootstrap(working_dir, **fast_hparams)
        exported_model = os.path.join(export_dir, 'bootstrap-model')
        dual_net.export_model(working_dir, exported_model)
        n1 = dual_net.DualNetwork(exported_model, **fast_hparams)
        n1.run(go.Position())
        # In the past we've had issues initializing two separate NNs
        # in the same process... just double check that two DualNetwork
        # instances can live side by side.
        n2 = dual_net.DualNetwork(exported_model, **fast_hparams)
        n2.run(go.Position())
def main(argv): """Train on examples and export the updated model weights.""" tf_records = argv[1:] logging.info("Training on %s records: %s to %s", len(tf_records), tf_records[0], tf_records[-1]) with utils.logged_timer("Training"): train(*tf_records) if FLAGS.export_path: dual_net.export_model(FLAGS.export_path) if FLAGS.freeze: if FLAGS.use_tpu: dual_net.freeze_graph_tpu(FLAGS.export_path) else: dual_net.freeze_graph(FLAGS.export_path, FLAGS.use_trt, FLAGS.trt_max_batch_size, FLAGS.trt_precision)
def test_inference(self):
    """Smoke-test inference: two DualNetworks share one exported model."""
    with tempfile.TemporaryDirectory() as working_dir, \
            tempfile.TemporaryDirectory() as export_dir:
        dual_net.bootstrap(working_dir, **fast_hparams)
        exported_model = os.path.join(export_dir, 'bootstrap-model')
        dual_net.export_model(working_dir, exported_model)
        first_net = dual_net.DualNetwork(exported_model, **fast_hparams)
        first_net.run(go.Position())
        # In the past we've had issues initializing two separate NNs
        # in the same process... just double check that two DualNetwork
        # instances can live side by side.
        second_net = dual_net.DualNetwork(exported_model, **fast_hparams)
        second_net.run(go.Position())
def bootstrap(
        working_dir: 'tf.estimator working directory. If not set, defaults to a random tmp dir'=None,
        model_save_path: 'Where to export the first bootstrapped generation'=None):
    """Initialize random weights and export generation 0, timed by qmeas.

    Uses a throwaway temporary directory when no working_dir is supplied.
    Timing behavior matches the original: qmeas.stop_time is only reached
    on the success path.

    Args:
        working_dir: tf.estimator working directory, or None for a tmp dir.
        model_save_path: destination path for the exported model.
    """
    qmeas.start_time('bootstrap')

    def _bootstrap_into(work_dir):
        # Shared body for both branches; the original duplicated these
        # four statements verbatim.
        _ensure_dir_exists(work_dir)
        _ensure_dir_exists(os.path.dirname(model_save_path))
        dual_net.bootstrap(work_dir)
        dual_net.export_model(work_dir, model_save_path)

    if working_dir is None:
        with tempfile.TemporaryDirectory() as tmp_dir:
            _bootstrap_into(tmp_dir)
    else:
        _bootstrap_into(working_dir)
    qmeas.stop_time('bootstrap')
def main(argv): """Train on examples and export the updated model weights.""" tf_records = argv[1:] logging.info("Training on %s records: %s to %s", len(tf_records), tf_records[0], tf_records[-1]) with utils.logged_timer("Training"): estimator = train(*tf_records) if FLAGS.export_path: dual_net.export_model(FLAGS.export_path) estimator.export_saved_model(FLAGS.export_path, serving_input_receiver_fn()) else: estimator.export_saved_model('saved_model', serving_input_receiver_fn()) if FLAGS.freeze: if FLAGS.use_tpu: dual_net.freeze_graph_tpu(FLAGS.export_path) else: dual_net.freeze_graph(FLAGS.export_path)
def main(argv): """Train on examples and export the updated model weights.""" if FLAGS.dist_train: hvd.init() mll.global_batch_size(FLAGS.train_batch_size) mll.lr_rates(FLAGS.lr_rates) mll.lr_boundaries(FLAGS.lr_boundaries) tf_records = argv[1:] logging.info("Training on %s records: %s to %s", len(tf_records), tf_records[0], tf_records[-1]) with utils.logged_timer("Training"): train(*tf_records) if (not FLAGS.dist_train) or hvd.rank() == 0: if FLAGS.export_path: dual_net.export_model(FLAGS.export_path) if FLAGS.freeze: if FLAGS.use_tpu: dual_net.freeze_graph_tpu(FLAGS.export_path) else: dual_net.freeze_graph(FLAGS.export_path)
def main(unused_argv):
    """Bootstrap random weights."""
    utils.ensure_dir_exists(os.path.dirname(FLAGS.export_path))
    # Optionally skip weight initialization and only export.
    if FLAGS.create_bootstrap:
        dual_net.bootstrap()
    dual_net.export_model(FLAGS.export_path)