def train_by_multi_gpus(n_gpu=1, epochs=None):
    """Train the latest model on sampled self-play files and save the result.

    The latest model is loaded and trained with one single-batch ``fit`` call
    per (epoch, worker) pair, on files sampled from the best model's
    self-play directory. It is then renamed to the next index and saved into
    ``conf['MODEL_DIR']``.

    Args:
        n_gpu: Number of GPUs to train on. When greater than 1 the model is
            wrapped with ``multi_gpu_model`` and compiled with SGD; otherwise
            the loaded model is trained as is.
        epochs: Number of outer epochs. Defaults to ``EPOCHS_PER_SAVE`` when
            ``None``.

    Returns:
        The loaded (template) model, carrying the new model name.
    """
    import tensorflow as tf

    logger.info("Training with %s GPUs", n_gpu)

    # For multi-GPU training the template model is loaded under the CPU
    # device scope before it is replicated by multi_gpu_model below.
    if n_gpu > 1:
        with tf.device('/cpu:0'):
            model = load_latest_model()
    else:
        model = load_latest_model()

    with tf.device('/cpu:0'):
        best_model = load_best_model()

    # Split on the LAST underscore only, so a base name that itself contains
    # underscores still yields a (base, index) pair.
    base_name, index = model.name.rsplit('_', 1)
    new_name = "_".join([base_name, str(int(index) + 1)]) + ".h5"

    all_data_file_names = get_file_names_data_dir(
        os.path.join(SELF_PLAY_DATA, best_model.name))

    tf_callback = TensorBoard(log_dir=os.path.join(conf['LOG_DIR'], new_name),
                              histogram_freq=conf['HISTOGRAM_FREQ'],
                              batch_size=BATCH_SIZE,
                              write_graph=False,
                              write_grads=False)
    nan_callback = TerminateOnNaN()

    if n_gpu > 1:
        pmodel = multi_gpu_model(model, gpus=n_gpu)
        opt = SGD(lr=1e-2, momentum=0.9)
        pmodel.compile(loss=loss, optimizer=opt, metrics=["accuracy"])
    else:
        pmodel = model

    if epochs is None:
        epochs = EPOCHS_PER_SAVE

    for epoch in tqdm.tqdm(range(epochs), desc="Epochs"):
        for worker in tqdm.tqdm(range(NUM_WORKERS), desc="Iteration"):
            # RANDOM because we use SGD (Stochastic Gradient Descent)
            files = sample(all_data_file_names, BATCH_SIZE)

            X = np.zeros((BATCH_SIZE, SIZE, SIZE, 17))
            policy_y = np.zeros((BATCH_SIZE, SIZE * SIZE + 1))
            value_y = np.zeros((BATCH_SIZE, 1))
            for j, filename in enumerate(files):
                # Open read-only: training must never create or modify
                # self-play data files.
                with h5py.File(filename, 'r') as f:
                    board = f['board'][:]
                    policy = f['policy_target'][:]
                    value_target = f['value_target'][()]
                    X[j] = board
                    policy_y[j] = policy
                    value_y[j] = value_target

            # Used as initial_epoch; `epochs` is to be understood as
            # "final epoch": the model is trained until the epoch of index
            # `epochs` is reached, i.e. exactly one epoch per call here.
            fake_epoch = epoch * NUM_WORKERS + worker
            pmodel.fit(X, [policy_y, value_y],
                       initial_epoch=fake_epoch,
                       epochs=fake_epoch + 1,
                       # Needed for TensorBoard histograms and gradients
                       validation_split=VALIDATION_SPLIT,
                       callbacks=[tf_callback, nan_callback],
                       verbose=0,
                       batch_size=BATCH_SIZE)

    # The template model (not the multi-GPU wrapper) is the one renamed and
    # written to disk.
    model.name = new_name.split('.')[0]
    model.save(os.path.join(conf['MODEL_DIR'], new_name))
    logger.info("Finished training with multi GPUs. New model %s saved", new_name)
    return model
def load_model(self):
    """Load the best and the latest models, logging the name of each.

    Returns:
        A ``(latest_model, best_model)`` tuple.
    """
    champion = load_best_model()
    logger.info("Loaded best model %s", champion.name)

    newest = load_latest_model()
    logger.info("Loaded latest %s", newest.name)

    return newest, champion
def main():
    """Run the train/evaluate loop forever, starting from an initial model.

    Prints the version banner, initialises the directories, and calls
    ``create_initial_model`` for ``model_1``. Each iteration then reloads the
    latest and best models, trains the latest one, evaluates it against the
    best one, and clears the Keras session.
    """
    banner = "Starting run (v{})".format(__version__)
    print(banner)

    init_directories()
    create_initial_model(name="model_1")

    while True:
        candidate = load_latest_model()
        champion = load_best_model()

        train(candidate, game_model_name=champion.name)
        evaluate(champion, candidate)

        # Drop the backend session state before the next iteration.
        K.clear_session()
def main():
    """Run the evaluate/train loop forever with on-demand GPU memory growth.

    Installs a TensorFlow session whose GPU options allow memory growth and
    initialises the directories. Each iteration then reloads the latest and
    best models, evaluates the latest against the best, trains the latest,
    and clears the Keras session.
    """
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session = tf.Session(config=session_config)
    K.set_session(session)

    init_directories()

    while True:
        candidate = load_latest_model()
        champion = load_best_model()

        # Unlike the other entry points, evaluation runs before training.
        evaluate(champion, candidate)
        train(candidate, game_model_name=champion.name)

        K.clear_session()
def main():
    """Run the train/evaluate loop forever, reading the best model from disk.

    Initialises the directories and calls ``create_initial_model`` for
    ``model_1``. Each iteration reloads the latest model, loads the best
    model from ``conf['MODEL_DIR']/conf['BEST_MODEL']`` with the custom loss
    registered, trains the latest model, and evaluates it against the best.
    """
    init_directories()
    create_initial_model(name="model_1")

    while True:
        candidate = load_latest_model()

        best_model_path = os.path.join(conf['MODEL_DIR'], conf['BEST_MODEL'])
        champion = load_model(best_model_path, custom_objects={'loss': loss})

        train(candidate, game_model_name=champion.name)
        evaluate(champion, candidate)
def main():
    """Run the full self-play/train/evaluate loop forever.

    Installs a TensorFlow session whose GPU options allow memory growth,
    initialises the directories, and calls ``create_initial_model`` for
    ``model_1``. Each iteration reloads the latest and best models, generates
    self-play games with the best model, trains the latest model, evaluates
    it against the best, and clears the Keras session.
    """
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session = tf.Session(config=session_config)
    K.set_session(session)

    init_directories()
    create_initial_model(name="model_1")

    while True:
        candidate = load_latest_model()
        champion = load_best_model()

        self_play(
            champion,
            n_games=conf['N_GAMES'],
            mcts_simulations=conf['MCTS_SIMULATIONS'],
        )
        train(candidate, game_model_name=champion.name)
        evaluate(champion, candidate)

        K.clear_session()
def main():
    """Evaluate once, then watch the model directory until interrupted.

    The latest model is evaluated against the best model immediately. A
    ``MyHandler`` is then scheduled on an ``Observer`` for
    ``conf['MODEL_DIR']`` (non-recursive), and the main thread sleeps in
    60-second steps until a ``KeyboardInterrupt`` stops the observer.
    """
    candidate = load_latest_model()
    champion = load_best_model()
    evaluate(champion, candidate)
    K.clear_session()

    handler = MyHandler()
    watcher = Observer()
    watched_dir = os.path.join(conf['MODEL_DIR'])
    watcher.schedule(handler, path=watched_dir, recursive=False)
    watcher.start()

    try:
        # Keep the main thread alive; the work happens in the handler.
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        watcher.stop()
    # Only reached after the interrupt handler above has run.
    watcher.join()
def load_model(self):
    """Load the best and latest models into ``self.best_model`` / ``self.latest_model``."""
    champion = load_best_model()
    self.best_model = champion

    newest = load_latest_model()
    self.latest_model = newest
def on_created(self, event):
    """Evaluate the latest model against the best one after a creation event.

    Args:
        event: The creation event; it is not inspected.
    """
    # Fixed 30-second delay before loading; presumably to let the new file
    # finish being written -- TODO confirm.
    time.sleep(30)

    candidate = load_latest_model()
    champion = load_best_model()
    evaluate(champion, candidate)

    K.clear_session()
def main():
    """Train the latest model from a ``KGSDataGenerator`` forever, with backups.

    Reads the training settings from ``conf``, restricts the visible CUDA
    devices to ``conf['GPUs']``, then loads and compiles the latest model.
    It then loops forever: each pass builds a fresh generator and a
    ``ReduceLROnPlateau`` callback, and runs
    ``EPOCHS_PER_SAVE // EPOCHS_PER_BACKUP`` cycles of ``fit_generator``.
    The model is saved to ``backup.h5`` in ``conf['MODEL_DIR']`` after every
    cycle.

    Raises:
        EnvironmentError: If ``conf['GPUs']`` lists one GPU or fewer.
    """
    init_directories()
    clean_up_empty()

    gpus = conf['GPUs']
    epochs_per_save = conf['EPOCHS_PER_SAVE']
    epochs_per_backup = conf['EPOCHS_PER_BACKUP']
    batch_size = conf['TRAIN_BATCH_SIZE']
    num_workers = conf['NUM_WORKERS']
    size = conf['SIZE']

    n_gpu = len(gpus)
    if n_gpu <= 1:
        raise EnvironmentError(
            "Number of GPU need > 1 for multi-gpus training")

    logger.info("STARTING TRAINING PHASE with %s GPUs", n_gpu)

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # Build a clean comma-separated id list ("0,1"), free of the brackets,
    # quotes and spaces that str(list) would leave behind.
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu) for gpu in gpus)

    global model
    model = load_latest_model()
    base_name, index = model.name.split('_')
    # NOTE(review): new_name is computed but never used below; only
    # backup.h5 is ever written. Confirm whether a save under new_name is
    # missing.
    new_name = "_".join([base_name, str(int(index) + 1)]) + ".h5"

    # NOTE(review): the model is never wrapped in multi_gpu_model (that code
    # was commented out), although more than one GPU is required above.
    opt = SGD(lr=1e-2, momentum=0.9, clipnorm=0.9)
    model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])

    params = {
        'dim': (size, size, 17),
        'batch_size': batch_size * n_gpu,
        'shuffle': True,
    }

    cycle = epochs_per_save // epochs_per_backup

    while True:
        # Generator and LR scheduler are rebuilt on every pass, so the
        # scheduler's plateau tracking restarts each time.
        training_generator = KGSDataGenerator([], None, **params)
        reduce_lr = ReduceLROnPlateau(monitor='policy_out_acc',
                                      factor=0.1,
                                      patience=3,
                                      verbose=1,
                                      mode='auto',
                                      min_lr=0)
        callbacks_list = [reduce_lr]

        for i in range(cycle):
            logger.info("CYCLE %s/%s", i + 1, cycle)
            model.fit_generator(
                generator=training_generator,
                use_multiprocessing=True,
                workers=num_workers,
                epochs=epochs_per_backup,
                verbose=1,
                callbacks=callbacks_list)
            model.save(os.path.join(conf['MODEL_DIR'], "backup.h5"))
            logger.info('Auto save model backup.h5')
def load_model(self):
    """Pin CUDA to ``self.gpu_id``, then load the best and latest models.

    Sets ``CUDA_DEVICE_ORDER`` and ``CUDA_VISIBLE_DEVICES`` in the process
    environment before loading, and stores the models on ``self.best_model``
    and ``self.latest_model``.
    """
    env = os.environ
    env["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    env["CUDA_VISIBLE_DEVICES"] = str(self.gpu_id)

    champion = load_best_model()
    self.best_model = champion

    newest = load_latest_model()
    self.latest_model = newest