import time

import numpy as np

from othello.tf_utils import set_global_seeds

# VisionShape, A2CAgentV0 and A2CAgentV0Trainer are assumed to be importable
# from the surrounding othello package (their exact module paths are not shown
# in this file).


def learn(nn_class, game, seed=0, num_steps=5, total_timesteps=int(80e6),
          vision_shape: VisionShape = VisionShape(8, 8, 1),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5,
          lr=7e-4, epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=1000):
    set_global_seeds(seed)

    # save_name = os.path.join('models', env_id + '.save')

    # Note: the vision_shape argument is immediately overridden by the
    # game's own shape, so the parameter default is effectively unused.
    vision_shape = game.get_vision_shape()
    action_size = game.get_action_size()

    agent = A2CAgentV0(nn_class=nn_class,
                       vision_shape=vision_shape,
                       action_size=action_size,
                       num_steps=num_steps,
                       ent_coef=ent_coef,
                       vf_coef=vf_coef,
                       max_grad_norm=max_grad_norm,
                       lr=lr,
                       optimizer_alpha=alpha,
                       optimizer_epsilon=epsilon)

    # if os.path.exists(save_name):
    #     agent.load(save_name)

    trainer = A2CAgentV0Trainer(game, agent, num_steps=num_steps, gamma=gamma)

    tstart = time.time()
    for b in range(1, total_timesteps // num_steps + 1):
        # Collect a rollout of num_steps transitions and run one update.
        states, rewards, actions, values = trainer.get_a_training_batch()
        policy_loss, value_loss, policy_entropy = \
            agent.train(states, rewards, actions, values)

        time_cost_seconds = time.time() - tstart
        fps = int((b * num_steps) / time_cost_seconds)

        if b % log_interval == 0 or b == 1:
            print(' - - - - - - - ')
            print("nupdates", b)
            print("total_timesteps", b * num_steps)
            print("fps", fps)
            print("policy_entropy", float(policy_entropy))
            print("value_loss", float(value_loss))

            # Reward statistics over the last 100 completed episodes.
            r = trainer.total_rewards[-100:]
            tr = trainer.real_total_rewards[-100:]
            if len(r) == 100:
                print("avg reward (last 100):", np.mean(r))
            if len(tr) == 100:
                print("avg total reward (last 100):", np.mean(tr))
                print("max (last 100):", np.max(tr))
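# A minimal usage sketch, not part of the original file. `OthelloGame` and
# `SimpleCNN` are hypothetical stand-ins for whatever game object and network
# class the project provides; the only hard requirement learn() imposes is
# that the game expose get_vision_shape() and get_action_size().
#
# from othello.game import OthelloGame   # hypothetical import path
# from othello.nets import SimpleCNN     # hypothetical import path
#
# learn(nn_class=SimpleCNN, game=OthelloGame(), seed=0,
#       total_timesteps=int(1e6), log_interval=500)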
import gym

from othello.tf_utils import set_global_seeds

# A2CModel and A2CTrainer here are the player_3 variants (note the save path
# below); their import paths are assumed to mirror the a2c_player_4 layout:
# from othello.players.a2c_player_3.A2CModel import A2CModel     # assumed path
# from othello.players.a2c_player_3.A2CTrainer import A2CTrainer  # assumed path


def do_training():
    set_global_seeds(0)

    num_actions = 2
    num_hidden_units = 128
    model = A2CModel(num_actions, num_hidden_units)

    # Create the environment
    env = gym.make("CartPole-v0")

    trainer = A2CTrainer(env)
    trainer.train(model)
    model.save('./__models__/player_3/')
    return model
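if __name__ == '__main__':
    # Train on CartPole and keep the returned model; the weights are also
    # written to ./__models__/player_3/ by do_training() itself.
    trained_model = do_training()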
import logging

import tensorflow as tf

from othello.tf_utils import set_global_seeds
from othello.players.a2c_player_4.A2CModel8x8 import A2CModel
from othello.players.a2c_player_4.A2CTrainer import A2CTrainer
from othello.players.a2c_player_4.GameWrapper import GameWrapperInpatient


def do_training(board_size=8, max_episodes=10_000, optimizer_learn_rate=0.001,
                model_save_path='./__models__/a2c_player_4/',
                tensorboard_path='./__models__/a2c_player_4_tensorboard/',
                load_saved_model=False, game_reset_random=True):
    set_global_seeds(0)

    # One output per board cell plus one "pass" action.
    model = A2CModel(vision_shape=(board_size, board_size, 1),
                     num_actions=board_size * board_size + 1,
                     tensorboard_path=tensorboard_path)

    if load_saved_model:
        logging.info('Loading model...')
        model.load_model(model_save_path)
        logging.info('Model loaded from: ' + model_save_path)

    # Create the environment
    env = GameWrapperInpatient(board_size=board_size)

    trainer = A2CTrainer(env, model, optimizer_learn_rate=optimizer_learn_rate)
    # NOTE: the original call site is truncated in the source and continues
    # with further arguments; only the ones shown above are known.
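if __name__ == '__main__':
    # Example invocation based on the signature above: train from scratch on
    # an 8x8 board. Resuming from a checkpoint under model_save_path would
    # instead pass load_saved_model=True.
    do_training(board_size=8, load_saved_model=False)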