Example #1
import time

import numpy as np

# set_global_seeds, VisionShape, A2CAgentV0 and A2CAgentV0Trainer are provided
# by the surrounding project.
def learn(nn_class,
          game,
          seed=0,
          num_steps=5,
          total_timesteps=int(80e6),
          vision_shape: VisionShape = VisionShape(8, 8, 1),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4,
          epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=1000):
    set_global_seeds(seed)

    # save_name = os.path.join('models', env_id + '.save')
    # The game dictates the actual observation shape, overriding the default argument.
    vision_shape = game.get_vision_shape()
    action_size = game.get_action_size()
    agent = A2CAgentV0(nn_class=nn_class,
                       vision_shape=vision_shape,
                       action_size=action_size,
                       num_steps=num_steps,
                       ent_coef=ent_coef, vf_coef=vf_coef,
                       max_grad_norm=max_grad_norm,
                       lr=lr, optimizer_alpha=alpha, optimizer_epsilon=epsilon)
    # if os.path.exists(save_name):
    #     agent.load(save_name)
    trainer = A2CAgentV0Trainer(game, agent, num_steps=num_steps, gamma=gamma)

    tstart = time.time()
    # Each iteration collects num_steps transitions and performs one gradient update.
    for b in range(1, total_timesteps // num_steps + 1):
        states, rewards, actions, values = trainer.get_a_training_batch()
        policy_loss, value_loss, policy_entropy = \
            agent.train(states, rewards, actions, values)
        time_cost_seconds = time.time() - tstart
        fps = int((b * num_steps) / time_cost_seconds)
        if b % log_interval == 0 or b == 1:
            print(' - - - - - - - ')
            print("nupdates", b)
            print("total_timesteps", b * num_steps)
            print("fps", fps)
            print("policy_entropy", float(policy_entropy))
            print("value_loss", float(value_loss))

            # total reward
            r = trainer.total_rewards[-100:]  # get last 100
            tr = trainer.real_total_rewards[-100:]
            if len(r) == 100:
                print("avg reward (last 100):", np.mean(r))
            if len(tr) == 100:
                print("avg total reward (last 100):", np.mean(tr))
                print("max (last 100):", np.max(tr))
Example #2
import gym

# set_global_seeds, A2CModel and A2CTrainer are provided by the surrounding project.
def do_training():
    set_global_seeds(0)

    num_actions = 2
    num_hidden_units = 128

    model = A2CModel(num_actions, num_hidden_units)

    # Create the CartPole environment.
    env = gym.make("CartPole-v0")

    trainer = A2CTrainer(env)

    trainer.train(model)

    model.save('./__models__/player_3/')

    return model
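
Every example on this page begins with set_global_seeds(0); in Example #3 it is imported from othello.tf_utils, but its body is not shown here. A typical implementation, sketched below purely as an assumption, seeds Python's, NumPy's and TensorFlow's random number generators so that repeated runs are reproducible.

import random

import numpy as np
import tensorflow as tf


def set_global_seeds(seed):
    # Seed every RNG the training code may rely on.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)  # on TF 1.x this would be tf.set_random_seed(seed)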
Example #3
import logging

import tensorflow as tf

from othello.tf_utils import set_global_seeds
from othello.players.a2c_player_4.A2CModel8x8 import A2CModel
from othello.players.a2c_player_4.A2CTrainer import A2CTrainer
from othello.players.a2c_player_4.GameWrapper import GameWrapperInpatient


def do_training(board_size=8,
                max_episodes=10_000,
                optimizer_learn_rate=0.001,
                model_save_path='./__models__/a2c_player_4/',
                tensorboard_path='./__models__/a2c_player_4_tensorboard/',
                load_saved_model=False,
                game_reset_random=True):
    set_global_seeds(0)

    model = A2CModel(vision_shape=(board_size, board_size, 1),
                     num_actions=board_size * board_size + 1,
                     tensorboard_path=tensorboard_path)
    if load_saved_model:
        logging.info('Loading model...')
        model.load_model(model_save_path)
        logging.info('Model loaded from: ' + model_save_path)

    # Create the environment
    env = GameWrapperInpatient(board_size=board_size)

    trainer = A2CTrainer(env,
                         model,
                         optimizer_learn_rate=optimizer_learn_rate,