Example #1
import os
import shutil
import sys
from datetime import datetime

import numpy as np
import torch

# logger, GracefulKiller, init_gym, run_policy, REINFORCEAgent, and util are
# assumed to be project-local helpers from the surrounding repository.


def main():
    torch.cuda.set_device(0)
    seed_num = 1
    torch.cuda.manual_seed(seed_num)
    #	data_dir = '/home/bike/data/mnist/'
    out_dir = '/home/becky/Git/reinforcement_learning_pytorch/log/REINFORCEMENT_{}/'.format(
        datetime.now())
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        shutil.copyfile(sys.argv[0], out_dir + '/REINFORCE_cart_pole.py')
    sys.stdout = logger.Logger(out_dir)
    env_name = 'CartPole-v0'
    killer = GracefulKiller()
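    # init_gym is assumed to build the gym environment and return it together
    # with its observation and action dimensions.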
    env, obs_dim, act_dim = init_gym(env_name)
    num_episodes = 300
    rewards = np.zeros(num_episodes)
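    # Note: despite its name, QValue below is a REINFORCE (policy-gradient)
    # agent, not a learned Q-value function.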
    QValue = REINFORCEAgent(obs_dim,
                            act_dim,
                            learning_rate=0.0001,
                            reward_decay=0.99,
                            e_greedy=0.9)
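    # Main loop: run_policy is assumed to roll out the current policy and
    # return a reward statistic for the iteration, which is logged below.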
    for i_episode in range(num_episodes):
        rewards[i_episode] = run_policy(env, QValue, episodes=100)
        print("In episode {}, the reward is {}".format(
            str(i_episode), str(rewards[i_episode])))
        if killer.kill_now:
            now = "REINFORCE_v1"
            QValue.save_model(str(now))
            break

    print('game over!')
    util.before_exit(model=QValue.model, reward=rewards)
    env.close()
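# Typical entry point, assuming the example is meant to run as a stand-alone script:
if __name__ == '__main__':
    main()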
Example #2
from collections import deque

import numpy as np
import tensorflow as tf  # this example uses the TensorFlow 1.x API

# REINFORCEEnv, REINFORCEAgent, run_policy, and build_train_set are assumed to
# be project-local helpers from the surrounding repository.


def compute_returns(trajectories, gamma=0.99):
    # Discounted Monte Carlo return G_t = r_t + gamma * G_{t+1}, accumulated
    # from the last step backwards. Each trajectory is assumed to be a dict
    # with a 'rewards' sequence; the default gamma here is an assumption.
    for trajectory in trajectories:
        rewards = trajectory['rewards']
        returns = np.zeros(len(rewards))
        g = 0
        for t in reversed(range(len(rewards))):
            g = rewards[t] + gamma * g
            returns[t] = g
        trajectory['returns'] = returns


seed = 0
env = REINFORCEEnv()
np.random.seed(seed)
tf.set_random_seed(seed)
env.seed(seed=seed)

obs_dim = env.observation_space.shape[0]
n_act = 7  # act_dim from config; alternatively env.action_space.n
agent = REINFORCEAgent(obs_dim, n_act, epochs=5, hdim=32, lr=3e-4, seed=seed)

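# Rolling windows of recent returns and losses for monitoring training progress.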
avg_return_list = deque(maxlen=1000)
avg_loss_list = deque(maxlen=1000)

episode_size = 1
batch_size = 16
nupdates = 10000

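# REINFORCE update loop: collect trajectories with the current policy, compute
# Monte Carlo returns, and take a policy-gradient step on the batch.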
for update in range(nupdates + 1):
    #print ('update: ', update)
    trajectories = run_policy(env, agent, episodes=episode_size)
    compute_returns(trajectories)
    observes, actions, returns = build_train_set(trajectories)

    pol_loss = agent.update(observes, actions, returns, batch_size=batch_size)