# Depends on Actor, Critic, ReplayBuffer, and random_string defined elsewhere
# in this repo, and on TensorFlow 1.x (tf).
class ActorCritic(object):
    def __init__(self, state_dim, action_dim, final_activation=tf.nn.tanh,
                 action_bound=0.4, training_batch_size=32, GAMMA=0.95,
                 lr=0.001, replay_buffer_size=1024):
        self.ID = random_string(10)
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.final_activation = final_activation
        self.action_bound = action_bound
        self.GAMMA = GAMMA
        self.lr = lr
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.training_batch_size = training_batch_size
        with tf.variable_scope(self.ID) as scope:
            self.actor = Actor(self.state_dim, self.action_dim,
                               self.action_bound, self.lr, self.final_activation)
            self.critic = Critic(self.state_dim, self.action_dim, self.lr)

    def add_to_replay_buffer(self, state, action, reward, resulting_state):
        self.replay_buffer.add(state, action, reward, resulting_state)

    def add_batch_to_replay_buffer(self, states, actions, rewards, resulting_states):
        for s, a, r, rs in zip(states, actions, rewards, resulting_states):
            self.replay_buffer.add(s, a, r, rs)

    def get_batch(self, training_batch_size=None):
        if not training_batch_size:
            training_batch_size = self.training_batch_size
        return self.replay_buffer.sample_batch(training_batch_size)

    def train_from_replay_buffer(self, should_print=False):
        # Known gap: terminal transitions should not bootstrap. For a
        # transition that ended the episode, the target should be just the
        # reward, without the GAMMA * next_q term.
        # Procedure: take the resulting state, predict an action for it with
        # the actor, predict the Q-value of that (state, action) pair with the
        # critic, form the target reward + GAMMA * next_q, then call
        # critic.optimize_q_val toward that target and update the actor along
        # the critic's action gradients.
        if not self.replay_buffer.size():
            print('buffer empty!')
            return 0
        states, actions, rewards, resulting_states = self.replay_buffer.sample_batch(
            self.training_batch_size)
        predicted_action = self.actor.get_actions(resulting_states)
        predicted_vals = self.critic.predict_q_val(resulting_states, predicted_action)
        true_vals = rewards + (self.GAMMA * predicted_vals)
        losses = self.critic.optimize_q_val(states, actions, true_vals)
        grads = self.critic.get_action_grads(states, actions)
        self.actor.train_from_batch(states, grads)
        if should_print:
            actual_q, out = self.critic.return_q_and_out(states, actions, true_vals)
            print('ACTUAL_Q: {}\n\n'.format(actual_q))
            print('OUT: {}'.format(out))
        return losses

    def get_actions(self, states):
        return self.actor.get_actions(states)
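# A minimal usage sketch of the ActorCritic wrapper above. Everything below is
# illustrative, not taken from the repo: gym, the 'Pendulum-v0' environment,
# the episode/step counts, and the assumption that get_actions accepts and
# returns a batch of states/actions.
import gym
import numpy as np

env = gym.make('Pendulum-v0')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
agent = ActorCritic(state_dim, action_dim,
                    action_bound=float(env.action_space.high[0]))

for episode in range(100):
    state = env.reset()
    for step in range(200):
        # wrap the single state into a batch of one, unwrap the single action
        action = agent.get_actions(np.reshape(state, (1, state_dim)))[0]
        next_state, reward, done, _ = env.step(action)
        agent.add_to_replay_buffer(state, action, reward, next_state)
        agent.train_from_replay_buffer()
        state = next_state
        if done:
            break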
        # Fragment: these two lines close a rollout loop from an earlier
        # snippet (not shown here).
        next_state = state
        if done:
            break

# Training script. Assumes a Gym environment `env` and a replay memory
# `memory` that has already been filled with transitions.
actor = Actor(env.action_space, env.observation_space)
critic = Critic(env.action_space, env.observation_space, actor.sess)

for ep in range(1000):  # batch train
    total_reward = 0
    env.reset()
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)

    for _ in range(1000):
        # training: sample a batch, bootstrap targets from
        # reward + gamma * Q(next_state, actor(next_state)), update the
        # critic, then update the actor along the critic's action gradients
        states, actions, rewards, next_states = memory.sample(20)
        next_actions = actor.get_actions(next_states)
        next_qs = critic.get_qs(next_states, next_actions)
        loss, q = critic.train(states, actions, rewards, next_qs)
        action_gradients = critic.get_action_gradients(states, actions)
        actor.train(states, action_gradients[0])

        # interaction: act in the environment and store the new transition
        env.render()
        action = actor.get_action_for_train(state, ep)
        next_state, reward, done, _ = env.step(action)
        memory.add((state, action, reward, next_state))
        total_reward += reward
        state = next_state
        if done:
            break
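# The training loop above relies on a replay memory object `memory` with two
# methods: add((state, action, reward, next_state)) and
# sample(batch_size) -> (states, actions, rewards, next_states).
# A minimal sketch of that interface follows; the class name, deque backing,
# and uniform sampling are assumptions, not taken from the repo. An instance
# (e.g. memory = Memory()) would be created and filled before the loops above.
import random
from collections import deque

import numpy as np


class Memory(object):
    def __init__(self, max_size=10000):
        self.buffer = deque(maxlen=max_size)

    def add(self, transition):
        # transition is a (state, action, reward, next_state) tuple
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states = map(np.array, zip(*batch))
        return states, actions, rewards, next_states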