def __init__(self, use_conv, nets, dimO, dimA, obs, obs2, is_training, sess, scope='hyperactor'):
    self.actors = []
    with tf.variable_scope(scope):
        for i in range(FLAGS.num_options):
            actor = Actor(use_conv, nets, dimO, dimA, obs, obs2, is_training, sess, scope='actor%d' % i)
            self.actors.append(actor)
    super(HyperOptionsActor, self).__init__(use_conv, nets, dimO, [FLAGS.num_options], obs, obs2, is_training, sess, scope)
def main():
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    agent = Actor(state_dim, action_dim).to('cuda')
    agent.load_state_dict(torch.load('./Models/78.0_actor.pt'))

    eposide = 0
    done = False
    eposide_list = []
    while eposide < 100:
        eposide_reward = 0
        state = env.reset()
        state = (state - env.observation_space.low) / (
            env.observation_space.high - env.observation_space.low)
        state = to_tensor(state)
        while not done:
            action = agent.forward(state).detach().cpu().data.numpy()
            state_, reward, done, _ = env.step(action)
            state_ = (state_ - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            env.render()
            state = to_tensor(state_)
            eposide_reward += reward
        eposide_list.append(eposide_reward)
        eposide += 1
        done = False
        print("{} : {}".format(eposide, eposide_reward))

    import matplotlib.pyplot as plt
    x = np.arange(100)
    y = np.array(eposide_list)
    plt.plot(x, y)
    plt.savefig("./test_eposide_reward.png")
    env.close()
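# NormalizedEnv is used in main() above but not defined in this snippet. Below is a
# minimal sketch of the action-rescaling wrapper that name usually refers to; it is an
# assumption, not the project's own implementation, and it presumes a gym version whose
# ActionWrapper dispatches through .action()/.reverse_action().
import gym


class NormalizedEnv(gym.ActionWrapper):
    """Map actor outputs in [-1, 1] onto the wrapped env's action bounds."""

    def action(self, action):
        low, high = self.action_space.low, self.action_space.high
        return low + (action + 1.0) * 0.5 * (high - low)

    def reverse_action(self, action):
        low, high = self.action_space.low, self.action_space.high
        return 2.0 * (action - low) / (high - low) - 1.0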
def get_policy(dump: str, action_spec):
    state_dict = torch.load(dump, map_location="cpu")
    policy = Actor(*state_dict["args"].tolist())
    policy.load_state_dict(state_dict)
    policy.eval()

    @torch.no_grad()
    def _policy(time_step):
        state = np.concatenate(list(time_step.observation.values()))
        state_tensor = torch.tensor(state, dtype=torch.float)
        p = policy(state_tensor).numpy()
        return np.clip(p, action_spec.minimum, action_spec.maximum)

    return _policy
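# Hypothetical usage sketch for get_policy() with a dm_control task. The checkpoint
# path and the domain/task names below are placeholders, not values from the
# original code.
from dm_control import suite

env = suite.load(domain_name="walker", task_name="walk")
policy_fn = get_policy("actor_checkpoint.pt", env.action_spec())

time_step = env.reset()
total_reward = 0.0
while not time_step.last():
    action = policy_fn(time_step)
    time_step = env.step(action)
    total_reward += time_step.reward or 0.0
print("return:", total_reward)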
class Action():

    def __init__(self, state_dim, action_dim):
        self.actor = Actor(state_dim, action_dim)
        self.actor.eval()
        self.noise = OrnsteinUhlenbeckActionNoise(action_dim)
        self.to_tensor = util.to_tensor

    def chose_action(self, state, explort):
        if explort:
            a0 = self.get_exploration_action(state)
        else:
            a0 = self.get_exploitation_action(state)
        return a0

    def get_exploitation_action(self, state):
        '''Return the action computed by the actor network for the given state, without exploration.

        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.actor.forward(self.to_tensor(state)).squeeze(0)
        action = action.cpu().data.numpy()
        return action

    def get_exploration_action(self, state):
        '''Return the actor-network action for the given state with added noise, to simulate exploration.

        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.actor.forward(self.to_tensor(state)).squeeze(0)
        new_action = action.cpu().data.numpy() + (self.noise.sample())
        new_action = new_action.clip(min=-1, max=1)
        return new_action

    def load_param(self, source_model):
        self.actor.load_state_dict(source_model.state_dict())
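# Hypothetical usage of the Action helper above; the dimensions and the random state
# are illustrative placeholders, and util.to_tensor is assumed to accept a raw numpy array.
import numpy as np

if __name__ == '__main__':
    selector = Action(state_dim=3, action_dim=1)
    state = np.random.randn(3).astype(np.float32)
    greedy_a = selector.chose_action(state, explort=False)  # deterministic actor output
    noisy_a = selector.chose_action(state, explort=True)    # OU noise added, clipped to [-1, 1]
    print(greedy_a, noisy_a)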
REPLACE_ITER_A = args.target_update_a
REPLACE_ITER_C = args.target_update_c
GAMMA = args.gamma

env = Env()
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.Session(config=config)

# Create actor and critic.
actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)

M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)

saver = tf.train.Saver()
path = './checkpoints'
saver.restore(sess, tf.train.latest_checkpoint(path))


def eval():
    s = env.reset()
    while True:
        a = actor.choose_action(s)
        s_, r, done, collision = env.step(a)
from memory import *

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    env = env.unwrapped
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high
    var = 3.

    with tf.Session() as sess:
        memory = Memory(32, 10000)
        actor = Actor(sess, state_dim, action_bound, lr=0.01, tau=0.01)
        critic = Critic(sess, state_dim, actor.s, actor.s_, actor.a, actor.a_, gamma=0.9, lr=0.001, tau=0.01)
        t = critic.get_gradients()
        actor.generate_gradients(t)
        sess.run(tf.global_variables_initializer())

        for i in range(1000):
            s = env.reset()
            r_episode = 0
            for j in range(200):
                a = actor.choose_action(s)
                a = np.clip(np.random.normal(a, var), -action_bound, action_bound)  # off-policy exploration noise
                s_, r, done, info = env.step(a)
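# Illustrative sketch only: the Actor/Critic classes above take tau=0.01, which implies
# a soft target-network update of the form target <- (1 - tau) * target + tau * online.
# The helper below shows that update in TF1 terms; the scope names are hypothetical and
# the project's own classes presumably build an equivalent op internally.
def make_soft_update_ops(online_scope, target_scope, tau):
    online_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=online_scope)
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope)
    return [tf.assign(t, (1.0 - tau) * t + tau * o)
            for t, o in zip(target_vars, online_vars)]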
env = gym.make('Duckietown-udem1-v0')

# Wrappers
env = ResizeWrapper(env)
env = NormalizeWrapper(env)
env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
env = ActionWrapper(env)
env = DtRewardWrapper(env)

state_size = env.observation_space.shape
action_size = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

actor_agent = Actor(state_size, action_size, max_action)
actor_path = torch.load(
    '/home/ivlabs/users/sharath/final_year_thesis/ddpg_models/checkpoint_13_actor.pth'
)
actor_agent.load_state_dict(actor_path)

stack_size = 4
stacked_frames = deque(
    [np.zeros((120, 160), dtype=np.int) for i in range(stack_size)],
    maxlen=4)

state = env.reset()
with torch.no_grad():
    while True:
        state = env.reset()
        state, stacked_frames = stack_images(stacked_frames, state, True)
        rewards = []
        while True:
            state = torch.from_numpy(state).float()
class Model(object):

    def __init__(self, env: callable, state_shape: list, action_size: int,
                 q_network_shape: tuple, mu_network_shape: tuple,
                 buffer_size: int, gamma: float, tau: float,
                 noise_stddev: float, save_dir: str,
                 actor_learning_rate: float, critic_learning_rate: float,
                 batch_size: int, episode: int, train_epoch: int,
                 run_epoch: int = 100, action_reshape: callable = None):
        self.env = env
        self.batch_size = batch_size
        self.state_shape = state_shape
        self.action_size = action_size
        self.q_network_shape = q_network_shape
        self.mu_network_shape = mu_network_shape
        self.buffer_size = buffer_size
        self.noise_stddev = noise_stddev
        self.episode = episode
        self.train_epoch = train_epoch
        self.dir = save_dir
        self.action_reshape = action_reshape
        self.run_epoch = run_epoch

        state_i = tf.placeholder("float32", [batch_size] + state_shape)
        state_i_next = tf.placeholder("float32", [batch_size] + state_shape)
        action_i = tf.placeholder("float32", [batch_size, action_size])

        mu_apo = Mu_Model("Mu_apo", state_i_next, action_size, mu_network_shape, batch_size, trainable=False)
        q = Q_Model("Q_0", state_i, action_i, q_network_shape, batch_size)
        mu = Mu_Model("Mu_0", state_i, action_size, mu_network_shape, batch_size, y_grads=q.a_grads)
        q_apo = Q_Model("Q_apo", state_i_next, mu_apo.a, q_network_shape, batch_size, trainable=False)

        self._actor = Actor(mu, mu_apo, gamma, tau, actor_learning_rate)
        self._critic = Critic(q, q_apo, gamma, tau, batch_size, critic_learning_rate)

        self._s_buf = ReplayBuf(buffer_size, self.state_shape)
        self._a_buf = ReplayBuf(buffer_size, [self.action_size])
        self._r_buf = ReplayBuf(buffer_size, [1])
        self._s_next_buf = ReplayBuf(buffer_size, self.state_shape)

        self._sess = None
        self._saver = tf.train.Saver()

    def train(self):
        # input definition
        s_i = self._actor.s
        s_i_next = self._actor.s_apo
        a_i = self._critic.a
        r_i = self._critic.reward

        # data container definition
        data_s_i = np.zeros([self.batch_size] + self.state_shape)

        ck_pt = tf.train.get_checkpoint_state(self.dir)
        if ck_pt is not None:
            self._sess = tf.Session()
            self._saver.restore(self._sess, ck_pt.model_checkpoint_path)
            '''
            except:
                print("Load model error")
                self._sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
            '''
        else:
            self._sess = tf.Session()
            self._sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer()
            ])
        self._sess.run([
            self._actor.init_target_net(),
            self._critic.init_target_net()
        ])

        count = 0
        for _ in range(self.episode):
            end_state = self.env.reset()
            while True:
                try:
                    self.env.render()
                except:
                    pass
                start_state = end_state
                data_s_i[0] = start_state
                # get an action, a = Mu(s) + noise
                action = self._sess.run(self._actor.a, {s_i: data_s_i})[0] + np.random.normal(
                    0, scale=self.noise_stddev, size=self.action_size)
                if self.action_reshape is not None:
                    end_state, reward, _done, _ = self.env.step(self.action_reshape(action))
                else:
                    end_state, reward, _done, _ = self.env.step(action)
                # print("Action: {}".format(action))
                # print("Reward: {}".format(reward))

                self._s_buf.append(start_state)
                self._a_buf.append(action)
                self._r_buf.append(np.array([reward]))
                self._s_next_buf.append(end_state)
                count += 1

                if _done:  # final state
                    break

                if len(self._s_buf) >= self.batch_size * 10 and count >= self.run_epoch:
                    print("Action: {}".format(action))
                    count = 0
                    loss = np.zeros([0])
                    q = np.zeros([0])
                    for i in range(self.train_epoch):
                        sample = list(range(len(self._s_buf)))
                        np.random.shuffle(sample)
                        # get batch
                        data_s_i = self._s_buf.get_by_indexes(sample[:self.batch_size])
                        data_a_i = self._a_buf.get_by_indexes(sample[:self.batch_size])
                        data_r_i = self._r_buf.get_by_indexes(sample[:self.batch_size])
                        data_s_i_next = self._s_next_buf.get_by_indexes(sample[:self.batch_size])

                        # minimize critic loss
                        _, loss = self._sess.run(
                            [self._critic.minimize_loss(), self._critic.loss],
                            {s_i: data_s_i, a_i: data_a_i,
                             s_i_next: data_s_i_next, r_i: data_r_i})

                        # maximize actor-critic value
                        a = self._sess.run(self._actor.a, {s_i: data_s_i})
                        _, a = self._sess.run(
                            [self._actor.maximize_action_q(), self._actor.a],
                            {s_i: data_s_i, a_i: a})

                        # calculate q value
                        q = self._sess.run(self._critic.Q, {s_i: data_s_i, a_i: a})

                    # update target networks
                    self._sess.run(self._critic.update_target_net())
                    self._sess.run(self._actor.update_target_net())
                    print("Average loss: {}".format(loss.mean()))
                    print("Average Q value: {}".format(q.mean()))
session = tf.Session()

actors = []
critics = []
actors_noise = []
memories = []

# actors & critics
for i in range(env.n):
    n_action = env.action_space[i].n
    state_size = env.observation_space[i].shape[0]
    state = tf.placeholder(tf.float32, shape=[None, state_size])
    reward = tf.placeholder(tf.float32, [None, 1])
    state_next = tf.placeholder(tf.float32, shape=[None, state_size])
    speed = 0.8 if env.agents[i].adversary else 1

    actors.append(Actor('actor' + str(i), session, n_action, speed, state, state_next))
    critics.append(Critic('critic' + str(i), session, n_action, actors[i].eval_actions,
                          actors[i].target_actions, state, state_next, reward))
    actors[i].add_gradients(critics[i].action_gradients)
    actors_noise.append(OrnsteinUhlenbeckActionNoise(
        mu=ou_mus[i],
        sigma=ou_sigma[i],
        theta=ou_theta[i],
        dt=ou_dt[i],
        x0=ou_x0[i]))
    memories.append(Memory(args.memory_size))

session.run(tf.global_variables_initializer())
saver = tf.train.Saver(max_to_keep=10000000)
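# For reference, the constructor arguments above (mu, sigma, theta, dt, x0) match the
# standard Ornstein-Uhlenbeck noise process commonly paired with DDPG-style actors. A
# minimal sketch of that process is shown below; the project's own
# OrnsteinUhlenbeckActionNoise class may differ in detail.
class OUNoiseSketch:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta, self.dt, self.x0 = mu, sigma, theta, dt, x0
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt +
             self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)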
class TD3(nn.Module):
    def __init__(
        self,
        d_state,
        d_action,
        device,
        gamma,
        tau,
        policy_lr,
        value_lr,
        value_loss,
        value_n_layers,
        value_n_units,
        value_activation,
        policy_n_layers,
        policy_n_units,
        policy_activation,
        grad_clip,
        policy_delay=2,
        policy_noise=0.2,
        noise_clip=0.5,
        expl_noise=0.1,
        tdg_error_weight=0,
        td_error_weight=1,
    ):
        super().__init__()

        self.actor = Actor(d_state, d_action, policy_n_layers, policy_n_units, policy_activation).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = RAdam(self.actor.parameters(), lr=policy_lr)

        self.critic = ActionValueFunction(d_state, d_action, value_n_layers, value_n_units, value_activation).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = RAdam(self.critic.parameters(), lr=value_lr)

        self.discount = gamma
        self.tau = tau
        self.policy_delay = policy_delay
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.expl_noise = expl_noise
        self.normalizer = None
        self.value_loss = value_loss
        self.grad_clip = grad_clip
        self.device = device
        self.last_actor_loss = 0
        self.tdg_error_weight = tdg_error_weight
        self.td_error_weight = td_error_weight

        self.step_counter = 0

    def setup_normalizer(self, normalizer):
        self.normalizer = copy.deepcopy(normalizer)

    def get_action(self, states, deterministic=False):
        states = states.to(self.device)
        with torch.no_grad():
            if self.normalizer is not None:
                states = self.normalizer.normalize_states(states)
            actions = self.actor(states)
            if not deterministic:
                actions += torch.randn_like(actions) * self.expl_noise
            return actions.clamp(-1, +1)

    def get_action_with_logp(self, states):
        states = states.to(self.device)
        if self.normalizer is not None:
            states = self.normalizer.normalize_states(states)
        a = self.actor(states)
        return a, torch.ones(a.shape[0], device=a.device) * np.inf  # inf: should not be used

    def get_action_value(self, states, actions):
        with torch.no_grad():
            states = states.to(self.device)
            actions = actions.to(self.device)
            return self.critic(states, actions)[0]  # just q1

    def update(self, states, actions, logps, rewards, next_states, masks):
        if self.normalizer is not None:
            states = self.normalizer.normalize_states(states)
            next_states = self.normalizer.normalize_states(next_states)
        self.step_counter += 1

        # Select action according to policy and add clipped noise
        noise = (torch.randn_like(actions) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
        raw_next_actions = self.actor_target(next_states)
        next_actions = (raw_next_actions + noise).clamp(-1, 1)

        # Compute the target Q value
        next_Q1, next_Q2 = self.critic_target(next_states, next_actions)
        next_Q = torch.min(next_Q1, next_Q2)
        q_target = rewards.unsqueeze(1) + self.discount * masks.float().unsqueeze(1) * next_Q
        zero_targets = torch.zeros_like(q_target, device=self.device)

        # Get current Q estimates
        q1, q2 = self.critic(states, actions)
        q1_td_error, q2_td_error = q_target - q1, q_target - q2

        critic_loss = torch.tensor(0, device=self.device)
        standard_loss = torch.tensor(0, device=self.device)
        gradient_loss = torch.tensor(0, device=self.device)

        if self.td_error_weight != 0:
            # Compute standard critic loss
            if self.value_loss == 'huber':
                standard_loss = 0.5 * (F.smooth_l1_loss(q1_td_error, zero_targets) +
                                       F.smooth_l1_loss(q2_td_error, zero_targets))
            elif self.value_loss == 'mse':
                standard_loss = 0.5 * (F.mse_loss(q1_td_error, zero_targets) +
                                       F.mse_loss(q2_td_error, zero_targets))
            critic_loss = critic_loss + self.td_error_weight * standard_loss

        if self.tdg_error_weight != 0:
            # Compute gradient critic loss
            gradients_error_norms1 = torch.autograd.grad(
                outputs=q1_td_error,
                inputs=actions,
                grad_outputs=torch.ones(q1_td_error.size(), device=self.device),
                retain_graph=True,
                create_graph=True,
                only_inputs=True)[0].flatten(start_dim=1).norm(dim=1, keepdim=True)
            gradients_error_norms2 = torch.autograd.grad(
                outputs=q2_td_error,
                inputs=actions,
                grad_outputs=torch.ones(q2_td_error.size(), device=self.device),
                retain_graph=True,
                create_graph=True,
                only_inputs=True)[0].flatten(start_dim=1).norm(dim=1, keepdim=True)

            if self.value_loss == 'huber':
                gradient_loss = 0.5 * (F.smooth_l1_loss(gradients_error_norms1, zero_targets) +
                                       F.smooth_l1_loss(gradients_error_norms2, zero_targets))
            elif self.value_loss == 'mse':
                gradient_loss = 0.5 * (F.mse_loss(gradients_error_norms1, zero_targets) +
                                       F.mse_loss(gradients_error_norms2, zero_targets))
            critic_loss = critic_loss + self.tdg_error_weight * gradient_loss

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_value_(self.critic.parameters(), self.grad_clip)
        self.critic_optimizer.step()

        if self.step_counter % self.policy_delay == 0:
            # Compute actor loss
            q1, q2 = self.critic(states, self.actor(states))  # originally in TD3 we had here q1 only
            q_min = torch.min(q1, q2)
            actor_loss = -q_min.mean()
            self.last_actor_loss = actor_loss.item()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_value_(self.actor.parameters(), self.grad_clip)
            self.actor_optimizer.step()

            # Update the frozen target policy
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            # Update the frozen target value function
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        return (raw_next_actions[0, 0].item(),
                self.td_error_weight * standard_loss.item(),
                self.tdg_error_weight * gradient_loss.item(),
                self.last_actor_loss)

    @staticmethod
    def catastrophic_divergence(q_loss, pi_loss):
        return q_loss > 1e2 or (pi_loss is not None and abs(pi_loss) > 1e5)
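# Hypothetical usage sketch for the TD3 agent above. All hyperparameter values, the
# 'relu' activation strings, and the random batch are placeholders; Actor,
# ActionValueFunction, and RAdam are assumed to be importable from the surrounding
# project exactly as in the class definition.
device = torch.device('cpu')
agent = TD3(d_state=17, d_action=6, device=device, gamma=0.99, tau=0.005,
            policy_lr=3e-4, value_lr=3e-4, value_loss='huber',
            value_n_layers=2, value_n_units=256, value_activation='relu',
            policy_n_layers=2, policy_n_units=256, policy_activation='relu',
            grad_clip=1.0)

batch_size = 256
states = torch.randn(batch_size, 17)
actions = agent.get_action(states)          # exploration noise added unless deterministic=True
next_states = torch.randn(batch_size, 17)
rewards = torch.randn(batch_size)
masks = torch.ones(batch_size)              # 1.0 while the episode is not done
logps = None                                # ignored by TD3.update
stats = agent.update(states, actions, logps, rewards, next_states, masks)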