def __init__(self, action_dim, action_bound, tau, lr_a, lr_c, state_dim,
             gamma, batch_size):
    # Placeholders for the critic target, the current state and the next state
    self.target = tf.placeholder(tf.float32, [None, 1], 'critic_target')
    self.s = tf.placeholder(tf.float32, [None, state_dim], 'state')
    self.s_ = tf.placeholder(tf.float32, [None, state_dim], 'next_state')

    self.memory = ReplayBuffer(max_size=10000)
    self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
    self.batch_size = batch_size
    self.gamma = gamma

    self.sess = tf.Session()
    self.actor = Actor(self.sess, self.s, self.s_, action_dim, action_bound,
                       tau, lr_a, f1_units=300)
    self.critic = Critic(self.sess, lr_c, self.s, self.s_, self.actor.a,
                         self.actor.a_, self.target, tau, gamma, state_dim,
                         action_dim, f1_units=300)
    # Feed dQ/da from the critic into the actor's policy-gradient update
    self.actor.add_grad_to_graph(self.critic.a_g)

    self.sess.run(tf.global_variables_initializer())
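# The Actor/Critic classes wired together above are expected to build their
# own target networks from `tau`. A minimal sketch of the usual TF1 soft-update
# pattern they would rely on (the variable scopes 'eval' and 'target' are
# assumptions, not names taken from the snippet):
import tensorflow as tf

def build_soft_update_ops(tau, eval_scope='eval', target_scope='target'):
    eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope=eval_scope)
    target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope=target_scope)
    # theta_target <- tau * theta_eval + (1 - tau) * theta_target
    return [tf.assign(t, (1.0 - tau) * t + tau * e)
            for t, e in zip(target_params, eval_params)]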
def train(
        env,
        actor_learning_rate=0.0001,
        critic_learning_rate=0.001,
        gamma=0.99,
        tau=0.001,
        max_episodes=100,
        buffer_size=1000000,
        batch_size=64,
        plot_flag=True,
        verbose=True,
        save_dir=None,
):
    # Reuse cached results if this configuration has already been trained
    if save_dir and os.path.exists(os.path.join(save_dir, "epi_rwds.npy")):
        return np.load(os.path.join(save_dir, "epi_rwds.npy"))
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # setup learner
    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_gain = env.action_space.high
    learner = DDPG(
        obs_dim,
        action_dim,
        action_gain,
        actor_learning_rate=actor_learning_rate,
        critic_learning_rate=critic_learning_rate,
        gamma=gamma,
        tau=tau,
    )

    # setup other requirements - replay buffer, exploration noise
    buffer = ReplayBuffer(buffer_size)
    noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
    epi_rwds = []

    # train for max_episodes + 200 episodes
    for e in range(0, max_episodes + 200):
        print('game no.', e, end=' ')
        run_episode(learner, env, noise, buffer, batch_size=batch_size)
        if e % 10 == 0:
            # testing without noise at the end of each epoch
            # one epoch = 10 episodes
            e_rwds = 0
            for _ in range(10):
                e_rwds += run_episode(learner, env, None, buffer,
                                      train_flag=False)
            print("End of epoch # {} | {} training episodes completed | "
                  "Average test reward = {}".format(e // 10, e, e_rwds / 10))
            epi_rwds.append(e_rwds / 10)

    # save trained model
    if save_dir:
        learner.save_model(save_dir)

    # plot training curve
    if plot_flag:
        plt.plot(epi_rwds)
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        if save_dir:
            plt.savefig(os.path.join(save_dir, "training_curve.png"))
        plt.show()
    return epi_rwds
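# A hypothetical driver for train() above, assuming it lives alongside the
# DDPG / ReplayBuffer / run_episode helpers it references:
import gym

env = gym.make("Pendulum-v0")
epi_rwds = train(env, max_episodes=100, save_dir="./pendulum_ddpg")
print("best averaged test reward:", max(epi_rwds))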
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    parser.add_argument("--input-shape", type=list, default=[None, 1])
    parser.add_argument("--epochs", type=int, default=30000)
    parser.add_argument('--tau', help='soft target update parameter',
                        default=0.001)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--gamma", type=float, default=.99)
    parser.add_argument("--K", type=int, default=1,
                        help='The number of steps to train the environment')
    parser.add_argument(
        "--L", type=int, default=1,
        help='The number of Q-learning steps for hypothetical rollouts')
    parser.add_argument("--latent-size", type=int, default=4,
                        help='Size of vector for Z')
    args = parser.parse_args()

    # Initialize environment
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.input_shape = [None, args.state_dim]
    args.action_dim = env.action_space.shape[0]
    # assert args.action_dim == 1
    args.action_bound = env.action_space.high
    print(args)

    # Networks
    actor_source = actor(state_shape=[None, args.state_dim],
                         action_shape=[None, args.action_dim],
                         output_bound=args.action_bound[0],
                         scope='actor_source')
    critic_source = critic(state_shape=[None, args.state_dim],
                           action_shape=[None, args.action_dim],
                           scope='critic_source')
    actor_target = actor(state_shape=[None, args.state_dim],
                         action_shape=[None, args.action_dim],
                         output_bound=args.action_bound[0],
                         scope='actor_target')
    critic_target = critic(state_shape=[None, args.state_dim],
                           action_shape=[None, args.action_dim],
                           scope='critic_target')

    # Initialize the GANs that model next-state and reward
    cgan_state = CGAN(input_shape=args.input_shape,
                      action_size=args.action_dim,
                      latent_size=args.latent_size,
                      gen_input_shape=args.input_shape,
                      continuous_action=True)
    cgan_reward = CGAN(input_shape=args.input_shape,
                       action_size=args.action_dim,
                       latent_size=args.latent_size,
                       gen_input_shape=[None, 1],
                       continuous_action=True)

    # Update and copy operators
    update_target_actor = update_target_graph2('actor_source', 'actor_target',
                                               args.tau)
    update_target_critic = update_target_graph2('critic_source',
                                                'critic_target', args.tau)
    copy_target_actor = update_target_graph2('actor_source', 'actor_target', 1.)
    copy_target_critic = update_target_graph2('critic_source',
                                              'critic_target', 1.)

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Actor noise
    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(args.action_dim))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(copy_target_critic)
        sess.run(copy_target_actor)
        for epoch in range(args.epochs):
            state = env.reset()
            total_rewards = 0.0
            while True:
                # env.render()
                # Choose an action
                action = sess.run(
                    actor_source.action,
                    feed_dict={actor_source.states: state[np.newaxis, ...]
                               })[0] + actor_noise()
                # Execute action
                state1, reward, done, _ = env.step(action)
                total_rewards += float(reward)
                # Store tuple in replay memory
                memory.add([state[np.newaxis, ...],
                            action[np.newaxis, ...],
                            reward,
                            state1[np.newaxis, ...],
                            done])

                # Training step: update actor-critic using real experience
                batch = np.array(memory.sample(args.batch_size))
                assert len(batch) > 0
                states = np.concatenate(batch[:, 0], axis=0)
                actions = np.concatenate(batch[:, 1], axis=0)
                rewards = batch[:, 2]
                states1 = np.concatenate(batch[:, 3], axis=0)
                dones = batch[:, 4]

                # Update the critic
                actions1 = sess.run(actor_target.action,
                                    feed_dict={actor_target.states: states1})
                targetQ = np.squeeze(
                    sess.run(critic_target.Q,
                             feed_dict={critic_target.states: states1,
                                        critic_target.actions: actions1}),
                    axis=-1)
                targetQ = rewards + (
                    1. - dones.astype(np.float32)) * args.gamma * targetQ
                targetQ = targetQ[..., np.newaxis]
                _, critic_loss = sess.run(
                    [critic_source.critic_solver, critic_source.loss],
                    feed_dict={critic_source.states: states,
                               critic_source.actions: actions,
                               critic_source.targetQ: targetQ})

                # Update the actor
                critic_grads = sess.run(
                    critic_source.grads,
                    feed_dict={critic_source.states: states,
                               critic_source.actions: actions}
                )[0]  # Grab gradients from critic
                _ = sess.run(actor_source.opt,
                             feed_dict={actor_source.states: states,
                                        actor_source.dQ_by_da: critic_grads})

                # Update target networks
                sess.run(update_target_critic)
                sess.run(update_target_actor)

                # Training step: update the environment model using real
                # experience (i.e., update the conditional GANs)
                for k in range(args.K):
                    batch = np.array(memory.sample(args.batch_size))
                    states = np.concatenate(batch[:, 0], axis=0)
                    actions = np.concatenate(batch[:, 1], axis=0)
                    rewards = batch[:, 2]
                    states1 = np.concatenate(batch[:, 3], axis=0)
                    _, D_loss_state = sess.run(
                        [cgan_state.D_solver, cgan_state.D_loss],
                        feed_dict={cgan_state.states: states,
                                   cgan_state.actions: actions,
                                   cgan_state.Z: sample_z(len(batch),
                                                          args.latent_size),
                                   cgan_state.X: states1})
                    _, G_loss_state = sess.run(
                        [cgan_state.G_solver, cgan_state.G_loss],
                        feed_dict={cgan_state.states: states,
                                   cgan_state.actions: actions,
                                   cgan_state.Z: sample_z(len(batch),
                                                          args.latent_size)})
                    _, D_loss_reward = sess.run(
                        [cgan_reward.D_solver, cgan_reward.D_loss],
                        feed_dict={cgan_reward.states: states,
                                   cgan_reward.actions: actions,
                                   cgan_reward.Z: sample_z(len(batch),
                                                           args.latent_size),
                                   cgan_reward.X: rewards[..., np.newaxis]})
                    _, G_loss_reward = sess.run(
                        [cgan_reward.G_solver, cgan_reward.G_loss],
                        feed_dict={cgan_reward.states: states,
                                   cgan_reward.actions: actions,
                                   cgan_reward.Z: sample_z(len(batch),
                                                           args.latent_size)})
                    # print(D_loss_state, G_loss_state, D_loss_reward, G_loss_reward)

                # Training step: update actor-critic using imagination rollouts
                for l in range(args.L):
                    batch = np.array(memory.sample(args.batch_size))
                    states_ = np.concatenate(batch[:, 3], axis=0)
                    actions = np.random.uniform(
                        env.action_space.low[0],
                        env.action_space.high[0],
                        size=(len(batch), env.action_space.shape[0]))
                    dones = np.array([False] * len(batch))
                    G_sample_state = sess.run(
                        cgan_state.G_sample,
                        feed_dict={cgan_state.states: states_,
                                   cgan_state.actions: actions,
                                   cgan_state.Z: sample_z(len(batch),
                                                          args.latent_size)})
                    G_sample_reward = sess.run(
                        cgan_reward.G_sample,
                        feed_dict={cgan_reward.states: states_,
                                   cgan_reward.actions: actions,
                                   cgan_reward.Z: sample_z(len(batch),
                                                           args.latent_size)})
                    G_sample_reward = np.squeeze(G_sample_reward, axis=-1)

                    # Update the critic
                    actions1 = sess.run(
                        actor_target.action,
                        feed_dict={actor_target.states: G_sample_state})
                    targetQ = np.squeeze(
                        sess.run(critic_target.Q,
                                 feed_dict={
                                     critic_target.states: G_sample_state,
                                     critic_target.actions: actions1}),
                        axis=-1)
                    targetQ = G_sample_reward + (
                        1. - dones.astype(np.float32)) * args.gamma * targetQ
                    targetQ = targetQ[..., np.newaxis]
                    _, critic_loss = sess.run(
                        [critic_source.critic_solver, critic_source.loss],
                        feed_dict={critic_source.states: states_,
                                   critic_source.actions: actions,
                                   critic_source.targetQ: targetQ})

                    # Update the actor
                    critic_grads = sess.run(
                        critic_source.grads,
                        feed_dict={critic_source.states: states_,
                                   critic_source.actions: actions}
                    )[0]  # Grab gradients from critic
                    _ = sess.run(actor_source.opt,
                                 feed_dict={actor_source.states: states_,
                                            actor_source.dQ_by_da: critic_grads})

                    # Update target networks
                    sess.run(update_target_critic)
                    sess.run(update_target_actor)

                state = np.copy(state1)
                if done:
                    print('epoch', epoch, 'total rewards', total_rewards)
                    break
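# sample_z() is used above to draw latent vectors for the conditional GANs but
# is not defined in the snippet. A minimal sketch assuming a uniform latent
# prior (the original code may use a Gaussian instead):
def sample_z(batch_size, latent_size):
    return np.random.uniform(-1., 1., size=(batch_size, latent_size))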
CRITIC_LEARNING_RATE = 0.001
BUFFER_SIZE = 1000000
MINIBATCH_SIZE = 64
MAX_EPISODES = 500
MAX_EP_STEPS = 1000
GAMMA = 0.99

with tf.Session() as sess:
    critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM, TAU,
                           CRITIC_LEARNING_RATE)
    actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                         MINIBATCH_SIZE, TAU, ACTOR_LEARNING_RATE,
                         critic.model)
    # Ornstein-Uhlenbeck exploration noise
    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

    sess.run(tf.global_variables_initializer())

    # initialize target networks
    actor.update_target_network()
    critic.update_target_network()

    # initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE)

    # log variables
    reward_log = []
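# The OrnsteinUhlenbeckActionNoise class used throughout these snippets is not
# shown. A minimal sketch of the standard OU process, matching the
# `mu=np.zeros(ACTION_DIM)` construction above (the sigma/theta/dt defaults are
# assumptions; the noise classes in the other snippets take different
# arguments):
import numpy as np

class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta = mu, sigma, theta
        self.dt, self.x0 = dt, x0
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        x = (self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt +
             self.sigma * np.sqrt(self.dt) *
             np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)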
class DDPGAgent(Agent):
    '''An agent that combines the Actor-Critic algorithm with deep learning.

    Takes an environment object together with the relevant learning
    hyperparameters, reads the state and action feature dimensions from env,
    and inherits from the base class Agent.
    '''

    def __init__(self,
                 env: Env = None,
                 capacity=2e6,
                 batch_size=128,
                 action_lim=1,
                 learning_rate=0.001,
                 gamma=0.999,
                 epochs=2):
        if env is None:
            raise ValueError("DDPGAgent requires an environment (env)")
        super(DDPGAgent, self).__init__(env, capacity)
        self.state_dim = env.observation_space.shape[0]  # continuous state: state dimension
        self.action_dim = env.action_space.shape[0]      # continuous action: action dimension
        self.action_lim = action_lim        # bound on action values
        self.batch_size = batch_size        # number of transitions per minibatch
        self.learning_rate = learning_rate  # learning rate
        self.gamma = gamma                  # discount factor
        self.epochs = epochs                # number of passes over one minibatch
        self.tau = 0.01                     # soft-update coefficient
        self.noise = OrnsteinUhlenbeckActionNoise(self.action_dim)
        self.actor = Actor(self.state_dim, self.action_dim, self.action_lim)
        self.target_actor = Actor(self.state_dim, self.action_dim,
                                  self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate)
        self.critic = Critic(self.state_dim, self.action_dim)
        self.target_critic = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.learning_rate)
        # Initialization: hard-copy the critic and actor parameters into the
        # target networks target_critic and target_actor.
        hard_update(self.target_actor, self.actor)    # hard copy
        hard_update(self.target_critic, self.critic)  # hard copy
        return

    def get_exploitation_action(self, state):
        '''Greedy action from the target_actor network for a given state (no exploration).
        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.target_actor.forward(state).detach()
        return action.data.numpy()

    def get_exploration_action(self, state):
        '''Noisy action from the actor network for a given state, to provide some exploration.
        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.actor.forward(state).detach()
        new_action = action.data.numpy() + (self.noise.sample() *
                                            self.action_lim)
        new_action = new_action.clip(min=-1 * self.action_lim,
                                     max=self.action_lim)
        return new_action

    def _learn_from_memory(self):
        '''Learn from stored experience and update both networks.'''
        # Randomly sample a batch of transitions from memory
        trans_pieces = self.sample(self.batch_size)
        s0 = np.vstack([x.s0 for x in trans_pieces])
        a0 = np.array([x.a0 for x in trans_pieces])
        r1 = np.array([x.reward for x in trans_pieces])
        s1 = np.vstack([x.s1 for x in trans_pieces])

        # Optimize the critic parameters by minimizing the TD loss
        a1 = self.target_actor.forward(s1).detach()
        next_val = torch.squeeze(self.target_critic.forward(s1, a1).detach())
        r1 = torch.from_numpy(r1)
        # r1 = r1.type(torch.DoubleTensor)
        next_val = next_val.type(torch.DoubleTensor)
        y_expected = r1 + self.gamma * next_val
        y_expected = y_expected.type(torch.FloatTensor)
        a0 = torch.from_numpy(a0)  # convert to Tensor
        y_predicted = torch.squeeze(self.critic.forward(s0, a0))
        # Minimize the loss and update the critic
        loss_critic = F.smooth_l1_loss(y_predicted, y_expected)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Optimize the actor parameters; the objective is to increase Q
        pred_a0 = self.actor.forward(s0)  # using a0 directly does not converge
        # Gradient ascent via a negated loss: the value estimate of the state
        # is the policy objective
        loss_actor = -1 * torch.sum(self.critic.forward(s0, pred_a0))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target networks
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning_method(self, display=False, explore=True):
        '''Run one full episode of agent-environment interaction.

        Returns:
            time_in_episode: number of steps in the episode
            total_reward: total reward collected in the episode
        '''
        self.state = np.float64(self.env.reset())  # initial episode state
        time_in_episode, total_reward = 0, 0
        is_done = False  # episode termination flag
        loss_critic, loss_actor = 0.0, 0.0
        while not is_done:
            s0 = self.state  # current state
            if explore:
                a0 = self.get_exploration_action(s0)
            else:
                a0 = self.actor.forward(s0).detach().data.numpy()
            # Step the environment and store the transition (s0, a0, r1, s1)
            s1, r1, is_done, info, total_reward = self.act(a0)
            if display:
                self.env.render()
            if self.total_trans > self.batch_size:
                # Once enough transitions are stored, learn via experience replay
                loss_c, loss_a = self._learn_from_memory()
                loss_critic += loss_c
                loss_actor += loss_a
            time_in_episode += 1
        loss_critic /= time_in_episode
        loss_actor /= time_in_episode
        if display:
            print("{}".format(self.experience.last_episode))
        return time_in_episode, total_reward, loss_critic, loss_actor

    def learning(self, max_episode_num=800, display=False, explore=True):
        '''Iterate over all episodes and save the models during training.'''
        total_time, episode_reward, num_episode = 0, 0, 0
        total_times, episode_rewards, num_episodes = [], [], []
        for i in tqdm(range(max_episode_num)):  # loop over all episodes
            time_in_episode, episode_reward, loss_critic, loss_actor = \
                self.learning_method(display=display, explore=explore)
            total_time += time_in_episode  # cumulative step count
            num_episode += 1               # episode count
            total_times.append(total_time)          # cumulative steps per episode
            episode_rewards.append(episode_reward)  # reward per episode
            num_episodes.append(num_episode)        # episode index
            print("episode:{:3}:loss critic:{:4.3f}, J_actor:{:4.3f}".
                  format(num_episode - 1, loss_critic, -loss_actor))
            if explore and num_episode % 100 == 0:
                # save the models
                self.save_models(num_episode)
        return total_times, episode_rewards, num_episodes

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')
        print("Models saved successfully")

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print("Models loaded successfully")
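# A hypothetical usage sketch for the agent above, assuming a continuous
# control gym environment and the ./Models directory used by save_models():
import gym

env = gym.make('Pendulum-v0')
agent = DDPGAgent(env, capacity=2e6, batch_size=128,
                  learning_rate=0.001, gamma=0.999)
total_times, episode_rewards, num_episodes = agent.learning(
    max_episode_num=200, display=False, explore=True)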
class DDPGAgent(Agent):
    '''An agent that combines the Actor-Critic algorithm with deep learning.'''

    def __init__(self,
                 env: Env = None,
                 capacity=2e6,
                 batch_size=128,
                 action_lim=1,
                 learning_rate=0.001,
                 gamma=0.999,
                 epochs=2):
        if env is None:
            raise ValueError("agent should have an environment")
        super(DDPGAgent, self).__init__(env, capacity)
        self.state_dim = env.observation_space.shape[0]  # continuous state
        self.action_dim = env.action_space.shape[0]      # continuous action
        self.action_lim = action_lim        # bound on action values
        self.batch_size = batch_size        # number of transitions per minibatch
        self.learning_rate = learning_rate  # learning rate
        self.gamma = gamma                  # discount factor
        self.epochs = epochs                # number of passes over one minibatch
        self.tau = 0.001                    # soft-update coefficient
        self.noise = OrnsteinUhlenbeckActionNoise(self.action_dim)
        self.actor = Actor(self.state_dim, self.action_dim, self.action_lim)
        self.target_actor = Actor(self.state_dim, self.action_dim,
                                  self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate)
        self.critic = Critic(self.state_dim, self.action_dim)
        self.target_critic = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.learning_rate)
        hard_update(self.target_actor, self.actor)    # hard copy
        hard_update(self.target_critic, self.critic)  # hard copy
        return

    def get_exploitation_action(self, state):
        '''Greedy action from the target actor network for a given state (no exploration).
        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.target_actor.forward(state).detach()
        return action.data.numpy()

    def get_exploration_action(self, state):
        '''Noisy action from the actor network for a given state, to provide some exploration.
        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.actor.forward(state).detach()
        new_action = action.data.numpy() + (self.noise.sample() *
                                            self.action_lim)
        new_action = new_action.clip(min=-1 * self.action_lim,
                                     max=self.action_lim)
        return new_action

    def _learn_from_memory(self):
        '''Learn from memory and update both networks.'''
        # Randomly sample a batch of transitions from memory
        trans_pieces = self.sample(self.batch_size)
        s0 = np.vstack([x.s0 for x in trans_pieces])
        a0 = np.array([x.a0 for x in trans_pieces])
        r1 = np.array([x.reward for x in trans_pieces])
        # is_done = np.array([x.is_done for x in trans_pieces])
        s1 = np.vstack([x.s1 for x in trans_pieces])

        # Optimize the critic parameters
        a1 = self.target_actor.forward(s1).detach()
        next_val = torch.squeeze(self.target_critic.forward(s1, a1).detach())
        # y_exp = r + gamma * Q'(s2, pi'(s2))
        y_expected = torch.from_numpy(r1).type(
            torch.FloatTensor) + self.gamma * next_val
        y_expected = y_expected.type(torch.FloatTensor)
        # y_pred = Q(s1, a1)
        a0 = torch.from_numpy(a0)  # convert to Tensor
        y_predicted = torch.squeeze(self.critic.forward(s0, a0))
        # compute critic loss, and update the critic
        loss_critic = F.smooth_l1_loss(y_predicted, y_expected)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Optimize the actor parameters; the objective is to increase Q
        pred_a0 = self.actor.forward(s0)  # using a0 directly does not converge
        # Gradient ascent via a negated loss: the value estimate of the state
        # is the policy objective
        loss_actor = -1 * torch.sum(self.critic.forward(s0, pred_a0))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target networks
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning_method(self, display=False, explore=True):
        self.state = np.float64(self.env.reset())
        time_in_episode, total_reward = 0, 0
        is_done = False
        loss_critic, loss_actor = 0.0, 0.0
        while not is_done:
            s0 = self.state
            if explore:
                a0 = self.get_exploration_action(s0)
            else:
                a0 = self.actor.forward(s0).detach().data.numpy()
            s1, r1, is_done, info, total_reward = self.act(a0)
            if display:
                self.env.render()
            if self.total_trans > self.batch_size:
                loss_c, loss_a = self._learn_from_memory()
                loss_critic += loss_c
                loss_actor += loss_a
            time_in_episode += 1
        loss_critic /= time_in_episode
        loss_actor /= time_in_episode
        if display:
            print("{}".format(self.experience.last_episode))
        return time_in_episode, total_reward, loss_critic, loss_actor

    def learning(self, max_episode_num=800, display=False, explore=True):
        total_time, episode_reward, num_episode = 0, 0, 0
        total_times, episode_rewards, num_episodes = [], [], []
        for i in tqdm(range(max_episode_num)):
            time_in_episode, episode_reward, loss_critic, loss_actor = \
                self.learning_method(display=display, explore=explore)
            total_time += time_in_episode
            num_episode += 1
            total_times.append(total_time)
            episode_rewards.append(episode_reward)
            num_episodes.append(num_episode)
            print("episode:{:3}:loss critic:{:4.3f}, J_actor:{:4.3f}".
                  format(num_episode - 1, loss_critic, -loss_actor))
            if explore and num_episode % 100 == 0:
                self.save_models(num_episode)
        return total_times, episode_rewards, num_episodes

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')
        print("Models saved successfully")

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print("Models loaded successfully")
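# Both DDPGAgent variants above call hard_update(target, source) and
# soft_update(target, source, tau) helpers that are not part of the snippet.
# A minimal sketch of the usual PyTorch implementations, matching those call
# signatures (an assumption about the companion utility module):
def hard_update(target, source):
    # Copy the online network's parameters into the target network verbatim.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)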
class DDPG(Algorithm):

    def __init__(self, env):
        super(DDPG, self).__init__()

        pi_net = PiNet(self.ns, self.na)
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na)
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net = QNet(self.ns, self.na)
        self.q_net = q_net.to(self.device)

        q_target = QNet(self.ns, self.na)
        self.q_target = q_target.to(self.device)
        self.load_state_dict(self.q_target, self.q_net.state_dict())

        self.optimizer_q = torch.optim.Adam(self.q_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=1e-2)
        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=0)

        self.noise = OrnsteinUhlenbeckActionNoise(
            torch.zeros(1, self.na).to(self.device),
            self.epsilon * torch.ones(1, self.na).to(self.device))

    def play(self, env, evaluate=False):
        # Reset the noise process at the start of each episode
        if env.k == 0:
            self.noise.reset()
        noise = self.noise() if not evaluate else 0
        if self.env_steps >= self.warmup_steps or evaluate:
            with torch.no_grad():
                a = self.pi_net(env.s) + noise
                a = torch.clamp(a, min=-1, max=1)
        else:
            a = None  # let the environment sample a random warmup action
        state = env(a)
        return state

    def train(self):
        results = defaultdict(lambda: defaultdict(list))
        for i, (s, a, r, t, stag) in tqdm(enumerate(self.sample())):
            i += 1
            self.train_mode()
            self.optimizer_q.zero_grad()
            self.optimizer_p.zero_grad()

            # Critic update: n-step TD target from the target networks
            with torch.no_grad():
                pi_tag = self.pi_target(stag)
                q_target = self.q_target(stag, pi_tag)

            g = r + (1 - t) * self.gamma**self.n_steps * q_target

            qa = self.q_net(s, a)
            loss_q = F.mse_loss(qa, g, reduction='mean')

            loss_q.backward()
            if self.clip_q:
                nn.utils.clip_grad_norm_(self.q_net.parameters(), self.clip_q)
            self.optimizer_q.step()

            # Delayed policy update
            if not i % self.delayed_policy_update:
                pi = self.pi_net(s)
                if self.env_steps >= self.warmup_steps:
                    v = self.q_net(s, pi)
                    loss_p = (-v).mean()
                else:
                    # During warmup, regress the policy toward replayed actions
                    loss_p = F.smooth_l1_loss(pi, a)

                loss_p.backward()
                if self.clip_p:
                    nn.utils.clip_grad_norm_(self.pi_net.parameters(),
                                             self.clip_p)
                self.optimizer_p.step()

                results['scalar']['q_est'].append(float(-loss_p))
                soft_update(self.pi_net, self.pi_target, self.tau)

            results['scalar']['loss_q'].append(float(loss_q))
            soft_update(self.q_net, self.q_target, self.tau)

            # if not n % self.target_update:
            #     self.load_state_dict(self.pi_target, self.pi_net.state_dict())
            #     self.load_state_dict(self.q_target, self.q_net.state_dict())

            if not i % self.train_epoch:
                statistics = self.env.get_stats()
                for k, v in statistics.items():
                    for ki, vi in v.items():
                        results[k][ki] = vi
                results['scalar']['rb'] = self.replay_buffer.size
                results['scalar']['env-steps'] = self.env_steps
                results['scalar']['episodes'] = self.episodes
                results['scalar']['train-steps'] = i
                yield results
                results = defaultdict(lambda: defaultdict(list))
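# train() above is a generator that yields a logging dict every `train_epoch`
# optimisation steps. A minimal consumption sketch (the surrounding
# experiment/environment setup is assumed and not shown in the snippet):
alg = DDPG(env)  # `env` must provide the interface used by play()/sample()
for results in alg.train():
    for name, value in results['scalar'].items():
        print(name, value)  # a real run would forward these to a logger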
class PDDPGAgent(BaseAgent):
    """
    DDPG actor-critic agent for parameterised action spaces
    [Hausknecht and Stone 2016]
    """

    def __init__(
            self,
            observation_space,
            action_space,
            actor_class=Actor,
            actor_kwargs={},
            critic_class=Critic,
            critic_kwargs={},
            epsilon_initial=1.0,
            epsilon_final=0.01,
            epsilon_steps=10000,
            batch_size=64,
            gamma=0.99,
            beta=0.5,  # averaging factor between off-policy and on-policy targets during n-step updates
            tau_actor=0.001,  # Polyak averaging factor for updating target weights
            tau_critic=0.001,
            replay_memory=None,  # memory buffer object
            replay_memory_size=1000000,
            learning_rate_actor=0.00001,
            learning_rate_critic=0.001,
            initial_memory_threshold=0,
            clip_grad=10,
            adam_betas=(0.95, 0.999),
            use_ornstein_noise=False,  # if false, uses epsilon-greedy with uniform-random action-parameter exploration
            loss_func=F.mse_loss,  # F.smooth_l1_loss
            inverting_gradients=False,
            n_step_returns=False,
            seed=None,
            buffer_next_actions=False,
            central_critic=False):
        super(PDDPGAgent, self).__init__(observation_space, action_space)

        self.actions_with_param = 3
        self.action_parameter_sizes = np.array([
            self.action_space.spaces[i].shape[0]
            for i in range(1, self.actions_with_param + 1)
        ])
        self.action_parameter_size = int(self.action_parameter_sizes.sum())
        self.action_max = torch.from_numpy(np.ones(
            (self.act_dim, ))).float().to(device)
        self.action_min = -self.action_max.detach()
        self.action_range = (self.action_max - self.action_min).detach()
        self.action_parameter_max_numpy = np.concatenate([
            self.action_space.spaces[i].high
            for i in range(1, self.actions_with_param + 1)
        ]).ravel()
        self.action_parameter_min_numpy = np.concatenate([
            self.action_space.spaces[i].low
            for i in range(1, self.actions_with_param + 1)
        ]).ravel()
        self.action_parameter_range_numpy = (self.action_parameter_max_numpy -
                                             self.action_parameter_min_numpy)
        self.action_parameter_max = torch.from_numpy(
            self.action_parameter_max_numpy).float().to(device)
        self.action_parameter_min = torch.from_numpy(
            self.action_parameter_min_numpy).float().to(device)
        self.action_parameter_range = torch.from_numpy(
            self.action_parameter_range_numpy).float().to(device)

        self.epsilon = epsilon_initial
        self.epsilon_initial = epsilon_initial
        self.epsilon_final = epsilon_final
        self.epsilon_steps = epsilon_steps
        self.clip_grad = clip_grad
        self.batch_size = batch_size
        self.gamma = gamma
        self.beta = beta
        self.replay_memory_size = replay_memory_size
        self.initial_memory_threshold = initial_memory_threshold
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.inverting_gradients = inverting_gradients
        self.tau_actor = tau_actor
        self.tau_critic = tau_critic
        self._step = 0
        self._episode = 0
        self.updates = 0

        self.np_random = None
        self.seed = seed
        self._seed(seed)

        self.buffer_next_actions = buffer_next_actions
        self.use_ornstein_noise = use_ornstein_noise
        self.noise = OrnsteinUhlenbeckActionNoise(
            self.action_parameter_size,
            random_machine=self.np_random,
            mu=0.,
            theta=0.15,
            sigma=0.0001)
        # print(self.act_dim + self.action_parameter_size)

        self.n_step_returns = n_step_returns
        self.loss_func = loss_func  # l1_smooth_loss performs better but original paper used MSE

        self.actor = actor_class(self.obs_dim, self.act_dim,
                                 self.action_parameter_size,
                                 **actor_kwargs).to(device)
        self.actor_target = actor_class(self.obs_dim, self.act_dim,
                                        self.action_parameter_size,
                                        **actor_kwargs).to(device)
        hard_update_target_network(self.actor, self.actor_target)
        self.actor_target.eval()
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.learning_rate_actor,
                                          betas=adam_betas)

        if not central_critic:
            if replay_memory is None:
                self.replay_memory = MemoryNStepReturns(
                    replay_memory_size,
                    observation_space.shape,
                    (self.act_dim + self.action_parameter_size, ),
                    next_actions=self.buffer_next_actions,
                    n_step_returns=self.n_step_returns)
            else:
                self.replay_memory = replay_memory
            self.critic = critic_class(self.obs_dim, self.act_dim,
                                       self.action_parameter_size,
                                       **critic_kwargs).to(device)
            self.critic_target = critic_class(self.obs_dim, self.act_dim,
                                              self.action_parameter_size,
                                              **critic_kwargs).to(device)
            hard_update_target_network(self.critic, self.critic_target)
            self.critic_target.eval()
            self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                               lr=self.learning_rate_critic,
                                               betas=adam_betas)

    def __str__(self):
        desc = ("P-DDPG Agent with frozen initial weight layer\n" +
                "Actor: {}\n".format(self.actor) +
                "Critic: {}\n".format(self.critic) +
                "Actor Alpha: {}\n".format(self.learning_rate_actor) +
                "Critic Alpha: {}\n".format(self.learning_rate_critic) +
                "Gamma: {}\n".format(self.gamma) +
                "Tau Actor: {}\n".format(self.tau_actor) +
                "Tau Critic: {}\n".format(self.tau_critic) +
                "Beta: {}\n".format(self.beta) +
                "Inverting Gradients: {}\n".format(self.inverting_gradients) +
                "Replay Memory: {}\n".format(self.replay_memory_size) +
                "epsilon_initial: {}\n".format(self.epsilon_initial) +
                "epsilon_final: {}\n".format(self.epsilon_final) +
                "epsilon_steps: {}\n".format(self.epsilon_steps) +
                "Clip norm: {}\n".format(self.clip_grad) +
                "Batch Size: {}\n".format(self.batch_size) +
                "Ornstein Noise?: {}\n".format(self.use_ornstein_noise) +
                "Seed: {}\n".format(self.seed))
        return desc

    def set_action_parameter_passthrough_weights(self,
                                                 initial_weights,
                                                 initial_bias=None):
        passthrough_layer = self.actor.action_parameters_passthrough_layer
        print(initial_weights.shape)
        print(passthrough_layer.weight.data.size())
        assert initial_weights.shape == passthrough_layer.weight.data.size()
        passthrough_layer.weight.data = torch.Tensor(
            initial_weights).float().to(device)
        if initial_bias is not None:
            print(initial_bias.shape)
            print(passthrough_layer.bias.data.size())
            assert initial_bias.shape == passthrough_layer.bias.data.size()
            passthrough_layer.bias.data = torch.Tensor(
                initial_bias).float().to(device)
        passthrough_layer.requires_grad = False
        passthrough_layer.weight.requires_grad = False
        passthrough_layer.bias.requires_grad = False
        hard_update_target_network(self.actor, self.actor_target)

    def _invert_gradients(self, grad, vals, grad_type, inplace=True):
        # 5x faster on CPU
        if grad_type == "actions":
            max_p = self.action_max.cpu()
            min_p = self.action_min.cpu()
            rnge = self.action_range.cpu()
        elif grad_type == "action_parameters":
            max_p = self.action_parameter_max.cpu()
            min_p = self.action_parameter_min.cpu()
            rnge = self.action_parameter_range.cpu()
        else:
            raise ValueError("Unhandled grad_type: '" + str(grad_type) + "'")

        assert grad.shape == vals.shape
        if not inplace:
            grad = grad.clone()
        with torch.no_grad():
            for n in range(grad.shape[0]):
                # index = grad < 0  # actually > but Adam minimises, so reversed (could also double negate the grad)
                index = grad[n] > 0
                grad[n][index] *= (index.float() * (max_p - vals[n]) /
                                   rnge)[index]
                grad[n][~index] *= ((~index).float() * (vals[n] - min_p) /
                                    rnge)[~index]
        return grad

    def _seed(self, seed=None):
        """
        NOTE: this will not reset the randomly initialised weights; use the
        seed parameter in the constructor instead.

        :param seed:
        :return:
        """
        self.seed = seed
        random.seed(seed)
        np.random.seed(seed)
        self.np_random = np.random.RandomState(seed=seed)
        if seed is not None:
            torch.manual_seed(seed)
            torch.cuda.manual_seed(seed)

    def _ornstein_uhlenbeck_noise(self, all_action_parameters):
        """ Continuous action exploration using an Ornstein-Uhlenbeck process. """
        return all_action_parameters.data.numpy() + (
            self.noise.sample() * self.action_parameter_range_numpy)

    def end_episode(self):
        self._episode += 1
        # anneal exploration
        if self._episode < self.epsilon_steps:
            self.epsilon = self.epsilon_initial - (
                self.epsilon_initial - self.epsilon_final) * (
                    self._episode / self.epsilon_steps)
        else:
            self.epsilon = self.epsilon_final

    def act(self, obs):
        with torch.no_grad():
            obs = torch.from_numpy(obs).to(device)
            all_actions, all_action_parameters = self.actor.forward(obs)
            all_actions = all_actions.detach().cpu().data.numpy()
            all_action_parameters = all_action_parameters.detach().cpu(
            ).data.numpy()

            # Hausknecht and Stone [2016] use epsilon-greedy actions with
            # uniform random action-parameter exploration
            if self.np_random.uniform() < self.epsilon:
                all_actions = self.np_random.uniform(size=all_actions.shape)
                offsets = np.array([
                    self.action_parameter_sizes[i]
                    for i in range(self.actions_with_param)
                ], dtype=int).cumsum()
                offsets = np.concatenate((np.array([0]), offsets))
                if not self.use_ornstein_noise:
                    for i in range(self.actions_with_param):
                        all_action_parameters[offsets[i]:offsets[i + 1]] = \
                            self.np_random.uniform(
                                self.action_parameter_min_numpy[
                                    offsets[i]:offsets[i + 1]],
                                self.action_parameter_max_numpy[
                                    offsets[i]:offsets[i + 1]])

            # select the maximum-valued discrete action
            action = np.argmax(all_actions)
            offset = np.array(
                [self.action_parameter_sizes[i] for i in range(action)],
                dtype=int).sum()
            if self.use_ornstein_noise and self.noise is not None:
                all_action_parameters[offset:offset + self.action_parameter_sizes[action]] += \
                    self.noise.sample()[offset:offset + self.action_parameter_sizes[action]]
            if action < self.actions_with_param:
                action_parameters = all_action_parameters[
                    offset:offset + self.action_parameter_sizes[action]]
            else:
                action_parameters = None
        return action, action_parameters, all_actions, all_action_parameters

    # def step(self, state, action, reward, next_state, next_action, terminal,
    #          time_steps=1, optimise=True):
    #     action, action_params, all_actions, all_action_parameters = action
    #     self._step += 1
    #     self._add_sample(state,
    #                      np.concatenate((all_actions.data,
    #                                      all_action_parameters.data)).ravel(),
    #                      reward, next_state, terminal)
    #     if optimise and self._step >= self.batch_size and \
    #             self._step >= self.initial_memory_threshold:
    #         self.update()

    # def _add_sample(self, state, action, reward, next_state, terminal):
    #     assert not self.n_step_returns
    #     assert len(action) == self.act_dim + self.action_parameter_size
    #     self.replay_memory.append(state, action, reward, next_state, terminal)

    def update(self):
        if self.replay_memory.nb_entries < self.batch_size or \
                self.replay_memory.nb_entries < self.initial_memory_threshold:
            return

        # Sample a batch from replay memory
        if self.n_step_returns:
            obs, actions, rewards, next_obs, terminals, n_step_returns = \
                self.replay_memory.sample(self.batch_size,
                                          random_machine=self.np_random)
        else:
            obs, actions, rewards, next_obs, terminals = \
                self.replay_memory.sample(self.batch_size,
                                          random_machine=self.np_random)
            n_step_returns = None

        obs = torch.from_numpy(obs).to(device)
        actions_combined = torch.from_numpy(actions).to(
            device)  # make sure to separate actions and action-parameters
        actions = actions_combined[:, :self.act_dim]
        action_parameters = actions_combined[:, self.act_dim:]
        rewards = torch.from_numpy(rewards).to(device)
        next_obs = torch.from_numpy(next_obs).to(device)
        terminals = torch.from_numpy(terminals).to(device)
        if self.n_step_returns:
            n_step_returns = torch.from_numpy(n_step_returns).to(device)

        # ---------------------- optimise critic ----------------------
        with torch.no_grad():
            # use actor_target to predict the next action
            pred_next_actions, pred_next_action_parameters = \
                self.actor_target.forward(next_obs)
            # use critic_target to predict the target value
            off_policy_next_val = self.critic_target.forward(
                next_obs, pred_next_actions, pred_next_action_parameters)
            off_policy_target = rewards + (
                1 - terminals) * self.gamma * off_policy_next_val
            if self.n_step_returns:
                on_policy_target = n_step_returns
                target = self.beta * on_policy_target + (
                    1. - self.beta) * off_policy_target
            else:
                target = off_policy_target
            y_expected = target

        # use the critic to predict the current value
        y_predicted = self.critic.forward(obs, actions, action_parameters)
        loss_critic = self.loss_func(y_predicted, y_expected)

        # update critic
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           self.clip_grad)
        self.critic_optimizer.step()

        # ---------------------- optimise actor ----------------------
        # 1 - calculate gradients from the critic
        with torch.no_grad():
            # use the actor to compute actions (with no grad)
            actions, action_params = self.actor(obs)
            action_params = torch.cat((actions, action_params), dim=1)
        action_params.requires_grad = True
        # use the critic and compute its gradients w.r.t. the joint action
        Q_val = self.critic(obs, action_params[:, :self.act_dim],
                            action_params[:, self.act_dim:]).mean()
        self.critic.zero_grad()
        Q_val.backward()
        from copy import deepcopy
        delta_a = deepcopy(action_params.grad.data)

        # 2 - apply inverting gradients and combine with gradients from the actor
        # use the actor to compute actions again (with grad)
        actions, action_params = self.actor(Variable(obs))
        action_params = torch.cat((actions, action_params), dim=1)
        # invert gradients of actions and parameters separately
        delta_a[:, self.act_dim:] = self._invert_gradients(
            delta_a[:, self.act_dim:].cpu(),
            action_params[:, self.act_dim:].cpu(),
            grad_type="action_parameters",
            inplace=True)
        delta_a[:, :self.act_dim] = self._invert_gradients(
            delta_a[:, :self.act_dim].cpu(),
            action_params[:, :self.act_dim].cpu(),
            grad_type="actions",
            inplace=True)
        out = -torch.mul(delta_a, action_params)
        self.actor.zero_grad()
        out.backward(torch.ones(out.shape).to(device))
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.clip_grad)
        self.actor_optimizer.step()

        # ---------------- soft update actor and critic ---------------
        soft_update_target_network(self.actor, self.actor_target,
                                   self.tau_actor)
        soft_update_target_network(self.critic, self.critic_target,
                                   self.tau_critic)
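# PDDPGAgent relies on hard_update_target_network(source, target) and
# soft_update_target_network(source, target, tau) helpers that are not part of
# this snippet. A minimal sketch matching the call order used above (an
# assumption about the companion utilities module):
def hard_update_target_network(source, target):
    # Overwrite the target parameters with the source parameters.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update_target_network(source, target, tau):
    # Polyak averaging toward the source network.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)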