class DDPGagent:
    def __init__(self, hidden_size, env):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        self.Actor = Actor(input_size=self.num_states, hidden_size=hidden_size,
                           output_size=self.num_actions).cuda()
        self.Actor_target = Actor(input_size=self.num_states, hidden_size=hidden_size,
                                  output_size=self.num_actions).cuda()
        self.Critic = Critic(input_size=self.num_states, hidden_size=hidden_size,
                             output_size=self.num_actions).cuda()
        self.Critic_target = Critic(input_size=self.num_states, hidden_size=hidden_size,
                                    output_size=self.num_actions).cuda()

        # Hard-copy the online weights into the target networks. copy_() gives the
        # target parameters their own storage instead of aliasing the online tensors.
        for target_param, param in zip(self.Actor_target.parameters(),
                                       self.Actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.Critic_target.parameters(),
                                       self.Critic.parameters()):
            target_param.data.copy_(param.data)

        self.Memory = Memory(30000)
        self.criterion = nn.MSELoss().cuda()
        self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(), lr=1e-2)
        self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(), lr=1e-1)

    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).cuda()
        action = self.Actor(state)
        return action.detach().cpu().numpy()

    def update(self, batch_size):
        states, actions, rewards, next_states, _ = self.Memory.sample(batch_size)
        states = torch.tensor(states, dtype=torch.float32).cuda()
        actions = torch.tensor(actions, dtype=torch.float32).cuda()
        rewards = torch.tensor(rewards, dtype=torch.float32).cuda()
        next_states = torch.tensor(next_states, dtype=torch.float32).cuda()

        # Critic update: regress Q(s, a) toward the one-step TD target.
        Q_value = self.Critic(states, actions)
        next_actions = self.Actor_target(next_states)
        next_Q = self.Critic_target(next_states, next_actions.detach())
        Q_prime = rewards + 0.99 * next_Q.detach()
        critic_loss = self.criterion(Q_value, Q_prime)

        # Actor update: maximise Q(s, pi(s)) by minimising its negative.
        policy_loss = -self.Critic(states, self.Actor(states)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft (Polyak) update of both target networks with tau = 1e-2.
        for target_param, param in zip(self.Actor_target.parameters(),
                                       self.Actor.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data * (1.0 - 1e-2))
        for target_param, param in zip(self.Critic_target.parameters(),
                                       self.Critic.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data * (1.0 - 1e-2))
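# DDPGagent above assumes a `Memory` replay buffer with push/sample methods.
# That class is not included here; the sketch below is a minimal, hypothetical
# implementation of the assumed interface (fixed capacity, uniform sampling),
# not the original project's code.
import random
from collections import deque

import numpy as np


class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)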
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        # self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        if self.total_reward > self.best_total_reward:
            self.best_total_reward = self.total_reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
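# The DDPG class above relies on an `OUNoise` exploration process that is not
# shown in this section. The sketch below is a minimal Ornstein-Uhlenbeck noise
# generator with the same constructor signature (size, mu, theta, sigma) and a
# sample() method; treat it as an assumed implementation, not the original one.
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state back to the mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state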
import gym
import matplotlib.pyplot as plt

from Actor_Critic import Actor, Critic

GAME = 'CartPole-v0'
EPISODES = 1000
RENDER = False

env = gym.make(GAME)
env = env.unwrapped
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

actor = Actor(n_states, n_actions)
critic = Critic(n_states)


def run():
    plt.ion()
    total_r = 0
    avg_ep_r_hist = []
    for episode in range(EPISODES):
        ep_r = 0
        s = env.reset()
        while True:
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            r = -10 if done else r  # penalise the terminal (pole-fallen) state
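            # The snippet above is cut off inside the episode loop. A typical
            # continuation for this one-step actor-critic setup is sketched
            # below; critic.learn() returning a TD error and actor.learn()
            # consuming it are assumptions about the Actor_Critic module, not
            # code taken from the original file.
            td_error = critic.learn(s, r, s_)  # update V(s) toward r + gamma * V(s_)
            actor.learn(s, a, td_error)        # policy-gradient step weighted by the TD error
            ep_r += r
            s = s_
            if done:
                break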
batches = data_utils.batch_iter(
    list(zip(all_new_select_x, all_new_select_y)),
    all_new_select_x.shape[0], 1, shuffle=True)
for batch in batches:
    x_batch, y_batch = zip(*batch)
    # print(x_batch, y_batch)
    train_step(x_batch, y_batch)

if episode % 2 == 0:  # evaluate on the dev set
    print("\nEvaluation:")
    n2, p2, acc2 = dev_step(x_dev, y_dev, writer=dev_summary_writer)
    print("acc_ori", acc_ori, "new_select_acc", acc2)

sess = tf.Session()
actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
# we need a good teacher, so the teacher (critic) should learn faster than the actor
critic = Critic(sess, n_features=N_F, lr=LR_C)
sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER:
            env.render()
        a = actor.choose_action(s)
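# The fragment above uses env, N_F, N_A, LR_A, LR_C, MAX_EPISODE, RENDER and
# OUTPUT_GRAPH without defining them. A typical setup block for this kind of
# session-based actor-critic script is sketched below; the concrete environment
# and values are assumptions, not taken from the original file.
import gym

OUTPUT_GRAPH = False
RENDER = False
MAX_EPISODE = 3000
LR_A = 0.001  # learning rate for the actor
LR_C = 0.01   # learning rate for the critic (the "teacher" learns faster)

env = gym.make('CartPole-v0').unwrapped
N_F = env.observation_space.shape[0]  # number of state features
N_A = env.action_space.n              # number of discrete actions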
class TD3(object):
    # Trains the agent over a certain number of time steps.
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)  # updated by gradient descent
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)  # updated by polyak averaging
        self.actor_target.load_state_dict(self.actor.state_dict())  # start from the same parameters as the actor
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.max_action = max_action

    def select_action(self, state):
        # The actor takes the state as a horizontal (1 x state_dim) vector.
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        # Forward propagation can run on CPU here; flatten() returns the action as a 1D
        # numpy array so noise can be added and the result clipped.
        return self.actor(state).cpu().data.numpy().flatten()

    # policy_noise is the std of the Gaussian noise added to the target action for exploration;
    # policy_freq is the delay: the actor and the targets are updated once every policy_freq iterations.
    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99,
              tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        for it in range(iterations):
            # Sample a batch of transitions (s, s', a, r) from the memory as five separate batches.
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = \
                replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # From the next state s', the Actor target plays the next action a'.
            next_action = self.actor_target(next_state)

            # Add Gaussian noise to this next action a' and clip it to a range of values
            # supported by the environment.
            noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

            # The two Critic targets each take the couple (s', a') as input and return
            # two Q-values Qt1(s', a') and Qt2(s', a') as outputs.
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # Keep the minimum of these two Q-values: min(Qt1, Qt2).
            target_Q = torch.min(target_Q1, target_Q2)

            # Final target of the two Critic models: Qt = r + gamma * min(Qt1, Qt2).
            target_Q = reward + ((1 - done) * discount * target_Q).detach()

            # The two Critic models each take the couple (s, a) as input and return
            # two Q-values Q1(s, a) and Q2(s, a) as outputs.
            current_Q1, current_Q2 = self.critic(state, action)

            # Critic loss = MSE_Loss(Q1(s, a), Qt) + MSE_Loss(Q2(s, a), Qt).
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Backpropagate the Critic loss and update the parameters of both Critic models.
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Every policy_freq iterations, update the Actor model by performing gradient
            # ascent on the output of the first Critic model.
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # At the same delayed frequency, update the weights of the Actor target
                # by polyak averaging.
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                # And update the weights of the Critic target by polyak averaging as well.
                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    # Save a trained model.
    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    # Load a pre-trained model.
    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
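# TD3.train() above assumes a replay buffer whose sample() returns five arrays in
# the order (states, next_states, actions, rewards, dones). That class is not
# shown here; the sketch below is a minimal, assumed implementation with that
# interface, not the original project's buffer.
import numpy as np


class ReplayBuffer(object):
    def __init__(self, max_size=1_000_000):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        # transition = (state, next_state, action, reward, done)
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        states, next_states, actions, rewards, dones = [], [], [], [], []
        for i in ind:
            s, s_, a, r, d = self.storage[i]
            states.append(np.array(s, copy=False))
            next_states.append(np.array(s_, copy=False))
            actions.append(np.array(a, copy=False))
            rewards.append(np.array(r, copy=False))
            dones.append(np.array(d, copy=False))
        return (np.array(states), np.array(next_states), np.array(actions),
                np.array(rewards).reshape(-1, 1), np.array(dones).reshape(-1, 1))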
import gym
import matplotlib.pyplot as plt

from Actor_Critic import Actor, Critic

GAME = 'Pendulum-v0'
EPISODES = 5000
MAX_STEP = 100

env = gym.make(GAME)
env = env.unwrapped
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
low_action_bound = env.action_space.low[0]
high_action_bound = env.action_space.high[0]

actor = Actor(n_states, n_actions, low_action_bound, high_action_bound)
critic = Critic(n_states)


def run():
    plt.ion()
    total_r = 0
    avg_ep_r_hist = []
    for episode in range(EPISODES):
        ep_step = 0
        s = env.reset()
        while True:
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            total_r += r