def test_ReplayBuffer(self):
    mem = ReplayBuffer(2)
    mem.push(1)
    mem.push(2)
    [sample] = mem.sample(2)
    self.assertEqual(sorted(sample), [1, 2])
    mem.push(3)
    [sample] = mem.sample(2)
    self.assertEqual(sorted(sample), [2, 3])
    mem.push(4)
    [sample] = mem.sample(2)
    self.assertEqual(sorted(sample), [3, 4])
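# The test above exercises a small FIFO buffer through push/sample; the real ReplayBuffer
# lives elsewhere, so the sketch below is only one plausible implementation that satisfies it.
# In particular, sample returning an iterable holding a single batch (to match the
# `[sample] = mem.sample(2)` unpacking) is an assumption.
import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest entries fall out once capacity is reached

    def push(self, item):
        self.buffer.append(item)

    def sample(self, batch_size):
        # return an iterable holding one batch drawn without replacement
        return [random.sample(list(self.buffer), batch_size)]

    def __len__(self):
        return len(self.buffer)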
class MaddpgAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        self.agents = [
            Agent(state_size=state_size, action_size=action_size, random_seed=random_seed),
            Agent(state_size=state_size, action_size=action_size, random_seed=random_seed)
        ]
        self.seed = random.seed(random_seed)

        # Replay memory (shared by both agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # self.soft_update(self.critic_local, self.critic_target, 1)
        # self.soft_update(self.actor_local, self.actor_target, 1)

    def act(self, states, add_noise=True):
        actions = [
            agent.act(state, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Shared replay buffer
        for i, _ in enumerate(self.agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def save_checkpoint(self):
        for i, agent in enumerate(self.agents):
            agent.save_checkpoint(i)
def learn(env, policy, q_func, optimizer_spec, session, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, grad_norm_clipping=10, lr_multiplier=1.0): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: img_in: tf.Tensor tensorflow tensor representing the input image num_actions: int number of actions scope: str scope in which all the model related variables should be created reuse: bool whether previously created variables should be reused. optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer session: tf.Session tensorflow session to use. exploration: rl_algs.deepq.utils.schedules.Schedule schedule for probability of chosing random action. stopping_criterion: (env, t) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network grad_norm_clipping: float or None If not None gradients' norms are clipped to this value. """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_shape = env.observation_space.shape else: img_h, img_w, img_c = env.observation_space.shape input_shape = (img_h, img_w, frame_history_len * img_c) num_actions = env.action_space.n # set up placeholders # placeholder for current observation (or state) obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) # placeholder for current action act_t_ph = tf.placeholder(tf.int32, [None]) # placeholder for current reward rew_t_ph = tf.placeholder(tf.float32, [None]) # placeholder for next observation (or state) obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) # placeholder for end of episode mask # this value is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target, not the # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) done_mask_ph = tf.placeholder(tf.float32, [None]) # casting to float on GPU ensures lower data transfer times. 
obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
    obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0

    # Declare variables for logging
    t_log = []
    mean_reward_log = []
    best_mean_log = []
    episodes_log = []
    exploration_log = []
    learning_rate_log = []

    # Create a network to produce the current q values for each possible action
    current_q_func = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)  # current Q-value function
    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')

    # Create the target q function network
    target_q_func = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False)  # target Q-value function
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func')

    # Encode actions as a one-hot vector, based on the action that was chosen
    act_t = tf.one_hot(act_t_ph, depth=num_actions, dtype=tf.float32, name="action_one_hot")
    q_act_t = tf.reduce_sum(act_t * current_q_func, axis=1)

    # Bellman target; the done mask zeroes the next-state term at episode ends
    # (see the description of done_mask_ph above)
    y = rew_t_ph + gamma * (1.0 - done_mask_ph) * tf.reduce_max(target_q_func, axis=1)
    total_error = tf.square(tf.subtract(y, q_act_t))  # (reward + gamma*V(s') - Q(s, a))**2

    # construct optimization op (with gradient clipping)
    learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs)
    train_fn = minimize_and_clip(optimizer, total_error, var_list=q_func_vars, clip_val=grad_norm_clipping)

    # update_target_fn will be called periodically to copy Q network to target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    update_target_fn = tf.group(*update_target_fn)

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_EVERY_N_STEPS = 200000

    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Note that you cannot use "last_obs"
        # directly as input into your network, since it needs to be processed to
        # include context from previous frames. The replay buffer has a function
        # called encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding input
        # that should be given to a Q network by appending some previous frames.
# Store last_obs into replay buffer idx = replay_buffer.store_frame(last_obs) if t == 0: act, reward, done = env.action_space.sample(), 0, False # Choose action if not model_initialized: # choose random action act = env.action_space.sample() else: input_batch = replay_buffer.encode_recent_observation() act = policy.select_action(current_q_func, input_batch, obs_t_ph) # Step simulator forward one step last_obs, reward, done, info = env.step(act) replay_buffer.store_effect( idx, act, reward, done) # Store action taken after last_obs and corresponding reward if done == True: # done was True in latest transition; we have already stored that last_obs = env.reset() # Reset observation done = False ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # 3.b: initialize the model if it has not been initialized yet; to do # that, call # initialize_interdependent_variables(session, tf.global_variables(), { # obs_t_ph: obs_t_batch, # obs_tp1_ph: obs_tp1_batch, # }) # where obs_t_batch and obs_tp1_batch are the batches of observations at # the current and next time step. The boolean variable model_initialized # indicates whether or not the model has been initialized. # Remember that you have to update the target network too (see 3.d)! # 3.c: train the model. To do this, you'll need to use the train_fn and # total_error ops that were created earlier: total_error is what you # created to compute the total Bellman error in a batch, and train_fn # will actually perform a gradient step and update the network parameters # to reduce total_error. 
When calling session.run on these you'll need to # populate the following placeholders: # obs_t_ph # act_t_ph # rew_t_ph # obs_tp1_ph # done_mask_ph # (this is needed for computing total_error) # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) # (this is needed by the optimizer to choose the learning rate) # 3.d: periodically update the target network by calling # session.run(update_target_fn) # you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### # 3.a Sample a batch of transitions obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = replay_buffer.sample( batch_size) # 3.b Initialize model if not initialized yet if not model_initialized: initialize_interdependent_variables( session, tf.global_variables(), { obs_t_ph: obs_t_batch, obs_tp1_ph: obs_tp1_batch, }) session.run(update_target_fn) model_initialized = True # 3.c Train the model using train_fn and total_error session.run( train_fn, { obs_t_ph: obs_t_batch, act_t_ph: act_batch, rew_t_ph: rew_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask, learning_rate: optimizer_spec.lr_schedule.value(t) }) # 3.d Update target network every target_update_freq steps if t % target_update_freq == 0: session.run(update_target_fn) num_param_updates += 1 ##### ### 4. Log progress episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and model_initialized: print("Timestep %d" % (t, )) t_log.append(t) print("mean reward (100 episodes) %f" % mean_episode_reward) mean_reward_log.append(mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) best_mean_log.append(best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) episodes_log.append(len(episode_rewards)) print("exploration %f" % policy.current_eps) exploration_log.append(policy.current_eps) print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) learning_rate_log.append(optimizer_spec.lr_schedule.value(t)) sys.stdout.flush() if t % SAVE_EVERY_N_STEPS == 0 and model_initialized: training_log = ({ 't_log': t_log, 'mean_reward_log': mean_reward_log, 'best_mean_log': best_mean_log, 'episodes_log': episodes_log, 'exploration_log': exploration_log, 'learning_rate_log': learning_rate_log }) output_file_name = 'ram_lr' + str(lr_multiplier) + '_' + str( t) + '_data.pkl' with open(output_file_name, 'wb') as f: pickle.dump(training_log, f)
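# The loop above calls policy.select_action(current_q_func, input_batch, obs_t_ph) and reads
# policy.current_eps, but the policy object itself is not shown. A hedged sketch of a
# compatible epsilon-greedy policy follows; the constructor arguments (session, num_actions,
# exploration schedule with .value(t)) are assumptions, and only the two members used by the
# loop are reproduced.
import numpy as np

class EpsilonGreedyPolicy:
    def __init__(self, session, num_actions, exploration):
        self.session = session            # tf.Session used to evaluate the Q network
        self.num_actions = num_actions
        self.exploration = exploration    # schedule exposing .value(t)
        self.current_eps = 1.0
        self.t = 0

    def select_action(self, q_values_op, obs, obs_ph):
        self.current_eps = self.exploration.value(self.t)
        self.t += 1
        if np.random.rand() < self.current_eps:
            return np.random.randint(self.num_actions)  # explore
        q_values = self.session.run(q_values_op, {obs_ph: obs[None]})
        return int(np.argmax(q_values[0]))              # exploit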
def lunarworker(wid): import tensorflow as tf import numpy as np import gym import time import os from distagent import DistAgent from memory import ReplayBuffer from util import Linear, scale, RewMonitor, SkipEnv, StackEnv gpus = tf.config.experimental.get_visible_devices("GPU") # Select single gpu depending on wid total_gpus = 2 gpu_nr = wid % total_gpus tf.config.set_visible_devices(gpus[gpu_nr], 'GPU') # Restricts mem to allow multiple tf sessions on one GPU tf.config.experimental.set_memory_growth(gpus[gpu_nr], True) # Train parameters N = int(8e6) eps = Linear(startval=0.1, endval=0.01, exploresteps=int(200e3)) gamma = 0.99 updatefreq = 4 targetfreq = 1000 savefreq = 80000 # Setup env = gym.make("LunarLander-v2") env = RewMonitor(env) env = SkipEnv(env, skip=4) # env = StackEnv(env, n_frames=4) action_len = env.action_space.n agent = DistAgent(action_len, dense=16, supportsize=29, vmin=-7.0, vmax=7.0) mem = ReplayBuffer(size=int(20e3), batchsize=32) # Prefill tf.print("Collecting history...") prefill_end = int(10e3) state = env.reset() buff = [] for t in range(1, prefill_end + 1): action = env.action_space.sample() endstate, rew, done, _ = env.step(action) data = (state, action, scale(rew), gamma, endstate, float(done)) buff.append(data) if done: state = env.reset() else: state = endstate if t % 10000 == 0: tf.print(f"Collected {t} samples.") tf.print("Done.") tf.print("Storing history...") for data in buff: mem.add(data) tf.print("Done.") # Warm up states, _, _, _, _, _, = mem.sample() agent.probvalues(states) agent.t_probvalues(states) agent.update_target() # Initial dispatch tottime = time.time() # Training loop tf.print(f"Worker {wid} learning...") state = env.reset() episode_rewards = [] buff = [] for t in range(1, N + 1): t_eps = tf.constant(eps(t), dtype=tf.float32) action = agent.eps_greedy_action( state=np.reshape(state, [1, 8]).astype(np.float32), epsval=t_eps, )[0].numpy() endstate, rew, done, info = env.step(action) data = (state, action, scale(rew), gamma, endstate, float(done)) buff.append(data) if info["Game Over"]: score = info["Episode Score"] episode_rewards.append(score) state = env.reset() if len(episode_rewards) % 100 == 0: tmptime = time.time() msit = (tmptime - tottime) / t * 1000 ma100 = np.mean(episode_rewards[-111:-1]) epstr = (f"Epsiode: {len(episode_rewards)}, " + f"Step: {t}, " + f"MA100: {ma100}, " + f"AvgSpeed: {msit:4.2f} ms/it") tf.print(epstr) else: state = endstate if t % updatefreq == 0: for data in buff: mem.add(data) buff = [] (states, actions, drews, gexps, endstates, dones) = mem.sample() agent.train(states, actions, drews, gexps, endstates, dones) if t % targetfreq == 0: agent.update_target() if t % savefreq == 0: dir_str = f"lunarmodels/step{t}/" os.makedirs(dir_str, exist_ok=True) file_str = dir_str + "model-id-" + f"{wid}" + ".h5" agent.save(file_str) env.close() tmptime = time.time() tottime = tmptime - tottime msit = tottime / N * 1000 tf.print(f"Learning done in {tottime:6.0f}s using {msit:4.2f} ms/it.") tf.print("Done.")
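# The worker above anneals epsilon with util.Linear, used as eps(t). A minimal sketch of
# such a schedule, assuming a linear decay from startval to endval over exploresteps and a
# constant value afterwards (the real util.Linear may differ in detail):
class Linear:
    def __init__(self, startval, endval, exploresteps):
        self.startval = startval
        self.endval = endval
        self.exploresteps = exploresteps

    def __call__(self, t):
        frac = min(t / self.exploresteps, 1.0)  # fraction of the exploration phase completed
        return self.startval + frac * (self.endval - self.startval)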
class Maddpg(): """MADDPG Agent : Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize a MADDPG Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ super(Maddpg, self).__init__() self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) # Instantiate Multiple Agent self.agents = [ Agent(state_size,action_size, random_seed, num_agents) for i in range(num_agents) ] # Instantiate Memory replay Buffer (shared between agents) self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def reset(self): """Reset all the agents""" for agent in self.agents: agent.reset() def act(self, states, noise): """Return action to perform for each agents (per policy)""" return [ agent.act(state, noise) for agent, state in zip(self.agents, states) ] def step(self, states, actions, rewards, next_states, dones, num_current_episode): """ # Save experience in replay memory, and use random sample from buffer to learn""" self.memory.add(encode(states), encode(actions), rewards, encode(next_states), dones) # If enough samples in the replay memory and if it is time to update if (len(self.memory) > BATCH_SIZE) and (num_current_episode % UPDATE_EVERY_NB_EPISODE ==0) : # Note: this code only expects 2 agents assert(len(self.agents)==2) # Allow to learn several time in a row in the same episode for i in range(MULTIPLE_LEARN_PER_UPDATE): # Sample a batch of experience from the replay buffer experiences = self.memory.sample() # Update Agent #0 self.maddpg_learn(experiences, own_idx=0, other_idx=1) # Sample another batch of experience from the replay buffer experiences = self.memory.sample() # Update Agent #1 self.maddpg_learn(experiences, own_idx=1, other_idx=0) def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA): """ Update the policy of the MADDPG "own" agent. The actors have only access to agent own information, whereas the critics have access to all agents information. Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> action
            critic_target(all_states, all_actions) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            own_idx (int): index of the own agent to update in self.agents
            other_idx (int): index of the other agent to update in self.agents
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Filter out the agent OWN states, actions and next_states batch
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states)

        # Filter out the OTHER agent states, actions and next_states batch
        other_states = decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)

        # Concatenate both agents' information (own agent first, other agent second)
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states), dim=1).to(device)

        agent = self.agents[own_idx]

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from the target models
        # (target actions are computed from the *next* states, per the Q_targets formula above)
        all_next_actions = torch.cat((agent.actor_target(own_next_states),
                                      agent.actor_target(other_next_states)), dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss (the other agent's actions are detached so gradients
        # only flow through the own agent's actor)
        all_actions_pred = torch.cat((agent.actor_local(own_states),
                                      agent.actor_local(other_states).detach()), dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()

        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)

    def checkpoints(self):
        """Save checkpoints for all Agents"""
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'models/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'models/checkpoint_critic_local_' + str(idx) + '.pth'
            actor_target_filename = 'models/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'models/checkpoint_critic_target_' + str(idx) + '.pth'
            torch.save(agent.actor_local.state_dict(), actor_local_filename)
            torch.save(agent.critic_local.state_dict(), critic_local_filename)
            torch.save(agent.actor_target.state_dict(), actor_target_filename)
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
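# The Maddpg class above relies on encode (used in step) and decode (used in maddpg_learn)
# helpers that are defined elsewhere in the repository. A plausible minimal sketch, assuming
# encode flattens the per-agent rows into one vector for the shared buffer and decode slices
# one agent's columns back out of a batched tensor; the exact layout is an assumption.
import numpy as np

def encode(per_agent_rows):
    # e.g. a (num_agents, state_size) array -> a flat (num_agents * state_size,) vector
    return np.asarray(per_agent_rows).reshape(-1)

def decode(size, num_agents, agent_idx, batch):
    # batch: (batch_size, num_agents * size) -> the columns belonging to one agent
    assert batch.shape[1] == num_agents * size
    start = agent_idx * size
    return batch[:, start:start + size]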
class DQN: def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01, batch_size=128, device="cpu"): self.n_actions = n_actions # 总的动作个数 self.device = device # 设备,cpu或gpu等 self.gamma = gamma # 奖励的折扣因子 # e-greedy策略相关参数 self.actions_count = 0 # 用于epsilon的衰减计数 self.epsilon = 0 self.epsilon_start = epsilon_start self.epsilon_end = epsilon_end self.epsilon_decay = epsilon_decay self.batch_size = batch_size self.policy_net = MLP(n_states, n_actions).to(self.device) self.target_net = MLP(n_states, n_actions).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout # 可查parameters()与state_dict()的区别,前者require_grad=True self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) self.loss = 0 self.memory = ReplayBuffer(memory_capacity) def choose_action(self, state, train=True): '''选择动作 ''' if train: self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ math.exp(-1. * self.actions_count / self.epsilon_decay) self.actions_count += 1 if random.random() > self.epsilon: with torch.no_grad(): # 先转为张量便于丢给神经网络,state元素数据原本为float64 # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 state = torch.tensor([state], device=self.device, dtype=torch.float32) # 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>) q_value = self.policy_net(state) # tensor.max(1)返回每行的最大值以及对应的下标, # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() else: action = random.randrange(self.n_actions) return action else: with torch.no_grad(): # 取消保存梯度 # 先转为张量便于丢给神经网络,state元素数据原本为float64 # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 state = torch.tensor( [state], device='cpu', dtype=torch.float32 ) # 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>) q_value = self.target_net(state) # tensor.max(1)返回每行的最大值以及对应的下标, # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() return action def update(self): if len(self.memory) < self.batch_size: return # 从memory中随机采样transition state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( self.batch_size) '''转为张量 例如tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]])''' state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float) action_batch = torch.tensor(action_batch, device=self.device).unsqueeze( 1) # 例如tensor([[1],...,[0]]) reward_batch = torch.tensor( reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1]) next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float) done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze( 1) # 将bool转为float然后转为张量 '''计算当前(s_t,a)对应的Q(s_t, a)''' '''torch.gather:对于a=torch.Tensor([[1,2],[3,4]]),那么a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])''' q_values = self.policy_net(state_batch).gather( dim=1, index=action_batch) # 等价于self.forward # 计算所有next states的V(s_{t+1}),即通过target_net中选取reward最大的对应states next_state_values = self.target_net(next_state_batch).max( 1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,]) # 计算 expected_q_value # 对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward expected_q_values = 
reward_batch + self.gamma * next_state_values * (1 - done_batch.squeeze(1))  # use each sample's own done flag
        # self.loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))  # Huber loss
        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error loss
        # Optimize the model
        self.optimizer.zero_grad()  # zero_grad clears the old gradients from the last step
        # loss.backward() backpropagates the loss w.r.t. all parameters that require gradients
        self.loss.backward()
        for param in self.policy_net.parameters():  # clip to prevent exploding gradients
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # update the model

    def save_model(self, path):
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        self.target_net.load_state_dict(torch.load(path))
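# The DQN class above builds policy_net and target_net from an MLP class that is not shown
# here. A minimal sketch, assuming a two-hidden-layer fully connected network; the hidden
# width is a placeholder.
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=128):
        super().__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, n_actions)  # one Q-value per action

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)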
class NECAgent: """ NEC agent """ def __init__(self, config): self.nec_net = NEC(config).to(config['device']) self.train_eps = config['train_eps'] self.eval_eps = config['eval_eps'] self.num_actions = config['num_actions'] self.replay_buffer = ReplayBuffer(config['observation_shape'], config['replay_buffer_size']) self.batch_size = config['batch_size'] self.discount = config['discount'] self.n_step_horizon = config['horizon'] self.episode = 0 self.logger = ScoreLogger(config['env_name'], config['exp_name']) self.env_name = config['env_name'] self.exp_name = config['exp_name'] self.device = config['device'] self.train() # make sure model is on appropriate device at this point before constructing optimizer self.optimizer = RMSprop(self.nec_net.parameters(), lr=config['learning_rate'], alpha=config['rmsprop_alpha'], eps=config['rmsprop_epsilon']) self.loss_fn = MSELoss() def train(self): self.training = True self.nec_net.train() def eval(self): self.training = False self.nec_net.eval() def new_episode(self): # trackers for computing N-step returns and updating replay and dnd memories at the end of episode self.observations, self.keys, self.actions, self.values, self.rewards = [], [], [], [], [] self.episode += 1 def set_epsilon(self, eps): self.train_eps = eps def step(self, obs): q_values, key = self.nec_net.lookup(obs) eps = self.train_eps if self.training else self.eval_eps # do epsilon-greedy crap action = np.random.choice(np.arange( self.num_actions)) if np.random.rand() < eps else _argmax(q_values) # update trackers if self.training: self.actions.append(action) self.observations.append(obs) self.keys.append(key) self.values.append(np.max(q_values)) return action def update(self, consequence): """ Called from main training loop to inform agent of consequence of last action including reward and if the episode terminated """ reward, done = consequence if self.env_name.startswith("CartPole"): reward = reward if not done else -reward # update reward tracker self.rewards.append(reward) if done: episode_length = len(self.actions) # compute N-step returns in reverse order returns, n_step_returns = [None] * (episode_length + 1), [None] * episode_length returns[episode_length] = 0 for t in range(episode_length - 1, -1, -1): returns[t] = self.rewards[t] + self.discount * returns[t + 1] if episode_length - t > self.n_step_horizon: n_step_returns[t] = returns[ t] + self.discount**self.n_step_horizon * ( self.values[t + self.n_step_horizon] - returns[t + self.n_step_horizon]) else: # use on-policy monte carlo returns when below horizon n_step_returns[t] = returns[t] self.keys, n_step_returns = torch.stack(self.keys), np.array( n_step_returns, dtype=np.float32) # for fancy indexing # batch update of replay memory self.replay_buffer.append_batch( np.stack(self.observations), np.asarray(self.actions, dtype=np.int64), n_step_returns) # batch update of episodic memories unique_actions = np.unique(self.actions) for action in unique_actions: action_idxs = np.nonzero(self.actions == action)[0] self.nec_net.update_memory(action, self.keys[action_idxs], n_step_returns[action_idxs]) # save/log metrics for plotting or whatever solved = self.logger.add_score(sum(self.rewards), self.episode) if solved: path = f'{os.getcwd()}/cartpole/trained_agents/nec_{self.exp_name}.pth' torch.save(self.nec_net.state_dict(), path) return True return False def optimize(self): """ Here, we sample from the replay buffer and train the NEC model end-to-end with backprop """ if self.replay_buffer.size() < self.batch_size: return 
observations, actions, returns = self.replay_buffer.sample(self.batch_size)

        self.optimizer.zero_grad()
        q_values = self.nec_net(observations.to(self.device))[range(self.batch_size), actions]  # pick q_values for chosen actions
        loss = self.loss_fn(q_values, returns.to(self.device))
        loss.backward()
        self.optimizer.step()

    def get_q_values(self, observations, actions):
        """ Computes q_values for observation, action pairs passed in. Used for testing """
        with torch.no_grad():
            self.eval()
            observations = torch.from_numpy(observations)
            q_values = self.nec_net(observations)[range(len(actions)), actions]
            return q_values.numpy()
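# The NEC agent's step method above calls a module-level _argmax helper that is not shown.
# A common reason to wrap np.argmax is to break ties between equal Q-values at random; a
# sketch under that assumption:
import numpy as np

def _argmax(q_values):
    q_values = np.asarray(q_values)
    best = np.flatnonzero(q_values == q_values.max())  # indices of all maximal entries
    return np.random.choice(best)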
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, mnoise=True, split_state=True): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.mnoise = mnoise self.split_state = split_state # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # initialize targets same as original networks self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) # Noise process if self.mnoise: self.noise = OUNoise((2, action_size), random_seed) else: self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, states, actions, rewards, next_states, dones, step): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward if self.split_state: for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) else: self.memory.add(states, actions, rewards, next_states, dones) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, target, source): """ Copy network parameters from source to target Inputs: target (torch.nn.Module): Net to copy parameters to source (torch.nn.Module): Net whose parameters to copy """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
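# The Agent above perturbs actions with OUNoise, which is defined elsewhere. A standard
# Ornstein-Uhlenbeck process sketch using the (size, seed) constructor the code calls;
# the mu, theta and sigma defaults are assumptions.
import copy
import random
import numpy as np

class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state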
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, number_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed number_agents (int): number of agents """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.number_agents = number_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise processes self.noise = OUNoise((number_agents, action_size), random_seed) #self.noise = GaussianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1) #self.noise = GeometricBrownianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experiences in replay memory, and use random sample from buffer to learn.""" # We save experience tuples in the memory for each agent. for i in range(self.number_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) # Learn, if enough samples are available in memory (threshold value: BATCH_SIZE) and at learning interval settings if len(self.memory) > BATCH_SIZE: for _ in range(UPDATE_RATE): experiences = self.memory.sample() self.learn(experiences, GAMMA) # def act(self, states, add_noise=True): # """Returns actions for given state as per current policy.""" # # The code has been adapted to implement batch normalization. # actions = np.zeros((self.number_agents, self.action_size)) # self.actor_local.eval() # with torch.no_grad(): # for agent_number, state in enumerate(states): # state = torch.from_numpy(state).float().unsqueeze(0).to(device) # The code has been adapted to implement batch normalization. # action = self.actor_local(state).cpu().data.numpy() # actions[agent_number, :] = action # self.actor_local.train() # if add_noise: # actions += self.noise.sample() # return np.clip(actions, -1, 1) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.number_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_number, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_number, :] = action self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG_Agent: def __init__(self, state_size, action_size, random_seed, actor_hidden=[400, 300], critic_hidden=[400, 300], id=0): super(DDPG_Agent, self).__init__() self.actor_local = Actor(state_size, action_size, random_seed, hidden_layer_param=actor_hidden).to(DEVICE) self.actor_target = Actor(state_size, action_size, random_seed, hidden_layer_param=actor_hidden).to(DEVICE) self.critic_local = Critic(state_size, action_size, random_seed, hidden_layer_param=critic_hidden).to(DEVICE) self.critic_target = Critic( state_size, action_size, random_seed, hidden_layer_param=critic_hidden).to(DEVICE) self.actor_opt = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.critic_opt = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) self.memory = ReplayBuffer(action_size, random_seed) self.seed = random.seed(random_seed) self.id = id print(critic_hidden) print("") print("--- Agent {} Params ---".format(self.id)) print("Going to train on {}".format(DEVICE)) print("Learning Rate:: Actor: {} | Critic: {}".format( LR_ACTOR, LR_CRITIC)) print( "Replay Buffer:: Buffer Size: {} | Sampled Batch size: {}".format( BUFFER_SIZE, BATCH_SIZE)) print("") print("Actor paramaters:: Input: {} | Hidden Layers: {} | Output: {}". format(state_size, actor_hidden, action_size)) print("Critic paramaters:: Input: {} | Hidden Layers: {} | Output: {}". format(state_size, [critic_hidden[0] + action_size, *critic_hidden[1:]], 1)) print(self.actor_local) print(self.critic_local) print("") print("") # def act(self, state): # state = torch.from_numpy(state).float().to(DEVICE) # self.actor_local.eval() # with torch.no_grad(): # actions = self.actor_local(state).cpu().data.numpy() # self.actor_local.train() # return actions def act(self, obs, noise=0.0): obs = obs.to(DEVICE) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(obs) #+ noise*self.noise.noise() return action def step(self, state, action, reward, next_state, done): # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # --- Teach Critic (with TD) --- # recommended_actions = self.actor_target(next_states) Q_nexts = self.critic_target(next_states, recommended_actions) Q_targets = (rewards + GAMMA * Q_nexts * (1 - dones) ) # This is what we actually got from experience Q_expected = self.critic_local( states, actions ) # This is what we thought the expected return of that state-action is. critic_loss = CRITERION(Q_targets, Q_expected) self.critic_opt.zero_grad() critic_loss.backward() self.critic_opt.step() # --- Teach Actor --- # next_actions = self.actor_local(states) # Here we get the value of each state-actions. # This will be backpropagated to the weights that produced the action in the actor network. # Large values will make weights stronger, smaller values (less expected return for that state-action) weaker actor_loss = -self.critic_local(states, next_actions).mean() self.actor_opt.zero_grad() actor_loss.backward() self.actor_opt.step() # Mix model parameters in both Actor and Critic # self.soft_update(self.actor_local, self.actor_target) self.soft_update(self.critic_local, self.critic_target) def soft_update(self, local, target): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)
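# DDPG_Agent above reads several module-level constants defined elsewhere in the repository.
# The values below are representative placeholders shown only to make the snippet
# self-contained; the actual settings may differ.
import torch
import torch.nn as nn

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BUFFER_SIZE = int(1e6)    # replay buffer size
BATCH_SIZE = 128          # minibatch size
GAMMA = 0.99              # discount factor
TAU = 1e-3                # soft-update interpolation factor
LR_ACTOR = 1e-4           # actor learning rate
LR_CRITIC = 1e-3          # critic learning rate
CRITERION = nn.MSELoss()  # critic loss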
class DQN(): def __init__(self, env, save_location, start_episode=1, saved_model=None, prioritized_replay=False): self.env = env self.num_actions = env.action_space.n self.start_episode = start_episode self.save_location = save_location self.saved_model = saved_model self.prioritized_replay = prioritized_replay self.alpha = 0.6 self.beta = 0 self.learning_rate = 1e-4 self.gamma = 0.98 self.buffer_limit = 10**5 self.training_frame_start = 10000 * 5 self.batch_size = 32 self.eps_start = 1 self.eps_end = 0.01 self.decay_factor = 10**5 if prioritized_replay: self.memory = PrioritizedReplayBuffer(size=self.buffer_limit, alpha=self.alpha) self.prioritized_replay_eps = 1e-5 else: self.memory = ReplayBuffer(size=self.buffer_limit) if saved_model: self.epsilon_decay = lambda x: self.eps_end else: self.epsilon_decay = lambda x: self.eps_end + ( self.eps_start - self.eps_end) * math.exp(-1. * x / self. decay_factor) self.save_interval = 100000 self.update_target_interval = 10000 self.device = device self.q = Qnet(84, 84, in_channels=4, n_actions=self.num_actions).to(device) self.q_target = Qnet(84, 84, in_channels=4, n_actions=self.num_actions).to(device) #[self.q, self.q_target], self.optimizer = amp.initialize([self.q, self.q_target], self.optimizer, opt_level="O1") #playing around with mixed-precision training def train(self): s, a, r, s_prime, done_mask = self.memory.sample(self.batch_size) s = torch.as_tensor(s).to(device) a = torch.LongTensor(a).to(device) r = torch.as_tensor(r).to(device) s_prime = torch.as_tensor(s_prime).to(device) done_mask = torch.as_tensor(done_mask).to(device) q_out = self.q(s) # collect output from the chosen action dimension q_a = q_out.gather(1, a) # most reward we get in next state s_prime max_q_prime = self.q_target(s_prime).max(1)[0].unsqueeze(1) target = r + self.gamma * max_q_prime * done_mask # how much is our policy different from the true target loss = F.smooth_l1_loss(q_a, target) self.optimizer.zero_grad() #with amp.scale_loss(loss, self.optimizer) as scaled_loss: # playing around with mixed-precision training # scaled_loss.backward() loss.backward() self.optimizer.step() def run(self, num_episodes): self.q_target.load_state_dict( self.q.state_dict()) # Load policy weights into target network self.optimizer = optim.Adam(self.q.parameters(), lr=self.learning_rate) if self.saved_model: self.q.load_state_dict( torch.load(saved_model)) # Load pretrained model self.beginLogging() #watcher = tw.Watcher() env = self.env best_episode_score = float('-Inf') score = 0.0 total_frames = 0 state = get_state(env.reset()) # Start first game for episode in tqdm( range(self.start_episode, self.start_episode + num_episodes)): # anneal 100% to 1% over training epsilon = self.epsilon_decay(total_frames) episode_score = 0 done = False while not done: action = self.q.sample_action( torch.Tensor(state).unsqueeze(0).to(device), epsilon) obs, reward, done, info = env.step(action) next_state = get_state(obs) done_mask = 0.0 if done else 1.0 self.memory.put((state, action, reward, next_state, done_mask)) state = next_state score += reward episode_score += reward if total_frames > self.training_frame_start: self.train() # Copy policy weights to target if total_frames % self.update_target_interval == 0: self.q_target.load_state_dict(self.q.state_dict()) # Save policy weights if total_frames % self.save_interval == 0: torch.save( self.q.state_dict(), os.path.join(self.save_location, 'policy_%s.pt' % episode)) # Reset environment for the next game if done: state = get_state(env.reset()) 
total_frames += 1 best_episode_score = max(best_episode_score, episode_score) # Print updates every episode out = "n_episode : {}, Total Frames : {}, Average Score : {:.1f}, Episode Score : {:.1f}, Best Score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( episode, total_frames, score / episode, episode_score, best_episode_score, len(self.memory), epsilon * 100) print(out) self.log(out) # Microsoft Tensorwatch Watcher for Visualizing Training #watcher.observe( # episode = episode, # episode_score = episode_score, # total_score = score, # buffer_size = self.memory.size(), # epsilon = epsilon, # frames = total_frames, #) # save final model weights torch.save(self.q.state_dict(), os.path.join(self.save_location, 'policy_final.pt')) def beginLogging(self): with open(os.path.join(self.save_location, 'log.out'), 'w') as f: f.write('') def log(self, out): with open(os.path.join(self.save_location, 'log.out'), 'a') as f: f.write('%s\n' % out)
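# The training loop above converts raw env observations with a get_state helper that is not
# shown. A plausible sketch, assuming the wrapped env already returns 84x84 stacked frames
# and the Qnet expects channels-first float input in [0, 1]; both assumptions may differ
# from the original preprocessing.
import numpy as np

def get_state(obs):
    state = np.asarray(obs, dtype=np.float32) / 255.0
    if state.ndim == 3 and state.shape[-1] in (1, 4):  # HWC -> CHW
        state = np.transpose(state, (2, 0, 1))
    return state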
class DQN: def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01, batch_size=128, device="cpu"): self.actions_count = 0 self.n_actions = n_actions # 总的动作个数 self.device = device # 设备,cpu或gpu等 self.gamma = gamma # e-greedy策略相关参数 self.epsilon = 0 self.epsilon_start = epsilon_start self.epsilon_end = epsilon_end self.epsilon_decay = epsilon_decay self.batch_size = batch_size self.policy_net = FCN(n_states, n_actions).to(self.device) self.target_net = FCN(n_states, n_actions).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout # 可查parameters()与state_dict()的区别,前者require_grad=True self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) self.loss = 0 self.memory = ReplayBuffer(memory_capacity) def choose_action(self, state, train=True): '''选择动作 ''' if train: self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ math.exp(-1. * self.actions_count / self.epsilon_decay) self.actions_count += 1 if random.random() > self.epsilon: with torch.no_grad(): # 先转为张量便于丢给神经网络,state元素数据原本为float64 # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 state = torch.tensor([state], device=self.device, dtype=torch.float32) # 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>) q_value = self.policy_net(state) # tensor.max(1)返回每行的最大值以及对应的下标, # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() else: action = random.randrange(self.n_actions) return action else: with torch.no_grad(): # 先转为张量便于丢给神经网络,state元素数据原本为float64 # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 state = torch.tensor([state], device='cpu', dtype=torch.float32) # 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>) q_value = self.target_net(state) # tensor.max(1)返回每行的最大值以及对应的下标, # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() return action def update(self): if len(self.memory) < self.batch_size: return # 从memory中随机采样transition state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( self.batch_size) # 转为张量 # 例如tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]]) state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float) action_batch = torch.tensor(action_batch, device=self.device).unsqueeze( 1) # 例如tensor([[1],...,[0]]) reward_batch = torch.tensor( reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1]) next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float) done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze( 1) # 将bool转为float然后转为张量 # 计算当前(s_t,a)对应的Q(s_t, a) q_values = self.policy_net(state_batch) next_q_values = self.policy_net(next_state_batch) # 代入当前选择的action,得到Q(s_t|a=a_t) q_value = q_values.gather(dim=1, index=action_batch) '''以下是Nature DQN的q_target计算方式 # 计算所有next states的Q'(s_{t+1})的最大值,Q'为目标网络的q函数 next_q_state_value = self.target_net( next_state_batch).max(1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,]) # 计算 q_target # 对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0]) ''' '''以下是Double 
DQN q_target computation (differs slightly from Nature DQN)'''
        next_target_values = self.target_net(next_state_batch)
        # Select the action with argmax Q(s_{t+1}, a) under the policy net, then evaluate it
        # with the target net: Q'(s_{t+1}, argmax_a Q(s_{t+1}, a))
        next_target_q_value = next_target_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        q_target = reward_batch + self.gamma * next_target_q_value * (
            1 - done_batch.squeeze(1))  # use each sample's own done flag
        self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1))  # mean squared error loss
        # Optimize the model
        self.optimizer.zero_grad()  # zero_grad clears the old gradients from the last step
        # loss.backward() backpropagates the loss w.r.t. all parameters that require gradients
        self.loss.backward()
        for param in self.policy_net.parameters():  # clip to prevent exploding gradients
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # update the model

    def save_model(self, path):
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        self.target_net.load_state_dict(torch.load(path))
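# For contrast with the Double DQN update above, a compact sketch of both target
# computations described in the comments. The helper names and the assumption that
# reward_batch, done_batch and next_state_batch are already 1-D / batched tensors are
# illustrative only.
import torch

def nature_dqn_target(reward_batch, next_state_batch, done_batch, target_net, gamma):
    # max over the target network's own Q-values
    next_q = target_net(next_state_batch).max(1)[0].detach()
    return reward_batch + gamma * next_q * (1 - done_batch)

def double_dqn_target(reward_batch, next_state_batch, done_batch, policy_net, target_net, gamma):
    # action chosen by the policy network, evaluated by the target network
    next_actions = policy_net(next_state_batch).argmax(1, keepdim=True)
    next_q = target_net(next_state_batch).gather(1, next_actions).squeeze(1).detach()
    return reward_batch + gamma * next_q * (1 - done_batch)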
class DQN_agent(object): def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)): self.env = env self.max_episode_steps = env._max_episode_steps """ beta: The discounted factor of Q-value function (epsilon): The explore or exploit policy epsilon. initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1 final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps', The epsilon set to the 'final_epsilon' determinately. epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'. """ self.beta = hyper_params['beta'] self.initial_epsilon = 1 self.final_epsilon = hyper_params['final_epsilon'] self.epsilon_decay_steps = hyper_params['epsilon_decay_steps'] """ episode: Record training episode steps: Add 1 when predicting an action learning: The trigger of agent learning. It is on while training agent. It is off while testing agent. action_space: The action space of the current environment, e.g 2. """ self.episode = 0 self.steps = 0 self.best_reward = 0 self.learning = True self.action_space = action_space """ input_len The input length of the neural network. It equals to the length of the state vector. output_len: The output length of the neural network. It is equal to the action space. eval_model: The model for predicting action for the agent. target_model: The model for calculating Q-value of next_state to update 'eval_model'. use_target_model: Trigger for turn 'target_model' on/off """ state = env.reset() input_len = len(state) output_len = action_space self.eval_model = DQNModel(input_len, output_len, learning_rate=hyper_params['learning_rate']) self.use_target_model = hyper_params['use_target_model'] if self.use_target_model: self.target_model = DQNModel(input_len, output_len) # memory: Store and sample experience replay. self.memory = ReplayBuffer(hyper_params['memory_size']) """ batch_size: Mini batch size for training model. update_steps: The frequence of traning model model_replace_freq: The frequence of replacing 'target_model' by 'eval_model' """ self.batch_size = hyper_params['batch_size'] self.update_steps = hyper_params['update_steps'] self.model_replace_freq = hyper_params['model_replace_freq'] print("agent initialized") # Linear decrease function for epsilon def linear_decrease(self, initial_value, final_value, curr_steps, final_decay_steps): decay_rate = curr_steps / final_decay_steps if decay_rate > 1: decay_rate = 1 return initial_value - (initial_value - final_value) * decay_rate def explore_or_exploit_policy(self, state): p = uniform(0, 1) # Get decreased epsilon epsilon = self.linear_decrease(self.initial_epsilon, self.final_epsilon, self.steps, self.epsilon_decay_steps) #if(np.random.randint(1000)==4): #print("epsilon",epsilon) if p < epsilon: #return action return randint(0, self.action_space - 1) else: #return action return self.greedy_policy(state) def greedy_policy(self, state): return self.eval_model.predict(state) # This next function will be called in the main RL loop to update the neural network model given a batch of experience # 1) Sample a 'batch_size' batch of experiences from the memory. # 2) Predict the Q-value from the 'eval_model' based on (states, actions) # 3) Predict the Q-value from the 'target_model' base on (next_states), and take the max of each Q-value vector, Q_max # 4) If is_terminal == 1, q_target = reward + discounted factor * Q_max, otherwise, q_target = reward # 5) Call fit() to do the back-propagation for 'eval_model'. 
def update_batch(self): if len(self.memory ) < self.batch_size or self.steps % self.update_steps != 0: return #print("fetching minibatch from replay memory") batch = self.memory.sample(self.batch_size) (states, actions, reward, next_states, is_terminal) = batch states = states next_states = next_states terminal = FloatTensor([1 if t else 0 for t in is_terminal]) reward = FloatTensor(reward) batch_index = torch.arange(self.batch_size, dtype=torch.long) # Current Q Values _, q_values = self.eval_model.predict_batch(states) #q_values = q_values[np.arange(self.batch_size), actions] q_values = q_values[batch_index, actions] # Calculate target if self.use_target_model: #print("target_model.predict") best_actions, q_next = self.target_model.predict_batch(next_states) else: best_actions, q_next = self.eval_model.predict_batch(next_states) q_max = q_next[batch_index, best_actions] terminal = 1 - terminal q_max *= terminal q_target = reward + self.beta * q_max # update model self.eval_model.fit(q_values, q_target) def learn_and_evaluate(self, training_episodes, test_interval): test_number = training_episodes // test_interval all_results = [] for i in range(test_number): # learn self.learn(test_interval) # evaluate avg_reward = self.evaluate() all_results.append(avg_reward) return all_results def learn(self, test_interval): for episode in tqdm(range(test_interval), desc="Training"): state = self.env.reset() done = False steps = 0 while steps < self.max_episode_steps and not done: #INSERT YOUR CODE HERE # add experience from explore-exploit policy to memory action = self.explore_or_exploit_policy(state) next_state, reward, done, info = self.env.step(action) self.memory.add(state, action, reward, next_state, done) # update the model every 'update_steps' of experience self.update_batch() # update the target network (if the target network is being used) every 'model_replace_freq' of experiences if self.use_target_model and (self.steps % self.model_replace_freq == 0): self.target_model.replace(self.eval_model) self.steps += 1 steps += 1 state = next_state def evaluate(self, trials=30): total_reward = 0 for _ in tqdm(range(trials), desc="Evaluating"): state = self.env.reset() done = False steps = 0 while steps < self.max_episode_steps and not done: steps += 1 action = self.greedy_policy(state) state, reward, done, _ = self.env.step(action) total_reward += reward avg_reward = total_reward / trials print(avg_reward) f = open(result_file, "a+") f.write(str(avg_reward) + "\n") f.close() if avg_reward >= self.best_reward: self.best_reward = avg_reward self.save_model() return avg_reward # save model def save_model(self): self.eval_model.save(result_floder + '/best_model.pt') # load model def load_model(self): self.eval_model.load(result_floder + '/best_model.pt')
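# The DQN_agent above is driven by a hyper_params dictionary; the exact values live in the
# training script, so every number below is a placeholder chosen only to illustrate the keys
# read in __init__, not the values used in the original experiments.
hyper_params = {
    'beta': 0.99,                  # discount factor
    'final_epsilon': 0.1,          # epsilon after decay
    'epsilon_decay_steps': 100000,
    'learning_rate': 0.0003,
    'use_target_model': True,
    'memory_size': 2000,
    'batch_size': 32,
    'update_steps': 10,            # learn every N steps
    'model_replace_freq': 2000,    # sync target network every N steps
}
agent = DQN_agent(env, hyper_params)
results = agent.learn_and_evaluate(training_episodes=10000, test_interval=50)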
class DQNAgent_Vanila_simple(agent): def __init__(self, model, opt, learning=True): super().__init__() self.memory = ReplayBuffer(3000) self.previous_state = None self.previous_action = None self.previous_legal_actions = None self.step = 0 self.model = model self.opt = opt self.loss = 0 self.batch_size = 10 self.test_q = 0 self.max_tile = 0 #self.test_q = 0 self.epsilon_schedule = LinearSchedule(1000000, initial_p=0.99, final_p=0.01) self.learning = learning def should_explore(self): self.epsilon = self.epsilon_schedule.value(self.step) return random.random() < self.epsilon def action(self): if self.learning: self.step += 1 legalActions = self.legal_actions(deepcopy(self.gb.board)) if len(legalActions) == 0: print(111111111111111111111111111111111111111) board = deepcopy(self.gb.board) board = oneHotMap(board) if self.learning and self.should_explore(): q_values = None action = random.choice(legalActions) choice = self.actions[action] else: #mark state = torch.from_numpy(board).type( torch.FloatTensor).cuda().view(-1, 17, 4, 4) action, q_values = self.predict(state, legalActions) choice = self.actions[action] if self.learning: reward = self.gb.currentReward if reward != 0: reward = np.log2(reward) if (self.previous_state is not None and self.previous_action is not None): self.memory.add(self.previous_state, self.previous_action, self.previous_legal_actions, reward, legalActions, board, 0) self.previous_state = board self.previous_action = action self.previous_legal_actions = legalActions if self.learning: self.update() return choice def enableLearning(self): self.model.train() self.learning = True self.max_tile = 0 self.reset() def disableLearning(self): self.model.eval() self.learning = False def end_episode(self): if not self.learning: m = np.max(self.gb.board) if m > self.max_tile: self.max_tile = m return #print(self.gb.board) board = deepcopy(self.gb.board) board = oneHotMap(board) #legalActions = self.legal_actions(deepcopy(self.gb.board)) #print(legalActions) self.memory.add(self.previous_state, self.previous_action, self.previous_legal_actions, self.gb.currentReward, [], board, 1) self.reset() def reset(self): self.previous_state = None self.previous_action = None self.previous_legal_actions = None def update(self): if self.step < self.batch_size: return batch = self.memory.sample(self.batch_size) (states, actions, legal_actions, reward, next_legal_actions, next_states, is_terminal) = batch terminal = torch.tensor(is_terminal).type(torch.cuda.FloatTensor) reward = torch.tensor(reward).type(torch.cuda.FloatTensor) states = torch.from_numpy(states).type(torch.FloatTensor).cuda().view( -1, 17, 4, 4) next_states = torch.from_numpy(next_states).type( torch.FloatTensor).cuda().view(-1, 17, 4, 4) # Current Q Values _, q_values = self.predict_batch(states) batch_index = torch.arange(self.batch_size, dtype=torch.long) #print(actions) #print(q_values) q_values = q_values[batch_index, actions] #print(q_values) # Calculate target q_actions_next, q_values_next = self.predict_batch( next_states, legalActions=next_legal_actions) #print(q_values_next) q_max = q_values_next.max(1)[0].detach() q_max = (1 - terminal) * q_max # if sum(terminal == 1) > 0: # print(reward) # print( (terminal == 1).nonzero()) # print(terminal) # print(next_legal_actions) # print(q_max) # input() q_target = reward + 0.99 * q_max self.opt.zero_grad() loss = self.model.loss_function(q_target, q_values) loss.backward() self.opt.step() #train_loss = loss_vae.item() + loss_dqn.item() self.loss += loss.item() / len(states) def 
predict_batch(self, input, legalActions=None): #print(legalActions) q_values = self.model(input) if legalActions is None: values, q_actions = q_values.max(1) else: isNotlegal = True # print(legalActions) # print(q_values) q_values_true = torch.full((self.batch_size, 4), -100000000).cuda() for i, action in enumerate(legalActions): q_values_true[i, action] = q_values[i, action] values, q_actions = q_values_true.max(1) q_values = q_values_true #print(q_values_true) ''' while isNotlegal: isNotlegal = False values, q_actions = q_values.max(1) #print(q_values) #print(values) #print(q_actions) for i, action in enumerate(q_actions): #print(legalActions[i]) if len(legalActions[i]) == 0: continue if action.item() not in legalActions[i]: isNotlegal = True # print(i) # print(action.item()) # print(q_values) q_values[i, action] = -1 # print(q_values) # print("*********************") ''' return q_actions, q_values def predict(self, input, legalActions): q_values = self.model(input) for action in range(4): if action not in legalActions: q_values[0, action] = -100000000 action = torch.argmax(q_values) if int(action.item()) not in legalActions: print(legalActions, q_values, action) print("!!!!!!!!!!!!!!!!!!!!!!!!!") return action.item(), q_values def legal_actions(self, copy_gb): legalActions = [] for i in range(4): try_gb = gameboard(4, deepcopy(copy_gb)) changed = try_gb.takeAction(self.actions[i]) if changed: legalActions.append(i) return legalActions
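# --- Added sketch (not part of the original source) of the legal-action masking idea
# used in predict_batch/predict above: illegal actions get a very large negative
# Q-value so argmax can only pick a legal one. Values and shapes are assumptions.
import torch

q_values = torch.tensor([[0.3, 1.2, -0.5, 0.9]])  # one state, four candidate moves
legal = [0, 3]                                     # only actions 0 and 3 are legal here
masked = torch.full_like(q_values, -1e8)           # mark everything illegal by default
masked[0, legal] = q_values[0, legal]              # restore the legal entries
best_action = masked.argmax(dim=1)                 # -> tensor([3])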
class MADDPG(): """Interacts with and learns from the environment.""" def __init__(self, config): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = config.state_size self.action_size = config.action_size self.seed = random.seed(config.random_seed) self.config = config self.t_step = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(self.state_size, self.action_size, config.random_seed).to(device) self.actor_target = Actor(self.state_size, self.action_size, config.random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(self.state_size, self.action_size, config.random_seed).to(device) self.critic_target = Critic(self.state_size, self.action_size, config.random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.lr_critic, weight_decay=config.weight_decay) # Noise process self.noise = OUNoise(self.action_size, config.random_seed) # Replay memory self.memory = ReplayBuffer(self.action_size, config.buffer_size, config.batch_size, config.random_seed) # ----------------------- initialize target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, 1) self.soft_update(self.actor_local, self.actor_target, 1) if config.shared_replay_buffer: self.memory = config.memory else: self.memory = config.memory_fn() def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.config.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.config.batch_size: experiences = self.memory.sample() self.learn(experiences, self.config.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.config.tau) self.soft_update(self.actor_local, self.actor_target, self.config.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, num_agents, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.epsilon = EPSILON self.num_agents = num_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) #self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, ALPHA, BETA, ANNEAL_OVER) # Tensorboard interface self.writer = SummaryWriter(comment="-ddpg-no-pri") self.tb_tracker = TBMeanTracker(self.writer, batch_size=10) self.step_t = 0 def step(self, state, action, reward, next_state, done, timestamp): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward #for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn at defined interval, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and timestamp % self.num_agents == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA) self.step_t += 1 def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.epsilon * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ #states, actions, rewards, next_states, dones, idxs, weights = experiences states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # update priorities # updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy() # self.memory.update_priorities(idxs, updates) self.tb_tracker.track("loss_critic", critic_loss.to("cpu"), self.step_t) # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.tb_tracker.track("loss_actor", actor_loss.to("cpu"), self.step_t) # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # ---------------------------- update noise ---------------------------- # self.epsilon -= EPSILON_DECAY self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
def train(self): # initialize memory buffer buffer = ReplayBuffer(int(500000), self.batch_size, self.num_agents, 0) # use keep_awake to keep workspace from disconnecting for episode in range(self.number_of_episodes): env_info = self.env.reset(train_mode=True)[self.brain_name] agent_episode_rewards = [0, 0] for agent in self.maddpg.ddpg_agents: agent.noise.reset() for episode_t in range(self.max_episode_len): states = env_info.vector_observations states_t = to_tensor(states) with torch.no_grad(): action_ts = self.maddpg.act(states_t, noise=self.noise) self.noise *= self.noise_reduction actions = torch.stack(action_ts).numpy() env_info = self.env.step(actions)[self.brain_name] next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done for i in range(self.num_agents): agent_episode_rewards[i] += rewards[i] full_state = np.concatenate(states) full_next_state = np.concatenate(next_states) buffer.add((states, full_state, actions, rewards, next_states, full_next_state, dones)) # update once after every episode_per_update critic_losses = [] actor_losses = [] if len(buffer) > self.batch_size and episode % self.episode_per_update == 0: for i in range(self.num_agents): samples = buffer.sample() cl, al = self.maddpg.update(samples, i) critic_losses.append(cl) actor_losses.append(al) self.maddpg.update_targets() # soft update the target network towards the actual networks if np.any(dones): # if any of the agents are done break break episode_reward = max(agent_episode_rewards) self.episode_rewards.append(episode_reward) self.last_100_episode_rewards.append(episode_reward) self.avg_rewards.append(np.mean(self.last_100_episode_rewards)) # scores.append(episode_reward) print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format(episode, self.avg_rewards[-1], episode_reward), end="") if episode % self.print_period == 0: print('\rEpisode {}\tAverage Score: {:.4f}'.format(episode, self.avg_rewards[-1])) # saving successful model # training ends when the threshold value is reached. if self.avg_rewards[-1] >= self.threshold: save_dict_list = [] for i in range(self.num_agents): save_dict = {'actor_params': self.maddpg.ddpg_agents[i].actor.state_dict(), 'actor_optim_params': self.maddpg.ddpg_agents[i].actor_optimizer.state_dict(), 'critic_params': self.maddpg.ddpg_agents[i].critic.state_dict(), 'critic_optim_params': self.maddpg.ddpg_agents[i].critic_optimizer.state_dict()} save_dict_list.append(save_dict) torch.save(save_dict_list, self.ckpt) raw_score_plotter(self.episode_rewards) plotter('Tennis', len(self.episode_rewards), self.avg_rewards, self.threshold) break
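# --- Added illustration (not part of the original source): the `full_state` built in
# the training loop above simply concatenates the per-agent observations into one flat
# vector for the centralized critic. The observation values below are placeholders.
import numpy as np

states = np.array([[0.1, 0.2, 0.3],    # agent 0 observation
                   [0.4, 0.5, 0.6]])   # agent 1 observation
full_state = np.concatenate(states)     # -> array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])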
class Agent(): def __init__(self, state_size, action_size, batch_size=128, gamma=0.99, mean_lambda=1e-3, std_lambda=1e-3, z_lambda=0.0): self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.gamma = gamma self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size) self.mean_lambda = mean_lambda self.std_lambda = std_lambda self.z_lambda = z_lambda self.current_value = Value(state_size).to(device) self.target_value = Value(state_size).to(device) self.softQ = soft_Q(state_size, action_size) self.policy = Policy(state_size, action_size) self.value_optimizer = optim.Adam(self.current_value.parameters(), lr=3e-4) self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4) self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4) def act(self, state): #state = torch.from_numpy(np.asarray(state)).float().to(device) action = self.policy.act(state) if self.memory.__len__() > self.batch_size: self.update() return action def add_to_memory(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) def update(self): state, action, reward, next_state, done = self.memory.sample() expected_soft_q_value = self.softQ.forward(state, action) expected_value = self.current_value.forward(state) new_action, log_prob, z, mean, log_std = self.policy.evaluate(state) target_value = self.target_value.forward(next_state) next_soft_q_value = reward + self.gamma * target_value * (1 - done) q_val_mse = F.mse_loss(expected_soft_q_value, next_soft_q_value.detach()) expected_new_q_val = self.softQ.forward(state, new_action) next_value = expected_new_q_val - log_prob val_loss = F.mse_loss(expected_value, next_value.detach()) log_prob_target = expected_new_q_val - expected_value policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean() mean_loss = self.mean_lambda * mean.pow(2).mean() std_loss = self.std_lambda * log_std.pow(2).mean() z_loss = self.z_lambda * z.pow(2).sum(1).mean() policy_loss += mean_loss + std_loss + z_loss self.soft_q_optimizer.zero_grad() q_val_mse.backward() self.soft_q_optimizer.step() self.value_optimizer.zero_grad() val_loss.backward() self.value_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() self.soft_update(self.current_value, self.target_value, TAU) def soft_update(self, local_model, target_model, TRANSFER_RATE): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(TRANSFER_RATE * local_param.data + (1.0 - TRANSFER_RATE) * target_param.data)
class MADDPG(): def __init__(self, state_size, action_size, n_agents, seed): self.state_size = state_size self.action_size = action_size self.n_agents = n_agents self.seed = random.seed(seed) # Actor-Critic agents self.ActorCriticAgents = [ Agent(state_size, action_size, n_agents, seed) for _ in range(n_agents) ] # Replay memory self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, seed) def OUNoise_reset(self): for agent in self.ActorCriticAgents: agent.exploration_noise.reset() def act(self, state): actions = [] for i, agent in enumerate(self.ActorCriticAgents): agent_action = agent.act(state[i]) actions.append(agent_action[0]) return np.stack(actions, axis=0) def step(self, ep, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) if len(self.memory) > BATCH_SIZE: for i in range(self.n_agents): self.learn(i) def learn(self, agent_index): states, actions, rewards, next_states, dones = self.memory.sample() target_next_actions = torch.from_numpy( np.zeros(shape=actions.shape)).float().to(device) for idx, agent in enumerate(self.ActorCriticAgents): current_states = states[:, idx] target_next_actions[:, idx, :] = agent.actor_target(current_states) target_next_actions = torch.reshape(target_next_actions, shape=(BATCH_SIZE, -1)) current_agent_states = states[:, agent_index, :] current_agent_actions = actions[:, agent_index, :] current_agent_rewards = torch.reshape(rewards[:, agent_index], shape=(BATCH_SIZE, 1)) current_agent_dones = torch.reshape(dones[:, agent_index], shape=(BATCH_SIZE, 1)) action_preds = actions.clone() action_preds[:, agent_index, :] = self.ActorCriticAgents[ agent_index].actor_local(current_agent_states) action_preds = torch.reshape(action_preds, shape=(BATCH_SIZE, -1)) self.ActorCriticAgents[agent_index].update( states, current_agent_states, actions, current_agent_actions, target_next_actions, rewards, current_agent_rewards, next_states, dones, current_agent_dones, action_preds) def save_checkpoint(self): for i in range(self.n_agents): torch.save(self.ActorCriticAgents[i].actor_local.state_dict(), f'actor_checkpoint{i}.pth') torch.save(self.ActorCriticAgents[i].critic_local.state_dict(), f'critic_checkpoint{i}.pth')
class Agent(): def __init__(self, state_space, action_space, memory_size=1000000, batch_size=32, seed=0, q_size=51): self.state_space = state_space self.action_space = action_space self.memory_size = memory_size self.batch_size = batch_size self.seed = seed self.q_size = q_size self.current_model = QDQN(self.state_space, self.action_space, n_quantiles=self.q_size).to(device) self.target_model = QDQN(self.state_space, self.action_space, n_quantiles=self.q_size).to(device) self.optimizer = Adam(self.current_model.parameters(), lr=LR) self.memory = ReplayBuffer(self.action_space, self.memory_size, self.batch_size, self.seed) self.update_every = 0 self.tau = (torch.Tensor( (2 * np.arange(self.current_model.n_quantiles) + 1) / (2.0 * self.current_model.n_quantiles)).view(1, -1)).to(device) def soft_update(self, local_model, target_model, TRANSFER_RATE): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(TRANSFER_RATE * local_param.data + (1.0 - TRANSFER_RATE) * target_param.data) def act(self, state, epsilon): if random.random() <= epsilon: action = random.choice(np.arange(self.action_space)) else: action = self.current_model.act(state).cpu().numpy() #action = self.current_model.act(state, epsilon).cpu().numpy() return action def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.update_every += 1 if self.update_every % UPDATE_FREQUENCY == 0: if len(self.memory) >= self.batch_size: experience = self.memory.sample() self.learn(experience, GAMMA) def learn(self, experience, gamma): sampled_state, sampled_action, sampled_reward, sampled_next_state, sampled_done = experience #print(self.current_model(sampled_state).shape) #print(self.current_model(sampled_state)[0:self.batch_size, 0: self.action_space]) #print(self.current_model(sampled_state)) #print(self.current_model(sampled_state).shape) #print(sampled_action.shape) #print(sampled_action.expand(self.batch_size, self.q_size)) #print(sampled_action.unsqueeze(1).expand(self.batch_size, 1, self.q_size).shape) action = sampled_action.unsqueeze(1).expand(self.batch_size, 1, self.q_size) #print(self.current_model(sampled_state)) #print(self.current_model(sampled_state).gather(1, action).squeeze(1)) theta = self.current_model(sampled_state).gather(1, action).squeeze(1) #theta = self.current_model(sampled_state).mean(2) z_next = self.target_model(sampled_next_state).detach() #print(z_next) #print(z_next.shape) z_next_max = z_next[np.arange(self.batch_size), z_next.mean(2).max(1)[1]] #print(z_next_max) Ttheta = sampled_reward + GAMMA * (1 - sampled_done) * z_next_max #print(Ttheta) #print(Ttheta.shape) #print(theta.shape) diff = Ttheta.t().unsqueeze(-1) - theta loss = self.huber(diff) * (self.tau - (diff.detach() < 0).float()).abs() loss = loss.mean() self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.current_model, self.target_model, TRANSFER_RATE) def huber(self, x, k=1.0): return torch.where(x.abs() < k, 0.5 * x.pow(2), k * (x.abs() - 0.5 * k))
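# --- Added note (not part of the original source): the `self.tau` buffer built in the
# constructor above holds the quantile midpoints (2i + 1) / (2N) that the quantile
# Huber loss compares against. For example, with four quantiles:
import numpy as np

n_quantiles = 4
tau_midpoints = (2 * np.arange(n_quantiles) + 1) / (2.0 * n_quantiles)
# -> array([0.125, 0.375, 0.625, 0.875])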
def main(): # define arguments parser = argparse.ArgumentParser() parser.add_argument("--render", action="store_true", help="Render the state") parser.add_argument("--render_interval", type=int, default=10, help="Number of rollouts to skip before rendering") parser.add_argument("--num_rollouts", type=int, default=-1, help="Number of max rollouts") parser.add_argument("--logfile", type=str, help="Indicate where to save rollout data") parser.add_argument( "--load_params", type=str, help="Load previously learned parameters from [LOAD_PARAMS]") parser.add_argument("--save_params", type=str, help="Save learned parameters to [SAVE_PARAMS]") args = parser.parse_args() signal.signal(signal.SIGINT, stopsigCallback) global stopsig # create the basketball environment env = BasketballVelocityEnv(fps=60.0, timeInterval=0.1, goal=[0, 5, 0], initialLengths=np.array([0, 0, 1, 1, 0, 0, 0]), initialAngles=np.array([0, 45, 0, 0, 0, 0, 0])) # create space stateSpace = ContinuousSpace(ranges=env.state_range()) actionRange = env.action_range() actionSpace = DiscreteSpace( intervals=[15 for i in range(2)] + [1], ranges=[actionRange[1], actionRange[2], actionRange[7]]) processor = JointProcessor(actionSpace) # create the model and policy functions modelFn = MxFullyConnected(sizes=[stateSpace.n + actionSpace.n, 64, 32, 1], alpha=0.001, use_gpu=True) if args.load_params: print("loading params...") modelFn.load_params(args.load_params) softmax = lambda s: np.exp(s) / np.sum(np.exp(s)) policyFn = EpsilonGreedyPolicy( epsilon=0.5, getActionsFn=lambda state: actionSpace.sample(1024), distributionFn=lambda qstate: softmax(modelFn(qstate))) dataset = ReplayBuffer() if args.logfile: log = open(args.logfile, "a") rollout = 0 while args.num_rollouts == -1 or rollout < args.num_rollouts: print("Iteration:", rollout) state = env.reset() reward = 0 done = False steps = 0 while not done: if stopsig: break action = policyFn(state) nextState, reward, done, info = env.step( createAction(processor.process_env_action(action))) dataset.append(state, action, reward, nextState) state = nextState steps += 1 if args.render and rollout % args.render_interval == 0: env.render() if stopsig: break dataset.reset() # push trajectory into the dataset buffer modelFn.fit(processor.process_Q(dataset.sample(1024)), num_epochs=10) print("Reward:", reward if (reward >= 0.00001) else 0, "with Error:", modelFn.score(), "with steps:", steps) if args.logfile: log.write("[" + str(rollout) + ", " + str(reward) + ", " + str(modelFn.score()) + "]\n") rollout += 1 if rollout % 100 == 0: policyFn.epsilon *= 0.95 print("Epsilon is now:", policyFn.epsilon) if args.logfile: log.close() if args.save_params: print("saving params...") modelFn.save_params(args.save_params)
class DQN: def __init__(self, n_actions=100, gamma=0.99, epsilon_start=0.95, epsilon_end=0.05, epsilon_decay=500, memory_capacity=1000, policy_lr=0.01, batch_size=64, device="cuda", path="D:/unity2017/water/ai/saved_model/checkpoint1.pth", pretrained=False): self.path = path self.device = device # device, e.g. cpu or gpu self.gamma = gamma # discount factor for rewards self.n_actions = n_actions # parameters for the e-greedy policy self.actions_count = 0 # counter used for epsilon decay self.epsilon = 0 self.epsilon_start = epsilon_start self.epsilon_end = epsilon_end self.epsilon_decay = epsilon_decay self.batch_size = batch_size self.policy_net = resnet50() num_ftrs = self.policy_net.fc.in_features self.policy_net.fc = nn.Linear(num_ftrs, self.n_actions) self.policy_net.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False) self.policy_net.to(self.device) if pretrained: self.policy_net.load_state_dict(torch.load(self.path)) self.target_net = resnet50() self.target_net.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False) self.target_net.fc = nn.Linear(num_ftrs, self.n_actions) self.target_net.to(self.device) # target_net starts as an exact copy of policy_net's parameters self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # disable BatchNormalization and Dropout (evaluation mode) # note the difference between parameters() and state_dict(); the former has requires_grad=True self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) self.loss = 0 self.memory = ReplayBuffer(memory_capacity) self.pretrained = pretrained def choose_action(self, state, train=True): '''Choose an action. ''' if train: self.epsilon = self.epsilon_end + ( self.epsilon_start - self.epsilon_end) * math.exp( -1. * self.actions_count / self.epsilon_decay) self.actions_count += 1 #if self.pretrained: # self.epsilon = self.epsilon_end if random.random() > self.epsilon: with torch.no_grad(): # convert the state to a tensor before feeding the network; its elements are originally float64 # note: torch.tensor(state).unsqueeze(0) is equivalent to torch.tensor([state]) # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>) state = state.unsqueeze(0).to(self.device) q_value = self.policy_net(state) action = q_value.max(1)[1].item() else: action = random.randint(0, self.n_actions - 1) return action else: with torch.no_grad(): # do not track gradients # convert the state to a tensor before feeding the network; its elements are originally float64 # note: torch.tensor(state).unsqueeze(0) is equivalent to torch.tensor([state]) state = torch.tensor( state, device='cpu', dtype=torch.float32 ) # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>) q_value = self.target_net(state) action = q_value.max(1)[1].item() return action def update(self): if len(self.memory) < self.batch_size: return # sample a random batch of transitions from memory state_batch_, action_batch, reward_batch, next_state_batch_, done_batch = self.memory.sample( self.batch_size) state_batch = torch.ones((self.batch_size, 1, 150, 6), device=self.device, dtype=torch.float) for i in range(self.batch_size): state_batch[i] = state_batch_[i] next_state_batch = torch.ones((self.batch_size, 1, 150, 6), device=self.device, dtype=torch.float) for i in range(self.batch_size): next_state_batch[i] = next_state_batch_[i] '''Convert to tensors, e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]])''' # state_batch = torch.tensor(state_batch, device=self.device,dtype=torch.float) action_batch = torch.tensor(action_batch, device=self.device).unsqueeze( 1) # e.g. tensor([[1],...,[0]]) reward_batch = torch.tensor( reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1]) # next_state_batch = torch.tensor( # next_state_batch, device=self.device, dtype=torch.float) done_batch =
torch.tensor(np.float32(done_batch), device=self.device).unsqueeze( 1) # convert bool to float, then to a tensor '''Compute Q(s_t, a) for the current (s_t, a) pairs''' '''torch.gather: for a = torch.Tensor([[1,2],[3,4]]), a.gather(1, torch.Tensor([[0],[1]])) = torch.Tensor([[1],[3]])''' q_values = self.policy_net(state_batch).gather( dim=1, index=action_batch) # equivalent to self.forward # compute V(s_{t+1}) for all next states, i.e. the max target_net Q-value at each next state next_state_values = self.target_net(next_state_batch).max( 1)[0].detach() # e.g. tensor([ 0.0060, -0.0171,...,]) # compute expected_q_values; for terminal states (done = 1) the expected Q-value is just the reward expected_q_values = reward_batch + self.gamma * \ next_state_values * (1 - done_batch.squeeze(1)) # self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1)) # Huber loss alternative self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # mean squared error loss # optimize the model self.optimizer.zero_grad( ) # zero_grad clears the old gradients from the last step # loss.backward() computes the gradient of the loss w.r.t. all parameters that require gradients self.loss.backward() for param in self.policy_net.parameters(): # clip to prevent exploding gradients param.grad.data.clamp_(-1, 1) self.optimizer.step() # update the model def save_model(self, path): torch.save(self.target_net.state_dict(), path) def load_model(self, path): self.target_net.load_state_dict(torch.load(path))
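# --- Added shape check (not part of the original source): next_state_values above has
# shape [batch] while done_batch was unsqueezed to [batch, 1]; mixing the two
# broadcasts to [batch, batch] and silently corrupts the target, which is why the
# done mask is squeezed back to [batch] before use.
import torch

batch = 4
next_state_values = torch.rand(batch)              # shape [batch]
done = torch.randint(0, 2, (batch, 1)).float()     # shape [batch, 1]
bad = next_state_values * (1 - done)               # shape [batch, batch] -- not what we want
good = next_state_values * (1 - done.squeeze(1))   # shape [batch]
assert bad.shape == (batch, batch) and good.shape == (batch,)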
class DDPG(): """Reinforcement Learning agent , learning using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.08 self.exploration_sigma = 0.15 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.95 # discount factor 0.99 self.tau = 0.001 # for soft update of target parameters 0.01 # Score tracker and learning parameters self.total_reward = None self.count = 0 self.score = 0 self.best_score = -np.inf self.last_state = None def reset_episode(self): self.total_reward = None self.count = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): if self.total_reward: self.total_reward += reward else: self.total_reward = reward self.count += 1 # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" states = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(states)[0] # add some noise for exploration return list(action + self.noise.sample()) def learn(self, experiences): """Update policy and value parameters using given batch of reward tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted actions of next-state and Q values from target models actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) # track best score self.score = self.total_reward / float( self.count) if self.count else -np.inf if self.best_score < self.score: self.best_score = self.score def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class Christophers_Agent(): def __init__(self, task): # Task (environment) information self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.action_range = self.action_high - self.action_low self.w = np.random.normal( size=( self.state_size, self.action_size ), # weights for simple linear policy: state_space x action_space scale=(self.action_range / (2 * self.state_size) )) # start producing actions in a decent range self.actor = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.critic = Critic(self.state_size, self.action_size) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.critic_target = Critic(self.state_size, self.action_size) self.gamma = 0.95 self.tau = 0.001 self.best_w = None self.best_score = -np.inf self.exploration_mu = 0.5 self.exploration_theta = 0.2 self.exploration_sigma = 0.4 self.noise = Noise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.buffer_size = 100000 self.batch_size = 32 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) self.best_score = -np.inf self.num_steps = 0 # Episode variables self.reset_episode() def reset_episode(self): if self.get_score() > self.best_score: self.best_score = self.get_score() self.total_reward = 0.0 self.num_steps = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): self.total_reward += reward self.num_steps += 1 self.memory.add(self.last_state, action, reward, next_state, done) if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.last_state = next_state def act(self, state): state = np.reshape(state, [-1, self.state_size]) action = self.actor.model.predict(state)[0] action = list(action + self.noise.sample()) # add some noise for exploration return action def get_score(self): return -np.inf if self.num_steps == 0 else self.total_reward / self.num_steps def learn(self, experiences): states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) done = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) Q_targets = rewards + self.gamma * Q_targets_next * (1 - done) self.critic.model.train_on_batch(x=[states, actions], y=Q_targets) action_gradients = np.reshape( self.critic.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor.train_fn([states, action_gradients, 1]) self.soft_update(self.critic.model, self.critic_target.model) self.soft_update(self.actor.model, self.actor_target.model) def soft_update(self, local_model, target_model): local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights) new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class Agent(): def __init__(self, env, memory_size=1000000, batch=128, sigma=0.2, noise_clip=0.5, gamma=0.99, update_frequency=2): self.states = env.observation_space self.state_size = env.observation_space.shape[0] self.actions = env.action_space self.action_size = env.action_space.shape[0] self.sigma = sigma self.noise_clip = noise_clip self.gamma = gamma self.update_frequency = update_frequency self.actor = Actor(self.state_size, self.action_size).to(device) self.critic0 = Critic(self.state_size, self.action_size).to(device) self.critic1 = Critic(self.state_size, self.action_size).to(device) self.target_actor = Actor(self.state_size, self.action_size).to(device) self.target_critic0 = Critic(self.state_size, self.action_size).to(device) self.target_critic1 = Critic(self.state_size, self.action_size).to(device) self.memory = ReplayBuffer(memory_size, batch) self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR) self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR) self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR) self.soft_update(self.actor, self.target_actor, 1) self.soft_update(self.critic0, self.target_critic0, 1) self.soft_update(self.critic1, self.target_critic1, 1) def act(self, state, step, epsilon=True): state = torch.from_numpy(np.asarray(state)).float().to(device) action = self.actor.forward(state) action = action.detach().cpu().numpy() if epsilon: noise = np.random.normal(0, 0.1, action.shape[0]) action += noise return action def update(self, step): state, action, reward, next_state, done = self.memory.sample() next_state_action = self.target_actor(next_state) noise = Normal(torch.zeros(self.action_size), self.sigma).sample() noise = torch.clamp(noise, -self.noise_clip, self.noise_clip).to(device) next_state_action += noise target_Q0 = self.target_critic0(next_state, next_state_action) target_Q1 = self.target_critic1(next_state, next_state_action) target_Q = torch.min(target_Q0, target_Q1) target_value = reward + self.gamma * target_Q * (1.0 - done) expected_Q0 = self.critic0(state, action) expected_Q1 = self.critic1(state, action) critic_0_loss = F.mse_loss(expected_Q0, target_value.detach()) critic_1_loss = F.mse_loss(expected_Q1, target_value.detach()) self.critic0_optimizer.zero_grad() critic_0_loss.backward() self.critic0_optimizer.step() self.critic1_optimizer.zero_grad() critic_1_loss.backward() self.critic1_optimizer.step() if step % self.update_frequency == 0: actor_loss = self.critic0.forward(state, self.actor.forward(state)) actor_loss = -actor_loss.mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE) self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE) self.soft_update(self.actor, self.target_actor, TRANSFER_RATE) def soft_update(self, local_model, target_model, tao): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tao * local_param.data + (1.0 - tao) * target_param.data) def add_to_memory(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done)
class Agent(): def __init__(self, state_size, action_size, action_sigma=0.1, memory_size=1000000, batch=128, sigma=0.2, noise_clip=0.5, gamma=0.99, update_frequency=2, seed=0): ''' TD3 Agent :param state_size: State Dimension :param action_size: Action dimension :param action_sigma: standard deviation of the noise to be added to the action :param memory_size: :param batch: :param sigma: Standard deviation of the noise to be added to the target function (Chapter 5.3 of TD3 Paper) :param noise_clip: How much noise to allow :param gamma: :param update_frequency: :param seed: ''' self.state_size = state_size self.action_size = action_size self.action_sigma = action_sigma self.sigma = sigma self.noise_clip = noise_clip self.gamma = gamma self.update_frequency = update_frequency self.seed = seed self.actor = Actor(self.state_size, self.action_size).to(device) self.critic0 = Critic(self.state_size, self.action_size).to(device) #second Critic as described in the paper # https: // arxiv.org / pdf / 1802.09477.pdf self.critic1 = Critic(self.state_size, self.action_size).to(device) self.target_actor = Actor(self.state_size, self.action_size).to(device) self.target_critic0 = Critic(self.state_size, self.action_size).to(device) # second Critic as described in the paper # https: // arxiv.org / pdf / 1802.09477.pdf self.target_critic1 = Critic(self.state_size, self.action_size).to(device) self.memory = ReplayBuffer(memory_size, batch, seed=seed) self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR) self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR) self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR) self.soft_update(self.actor, self.target_actor, 1) self.soft_update(self.critic0, self.target_critic0, 1) self.soft_update(self.critic1, self.target_critic1, 1) def act(self, state, epsilon=True): state = torch.from_numpy(np.asarray(state)).float().to(device) self.actor.eval() with torch.no_grad(): action = self.actor.forward(state).cpu().data.numpy() self.actor.train() if epsilon: #if we want to inject some noise noise = np.random.normal(0, self.action_sigma, action.shape[0]) action += noise return action def update(self, step): ''' #https: // arxiv.org / pdf / 1802.09477.pdf the function is very similar to typical DDPG algorithm, except for 1) we have 2 critics to update 2) we take the min of the 2 values critics output 3) Has modified Target network with noise injected into it (Chapter 5.3 of the paper) 4) We delay updating the actor by certain steps :param step: how often to update the actor :return: ''' state, action, reward, next_state, done = self.memory.sample() # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models next_state_action = self.target_actor(next_state) #sample a random noise noise = Normal(torch.zeros(self.action_size), self.sigma).sample() noise = torch.clamp(noise, -self.noise_clip, self.noise_clip).to(device) next_state_action += noise target_Q0 = self.target_critic0(next_state, next_state_action) target_Q1 = self.target_critic1(next_state, next_state_action) target_Q = torch.min(target_Q0, target_Q1) target_value = reward + self.gamma * target_Q * (1.0 - done) expected_Q0 = self.critic0(state, action) expected_Q1 = self.critic1(state, action) critic_0_loss = F.mse_loss(expected_Q0, target_value.detach()) critic_1_loss = F.mse_loss(expected_Q1, target_value.detach()) self.critic0_optimizer.zero_grad() critic_0_loss.backward() 
self.critic0_optimizer.step() self.critic1_optimizer.zero_grad() critic_1_loss.backward() self.critic1_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss #as mentioned in the paper, we delay updating the actor network. if step % self.update_frequency == 0: actor_loss = self.critic0.forward(state, self.actor.forward(state)) actor_loss = -actor_loss.mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ------------------- # self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE) self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE) self.soft_update(self.actor, self.target_actor, TRANSFER_RATE) def soft_update(self, local_model, target_model, tao): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tao * local_param.data + (1.0 - tao) * target_param.data) def add_to_memory(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done)
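# --- Added sketch (not part of the original source) of the target-policy smoothing
# used in update() above: clipped Gaussian noise is added to the target action before
# the twin target critics evaluate it (TD3). All numbers here are placeholders.
import torch

sigma, noise_clip = 0.2, 0.5
target_action = torch.tensor([[0.4, -0.9]])
noise = torch.clamp(torch.randn_like(target_action) * sigma, -noise_clip, noise_clip)
smoothed_action = torch.clamp(target_action + noise, -1.0, 1.0)  # keep inside action bounds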
class DQN: def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01, batch_size=128, device="cpu"): self.actions_count = 0 self.n_actions = n_actions self.device = device self.gamma = gamma self.epsilon = 0 self.epsilon_start = epsilon_start self.epsilon_end = epsilon_end self.epsilon_decay = epsilon_decay self.batch_size = batch_size self.policy_net = FCN(n_states, n_actions).to(self.device) self.target_net = FCN(n_states, n_actions).to(self.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # disable BatchNormalization and Dropout (evaluation mode) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) self.loss = 0 self.memory = ReplayBuffer(memory_capacity) def select_action(self, state): '''Select an action. Args: state [array]: the current state Returns: [int]: the chosen action ''' self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ math.exp(-1. * self.actions_count / self.epsilon_decay) self.actions_count += 1 if random.random() > self.epsilon: with torch.no_grad(): state = torch.tensor( [state], device=self.device, dtype=torch.float32 ) # convert to a tensor before feeding the network; state elements are originally float64; note torch.tensor(state).unsqueeze(0) is equivalent to torch.tensor([state]) q_value = self.policy_net( state ) # tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>) action = q_value.max(1)[1].item() else: action = random.randrange(self.n_actions) return action def update(self): if len(self.memory) < self.batch_size: return state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( self.batch_size) state_batch = torch.tensor( state_batch, device=self.device, dtype=torch.float ) # e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]]) action_batch = torch.tensor(action_batch, device=self.device).unsqueeze( 1) # e.g. tensor([[1],...,[0]]) reward_batch = torch.tensor( reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1]) next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float) done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze( 1) # convert bool to float, then to a tensor # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net q_values = self.policy_net(state_batch).gather( 1, action_batch) # equivalent to self.forward # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = self.target_net(next_state_batch).max( 1)[0].detach() # tensor([ 0.0060, -0.0171,...,]) # Compute the expected Q values expected_q_values = reward_batch + self.gamma * next_state_values * ( 1 - done_batch.squeeze(1)) # Compute the MSE loss # self.loss = nn.MSELoss(q_values, expected_q_values.unsqueeze(1)) self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # Optimize the model self.optimizer.zero_grad( ) # zero_grad clears old gradients from the last step (otherwise you'd just accumulate the gradients from all loss.backward() calls). self.loss.backward( ) # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
for param in self.policy_net.parameters(): # clip gradients to prevent them from exploding param.grad.data.clamp_(-1, 1) self.optimizer.step( ) # causes the optimizer to take a step based on the gradients of the parameters.
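# --- Added note (not part of the original source): the loop above clamps every gradient
# element to [-1, 1]; an alternative often used with DQN-style training is to clip the
# global gradient norm instead. A tiny self-contained comparison on a stand-in network:
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
loss = net(torch.randn(8, 4)).pow(2).mean()
loss.backward()
for param in net.parameters():
    param.grad.data.clamp_(-1, 1)                        # element-wise clamp (as above)
# torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)  # global-norm alternative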
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_episodes, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed num_episodes (int): number of training epochs """ self.state_size = state_size self.action_size = action_size self.seed = seed # Q-Network self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.anneal_beta = (1. - BETA) / num_episodes self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA, BETA) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.t_learning_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def update_weights(self): self.memory.anneal_beta(self.anneal_beta) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, idxs, weights = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # update priorities updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy() self.memory.update_priorities(idxs, updates) # Compute loss loss = F.l1_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() (loss * weights).mean().backward() self.optimizer.step() # ------------------- update target network ------------------- # self.t_learning_step += 1 if self.t_learning_step % UPDATE_TARGET_STEPS == 0: self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): # PyTorch copy: destination.data.copy(source.data) target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
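# --- Added worked example (not part of the original source) of the soft update above:
# with tau = 0.01, each target parameter moves 1% of the way toward its local counterpart.
import torch

tau = 0.01
local_param = torch.tensor([1.0])
target_param = torch.tensor([0.0])
target_param = tau * local_param + (1.0 - tau) * target_param  # -> tensor([0.0100])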
def main(): # define arguments parser = argparse.ArgumentParser() parser.add_argument("--render", action="store_true", help="Render the state") parser.add_argument("--render_interval", type=int, default=10, help="Number of rollouts to skip before rendering") parser.add_argument("--num_rollouts", type=int, default=1000, help="Number of max rollouts") parser.add_argument("--logfile", type=str, help="Indicate where to save rollout data") parser.add_argument( "--load_params", type=str, help="Load previously learned parameters from [LOAD_PARAMS]") parser.add_argument("--save_params", type=str, help="Save learned parameters to [SAVE_PARAMS]") parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor") parser.add_argument("--test", action="store_true", help="Test the params") args = parser.parse_args() signal.signal(signal.SIGINT, stopsigCallback) global stopsig # create the basketball environment env = BasketballVelocityEnv(fps=60.0, timeInterval=0.1, goal=[0, 5, 0], initialLengths=np.array([0, 0, 1, 1, 1, 0, 1]), initialAngles=np.array( [0, 45, -20, -20, 0, -20, 0])) # create space stateSpace = ContinuousSpace(ranges=env.state_range()) actionSpace = ContinuousSpace(ranges=env.action_range()) # create the model and policy functions modelFn = PoWERDistribution(stateSpace.n, actionSpace.n, sigma=5.0 if not args.test else 0) if args.load_params: print("Loading params...") modelFn.load_params(args.load_params) replayBuffer = ReplayBuffer(1024) if args.logfile: log = open(args.logfile, "a") rollout = 0 while args.num_rollouts == -1 or rollout < args.num_rollouts: print("Iteration:", rollout) state = env.reset() reward = 0 done = False steps = 0 while not done and steps < 5: if stopsig: break action, eps = modelFn.predict( state, replayBuffer.sample(gamma=args.gamma)) if steps == 4: action[-1] = 1.0 nextState, reward, done, info = env.step(action) replayBuffer.append(state, action, reward, nextState=nextState, info={"eps": eps}) state = nextState steps += 1 if args.render and rollout % args.render_interval == 0: env.render() if stopsig: break # no importance sampling, implement it when we have small datasets replayBuffer.reset() dataset = replayBuffer.sample(gamma=args.gamma) modelFn.fit(dataset) avgR = np.sum(dataset["rewards"]) / float(len(dataset["rewards"])) avgQ = np.sum(dataset["values"]) / float(len(dataset["values"])) print("Rollouts:", rollout, "Error:", modelFn.score(), "Average Q", avgQ, "Average R", avgR) if args.logfile: log.write("[" + str(rollout) + ", " + str(modelFn.score()) + ", " + str(avgQ) + ", " + str(avgR) + "]\n") rollout += 1 if args.logfile: log.close() if args.save_params: print("Saving params...") modelFn.save_params(args.save_params)
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, apply_dueling=False, apply_double=False): """ Initialize a Unity agent object. :param state_size: (int) dimension of each state :param action_size: (int) dimension of each action :param seed: (int) random seed """ assert(self._true_xor(apply_dueling, apply_double), "Choose one between dueling networks or DDQN") self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.apply_dueling = apply_dueling self.apply_double = apply_double # Q-Network self.q_net_target = QNetwork(state_size, action_size, seed, apply_dueling=apply_dueling).to(device) self.q_net_local = QNetwork(state_size, action_size, seed, apply_dueling=apply_dueling).to(device) self.opt = optim.Adam(self.q_net_local.parameters(), lr=LR) # Replay memory self.memory_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 @staticmethod def _true_xor(*args): return sum(args) == 1 def step(self, state, action, reward, next_state, done): """ Save experience in replay memory buffer for future experience replay :param state: The current state of the agent :param action: The action that the agent has taken in given state :param reward: The reward associated with the state action combination :param next_state: The resulting state after taking action in previous state :param done: (bool) Has the terminal state been reached? :return: None """ self.memory_buffer.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_CYCLE if self.t_step == 0: # If enough samples are available in memory, get random subset and learn from it if BATCH_SIZE < len(self.memory_buffer): experiences = self.memory_buffer.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """ Returns actions for given state as per current policy. :param state: (array_like) current state :param eps: (float) epsilon, for epsilon-greedy action selection :return: (int) The index of the action to be taken by the agent """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.q_net_local.eval() with torch.no_grad(): # Do not perform a forward pass in this context action_values = self.q_net_local(state) self.q_net_local.train() # Epsilon-greedy action selection greed_p = random.random() return np.argmax(action_values.cpu().data.numpy()) if greed_p > eps else \ random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples. :param experiences: (Tuple[torch.Tensor]) tuple of (s, a, r, s', done) tuples :param gamma: (float) discount factor :return: """ states, actions, rewards, next_states, done_signals = experiences if not self.apply_double: # Get max predicted Q values for the next state of the target model. Q_targets_next = self.q_net_target(next_states).detach().max(1)[0].unsqueeze(1) else: # In the case of Double-DQN, evaluate the best selected action with the target model's set of parameters. 
indices = torch.argmax(self.q_net_local(next_states).detach(), 1) # The selected next best action's indices # Evaluate that action by comparing with the local network's set of parameters Q_targets_next = self.q_net_target(next_states).detach().gather(1, indices.unsqueeze(1)) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - done_signals)) # Get expected Q values from local model (being trained) # x.gather(1, actions) returns a tensor which results from the concatenation of the input tensor values along # the given dimensions (here the dim indexes are the taken actions indices) Q_expected = self.q_net_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.opt.zero_grad() loss.backward() self.opt.step() # perform network update self.soft_update(self.q_net_local, self.q_net_target, TAU) @staticmethod def soft_update(local_model, target_model, tau): """ Soft update model parameters, given by the function: θ_target = τ*θ_local + (1 - τ)*θ_target :param local_model: (PyTorch model) weights will be copied from :param target_model: (PyTorch model) weights will be copied to :param tau: (float) interpolation parameter :return: """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
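# --- Illustration of the double-DQN target computation used in Agent.learn above ---
# A tiny, self-contained sketch (random linear layers, assumed batch and layer sizes;
# none of these tensors come from the original project) showing how argmax on the
# local network selects the next action and gather evaluates it with the target
# network, compared with the vanilla DQN max-over-target estimate.
import torch
import torch.nn as nn

torch.manual_seed(0)
batch_size, state_dim, n_actions = 4, 8, 3
local_net = nn.Linear(state_dim, n_actions)   # stand-in for q_net_local
target_net = nn.Linear(state_dim, n_actions)  # stand-in for q_net_target
next_states = torch.randn(batch_size, state_dim)

# Vanilla DQN target: max over the target network's own estimates
q_next_dqn = target_net(next_states).detach().max(1)[0].unsqueeze(1)                    # shape [4, 1]

# Double DQN target: local net picks the action, target net scores it
best_actions = torch.argmax(local_net(next_states).detach(), 1)                         # shape [4]
q_next_double = target_net(next_states).detach().gather(1, best_actions.unsqueeze(1))   # shape [4, 1]

print(q_next_dqn.shape, q_next_double.shape)  # torch.Size([4, 1]) torch.Size([4, 1])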