import numpy as np
import tensorflow as tf


class DDPG:
    def __init__(self, action_dim, action_bound, tau, lr_a, lr_c,
                 state_dim, gamma, batch_size):
        self.target = tf.placeholder(tf.float32, [None, 1], 'critic_target')
        self.s = tf.placeholder(tf.float32, [None, state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, state_dim], 'next_state')

        self.memory = ReplayBuffer(max_size=10000)
        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        self.batch_size = batch_size
        self.gamma = gamma

        self.sess = tf.Session()
        self.actor = Actor(self.sess, self.s, self.s_, action_dim,
                           action_bound, tau, lr_a, f1_units=300)
        self.critic = Critic(self.sess, lr_c, self.s, self.s_,
                             self.actor.a, self.actor.a_, self.target,
                             tau, gamma, state_dim, action_dim, f1_units=300)
        # Wire the critic's action gradient into the actor's policy update.
        self.actor.add_grad_to_graph(self.critic.a_g)
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        # Deterministic policy output plus Ornstein-Uhlenbeck exploration noise.
        a = self.actor.choose_action(s)
        a = a + self.noise()
        return a[0]

    def update_target_networks(self):
        self.sess.run([self.actor.replace, self.critic.replace])

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        bs, ba, br, bs_, bdone = self.memory.sample(self.batch_size)
        q_ = self.sess.run(self.critic.q_, {self.s_: bs_})
        br = br[:, np.newaxis]
        # Bootstrap only on non-terminal transitions.
        target_critic = br + self.gamma * q_ * (1 - bdone[:, np.newaxis])
        self.critic.learn(bs, ba, target_critic)
        self.actor.learn(bs)
        self.update_target_networks()
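A minimal interaction loop for the DDPG class above might look as follows; the Pendulum-v0 environment, the hyperparameter values, and the warm-up guard are illustrative assumptions, not part of the original listing.

import gym

env = gym.make('Pendulum-v0')  # assumed low-dimensional continuous-control task
agent = DDPG(action_dim=env.action_space.shape[0],
             action_bound=env.action_space.high,
             tau=0.01, lr_a=1e-4, lr_c=1e-3,
             state_dim=env.observation_space.shape[0],
             gamma=0.99, batch_size=64)

for episode in range(200):
    s, done = env.reset(), False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)
        agent.store(s, a, r, s_, done)
        if episode > 0:  # crude warm-up so the buffer can serve a full batch
            agent.learn()
        s = s_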
import numpy as np
import tensorflow as tf


class Agent:
    def __init__(self, state_dim, action_dim, explore_noise="Gaussian", *args, **kwargs):
        # Hyperparameters
        self.lr = 1e-4
        self.gamma = 0.99
        self.tau = 0.005
        self.bs = 512            # batch size
        self.bfs = 1000000       # replay buffer size
        self.d = 2               # delayed policy-update interval
        self.explore_noise = explore_noise
        self.explore_noise_size = 0.1  # or 0.01
        self.process_noise_generator = ProcessNoise(action_dim)
        self.criticreg_noise_size = 0.2  # target-policy smoothing noise std
        self.criticreg_noise_clip = 0.5
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.actor_nn_dim = [256, 256, self.action_dim]
        self.critic_nn_dim = [256, 256, 1]

        # Placeholders
        self.state1_place = tf.placeholder(tf.float32, [None, self.state_dim])
        self.action_place = tf.placeholder(tf.float32, [None, self.action_dim])
        self.reward_place = tf.placeholder(tf.float32, [None, 1])
        self.isdone_place = tf.placeholder(tf.float32, [None, 1])
        self.state2_place = tf.placeholder(tf.float32, [None, self.state_dim])

        # Target-policy smoothing: clipped noise on the target action.
        # NOTE: the noise shape is fixed to the training batch size, so the
        # target graph only supports batches of exactly self.bs.
        with tf.variable_scope("target_actor", reuse=tf.AUTO_REUSE):
            self.Q_next_action = self.actor_nn(self.state2_place)
        self.Q_next_noise = tf.clip_by_value(
            tf.random.normal([self.bs, self.action_dim], 0, self.criticreg_noise_size),
            -self.criticreg_noise_clip, self.criticreg_noise_clip)
        self.Q_next_noisy_action = tf.clip_by_value(
            self.Q_next_action + self.Q_next_noise, -1, 1)

        # Clipped double-Q target.
        with tf.variable_scope("target_critic_1", reuse=tf.AUTO_REUSE):
            self.Q_critic_1 = self.critic_nn(self.state2_place, self.Q_next_noisy_action)
        with tf.variable_scope("target_critic_2", reuse=tf.AUTO_REUSE):
            self.Q_critic_2 = self.critic_nn(self.state2_place, self.Q_next_noisy_action)
        self.Q_critic_min = tf.minimum(self.Q_critic_1, self.Q_critic_2)
        self.Q_y = (self.reward_place
                    + self.gamma * (1 - self.isdone_place) * self.Q_critic_min)

        # Critic losses for both main critics.
        with tf.variable_scope("main_critic_1", reuse=tf.AUTO_REUSE):
            self.Q_Q_1 = self.critic_nn(self.state1_place, self.action_place)
        with tf.variable_scope("main_critic_2", reuse=tf.AUTO_REUSE):
            self.Q_Q_2 = self.critic_nn(self.state1_place, self.action_place)
        self.Q_loss = (tf.reduce_mean((self.Q_Q_1 - self.Q_y) ** 2)
                       + tf.reduce_mean((self.Q_Q_2 - self.Q_y) ** 2))

        # Actor loss: maximize the first main critic's Q-value.
        with tf.variable_scope("main_actor", reuse=tf.AUTO_REUSE):
            self.P_this_action = self.actor_nn(self.state1_place)
        with tf.variable_scope("main_critic_1", reuse=tf.AUTO_REUSE):
            self.P_Q_1 = self.critic_nn(self.state1_place, self.P_this_action)
        self.P_loss = -tf.reduce_mean(self.P_Q_1)

        # Deterministic action for interacting with the environment.
        with tf.variable_scope("main_actor", reuse=tf.AUTO_REUSE):
            self.action = self.actor_nn(self.state1_place)

        # Collect variables for the optimizers and the target updates.
        all_variables = tf.trainable_variables()
        self.main_critic_var = [v for v in all_variables if "main_critic" in v.name]
        self.target_critic_var = [v for v in all_variables if "target_critic" in v.name]
        self.main_actor_var = [v for v in all_variables if "main_actor" in v.name]
        self.target_actor_var = [v for v in all_variables if "target_actor" in v.name]
        assert len(self.main_critic_var) == len(self.target_critic_var)
        assert len(self.main_actor_var) == len(self.target_actor_var)

        self.Q_op = tf.train.AdamOptimizer(self.lr).minimize(
            self.Q_loss, var_list=self.main_critic_var)
        self.P_op = tf.train.AdamOptimizer(self.lr).minimize(
            self.P_loss, var_list=self.main_actor_var)

        # Hard copy at initialization, Polyak averaging afterwards.
        self.T_init = [tf.assign(T, M) for (T, M) in
                       zip(self.target_critic_var + self.target_actor_var,
                           self.main_critic_var + self.main_actor_var)]
        self.T_op = [tf.assign(T, self.tau * M + (1 - self.tau) * T) for (T, M) in
                     zip(self.target_critic_var + self.target_actor_var,
                         self.main_critic_var + self.main_actor_var)]

        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, self.bfs)
        self.step_count = 0
        self.total_step_count = 0
        self.episode_count = 0
        self.train_count = 0

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.T_init)
        self.saver = tf.train.Saver(max_to_keep=1000)

    def actor_nn(self, state, bound=True):
        dim = self.actor_nn_dim
        A = state
        for i in range(len(dim) - 1):
            A = tf.layers.dense(A, units=dim[i], activation=tf.nn.relu)
        action = tf.layers.dense(A, units=dim[-1], activation=tf.nn.tanh)
        return action

    def critic_nn(self, state, action):
        dim = self.critic_nn_dim
        A = tf.concat([state, action], axis=1)
        for i in range(len(dim) - 1):
            A = tf.layers.dense(A, units=dim[i], activation=tf.nn.relu)
        critic = tf.layers.dense(A, units=dim[-1], activation=None)
        return critic

    def get_action(self, state_data, stochastic=True):
        this_action = self.sess.run(self.action,
                                    feed_dict={self.state1_place: state_data})
        if stochastic:
            if self.explore_noise == "Gaussian":
                explore_noise = np.random.normal(
                    0, self.explore_noise_size, [1, self.action_dim])
            elif self.explore_noise == "Process":
                explore_noise = (self.explore_noise_size
                                 * self.process_noise_generator.next())
            else:
                raise NotImplementedError
            this_action = np.clip(this_action + explore_noise, -1, 1)
        return this_action

    def eval_loss(self, bs=None):
        if bs is None:
            bs = self.bs
        this_bs = np.minimum(bs, self.replay_buffer.size)
        this_batch = self.replay_buffer.sample_batch(this_bs)
        feed_dict = {self.state1_place: this_batch["obs1"],
                     self.action_place: this_batch["acts"],
                     self.reward_place: this_batch["rews"],
                     self.isdone_place: this_batch["done"],
                     self.state2_place: this_batch["obs2"]}
        # Evaluate the critic loss on the sampled batch without training.
        return self.sess.run(self.Q_loss, feed_dict=feed_dict)

    def train_iter(self):
        if self.bs <= self.replay_buffer.size:
            this_batch = self.replay_buffer.sample_batch(self.bs)
            feed_dict = {self.state1_place: this_batch["obs1"],
                         self.action_place: this_batch["acts"],
                         self.reward_place: this_batch["rews"],
                         self.isdone_place: this_batch["done"],
                         self.state2_place: this_batch["obs2"]}
            self.sess.run([self.Q_op], feed_dict=feed_dict)
            # Delayed policy and target updates (the "twin delayed" part of TD3).
            if self.total_step_count % self.d == 0:
                self.sess.run([self.P_op], feed_dict=feed_dict)
                self.sess.run(self.T_op)
            self.train_count += 1
        self.total_step_count += 1

    def record(self, this_state, this_action, this_reward, this_done, next_state):
        self.replay_buffer.store(obs=this_state, act=this_action,
                                 rew=this_reward, next_obs=next_state,
                                 done=this_done)
        self.step_count += 1

    def reset_agent(self):
        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, self.bfs)
        self.step_count = 0
        self.total_step_count = 0
        self.train_count = 0
        self.episode_count = 0
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.T_init)

    def reset_episode(self):
        self.step_count = 0
        self.train_count = 0
        self.episode_count += 1
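The TD3 Agent above works in a normalized [-1, 1] action range and trains once per call to train_iter; a hypothetical driver could look like this (the environment, the action rescaling, and the step budget are assumptions):

env = gym.make('Pendulum-v0')  # assumed env; actions rescaled from [-1, 1]
agent = Agent(state_dim=env.observation_space.shape[0],
              action_dim=env.action_space.shape[0])

state = env.reset()
for step in range(100000):
    action = agent.get_action(state[np.newaxis, :], stochastic=True)[0]
    next_state, reward, done, _ = env.step(action * env.action_space.high)
    agent.record(state, action, reward, float(done), next_state)
    agent.train_iter()  # no-op until the buffer holds a full batch
    if done:
        state = env.reset()
        agent.reset_episode()
    else:
        state = next_state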
import numpy as np
import tensorflow as tf


class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params
        self.env = Env(params.game, params.gamma,
                       norm_rewards=None, norm_states=False)
        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        # Critic and target critic
        self.critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                   a_num=self.env.num_actions,
                                   lr=params.lr_c)
        self.target_critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                          a_num=self.env.num_actions,
                                          lr=params.lr_c)
        self._copy_para(self.critic.model, self.target_critic.model)

        # Actor and target actor
        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)
        self._copy_para(self.actor, self.target_actor)

        # EMA with decay 1 - tau implements the Polyak soft update.
        self.ema = tf.train.ExponentialMovingAverage(decay=1.0 - self.parms.tau)

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating.

        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
            j.assign(i)

    def _ema_update(self):
        paras = self.actor.trainable_weights + self.critic.model.trainable_weights
        self.ema.apply(paras)
        for i, j in zip(self.target_actor.trainable_weights
                        + self.target_critic.model.trainable_weights, paras):
            i.assign(self.ema.average(j))

    def _train(self):
        # Sample a batch of transitions.
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Train critic
        with tf.GradientTape() as tape:
            pi_next = self.target_actor(s_next)
            a_next = pi_next.sample()
            q_next = self.target_critic([s_next, a_next])
            y = r + self.parms.gamma * q_next * not_done
            q = self.critic([s, a])
            c_loss = tf.losses.mean_squared_error(y, q)
        c_grads = tape.gradient(c_loss, self.critic.model.trainable_weights)
        self.critic.model.optimizer.apply_gradients(
            zip(c_grads, self.critic.model.trainable_weights))

        # Train actor
        with tf.GradientTape() as tape:
            pi = self.actor(s)
            a = pi.sample()
            q = self.critic([s, a])
            a_loss = -tf.reduce_mean(q)
        a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
        self.actor.optimizer.apply_gradients(
            zip(a_grads, self.actor.trainable_weights))

        self._ema_update()

    def train_step(self):
        # Episode information
        episode_ret = []
        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1
            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)
            # Store
            self.buffer.store((s, a, r, s_next, done))
            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()
            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()
            s = s_next
        return np.mean(episode_ret)
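Worth noting: `ExponentialMovingAverage(decay=1 - tau)` makes `_ema_update` equivalent to the usual Polyak soft update `target <- (1 - tau) * target + tau * main`. A small standalone check of that equivalence (values illustrative):

import tensorflow as tf

tau = 0.005
main = tf.Variable(2.0)
target = tf.Variable(0.0)

ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
ema.apply([main])                 # first apply: the shadow copy starts at main's value
main.assign(4.0)
ema.apply([main])                 # shadow <- (1 - tau) * shadow + tau * main
target.assign(ema.average(main))  # read the shadow back into the target variable

expected = (1.0 - tau) * 2.0 + tau * 4.0
assert abs(float(target) - expected) < 1e-6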
import numpy as np
import tensorflow as tf


class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params
        self.env = Env(params.game, params.gamma,
                       norm_rewards=None, norm_states=False)
        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        # Four critic nets: two online, two targets
        critic_nets = [
            DDPGValueNet(feature_shape=self.env.features_shape,
                         a_num=self.env.num_actions,
                         lr=params.lr_c) for _ in range(4)
        ]
        (self.critic1, self.critic2,
         self.target_critic1, self.target_critic2) = critic_nets

        # Two actor nets
        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)

        # Copy params
        self._copy_para(self.critic1, self.target_critic1)
        self._copy_para(self.critic2, self.target_critic2)
        self._copy_para(self.actor, self.target_actor)

        self.train_step_cnt = 0

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating.

        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
            j.assign(i)

    def _target_soft_update(self, net, target_net):
        """Soft-update the target net with Polyak averaging."""
        for target_param, param in zip(target_net.trainable_weights,
                                       net.trainable_weights):
            # Copy weight values into the target parameters.
            target_param.assign(target_param * (1.0 - self.parms.tau)
                                + param * self.parms.tau)

    def _train(self):
        # Sample a batch of transitions.
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Set the target y with the clipped double-Q trick.
        pi_next = self.target_actor(s_next)
        a_next = pi_next.sample()
        q_next = tf.minimum(self.target_critic1([s_next, a_next]),
                            self.target_critic2([s_next, a_next]))
        y = r + self.parms.gamma * q_next * not_done

        # Train critic1
        with tf.GradientTape() as c1_tape:
            q1 = self.critic1([s, a])
            c1_loss = tf.losses.mean_squared_error(y, q1)
        c1_grads = c1_tape.gradient(c1_loss, self.critic1.trainable_weights)
        self.critic1.optimizer.apply_gradients(
            zip(c1_grads, self.critic1.trainable_weights))

        # Train critic2
        with tf.GradientTape() as c2_tape:
            q2 = self.critic2([s, a])
            c2_loss = tf.losses.mean_squared_error(y, q2)
        c2_grads = c2_tape.gradient(c2_loss, self.critic2.trainable_weights)
        self.critic2.optimizer.apply_gradients(
            zip(c2_grads, self.critic2.trainable_weights))

        # Train the actor on a delayed schedule, then update target params.
        if self.train_step_cnt % self.parms.actor_interval == 0:
            with tf.GradientTape() as a_tape:
                pi = self.actor(s)
                a = pi.sample()
                q = self.critic1([s, a])
                a_loss = -tf.reduce_mean(q)
            a_grads = a_tape.gradient(a_loss, self.actor.trainable_weights)
            self.actor.optimizer.apply_gradients(
                zip(a_grads, self.actor.trainable_weights))

            # Update target params
            self._target_soft_update(self.actor, self.target_actor)
            self._target_soft_update(self.critic1, self.target_critic1)
            self._target_soft_update(self.critic2, self.target_critic2)

    def train_step(self):
        # Episode information
        episode_ret = []
        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1
            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)
            # Store
            self.buffer.store((s, a, r, s_next, done))
            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()
                self.train_step_cnt += 1
            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()
            s = s_next
        return np.mean(episode_ret)
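A hypothetical outer loop driving either Trainer; the Parameters fields are inferred from how `self.parms` is used above, and the constructor call, values, and epoch budget are assumptions:

params = Parameters(game='Pendulum-v0', gamma=0.99, seed=0,
                    replay_size=int(1e6), batch_size=100,
                    lr_a=1e-4, lr_c=1e-3, tau=0.005,
                    start_size=1000, train_step_len=4000,
                    actor_interval=2)
trainer = Trainer(params)
for epoch in range(100):
    mean_ret = trainer.train_step()
    print('epoch {}: mean episode return {:.2f}'.format(epoch, mean_ret))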
import copy

import torch as T
import torch.nn.functional as F


class Agent(object):
    def __init__(self, env, alpha, beta, tau, gamma, state_dim=8, action_dim=2,
                 max_replay_size=1000000, l1_dim=400, l2_dim=300, batch_size=64):
        self.env = env
        self.max_action = float(env.action_space.high[0])
        self.alpha = alpha   # learning rate for actor network
        self.beta = beta     # learning rate for critic network
        self.tau = tau       # polyak averaging parameter
        self.gamma = gamma   # discount factor of reward

        # TD3: delayed actor updates and target-policy smoothing noise.
        self.update_actor_count = 0
        self.update_actor_freq = 2
        self.policy_noise = .2
        self.noise_clip = .5

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.l1_dim = l1_dim
        self.l2_dim = l2_dim
        self.batch_size = batch_size
        self.max_replay_size = max_replay_size

        # build the agent
        self.build_agent()
        # with "tau = 1", we initialize the target networks the same as the main networks
        self.update_target_network(tau=1)

    def build_agent(self):
        # build the actor-critic networks and also their target networks
        self.actor = Actor(self.state_dim, self.action_dim,
                           self.l1_dim, self.l2_dim, self.alpha)
        self.target_actor = copy.deepcopy(self.actor)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.l1_dim, self.l2_dim, self.beta)
        self.target_critic = copy.deepcopy(self.critic)
        # build the replay buffer
        self.replaybuffer = ReplayBuffer(self.max_replay_size,
                                         self.state_dim, self.action_dim)
        # build the OUNoise for action selection
        self.noise = OUNoise(self.action_dim)

    def act(self, state):
        state = T.tensor(state, dtype=T.float)
        action = self.actor(state)
        noisy_action = action + T.tensor(self.noise(), dtype=T.float)
        return noisy_action.cpu().detach().numpy()

    # store a transition in the replay buffer
    def remember(self, state, action, reward, next_state, done):
        self.replaybuffer.store(state, action, reward, next_state, done)

    def sample_replaybuffer(self):
        # sample from the ReplayBuffer
        states, actions, rewards, next_states, dones = \
            self.replaybuffer.sample(self.batch_size)
        states = T.tensor(states, dtype=T.float)
        actions = T.tensor(actions, dtype=T.float)
        rewards = T.tensor(rewards, dtype=T.float)
        next_states = T.tensor(next_states, dtype=T.float)
        dones = T.tensor(dones)
        return states, actions, rewards, next_states, dones

    def step(self):
        # we cannot learn until the replay buffer holds at least one batch
        # of transitions
        if self.replaybuffer.mem_cntr < self.batch_size:
            return
        self.update_actor_count += 1
        # get transition samples from the replay buffer
        states, actions, rewards, next_states, dones = self.sample_replaybuffer()
        # update the critic network
        self.update_critic(states, actions, rewards, next_states, dones)
        if self.update_actor_count % self.update_actor_freq == 0:
            # update the actor network (delayed)
            self.update_actor(states)
            # update target network parameters
            self.update_target_network()

    def update_critic(self, states, actions, rewards, next_states, dones):
        with T.no_grad():
            # Select the next action according to the target policy and add
            # clipped noise (target-policy smoothing).
            noise = (T.randn_like(actions) * self.policy_noise
                     ).clamp(-self.noise_clip, self.noise_clip)
            next_action = (self.target_actor(next_states) + noise
                           ).clamp(-self.max_action, self.max_action)
            # Compute the target Q value as the minimum of the two critics.
            target_Q1, target_Q2 = self.target_critic(next_states, next_action)
            target_Q = T.min(target_Q1, target_Q2)
            # NOTE: `dones` is assumed to hold the not-done mask (1 - done),
            # so multiplying by it zeroes the bootstrap term on terminal steps.
            target_Q = (rewards.view(self.batch_size, 1)
                        + self.gamma * target_Q.view(self.batch_size, 1)
                        * dones.view(self.batch_size, 1))

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(states, actions)
        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
        # Optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

    def update_actor(self, states):
        # here we use the output of the actor network, NOT the noisy action,
        # because exploration noise is only needed when actually interacting
        # with the environment
        actions = self.actor(states)
        actor_loss = -self.critic.q1_forward(states, actions).mean()
        # Optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

    def update_target_network(self, tau=None):
        # use the local `tau`, so that tau=1 performs the initial hard copy
        tau = self.tau if tau is None else tau
        # polyak averaging to update the target critic network
        for param, target_param in zip(self.critic.parameters(),
                                       self.target_critic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        # polyak averaging to update the target actor network
        for param, target_param in zip(self.actor.parameters(),
                                       self.target_actor.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
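A hypothetical episode loop for the PyTorch TD3 Agent above; LunarLanderContinuous-v2 matches the default state_dim=8 and action_dim=2 but is still an assumption, as is the Actor bounding its own output to the environment's action range:

import gym

env = gym.make('LunarLanderContinuous-v2')  # assumed 8-dim state, 2-dim action
agent = Agent(env, alpha=1e-3, beta=1e-3, tau=0.005, gamma=0.99)

for episode in range(1000):
    state, done, score = env.reset(), False, 0.0
    while not done:
        action = agent.act(state)  # assumes the Actor bounds its own output
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.step()  # one critic update (and a delayed actor update) per env step
        state, score = next_state, score + reward
    print(f'episode {episode}: return {score:.1f}')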
import itertools
import os
import time
from copy import deepcopy
from typing import Any, Dict, List

import numpy as np
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter

# Project-local helpers assumed importable: core, ReplayBuffer, colorize, set_seeds.


class SAC:
    def __init__(self, env, test_env, actor_critic=core.MLPActorCritic,
                 ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100,
                 replay_size=int(1e6), gamma=0.99, polyak=0.995,
                 entropy_tuning: bool = False, lr=1e-3, alpha=0.2,
                 batch_size=100, start_steps=10000, update_after=1000,
                 update_every=50, act_noise=0.01, max_ep_len=1000,
                 device='cpu', num_test_episodes=1, save_freq=2,
                 log_mode: List[str] = ["stdout"], log_key: str = "timestep",
                 save_model: str = "checkpoints", checkpoint_path: str = None,
                 log_interval: int = 10, load_model=False,
                 dir_prefix: str = None):
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.seed = seed
        self.env = env
        self.test_env = test_env
        self.obs_dim = env.observation_space.shape
        self.act_dim = env.action_space.shape[0]
        self.act_limit = env.action_space.high[0]
        self.replay_size = replay_size
        self.batch_size = batch_size
        self.load_model = load_model
        self.log_key = log_key
        self.save_model = save_model
        self.checkpoint_path = checkpoint_path
        self.lr = lr
        self.ac_kwargs = ac_kwargs
        self.steps_per_epoch = steps_per_epoch
        self.epochs = epochs
        self.max_ep_len = max_ep_len
        self.gamma = gamma
        self.polyak = polyak
        self.alpha = alpha
        self.entropy_tuning = entropy_tuning
        self.start_steps = start_steps
        self.update_after = update_after
        self.update_every = update_every
        self.save_freq = save_freq
        self.action_time_step = 0  # number of updates
        self.current_timestep = 0
        self.current_epoch = 0
        self.dir_prefix = dir_prefix

        # Store the weights and scores in a new, timestamped directory.
        self.directory = "logs/sac_single_Agent_{}{}/".format(
            self.dir_prefix, time.strftime("%Y%m%d-%H%M%S"))
        os.makedirs(self.directory, exist_ok=True)
        self.model_dir = os.path.join(self.directory, 'model_param/')
        os.makedirs(self.model_dir)

        # Tensorboard writer object
        self.writer = SummaryWriter(log_dir=self.directory + 'tensorboard/')
        print("Logging to {}\n".format(self.directory + 'tensorboard/'))

        self.num_test_episodes = num_test_episodes

        # Create the actor-critic module and target networks. Strictly
        # speaking SAC needs no target actor, only target critics; the whole
        # module is copied here for convenience.
        self.ac = actor_critic(self.env.observation_space,
                               self.env.action_space,
                               **ac_kwargs).to(self.device)
        self.ac_targ = deepcopy(self.ac).to(self.device)

        if self.load_model:
            if os.path.exists(self.checkpoint_path):
                self.ac.load_state_dict(
                    torch.load(os.path.abspath(self.checkpoint_path)))
                self.ac_targ = deepcopy(self.ac).to(self.device)

        # Freeze target networks with respect to optimizers
        # (only update via polyak averaging).
        for p in self.ac_targ.parameters():
            p.requires_grad = False

        # List of parameters for both Q-networks, materialized as a list so
        # it can be iterated repeatedly (a bare itertools.chain would be
        # exhausted after the optimizer consumed it once).
        self.q_params = list(itertools.chain(self.ac.q1.parameters(),
                                             self.ac.q2.parameters()))

        # Set up optimizers and LR schedulers for the policy and Q-functions.
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.lr)
        self.pi_scheduler = StepLR(self.pi_optimizer, step_size=1, gamma=0.96)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.q_scheduler = StepLR(self.q_optimizer, step_size=1, gamma=0.96)

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                          act_dim=self.act_dim,
                                          size=self.replay_size)

        # Automatic entropy tuning, following
        # https://github.com/SforAiDl/genrl/blob/master/genrl/deep/agents/sac/sac.py
        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

        # No action-scale/bias handling is needed here: the action limit is
        # obtained inside MLPActorCritic, and for the CityLearn environment
        # actions are bounded in [-1/3, +1/3] with zero bias.

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        # Initialize logs
        self.empty_logs()

        # Count variables (protip: try to get a feel for how networks of
        # different sizes behave!)
        var_counts = tuple(core.count_vars(module)
                           for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        self.logs["var_counts"] = var_counts
        print(colorize(
            '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts,
            'green', bold=True))
        self.writer.add_scalar('Number of parameters/pi', var_counts[0])
        self.writer.add_scalar('Number of parameters/q1', var_counts[1])
        self.writer.add_scalar('Number of parameters/q2', var_counts[2])

    def load_weights(self, weights) -> None:
        """Load weights for the agent from a pretrained model."""
        self.ac.q1.load_state_dict(weights["q1_weights"])
        self.ac.q2.load_state_dict(weights["q2_weights"])
        self.ac.pi.load_state_dict(weights["policy_weights"])

    def empty_logs(self):
        """Empties logs."""
        self.logs = {}
        self.logs["q1_loss"] = []
        self.logs["q2_loss"] = []
        self.logs["policy_loss"] = []
        self.logs["alpha_loss"] = []
        self.logs["var_counts"] = ()

    @staticmethod
    def safe_mean(log: List[int]):
        """Returns 0 if there are no elements in the log."""
        return np.mean(log) if len(log) > 0 else 0

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "policy_loss": self.safe_mean(self.logs["policy_loss"]),
            "q1_loss": self.safe_mean(self.logs["q1_loss"]),
            "q2_loss": self.safe_mean(self.logs["q2_loss"]),
            "alpha_loss": self.safe_mean(self.logs["alpha_loss"]),
        }
        self.empty_logs()
        return logs

    # Set up function for computing SAC Q-losses
    def compute_loss_q(self, data):
        o, a, r, o2, d = (data['obs'], data['act'], data['rew'],
                          data['obs2'], data['done'])

        q1 = self.ac.q1(o, a)
        q2 = self.ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = self.ac.pi(o2)
            # Target Q-values
            q1_pi_targ = self.ac_targ.q1(o2, a2)
            q2_pi_targ = self.ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup) ** 2).mean()
        loss_q2 = ((q2 - backup) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Logging into tensorboard
        self.writer.add_scalar('loss/Critic1_loss', loss_q1, self.current_timestep)
        self.writer.add_scalar('loss/Critic2_loss', loss_q2, self.current_timestep)

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())
        self.logs["q1_loss"].append(loss_q1.item())
        self.logs["q2_loss"].append(loss_q2.item())
        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(self, data):
        o = data['obs']
        pi, logp_pi = self.ac.pi(o)
        q1_pi = self.ac.q1(o, pi)
        q2_pi = self.ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (self.alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy())

        # Alpha loss (only meaningful with automatic entropy tuning)
        alpha_loss = torch.tensor(0.0).to(self.device)
        if self.entropy_tuning:
            alpha_loss = -(self.log_alpha
                           * (logp_pi + self.target_entropy).detach()).mean()
            self.writer.add_scalar('loss/entropy_tuning_loss', alpha_loss,
                                   self.current_timestep)
            self.logs["alpha_loss"].append(alpha_loss.item())

        # Logging into tensorboard
        self.writer.add_scalar('loss/Actor_loss', loss_pi, self.current_timestep)
        self.logs["policy_loss"].append(loss_pi.item())
        return loss_pi, alpha_loss, pi_info

    def update(self, data):
        # First run one gradient descent step for Q1 and Q2
        self.q_optimizer.zero_grad()
        loss_q, q_info = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()

        # Freeze Q-networks so we don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in self.q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        self.pi_optimizer.zero_grad()
        loss_pi, alpha_loss, pi_info = self.compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()

        if self.entropy_tuning:
            # Next run one gradient descent step for alpha.
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            self.alpha = self.log_alpha.exp()
            self.writer.add_scalar('entropy_tuning_param/alpha', self.alpha,
                                   self.current_timestep)

        # Unfreeze Q-networks so they can be optimized at the next SAC step.
        for p in self.q_params:
            p.requires_grad = True

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to
                # update target params, as opposed to "mul" and "add", which
                # would make new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)

    def reset_action_tracker(self):
        self.action_tracker = []

    def reset_reward_tracker(self):
        self.reward_tracker = []

    def get_action(self, o, deterministic=False):
        return self.ac.act(
            torch.as_tensor(o, dtype=torch.float32).to(self.device),
            deterministic)

    def eval_agent(self, test=True):
        if test:
            eval_env = self.test_env
            t_env = 'testing environment'
        else:
            eval_env = deepcopy(self.env)
            t_env = 'training environment'
        ep_rews = []
        for j in range(self.num_test_episodes):
            o, d, ep_ret, ep_len = eval_env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0),
                # min-max normalizing observations with the buffer statistics.
                nom = o - self.replay_buffer.obs_buf_min
                denom = (self.replay_buffer.obs_buf_max
                         - self.replay_buffer.obs_buf_min)
                denom[denom == 0] = 1  # avoid division by zero
                o = nom / denom
                o, r, d, _ = eval_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
            ep_rews.append(ep_ret)
        print("Evaluating on the {} for {} episodes, Mean Reward: {}".format(
            t_env, self.num_test_episodes, np.mean(ep_rews)))
        self.writer.add_scalar("Scores/ramping",
                               eval_env.cost()['ramping'], self.current_epoch)
        self.writer.add_scalar("Scores/1-load_factor",
                               eval_env.cost()['1-load_factor'], self.current_epoch)
        self.writer.add_scalar("Scores/average_daily_peak",
                               eval_env.cost()['average_daily_peak'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/peak_demand",
                               eval_env.cost()['peak_demand'], self.current_epoch)
        self.writer.add_scalar("Scores/net_electricity_consumption",
                               eval_env.cost()['net_electricity_consumption'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/total",
                               eval_env.cost()['total'], self.current_epoch)
        self.writer.add_scalar("Scores/test_episode_reward",
                               np.mean(ep_rews), self.current_epoch)
        return np.mean(ep_rews), eval_env.cost()['total']

    def learn(self):
        ep_num = 0
        best_score = 1.5
        return_per_episode = []

        # Prepare for interaction with the environment.
        total_steps = self.steps_per_epoch * self.epochs
        epoch_start_time = time.time()
        o, ep_ret, ep_len = self.env.reset(), 0, 0

        # Main loop: collect experience in the env and update/log each epoch.
        for t in range(total_steps):
            self.current_timestep = t  # for logging

            # Once start_steps have elapsed (one simulated year of data),
            # compute the min/max of observations and rewards in the buffer
            # and use them for normalization.
            if t == self.start_steps:
                self.replay_buffer.collect_minmax()

            # Until start_steps have elapsed, randomly sample actions from a
            # uniform distribution for better exploration. Afterwards, use
            # the learned policy.
            if t > self.start_steps:
                a = self.get_action(o)
            else:
                a = self.env.action_space.sample()

            # Step the env
            o2, r, d, _ = self.env.step(a)
            self.writer.add_scalar('Rewards/single_Agent_reward', r,
                                   self.current_timestep)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state).
            d = False if ep_len == self.max_ep_len else d

            # Store experience to replay buffer
            self.replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # the most recent observation!
            o = o2

            # End of trajectory handling
            if d or (ep_len == self.max_ep_len):
                ep_num += 1
                return_per_episode.append(ep_ret)
                self.writer.add_scalar('Rewards/return_per_episode', ep_ret, ep_num)
                o, ep_ret, ep_len = self.env.reset(), 0, 0

            # Update handling
            if t >= self.update_after and t % self.update_every == 0:
                for _ in range(self.update_every):
                    batch = self.replay_buffer.sample_batch(self.batch_size)
                    self.update(data=batch)

            # End of epoch handling
            if (t + 1) % self.steps_per_epoch == 0:
                epoch = (t + 1) // self.steps_per_epoch
                self.current_epoch += 1
                self.pi_scheduler.step()
                self.q_scheduler.step()
                print('Epoch:', epoch,
                      'Policy_LR:', self.pi_scheduler.get_lr(),
                      'Critic_LR:', self.q_scheduler.get_lr())
                print('time step: {} , epoch: {} , time elapsed: {} '.format(
                    t + 1, epoch, time.time() - epoch_start_time))
                train_mean_return, test_score = self.eval_agent(test=False)
                epoch_start_time = time.time()
                print('\n')

                # Save model
                if epoch % self.save_freq == 0:
                    if test_score < best_score:
                        best_score = test_score
                        print('Better evaluation score, saving model to {}'.format(
                            os.path.join(self.directory, 'model_param/')))
                        torch.save(self.ac.state_dict(),
                                   os.path.join(self.directory, 'model_param/')
                                   + 'checkpoint.pt')

            if (t + 1) % self.steps_per_epoch == 0:
                self.action_time_step = 0
            else:
                self.action_time_step += 1

        return epoch, train_mean_return * (self.batch_size)
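Constructing and running the SAC agent above could look like this; `make_citylearn_env` is a hypothetical helper (the class expects CityLearn-style `cost()` metrics during evaluation), and all hyperparameter values are illustrative:

env = make_citylearn_env(train=True)        # hypothetical constructor
test_env = make_citylearn_env(train=False)  # hypothetical constructor

agent = SAC(env, test_env,
            steps_per_epoch=8760,  # e.g. one simulated year per epoch
            epochs=10,
            start_steps=8760,      # collect a year of data before using min/max stats
            update_after=8760,
            entropy_tuning=True)
final_epoch, scaled_train_return = agent.learn()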
import numpy as np
import tensorflow as tf


class DQNAgent:
    def __init__(self, state_dim, action_dim, tau, epsilon, mem_size,
                 batch_size, gamma, lr):
        self.sess = tf.Session()
        self.s = tf.placeholder(tf.float32, [None, *state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, *state_dim], 'next_state')
        self.t = tf.placeholder(tf.float32, [None, ], 'target')
        self.action_in = tf.placeholder(tf.int32, [None, ], 'action')
        self.action_dim = action_dim
        self.action = tf.one_hot(self.action_in, depth=action_dim)
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.memory = ReplayBuffer(max_size=mem_size)

        # Exploration parameters: linear epsilon decay down to 0.1.
        self.epsilon = epsilon
        self.decay_steps = 5000
        self.decay_inc = (epsilon - 0.1) / 4000

        # Target-network replacement schedule.
        self.replace_counter = 0
        self.replace_iter = 300

        self.Q_net = QNet(sess=self.sess, lr=1e-3, action_dim=action_dim,
                          S=self.s, S_=self.s_, tau=tau)
        # Q-value of the action actually taken.
        self.q_eval = tf.reduce_sum(tf.multiply(self.action, self.Q_net.q), axis=1)
        self.loss = tf.reduce_mean(tf.squared_difference(self.q_eval, self.t))
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        # Epsilon-greedy action selection.
        if s.ndim < 2:
            s = [s]
        q_values = self.sess.run(self.Q_net.q, {self.s: s})
        a_best = np.argmax(q_values)
        a = a_best if np.random.random() > self.epsilon \
            else np.random.randint(self.action_dim)
        return a

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        states, actions, rewards, next_states, dones = \
            self.memory.sample(self.batch_size)
        # Use the target network to select the best action for the next state...
        action_next_target = np.argmax(
            self.sess.run(self.Q_net.q_, {self.s_: next_states}), axis=1)
        # ...and the eval network to obtain the next-state value for the
        # target (a double-estimator decoupling of selection and evaluation).
        q_next = self.sess.run(self.q_eval, {
            self.s: next_states,
            self.action_in: action_next_target
        })
        q_next[dones] = 0  # no bootstrapping from terminal states
        target = rewards + self.gamma * q_next

        loss, _ = self.sess.run([self.loss, self.train_op], {
            self.s: states,
            self.s_: next_states,
            self.action_in: actions,
            self.t: target
        })
        if self.replace_counter % self.replace_iter == 0:
            self.sess.run(self.Q_net.replace)
        self.epsilon = max(0.1, self.epsilon - self.decay_inc)
        self.replace_counter += 1
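A minimal epsilon-greedy control loop for the DQNAgent above; the CartPole environment, episode budget, and warm-up threshold are illustrative assumptions:

import gym

env = gym.make('CartPole-v1')  # assumed discrete-action env
agent = DQNAgent(state_dim=env.observation_space.shape,
                 action_dim=env.action_space.n,
                 tau=0.01, epsilon=1.0, mem_size=100000,
                 batch_size=32, gamma=0.99, lr=1e-3)

total_steps = 0
for episode in range(500):
    s, done = env.reset(), False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)
        agent.store(s, a, r, s_, done)
        total_steps += 1
        if total_steps > agent.batch_size:  # warm-up before learning
            agent.learn()
        s = s_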
import copy

import torch as T
import torch.nn.functional as F


class Agent(object):
    def __init__(self, env, alpha, beta, tau, gamma, max_replay_size=1000000,
                 batch_size=64, l1_dim=400, l2_dim=300, state_dim=8, action_dim=2):
        self.env = env
        self.alpha = alpha   # learning rate for actor network
        self.beta = beta     # learning rate for critic network
        self.tau = tau       # polyak averaging parameter
        self.gamma = gamma   # discount factor of reward
        self.max_replay_size = max_replay_size
        self.batch_size = batch_size
        self.l1_dim = l1_dim
        self.l2_dim = l2_dim
        self.state_dim = state_dim
        self.action_dim = action_dim
        # build the agent
        self.build_agent()
        # with "tau = 1", we initialize the target networks the same as the main networks
        self.update_target_network(tau=1)

    def build_agent(self):
        # build the actor-critic networks and also their target networks
        self.actor = Actor(self.state_dim, self.action_dim,
                           self.l1_dim, self.l2_dim, self.alpha)
        self.target_actor = copy.deepcopy(self.actor)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.l1_dim, self.l2_dim, self.beta)
        self.target_critic = copy.deepcopy(self.critic)
        # build the replay buffer
        self.replaybuffer = ReplayBuffer(self.max_replay_size,
                                         self.state_dim, self.action_dim)
        # build the OUNoise for action selection
        self.noise = OUNoise(self.action_dim)

    def act(self, state):
        state = T.tensor(state, dtype=T.float)
        action = self.actor(state)
        noisy_action = action + T.tensor(self.noise(), dtype=T.float)
        return noisy_action.cpu().detach().numpy()

    # store a transition in the replay buffer
    def remember(self, state, action, reward, next_state, done):
        self.replaybuffer.store(state, action, reward, next_state, done)

    def sample_replaybuffer(self):
        # sample from the ReplayBuffer
        states, actions, rewards, next_states, dones = \
            self.replaybuffer.sample(self.batch_size)
        states = T.tensor(states, dtype=T.float)
        actions = T.tensor(actions, dtype=T.float)
        rewards = T.tensor(rewards, dtype=T.float)
        next_states = T.tensor(next_states, dtype=T.float)
        dones = T.tensor(dones)
        return states, actions, rewards, next_states, dones

    def step(self):
        # we cannot learn until the replay buffer holds at least one batch
        # of transitions
        if self.replaybuffer.mem_cntr < self.batch_size:
            return
        # get transition samples from the replay buffer
        states, actions, rewards, next_states, dones = self.sample_replaybuffer()
        # update the critic network
        self.update_critic(states, actions, rewards, next_states, dones)
        # update the actor network
        self.update_actor(states)
        # update target network parameters
        self.update_target_network()

    def update_critic(self, states, actions, rewards, next_states, dones):
        # update the critic network
        with T.no_grad():
            target_actions = self.target_actor(next_states)
            target_critic_values = self.target_critic(next_states, target_actions)
            # NOTE: `dones` is assumed to hold the not-done mask (1 - done),
            # so multiplying by it zeroes the bootstrap term on terminal steps.
            target_critic_values = (rewards.view(self.batch_size, 1)
                                    + self.gamma
                                    * target_critic_values.view(self.batch_size, 1)
                                    * dones.view(self.batch_size, 1))
        critic_values = self.critic(states, actions)
        critic_loss = F.mse_loss(target_critic_values, critic_values)
        # In PyTorch we need to set the gradients to zero before starting
        # backpropagation, because PyTorch accumulates gradients over
        # subsequent backward passes.
        # optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

    def update_actor(self, states):
        # here we use the output of the actor network, NOT the noisy action,
        # because exploration noise is only needed when actually interacting
        # with the environment
        actions = self.actor(states)
        # NOTICE: we do not combine the critic's action gradient and the
        # actor's parameter gradient by hand; autograd differentiates
        # -Q(s, pi(s)) with respect to the actor parameters, applying the
        # chain rule through the action automatically (refer to the original
        # paper).
        actor_loss = -self.critic(states, actions).mean()
        # optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

    def update_target_network(self, tau=None):
        # use the local `tau`, so that tau=1 performs the initial hard copy
        tau = self.tau if tau is None else tau
        # polyak averaging to update the target critic network
        for param, target_param in zip(self.critic.parameters(),
                                       self.target_critic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        # polyak averaging to update the target actor network
        for param, target_param in zip(self.actor.parameters(),
                                       self.target_actor.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
import numpy as np
import tensorflow as tf


class DQNAgent:
    def __init__(self, state_dim, action_dim, f1, tau, epsilon, mem_size,
                 batch_size, gamma, lr):
        self.sess = tf.Session()
        self.s = tf.placeholder(tf.float32, [None, *state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, *state_dim], 'next_state')
        self.t = tf.placeholder(tf.float32, [None, ], 'target')
        self.action_in = tf.placeholder(tf.int32, [None, ], 'action')
        self.action_dim = action_dim
        self.action = tf.one_hot(self.action_in, depth=action_dim)
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma

        # Exploration parameters: linear epsilon decay down to 0.1.
        self.epsilon = epsilon
        self.decay_steps = 5000
        self.decay_inc = (epsilon - 0.1) / 4000

        self.replace_counter = 0
        self.memory = ReplayBuffer(max_size=mem_size)
        self.Q_net = QNet(sess=self.sess, lr=1e-3, action_dim=action_dim,
                          f1=f1, S=self.s, S_=self.s_, tau=tau)
        # Q-value of the action actually taken, and the absolute TD error.
        self.q_action = tf.reduce_sum(tf.multiply(self.action, self.Q_net.q), axis=1)
        self.error = tf.abs(self.q_action - self.t)
        self.loss = tf.reduce_mean(tf.square(self.error))
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        # Epsilon-greedy action selection.
        if s.ndim < 2:
            s = [s]
        q_values = self.sess.run(self.Q_net.q, {self.s: s})
        a_best = np.argmax(q_values)
        a = a_best if np.random.random() > self.epsilon \
            else np.random.randint(self.action_dim)
        return a

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        states, actions, rewards, next_states, dones = \
            self.memory.sample(self.batch_size)
        # Standard DQN target: bootstrap from the target network's max Q,
        # zeroed for terminal transitions.
        q_next = self.sess.run(self.Q_net.q_, {self.s_: next_states})
        q_next[dones] = np.zeros([self.action_dim])
        target = rewards + self.gamma * np.max(q_next, axis=1)

        errors, _ = self.sess.run([self.error, self.train_op], {
            self.s: states,
            self.s_: next_states,
            self.action_in: actions,
            self.t: target
        })
        # Periodically replace the target-network parameters.
        if self.replace_counter % 300 == 0:
            self.sess.run(self.Q_net.replace)
        self.epsilon = max(0.1, self.epsilon - self.decay_inc)
        self.replace_counter += 1
import gym
import numpy as np
import torch


def main(args):
    if 'L2M2019Env' in args.env_name:
        env = L2M2019Env(visualize=False, difficulty=args.difficulty)
        test_env = L2M2019Env(visualize=False, difficulty=args.difficulty)
    else:
        env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
    device = torch.device(args.device)

    data = np.load('./official_obs_scaler.npz')
    obs_mean, obs_std = data['mean'], data['std']

    # 1. Set the necessary seeds.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    test_env.seed(args.seed + 999)

    # 2. Create the actor-critic, replay buffer, and GAC.
    if 'L2M2019Env' in args.env_name:
        obs_dim = 99
    else:
        obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_high = env.action_space.high
    act_low = env.action_space.low

    actor_critic = MLPActorCritic(obs_dim, act_dim,
                                  hidden_sizes=args.hidden_sizes).to(device)
    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)
    gac = GAC(actor_critic, replay_buffer, device=device, gamma=args.gamma,
              alpha_start=args.alpha_start, alpha_min=args.alpha_min,
              alpha_max=args.alpha_max)

    def act_encoder(y):
        # y in [min, max] ==> x in [-1, 1]
        return (y - act_low) / (act_high - act_low) * 2.0 - 1.0

    def act_decoder(x):
        # x in [-1, 1] ==> y in [min, max]; the inverse of act_encoder
        # requires adding act_low here (equivalent to subtracting it only
        # when act_low == 0).
        return (x + 1.0) / 2.0 * (act_high - act_low) + act_low

    def get_observation(env):
        obs = np.array(env.get_observation()[242:])
        obs = (obs - obs_mean) / obs_std
        state_desc = env.get_state_desc()
        p_body = [state_desc['body_pos']['pelvis'][0],
                  -state_desc['body_pos']['pelvis'][2]]
        v_body = [state_desc['body_vel']['pelvis'][0],
                  -state_desc['body_vel']['pelvis'][2]]
        v_tgt = env.vtgt.get_vtgt(p_body).T
        return np.append(obs, v_tgt)

    def get_reward(env):
        reward = 10.0  # reward for not falling down
        state_desc = env.get_state_desc()
        p_body = [state_desc['body_pos']['pelvis'][0],
                  -state_desc['body_pos']['pelvis'][2]]
        v_body = [state_desc['body_vel']['pelvis'][0],
                  -state_desc['body_vel']['pelvis'][2]]
        v_tgt = env.vtgt.get_vtgt(p_body).T
        vel_penalty = np.linalg.norm(v_body - v_tgt)
        muscle_penalty = 0
        for muscle in sorted(state_desc['muscles'].keys()):
            muscle_penalty += np.square(
                state_desc['muscles'][muscle]['activation'])
        ret_r = reward - (vel_penalty * 3 + muscle_penalty * 1)
        if vel_penalty < 0.3:
            ret_r += 10
        return ret_r

    # 3. Start training.
    def get_action(o, deterministic=False):
        o = torch.FloatTensor(o.reshape(1, -1)).to(device)
        a = actor_critic.act(o, deterministic)
        return a

    def test_agent():
        test_ret, test_len = 0, 0
        for j in range(args.epoch_per_test):
            _, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o = get_observation(test_env)
            while not (d or (ep_len == args.max_ep_len)):
                # Take deterministic actions at test time
                a = get_action(o, True)
                a = act_decoder(a)
                for _ in range(args.frame_skip):
                    _, r, d, _ = test_env.step(a)
                    ep_ret += r
                    ep_len += 1
                    if d:
                        break
                o = get_observation(test_env)
            test_ret += ep_ret
            test_len += ep_len
        return test_ret / args.epoch_per_test, test_len / args.epoch_per_test

    total_step = args.total_epoch * args.steps_per_epoch
    _, d, ep_len = env.reset(), False, 0
    o = get_observation(env)
    for t in range(1, total_step + 1):
        if t <= args.start_steps:
            a = act_encoder(env.action_space.sample())
        else:
            a = get_action(o, deterministic=False)
        a = act_decoder(a)

        r = 0.0
        for _ in range(args.frame_skip):
            _, _, d, _ = env.step(a)
            r += get_reward(env)
            ep_len += 1
            if d:
                break
        o2 = get_observation(env)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state).
        d = False if ep_len == args.max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r * args.reward_scale, o2, d)
        o = o2

        if d or (ep_len == args.max_ep_len):
            _, ep_len = env.reset(obs_as_dict=False), 0  # L2M-specific reset flag
            o = get_observation(env)

        if t >= args.update_after and t % args.steps_per_update == 0:
            for _ in range(args.steps_per_update):
                loss_a, loss_c, alpha = gac.update(args.batch_size)
            gac.update_beta()
            print("loss_actor = {:<22}, loss_critic = {:<22}, "
                  "alpha = {:<20}, beta = {:<20}".format(
                      loss_a, loss_c, alpha, gac.beta))

        # End of epoch handling
        if t >= args.update_after and t % args.steps_per_epoch == 0:
            test_ret, test_len = test_agent()
            print("Step {:>10}: test_ret = {:<20}, test_len = {:<20}".format(
                t, test_ret, test_len))
            print("-----------------------------------------------------------")
            yield t, test_ret, test_len, actor_critic
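Since main() above is a generator (it yields after each test evaluation), a driver consumes it with a for loop. The argument names below mirror how `args` is used in the listing, but every default value is an illustrative assumption:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', default='L2M2019Env')
    parser.add_argument('--difficulty', type=int, default=2)
    parser.add_argument('--device', default='cuda')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--hidden_sizes', type=int, nargs='+', default=[256, 256])
    parser.add_argument('--buffer_size', type=int, default=int(1e6))
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--alpha_start', type=float, default=1.2)
    parser.add_argument('--alpha_min', type=float, default=1.0)
    parser.add_argument('--alpha_max', type=float, default=1.5)
    parser.add_argument('--total_epoch', type=int, default=1000)
    parser.add_argument('--steps_per_epoch', type=int, default=4000)
    parser.add_argument('--start_steps', type=int, default=10000)
    parser.add_argument('--update_after', type=int, default=10000)
    parser.add_argument('--steps_per_update', type=int, default=50)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--frame_skip', type=int, default=4)
    parser.add_argument('--max_ep_len', type=int, default=1000)
    parser.add_argument('--epoch_per_test', type=int, default=5)
    args = parser.parse_args()

    best_ret = -float('inf')
    for t, test_ret, test_len, actor_critic in main(args):
        if test_ret > best_ret:
            best_ret = test_ret  # checkpointing could be hooked in here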