class BaseAgent:
    """Skeleton of a replay-based value-learning agent.

    The constructor builds an online ("policy") network, a target network,
    an extra ``det_net`` and a behaviour network whose weights are loaded from
    ``BPOLICY_WEIGHTS_PATH``. Every transition passed to ``update`` is stored
    in a replay buffer, and subclasses implement the learning rule by
    overriding ``updateNetwork``.

    Args:
        features: input size forwarded to ``Network``.
        actions: output size forwarded to ``Network``.
        params: mapping providing ``alpha``, ``epsilon``, ``target_refresh``,
            ``buffer_size``, ``h1`` and ``h2``.
    """

    # Checkpoint holding the behaviour-policy weights loaded in __init__.
    BPOLICY_WEIGHTS_PATH = (
        "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/"
        "prediction_SARSA/agents/net_params.pt"
    )
    # Mini-batch size; the buffer must hold strictly more than this many
    # transitions before learning starts.
    BATCH_SIZE = 200

    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']
        self.h1 = params['h1']
        self.h2 = params['h2']

        # the "online" learning network and its periodically refreshed target
        self.policy_net = Network(features, self.h1, self.h2, actions).to(device)
        self.target_net = Network(features, self.h1, self.h2, actions).to(device)
        # built here but not used by any method of this class
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)

        # behaviour network: drives action selection and is never trained here
        self.bpolicy_net = Network(features, self.h1, self.h2, actions).to(device)
        # map_location lets a checkpoint saved on another device load here
        self.bpolicy_net.load_state_dict(
            torch.load(self.BPOLICY_WEIGHTS_PATH, map_location=device))

        # the optimizer covers _only_ the policy network; target network
        # parameters are copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # start the target network from the policy network's weights
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        """Pick actions from the behaviour network, perturbing action 1.

        The greedy action of ``bpolicy_net`` is taken for every row of ``x``.
        Whenever the greedy action is 1 it is replaced, with probability
        ``epsilon``, by a uniform draw from ``{0, 2}``.

        Args:
            x: a single state or a batch of states accepted by the network.

        Returns:
            1-D integer tensor with one action per state, placed on the same
            device as the network output.
        """
        q_s, _ = self.bpolicy_net(x)
        # A single un-batched state yields a 1-D output. Test the rank rather
        # than ``shape[0] == 3``, which misfires on a batch of three states.
        if q_s.dim() == 1:
            q_s = q_s.unsqueeze(0)

        # .cpu() is required before .numpy() when the network lives on a GPU
        act = torch.max(q_s, 1).indices.detach().cpu().numpy()

        for i in np.flatnonzero(act == 1):
            if np.random.rand() < self.epsilon:
                act[i] = np.random.choice([0, 2])

        return torch.from_numpy(act).to(q_s.device)

    def updateNetwork(self, samples):
        """Learning step on a sampled mini-batch; a no-op in the base class."""
        pass

    def update(self, s, a, sp, r, gamma):
        """Store one transition and, once possible, learn from a mini-batch.

        Args:
            s: state.
            a: action taken in ``s``.
            sp: next state.
            r: reward.
            gamma: discount attached to this transition.
        """
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # refresh target net <- policy net before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # a single update per call once the buffer is large enough
        if len(self.buffer) > self.BATCH_SIZE:
            samples, _ = self.buffer.sample(self.BATCH_SIZE)
            self.updateNetwork(samples)
class NAF:
    """Normalized Advantage Function (NAF) agent on the TF1 graph API.

    Two copies of the network are built under the scopes ``MODEL_NAME`` and
    ``TARGET_MODEL_NAME``. The first is trained on a Huber loss against a
    bootstrapped target computed from the second, which tracks the first
    through a soft update run after every training step.

    Args:
        prep: preprocessor; ``prep.process(state)`` returns ``(state, skip)``.
        build: a ``NAF.Build`` member selecting the network layout.
        policy: exploration helper exposing ``reset()`` and ``add_noise()``.
        state_dim: width of the state placeholder.
        action_dim: width of the action placeholder.
        monitor_directory: parent directory for the summary directory.
        buffer_size: capacity passed to ``ReplayBuffer``.
        batch_size: number of transitions sampled per training step.
        steps_before_train: global step at which training begins.
        train_freq: training iterations per environment step.
        num_steps: global step at which the model is saved.
        learning_rate: Adam learning rate.
        update_rate: soft target-update coefficient.
        max_reward: episode reward at which the agent counts as solved
            (training pauses while solved); ``None`` disables the check.
        detailed_summary: forwarded to ``architect.dense_block`` for the
            trained network only.
    """

    MODEL_NAME = "NAF"
    TARGET_MODEL_NAME = "target-NAF"

    class Build(Enum):
        SINGLE = 1
        MULTIPLE = 2
        HYDRA = 3

    def __init__(self, prep, build, policy, state_dim, action_dim, monitor_directory,
                 buffer_size=10000, batch_size=32, steps_before_train=100, train_freq=1,
                 num_steps=1000000, learning_rate=1e-3, update_rate=1e-3, max_reward=None,
                 detailed_summary=False):
        self.prep = prep
        self.build_mode = build
        self.policy = policy
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detailed_summary = detailed_summary

        self.discount = 0.99
        self.learning_rate = learning_rate
        self.target_update_rate = update_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.steps_before_train = steps_before_train
        self.train_freq = train_freq
        self.max_reward = max_reward
        self.max_iters = num_steps

        self.step = 0
        self.solved = False

        # layer widths of the shared trunk and of the mu / L / value heads
        self.state_layers = [64, 32]
        self.mu_layers = [16, 8, self.action_dim]
        self.l_layers = [16, 8, self._l_output_size(self.action_dim)]
        self.v_layers = [16, 8, 1]

        # graph handles, all populated by build()
        self.action_inputs = None
        self.reward_inputs = None
        self.done = None
        self.state_inputs = None
        self.state_outputs = None
        self.mu_outputs = None
        self.l_outputs = None
        self.value_outputs = None
        self.next_state_inputs = None
        self.next_state_outputs = None
        self.target_value_outputs = None
        self.target = None
        self.advantages = None
        self.q_values = None
        self.loss = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.target_update = None

        self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)

        self.build()

        self.merged = tf.summary.merge_all()
        self.session = tf.Session()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "learning rate": self.learning_rate,
                "batch size": self.batch_size,
                "update rate": self.target_update_rate,
                "buffer size": self.buffer_size,
                "build": self.build_mode.name,
                "train frequency": self.train_freq
            })
        self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph)

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

    @staticmethod
    def _l_output_size(action_dim):
        """Return the entry count of a lower-triangular action_dim x action_dim matrix.

        Integer division keeps the layer width an ``int``; true division
        would produce a float on Python 3.
        """
        return (action_dim * (action_dim + 1)) // 2

    def build(self):
        """Create placeholders, both networks, the loss and the training ops."""
        self.action_inputs = tf.placeholder(tf.float32, (None, self.action_dim))
        self.reward_inputs = tf.placeholder(tf.float32, (None, ))
        self.done = tf.placeholder(tf.float32, (None, ))

        (self.state_inputs, self.state_outputs, self.mu_outputs, self.l_outputs,
         self.value_outputs) = self.build_network(self.MODEL_NAME)
        (self.next_state_inputs, self.next_state_outputs, _, _,
         self.target_value_outputs) = self.build_network(self.TARGET_MODEL_NAME)

        # r + discount * (1 - done) * V_target(s')
        not_done = 1 - tf.expand_dims(self.done, 1)
        self.target = (tf.expand_dims(self.reward_inputs, 1)
                       + self.discount * not_done * self.target_value_outputs)

        # taken from https://github.com/carpedm20/NAF-tensorflow/blob/master/src/network.py
        # Unpack the flat L output into a triangular matrix whose diagonal is
        # exponentiated, then form P = L L^T.
        pivot = 0
        rows = []
        for idx in range(self.action_dim):
            count = self.action_dim - idx

            diag_elem = tf.exp(tf.slice(self.l_outputs, (0, pivot), (-1, 1)))
            non_diag_elems = tf.slice(self.l_outputs, (0, pivot + 1), (-1, count - 1))
            row = tf.pad(tf.concat((diag_elem, non_diag_elems), 1), ((0, 0), (idx, 0)))
            rows.append(row)

            pivot += count

        L = tf.transpose(tf.stack(rows, axis=1), (0, 2, 1))
        P = tf.matmul(L, tf.transpose(L, (0, 2, 1)))

        # A(s, a) = -1/2 (a - mu)^T P (a - mu)
        adv_term = tf.expand_dims(self.action_inputs - self.mu_outputs, -1)
        self.advantages = -tf.matmul(tf.transpose(adv_term, [0, 2, 1]),
                                     tf.matmul(P, adv_term)) / 2
        self.advantages = tf.reshape(self.advantages, [-1, 1])

        self.q_values = self.advantages + self.value_outputs

        self.loss = tf.reduce_mean(
            architect.huber_loss(self.q_values - tf.stop_gradient(self.target)))
        tf.summary.scalar("training_loss", self.loss)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step, tf.add(self.global_step, 1))

        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

        self.create_target_update_op()

    def build_network(self, name):
        """Build one network copy under variable scope ``name``.

        Detailed summaries are always disabled for the target network.

        Returns:
            Tuple ``(state_inputs, state_outputs, mu_outputs, l_outputs,
            value_outputs)``; ``state_outputs`` is ``None`` in ``MULTIPLE``
            mode, which has no shared trunk.

        Raises:
            ValueError: if ``self.build_mode`` is not a known ``Build`` member.
        """
        detailed_summary = self.detailed_summary
        if name == self.TARGET_MODEL_NAME:
            detailed_summary = False

        with tf.variable_scope(name):
            state_inputs = tf.placeholder(tf.float32, shape=(None, self.state_dim))

            if self.build_mode == self.Build.SINGLE:
                # shared trunk, each head is a single layer
                state_outputs = architect.dense_block(
                    state_inputs, self.state_layers, name="state_branch",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    state_outputs, [self.mu_layers[-1]], "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    state_outputs, [self.l_layers[-1]], "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    state_outputs, [self.v_layers[-1]], "value_branch",
                    detailed_summary=detailed_summary)
            elif self.build_mode == self.Build.MULTIPLE:
                # one private trunk per head, each head is a single layer
                state_outputs = None
                mu_state = architect.dense_block(
                    state_inputs, self.state_layers, name="mu_state",
                    detailed_summary=detailed_summary)
                l_state = architect.dense_block(
                    state_inputs, self.state_layers, name="l_state",
                    detailed_summary=detailed_summary)
                value_state = architect.dense_block(
                    state_inputs, self.state_layers, name="value_state",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    mu_state, [self.mu_layers[-1]], "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    l_state, [self.l_layers[-1]], "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    value_state, [self.v_layers[-1]], "value_branch",
                    detailed_summary=detailed_summary)
            elif self.build_mode == self.Build.HYDRA:
                # shared trunk, each head uses its full layer list
                state_outputs = architect.dense_block(
                    state_inputs, self.state_layers, name="state_branch",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    state_outputs, self.mu_layers, "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    state_outputs, self.l_layers, "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    state_outputs, self.v_layers, "value_branch",
                    detailed_summary=detailed_summary)
            else:
                raise ValueError("Wrong build type.")

        return state_inputs, state_outputs, mu_outputs, l_outputs, value_outputs

    def create_target_update_op(self):
        """Group the per-variable soft updates into ``self.target_update``."""
        # inspired by:
        # https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
        net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self.MODEL_NAME)
        target_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=self.TARGET_MODEL_NAME)

        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_ops = [
            v_target.assign_sub(self.target_update_rate * (v_target - v_source))
            for v_source, v_target in zip(net_vars, target_net_vars)
        ]
        self.target_update = tf.group(*update_ops)

    def learn(self):
        """Run one training step on a sampled batch, then the soft target update."""
        batch = self.buffer.sample(self.batch_size)

        merged, _ = self.session.run(
            [self.merged, self.train_op],
            feed_dict={
                self.state_inputs: batch["states"],
                self.action_inputs: batch["actions"],
                self.reward_inputs: batch["rewards"],
                self.next_state_inputs: batch["next_states"],
                self.done: batch["done"]
            })
        self.summary_writer.add_summary(merged, global_step=self.step)

        # target update
        self.session.run(self.target_update)

    def _advance_step(self):
        """Increment the graph's global step and mirror it into ``self.step``."""
        _, self.step = self.session.run([self.inc_global_step, self.global_step])

    def run_episode(self, env):
        """Play one episode in ``env``, storing transitions and training.

        Returns:
            Tuple ``(total_reward, step)`` with the episode's summed reward and
            the global step after the episode.
        """
        self.policy.reset()

        state = env.reset()
        state, skip = self.prep.process(state)

        total_reward = 0

        while True:
            # play: random action while the preprocessor asks to skip
            if skip:
                action = env.action_space.sample()
            else:
                action = self.session.run(
                    self.mu_outputs, feed_dict={self.state_inputs: state})[0]
                action = self.policy.add_noise(action)

            prev_state, prev_skip = state, skip

            state, reward, done, _ = env.step(action)
            state, skip = self.prep.process(state)

            total_reward += reward

            # store the transition only when BOTH of its endpoint states were
            # produced without a skip (the original tested the old flag twice)
            if not prev_skip and not skip:
                self.buffer.add({
                    "state": prev_state[0],
                    "action": action,
                    "reward": reward,
                    "next_state": state[0],
                    "done": int(done)
                })

            if self.step >= self.steps_before_train and not self.solved:
                # learn
                for _ in range(self.train_freq):
                    self.learn()
                    self._advance_step()
            else:
                self._advance_step()

            if done:
                break

        summary_value = summary_pb2.Summary.Value(tag="episode_reward",
                                                  simple_value=total_reward)
        episode_summary = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(episode_summary, global_step=self.step)

        if self.max_reward is not None:
            self.solved = total_reward >= self.max_reward

        if self.step == self.max_iters:
            self.saver.save(self.session, self.summary_dir, global_step=self.step)

        return total_reward, self.step

    def close(self):
        """Close the TensorFlow session."""
        self.session.close()
class DQN(BaseAgent):
    """DQN agent that also trains one single-output Q-network per action.

    Besides the joint policy network inherited from ``BaseAgent``, three
    one-output networks (back / stay / forward) are each trained from their
    own replay buffer as baselines. Mini-batches are composed from the three
    per-action buffers according to ``params['ratioMap']`` and
    ``params['sampleSize']``.

    Args:
        features: input size forwarded to ``Network``.
        actions: number of outputs of the joint network.
        state_array: fixed set of states on which mean Q-values are logged
            after every update.
        params: parameter mapping; in addition to the base-class keys it must
            provide ``ratioMap`` and ``sampleSize``.
    """

    def __init__(self, features, actions, state_array, params):
        super(DQN, self).__init__(features, actions, params)

        # one replay buffer per action (0 = back, 1 = stay, 2 = forward)
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)

        # one single-output online/target pair per action
        self.back_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_q_net.cloneWeightsTo(self.back_target_q_net)

        self.stay_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)

        self.forward_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        self.optimizerBack = torch.optim.Adam(self.back_q_net.parameters(),
                                              lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerStay = torch.optim.Adam(self.stay_q_net.parameters(),
                                              lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerForward = torch.optim.Adam(self.forward_q_net.parameters(),
                                                 lr=self.alpha, betas=(0.9, 0.999))

        # logged mean Q-values of the joint network, one list per action
        self.back_values = []
        self.stay_values = []
        self.forward_values = []

        # logged mean Q-values of the per-action baseline networks
        self.back_values_baseline = []
        self.stay_values_baseline = []
        self.forward_values_baseline = []

        self.td_loss = []
        self.state_array = state_array
        # second output of the policy network, collected at every update
        self.penultimate_features = []

        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']

    def updateNetwork(self, samples):
        """Take one Q-learning step on the joint policy network.

        Also records the TD loss and the policy network's mean Q-value per
        action over ``self.state_array``.
        """
        # organize the mini-batch so that "columns" can be requested from it
        batch = getBatchColumns(samples)

        # Q(s, a) for each sample in the mini-batch
        Qs, x = self.policy_net(batch.states)
        Qsa = Qs.gather(1, batch.actions).squeeze()
        self.penultimate_features.append(x)

        # by default Q(s', a') = 0 unless the next state is non-terminal
        Qspap = torch.zeros(batch.size, device=device)

        # only bootstrap when there is at least one non-terminal next state
        if batch.nterm_sp.shape[0] > 0:
            Qsp, _ = self.target_net(batch.nterm_sp)
            # max Q-value of the next state, written at non-terminal indices
            Qspap[batch.nterm] = Qsp.max(1).values

        # semi-gradient target: the bootstrapping term is detached
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure no gradients are left over from the previous update
        self.optimizer.zero_grad()
        self.target_net.zero_grad()

        td_loss.backward()
        # .cpu() keeps the conversion valid when training on a GPU
        self.td_loss.append(td_loss.detach().cpu().numpy())

        # logging only, so no autograd graph is needed
        with torch.no_grad():
            Qs_state_array, _ = self.policy_net(self.state_array)
            Qsa_mean_states = torch.mean(Qs_state_array, 0)
        self.back_values.append(Qsa_mean_states[0].cpu().numpy())
        self.stay_values.append(Qsa_mean_states[1].cpu().numpy())
        self.forward_values.append(Qsa_mean_states[2].cpu().numpy())

        # update the *policy network*
        self.optimizer.step()

    def updateActionNet(self, samples, q_net, target_q_net, optimizer, storeList):
        """Take one Q-learning step on a single-action baseline network.

        Args:
            samples: mini-batch drawn from that action's replay buffer.
            q_net: the single-output network being trained.
            target_q_net: its target copy (only has its gradients cleared).
            optimizer: optimizer bound to ``q_net``.
            storeList: list receiving ``q_net``'s mean value over
                ``self.state_array``.
        """
        batch = getBatchColumns(samples)

        Qs, _ = q_net(batch.states)
        # q_net has one output column; drop it to match the 1-D target.
        # (The original left this assignment commented out, so ``Qsa`` was
        # undefined at the loss below.)
        Qsa = Qs.squeeze(1)

        Qspap = torch.zeros(batch.size, device=device)

        if batch.nterm_sp.shape[0] > 0:
            # bootstrap from the max over all three per-action target nets
            # NOTE(review): the original author flagged this choice with
            # "Is this correct ????" -- confirm it is the intended target.
            Qsp_back, _ = self.back_target_q_net(batch.nterm_sp)
            Qsp_stay, _ = self.stay_target_q_net(batch.nterm_sp)
            Qsp_forward, _ = self.forward_target_q_net(batch.nterm_sp)
            Qsp = torch.hstack([Qsp_back, Qsp_stay, Qsp_forward])

            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # semi-gradient target: the bootstrapping term is detached
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure no gradients are left over from the previous update
        optimizer.zero_grad()
        target_q_net.zero_grad()
        self.back_target_q_net.zero_grad()
        self.stay_target_q_net.zero_grad()
        self.forward_target_q_net.zero_grad()

        td_loss.backward()

        # logging only, so no autograd graph is needed
        with torch.no_grad():
            Qs_state_array, _ = q_net(self.state_array)
            Qsa_mean_states = torch.mean(Qs_state_array, 0)
        storeList.append(Qsa_mean_states[0].cpu().numpy())

        optimizer.step()

    def update(self, s, a, sp, r, gamma):
        """Store a transition, refresh targets when due and train all networks.

        The transition is added to the shared buffer and to the buffer of its
        action. Training runs only when every per-action buffer holds more
        transitions than its share of ``sampleSize``.
        """
        transition = (s, a, sp, r, gamma)

        per_action_buffers = {
            0: self.buffer_BACK,
            1: self.buffer_STAY,
            2: self.buffer_FORWARD,
        }
        action_buffer = per_action_buffers.get(a.item())
        if action_buffer is not None:
            action_buffer.add(transition)

        # the "online" sample gets tossed into the replay buffer
        self.buffer.add(transition)
        self.steps += 1

        # refresh every target net from its online net before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)
            self.back_q_net.cloneWeightsTo(self.back_target_q_net)
            self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)
            self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        back_sample_count = math.floor(self.ratioMap.backward_ratio * self.sampleSize)
        stay_sample_count = math.floor(self.ratioMap.stay_ratio * self.sampleSize)
        forward_sample_count = math.floor(self.ratioMap.forward_ratio * self.sampleSize)

        enough_samples = (len(self.buffer_BACK) > back_sample_count
                          and len(self.buffer_STAY) > stay_sample_count
                          and len(self.buffer_FORWARD) > forward_sample_count)
        if not enough_samples:
            return

        samplesBack, _ = self.buffer_BACK.sample(back_sample_count)
        samplesStay, _ = self.buffer_STAY.sample(stay_sample_count)
        samplesForward, _ = self.buffer_FORWARD.sample(forward_sample_count)

        self.updateActionNet(samplesBack, self.back_q_net, self.back_target_q_net,
                             self.optimizerBack, self.back_values_baseline)
        self.updateActionNet(samplesStay, self.stay_q_net, self.stay_target_q_net,
                             self.optimizerStay, self.stay_values_baseline)
        self.updateActionNet(samplesForward, self.forward_q_net,
                             self.forward_target_q_net, self.optimizerForward,
                             self.forward_values_baseline)

        # the joint network learns from the ratio-balanced union of the batches
        samples = samplesBack + samplesStay + samplesForward
        self.updateNetwork(samples)
class DDPG_Agent(Agent):
    """Interacts with and learns from the environment."""

    policy_type = "DDPG"

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the seed itself
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = DDPG_Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = DDPG_Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPG_Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = DDPG_Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Statistics
        self.stats = {
            "actor_loss": [],
            "critic_loss": [],
            "reward_sum": [],
        }

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Noise from ``self.noise`` is added when ``add_noise`` is true, and the
        result is clipped to [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)
        # evaluate in eval mode, then restore train mode for learning
        self.actor_local.eval()
        action = self.actor_local.select_action(state)
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Each call also appends the target networks' critic and actor losses to
        ``saveDDPG_critic-actor_loss.csv`` and records statistics through
        ``save_stats``.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a constant for the critic update. Building it under
        # no_grad stops backward() from accumulating gradients in the target
        # networks, which no optimizer ever consumes.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # --------------------------- for the plot ----------------------------- #
        # losses of the (just updated) target networks on the same batch
        with torch.no_grad():
            actions_pred_target = self.actor_target(states)
            actor_loss_target = -self.critic_target(states, actions_pred_target).mean()
            Q_expected_target = self.critic_target(states, actions)
            critic_loss_target = F.mse_loss(Q_expected_target, Q_targets)

        with open("saveDDPG_critic-actor_loss.csv", "a") as loss_file:
            loss_file.write(
                str(critic_loss_target.item()) + "," + str(actor_loss_target.item()) + "\n")

        self.save_stats(actor_loss=actor_loss.item(),
                        critic_loss=critic_loss.item(),
                        reward_sum=rewards.sum().item())

    def store_policy(self, env_name, score):
        """Script the target actor with TorchScript and save it under ``data/policies/``."""
        scripted = torch.jit.script(self.actor_target)
        torch.jit.save(
            scripted,
            "data/policies/" + "DDPGAgent" + str(env_name) + "#" + str(score) + ".zip")

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)
class BaseAgent:
    """Epsilon-greedy replay-based agent that also counts state-action visits.

    Builds an online ("policy") network and a periodically refreshed target
    network. Subclasses implement the learning rule in ``updateNetwork``.

    Args:
        features: input size forwarded to ``Network``.
        actions: number of actions (output size of the networks).
        params: mapping providing ``alpha``, ``epsilon``, ``target_refresh``,
            ``buffer_size``, ``h1`` and ``h2``.
    """

    # Mini-batch size; the buffer must hold strictly more than this many
    # transitions before learning starts.
    BATCH_SIZE = 32

    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']
        self.h1 = params['h1']
        self.h2 = params['h2']

        # two networks: the "online" learning policy and a fixed target network
        self.policy_net = Network(features, self.h1, self.h2, actions).to(device)
        self.target_net = Network(features, self.h1, self.h2, actions).to(device)

        # the optimizer covers _only_ the policy network; target network
        # parameters are copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # visit counts indexed by (s[0][0], s[0][1], action)
        # NOTE(review): ``env`` is read from module scope rather than passed in
        self.actionCounter = np.zeros((env.width, env.height, env.num_actions))

        # start the target network from the policy network's weights
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        """Return an epsilon-greedy action for state ``x`` as a 0-d tensor."""
        # take a random action about epsilon percent of the time
        if np.random.rand() < self.epsilon:
            return torch.tensor(np.random.randint(self.actions), device=device)

        # otherwise take a greedy action
        q_s, _ = self.policy_net(x)
        return q_s.argmax().detach()

    def updateNetwork(self, samples):
        """Learning step on a sampled mini-batch; a no-op in the base class."""
        pass

    def update(self, s, a, r, sp, gamma):
        """Store one transition, count the visit and learn when possible.

        Args:
            s: state tensor; its first row supplies the two counter indices.
            a: action tensor.
            r: reward.
            sp: next state.
            gamma: discount attached to this transition.
        """
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, r, sp, gamma))
        self.steps += 1

        # move to host memory first: .numpy() raises on CUDA tensors
        a = a.detach().cpu().numpy()
        s = s.detach().cpu().numpy()
        self.actionCounter[s[0][0]][s[0][1]][a] += 1

        # refresh target net <- policy net before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # a single update per call once the buffer is large enough
        if len(self.buffer) > self.BATCH_SIZE:
            samples, _ = self.buffer.sample(self.BATCH_SIZE)
            self.updateNetwork(samples)
class BaseAgent:
    """Base class for value-based agents with a replay buffer and target net.

    Subclasses implement ``forward``, ``bootstrap`` and ``updateNetwork``.

    Args:
        features: input size forwarded to ``Network``.
        actions: number of actions forwarded to ``Network``.
        params: parameter mapping. ``gamma``, ``optimizer``, ``buffer_size``
            and ``batch`` are required (plus ``prioritization`` when
            ``buffer`` is ``'per'``); ``epsilon``, ``omega``, ``buffer`` and
            ``target_refresh`` are optional.
        seed: seed forwarded to the network and to the policy.
        collector: stored on the instance; not used by this class itself.
    """

    def __init__(self, features: int, actions: int, params: Dict, seed: int,
                 collector: "Collector"):
        self.features = features
        self.actions = actions
        self.params = params
        self.collector = collector
        self.seed = seed

        # define parameter contract
        self.gamma = params['gamma']
        self.epsilon = params.get('epsilon', 0)
        # the mellowmax parameter
        self.omega = params.get('omega', 1.0)

        # set up network for estimating Q(s, a)
        self.value_net = Network(features, actions, params, seed).to(device)

        # build the optimizer
        self.optimizer_params = params['optimizer']
        self.optimizer = deserializeOptimizer(self.value_net.parameters(),
                                              self.optimizer_params)

        self.steps = 0

        # set up the replay buffer
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch']
        self.buffer_type = params.get('buffer', 'standard')

        if self.buffer_type == 'per':
            prioritization = params['prioritization']
            self.buffer = PrioritizedReplayMemory(self.buffer_size, prioritization)
        else:
            self.buffer = ReplayBuffer(self.buffer_size)

        # build a target network
        self.target_refresh = params.get('target_refresh', 1)
        self.target_net = copy.deepcopy(self.value_net)
        self.initializeTargetNet()

        def getValues(x: torch.Tensor):
            # Q-values as a numpy array, with a leading batch dim of 1 removed
            return self.values(x).detach().cpu().squeeze(0).numpy()

        self.policy = createEpsilonGreedy(seed, self.epsilon, getValues)

    def values(self, x):
        """Return the Q(s, a) values from the value network."""
        return self.value_net(x)[0]

    def selectAction(self, x):
        """Sample an action according to the epsilon-greedy policy."""
        return self.policy.selectAction(x)

    def initializeTargetNet(self):
        """(Re)create the target network from the value network.

        With ``target_refresh <= 1`` target networks are disabled and
        ``target_net`` is simply an alias of ``value_net``, which saves compute.
        """
        if self.target_refresh > 1:
            self.target_net = copy.deepcopy(self.value_net)
            cloneNetworkWeights(self.value_net, self.target_net)
        else:
            self.target_net = self.value_net

    @abstractmethod
    def updateNetwork(self, batch: "Batch", predictions: Dict):
        pass

    @abstractmethod
    def forward(self, batch: "Batch") -> Dict[str, torch.Tensor]:
        pass

    @abstractmethod
    def bootstrap(self, batch: "Batch", next_values: torch.Tensor) -> Dict[str, torch.Tensor]:
        pass

    def combineTargetGrads(self):
        """Add the target network's gradients into the value network.

        Does nothing when target networks are disabled, i.e. when both names
        refer to the same module.
        """
        # identity is the intended test: is the target just an alias?
        if self.target_net is self.value_net:
            return

        addGradients_(self.value_net, self.target_net)

    def update(self, s, a, sp, r, gamma):
        """Store one transition, refresh the target net and learn from a batch.

        Args:
            s: state.
            a: action taken in ``s``.
            sp: next state.
            r: reward.
            gamma: discount attached to this transition.
        """
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # Check the "target nets enabled" guard first so that a
        # target_refresh of 0 cannot trigger a modulo-by-zero.
        if self.target_refresh > 1 and self.steps % self.target_refresh == 0:
            cloneNetworkWeights(self.value_net, self.target_net)

        if len(self.buffer) > self.batch_size + 1:
            samples, idcs = self.buffer.sample(self.batch_size)
            batch = getBatchColumns(samples)
            predictions = self.forward(batch)
            tde = self.updateNetwork(batch, predictions)

            # hand the value returned by updateNetwork back to the buffer
            self.buffer.update_priorities(idcs, tde)
class DDPG:
    """Actor-critic agent (DDPG) built with TensorFlow 1.x graph/session APIs.

    The constructor builds four networks (actor, critic and a target copy of
    each), their training ops and summaries, creates a session and
    initializes all variables. ``learn`` performs one critic step, one actor
    step and one update of both target networks from a replay-buffer sample.
    """

    # variable-scope names; also used to look up per-network variables and
    # batch-norm update ops by scope
    CRITIC_NAME = "critic"
    TARGET_CRITIC_NAME = "target_critic"
    ACTOR_NAME = "actor"
    TARGET_ACTOR_NAME = "target_actor"

    def __init__(self, state_dim, action_dim, monitor_directory,
                 actor_learning_rate=1e-5, critic_learning_rate=1e-3,
                 critic_target_update_rate=1e-3, actor_target_update_rate=1e-3,
                 discount=0.99, l2_decay=1e-2, buffer_size=1000000,
                 batch_size=64, detail_summary=False, tanh_action=True,
                 input_batch_norm=True, all_batch_norm=True, log_frequency=10):
        """Store hyper-parameters, build the graph and start a session.

        Args:
            state_dim: width of the state placeholders.
            action_dim: width of the action placeholder and actor output.
            monitor_directory: parent directory; summaries go to its
                ``"summary"`` sub-path (passed through ``utils.new_summary_dir``).
            actor_learning_rate: learning rate of the actor's Adam optimizer.
            critic_learning_rate: learning rate of the critic's Adam optimizer.
            critic_target_update_rate: rate passed to
                ``architect.create_target_update_ops`` for the critic.
            actor_target_update_rate: same, for the actor.
            discount: factor multiplying the target critic's value in the
                critic target.
            l2_decay: coefficient of the L2 penalty added to the critic loss.
            buffer_size: capacity passed to ``ReplayBuffer``.
            batch_size: number of transitions sampled per ``learn`` call.
            detail_summary: if true, input and actor histograms are also
                built and written.
            tanh_action: if true, the actor output is passed through tanh.
            input_batch_norm: if true, batch-normalize the state input of
                every network.
            all_batch_norm: if true, also batch-normalize hidden
                pre-activations (first layer of the critic, both hidden
                layers of the actor).
            log_frequency: critic summaries are written whenever the global
                step is a multiple of this value.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.critic_learning_rate = critic_learning_rate
        self.actor_learning_rate = actor_learning_rate
        self.critic_target_update_rate = critic_target_update_rate
        self.actor_target_update_rate = actor_target_update_rate
        self.discount = discount
        self.batch_size = batch_size
        self.l2_decay = l2_decay
        self.buffer_size = buffer_size
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detail_summary = detail_summary
        self.tanh_action = tanh_action
        self.input_batch_norm = input_batch_norm
        self.all_batch_norm = all_batch_norm
        self.log_frequency = log_frequency

        # step and solved are initialized here but not read anywhere in this class
        self.step = 0
        self.solved = False

        self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)

        # the whole graph must exist before the Saver and the initializer op
        # are created below
        self.__build()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "actor learning rate": self.actor_learning_rate,
                "critic learning rate": self.critic_learning_rate,
                "batch size": self.batch_size,
                "actor update rate": self.actor_target_update_rate,
                "critic update rate": self.critic_target_update_rate,
                "buffer size": self.buffer_size,
            })

        self.saver = tf.train.Saver(max_to_keep=None)
        init_op = tf.global_variables_initializer()
        self.session = tf.Session()
        # merged is created here but not run anywhere in this class
        self.merged = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph)
        self.session.run(init_op)

    """ PUBLIC """

    def learn(self):
        """Run one training iteration on a batch sampled from the buffer.

        Trains the critic, then the actor, then runs both target-network
        update ops together with the global-step increment.
        """
        batch = self.buffer.sample(self.batch_size)
        self.__train_critic(batch["states"], batch["actions"], batch["rewards"],
                            batch["next_states"], batch["done"])
        self.__train_actor(batch["states"])
        self.session.run([
            self.target_critic_update, self.target_actor_update, self.inc_global_step
        ])

    def act(self, state):
        """Return the actor's action for ``state`` with ``is_training`` False.

        The action op is evaluated on ``state`` and the first row of the
        result is returned, so ``state`` is expected to carry a leading
        batch axis (the placeholder shape is ``(None, state_dim)``).
        """
        a = self.session.run(self.action, feed_dict={
            self.state_input: state,
            self.is_training: False
        })[0]
        return a

    def perceive(self, transition):
        """Add ``transition`` to the replay buffer."""
        self.buffer.add(transition)

    def log_scalar(self, name, value, index):
        """Write a single scalar summary tagged ``name`` at global step ``index``."""
        summary_value = summary_pb2.Summary.Value(tag=name, simple_value=value)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=index)

    def save(self):
        """Save the session with ``summary_dir`` as the checkpoint path and the
        current value of the global-step variable as the step suffix."""
        self.saver.save(self.session, self.summary_dir,
                        global_step=self.session.run(self.global_step))

    def close(self):
        """Close the TensorFlow session."""
        self.session.close()

    """ PRIVATE """

    def __build_critic(self, name, state_input, action_input):
        """Build a critic network under variable scope ``name``.

        Two hidden ReLU layers of 400 and 300 units; the action enters at the
        second hidden layer through its own weight matrix. Returns a tuple
        ``(output_layer, weight_decay)`` where ``weight_decay`` is the sum of
        ``l2_decay * tf.nn.l2_loss(var)`` over every weight and bias listed
        below.
        """
        # the target critic always runs batch norm with training=False
        bn_training = self.is_training
        if name == self.TARGET_CRITIC_NAME:
            bn_training = False

        with tf.variable_scope(name):
            # weights and biases
            # (second argument of __get_weights sets the uniform init range
            # to +-1/sqrt(value))
            W1 = self.__get_weights((self.state_dim, 400), self.state_dim, name="W1")
            b1 = self.__get_weights((400, ), self.state_dim, name="b1")
            W2 = self.__get_weights((400, 300), 400 + self.action_dim, name="W2")
            b2 = self.__get_weights((300, ), 400 + self.action_dim, name="b2")
            W2_action = self.__get_weights((self.action_dim, 300), 400 + self.action_dim,
                                           name="W2_action")
            # output layer is initialized in the fixed range [-3e-3, 3e-3]
            W3 = tf.Variable(tf.random_uniform((300, 1), -3e-3, 3e-3), name="W3")
            b3 = tf.Variable(tf.random_uniform((1, ), -3e-3, 3e-3), name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)
            layer_1 = tf.matmul(state_input, W1) + b1
            if self.all_batch_norm:
                layer_1 = tf.layers.batch_normalization(layer_1, training=bn_training)
            layer_1 = tf.nn.relu(layer_1)
            # state features and action are combined additively before the ReLU
            layer_2 = tf.nn.relu(
                tf.matmul(layer_1, W2) + tf.matmul(action_input, W2_action) + b2)
            output_layer = tf.matmul(layer_2, W3) + b3

            # summary (only for the online critic, not the target copy)
            if name == self.CRITIC_NAME:
                self.critic_summaries = [
                    tf.summary.histogram("W1", W1),
                    tf.summary.histogram("b1", b1),
                    tf.summary.histogram("W2", W2),
                    tf.summary.histogram("b2", b2),
                    tf.summary.histogram("W2_action", W2_action),
                    tf.summary.histogram("W3", W3),
                    tf.summary.histogram("b3", b3),
                    tf.summary.histogram("layer_1", layer_1),
                    tf.summary.histogram("layer_2", layer_2),
                    tf.summary.histogram("output_layer", output_layer)
                ]

            # weight decay (note: the list includes the bias vectors as well)
            weights = [W1, b1, W2, b2, W2_action, W3, b3]
            weight_decay = tf.add_n(
                [self.l2_decay * tf.nn.l2_loss(var) for var in weights])

            return output_layer, weight_decay

    def __build_actor(self, name, state_input):
        """Build an actor network under variable scope ``name``.

        Two hidden ReLU layers of 400 and 300 units followed by a linear
        layer of width ``action_dim``. Returns the output tensor, passed
        through tanh when ``tanh_action`` is set.
        """
        # the target actor always runs batch norm with training=False
        bn_training = self.is_training
        if name == self.TARGET_ACTOR_NAME:
            bn_training = False

        with tf.variable_scope(name):
            # weights and biases
            W1 = self.__get_weights((self.state_dim, 400), self.state_dim, name="W1")
            b1 = self.__get_weights((400, ), self.state_dim, name="b1")
            W2 = self.__get_weights((400, 300), 400, name="W2")
            b2 = self.__get_weights((300, ), 400, name="b2")
            # output layer is initialized in the fixed range [-3e-3, 3e-3]
            W3 = tf.Variable(tf.random_uniform((300, self.action_dim), minval=-3e-3,
                                               maxval=3e-3), name="W3")
            b3 = tf.Variable(tf.random_uniform((self.action_dim, ), -3e-3, 3e-3), name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)
            layer_1 = tf.matmul(state_input, W1) + b1
            if self.all_batch_norm:
                layer_1 = tf.layers.batch_normalization(layer_1, training=bn_training)
            layer_1 = tf.nn.relu(layer_1)
            layer_2 = tf.matmul(layer_1, W2) + b2
            if self.all_batch_norm:
                layer_2 = tf.layers.batch_normalization(layer_2, training=bn_training)
            layer_2 = tf.nn.relu(layer_2)
            output_layer = tf.matmul(layer_2, W3) + b3

            # summary (only for the online actor, not the target copy)
            if name == self.ACTOR_NAME:
                self.actor_summaries = [
                    tf.summary.histogram("W1", W1),
                    tf.summary.histogram("b1", b1),
                    tf.summary.histogram("W2", W2),
                    tf.summary.histogram("b2", b2),
                    tf.summary.histogram("W3", W3),
                    tf.summary.histogram("b3", b3),
                    tf.summary.histogram("layer_1", layer_1),
                    tf.summary.histogram("layer_2", layer_2),
                    tf.summary.histogram("output_layer", output_layer)
                ]

            if self.tanh_action:
                return tf.nn.tanh(output_layer)
            else:
                return output_layer

    def __build(self):
        """Create placeholders, the four networks, losses, train ops,
        target-update ops, the global step and the merged summaries."""
        self.state_input = tf.placeholder(tf.float32, shape=(None, self.state_dim),
                                          name="state_input")
        self.next_state_input = tf.placeholder(tf.float32, shape=(None, self.state_dim),
                                               name="next_state_input")
        self.action_input = tf.placeholder(tf.float32, shape=(None, self.action_dim),
                                           name="action_input")
        self.reward_input = tf.placeholder(tf.float32, shape=(None, ), name="reward_input")
        self.done_input = tf.placeholder(tf.float32, shape=(None, ), name="done_input")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        # inputs summary
        if self.detail_summary:
            self.input_summaries = [
                tf.summary.histogram("state", self.state_input),
                tf.summary.histogram("next_state", self.next_state_input),
                tf.summary.histogram("action", self.action_input),
                tf.summary.histogram("reward", self.reward_input),
                tf.summary.histogram("done", self.done_input)
            ]

        # target actor acts on the next state; its action feeds the target critic
        self.target_action = self.__build_actor(self.TARGET_ACTOR_NAME, self.next_state_input)
        self.q_value, weight_decay = self.__build_critic(
            self.CRITIC_NAME, self.state_input, self.action_input)
        self.target_q_value, _ = self.__build_critic(self.TARGET_CRITIC_NAME,
                                                     self.next_state_input, self.target_action)

        # tmp is not referenced anywhere else in this class
        self.tmp = tf.expand_dims(self.reward_input, 1)
        # targets = reward + discount * (1 - done) * target_q; reward and done
        # get a trailing axis so they line up with the critic's (N, 1) output
        self.targets = tf.expand_dims(self.reward_input, 1) + self.discount * (
            1 - tf.expand_dims(self.done_input, 1)) * self.target_q_value
        # diff is not referenced anywhere else in this class
        self.diff = self.targets - self.q_value
        # mean squared error against the (gradient-stopped) targets plus the
        # critic's L2 penalty
        self.loss = tf.reduce_mean(
            tf.square(tf.stop_gradient(self.targets) - self.q_value)) + weight_decay
        self.loss_summary = tf.summary.scalar("critic_loss", self.loss)
        self.critic_train_op = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(self.loss)

        # add critic batch norm. update
        # (batch-norm update ops are collected by scope and grouped into the
        # train op so they run on every critic step)
        if self.input_batch_norm or self.all_batch_norm:
            self.critic_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.CRITIC_NAME)
            self.critic_bn_update_op = tf.group(*self.critic_bn_update_op)
            self.critic_train_op = tf.group(self.critic_train_op, self.critic_bn_update_op)

        self.action = self.__build_actor(self.ACTOR_NAME, self.state_input)
        self.actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              scope=self.ACTOR_NAME)
        # gradient of the critic output w.r.t. the action placeholder
        self.action_gradients = tf.gradients(self.q_value, self.action_input)[0]
        # gradients of the actor output w.r.t. its parameters, weighted by the
        # negated action gradients (passed as grad_ys)
        self.actor_params_gradient = tf.gradients(self.action, self.actor_params,
                                                  -self.action_gradients)

        # actor gradients summary
        if self.detail_summary:
            self.actor_summaries.append(
                tf.summary.histogram("action_gradient", self.action_gradients))
            for grad in self.actor_params_gradient:
                self.actor_summaries.append(
                    tf.summary.histogram("actor_parameter_gradients", grad))

        self.actor_train_op = tf.train.AdamOptimizer(
            self.actor_learning_rate).apply_gradients(
                zip(self.actor_params_gradient, self.actor_params))

        # add actor batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.actor_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.ACTOR_NAME)
            self.actor_bn_update_op = tf.group(*self.actor_bn_update_op)
            self.actor_train_op = tf.group(self.actor_train_op, self.actor_bn_update_op)

        # ops that move each target network towards its online counterpart;
        # the exact update rule lives in architect.create_target_update_ops
        self.target_critic_update = architect.create_target_update_ops(
            self.CRITIC_NAME, self.TARGET_CRITIC_NAME, self.critic_target_update_rate)
        self.target_actor_update = architect.create_target_update_ops(
            self.ACTOR_NAME, self.TARGET_ACTOR_NAME, self.actor_target_update_rate)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step, tf.add(self.global_step, 1))

        # group summaries
        # (the merged critic summaries are not run anywhere in this class)
        self.critic_summaries = tf.summary.merge(self.critic_summaries)
        if self.detail_summary:
            self.actor_summaries = tf.summary.merge(self.actor_summaries)
            self.input_summaries = tf.summary.merge(self.input_summaries)

    @staticmethod
    def __get_weights(shape, input_shape, name="var"):
        """Return a variable of ``shape`` drawn uniformly from
        ``[-1/sqrt(input_shape), 1/sqrt(input_shape)]``."""
        return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(input_shape),
                                             1 / math.sqrt(input_shape)), name=name)

    def __train_actor(self, states):
        """Run one actor update for ``states``.

        The actor's own actions for ``states`` are computed first and then
        fed back as ``action_input``, so the action gradients are evaluated
        at those actions.
        """
        actions = self.session.run(self.action, feed_dict={
            self.state_input: states,
            self.is_training: True
        })
        self.session.run(self.actor_train_op, feed_dict={
            self.state_input: states,
            self.action_input: actions,
            self.is_training: True
        })

    def __train_critic(self, states, actions, rewards, next_states, done):
        """Run one critic update; every ``log_frequency`` global steps also
        fetch and write the loss summary (and, with ``detail_summary``, the
        actor and input summaries)."""
        feed_dict = {
            self.state_input: states,
            self.action_input: actions,
            self.reward_input: rewards,
            self.next_state_input: next_states,
            self.done_input: done,
            self.is_training: True
        }

        step = self.session.run(self.global_step)

        if step % self.log_frequency == 0:
            ops = [self.critic_train_op, self.loss_summary]
            if self.detail_summary:
                ops.append(self.actor_summaries)
                ops.append(self.input_summaries)

            res = self.session.run(ops, feed_dict=feed_dict)

            # res[0] is the train op's result; summaries start at index 1
            self.summary_writer.add_summary(res[1], global_step=step)
            if self.detail_summary:
                # res[2] = actor summaries, res[3] = input summaries
                self.summary_writer.add_summary(res[2], global_step=step)
                self.summary_writer.add_summary(res[3], global_step=step)
        else:
            self.session.run(self.critic_train_op, feed_dict=feed_dict)
class DeepQNetwork:
    """Deep Q-learning agent built with TensorFlow 1.x graph/session APIs.

    The constructor builds an online and a target Q-network through the
    supplied ``network`` builder, a TD loss with an Adam optimizer, and either
    a soft or a hard target-update op. ``run_episode`` plays one episode in an
    environment, stores transitions in a replay buffer and trains online.
    """

    ACTION_VALUE_NET_NAME = "q-network"
    TARGET_ACTION_VALUE_NET_NAME = "target-q-network"

    def __init__(self, network, prep, exp_policy, state_dim, action_dim, name,
                 learning_rate=1e-3, hard_update_frequency=500, soft_update_rate=None,
                 buffer_size=50000, batch_size=32, num_steps=200000, discount=0.99,
                 use_huber_loss=True, detailed_summary=False, max_reward=200,
                 steps_before_learn=1000, train_freq=1, save_end=True):
        """Store hyper-parameters, build the graph and start a session.

        Args:
            network: builder whose ``build(state_dim, action_dim, scope_name)``
                returns ``(input_placeholder, q_values_tensor)``.
            prep: preprocessor whose ``process(state)`` returns
                ``(state, skip)``.
            exp_policy: exploration policy exposing ``select_action(q_values)``.
            state_dim: forwarded to ``network.build`` and ``ReplayBuffer``.
            action_dim: number of discrete actions (width of the one-hot
                action placeholder).
            name: parent directory; summaries go to its ``"summary"`` sub-path
                (passed through ``utils.new_summary_dir``).
            learning_rate: learning rate of the Adam optimizer.
            hard_update_frequency: with hard updates, the target network is
                overwritten whenever ``step`` is a multiple of this value.
            soft_update_rate: if not ``None``, soft target updates with this
                rate are used instead of hard updates.
            buffer_size: capacity passed to ``ReplayBuffer``.
            batch_size: number of transitions sampled per ``learn`` call.
            num_steps: step count at which the final checkpoint is written.
            discount: factor multiplying the max target Q-value.
            use_huber_loss: use ``architect.huber_loss`` instead of squared
                error on the TD difference.
            detailed_summary: also record summaries of the TD targets.
            max_reward: episode return at or above which the agent is marked
                as solved (it then acts greedily and stops learning).
            steps_before_learn: no learning happens before this many steps.
            train_freq: learn only on steps that are a multiple of this value.
            save_end: whether to write the checkpoint at ``num_steps``.
        """
        self.network = network
        self.prep = prep
        self.exp_policy = exp_policy
        self.greedy_policy = policy.Greedy()

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discount = discount
        self.summary_dir = os.path.join(name, "summary")
        self.use_huber_loss = use_huber_loss
        self.detailed_summary = detailed_summary

        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.hard_update_frequency = hard_update_frequency
        self.soft_update_rate = soft_update_rate
        self.num_steps = num_steps

        self.step = 0
        self.steps_before_learn = steps_before_learn
        self.train_freq = train_freq
        self.solved = False
        self.max_reward = max_reward
        self.save_end = save_end

        # graph handles, filled in by build_all()
        self.actions = None
        self.rewards = None
        self.done = None
        self.action_q_values = None
        self.max_target_q_values = None
        self.targets = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.states = None
        self.q_values = None
        self.next_states = None
        self.target_q_values = None
        self.target_update = None

        self.build_all()

        self.merged = tf.summary.merge_all()
        self.session = tf.Session()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph)

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

        self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)

    def build_all(self):
        """Build placeholders, both networks, the target-update op, the TD
        loss, the global step and the training op."""
        self.actions = tf.placeholder(tf.float32, (None, self.action_dim), name="actions")
        self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")
        self.done = tf.placeholder(tf.float32, (None,), name="done")

        self.build_network()
        self.build_target_network()

        if self.soft_update_rate is not None:
            self.create_soft_target_update_op()
        else:
            self.create_hard_target_update_op()

        # the one-hot action mask selects the Q-value of the action taken
        self.action_q_values = tf.reduce_sum(self.q_values * self.actions, axis=1)
        self.max_target_q_values = tf.reduce_max(self.target_q_values, axis=1)

        # targets = reward + (1 - done) * discount * max_a' Q_target(s', a')
        self.targets = self.rewards + (1 - self.done) * (self.discount * self.max_target_q_values)

        if self.detailed_summary:
            architect.variable_summaries(self.targets, name="targets")

        # no gradient flows into the target network through the targets
        td_diff = self.action_q_values - tf.stop_gradient(self.targets)

        if self.use_huber_loss:
            loss = tf.reduce_mean(architect.huber_loss(td_diff))
        else:
            loss = tf.reduce_mean(tf.pow(td_diff, 2))

        tf.summary.scalar("loss", loss)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step, tf.add(self.global_step, 1))

        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

    def build_network(self):
        """Build the online Q-network under ``ACTION_VALUE_NET_NAME``."""
        self.states, self.q_values = self.network.build(
            self.state_dim, self.action_dim, self.ACTION_VALUE_NET_NAME)

    def build_target_network(self):
        """Build the target Q-network under ``TARGET_ACTION_VALUE_NET_NAME``."""
        self.next_states, self.target_q_values = self.network.build(
            self.state_dim, self.action_dim, self.TARGET_ACTION_VALUE_NET_NAME)

    def _network_variable_pairs(self):
        """Return ``(online_var, target_var)`` pairs of trainable variables,
        matched by their order within each network's scope."""
        net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self.ACTION_VALUE_NET_NAME)
        target_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=self.TARGET_ACTION_VALUE_NET_NAME)
        return list(zip(net_vars, target_net_vars))

    def create_soft_target_update_op(self):
        """Create an op that moves every target variable a fraction
        ``soft_update_rate`` of the way towards its online counterpart."""
        # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_ops = [
            v_target.assign_sub(self.soft_update_rate * (v_target - v_source))
            for v_source, v_target in self._network_variable_pairs()
        ]
        self.target_update = tf.group(*update_ops)

    def create_hard_target_update_op(self):
        """Create an op that copies every online variable into the
        corresponding target variable."""
        update_ops = [
            v_target.assign(v_source)
            for v_source, v_target in self._network_variable_pairs()
        ]
        self.target_update = tf.group(*update_ops)

    def learn(self):
        """Train on one sampled batch, log summaries, and update the target
        network (every call for soft updates, every
        ``hard_update_frequency`` steps for hard updates)."""
        # learn
        batch = self.buffer.sample(self.batch_size)

        merged, _ = self.session.run([self.merged, self.train_op], feed_dict={
            self.states: batch["states"],
            self.actions: batch["actions"],
            self.rewards: batch["rewards"],
            self.next_states: batch["next_states"],
            self.done: batch["done"]
        })

        self.summary_writer.add_summary(merged, global_step=self.step)

        # target update
        if self.soft_update_rate is not None:
            self.session.run(self.target_update)
        elif self.step % self.hard_update_frequency == 0:
            self.session.run(self.target_update)

    def run_episode(self, env):
        """Play one episode in ``env``, learning along the way.

        Args:
            env: environment exposing ``reset()``, ``step(action)`` returning
                ``(state, reward, done, info)`` and ``action_space.sample()``.

        Returns:
            Tuple ``(total_reward, step)``: the undiscounted episode return
            and the global step count after the episode.
        """
        state = env.reset()
        state, skip = self.prep.process(state)

        total_reward = 0

        while True:
            # play
            if skip:
                # the preprocessor has no usable state yet: act randomly
                action = env.action_space.sample()
            else:
                q_values = self.session.run(self.q_values, feed_dict={self.states: state})[0]
                if self.solved:
                    action = self.greedy_policy.select_action(q_values)
                else:
                    action = self.exp_policy.select_action(q_values)

            action_one_hot = np.zeros(self.action_dim)
            action_one_hot[action] = 1

            prev_state = state
            prev_skip = skip

            state, reward, done, _info = env.step(action)
            state, skip = self.prep.process(state)

            total_reward += reward

            # A transition is only usable when BOTH its start state and its
            # next state were produced by the preprocessor. The original test
            # repeated the same flag twice and never looked at the new one.
            if not prev_skip and not skip:
                self.buffer.add({
                    "state": prev_state[0],
                    "action": action_one_hot,
                    "reward": reward,
                    "next_state": state[0],
                    "done": int(done)
                })

            if self.step >= self.steps_before_learn and self.step % self.train_freq == 0 \
                    and not self.solved:
                # learn
                self.learn()

            # tf.assign returns the post-assignment value. Fetching the
            # variable next to the assign op in the same run() call has no
            # ordering guarantee and could yield the pre-increment value.
            self.step = self.session.run(self.inc_global_step)

            if done:
                break

        summary_value = summary_pb2.Summary.Value(tag="episode_reward", simple_value=total_reward)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=self.step)

        self.solved = bool(total_reward >= self.max_reward)

        # honour the save_end flag (it was accepted but ignored before);
        # NOTE(review): this only fires when an episode ends exactly on
        # num_steps - confirm that the caller's loop guarantees that.
        if self.save_end and self.step == self.num_steps:
            self.saver.save(self.session, self.summary_dir, global_step=self.step)

        return total_reward, self.step

    def close(self):
        """Close the TensorFlow session."""
        self.session.close()