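# The classes below are shown without their module header. A minimal sketch of the
# imports they rely on follows; the project-local modules (Agent, Basis, Critic,
# Policy, ActionRepresentation, CL_ActionRepresentation, MemoryBuffer, Trajectory,
# Actor, Q_fn, soft_update, hard_update, OrnsteinUhlenbeckActionNoise) live elsewhere
# in this repository, and their exact import paths are an assumption here.
import numpy as np
import torch
import torch.nn.functional as F
from torch import tensor, float32, long
from torch.autograd import Variable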
class embed_Reinforce(Agent):
    def __init__(self, config):
        super(embed_Reinforce, self).__init__(config)

        self.ep_rewards = []
        self.ep_states = []
        self.ep_actions = []
        self.ep_exec_action_embs = []
        self.ep_chosen_action_embs = []

        # Set hyper-parameters
        self.memory = MemoryBuffer(size=config.buffer_size)
        self.counter = 0
        # Initial training phase is required if the embeddings are being learnt
        self.initial_phase = not config.true_embeddings

        # Function to get state features
        if config.fourier_order > 0:
            self.state_features = Basis.Fourier_Basis(config=config)
        else:
            self.state_features = Basis.NN_Basis(config=config)

        # Function to get the action representation
        self.action_rep = Action_representation(
            state_dim=self.state_features.feature_dim,
            action_dim=self.action_dim, config=config)
        self.baseline = Critic.Critic(
            state_dim=self.state_features.feature_dim, config=config)

        # Create instances for the actor and the baseline
        self.atype = config.dtype
        self.actor = Policy.embed_Gaussian(
            action_dim=self.action_rep.reduced_action_dim,
            state_dim=self.state_features.feature_dim, config=config)
        self.action_size = self.action_dim

        self.modules = [('actor', self.actor), ('baseline', self.baseline),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep)]
        self.init()

    def get_action(self, state, explore=0.2):
        explore = 0  # Don't do eps-greedy with policy gradients.
        if self.initial_phase or np.random.rand() < explore:
            # Take random actions (uniform over the actual action space)
            # to observe the interactions initially
            action = np.random.randint(self.action_dim)
            exec_action_emb = self.action_rep.get_embedding(action).cpu().view(-1).data.numpy()
            chosen_action_emb = exec_action_emb
        else:
            state = np.float32(state)
            if len(state.shape) == 1:
                state = np.expand_dims(state, 0)
            state = self.state_features.forward(state)

            chosen_action_emb = self.actor.get_action_wo_dist(state, explore=0)
            action = self.action_rep.get_best_match(chosen_action_emb)

            exec_action_emb = self.action_rep.get_embedding(action).cpu().view(-1).data.numpy()
            chosen_action_emb = chosen_action_emb.cpu().view(-1).data.numpy()

        return action, (exec_action_emb, chosen_action_emb)

    def update(self, s1, a1, a_emb1, r1, s2, done):
        if not self.initial_phase:
            # Store the episode history
            self.ep_rewards.append(r1)
            self.ep_states.append(s1)
            self.ep_actions.append(int(a1))
            self.ep_exec_action_embs.append(a_emb1[0])
            self.ep_chosen_action_embs.append(a_emb1[1])

            if done:
                # Compute the discounted returns and do an on-policy update
                g_rewards, R = [], 0
                for r in self.ep_rewards[::-1]:
                    R = r + self.config.gamma * R
                    g_rewards.insert(0, R)
                self.optimize(np.float32(self.ep_states),
                              np.float32(self.ep_actions),
                              np.float32(self.ep_exec_action_embs),
                              np.float32(self.ep_chosen_action_embs),
                              np.float32(g_rewards))

                # Reset the episode history
                self.ep_rewards = []
                self.ep_states = []
                self.ep_actions = []
                self.ep_exec_action_embs = []
                self.ep_chosen_action_embs = []
        else:
            # a_emb1 gets ignored subsequently
            self.memory.add(s1, a1, a_emb1[0], r1, s2, int(done != 1), randomize=True)
            if self.memory.length >= self.config.buffer_size:
                # Action embeddings can be learnt offline
                self.initial_phase_training(max_epochs=self.config.initial_phase_epochs)

    def optimize(self, s1, a1, exec_a1_emb, chosen_a1_emb, r1):
        r1 = Variable(torch.from_numpy(r1).type(self.config.dtype),
                      requires_grad=False).view(-1, 1)
        exec_a1_emb = Variable(torch.from_numpy(exec_a1_emb).type(self.config.dtype),
                               requires_grad=False)
        chosen_a1_emb = Variable(torch.from_numpy(chosen_a1_emb).type(self.config.dtype),
                                 requires_grad=False)
        a1_emb = exec_a1_emb if self.config.emb_flag == 'exec' else chosen_a1_emb

        s1 = self.state_features.forward(s1)

        # ---------------------- optimize critic ----------------------
        val_pred = self.baseline.forward(s1)
        loss_baseline = F.mse_loss(val_pred, r1)
        # loss_baseline = F.smooth_l1_loss(val_pred, r1)

        # ---------------------- optimize actor ----------------------
        td_error = (r1 - val_pred).detach()
        if self.config.TIS:
            _, dist = self.actor.get_action(s1)
            exec_prob = self.actor.get_prob_from_dist(dist, exec_a1_emb,
                                                      scalar=self.config.TIS_scalar)
            chosen_prob = self.actor.get_prob_from_dist(dist, chosen_a1_emb,
                                                        scalar=self.config.TIS_scalar)
            TIS_ratio = (exec_prob / chosen_prob).detach()  # TODO: clip this ratio?
            loss_actor = -1.0 * torch.mean(
                TIS_ratio * td_error * self.actor.get_log_prob_dist(dist, exec_a1_emb))
        else:
            loss_actor = -1.0 * torch.mean(td_error * self.actor.get_log_prob(s1, a1_emb))
            # loss_actor = -1.0 * torch.sum(td_error * self.actor.get_log_prob(s1, a1_emb))
            # loss_actor = -1.0 * torch.mean(torch.mean(r1 * self.actor.get_log_prob(s1, a1_emb), -1))  # without baseline

        loss = loss_baseline + loss_actor

        # ------------ always optimize the embeddings ----------------
        if not self.config.true_embeddings:
            a1 = Variable(torch.from_numpy(a1).type(self.config.dtype_long),
                          requires_grad=False)
            action_pred = self.action_rep.forward(s1[:-1], s1[1:])
            loss_act_rep = F.cross_entropy(action_pred, a1[:-1])
            loss += loss_act_rep * self.config.emb_lambda

        self.step(loss, clip_norm=10)

    def initial_phase_training(self, max_epochs=-1):
        # Switch the optimizers to Adam for supervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(), lr=1e-3)
        self.state_features.optim = torch.optim.Adam(self.state_features.parameters(), lr=1e-3)
        initial_losses = []

        print("Initial training phase started...")
        # TODO: Split into train and validation sets to avoid over-fitting
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.get_batch(
                    size=self.config.sup_batch_size, randomize=True):
                a1 = Variable(torch.from_numpy(a1).type(self.config.dtype_long),
                              requires_grad=False)
                self.clear_gradients()  # clear all the gradients from the last run

                s1 = self.state_features.forward(s1)
                s2 = self.state_features.forward(s2)

                # ------------ optimize the embeddings ----------------
                action_pred = self.action_rep.forward(s1, s2)
                loss_act_rep = F.cross_entropy(action_pred, a1)
                loss_act_rep.backward()
                self.action_rep.optim.step()
                self.state_features.optim.step()

                losses.append(loss_act_rep.cpu().view(-1).data.numpy()[0])

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(counter, np.mean(initial_losses[-10:])))
                # self.save()

            # Terminate the initial phase once the action representations have converged.
            if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) >= np.mean(initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optimizers to whatever is in the config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(), lr=self.config.embed_lr)
        self.state_features.optim = self.config.optim(self.state_features.parameters(), lr=self.config.state_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
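# The actors above emit a continuous embedding which action_rep.get_best_match maps
# back to a discrete action. A minimal sketch of such a lookup, assuming the action
# embeddings are stored as the rows of a matrix and similarity is a plain dot
# product (the repository's own implementation may differ):
def get_best_match_sketch(embeddings, query):
    """Return the index of the action embedding most similar to `query`.

    embeddings: (num_actions, emb_dim) tensor; query: (emb_dim,) or (1, emb_dim) tensor.
    """
    similarity = torch.matmul(embeddings, query.view(-1))  # (num_actions,)
    return int(torch.argmax(similarity).item())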
class CL_DPG(Agent):
    # @profile
    def __init__(self, config, action_mask):
        super(CL_DPG, self).__init__(config)

        # Set hyper-parameters
        # Initial training phase is required if the embeddings are being learnt
        self.initial_phase = (not config.true_embeddings and not config.load_embed
                              and not config.restore)
        self.batch_norm = False

        # Functions to get state features and the action representation
        self.state_features = Basis.get_Basis(config=config)
        self.action_rep = CL_ActionRepresentation.VAE_Action_representation(
            action_dim=self.action_dim,
            state_dim=self.state_features.feature_dim, config=config)

        # Create instances for the actor and the Q function
        self.actor = Actor(action_dim=self.action_rep.reduced_action_dim,
                           state_dim=self.state_features.feature_dim, config=config)
        self.Q = Q_fn(action_dim=self.action_rep.reduced_action_dim,
                      state_dim=self.state_features.feature_dim, config=config)

        # Create target networks. (Deepcopy is not working, so construct fresh copies.)
        self.target_state_features = Basis.get_Basis(config=config)
        self.target_actor = Actor(action_dim=self.action_rep.reduced_action_dim,
                                  state_dim=self.state_features.feature_dim, config=config)
        self.target_Q = Q_fn(action_dim=self.action_rep.reduced_action_dim,
                             state_dim=self.state_features.feature_dim, config=config)
        # self.target_action_rep = ActionRepresentation.Action_representation_deep(action_dim=self.action_dim, config=config)

        # Copy the initialized values to the targets
        self.target_state_features.load_state_dict(self.state_features.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_Q.load_state_dict(self.Q.state_dict())
        # self.target_action_rep.load_state_dict(self.action_rep.state_dict())

        self.memory = MemoryBuffer(max_len=self.config.buffer_size,
                                   state_dim=self.state_dim, action_dim=1,
                                   atype=long, config=config,
                                   dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.noise = OrnsteinUhlenbeckActionNoise(self.config.reduced_action_dim)

        self.modules = [('actor', self.actor), ('Q', self.Q),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep),
                        ('target_actor', self.target_actor),
                        ('target_state_features', self.target_state_features),
                        ('target_Q', self.target_Q)]
        # ('target_action_rep', self.target_action_rep)
        self.init()
        self.update_mask(action_mask=action_mask)

    def update_mask(self, action_mask):
        self.action_mask = action_mask
        self.curr_action_set = np.where(self.action_mask)[0]
        self.action_rep.update_mask(self.action_mask)

    # Overrides the reset function in the parent class
    def reset(self, action_mask, change_flag):
        for _, module in self.modules:
            module.reset()
        if change_flag:
            if self.config.re_init == 'full':
                # Do a complete re-initialization after the MDP has changed
                self.__init__(self.config, action_mask)
            self.update_mask(action_mask)
            self.initial_phase = True
            self.memory.reset()

    def get_action(self, state, explore=0):
        if self.batch_norm:
            self.actor.eval()  # Set the actor to evaluation mode; required for BatchNorm

        if self.initial_phase:
            # Take random actions (uniform over the currently available actions)
            # to observe the interactions initially
            action = np.random.choice(self.curr_action_set)
            action_emb = self.action_rep.get_embedding(action).cpu().view(-1).data.numpy()
        else:
            state = tensor(state, dtype=float32, requires_grad=False,
                           device=self.config.device).view(1, -1)
            state = self.state_features.forward(state)
            action_emb = self.actor.get_action(state)

            noise = self.noise.sample() * explore  # * 0.1
            action_emb += Variable(torch.from_numpy(noise).type(float32),
                                   requires_grad=False)

            action = self.action_rep.get_best_match(action_emb)
            action_emb = action_emb.cpu().view(-1).data.numpy()

        self.track_entropy_cont(action_emb)
        return action, action_emb

    def update(self, s1, a1, a_emb1, r1, s2, done):
        self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
        if self.initial_phase and self.memory.length >= self.config.buffer_size:
            self.initial_phase_training(max_epochs=self.config.initial_phase_epochs)
        elif not self.initial_phase and self.memory.length > self.config.sup_batch_size:
            self.optimize()

    def optimize(self):
        if self.batch_norm:
            self.actor.train()  # Set the actor to training mode; required for BatchNorm

        s1, a1, a1_emb, r1, s2, not_absorbing = self.memory.sample(self.config.sup_batch_size)

        # ---------------------- optimize critic ----------------------
        # Use the target actor's exploitation policy for the loss evaluation
        s2_t = self.target_state_features.forward(s2).detach()
        a2_emb = self.target_actor.get_action(s2_t).detach()  # Detach targets from grad computation

        next_val = self.target_Q.forward(s2_t, a2_emb).detach()  # Q'(s2, pi'(s2))
        val_exp = r1 + self.config.gamma * next_val * not_absorbing  # y_exp = r + gamma * Q'(s2, pi'(s2))

        s1_ = self.state_features.forward(s1)
        val_pred = self.Q.forward(s1_, a1_emb)  # y_pred = Q(s1, a1)

        loss_Q = F.mse_loss(val_pred, val_exp)  # compute critic loss
        # loss_Q = F.smooth_l1_loss(val_pred, val_exp)
        self.clear_gradients()
        loss_Q.backward()
        self.Q.optim.step()
        self.state_features.optim.step()

        # ---------------------- optimize actor ----------------------
        s1_ = self.state_features.forward(s1)
        s2_ = self.state_features.forward(s2)
        pred_a1_emb = self.actor.get_action(s1_)
        loss_actor = -1.0 * torch.mean(self.Q.forward(s1_, pred_a1_emb))
        loss_rep = self.action_rep.unsupervised_loss(s1_, a1.view(-1), s2_) * self.config.emb_lambda
        loss = loss_actor + loss_rep

        self.clear_gradients()
        loss.backward()
        self.actor.optim.step()
        self.action_rep.optim.step()
        self.state_features.optim.step()

        # ------------ update target actor and critic -----------------
        soft_update(self.target_actor, self.actor, self.config.tau)
        soft_update(self.target_Q, self.Q, self.config.tau)
        soft_update(self.target_state_features, self.state_features, self.config.tau)

    def self_supervised_update(self, s1, a1, s2, reg=1):
        s1 = self.state_features(s1)
        s2 = self.state_features(s2)
        loss = self.action_rep.unsupervised_loss(s1, a1.view(-1), s2) * reg
        self.clear_gradients()
        loss.backward()
        self.action_rep.optim.step()
        self.state_features.optim.step()
        return loss.item()

    def clear_gradients(self):
        for module in [self.action_rep, self.actor, self.Q, self.state_features]:
            module.optim.zero_grad()

    def initial_phase_training(self, max_epochs=-1):
        if self.batch_norm:
            self.actor.train()  # Set the actor to training mode; required for BatchNorm

        # Switch the optimizers to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(), lr=1e-2)
        self.state_features.optim = torch.optim.Adam(self.state_features.parameters(), lr=1e-2)
        initial_losses = []

        print("Initial training phase started...")
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.batch_sample(
                    batch_size=self.config.sup_batch_size, randomize=True):
                loss = self.self_supervised_update(s1, a1, s2)
                losses.append(loss)

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(counter, np.mean(initial_losses[-10:])))
                if self.config.only_phase_one:
                    self.save()
                    print("Saved..")

            # Terminate the initial phase once the action representations have converged.
            if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) + 1e-5 >= np.mean(initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optimizers to whatever is in the config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(), lr=self.config.embed_lr)
        self.state_features.optim = self.config.optim(self.state_features.parameters(), lr=self.config.state_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
        self.save()

        if self.config.only_phase_one:
            exit()

        hard_update(self.target_state_features, self.state_features)
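# CL_DPG and embed_DPG rely on soft_update / hard_update helpers for their target
# networks. Those helpers are defined elsewhere in the repository; a minimal sketch
# with the usual DDPG (Polyak averaging) semantics is shown here for reference —
# the repository's own versions may differ in detail:
def soft_update_sketch(target, source, tau):
    """Polyak-average source into target: t <- tau * s + (1 - tau) * t."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

def hard_update_sketch(target, source):
    """Copy the source parameters into the target verbatim."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)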
class CL_ActorCritic(Agent):
    def __init__(self, config, action_mask):
        super(CL_ActorCritic, self).__init__(config)

        # Initial training phase is required if the embeddings are learnt from scratch
        self.initial_phase = not config.true_embeddings and not config.load_embed

        # Functions to get state features and the action representation
        self.state_features = Basis.get_Basis(config=config)
        self.action_rep = CL_ActionRepresentation.VAE_Action_representation(
            state_dim=self.state_features.feature_dim,
            action_dim=self.action_dim, config=config)

        # Create instances for the actor and the critic
        self.critic = Critic.Critic_with_traces(
            state_dim=self.state_features.feature_dim, config=config)
        self.actor = Policy.embed_Gaussian(
            action_dim=self.action_rep.reduced_action_dim,
            state_dim=self.state_features.feature_dim, config=config)

        # Initialize the storage containers
        self.memory = MemoryBuffer(max_len=self.config.buffer_size,
                                   state_dim=self.state_dim, action_dim=1,
                                   atype=long, config=config,
                                   dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.trajectory = Trajectory(max_len=self.config.batch_size,
                                     state_dim=self.state_dim, action_dim=1,
                                     atype=long, config=config,
                                     dist_dim=self.action_rep.reduced_action_dim)  # on-policy

        self.modules = [('actor', self.actor), ('critic', self.critic),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep)]
        self.init()
        self.update_mask(action_mask=action_mask)

    def update_mask(self, action_mask):
        self.action_mask = action_mask
        self.curr_action_set = np.where(self.action_mask)[0]
        self.action_rep.update_mask(self.action_mask)

    # Overrides the reset function in the parent class
    def reset(self, action_mask, change_flag):
        for _, module in self.modules:
            module.reset()
        if change_flag:
            if self.config.re_init == 'full':
                # Do a complete re-initialization after the MDP has changed
                self.__init__(self.config, action_mask)
            if self.config.re_init == 'policy':
                # Re-init only the policy (state features and value functions
                # can carry over from the previous time)
                self.action_rep = CL_ActionRepresentation.Action_representation(
                    state_dim=self.state_features.feature_dim,
                    action_dim=self.action_dim, config=self.config)
                self.actor = Policy.embed_Gaussian(
                    action_dim=self.action_rep.reduced_action_dim,
                    state_dim=self.state_features.feature_dim, config=self.config)
            self.update_mask(action_mask)
            self.initial_phase = True
            self.memory.reset()

    def get_action(self, state, explore=0):
        explore = 0  # Don't do eps-greedy with policy gradients
        if self.initial_phase or np.random.rand() < explore:
            # Take random actions (uniform over the currently available actions)
            # to observe the interactions initially
            action = np.random.choice(self.curr_action_set)
            chosen_action_emb = self.action_rep.get_embedding(action).cpu().view(-1).data.numpy()
        else:
            state = tensor(state, dtype=float32, requires_grad=False,
                           device=self.config.device).view(1, -1)
            state = self.state_features.forward(state)
            chosen_action_emb, _ = self.actor.get_action(state, explore=0)
            action = self.action_rep.get_best_match(chosen_action_emb)
            chosen_action_emb = chosen_action_emb.cpu().view(-1).data.numpy()

        return action, chosen_action_emb

    def update(self, s1, a1, a_emb1, r1, s2, done, debug=False):
        if not self.initial_phase:
            # On-policy episode history.
            # Don't use the value predicted from the absorbing/goal state.
            # self.optimize(s1, a1, a_emb1, r1, s2, int(done != 1))
            self.trajectory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
            if self.trajectory.size >= self.config.batch_size or done:
                self.optimize(debug)
                self.trajectory.reset()
        else:
            # Action embeddings can be learnt offline
            self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
            if self.memory.length >= self.config.buffer_size:
                self.initial_phase_training(max_epochs=self.config.initial_phase_epochs)

    def optimize(self, debug=False):
        s1, a1, chosen_a1_emb, r1, s2, not_absorbing = self.trajectory.get_all()
        s1 = self.state_features.forward(s1)
        s2 = self.state_features.forward(s2)

        # ---------------------- optimize critic ----------------------
        next_val = self.critic.forward(s2).detach()  # Detach targets from grad computation
        val_exp = r1 + self.config.gamma * next_val * not_absorbing
        val_pred = self.critic.forward(s1)
        loss_critic = F.mse_loss(val_pred, val_exp)
        # loss_critic = F.smooth_l1_loss(val_pred, val_exp)

        # ---------------------- optimize actor ----------------------
        td_error = (val_exp - val_pred).detach()
        logp, dist = self.actor.get_log_prob(s1, chosen_a1_emb)
        loss_actor = -1.0 * torch.mean(td_error * logp)
        # loss_actor += self.config.entropy_lambda * self.actor.get_entropy_from_dist(dist)

        # Take one policy gradient step
        loss = loss_critic + loss_actor
        # if not self.config.true_embeddings and self.config.emb_lambda > 0:
        #     loss += self.action_rep.unsupervised_loss(s1, a1.view(-1), s2) * self.config.emb_lambda
        self.step(loss, clip_norm=1)

    def self_supervised_update(self, s1, a1, s2, reg=1):
        self.clear_gradients()  # clear all the gradients from the last run

        # If doing online updates, sharing the state features might be problematic!
        s1 = self.state_features.forward(s1)
        s2 = self.state_features.forward(s2)

        # ------------ optimize the embeddings ----------------
        loss_act_rep = self.action_rep.unsupervised_loss(s1, a1.view(-1), s2, normalized=True) * reg
        loss_act_rep.backward()

        # Call the optimizers' step() directly to bypass lambda traces (if any)
        self.action_rep.optim.step()
        self.state_features.optim.step()

        return loss_act_rep.item()

    def initial_phase_training(self, max_epochs=-1):
        # Switch the optimizers to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(), lr=1e-2)
        self.state_features.optim = torch.optim.Adam(self.state_features.parameters(), lr=1e-2)
        initial_losses = []

        print("Initial training phase started...")
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.batch_sample(
                    batch_size=self.config.sup_batch_size, randomize=True):
                loss = self.self_supervised_update(s1, a1, s2)
                losses.append(loss)

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(counter, np.mean(initial_losses[-10:])))
                if self.config.only_phase_one:
                    self.save()
                    print("Saved..")

            # Terminate the initial phase once the action representations have converged.
            if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) + 1e-5 >= np.mean(initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optimizers to whatever is in the config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(), lr=self.config.embed_lr)
        self.state_features.optim = self.config.optim(self.state_features.parameters(), lr=self.config.state_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
        self.save()
        if self.config.only_phase_one:
            exit()
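# All agents in this file share the same get_action/update contract. A minimal
# interaction-loop sketch is given below; `env` is a hypothetical environment with a
# reset() -> state and step(action) -> (next_state, reward, done) interface, not part
# of this repository:
def run_episode_sketch(agent, env, max_steps=1000):
    """Roll out one episode, feeding every transition back into the agent."""
    state = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        action, action_emb = agent.get_action(state)
        next_state, reward, done = env.step(action)
        agent.update(state, action, action_emb, reward, next_state, done)
        total_reward += reward
        state = next_state
        if done:
            break
    return total_reward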
class embed_ActorCritic(Agent):
    def __init__(self, config):
        super(embed_ActorCritic, self).__init__(config)

        # Initial training phase is required if the embeddings are learnt from scratch
        self.initial_phase = not config.true_embeddings and not config.load_embed

        # Functions to get state features and the action representation
        self.state_features = Basis.get_Basis(config=config)
        self.action_rep = ActionRepresentation.Action_representation(
            state_dim=self.state_features.feature_dim,
            action_dim=self.action_dim, config=config)

        # Create instances for the actor and the critic
        self.critic = Critic.Critic_with_traces(
            state_dim=self.state_features.feature_dim, config=config)
        self.actor = Policy.embed_Gaussian(
            action_dim=self.action_rep.reduced_action_dim,
            state_dim=self.state_features.feature_dim, config=config)

        # Initialize the storage containers
        self.memory = MemoryBuffer(max_len=self.config.buffer_size,
                                   state_dim=self.state_dim, action_dim=1,
                                   atype=long, config=config,
                                   dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.trajectory = Trajectory(max_len=self.config.batch_size,
                                     state_dim=self.state_dim, action_dim=1,
                                     atype=long, config=config,
                                     dist_dim=self.action_rep.reduced_action_dim)  # on-policy

        self.modules = [('actor', self.actor), ('critic', self.critic),
                        ('state_features', self.state_features),
                        ('action_rep', self.action_rep)]
        self.init()

        # If needed later: if the embeddings are going to be trained on the fly,
        # but are restored from a checkpoint, then load the associated state
        # feature basis and the (s, s') -> e parameters as well.
        # if self.config.emb_lambda and self.config.load_embed:
        #     self.state_features.load(self.config.paths['ckpt'] + name + '.pt')
        #     self.action_representation.load(ss'->e features)

    def get_action(self, state, explore=0):
        explore = 0  # Don't do eps-greedy with policy gradients
        if self.initial_phase or np.random.rand() < explore:
            # Take random actions (uniform over the actual action space)
            # to observe the interactions initially
            action = np.random.randint(self.action_dim)
            chosen_action_emb = self.action_rep.get_embedding(action).cpu().view(-1).data.numpy()
        else:
            state = tensor(state, dtype=float32, requires_grad=False,
                           device=self.config.device)
            state = self.state_features.forward(state.view(1, -1))
            chosen_action_emb, _ = self.actor.get_action(state, explore=0)
            action = self.action_rep.get_best_match(chosen_action_emb)
            chosen_action_emb = chosen_action_emb.cpu().view(-1).data.numpy()

        return action, chosen_action_emb

    def update(self, s1, a1, a_emb1, r1, s2, done):
        if not self.initial_phase:
            # Off-policy episodes, if doing simultaneous online embedding optimization:
            # if not self.config.true_embeddings and self.config.emb_lambda > 0:
            #     self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))

            # On-policy episode history.
            # Don't use the value predicted from the absorbing/goal state.
            self.trajectory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
            if self.trajectory.size >= self.config.batch_size or done:
                self.optimize()
                self.trajectory.reset()
        else:
            # Action embeddings can be learnt offline
            self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
            if self.memory.length >= self.config.buffer_size:
                self.initial_phase_training(max_epochs=self.config.initial_phase_epochs)

    def optimize(self):
        s1, a1, chosen_a1_emb, r1, s2, not_absorbing = self.trajectory.get_all()
        s1 = self.state_features.forward(s1)
        s2 = self.state_features.forward(s2)

        # ---------------------- optimize critic ----------------------
        next_val = self.critic.forward(s2).detach()  # Detach targets from grad computation
        val_exp = r1 + self.config.gamma * next_val * not_absorbing
        val_pred = self.critic.forward(s1)
        loss_critic = F.mse_loss(val_pred, val_exp)
        # loss_critic = F.smooth_l1_loss(val_pred, val_exp)

        # ---------------------- optimize actor ----------------------
        td_error = (val_exp - val_pred).detach()
        logp, dist = self.actor.get_log_prob(s1, chosen_a1_emb)
        loss_actor = -1.0 * torch.mean(td_error * logp)
        # loss_actor += self.config.entropy_lambda * self.actor.get_entropy_from_dist(dist)

        # Take one policy gradient step
        loss = loss_critic + loss_actor
        self.step(loss, clip_norm=1)

        # Take one unsupervised step
        # if not self.config.true_embeddings and self.config.emb_lambda > 0:  # and self.memory.size > self.config.sup_batch_size:
        #     s1, a1, _, _, s2, _ = self.memory.sample(batch_size=self.config.sup_batch_size)
        #     self.self_supervised_update(s1, a1, s2, reg=self.config.emb_lambda)

    def self_supervised_update(self, s1, a1, s2, reg=1):
        self.clear_gradients()  # clear all the gradients from the last run

        # If doing online updates, sharing the state features might be problematic!
        s1 = self.state_features.forward(s1)
        s2 = self.state_features.forward(s2)

        # ------------ optimize the embeddings ----------------
        loss_act_rep = self.action_rep.unsupervised_loss(s1, a1.view(-1), s2, normalized=True) * reg
        loss_act_rep.backward()

        # Call the optimizers' step() directly to bypass lambda traces (if any)
        self.action_rep.optim.step()
        self.state_features.optim.step()

        return loss_act_rep.item()

    def initial_phase_training(self, max_epochs=-1):
        # Switch the optimizers to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(), lr=1e-3)
        self.state_features.optim = torch.optim.Adam(self.state_features.parameters(), lr=1e-3)
        initial_losses = []

        print("Initial training phase started...")
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.batch_sample(
                    batch_size=self.config.sup_batch_size, randomize=True):
                loss = self.self_supervised_update(s1, a1, s2)
                losses.append(loss)

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(counter, np.mean(initial_losses[-10:])))
                if self.config.only_phase_one:
                    self.save()
                    print("Saved..")

            # Terminate the initial phase once the action representations have converged.
            if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) + 1e-5 >= np.mean(initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optimizers to whatever is in the config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(), lr=self.config.embed_lr)
        self.state_features.optim = self.config.optim(self.state_features.parameters(), lr=self.config.state_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
        self.save()
        if self.config.only_phase_one:
            exit()

        # If not updating on the fly, delete the memory buffer
        del self.memory
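# The DPG agents (CL_DPG above and embed_DPG below) draw exploration noise from
# OrnsteinUhlenbeckActionNoise, which is defined elsewhere in the repository. A
# minimal sketch of the standard OU process follows; the theta/sigma defaults are
# conventional values and are assumptions here:
class OrnsteinUhlenbeckActionNoiseSketch:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.X = np.ones(self.action_dim) * self.mu

    def sample(self):
        # dX = theta * (mu - X) + sigma * N(0, 1): mean-reverting, temporally
        # correlated noise, which suits the incremental action embeddings here
        dx = self.theta * (self.mu - self.X) + self.sigma * np.random.randn(self.action_dim)
        self.X = self.X + dx
        return self.X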
class embed_DPG(Agent):
    def __init__(self, config):
        super(embed_DPG, self).__init__(config)

        # Set hyper-parameters
        # Initial training phase is required if the embeddings are being learnt
        # (currently disabled)
        self.initial_phase = False  # not config.true_embeddings and not config.load_embed
        self.batch_norm = False
        self.ctr = 0

        # Function to get the action representation
        self.action_rep = ActionRepresentation.Action_representation_deep(
            action_dim=self.action_dim, config=config)

        # Create instances for the actor and the Q function
        self.actor = Actor(action_dim=self.action_rep.reduced_action_dim, config=config)
        self.Q = Q_fn(action_dim=self.action_rep.reduced_action_dim, config=config)

        # Create target networks. (Deepcopy is not working, so construct fresh copies.)
        self.target_actor = Actor(action_dim=self.action_rep.reduced_action_dim, config=config)
        self.target_Q = Q_fn(action_dim=self.action_rep.reduced_action_dim, config=config)
        # self.target_action_rep = ActionRepresentation.Action_representation_deep(action_dim=self.action_dim, config=config)

        # Copy the initialized values to the targets
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_Q.load_state_dict(self.Q.state_dict())
        # self.target_action_rep.load_state_dict(self.action_rep.state_dict())

        self.memory = MemoryBuffer(max_len=self.config.buffer_size,
                                   state_dim=self.state_dim, action_dim=1,
                                   atype=long, config=config,
                                   dist_dim=self.action_rep.reduced_action_dim)  # off-policy
        self.noise = OrnsteinUhlenbeckActionNoise(self.config.reduced_action_dim)

        self.modules = [('actor', self.actor), ('Q', self.Q),
                        ('action_rep', self.action_rep),
                        ('target_actor', self.target_actor),
                        ('target_Q', self.target_Q)]
        # ('target_action_rep', self.target_action_rep)
        self.init()

    def get_action(self, state, explore=0):
        if self.batch_norm:
            self.actor.eval()  # Set the actor to evaluation mode; required for BatchNorm

        if self.initial_phase:
            # Take random actions (uniform over the actual action space)
            # to observe the interactions initially
            action = np.random.randint(self.action_dim)
            action_emb = self.action_rep.get_embedding(action).cpu().view(-1).data.numpy()
        else:
            state = tensor(state, dtype=float32, requires_grad=False,
                           device=self.config.device).view(1, -1)
            action_emb = self.actor.get_action(state)

            noise = self.noise.sample()  # * 0.1
            action_emb += Variable(torch.from_numpy(noise).type(float32), requires_grad=False)

            action = self.action_rep.get_best_match(action_emb)
            action_emb = action_emb.cpu().view(-1).data.numpy()

        self.track_entropy_cont(action_emb)
        return action, action_emb

    def update(self, s1, a1, a_emb1, r1, s2, done):
        self.memory.add(s1, a1, a_emb1, r1, s2, int(done != 1))
        if self.initial_phase and self.memory.length >= self.config.buffer_size:
            self.initial_phase_training(max_epochs=self.config.initial_phase_epochs)
        elif not self.initial_phase and self.memory.length > self.config.sup_batch_size:
            self.optimize()

    def optimize(self):
        if self.batch_norm:
            self.actor.train()  # Set the actor to training mode; required for BatchNorm

        s1, a1, a1_emb, r1, s2, not_absorbing = self.memory.sample(self.config.sup_batch_size)

        # ---------------------- optimize critic ----------------------
        # Use the target actor's exploitation policy for the loss evaluation
        a2_emb = self.target_actor.get_action(s2).detach()  # Detach targets from grad computation
        next_val = self.target_Q.forward(s2, a2_emb).detach()  # Q'(s2, pi'(s2))
        val_exp = r1 + self.config.gamma * next_val * not_absorbing  # y_exp = r + gamma * Q'(s2, pi'(s2))
        val_pred = self.Q.forward(s1, a1_emb)  # y_pred = Q(s1, a1)

        loss_Q = F.mse_loss(val_pred, val_exp)  # compute critic loss
        # loss_Q = F.smooth_l1_loss(val_pred, val_exp)
        self.Q.update(loss_Q)

        # ---------------------- optimize actor ----------------------
        pred_a1_emb = self.actor.get_action(s1)
        loss_actor = -1.0 * torch.mean(self.Q.forward(s1, pred_a1_emb))
        self.actor.update(loss_actor)

        # ------------ update target actor and critic -----------------
        soft_update(self.target_actor, self.actor, self.config.tau)
        soft_update(self.target_Q, self.Q, self.config.tau)

        if not self.config.true_embeddings and self.config.emb_lambda > 0:
            self.ctr += 1
            if self.ctr > 100:
                self.self_supervised_training()
                self.ctr = 0

    def self_supervised_training(self, eps=1e-3):
        prv_loss = 1e5
        while True:
            s1, a1, _, _, s2, _ = self.memory.sample(batch_size=self.config.sup_batch_size)
            loss = self.action_rep.unsupervised_loss(s1, a1.view(-1), s2)
            self.action_rep.update(loss)
            # soft_update(self.target_action_rep, self.action_rep, self.config.tau)

            # Quick check for convergence: break once the improvement drops below eps
            loss = loss.item()
            if prv_loss - loss < eps:
                break
            prv_loss = loss

    def initial_phase_training(self, max_epochs=-1):
        if self.batch_norm:
            self.actor.train()  # Set the actor to training mode; required for BatchNorm

        # Switch the optimizer to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(), lr=1e-3)
        initial_losses = []

        print("Initial training phase started...")
        for counter in range(max_epochs):
            losses = []
            for s1, a1, _, _, s2, _ in self.memory.batch_sample(
                    batch_size=self.config.sup_batch_size, randomize=True):
                loss_act_rep = self.action_rep.unsupervised_loss(s1, a1, s2)
                self.action_rep.update(loss_act_rep)
                losses.append(loss_act_rep.item())

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(counter, np.mean(initial_losses[-10:])))
                if self.config.only_phase_one:
                    self.save()
                    print("Saved..")

            # Terminate the initial phase once the action representations have converged.
            if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) + 1e-5 >= np.mean(initial_losses[-20:]):
                print("Converged...")
                break

        # Reset the optimizer to whatever is in the config
        self.action_rep.optim = self.config.optim(self.action_rep.parameters(), lr=self.config.embed_lr)

        print('... Initial training phase terminated!')
        self.initial_phase = False
        self.save()
        if self.config.only_phase_one:
            exit()

        # NOTE: self.target_action_rep is never constructed (see the commented-out
        # lines in __init__), so the original hard_update call here would raise an
        # AttributeError. It stays disabled until the target action representation
        # is restored.
        # hard_update(self.target_action_rep, self.action_rep)
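# embed_DPG drives its networks through module.update(loss) calls (on self.Q,
# self.actor, and self.action_rep). That helper belongs to the module base classes
# elsewhere in the repository; a plausible sketch of its semantics, with the
# gradient-clipping threshold as an assumption, is:
def module_update_sketch(module, loss, clip_norm=10):
    """One optimizer step on `module` for the given loss, with gradient clipping."""
    module.optim.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(module.parameters(), clip_norm)
    module.optim.step()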