def __init__(self, hparams):

    self.obs_shape = hparams['obs_shape']
    self.n_actions = hparams['n_actions']
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']

    self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions)  #.cuda()

    # Storing rollouts
    self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.n_actions)

    # if self.cuda:
    self.actor_critic.cuda()
    self.rollouts.cuda()

    self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])

    self.hparams = hparams
def __init__(self, hparams):

    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']
    self.next_state_pred_ = hparams['next_state_pred_']

    # Policy and Value network
    if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], hparams['action_space'])
    else:
        self.actor_critic = CNNPolicy(self.obs_shape[0], hparams['action_space'])

    # Storing rollouts
    self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, hparams['action_space'])

    if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

    # Optimizer
    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
    else:
        print('no opt specified')

    self.action_shape = 1

    if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams['grad_var_']:
        self.rollouts_list = RolloutStorage_list()

    self.hparams = hparams
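# For reference, the sketch below is a hypothetical hparams dict covering the keys
# this constructor reads. The values are illustrative assumptions only, not the
# settings actually used by the experiments in this repository.
example_hparams = {
    'obs_shape': (2, 84, 84),        # stacked frames x H x W (assumed)
    'action_space': 4,
    'use_gae': False, 'gamma': 0.99, 'tau': 0.95,
    'num_steps': 5, 'num_processes': 16,
    'value_loss_coef': 0.5, 'entropy_coef': 0.01,
    'cuda': True, 'opt': 'adam', 'lr': 7e-4, 'eps': 1e-5,
    'alpha': 0.99, 'mom': 0.9, 'grad_clip': 0.5,
    'next_state_pred_': False,
    'gif_': False, 'ls_': False, 'vae_': False, 'grad_var_': False,
}
# agent = a2c(example_hparams)   # hypothetical usage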
print('\nInit Policies')

# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1

policies = []
policies_dir = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)

    policy = CNNPolicy(2, 4)  #.cuda()
    param_file = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()
    policies.append(policy)

    # just one for now
    # break

# policy = policies[0]
def __init__(self, hparams):
    self.obs_shape = hparams['obs_shape']
    self.n_actions = hparams['n_actions']
    self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions).cuda()
class a2c(object):

    def __init__(self, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and Value network
        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], hparams['action_space'])
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0], hparams['action_space'])

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, hparams['action_space'])

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.action_shape = 1

        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams['grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, next_state_pred, done=None):
        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)
        # self.rollouts.insert_state_pred(next_state_pred)

        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()

        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef  #*10.

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
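# A minimal sketch of how this agent class is typically driven. The vectorized
# environment `envs`, the number of updates, and the tensor conversions below are
# assumptions for illustration; only the agent calls mirror the class above.
import numpy as np
import torch
from torch.autograd import Variable

def run_training_sketch(agent, envs, num_updates):
    # envs is assumed to expose a Gym-style vectorized reset()/step() interface.
    current_state = torch.from_numpy(envs.reset()).float().cuda()
    agent.insert_first_state(current_state)

    for _ in range(num_updates):
        for step in range(agent.num_steps):
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step], volatile=True))
            state, reward, done, _ = envs.step(action.data.cpu().numpy())
            masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done]).cuda()
            reward = torch.from_numpy(np.asarray(reward)).float().unsqueeze(1)
            current_state = torch.from_numpy(state).float().cuda()
            agent.insert_data(step, current_state, action.data, value.data, reward,
                              masks, action_log_probs, dist_entropy, None, done)
        agent.update()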
def __init__(self, envs, hparams):

    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']

    # Policy and Value network
    # if hparams['dropout'] == True:
    #     print ('CNNPolicy_dropout2')
    #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
    # elif len(envs.observation_space.shape) == 3:
    #     print ('CNNPolicy2')
    #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
    # else:
    #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)
    if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
    else:
        self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

    # Storing rollouts
    self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)

    if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

    # Optimizer
    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
    else:
        print('no opt specified')

    # if envs.action_space.__class__.__name__ == "Discrete":
    #     action_shape = 1
    # else:
    #     action_shape = envs.action_space.shape[0]
    # self.action_shape = action_shape
    self.action_shape = 1

    # if __:
    #     self.deterministic_action = 0
    # else:
    #     self.deterministic_action = 0

    if hparams['gif_'] or hparams['ls_']:
        self.rollouts_list = RolloutStorage_list()

    self.hparams = hparams
class a2c(object):

    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        # Policy and Value network
        # if hparams['dropout'] == True:
        #     print ('CNNPolicy_dropout2')
        #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # elif len(envs.observation_space.shape) == 3:
        #     print ('CNNPolicy2')
        #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # else:
        #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)
        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # # for batch norm
        # self.actor_critic.train()  # self.actor_critic.eval()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print('no opt specified')

        # if envs.action_space.__class__.__name__ == "Discrete":
        #     action_shape = 1
        # else:
        #     action_shape = envs.action_space.shape[0]
        # self.action_shape = action_shape
        self.action_shape = 1

        # if __:
        #     self.deterministic_action = 0
        # else:
        #     self.deterministic_action = 0

        if hparams['gif_'] or hparams['ls_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, done=None):
        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)

        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #     Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)),
        #     Variable(self.rollouts.actions.view(-1, self.action_shape)))
        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
# load experiment dict
print('load experiment dict')
dict_location = exp_dir + '/' + env_name + 'NoFrameskip-v4/A2C/seed0/model_dict.json'
with open(dict_location, 'r') as outfile:
    exp_dict = json.load(outfile)

# Init policy, not agent
print('init policy')
policy = CNNPolicy(2 * 3, 18)  # frames*channels, action size

# load params
# param_file = exp_dir + '/' + env_name + 'NoFrameskip-v4/A2C/seed0/model_params3/model_params2000000.pt'
param_file = exp_dir + '/' + env_name + 'NoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
param_dict = torch.load(param_file)
policy.load_state_dict(param_dict)
# policy = torch.load(param_file).cuda()
print('loaded params', param_file)
policy.cuda()
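# Once loaded, the policy can be queried on a stacked-frame observation roughly as
# sketched below. The dummy observation and the shapes are illustrative assumptions;
# only the policy.act() call mirrors how the policy is used elsewhere in this code.
import torch
from torch.autograd import Variable

obs = torch.rand(1, 6, 84, 84).cuda()   # [batch, frames*channels, H, W], matching CNNPolicy(2*3, 18)
value, action, action_log_probs, dist_entropy = policy.act(Variable(obs, volatile=True))
print(value.data.cpu().numpy(), action.data.cpu().numpy())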
class a2c(object):

    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions)  #.cuda()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.n_actions)

        # if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

        self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])

        self.hparams = hparams

    def act(self, current_state):
        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks, action_log_probs, dist_entropy):  #, done):
        self.rollouts.insert(step, current_state, action, value, reward, masks, action_log_probs, dist_entropy)

        # if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
        #     self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #     Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)),
        #     Variable(self.rollouts.actions.view(-1, self.action_shape)))
        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()

    def no_update(self):
        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        # self.optimizer.step()
        self.optimizer.zero_grad()
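# no_update() runs the full backward pass but then clears the gradients instead of
# applying an optimizer step, which makes it suitable for inspecting gradient
# statistics without changing the policy. The helper below is an assumption, not part
# of this repository; it would have to be called inside no_update(), after
# cost.backward() and before the trailing zero_grad(), since that zero_grad()
# discards the gradients.
def grad_norms(model):
    # Returns the L2 norm of each parameter's current gradient.
    norms = {}
    for name, p in model.named_parameters():
        if p.grad is not None:
            norms[name] = p.grad.data.norm()
    return norms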
def __init__(self, hparams):

    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']
    self.action_size = hparams['action_size']

    # Policy and Value network
    # if hparams['dropout'] == True:
    #     print ('CNNPolicy_dropout2')
    #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
    # elif len(envs.observation_space.shape) == 3:
    #     print ('CNNPolicy2')
    #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
    # else:
    #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)
    # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
    #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
    # else:
    #     self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
    self.actor_critic = CNNPolicy(self.obs_shape[0], self.action_size)

    # # for batch norm
    # self.actor_critic.train()  # self.actor_critic.eval()

    # Storing rollouts
    self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, self.action_size)

    if self.cuda:
        self.actor_critic.cuda()
        self.rollouts.cuda()

    # Optimizer
    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
    else:
        print('no opt specified')

    # if envs.action_space.__class__.__name__ == "Discrete":
    #     action_shape = 1
    # else:
    #     action_shape = envs.action_space.shape[0]
    # self.action_shape = action_shape
    # self.action_shape = 1

    # if hparams['gif_'] or hparams['ls_']:
    #     self.rollouts_list = RolloutStorage_list()

    self.hparams = hparams
state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1])  # /255.)
print(len(state_dataset))

print('\nInit Policies')
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1

policies = []
policies_dir = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)
    policy = CNNPolicy(2, 4)  #.cuda()
    param_file = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()
    policies.append(policy)

    # just one for now
    break

policy = policies[0]
print(len(dataset))
print(len(dataset[ii][0]))      # single timepoint
print(dataset[ii][0][0].shape)  # action [1]       a_t+1
print(dataset[ii][0][1].shape)  # state  [2,84,84] s_t

state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1])
print(len(state_dataset))

print('Init Policy')
policy = CNNPolicy(2, 4)  #.cuda()
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'

load_policy = 1
if load_policy:
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
    param_dict = torch.load(param_file)
    # print (param_dict.keys())
    # for key in param_dict.keys():
    #     print (param_dict[key].size())
    # print (policy.state_dict().keys())
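    # The commented-out inspection above compares checkpoint keys against the
    # policy's own state dict. When the two do not match exactly, one common
    # workaround is to filter the checkpoint before loading; the snippet below is a
    # sketch under that assumption, not this repository's actual loading code.
    own_state = policy.state_dict()
    filtered = {k: v for k, v in param_dict.items()
                if k in own_state and v.size() == own_state[k].size()}
    own_state.update(filtered)
    policy.load_state_dict(own_state)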
# dataset: trajectories: timesteps: (action, state)
# state: [2,84,84]
print(len(dataset))
print(len(dataset[ii][0]))      # single timepoint
print(dataset[ii][0][0].shape)  # action [1]       a_t+1
print(dataset[ii][0][1].shape)  # state  [2,84,84] s_t

state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1])  # /255.)
print(len(state_dataset))

print('Init Expert Policy')
expert_policy = CNNPolicy(2, 4)  #.cuda()
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'

load_policy = 1
if load_policy:
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    # print (param_dict.keys())
    # for key in param_dict.keys():
    #     print (param_dict[key].size())
    # print (policy.state_dict().keys())
    # for key in policy.state_dict().keys():
print('\nInit Policies')

# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1

policies = []
policies_dir = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)

    policy = CNNPolicy(2, 4)  #.cuda()
    param_file = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()
    policies.append(policy)

    # just one for now
    break
class a2c(object):

    def __init__(self, hparams):
        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']
        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions).cuda()

    def act(self, current_state):
        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, done=None):
        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)

        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #     Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)),
        #     Variable(self.rollouts.actions.view(-1, self.action_shape)))
        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
class a2c(object):

    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and Value network
        # if hparams['dropout'] == True:
        #     print ('CNNPolicy_dropout2')
        #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # elif len(envs.observation_space.shape) == 3:
        #     print ('CNNPolicy2')
        #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # else:
        #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)
        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print('no opt specified')

        # if envs.action_space.__class__.__name__ == "Discrete":
        #     action_shape = 1
        # else:
        #     action_shape = envs.action_space.shape[0]
        # self.action_shape = action_shape
        self.action_shape = 1

        # if __:
        #     self.deterministic_action = 0
        # else:
        #     self.deterministic_action = 0

        if hparams['gif_'] or hparams['ls_'] or hparams['vae_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, next_state_pred, done=None):
        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)
        # self.rollouts.insert_state_pred(next_state_pred)

        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        # values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
        #     Variable(self.rollouts.states[:-1].view(-1, *self.obs_shape)),
        #     Variable(self.rollouts.actions.view(-1, self.action_shape)))
        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

        if self.next_state_pred_:
            state_preds = torch.cat(self.rollouts.state_preds).view(self.num_steps, self.num_processes, 1, 84, 84)
            real_states = self.rollouts.states[1:]  # [Steps, P, stack, 84, 84]
            real_states = real_states[:, :, -1].contiguous().view(self.num_steps, self.num_processes, 1, 84, 84)  # [Steps, P, 1, 84, 84]
            self.state_pred_error = (state_preds - Variable(real_states)).pow(2).mean()
            # self.state_pred_error = Variable(torch.zeros(1)).mean().cuda()
            state_pred_error_value = self.state_pred_error.detach()

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        # advantages = values - Variable(self.rollouts.returns[:-1])
        value_loss = advantages.pow(2).mean()

        if self.next_state_pred_:
            action_loss = -((Variable(advantages.data) + state_pred_error_value * .0001) * action_log_probs).mean()
            cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef + .0001 * self.state_pred_error
        else:
            # adv = torch.clamp(Variable(advantages.data), min=-10, max=10)
            # action_loss = -(adv * action_log_probs).mean()  # could just do detach instead of data
            # action_loss = (Variable(self.rollouts.returns[:-1]).detach() * action_log_probs).mean()
            action_loss = -(advantages.detach() * action_log_probs).mean()
            cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef  #*10.

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
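# For reference, the discounted-return recursion that RolloutStorage.compute_returns
# is expected to implement (without GAE) can be sketched as below. This is an
# assumption based on how compute_returns is called above, not its actual source;
# tensor shapes follow the class: rewards/masks [num_steps, P, 1], next_value [P, 1].
import torch

def compute_returns_sketch(rewards, masks, next_value, gamma):
    num_steps = rewards.size(0)
    returns = torch.zeros(num_steps + 1, *rewards.size()[1:])
    returns[-1] = next_value
    for step in reversed(range(num_steps)):
        # mask zeroes out the bootstrap across episode boundaries
        returns[step] = rewards[step] + gamma * returns[step + 1] * masks[step]
    return returns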
print(len(state_dataset))

print('\nInit Policies')
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1

policies = []
# policies_dir = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
policies_dir = home + '/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)

    # policy = CNNPolicy(2, 4)  #.cuda()
    policy = CNNPolicy(2, 18)  #.cuda()  # num-frames, num-actions
    # param_file = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'
    param_file = home + '/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()
    policies.append(policy)

    # just one for now
    break
print(len(dataset[ii][0]))      # single timepoint
print(dataset[ii][0][0].shape)  # action [1]       a_t+1
print(dataset[ii][0][1].shape)  # state  [2,84,84] s_t

state_dataset = []
for i in range(len(dataset)):
    for t in range(len(dataset[i])):
        state_dataset.append(dataset[i][t][1])  # /255.)
print(len(state_dataset))

print('Init Expert Policy')
expert_policy = CNNPolicy(2, 4)  #.cuda()
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'

load_policy = 1
if load_policy:
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    # print (param_dict.keys())
    # for key in param_dict.keys():
    #     print (param_dict[key].size())
    # print (policy.state_dict().keys())
    # for key in policy.state_dict().keys():
        state_dataset.append(dataset[i][t][1])  # /255.)
print(len(state_dataset))

print('\nInit Policies')
# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1

policies = []
policies_dir = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)

    policy = CNNPolicy(2, 4)  #.cuda()
    param_file = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()
    policies.append(policy)

    # just one for now
    break

# if load_policy:
#     param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
print('\nInit Policies')

# agent = a2c(model_dict)
# param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
# load_policy = 1

policies = []
policies_dir = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'
for f in os.listdir(policies_dir):
    print(f)

    policy = CNNPolicy(2, 4)  #.cuda()
    param_file = home + '/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/' + f + '/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    policy.load_state_dict(param_dict)
    # policy = torch.load(param_file).cuda()
    print('loaded params', param_file)
    policy.cuda()
    policies.append(policy)

    # just one for now
    break

policy = policies[0]
class a2c(object):

    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        # self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and Value network
        # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
        # else:
        self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.action_shape = 1

        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams['grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, next_state_pred, done=None):
        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)
        # self.rollouts.insert_state_pred(next_state_pred)

        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        values = torch.cat(self.rollouts.value_preds, 0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()

        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef  #*10.

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()

    # avg empowerment rather than avg error
    def update2(self, discrim_error, discrim_error_reverse):
        # discrim_error: [S,P]

        # next_value = self.actor_critic(Variable(self.rollouts.states[-1], volatile=True))[0].data
        # next_value, _, _, _ = self.actor_critic.act(Variable(self.rollouts.states[-1], volatile=True), context_onehot)
        # next_value = next_value.data
        # self.rollouts.compute_returns(next_value, self.use_gae, self.gamma, self.tau)

        discrim_error_reverse = discrim_error_reverse.view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(self.num_steps, self.num_processes, 1)  # [S,P,1]
        discrim_error = discrim_error.view(self.num_steps, self.num_processes, 1)

        # val_to_maximize = (-discrim_error + discrim_error_reverse) / 2. - action_log_probs.detach()  # [S,P,1]
        val_to_maximize = -discrim_error - action_log_probs.detach()  # [S,P,1]
        val_to_maximize = val_to_maximize.view(self.num_steps, self.num_processes)  # [S,P]

        discrim_error_unmodified = val_to_maximize.data.clone()
        discrim_error = val_to_maximize.data

        divide_by = torch.ones(self.num_processes).cuda()
        for step in reversed(range(discrim_error.size(0) - 1)):
            divide_by += 1
            ttmp = discrim_error_unmodified[step + 1] * self.gamma * torch.squeeze(self.rollouts.masks[step + 1])
            discrim_error_unmodified[step] = ttmp + discrim_error_unmodified[step]
            discrim_error[step] = discrim_error_unmodified[step] / divide_by
            divide_by = divide_by * torch.squeeze(self.rollouts.masks[step + 1])

        val_to_maximize = Variable(discrim_error.view(self.num_steps, self.num_processes, 1))

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        baseline = torch.mean(val_to_maximize)
        advantages = val_to_maximize - baseline  # - values

        # value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()

        cost = action_loss  # - dist_entropy.mean()*self.entropy_coef + value_loss*self.value_loss_coef

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
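# The backward loop in update2 turns the per-step signal into a discounted sum over
# the remaining steps divided by the number of steps it spans (a discounted running
# average), with masks resetting the accumulation at episode boundaries. The toy
# example below is a self-contained illustration of that recursion with made-up
# numbers; it is not part of the repository.
import torch

gamma = 0.99
vals = torch.Tensor([[1., 1.], [2., 2.], [3., 3.], [4., 4.]])  # [S=4, P=2]
masks = torch.ones(4, 2)       # all ones: no episode boundary in this toy example
acc = vals.clone()             # running discounted sums (discrim_error_unmodified)
out = vals.clone()             # averaged result (discrim_error)
divide_by = torch.ones(2)
for step in reversed(range(vals.size(0) - 1)):
    divide_by += 1
    acc[step] = acc[step] + gamma * acc[step + 1] * masks[step + 1]
    out[step] = acc[step] / divide_by
    divide_by = divide_by * masks[step + 1]
print(out)   # out[step] = discounted average of vals[step:] per process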
# param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'
# param_dict = torch.load(param_file)
# policy.load_state_dict(param_dict)
# # policy = torch.load(param_file).cuda()
# print ('loaded params', param_file)
# policy.cuda()
# policies.append(policy)
# # just one for now
# break
# policy = policies[0]

policy = CNNPolicy(2 * 3, 18)  #.cuda()  # num-frames * channels, num-actions

# param_file = home+'/Documents/tmp/multiple_seeds_of_policies/BreakoutNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'
# param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/'+f+'/model_params3/model_params9999360.pt'
param_file = home + '/Documents/tmp/' + exp_name + '/' + env_name + 'NoFrameskip-v4/A2C/seed0/model_params3/model_params3999840.pt'
param_dict = torch.load(param_file)
policy.load_state_dict(param_dict)
# policy = torch.load(param_file).cuda()
print('loaded params', param_file)
policy.cuda()


class MASK_PREDICTOR(nn.Module):

    def __init__(self):
        super(MASK_PREDICTOR, self).__init__()