def initialise_policy(self):

    # initialise policy network
    policy_net = Policy(
        args=self.args,
        #
        pass_state_to_policy=self.args.pass_state_to_policy,
        pass_latent_to_policy=self.args.pass_latent_to_policy,
        pass_belief_to_policy=self.args.pass_belief_to_policy,
        pass_task_to_policy=self.args.pass_task_to_policy,
        dim_state=self.args.state_dim,
        dim_latent=self.args.latent_dim * 2,
        dim_belief=self.args.belief_dim,
        dim_task=self.args.task_dim,
        #
        hidden_layers=self.args.policy_layers,
        activation_function=self.args.policy_activation_function,
        policy_initialisation=self.args.policy_initialisation,
        #
        action_space=self.envs.action_space,
        init_std=self.args.policy_init_std,
    ).to(device)

    # initialise policy trainer
    if self.args.policy == 'a2c':
        policy = A2C(
            self.args,
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            optimiser_vae=self.vae.optimiser_vae,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
        )
    elif self.args.policy == 'ppo':
        policy = PPO(
            self.args,
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            ppo_epoch=self.args.ppo_num_epochs,
            num_mini_batch=self.args.ppo_num_minibatch,
            use_huber_loss=self.args.ppo_use_huberloss,
            use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
            clip_param=self.args.ppo_clip_param,
            optimiser_vae=self.vae.optimiser_vae,
        )
    else:
        raise NotImplementedError

    return policy
def validate(self, data: Dict):
    super(Role, self).validate(data)
    policies = data.get('policies')
    policy_objs = Policy.get(filters={'id': policies})
    if len(policies) != len(policy_objs):
        raise ValidationError(message='Failed to add policies to role.\n'
                                      'One or more policy ids appear to be invalid.')
def load_policy_model(args, environment, device, folder=None):
    parent_folder = './checkpoint/policy'
    path = folder if folder is not None else parent_folder

    model = Policy(environment['action'],
                   net=args.encoder,
                   pretrained=args.pretrained,
                   input=environment['input_size'])
    model.load_state_dict(torch.load(f'{path}/best_model.ckpt'))
    model = model.to(device)
    model.eval()
    return model
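# Hypothetical usage sketch for load_policy_model above. The argparse namespace values,
# the environment dict entries, and the checkpoint location are assumptions for
# illustration only; they mirror exactly what the function reads (args.encoder,
# args.pretrained, environment['action'], environment['input_size']) and require the
# Policy class and a saved best_model.ckpt to exist.
import argparse
import torch

if __name__ == '__main__':
    args = argparse.Namespace(encoder='resnet', pretrained=False)   # assumed values
    environment = {'action': 4, 'input_size': 84}                   # assumed values
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    policy = load_policy_model(args, environment, device)           # expects ./checkpoint/policy/best_model.ckpt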
env = gym.make(args.env_name)
state_dim = env.observation_space.shape[0]
is_disc_action = len(env.action_space.shape) == 0
running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_net = Policy(state_dim, env.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=4e-4)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=8e-4)

"""create agent"""
agent = Agent(env, policy_net, device, running_state=running_state,
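# ZFilter above is assumed to be the usual running-statistics observation filter
# (normalise each state by a running mean/std, then clip). A minimal standalone sketch
# of that idea follows -- an illustration of the technique, not the repository's own
# implementation:
import numpy as np

class RunningStateFilter:
    def __init__(self, shape, clip=5.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)
        self.clip = clip

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        # Welford update of the running mean/variance
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        return np.clip((x - self.mean) / std, -self.clip, self.clip)

# e.g. running_state = RunningStateFilter((state_dim,), clip=5)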
def main():
    args = parse_args()

    # Initialize
    dbmng = DBManager(args.db)
    # Only one session to access the database
    _session = dbmng.get_session()

    # Open the file, preserving key order
    decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
    with open(args.input_file, 'r') as input_f:
        input_j = decoder.decode(input_f.read())

    # Load the json keys and insert the records
    nowtime = datetime.datetime.now()
    for key in input_j.keys():
        if key == 'translations':
            for dic in input_j[key]:
                rec = Translation(id=dic['id'],
                                  locale=dic['locale'],
                                  t=dic['t'],
                                  created_at=nowtime,
                                  created_by='rightctl_initializer')
                _session.add(rec)
            _session.commit()
        elif key == 'rights':
            for dic in input_j[key]:
                rec = Right(action=dic['action'],
                            enable_flag=True,
                            created_at=nowtime,
                            created_by='rightctl_initializer')
                _session.add(rec)
            _session.commit()
        elif key == 'policies':
            for dic in input_j[key]:
                rec = Policy(id=dic['id'],
                             policy_tid=dic['policy_tid'],
                             created_at=nowtime,
                             created_by='rightctl_initializer')
                _session.add(rec)
                _session.commit()
                # link the policy to each of its rights
                for right_s in dic['rights']:
                    right = _session.query(Right).filter(
                        Right.action == right_s).one()
                    child = PolicyHasRight(policy_id=rec.id, right_id=right.id)
                    _session.add(child)
                _session.commit()
            _session.commit()
        elif key == 'roles':
            for dic in input_j[key]:
                rec = Role(id=dic['id'],
                           role_tid=dic['role_tid'],
                           created_at=nowtime,
                           created_by='rightctl_initializer')
                _session.add(rec)
                _session.commit()
                # link the role to each of its policies
                for policy_s in dic['policies']:
                    policy = _session.query(Policy).filter(
                        Policy.id == policy_s).one()
                    child = RoleHasPolicy(role_id=rec.id, policy_id=policy.id)
                    _session.add(child)
                _session.commit()
            _session.commit()
        else:
            pass  # unknown top-level keys are ignored
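# For reference, a minimal example of the JSON structure the loop above expects.
# The keys and fields are inferred directly from the code; the concrete values are
# made up for illustration:
EXAMPLE_INPUT = {
    "translations": [{"id": 1, "locale": "en", "t": "Administrator"}],
    "rights": [{"action": "user.read"}, {"action": "user.write"}],
    "policies": [{"id": 1, "policy_tid": 1, "rights": ["user.read", "user.write"]}],
    "roles": [{"id": 1, "role_tid": 1, "policies": [1]}],
}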
)

get_policy_dataset = environment['policy_dataset']
policy_train, policy_validation = get_policy_dataset(
    args.expert_path,
    args.policy_batch_size,
    maze_size=args.maze_size,
    maze_type=args.maze_type,
)

# Model and action size
print('\nCreating Models')
action_dimension = environment['action']
inputs = environment['input_size'] * 2 if environment['input_size'] is not None else None
policy_model = Policy(action_dimension,
                      net=args.encoder,
                      pretrained=args.pretrained,
                      input=environment['input_size'])
idm_model = IDM(action_dimension,
                net=args.encoder,
                pretrained=args.pretrained,
                input=inputs)
policy_model.to(device)
idm_model.to(device)

# Optimizer and loss
print('\nCreating Optimizer and Loss')
print(f'IDM learning rate: {args.lr}\nPolicy learning rate: {args.policy_lr}')
idm_lr = args.lr
idm_criterion = nn.CrossEntropyLoss()
idm_optimizer = optim.Adam(idm_model.parameters(), lr=idm_lr)
# NOTE: this class relies on the surrounding module's imports (os, csv, time, pathlib,
# numpy as np, torch, collections.deque, a TensorBoard SummaryWriter, and the
# project-local Policy, PPO, A2C_ACKTR, algo, gail, utils, make_vec_envs,
# RolloutStorage, GridGame), which are not part of this excerpt.
class Agent:
    #algorithm
    algo = 'a2c'  #a2c, ppo, acktr
    use_gae = False  #generalized advantage estimation
    gae_lambda = 0.95
    entropy_coef = 0.01  #weight maximizing action entropy loss
    value_loss_coef = 0.1  #.5 #weight value function loss
    max_grad_norm = 0.5  #max norm of gradients

    #ppo hyperparameters
    clip_param = 0.2  #ppo clip
    num_steps = 5  #steps before an update
    ppo_epoch = 4
    num_mini_batch = 32
    seed = 1

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    cuda_deterministic = False
    no_cuda = False
    use_proper_time_limits = False
    use_linear_lr_decay = False

    #experiment setup
    log_interval = 1  #log per n updates
    log_dir = os.path.expanduser('/tmp/gym')
    eval_log_dir = log_dir + "_eval"
    save_interval = 100
    eval_interval = None
    recurrent_policy = True

    #optimization, RMSprop and TD
    eps = 1e-5  #epsilon
    alpha = 0.99
    gamma = 0.99  #discount factor

    #imitation learning with gail
    gail_batch_size = 128
    gail_epoch = 5

    def __init__(self, env_def, processes=1, dir='.', version=0, lr=2e-4,
                 architecture='base', dropout=0, reconstruct=None, r_weight=.05):
        self.env_def = env_def
        self.num_processes = processes  #cpu processes
        self.lr = lr
        self.version = version
        self.save_dir = dir + '/trained_models/'

        #Setup
        pathlib.Path(self.save_dir).mkdir(parents=True, exist_ok=True)
        if (self.num_mini_batch > processes):
            self.num_mini_batch = processes
        self.writer = SummaryWriter()
        self.total_steps = 0

        #State
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
        if not self.no_cuda and torch.cuda.is_available() and self.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
        utils.cleanup_log_dir(self.log_dir)
        utils.cleanup_log_dir(self.eval_log_dir)
        torch.set_num_threads(1)

        self.level_path = None
        self.envs = None
        self.num_envs = -1
        self.set_envs(num_envs=1)

        if (version > 0):
            # checkpoint directory assumed to be save_dir; load() returns the network
            self.actor_critic = self.load(self.save_dir, version)
        else:
            self.actor_critic = Policy(
                self.envs.observation_space.shape,
                self.envs.action_space,
                base_kwargs={
                    'recurrent': self.recurrent_policy,
                    'shapes': list(reversed(self.env_def.model_shape)),
                    'dropout': dropout
                },
                model=architecture)
        self.actor_critic.to(self.device)

        #Reconstruction
        self.reconstruct = reconstruct is not None
        if (self.reconstruct):
            #layers = self.envs.observation_space.shape[0]
            #shapes = list(self.env_def.model_shape)
            #self.r_model = Decoder(layers, shapes=shapes).to(self.device)
            reconstruct.to(self.device)
            self.r_model = lambda x: reconstruct.adapter(reconstruct(x))
            #self.r_model = lambda x: reconstruct.adapter(reconstruct(x)).clamp(min=1e-6).log()
            #self.r_loss = nn.L1Loss() #nn.NLLLoss() #nn.MSELoss()
            self.r_loss = lambda pred, true: -r_weight * (true * torch.log(
                pred.clamp(min=1e-7, max=1 - 1e-7))).sum(dim=1).mean()
            self.r_optimizer = reconstruct.optimizer  #optim.Adam(reconstruct.parameters(), lr = .0001)

        if self.algo == 'a2c':
            self.agent = A2C_ACKTR(self.actor_critic,
                                   self.value_loss_coef,
                                   self.entropy_coef,
                                   lr=self.lr,
                                   eps=self.eps,
                                   alpha=self.alpha,
                                   max_grad_norm=self.max_grad_norm)
        elif self.algo == 'ppo':
            self.agent = PPO(self.actor_critic,
                             self.clip_param,
                             self.ppo_epoch,
                             self.num_mini_batch,
                             self.value_loss_coef,
                             self.entropy_coef,
                             lr=self.lr,
                             eps=self.eps,
                             max_grad_norm=self.max_grad_norm,
                             use_clipped_value_loss=False)
        elif self.algo == 'acktr':
            self.agent = algo.A2C_ACKTR(self.actor_critic,
                                        self.value_loss_coef,
                                        self.entropy_coef,
                                        acktr=True)

        self.gail = False
        self.gail_experts_dir = './gail_experts'
        if self.gail:
            assert len(self.envs.observation_space.shape) == 1
            self.gail_discr = gail.Discriminator(
                self.envs.observation_space.shape[0] +
                self.envs.action_space.shape[0], 100, self.device)
            file_name = os.path.join(
                self.gail_experts_dir,
                "trajs_{}.pt".format(env_name.split('-')[0].lower()))
            self.gail_train_loader = torch.utils.data.DataLoader(
                gail.ExpertDataset(file_name,
                                   num_trajectories=4,
                                   subsample_frequency=20),
                batch_size=self.gail_batch_size,
                shuffle=True,
                drop_last=True)

        self.rollouts = RolloutStorage(
            self.num_steps, self.num_processes,
            self.envs.observation_space.shape, self.envs.action_space,
            self.actor_critic.recurrent_hidden_state_size)

    def load(self, path, version):
        policy = torch.load(os.path.join(path, "agent_{}.tar".format(version)))
        #utils.get_vec_normalize(self.envs).ob_rms = ob_rms
        self.actor_critic = policy
        return policy  # returned so callers (e.g. __init__) get the loaded network

    def save(self, path, version):
        #ob_rms = getattr(utils.get_vec_normalize(self.envs), 'ob_rms', None)
        torch.save(self.actor_critic,
                   os.path.join(path, "agent_{}.tar".format(version)))

    def report(self, version, total_num_steps, FPS, rewards):
        file_path = os.path.join(self.save_dir, "actor_critic_results.csv")
        add_header = not os.path.exists(file_path)
        if (len(rewards) > 0):
            mean, median, min, max = np.mean(rewards), np.median(
                rewards), np.min(rewards), np.max(rewards)
        else:
            mean, median, min, max = np.nan, np.nan, np.nan, np.nan
        with open(file_path, 'a+') as results:
            writer = csv.writer(results)
            if (add_header):
                header = [
                    'update', 'total_steps', 'FPS', 'mean_reward',
                    'median_reward', 'min_reward', 'max_reward'
                ]
                writer.writerow(header)
            writer.writerow(
                (version, total_num_steps, FPS, mean, median, min, max))

    def set_envs(self, level_path=None, num_envs=None):
        num_envs = num_envs if num_envs else self.num_processes
        if (level_path != self.level_path or self.envs is None
                or num_envs != self.num_envs):
            if (self.envs is not None):
                self.envs.close()
            self.level_path = level_path
            self.envs = make_vec_envs(self.env_def, level_path, self.seed,
                                      num_envs, self.gamma, self.log_dir,
                                      self.device, True)
            self.num_envs = num_envs

    def update_reconstruction(self, rollouts):
        s, p, l, w, h = list(rollouts.obs.size())
        x = rollouts.obs.view(-1, l, w, h)
        hidden = rollouts.recurrent_hidden_states.view(s * p, -1)
        mask = rollouts.masks.view(s * p, -1)
        #y = x.argmax(1)
        y = x
        self.r_optimizer.zero_grad()
        self.agent.optimizer.zero_grad()
        _, predictions, _ = self.actor_critic.base(x, hidden, mask)
        reconstructions = self.r_model(predictions)
        loss = self.r_loss(reconstructions, y)
        loss.backward()
        self.r_optimizer.step()
        self.agent.optimizer.step()
        return loss

    def update_reconstruct_next(self, rollouts):
        #Mask frames that are not relevant
        mask = rollouts.masks.unfold(0, 2, 1).min(-1)[0]
        mask = mask.view(-1)
        mask = torch.nonzero(mask).squeeze()

        #Image Pairs
        l, w, h = list(rollouts.obs.size())[2:]
        img_pairs = rollouts.obs.unfold(0, 2, 1)  #128, 8, 14, 12, 16, 2
        img_pairs = img_pairs.view(-1, l, w, h, 2)
        img_pairs = img_pairs[mask]
        x = img_pairs[:, :, :, :, 0]
        y = img_pairs[:, :, :, :, 1]

        #Input hidden states
        hidden_size = rollouts.recurrent_hidden_states.size(2)
        hidden = rollouts.recurrent_hidden_states[:-1].view(-1, hidden_size)  #129, 8, 512
        hidden = hidden[mask]

        #Update model
        self.r_optimizer.zero_grad()
        mask = torch.ones_like(mask).float().unsqueeze(1)
        _, predictions, _ = self.actor_critic.base(x, hidden, mask)
        reconstructions = self.r_model(predictions)
        loss = self.r_loss(reconstructions, y)  #model -> x or x and a? x already contains action features
        loss.backward()
        self.r_optimizer.step()
        print(loss.item())
        #add loss weight
        return loss

    def play(self, env, runs=1, visual=False):
        env = GridGame()  # note: overrides the env argument
        reward_mean = 0
        for i in range(runs):
            score = self.play_game(env, visual)
            reward_mean += score / runs
        return reward_mean

    def play_game(self, level, visual=False):
        # NOTE: 'level' and 'visual' are currently unused; env_name and ob_rms are
        # not defined in this excerpt.
        eval_envs = make_vec_envs(env_name, self.seed + self.num_processes,
                                  self.num_processes, None, self.eval_log_dir,
                                  self.device, True)
        vec_norm = utils.get_vec_normalize(eval_envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms

        eval_episode_rewards = []

        obs = eval_envs.reset()
        eval_recurrent_hidden_states = torch.zeros(
            self.num_processes,
            self.actor_critic.recurrent_hidden_state_size).to(self.device)
        eval_masks = torch.zeros(self.num_processes, 1).to(self.device)

        while len(eval_episode_rewards) < 10:
            with torch.no_grad():
                _, action, _, eval_recurrent_hidden_states = self.actor_critic.act(
                    obs,
                    eval_recurrent_hidden_states,
                    eval_masks,
                    deterministic=True)

            # Observe reward and next obs
            obs, _, done, infos = eval_envs.step(action)

            eval_masks = torch.tensor(
                [[0.0] if done_ else [1.0] for done_ in done],
                dtype=torch.float32).to(self.device)
            if all(done):  # every worker finished
                print("Done!")
        eval_envs.close()

    def train_agent(self, num_env_steps):
        env_name = self.env_def.name
        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        n = 30
        episode_rewards = deque(maxlen=n)
        episode_values = deque(maxlen=n)
        episode_end_values = deque(maxlen=n)
        episode_end_probs = deque(maxlen=n)
        episode_lengths = deque(maxlen=n)
        compile_est = deque(maxlen=n)
        first_steps = [True for i in range(self.num_processes)]

        start = time.time()
        num_updates = int(num_env_steps) // self.num_steps // self.num_processes
        for j in range(num_updates):

            if self.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    self.agent.optimizer, j, num_updates,
                    self.agent.optimizer.lr if self.algo == "acktr" else self.lr)

            for step in range(self.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, Q, action, action_prob, action_log_prob, recurrent_hidden_states = \
                        self.actor_critic.act(
                            self.rollouts.obs[step],
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = self.envs.step(action)

                for i, step in enumerate(first_steps):
                    if step:
                        episode_values.append(value[i].item())
                    elif (done[i]):
                        episode_end_values.append(Q[i].item())
                        episode_end_probs.append(action_log_prob[i].item())
                first_steps = done

                for worker, info in enumerate(infos):
                    if 'episode' in info.keys():
                        r = info['episode']['r']
                        l = info['episode']['l']
                        episode_rewards.append(r)
                        episode_lengths.append(l)
                        if (r < -1):
                            compile_est.append(value[worker].item())

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                self.rollouts.insert(obs, recurrent_hidden_states, action,
                                     action_prob, action_log_prob, value, Q,
                                     reward, masks, bad_masks)

            with torch.no_grad():
                next_value = self.actor_critic.get_value(
                    self.rollouts.obs[-1],
                    self.rollouts.recurrent_hidden_states[-1],
                    self.rollouts.masks[-1]).detach()

            if self.gail:
                if j >= 10:
                    self.envs.venv.eval()
                gail_epoch = self.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    self.gail_discr.update(
                        self.gail_train_loader, self.rollouts,
                        utils.get_vec_normalize(self.envs)._obfilt)
                for step in range(self.num_steps):
                    self.rollouts.rewards[step] = self.gail_discr.predict_reward(
                        self.rollouts.obs[step], self.rollouts.actions[step],
                        self.gamma, self.rollouts.masks[step])

            self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                          self.gae_lambda,
                                          self.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = self.agent.update(self.rollouts)
            if (self.reconstruct):
                recon_loss = self.update_reconstruction(self.rollouts)
                self.writer.add_scalar('generator/Reconstruction Loss',
                                       recon_loss.item(), self.total_steps)
            self.rollouts.after_update()

            #Tensorboard Reporting
            self.total_steps += self.num_processes * self.num_steps
            self.writer.add_scalar('value/Mean Reward',
                                   np.mean(episode_rewards), self.total_steps)
            self.writer.add_scalar('value/Episode Mean Length',
                                   np.mean(episode_lengths), self.total_steps)
            self.writer.add_scalar('policy/Action Loss', action_loss,
                                   self.total_steps)
            self.writer.add_scalar('value/Value Loss', value_loss,
                                   self.total_steps)
            self.writer.add_scalar('policy/Distribution Entropy', dist_entropy,
                                   self.total_steps)
            self.writer.add_scalar('value/Win Probability',
                                   np.mean(np.array(episode_rewards) > 0),
                                   self.total_steps)
            self.writer.add_scalar('value/Starting Value',
                                   np.mean(episode_values), self.total_steps)
            #self.writer.add_scalar('value/Ending Value', np.mean(episode_end_values), self.total_steps)
            self.writer.add_scalar('value/Log Probs',
                                   np.mean(episode_end_probs), self.total_steps)
            if (len(compile_est) > 0):
                self.writer.add_scalar('value/Compile Estimate',
                                       np.mean(compile_est), self.total_steps)

            # save for every interval-th episode or for the last epoch
            total_num_steps = (j + 1) * self.num_processes * self.num_steps
            end = time.time()
            if (j % self.save_interval == 0
                    or j == num_updates - 1) and self.save_dir != "":
                self.version += 1
                #self.save(self.version)
                self.report(self.version, total_num_steps,
                            int(total_num_steps / (end - start)),
                            episode_rewards)

            if j % self.log_interval == 0 and len(episode_rewards) > 1:
                print(
                    "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss))
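# Hypothetical usage sketch for the Agent class above. The environment definition is
# made up for illustration: the real env_def comes from the surrounding project and
# must provide at least .name and .model_shape, plus whatever make_vec_envs expects.
if __name__ == '__main__':
    env_def = make_env_def('zelda')           # hypothetical project helper, not a real API
    agent = Agent(env_def, processes=8, lr=2e-4, architecture='base')
    agent.train_agent(num_env_steps=1e6)      # collect rollouts, update, log to TensorBoard
    agent.save(agent.save_dir, agent.version)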
def initialise_policy(self):

    # initialise rollout storage for the policy
    self.policy_storage = OnlineStorage(
        self.args,
        self.args.policy_num_steps,
        self.args.num_processes,
        self.args.obs_dim,
        self.args.act_space,
        hidden_size=self.args.aggregator_hidden_size,
        latent_dim=self.args.latent_dim,
        normalise_observations=self.args.norm_obs_for_policy,
        normalise_rewards=self.args.norm_rew_for_policy,
    )

    # initialise policy network
    input_dim = self.args.obs_dim * int(self.args.condition_policy_on_state)
    input_dim += (1 + int(not self.args.sample_embeddings)) * self.args.latent_dim

    if hasattr(self.envs.action_space, 'low'):
        action_low = self.envs.action_space.low
        action_high = self.envs.action_space.high
    else:
        action_low = action_high = None

    policy_net = Policy(
        state_dim=input_dim,
        action_space=self.args.act_space,
        init_std=self.args.policy_init_std,
        hidden_layers=self.args.policy_layers,
        activation_function=self.args.policy_activation_function,
        normalise_actions=self.args.normalise_actions,
        action_low=action_low,
        action_high=action_high,
    ).to(device)

    # initialise policy trainer
    if self.args.policy == 'a2c':
        self.policy = A2C(
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            optimiser_vae=self.vae.optimiser_vae,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            alpha=self.args.a2c_alpha,
        )
    elif self.args.policy == 'ppo':
        self.policy = PPO(
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            optimiser_vae=self.vae.optimiser_vae,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            ppo_epoch=self.args.ppo_num_epochs,
            num_mini_batch=self.args.ppo_num_minibatch,
            use_huber_loss=self.args.ppo_use_huberloss,
            use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
            clip_param=self.args.ppo_clip_param,
        )
    else:
        raise NotImplementedError
def initialise_policy(self):

    if hasattr(self.envs.action_space, 'low'):
        action_low = self.envs.action_space.low
        action_high = self.envs.action_space.high
    else:
        action_low = action_high = None

    # initialise policy network
    policy_net = Policy(
        args=self.args,
        #
        pass_state_to_policy=self.args.pass_state_to_policy,
        pass_latent_to_policy=False,  # use metalearner.py if you want to use the VAE
        pass_belief_to_policy=self.args.pass_belief_to_policy,
        pass_task_to_policy=self.args.pass_task_to_policy,
        dim_state=self.args.state_dim,
        dim_latent=0,
        dim_belief=self.args.belief_dim,
        dim_task=self.args.task_dim,
        #
        hidden_layers=self.args.policy_layers,
        activation_function=self.args.policy_activation_function,
        policy_initialisation=self.args.policy_initialisation,
        #
        action_space=self.envs.action_space,
        init_std=self.args.policy_init_std,
        norm_actions_of_policy=self.args.norm_actions_of_policy,
        action_low=action_low,
        action_high=action_high,
    ).to(device)

    # initialise policy trainer
    if self.args.policy == 'a2c':
        policy = A2C(
            self.args,
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
        )
    elif self.args.policy == 'ppo':
        policy = PPO(
            self.args,
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            ppo_epoch=self.args.ppo_num_epochs,
            num_mini_batch=self.args.ppo_num_minibatch,
            use_huber_loss=self.args.ppo_use_huberloss,
            use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
            clip_param=self.args.ppo_clip_param,
        )
    else:
        raise NotImplementedError

    return policy
try:
    os.makedirs(args.model_dir)
except:
    shutil.rmtree(args.model_dir)
    os.makedirs(args.model_dir)

env = deepmind_lab.Lab('tests/empty_room_test', ['RGB_INTERLEAVED'],
                       config=CONFIG)
env.reset()
obs_shape = env.observations()['RGB_INTERLEAVED'].shape
obs_shape = (obs_shape[2], obs_shape[0], obs_shape[1])  # HWC -> CHW
print('Observation Space: ', obs_shape)
action_space = Discrete(9)
env.close()

actor_critic = Policy(obs_shape, action_space)
actor_critic.to(args.device)
learner = Learner(args, q_batch, actor_critic)

for i in range(len(LEVELS)):
    print('Build Actor {:d}'.format(i))
    rollouts = RolloutStorage(args.num_steps, 1, obs_shape, action_space,
                              actor_critic.recurrent_hidden_state_size)
    actor_critic = Policy(obs_shape, action_space)
    actor_critic.to(args.device)
    actor_name = 'actor_' + str(i)
    actor = Actor(args, q_trace, learner, actor_critic, rollouts, LEVELS[i],
                  actor_name)
    actors.append(actor)
def Policy(self):
    return Policy(self)
def initialise_policy(self):

    # variables for task encoder (used for oracle)
    state_dim = self.envs.observation_space.shape[0]

    # TODO: this isn't ideal, find a nicer way to get the task dimension!
    if 'BeliefOracle' in self.args.env_name:
        task_dim = gym.make(self.args.env_name).observation_space.shape[0] - \
                   gym.make(self.args.env_name.replace('BeliefOracle', '')).observation_space.shape[0]
        latent_dim = self.args.latent_dim
        state_embedding_size = self.args.state_embedding_size
        use_task_encoder = True
    elif 'Oracle' in self.args.env_name:
        task_dim = gym.make(self.args.env_name).observation_space.shape[0] - \
                   gym.make(self.args.env_name.replace('Oracle', '')).observation_space.shape[0]
        latent_dim = self.args.latent_dim
        state_embedding_size = self.args.state_embedding_size
        use_task_encoder = True
    else:
        task_dim = latent_dim = state_embedding_size = 0
        use_task_encoder = False

    # initialise rollout storage for the policy
    self.policy_storage = OnlineStorage(
        self.args,
        self.args.policy_num_steps,
        self.args.num_processes,
        self.args.obs_dim,
        self.args.act_space,
        hidden_size=0,
        latent_dim=self.args.latent_dim,
        normalise_observations=self.args.norm_obs_for_policy,
        normalise_rewards=self.args.norm_rew_for_policy,
    )

    if hasattr(self.envs.action_space, 'low'):
        action_low = self.envs.action_space.low
        action_high = self.envs.action_space.high
    else:
        action_low = action_high = None

    # initialise policy network
    policy_net = Policy(
        # general
        state_dim=int(self.args.condition_policy_on_state) * state_dim,
        action_space=self.envs.action_space,
        init_std=self.args.policy_init_std,
        hidden_layers=self.args.policy_layers,
        activation_function=self.args.policy_activation_function,
        use_task_encoder=use_task_encoder,
        # task encoding things (for oracle)
        task_dim=task_dim,
        latent_dim=latent_dim,
        state_embed_dim=state_embedding_size,
        #
        normalise_actions=self.args.normalise_actions,
        action_low=action_low,
        action_high=action_high,
    ).to(device)

    # initialise policy trainer
    if self.args.policy == 'a2c':
        # A2C
        self.policy = A2C(
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            alpha=self.args.a2c_alpha,
        )
    elif self.args.policy == 'ppo':
        # PPO
        self.policy = PPO(
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            ppo_epoch=self.args.ppo_num_epochs,
            num_mini_batch=self.args.ppo_num_minibatch,
            use_huber_loss=self.args.ppo_use_huberloss,
            use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
            clip_param=self.args.ppo_clip_param,
        )
    else:
        raise NotImplementedError
seg.eval()
if opt_seg.ft:
    checkpoint = torch.load(opt_seg.ft_resume)
    seg.module.load_state_dict(checkpoint['state_dict'], strict=False)
    # self.logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.ft_resume, checkpoint['epoch']))
####################################################

##### Policy #################################################
from models.policy import Policy
lr = 3e-4
gamma = 1
lambd_entropy = 0.3
# hidden_dim: # action choices
# policy = Policy(hidden_dim=2, rnn_type='lstm')
policy = Policy(hidden_dim=4, input_dim=23, rnn_type=None)
r_neg = 5
r_pos = 5
if evaluation == "policy":
    # checkpoint = torch.load("/home/chenwy/DynamicLightEnlighten/bdd100k_seg/policy_model/image.Lab.seg_0.75_msn.act4_adapt20.0.55.clip5.5_entropy0.3_lr3e3.800epoch_2019-04-02-03-48/model_best.pth.tar")
    # policy.load_state_dict(checkpoint['state_dict'])
    policy.eval()
else:
    # params_list = [{'params': policy.vgg.parameters(), 'lr': lr},]
    # params_list.append({'params': policy.rnn.parameters(), 'lr': lr*10})
    # params_list = [{'params': policy.resnet.parameters(), 'lr': lr},]
    ##################################
    # params_list = [{'params': policy.fcn.pretrained.parameters(), 'lr': lr*10},
    #                {'params': policy.fcn.head.parameters(), 'lr': lr*10}]
    ##################################
seg = DataParallelModel(seg).cuda()
seg.eval()
if opt_seg.ft:
    checkpoint = torch.load(opt_seg.ft_resume)
    seg.module.load_state_dict(checkpoint['state_dict'], strict=False)
    # self.logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.ft_resume, checkpoint['epoch']))
####################################################

##### Policy #################################################
from models.policy import Policy
lr = 3e-4
gamma = 1
lambd_entropy = 0.3
# policy = Policy(hidden_dim=2, rnn_type='lstm')
policy = Policy(hidden_dim=4, input_dim=23, rnn_type=None)
r_neg = 5
r_pos = 5
if evaluation:
    checkpoint = torch.load(
        "/home/chenwy/DynamicLightEnlighten/bdd100k_seg/policy_model/image_lstm.vgg.avgpool.argmax_delta.clip1.2.action-mean0.975_entropy.0_gamma.1_lr1e4_update.5_2019-02-01-13-05/model_best.pth.tar"
    )
    policy.load_state_dict(checkpoint['state_dict'])
    policy.eval()
else:
    # params_list = [{'params': policy.vgg.parameters(), 'lr': lr},]
    # params_list.append({'params': policy.rnn.parameters(), 'lr': lr*10})
    # params_list = [{'params': policy.resnet.parameters(), 'lr': lr},]
    ##################################
    # params_list = [{'params': policy.fcn.pretrained.parameters(), 'lr': lr*10},
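# The excerpt above only sets the policy-gradient hyperparameters (gamma, lambd_entropy,
# r_neg/r_pos); the update step itself is not shown. Below is a generic
# entropy-regularised REINFORCE-style loss using those names, as an illustration of how
# such hyperparameters are typically combined -- not this repository's update rule.
import torch

def policy_gradient_loss(log_probs, entropies, rewards, gamma=1.0, lambd_entropy=0.3):
    # log_probs, entropies, rewards: 1-D tensors over one episode
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):      # discounted return-to-go
        running = rewards[t] + gamma * running
        returns[t] = running
    # maximise return-weighted log-probability plus an entropy bonus
    return -(log_probs * returns).mean() - lambd_entropy * entropies.mean()

# e.g. loss = policy_gradient_loss(log_probs, entropies, rewards, gamma, lambd_entropy)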