def learn(self): logger.info("Training") n_steps = 0 # best_success_rate = 0. for epoch in range(self.args.n_epochs): residual_losses = [] for _ in range(self.args.n_cycles): # Collect trajectories self.controller.reconfigure_heuristic(self.get_residual) n_steps += self.collect_trajectories( self.args.num_rollouts_per_mpi) # Update residual logger.debug("Updating") for _ in range(self.args.n_batches): residual_loss = self._update_residual() residual_losses.append( residual_loss.detach().cpu().numpy()) logger.debug('Loss', residual_loss) self._update_target_network(self.residual_target, self.residual) success_rate = self.eval_agent() if MPI.COMM_WORLD.Get_rank() == 0: print( '[{}] epoch is: {}, Num steps: {}, eval success rate is: {:.3f}' .format(datetime.now(), epoch, n_steps, success_rate)) logger.record_tabular('epoch', epoch) logger.record_tabular('n_steps', n_steps) logger.record_tabular('success_rate', success_rate) logger.record_tabular('residual_loss', np.mean(residual_losses)) logger.dump_tabular()
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    self.T = self.env_params['max_timesteps']
    # create the networks: one actor and one critic per time step
    self.actor_networks = [
        residualactor(env_params) for _ in range(self.T)
    ]
    self.critic_networks = [
        residualcritic(env_params) for _ in range(self.T)
    ]
    # sync the networks across the cpus
    sync_all_networks(self.actor_networks)
    sync_all_networks(self.critic_networks)
    # if use gpu
    if self.args.cuda:
        _ = [self.actor_networks[i].cuda() for i in range(self.T)]
        _ = [self.critic_networks[i].cuda() for i in range(self.T)]
    # create one optimizer per time step
    self.actor_optims = [
        torch.optim.Adam(self.actor_networks[i].parameters(),
                         lr=self.args.lr_actor) for i in range(self.T)
    ]
    self.critic_optims = [
        torch.optim.Adam(self.critic_networks[i].parameters(),
                         lr=self.args.lr_critic) for i in range(self.T)
    ]
    # her sampler
    self.her_module = residual_her_sampler(self.args.replay_strategy,
                                           self.args.replay_k,
                                           self.env.compute_reward)
    # create the replay buffer
    self.buffer = residual_replay_buffer(
        self.env_params, self.args.buffer_size,
        self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'],
                             default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'],
                             default_clip_range=self.args.clip_range)
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir,
                                       self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
    logger.info("initialized agent")
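
# Hedged sketch (not part of the original source): one way `sync_all_networks`
# could broadcast the rank-0 parameters of every per-timestep network to all MPI
# workers, so that training starts from identical weights on every process. The
# use of parameters_to_vector / vector_to_parameters is an illustrative choice,
# not necessarily what the original helper does.
from mpi4py import MPI
import numpy as np
import torch
from torch.nn.utils import parameters_to_vector, vector_to_parameters


def sync_all_networks(networks):
    comm = MPI.COMM_WORLD
    for network in networks:
        # flatten the parameters into a single contiguous numpy buffer
        flat_params = parameters_to_vector(
            network.parameters()).detach().cpu().numpy()
        # broadcast rank 0's parameters to every worker (in place)
        comm.Bcast(flat_params, root=0)
        # copy the broadcast values back into the network
        vector_to_parameters(
            torch.from_numpy(flat_params).float(), network.parameters())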
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params, residual=True)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params, residual=True)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(
        self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(
        self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                        lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                         lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy,
                                  self.args.replay_k,
                                  self.env.compute_reward,
                                  self.env.extract_features)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'],
                             default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'],
                             default_clip_range=self.args.clip_range)
    self.f_norm = normalizer(size=env_params['num_features'])
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir,
                                       self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
    logger.info("initialized agent")
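
# Hedged sketch (not part of the original source): a plausible `_preproc_inputs`
# matching how the learn() methods below use it: clip and normalize the
# observation and goal, concatenate them, and return a batched float tensor.
# The clip bound `self.args.clip_obs` is an assumption for illustration, as are
# the module-level `np`/`torch` imports.
def _preproc_inputs(self, obs, g):
    obs = np.clip(obs, -self.args.clip_obs, self.args.clip_obs)
    g = np.clip(g, -self.args.clip_obs, self.args.clip_obs)
    obs_norm = self.o_norm.normalize(obs)
    g_norm = self.g_norm.normalize(g)
    inputs = np.concatenate([obs_norm, g_norm])
    input_tensor = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
    if self.args.cuda:
        input_tensor = input_tensor.cuda()
    return input_tensor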
def learn(self): """ train the network """ logger.info("Training..") n_steps = 0 best_success_rate = 0. # start to collect samples for epoch in range(self.args.n_epochs): actor_losses = [] critic_losses = [] switch_losses = [] for _ in range(self.args.n_cycles): mb_obs, mb_ag, mb_g, mb_actions, mb_switch_actions = [], [], [], [], [] for _ in range(self.args.num_rollouts_per_mpi): # reset the rollouts ep_obs, ep_ag, ep_g, ep_actions, ep_switch_actions = [], [], [], [], [] # reset the environment observation = self.env.reset() obs = observation['observation'] ag = observation['achieved_goal'] g = observation['desired_goal'] # start to collect samples for _ in range(self.env_params['max_timesteps']): n_steps += 1 with torch.no_grad(): input_tensor = self._preproc_inputs(obs, g) pi = self.actor_network(input_tensor) _, switch_actions_q_values = self.critic_switch_network( input_tensor, pi) switch_action = self._select_switch_actions( switch_actions_q_values) # feed the actions into the environment if switch_action == 0: # Hardcoded action action = self.controller.act(observation) else: # Learned policy action action = self._select_actions(pi) observation_new, _, _, info = self.env.step(action) obs_new = observation_new['observation'] ag_new = observation_new['achieved_goal'] # append rollouts ep_obs.append(obs.copy()) ep_ag.append(ag.copy()) ep_g.append(g.copy()) ep_actions.append(action.copy()) ep_switch_actions.append(switch_action) # re-assign the observation obs = obs_new ag = ag_new observation = observation_new ep_obs.append(obs.copy()) ep_ag.append(ag.copy()) mb_obs.append(ep_obs) mb_ag.append(ep_ag) mb_g.append(ep_g) mb_actions.append(ep_actions) mb_switch_actions.append(ep_switch_actions) # convert them into arrays mb_obs = np.array(mb_obs) mb_ag = np.array(mb_ag) mb_g = np.array(mb_g) mb_actions = np.array(mb_actions) mb_switch_actions = np.array(mb_switch_actions) # store the episodes self.buffer.store_episode( [mb_obs, mb_ag, mb_g, mb_actions, mb_switch_actions]) self._update_normalizer( [mb_obs, mb_ag, mb_g, mb_actions, mb_switch_actions]) for _ in range(self.args.n_batches): # train the network critic_loss, actor_loss, switch_loss = self._update_network( ) actor_losses.append(actor_loss.detach().numpy()) critic_losses.append(critic_loss.detach().numpy()) switch_losses.append(switch_loss.detach().numpy()) # soft update self._soft_update_target_network(self.actor_target_network, self.actor_network) self._soft_update_target_network( self.critic_switch_target_network, self.critic_switch_network) # start to do the evaluation success_rate, prop_hardcoded = self._eval_agent() if MPI.COMM_WORLD.Get_rank() == 0: print( '[{}] epoch is: {}, Num steps: {}, eval success rate is: {:.3f}' .format(datetime.now(), epoch, n_steps, success_rate)) logger.record_tabular('epoch', epoch) logger.record_tabular('n_steps', n_steps) logger.record_tabular('success_rate', success_rate) logger.record_tabular('prop_hardcoded', prop_hardcoded) logger.record_tabular('actor_loss', np.mean(actor_losses)) logger.record_tabular('critic_loss', np.mean(critic_losses)) logger.record_tabular('switch_loss', np.mean(switch_losses)) logger.dump_tabular() if success_rate > best_success_rate: logger.info("Better success rate... Saving policy") torch.save([ self.o_norm.mean, self.o_norm.std, self.g_norm.mean, self.g_norm.std, self.actor_network.state_dict() ], self.model_path + '/model.pt') best_success_rate = success_rate
def learn(self): logger.info("Training") # ILC loop # 1. Train the model on real environment transitions # 2. Plan in the model to get the optimal policy, and the direction of improvement # 3. Do line search on the real environment to find the right step size initial_residual_parameters = copy.deepcopy(self.residual.state_dict()) for epoch in range(self.args.n_epochs): # 0. Fix start and goals self.populate_sim_states_and_goals() # 1. Plan in the model to get the optimal policy logger.info("Improving policy in the model") residual_losses = [] for _ in range(self.args.n_cycles): # Collect trajectories self.controller.reconfigure_heuristic(self.get_residual) self.controller.reconfigure_dynamics( self.get_dynamics_residual) self.collect_trajectories( self.args.num_rollouts_per_mpi) # Update residual logger.info("Updating") for _ in range(self.args.n_batches): residual_loss = self._update_residual() residual_losses.append( residual_loss.detach().cpu().numpy()) logger.info('Residual Loss', residual_loss.item()) self._update_target_network( self.residual_target, self.residual) if not self.args.planning: # Get the direction of improvement logger.info("Computing direction of improvement") final_residual_parameters = copy.deepcopy( self.residual.state_dict()) gradient = {} for key in initial_residual_parameters.keys(): gradient[key] = final_residual_parameters[key] - \ initial_residual_parameters[key] # 2. Line search in the real world logger.info("Line search in the real world") logger.info("Evaluating initial policy in the real world") initial_real_value_estimate = self.evaluate_real_world( initial_residual_parameters) logger.info("Initial cost-to-go", initial_real_value_estimate) alpha = 1.0 while True: logger.info("Alpha", alpha) current_residual_parameters = {} for key in initial_residual_parameters.keys(): current_residual_parameters[key] = initial_residual_parameters[key] + \ alpha * gradient[key] current_real_value_estimate = self.evaluate_real_world( current_residual_parameters) logger.info("Current cost-to-go", current_real_value_estimate) if current_real_value_estimate < initial_real_value_estimate: # Cost to go decreased - found an alpha logger.info("Initial cost-to-go", initial_real_value_estimate, "Final cost-to-go", current_real_value_estimate) initial_real_value_estimate = current_real_value_estimate initial_residual_parameters = copy.deepcopy( current_residual_parameters) break else: # Decrease alpha alpha *= 0.5 if alpha < self.args.alpha_threshold: # If alpha is really really small # Don't update the residual logger.info( "Alpha really small. Not updating residual") logger.info("Best cost-to-go so far", initial_real_value_estimate) break # Assign chosen residual parameters for the residual self.residual.load_state_dict(initial_residual_parameters) self.residual_target.load_state_dict( initial_residual_parameters) logger.info("Evaluating") success_rate = self.eval_agent() if not self.args.planning: # 3. 
Train model on real world transitions collected so far logger.info("Training model residual using real world samples") model_losses = [] for _ in range(self.args.n_model_batches): model_loss = self._update_model() model_losses.append(model_loss.detach().cpu().numpy()) logger.info('Model Loss', model_loss.item()) else: model_losses = [0] if MPI.COMM_WORLD.Get_rank() == 0: print('[{}] epoch is: {}, Num planning steps: {}, Num real steps: {}, eval success rate is: {:.3f}'.format( datetime.now(), epoch, self.n_planning_steps, self.n_real_steps, success_rate)) logger.record_tabular('epoch', epoch) logger.record_tabular('n_planning_steps', self.n_planning_steps) logger.record_tabular('n_real_steps', self.n_real_steps) logger.record_tabular('success_rate', success_rate) logger.record_tabular( 'residual_loss', np.mean(residual_losses)) logger.record_tabular('model_loss', np.mean(model_losses)) # logger.record_tabular( # 'cost-to-go', initial_real_value_estimate) logger.dump_tabular()
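
# Hedged sketch (not part of the original source): one plausible implementation
# of `evaluate_real_world`, used by the line search above. It loads candidate
# residual parameters and estimates the average cost-to-go over a few real
# rollouts. The unit cost per step, the `is_success` key in the env's info dict,
# and the `n_rollouts` parameter are all illustrative assumptions.
def evaluate_real_world(self, residual_parameters, n_rollouts=10):
    self.residual.load_state_dict(residual_parameters)
    total_cost = 0.0
    for _ in range(n_rollouts):
        observation = self.env.reset()
        for _ in range(self.env_params['max_timesteps']):
            # The controller queries the residual-corrected heuristic internally
            action = self.controller.act(observation)
            observation, _, _, info = self.env.step(action)
            total_cost += 1.0  # unit cost per step until the goal is reached
            if info.get('is_success', False):
                break
    return total_cost / n_rollouts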
def learn(self): """ train the network """ logger.info("Training..") n_steps = 0 best_success_rate = 0. prev_actor_losses = [0.0] actor_losses = [0.0] critic_losses = [] original_actor_lr = self.args.lr_actor coin_flipping = False # start to collect samples for epoch in range(self.args.n_epochs): # If residual, then account for burn-in period by monitoring the decrement in loss if (epoch == 0 or abs(np.mean(actor_losses) - np.mean(prev_actor_losses)) > self.args.threshold): # Do not update actor, just update critic logger.info('Only training critic') self.change_actor_lr(0.0) coin_flipping = True else: # Update actor as well self.change_actor_lr(original_actor_lr) coin_flipping = False prev_actor_losses = actor_losses actor_losses = [] critic_losses = [] for _ in range(self.args.n_cycles): mb_obs, mb_ag, mb_g, mb_actions, mb_f = [], [], [], [], [] for _ in range(self.args.num_rollouts_per_mpi): # reset the rollouts ep_obs, ep_ag, ep_g, ep_actions, ep_f = [], [], [], [], [] # reset the environment observation = self.env.reset() obs = observation['observation'] ag = observation['achieved_goal'] g = observation['desired_goal'] f = self.env.extract_features(obs, g) # start to collect samples for t in range(self.env_params['max_timesteps']): n_steps += 1 with torch.no_grad(): input_tensor = self._preproc_inputs(obs, g) pi = self.actor_network(input_tensor) action = self._select_actions(pi, coin_flipping) # feed the actions into the environment observation_new, _, _, info = self.env.step(action) obs_new = observation_new['observation'] ag_new = observation_new['achieved_goal'] # append rollouts ep_obs.append(obs.copy()) ep_ag.append(ag.copy()) ep_g.append(g.copy()) ep_actions.append(action.copy()) ep_f.append(f.copy()) # re-assign the observation obs = obs_new ag = ag_new f = self.env.extract_features(obs, g) ep_obs.append(obs.copy()) ep_ag.append(ag.copy()) ep_f.append(f.copy()) mb_obs.append(ep_obs) mb_ag.append(ep_ag) mb_g.append(ep_g) mb_actions.append(ep_actions) mb_f.append(ep_f) # convert them into arrays mb_obs = np.array(mb_obs) mb_ag = np.array(mb_ag) mb_g = np.array(mb_g) mb_actions = np.array(mb_actions) mb_f = np.array(mb_f) # store the episodes self.buffer.store_episode( [mb_obs, mb_ag, mb_g, mb_actions, mb_f]) self._update_normalizer( [mb_obs, mb_ag, mb_g, mb_actions, mb_f]) for _ in range(self.args.n_batches): # train the network critic_loss, actor_loss = self._update_network() actor_losses.append(actor_loss.detach().numpy()) critic_losses.append(critic_loss.detach().numpy()) # soft update self._soft_update_target_network(self.actor_target_network, self.actor_network) self._soft_update_target_network(self.critic_target_network, self.critic_network) # start to do the evaluation success_rate = self._eval_agent() if MPI.COMM_WORLD.Get_rank() == 0: print( '[{}] epoch is: {}, Num steps: {}, eval success rate is: {:.3f}' .format(datetime.now(), epoch, n_steps, success_rate)) logger.record_tabular('epoch', epoch) logger.record_tabular('n_steps', n_steps) logger.record_tabular('success_rate', success_rate) logger.record_tabular('actor_loss', np.mean(actor_losses)) logger.record_tabular('critic_loss', np.mean(critic_losses)) logger.dump_tabular() if success_rate > best_success_rate: logger.info("Better success rate... Saving policy") torch.save([ self.o_norm.mean, self.o_norm.std, self.g_norm.mean, self.g_norm.std, self.actor_network.state_dict() ], self.model_path + '/model.pt') best_success_rate = success_rate
def learn(self): """ train the network """ logger.info("Training..") n_psdp_iters = 0 n_steps = 0 best_success_rate = 0. epoch = 0 success_rate = self._eval_agent() if MPI.COMM_WORLD.Get_rank() == 0: print( '[{}] epoch is: {}, Num steps: {}, eval success rate is: {:.3f}' .format(datetime.now(), epoch, n_steps, success_rate)) # start to collect samples #assert self.args.n_cycles == self.T, "Number of cycles should be equal to horizon" #actor_losses, prev_actor_losses = [0.], [0.] #critic_losses, prev_critic_losses = [0.], [0.] current_t = self.T for epoch in range(self.args.n_epochs): # TODO: Burn-in critic? #prev_actor_losses = actor_losses #prev_critic_losses = critic_losses actor_losses = [] critic_losses = [] if epoch % 10 == 0: current_t = current_t - 1 logger.info("Training residual policy at time step {}".format( current_t)) # TODO: Update actors one at a time by monitoring corresponding critic loss # Once the critic has been sufficiently trained, then we can start training the actor # at that time-step before moving onto the next time-step for _ in range(self.args.n_cycles): # current_t -= 1 # if (current_t + 1) % 10 == 0: # logger.info( # "Training residual policy at time step {}".format(current_t)) # for current_t in range(self.T-1, -1, -1): mb_obs, mb_ag, mb_g, mb_actions = [], [], [], [] for _ in range(self.args.num_rollouts_per_mpi): # reset the rollouts ep_obs, ep_ag, ep_g, ep_actions = [], [], [], [] # reset the environment observation = self.env.reset() obs = observation['observation'] ag = observation['achieved_goal'] g = observation['desired_goal'] # start to collect samples for t in range(self.env_params['max_timesteps']): n_steps += 1 with torch.no_grad(): input_tensor = self._preproc_inputs(obs, g) if t == current_t: # Use untrained residual policy pi = self.actor_networks[t](input_tensor) action = self._select_actions(pi) # elif t > current_t: else: # Use current trained policy # If it has not been trained, it will predict zeros as # a result of our initialization pi = self.actor_networks[t](input_tensor) action = pi.cpu().numpy().squeeze() # feed the actions into the environment observation_new, _, _, info = self.env.step(action) obs_new = observation_new['observation'] ag_new = observation_new['achieved_goal'] # append rollouts ep_obs.append(obs.copy()) ep_ag.append(ag.copy()) ep_g.append(g.copy()) ep_actions.append(action.copy()) # re-assign the observation obs = obs_new ag = ag_new ep_obs.append(obs.copy()) ep_ag.append(ag.copy()) mb_obs.append(ep_obs) mb_ag.append(ep_ag) mb_g.append(ep_g) mb_actions.append(ep_actions) # convert them into arrays mb_obs = np.array(mb_obs) mb_ag = np.array(mb_ag) mb_g = np.array(mb_g) mb_actions = np.array(mb_actions) # store the episodes self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions]) self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions], current_t) for _ in range(self.args.n_batches): # train the network critic_loss, actor_loss = self._update_network(current_t) critic_losses.append(critic_loss.detach().numpy()) actor_losses.append(actor_loss.detach().numpy()) # soft update # self._soft_update_target_network( # self.actor_target_networks[current_t], self.actor_networks[current_t]) # FIX: No target network updates # self._soft_update_target_network( # self.critic_target_network, self.critic_network) # self._hard_update_target_network( # self.critic_target_network, self.critic_network) # start to do the evaluation success_rate = self._eval_agent() if MPI.COMM_WORLD.Get_rank() == 0: print( '[{}] epoch is: {}, Current 
time step : {}, Num steps: {}, eval success rate is: {:.3f}' .format(datetime.now(), epoch, current_t, n_steps, success_rate)) logger.record_tabular('epoch', epoch) logger.record_tabular('n_steps', n_steps) logger.record_tabular('success_rate', success_rate) logger.record_tabular('actor_loss', np.mean(actor_losses)) logger.record_tabular('critic_loss', np.mean(critic_losses)) logger.dump_tabular() if success_rate > best_success_rate: logger.info("Better success rate... Saving policy") # torch.save([self.o_norm.mean, self.o_norm.std, self.g_norm.mean, self.g_norm.std, self.actor_network.state_dict()], # self.model_path + '/model.pt') torch.save([ self.o_norm.mean, self.o_norm.std, self.g_norm.mean, self.g_norm.std ] + [ self.actor_networks[t].state_dict() for t in range(self.T) ], self.model_path + '/model.pt') best_success_rate = success_rate
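
# Hedged sketch (not part of the original source): a standard exploration rule
# for `_select_actions` as used above: add Gaussian noise to the deterministic
# action, clip it to the valid range, and occasionally replace it with a uniform
# random action. The hyperparameter names (`noise_eps`, `random_eps`) and the
# `env_params` keys used here are assumptions for illustration.
def _select_actions(self, pi):
    action = pi.cpu().numpy().squeeze()
    max_action = self.env_params['action_max']
    # add Gaussian exploration noise and clip to the valid range
    action += self.args.noise_eps * max_action * np.random.randn(*action.shape)
    action = np.clip(action, -max_action, max_action)
    # with probability random_eps, act uniformly at random
    random_action = np.random.uniform(low=-max_action, high=max_action,
                                      size=self.env_params['action'])
    action += np.random.binomial(1, self.args.random_eps) * (random_action - action)
    return action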