from collections import deque

import numpy as np
import torch


def run_td3_train(env, agent, max_timesteps, logger, log_interval):
    """Off-policy TD3 training loop over a vectorized environment."""
    timestep_counter = 0
    total_updates = max_timesteps // env.num_envs
    epinfobuf = deque(maxlen=100)
    observations = env.reset()

    loss_a = 0
    loss_c = 0
    explained_var = 0

    while True:
        # collect training data
        mb_obs, mb_as, mb_dones, mb_rs, mb_obs_ = [], [], [], [], []
        epinfos = []
        for _ in range(0, agent.nsteps, env.num_envs):
            observations = torch.Tensor(observations)
            if timestep_counter > agent.learn_start_step:
                actions = agent.choose_action(observations)
                actions = actions.cpu().numpy().clip(env.action_space.low,
                                                     env.action_space.high)
            else:
                # warm-up phase: sample random actions before learning starts
                actions = np.asarray(
                    [env.action_space.sample() for _ in range(env.num_envs)],
                    dtype=np.float32)
            observations = observations.cpu().numpy()
            observations_, rewards, dones, infos = env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_obs.append(observations)
            mb_as.append(actions)
            mb_rs.append(rewards)
            mb_obs_.append(observations_)
            mb_dones.append(dones)
            observations = observations_
        epinfobuf.extend(epinfos)

        def reshape_data(arr):
            # (T, num_envs, ...) -> (T * num_envs, ...), time-major order
            s = arr.shape
            return arr.reshape(s[0] * s[1], *s[2:])

        mb_obs = reshape_data(np.asarray(mb_obs, dtype=np.float32))
        mb_rs = reshape_data(np.asarray(mb_rs, dtype=np.float32))
        mb_as = reshape_data(np.asarray(mb_as))
        mb_dones = reshape_data(np.asarray(mb_dones, dtype=np.uint8))
        mb_obs_ = reshape_data(np.asarray(mb_obs_, dtype=np.float32))

        # store transition
        transition = {
            'state': mb_obs if mb_obs.ndim == 2 else np.expand_dims(mb_obs, 1),
            'action': mb_as if mb_as.ndim == 2 else np.expand_dims(mb_as, 1),
            'reward': mb_rs if mb_rs.ndim == 2 else np.expand_dims(mb_rs, 1),
            'next_state': mb_obs_ if mb_obs_.ndim == 2 else np.expand_dims(mb_obs_, 1),
            'done': mb_dones if mb_dones.ndim == 2 else np.expand_dims(mb_dones, 1),
        }
        agent.store_transition(transition)

        # training controller
        timestep_counter += agent.nsteps
        if timestep_counter >= max_timesteps:
            break

        if timestep_counter > agent.batch_size:
            # update observation and reward running mean/var
            if agent.norm_ob:
                agent.ob_mean, agent.ob_var = env.ob_rms.mean, env.ob_rms.var
            if agent.norm_rw:
                agent.rw_mean, agent.rw_var = env.ret_rms.mean, env.ret_rms.var
            for _ in range(agent.nsteps):
                agent.learn()
                # optionally decay the actor and critic learning rates:
                # decay_coef = 1 - agent.learn_step_counter / total_updates
                # adjust_learning_rate(agent.optimizer_a, original_lr=agent.lr, decay_coef=decay_coef)
                # adjust_learning_rate(agent.optimizer_c, original_lr=agent.lrv, decay_coef=decay_coef)
                explained_var += 0.5 * explained_variance(agent.Qe1, agent.Qt)
                explained_var += 0.5 * explained_variance(agent.Qe2, agent.Qt)
                loss_a += agent.loss_a.item()
                loss_c += agent.loss_c.item()
                if agent.learn_step_counter % log_interval == 0:
                    print("------------------log information------------------")
                    print("total_timesteps:".ljust(20) + str(timestep_counter))
                    print("iterations:".ljust(20) + str(agent.learn_step_counter)
                          + " / " + str(int(total_updates)))
                    print("explained_var:".ljust(20) + str(explained_var / log_interval))
                    logger.add_scalar("explained_var/train",
                                      explained_var / log_interval, timestep_counter)
                    print("episode_len:".ljust(20) + "{:.1f}".format(
                        np.mean([epinfo['l'] for epinfo in epinfobuf])))
                    print("episode_rew:".ljust(20)
                          + str(np.mean([epinfo['r'] for epinfo in epinfobuf])))
                    logger.add_scalar("episode_reward/train",
                                      np.mean([epinfo['r'] for epinfo in epinfobuf]),
                                      timestep_counter)
                    print("max_episode_rew:".ljust(20)
                          + str(np.max([epinfo['r'] for epinfo in epinfobuf])))
                    print("min_episode_rew:".ljust(20)
                          + str(np.min([epinfo['r'] for epinfo in epinfobuf])))
                    print("loss_a:".ljust(20) + str(loss_a / log_interval))
                    logger.add_scalar("actor_loss/train", loss_a / log_interval,
                                      timestep_counter)
                    print("loss_c:".ljust(20) + str(loss_c / log_interval))
                    logger.add_scalar("critic_loss/train", loss_c / log_interval,
                                      timestep_counter)
                    print("action_noise_std:".ljust(20) + str(agent.noise))
                    explained_var = 0
                    loss_a = 0
                    loss_c = 0
    return agent
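# ---------------------------------------------------------------------------
# The training loops in this file call two helpers whose definitions are not
# shown in this excerpt. The sketches below are minimal stand-ins written to
# match the call sites; they are assumptions, not the repo's actual
# implementations. explained_variance follows the standard definition
# 1 - Var(y_true - y_pred) / Var(y_true), and adjust_learning_rate rescales
# an optimizer's learning rate by a decay coefficient.
# ---------------------------------------------------------------------------

def explained_variance(y_pred, y_true):
    # sketch (assumption): fraction of Var(y_true) explained by y_pred
    y_pred, y_true = np.asarray(y_pred), np.asarray(y_true)
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y


def adjust_learning_rate(optimizer, original_lr, decay_coef):
    # sketch (assumption): set every param group's lr to original_lr * decay_coef
    for param_group in optimizer.param_groups:
        param_group['lr'] = original_lr * decay_coef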
def run_pg_train(env, agent, max_timesteps, logger):
    """On-policy policy-gradient training loop over a vectorized environment."""
    timestep_counter = 0
    total_updates = max_timesteps // agent.nsteps
    epinfobuf = deque(maxlen=100)

    while True:
        mb_obs, mb_rewards, mb_actions, mb_dones, mb_logpacs, mb_obs_, \
            mb_mus, mb_sigmas, mb_distris = [], [], [], [], [], [], [], [], []
        epinfos = []
        observations = env.reset()
        for _ in range(0, agent.nsteps, env.num_envs):
            observations = torch.Tensor(observations)
            if not agent.dicrete_action:
                actions, mus, logsigmas, sigmas = agent.choose_action(observations)
                logp = agent.compute_logp(mus, logsigmas, sigmas, actions)
                mus = mus.cpu().numpy()
                sigmas = sigmas.cpu().numpy()
                mb_mus.append(mus)
                mb_sigmas.append(sigmas)
            else:
                actions, distris = agent.choose_action(observations)
                logp = agent.compute_logp(distris, actions)
                distris = distris.cpu().numpy()
                mb_distris.append(distris)
            observations = observations.cpu().numpy()
            actions = actions.cpu().numpy()
            logp = logp.cpu().numpy()
            observations_, rewards, dones, infos = env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_obs.append(observations)
            mb_actions.append(actions)
            mb_logpacs.append(logp)
            mb_dones.append(dones.astype(np.uint8))
            mb_rewards.append(rewards)
            mb_obs_.append(observations_)
            observations = observations_
        epinfobuf.extend(epinfos)

        # Mark all final states as done to prevent wrong estimates of returns
        # and advantages. done flag:
        #   0: not done and not the final state
        #   1: really done
        #   2: not done, but the final state of the rollout
        mb_dones[-1][np.where(mb_dones[-1] == 0)] = 2

        def reshape_data(arr):
            # (T, num_envs, ...) -> (T * num_envs, ...), grouped by env
            s = arr.shape
            return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

        mb_obs = reshape_data(np.asarray(mb_obs, dtype=np.float32))
        mb_rewards = reshape_data(np.asarray(mb_rewards, dtype=np.float32))
        mb_actions = reshape_data(np.asarray(mb_actions))
        mb_logpacs = reshape_data(np.asarray(mb_logpacs, dtype=np.float32))
        mb_dones = reshape_data(np.asarray(mb_dones, dtype=np.uint8))
        mb_obs_ = reshape_data(np.asarray(mb_obs_, dtype=np.float32))

        assert mb_obs.ndim <= 2 and mb_rewards.ndim <= 2 and mb_actions.ndim <= 2 and \
            mb_logpacs.ndim <= 2 and mb_dones.ndim <= 2 and mb_obs_.ndim <= 2, \
            "databuffer only supports batches of 1-D data."

        if not agent.dicrete_action:
            mb_mus = reshape_data(np.asarray(mb_mus, dtype=np.float32))
            mb_sigmas = reshape_data(np.asarray(mb_sigmas, dtype=np.float32))
            assert mb_mus.ndim <= 2 and mb_sigmas.ndim <= 2, \
                "databuffer only supports batches of 1-D data."
        else:
            mb_distris = reshape_data(np.asarray(mb_distris, dtype=np.float32))
            assert mb_distris.ndim <= 2, "databuffer only supports batches of 1-D data."

        # store transition
        transition = {
            'state': mb_obs if mb_obs.ndim == 2 else np.expand_dims(mb_obs, 1),
            'action': mb_actions if mb_actions.ndim == 2 else np.expand_dims(mb_actions, 1),
            'reward': mb_rewards if mb_rewards.ndim == 2 else np.expand_dims(mb_rewards, 1),
            'next_state': mb_obs_ if mb_obs_.ndim == 2 else np.expand_dims(mb_obs_, 1),
            'done': mb_dones if mb_dones.ndim == 2 else np.expand_dims(mb_dones, 1),
            'logpac': mb_logpacs if mb_logpacs.ndim == 2 else np.expand_dims(mb_logpacs, 1),
        }
        if not agent.dicrete_action:
            transition['mu'] = mb_mus if mb_mus.ndim == 2 else np.expand_dims(mb_mus, 1)
            transition['sigma'] = mb_sigmas if mb_sigmas.ndim == 2 else np.expand_dims(mb_sigmas, 1)
        else:
            transition['distri'] = mb_distris if mb_distris.ndim == 2 else np.expand_dims(mb_distris, 1)
        agent.store_transition(transition)

        # agent learning step
        agent.learn()

        # training controller
        timestep_counter += agent.nsteps
        if timestep_counter >= max_timesteps:
            break

        # decay the learning rates of the policy and the value function
        decay_coef = 1 - agent.learn_step_counter / total_updates
        adjust_learning_rate(agent.optimizer, original_lr=agent.lr, decay_coef=decay_coef)
        if agent.value_type is not None:
            adjust_learning_rate(agent.v_optimizer, original_lr=agent.lr_v, decay_coef=decay_coef)

        print("------------------log information------------------")
        print("total_timesteps:".ljust(20) + str(timestep_counter))
        print("iterations:".ljust(20) + str(agent.learn_step_counter)
              + " / " + str(int(total_updates)))
        if agent.value_type is not None:
            explained_var = explained_variance(agent.V.cpu().numpy(),
                                               agent.esti_R.cpu().numpy())
            print("explained_var:".ljust(20) + str(explained_var))
            logger.add_scalar("explained_var/train", explained_var, timestep_counter)
        print("episode_len:".ljust(20) + "{:.1f}".format(
            np.mean([epinfo['l'] for epinfo in epinfobuf])))
        print("episode_rew:".ljust(20) + str(np.mean([epinfo['r'] for epinfo in epinfobuf])))
        logger.add_scalar("episode_reward/train",
                          np.mean([epinfo['r'] for epinfo in epinfobuf]),
                          timestep_counter)
        print("mean_kl:".ljust(20) + str(agent.cur_kl))
        logger.add_scalar("mean_kl/train", agent.cur_kl, timestep_counter)
        print("policy_ent:".ljust(20) + str(agent.policy_ent))
        logger.add_scalar("policy_ent/train", agent.policy_ent, timestep_counter)
        print("policy_loss:".ljust(20) + str(agent.policy_loss))
        logger.add_scalar("policy_loss/train", agent.policy_loss, timestep_counter)
        print("value_loss:".ljust(20) + str(agent.value_loss))
        logger.add_scalar("value_loss/train", agent.value_loss, timestep_counter)

    return agent
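# ---------------------------------------------------------------------------
# Illustration (not part of the original training code): how the 0/1/2 done
# flags and the env-major reshape interact. With 2 envs and 3 vector steps,
# the last step of each unfinished trajectory is marked 2, so that return
# estimation can distinguish "episode really ended" (1) from "rollout was
# truncated here" (2):
#
#     import numpy as np
#     mb_dones = [np.array([0, 0]), np.array([1, 0]), np.array([0, 0])]
#     mb_dones[-1][np.where(mb_dones[-1] == 0)] = 2
#     arr = np.asarray(mb_dones, dtype=np.uint8)      # shape (T=3, num_envs=2)
#     s = arr.shape
#     flat = arr.swapaxes(0, 1).reshape(s[0] * s[1])  # env 0's steps first,
#     print(flat)                                     # then env 1's -> [0 1 2 0 0 2]
# ---------------------------------------------------------------------------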
def run_htrpo_train(env, agent, max_timesteps, logger,
                    eval_interval=None, num_evals=5, render=False):
    """HTRPO training loop for goal-conditioned environments whose dict
    observations carry 'observation', 'desired_goal' and 'achieved_goal'."""
    timestep_counter = 0
    total_updates = max_timesteps // agent.nsteps
    epinfobuf = deque(maxlen=100)
    success_history = deque(maxlen=100)
    ep_num = 0

    if eval_interval:
        eval_ret, eval_success = agent.eval_brain(env, render=render, eval_num=num_evals)
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("eval_ep_rew:".ljust(20) + str(np.mean(eval_ret)))
        print("eval_suc_rate:".ljust(20) + str(np.mean(eval_success)))
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        logger.add_scalar("episode_reward/eval", np.mean(eval_ret), timestep_counter)
        logger.add_scalar("success_rate/eval", np.mean(eval_success), timestep_counter)

    while True:
        mb_obs, mb_rewards, mb_actions, mb_dones, mb_logpacs, mb_obs_, \
            mb_mus, mb_sigmas, mb_distris = [], [], [], [], [], [], [], [], []
        mb_dg, mb_ag = [], []
        epinfos = []
        successes = []
        obs_dict = env.reset()
        for _ in range(0, agent.nsteps, env.num_envs):
            for key in obs_dict.keys():
                obs_dict[key] = torch.Tensor(obs_dict[key])
            if not agent.dicrete_action:
                actions, mus, logsigmas, sigmas = agent.choose_action(
                    obs_dict["observation"], other_data=obs_dict["desired_goal"])
                logp = agent.compute_logp(mus, logsigmas, sigmas, actions)
                mus = mus.cpu().numpy()
                sigmas = sigmas.cpu().numpy()
                mb_mus.append(mus)
                mb_sigmas.append(sigmas)
            else:
                actions, distris = agent.choose_action(
                    obs_dict["observation"], other_data=obs_dict["desired_goal"])
                logp = agent.compute_logp(distris, actions)
                distris = distris.cpu().numpy()
                mb_distris.append(distris)
            observations = obs_dict['observation'].cpu().numpy()
            actions = actions.cpu().numpy()
            logp = logp.cpu().numpy()
            # optional epsilon-random exploration; the threshold is 0.0, so
            # this branch is currently disabled
            if np.random.rand() < 0.0:
                actions = np.concatenate(
                    [np.expand_dims(env.action_space.sample(), axis=0)
                     for _ in range(env.num_envs)], axis=0)
            obs_dict_, rewards, dones, infos = env.step(actions)
            mb_obs.append(observations)
            mb_actions.append(actions)
            mb_logpacs.append(logp)
            mb_dones.append(dones.astype(np.uint8))
            mb_rewards.append(rewards)
            mb_obs_.append(obs_dict_['observation'].copy())
            mb_dg.append(obs_dict_['desired_goal'].copy())
            mb_ag.append(obs_dict_['achieved_goal'].copy())
            for e, info in enumerate(infos):
                if dones[e]:
                    epinfos.append(info.get('episode'))
                    successes.append(info.get('is_success'))
                    # the vectorized env auto-resets; the post-reset
                    # observation arrives through info['new_obs']
                    for k in obs_dict_.keys():
                        obs_dict_[k][e] = info.get('new_obs')[k]
                    ep_num += 1
            obs_dict = obs_dict_
        epinfobuf.extend(epinfos)
        success_history.extend(successes)

        # Mark all final states as done to prevent wrong estimates of returns
        # and advantages. done flag:
        #   0: not done and not the final state
        #   1: really done
        #   2: not done, but the final state of the rollout
        ep_num += (mb_dones[-1] == 0).sum()
        mb_dones[-1][np.where(mb_dones[-1] == 0)] = 2

        def reshape_data(arr):
            # (T, num_envs, ...) -> (T * num_envs, ...), grouped by env
            s = arr.shape
            return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

        mb_obs = reshape_data(np.asarray(mb_obs, dtype=np.float32))
        mb_rewards = reshape_data(np.asarray(mb_rewards, dtype=np.float32))
        mb_actions = reshape_data(np.asarray(mb_actions))
        mb_logpacs = reshape_data(np.asarray(mb_logpacs, dtype=np.float32))
        mb_dones = reshape_data(np.asarray(mb_dones, dtype=np.uint8))
        mb_obs_ = reshape_data(np.asarray(mb_obs_, dtype=np.float32))
        mb_ag = reshape_data(np.asarray(mb_ag, dtype=np.float32))
        mb_dg = reshape_data(np.asarray(mb_dg, dtype=np.float32))

        assert mb_rewards.ndim <= 2 and mb_actions.ndim <= 2 and \
            mb_logpacs.ndim <= 2 and mb_dones.ndim <= 2, \
            "databuffer only supports batches of 1-D data."

        if not agent.dicrete_action:
            mb_mus = reshape_data(np.asarray(mb_mus, dtype=np.float32))
            mb_sigmas = reshape_data(np.asarray(mb_sigmas, dtype=np.float32))
            assert mb_mus.ndim <= 2 and mb_sigmas.ndim <= 2, \
                "databuffer only supports batches of 1-D data."
        else:
            mb_distris = reshape_data(np.asarray(mb_distris, dtype=np.float32))
            assert mb_distris.ndim <= 2, "databuffer only supports batches of 1-D data."

        # store transition (ndim 4 covers image observations)
        transition = {
            'state': mb_obs if mb_obs.ndim in (2, 4) else np.expand_dims(mb_obs, 1),
            'action': mb_actions if mb_actions.ndim == 2 else np.expand_dims(mb_actions, 1),
            'reward': mb_rewards if mb_rewards.ndim == 2 else np.expand_dims(mb_rewards, 1),
            'next_state': mb_obs_ if mb_obs_.ndim in (2, 4) else np.expand_dims(mb_obs_, 1),
            'done': mb_dones if mb_dones.ndim == 2 else np.expand_dims(mb_dones, 1),
            'logpac': mb_logpacs if mb_logpacs.ndim == 2 else np.expand_dims(mb_logpacs, 1),
            'other_data': {
                'desired_goal': mb_dg if mb_dg.ndim == 2 else np.expand_dims(mb_dg, 1),
                'achieved_goal': mb_ag if mb_ag.ndim == 2 else np.expand_dims(mb_ag, 1),
            }
        }
        if not agent.dicrete_action:
            transition['mu'] = mb_mus if mb_mus.ndim == 2 else np.expand_dims(mb_mus, 1)
            transition['sigma'] = mb_sigmas if mb_sigmas.ndim == 2 else np.expand_dims(mb_sigmas, 1)
        else:
            transition['distri'] = mb_distris if mb_distris.ndim == 2 else np.expand_dims(mb_distris, 1)
        agent.store_transition(transition)

        # agent learning step
        agent.learn()

        # training controller
        timestep_counter += agent.nsteps
        if timestep_counter > max_timesteps:
            break

        print("------------------log information------------------")
        print("total_timesteps:".ljust(20) + str(timestep_counter))
        print("valid_ep_ratio:".ljust(20) + "{:.3f}".format(agent.n_valid_ep / ep_num))
        logger.add_scalar("valid_ep_ratio/train", agent.n_valid_ep / ep_num, timestep_counter)
        if agent.n_valid_ep > 0:
            print("iterations:".ljust(20) + str(agent.learn_step_counter)
                  + " / " + str(int(total_updates)))
            if agent.value_type is not None:
                explained_var = explained_variance(agent.V.cpu().numpy(),
                                                   agent.esti_R.cpu().numpy())
                print("explained_var:".ljust(20) + str(explained_var))
                logger.add_scalar("explained_var/train", explained_var, timestep_counter)
            print("episode_len:".ljust(20) + "{:.1f}".format(
                np.mean([epinfo['l'] for epinfo in epinfobuf])))
            # shift the logged return by the maximum episode length
            rew = np.mean([epinfo['r'] for epinfo in epinfobuf]) + agent.max_steps
            print("episode_rew:".ljust(20) + str(rew))
            logger.add_scalar("episode_reward/train", rew, timestep_counter)
            print("success_rate:".ljust(20)
                  + "{:.3f}".format(100 * np.mean(success_history)) + "%")
            logger.add_scalar("success_rate/train",
                              np.mean(success_history), timestep_counter)
            print("mean_kl:".ljust(20) + str(agent.cur_kl))
            logger.add_scalar("mean_kl/train", agent.cur_kl, timestep_counter)
            print("policy_ent:".ljust(20) + str(agent.policy_ent))
            logger.add_scalar("policy_ent/train", agent.policy_ent, timestep_counter)
            print("value_loss:".ljust(20) + str(agent.value_loss))
            logger.add_scalar("value_loss/train", agent.value_loss, timestep_counter)
            print("actual_imprv:".ljust(20) + "{:.5f}".format(agent.improvement))
            logger.add_scalar("actual_imprv/train", agent.improvement, timestep_counter)
            print("exp_imprv:".ljust(20) + "{:.5f}".format(agent.expected_improvement))
            logger.add_scalar("exp_imprv/train", agent.expected_improvement, timestep_counter)
            ep_num = 0
        else:
            print("No valid episode was collected. The policy has not been updated.")

        if eval_interval and timestep_counter % eval_interval == 0:
            agent.save_model("output/models/HTRPO")
            eval_ret, eval_success = agent.eval_brain(env, render=render, eval_num=num_evals)
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print("eval_ep_rew:".ljust(20) + str(np.mean(eval_ret)))
            print("eval_suc_rate:".ljust(20) + str(np.mean(eval_success)))
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            logger.add_scalar("episode_reward/eval", np.mean(eval_ret), timestep_counter)
            logger.add_scalar("success_rate/eval", np.mean(eval_success), timestep_counter)

    return agent
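# ---------------------------------------------------------------------------
# Example wiring (illustrative only; make_vec_env and HTRPO below are
# placeholder names, not necessarily this repo's actual constructors):
#
#     from torch.utils.tensorboard import SummaryWriter
#     env = make_vec_env("FetchReach-v1", num_envs=16)          # hypothetical
#     agent = HTRPO(env.observation_space, env.action_space)    # hypothetical
#     logger = SummaryWriter("output/logs/HTRPO")
#     run_htrpo_train(env, agent, max_timesteps=int(1e6), logger=logger,
#                     eval_interval=10000, num_evals=5)
#
# All three loops expect a vectorized environment exposing env.num_envs;
# run_htrpo_train additionally expects dict observations with "observation",
# "desired_goal" and "achieved_goal" keys, as in the gym robotics tasks.
# ---------------------------------------------------------------------------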