def onpolicy_inference():
    env = make_vec_envs(
        args.env_name,
        args.seed + 1000,
        1,
        None,
        None,
        device='cuda:0',
        allow_early_resets=False,
        env_kwargs=env_kwargs,
    )
    env_obj = env.venv.venv.envs[0].env.env
    if args.env_name.find('door') <= -1:
        env_obj.unity = None

    render_func = get_render_func(env)
    if evaluation and not render:
        render_func = None

    if env_kwargs['visionnet_input']:
        visionmodel = VisionModelXYZ()
        visionmodel = load_visionmodel(args.load_name, args.visionmodel_path,
                                       VisionModelXYZ())

    actor_critic, ob_rms = torch.load(args.load_name)
    actor_critic = actor_critic.eval()
    if env_kwargs['visionnet_input'] and args.env_name.find('doorenv') > -1:
        actor_critic.visionmodel = visionmodel
        actor_critic.visionnet_input = env_obj.visionnet_input
    actor_critic.to("cuda:0")
    if args.env_name.find('doorenv') > -1:
        actor_critic.nn = env_obj.nn

    recurrent_hidden_states = torch.zeros(
        1, actor_critic.recurrent_hidden_state_size)
    masks = torch.zeros(1, 1)

    knob_noisy = args.knob_noisy

    def add_noise(obs, epoch=100):
        # Gaussian noise on the knob-position part of the observation, scaled
        # by the vision model's standard deviation for the arm and annealed in
        # over the first `saturation` epochs.
        saturation = 100.
        sdv = torch.tensor([
            3.440133806003181, 3.192113342496682, 1.727412865751099
        ]) / saturation  # vision-model SDV for the arm
        noise = torch.distributions.Normal(torch.tensor([0.0, 0.0, 0.0]),
                                           sdv).sample().cuda()
        noise *= min(1., epoch / saturation)
        obs[:, -3:] += noise
        return obs

    full_obs = env.reset()
    # print("init obs", full_obs)
    initial_state = full_obs[:, 2:2 + env.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and env_obj.visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs)
        else:
            obs = full_obs

    if render_func is not None:
        render_func('human')

    # Pick the door-hinge joint index in qpos for the robot/end-effector
    # combination encoded in the MuJoCo XML path.
    if args.env_name.find('doorenv') > -1:
        if env_obj.xml_path.find("baxter") > -1:
            doorhinge_idx = 20
        elif env_obj.xml_path.find("float") > -1:
            if env_obj.xml_path.find("hook") > -1:
                doorhinge_idx = 6
            elif env_obj.xml_path.find("gripper") > -1:
                doorhinge_idx = 11
        else:
            if env_obj.xml_path.find("mobile") > -1:
                if env_obj.xml_path.find("hook") > -1:
                    doorhinge_idx = 9
                if env_obj.xml_path.find("gripper") > -1:
                    doorhinge_idx = 14
            else:
                if env_obj.xml_path.find("hook") > -1:
                    doorhinge_idx = 7
                if env_obj.xml_path.find("gripper") > -1:
                    doorhinge_idx = 12

    start_time = int(time.mktime(time.localtime()))

    i = 0
    epi_step = 0
    total_time = 0
    epi_counter = 1
    dooropen_counter = 0
    door_opened = False
    test_num = 100

    while True:
        with torch.no_grad():
            value, action, _, recurrent_hidden_states = actor_critic.act(
                obs, recurrent_hidden_states, masks, deterministic=args.det)

        next_action = action

        if i % 511 == 0:
            current_state = initial_state

        pos_control = False
        if pos_control:
            frame_skip = 1
            if i % (512 / frame_skip - 1) == 0:
                current_state = initial_state
            next_action = current_state + next_action
            for kk in range(frame_skip):
                full_obs, reward, done, infos = env.step(next_action)
        else:
            full_obs, reward, done, infos = env.step(next_action)
        current_state = full_obs[:, 2:2 + env.action_space.shape[0]]

        if args.env_name.find('doorenv') > -1 and env_obj.visionnet_input:
            obs = actor_critic.obs2inputs(full_obs, 0)
        else:
            if knob_noisy:
                obs = add_noise(full_obs)
            else:
                obs = full_obs

        masks.fill_(0.0 if done else 1.0)

        if render_func is not None:
            render_func('human')

        i += 1
        epi_step += 1

        if args.env_name.find('doorenv') > -1:
            # The door counts as opened once the hinge displacement exceeds 0.2.
            if not door_opened and abs(
                    env_obj.sim.data.qpos[doorhinge_idx]) >= 0.2:
                dooropen_counter += 1
                opening_time = epi_step / 50
                print("door opened! opening time is {}".format(opening_time))
                total_time += opening_time
                door_opened = True

        if args.env_name.find('Fetch') > -1:
            if not door_opened and infos[0]['is_success'] == 1:
                dooropen_counter += 1
                opening_time = epi_step / 50
                print("Reached destination! Time is {}".format(opening_time))
                total_time += opening_time
                door_opened = True

        if evaluation:
            # Episodes run for 512 steps; rebuild the environment (and close
            # the Unity viewer, if any) at the end of each one.
            if i % 512 == 511:
                if env_obj.unity:
                    env_obj.close()
                env = make_vec_envs(
                    args.env_name,
                    args.seed + 1000,
                    1,
                    None,
                    None,
                    device='cuda:0',
                    allow_early_resets=False,
                    env_kwargs=env_kwargs,
                )
                if render:
                    render_func = get_render_func(env)
                env_obj = env.venv.venv.envs[0].env.env
                if args.env_name.find('doorenv') <= -1:
                    env_obj.unity = None
                env.reset()
                print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
                eval_print(dooropen_counter, epi_counter, start_time,
                           total_time)
                epi_counter += 1
                epi_step = 0
                door_opened = False

            if i >= 512 * test_num:
                eval_print(dooropen_counter, epi_counter - 1, start_time,
                           total_time)
                break
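
# NOTE (sketch): onpolicy_main() below and the argument-taking
# onpolicy_inference() further down call add_noise(obs, epoch) without defining
# it locally. If it is not imported or defined elsewhere in this module, a
# module-level helper mirroring the nested one above would look like the
# following (kept commented out so it cannot shadow an existing definition; the
# constants are the same vision-model standard deviations used above):
#
# def add_noise(obs, epoch=100):
#     saturation = 100.
#     sdv = torch.tensor([3.440133806003181, 3.192113342496682,
#                         1.727412865751099]) / saturation
#     noise = torch.distributions.Normal(torch.tensor([0.0, 0.0, 0.0]),
#                                        sdv).sample().cuda()
#     noise *= min(1., epoch / saturation)
#     obs[:, -3:] += noise
#     return obs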
def onpolicy_main():
    print("onpolicy main")
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make vector env
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        env_kwargs=env_kwargs,
    )

    # Ugly way to access the environment attributes through the wrappers.
    if args.env_name.find('doorenv') > -1:
        if args.num_processes > 1:
            visionnet_input = envs.venv.venv.visionnet_input
            nn = envs.venv.venv.nn
            env_name = envs.venv.venv.xml_path
        else:
            visionnet_input = envs.venv.venv.envs[0].env.env.env.visionnet_input
            nn = envs.venv.venv.envs[0].env.env.env.nn
            env_name = envs.venv.venv.envs[0].env.env.env.xml_path
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy(dummy_obs.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    # Disable the observation normalizer.
    vec_norm = get_vec_normalize(envs)
    vec_norm.eval()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              dummy_obs.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    full_obs = envs.reset()
    initial_state = full_obs[:, :envs.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs, 0)
        else:
            obs = full_obs

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        pos_control = False
        total_switches = 0
        prev_selection = ""

        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            next_action = action
            if pos_control:
                frame_skip = 2
                if step % (512 / frame_skip - 1) == 0:
                    current_state = initial_state
                next_action = current_state + next_action
                for kk in range(frame_skip):
                    full_obs, reward, done, infos = envs.step(next_action)
                current_state = full_obs[:, :envs.action_space.shape[0]]
            else:
                full_obs, reward, done, infos = envs.step(next_action)

            # Convert the image to an observation if this is a door env using
            # the vision network.
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                if knob_noisy:
                    obs = add_noise(full_obs, j)
                else:
                    obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        # Save every save_interval updates and on the last update.
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(
                save_path,
                args.env_name + "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = True  # Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            # Rebuild the vector env so every worker samples a new world.
            print("changing world")
            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs(
                args.env_name,
                args.seed,
                args.num_processes,
                args.gamma,
                args.log_dir,
                device,
                False,
                env_kwargs=env_kwargs,
            )

            full_obs = envs.reset()
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
def main(raw_args=None):
    # If this is being called as a function from another python script.
    if raw_args is not None:
        args = get_args(raw_args)
    else:
        args = main_args

    if args.algo != 'ipo':
        raise NotImplementedError

    # Total number of envs (both domains).
    args.num_processes = args.num_envs1 + args.num_envs2

    knob_noisy = args.knob_noisy
    pretrained_policy_load = args.pretrained_policy_load

    args.world_path_domain1 = os.path.expanduser(args.world_path_domain1)
    args.world_path_domain2 = os.path.expanduser(args.world_path_domain2)

    # Env kwargs for domain 1.
    env_kwargs1 = dict(port=args.port,
                       visionnet_input=args.visionnet_input,
                       unity=args.unity,
                       world_path=args.world_path_domain1)
    # Env kwargs for domain 2.
    env_kwargs2 = dict(port=args.port,
                       visionnet_input=args.visionnet_input,
                       unity=args.unity,
                       world_path=args.world_path_domain2)

    print("Training with IPO.")

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make a vector env for each of the two domains.
    envs1 = make_vec_envs(args.env_name, args.seed, args.num_envs1,
                          args.gamma, args.log_dir, device, False,
                          env_kwargs=env_kwargs1)
    envs2 = make_vec_envs(args.env_name, args.seed, args.num_envs2,
                          args.gamma, args.log_dir, device, False,
                          env_kwargs=env_kwargs2)

    # Ugly way to access the environment attributes through the wrappers.
    if args.env_name.find('doorenv') > -1:
        visionnet_input = envs1.venv.venv.visionnet_input
        nn = envs1.venv.venv.nn
        env_name = envs1.venv.venv.xml_path
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs1.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy_av(
            dummy_obs.shape,
            envs1.action_space,
            base_kwargs={'recurrent': args.recurrent_policy})
        # actor_critic = Policy(
        #     dummy_obs.shape,
        #     envs1.action_space,
        #     base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        raise NotImplementedError
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    # Disable the observation normalizers.
    vec_norm1 = get_vec_normalize(envs1)
    vec_norm1.eval()
    vec_norm2 = get_vec_normalize(envs2)
    vec_norm2.eval()

    # Create two agents (one per domain), each updating only its own head of
    # the shared Policy_av network.
    params1 = [{'params': actor_critic.base.actor1.parameters()},
               {'params': actor_critic.base.critic1.parameters()},
               {'params': actor_critic.base.critic_linear1.parameters()},
               {'params': actor_critic.base.fc_mean1.parameters()},
               {'params': actor_critic.base.logstd1.parameters()}]
    params2 = [{'params': actor_critic.base.actor2.parameters()},
               {'params': actor_critic.base.critic2.parameters()},
               {'params': actor_critic.base.critic_linear2.parameters()},
               {'params': actor_critic.base.fc_mean2.parameters()},
               {'params': actor_critic.base.logstd2.parameters()}]
    # params1 = None
    # params2 = None

    agent1 = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        optim_params=params1)
    agent2 = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        optim_params=params2)

    # Rollout storage for each domain.
    rollouts1 = RolloutStorage(args.num_steps, args.num_envs1,
                               dummy_obs.shape, envs1.action_space,
                               actor_critic.recurrent_hidden_state_size)
    rollouts2 = RolloutStorage(args.num_steps, args.num_envs2,
                               dummy_obs.shape, envs2.action_space,
                               actor_critic.recurrent_hidden_state_size)

    full_obs1 = envs1.reset()
    initial_state1 = full_obs1[:, :envs1.action_space.shape[0]]
    full_obs2 = envs2.reset()
    initial_state2 = full_obs2[:, :envs2.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs1 = actor_critic.obs2inputs(full_obs1, 0)
        obs2 = actor_critic.obs2inputs(full_obs2, 0)
    else:
        if knob_noisy:
            obs1 = add_noise(full_obs1, 0)
            obs2 = add_noise(full_obs2, 0)
        else:
            obs1 = full_obs1
            obs2 = full_obs2

    rollouts1.obs[0].copy_(obs1)
    rollouts1.to(device)
    rollouts2.obs[0].copy_(obs2)
    rollouts2.to(device)

    episode_rewards1 = deque(maxlen=10)
    episode_rewards2 = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    num_updates = int(num_updates / 2)  # Since we run two domains per iteration.

    best_training_reward = -np.inf

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent1.optimizer, j, num_updates,
                                         args.lr)
            utils.update_linear_schedule(agent2.optimizer, j, num_updates,
                                         args.lr)

        ################## Do rollouts and updates for domain 1 ##################
        pos_control = False
        total_switches = 0
        prev_selection = ""

        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts1.obs[step],
                    rollouts1.recurrent_hidden_states[step],
                    rollouts1.masks[step])

            next_action = action
            try:
                # print(next_action)
                full_obs, reward, done, infos = envs1.step(next_action)
            except:
                # Drop into an interactive shell if the env step fails.
                ipy.embed()

            if knob_noisy:
                obs = add_noise(full_obs, j)
            else:
                obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards1.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts1.insert(obs, recurrent_hidden_states, action,
                             action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts1.obs[-1], rollouts1.recurrent_hidden_states[-1],
                rollouts1.masks[-1]).detach()

        rollouts1.compute_returns(next_value, args.use_gae, args.gamma,
                                  args.gae_lambda,
                                  args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent1.update(rollouts1)
        rollouts1.after_update()

        value_loss1 = value_loss
        action_loss1 = action_loss
        dist_entropy1 = dist_entropy

        ################## Do rollouts and updates for domain 2 ##################
        pos_control = False
        total_switches = 0
        prev_selection = ""

        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts2.obs[step],
                    rollouts2.recurrent_hidden_states[step],
                    rollouts2.masks[step])

            next_action = action
            try:
                # print(next_action)
                full_obs, reward, done, infos = envs2.step(next_action)
            except:
                # Drop into an interactive shell if the env step fails.
                ipy.embed()

            if knob_noisy:
                obs = add_noise(full_obs, j)
            else:
                obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards2.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts2.insert(obs, recurrent_hidden_states, action,
                             action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts2.obs[-1], rollouts2.recurrent_hidden_states[-1],
                rollouts2.masks[-1]).detach()

        rollouts2.compute_returns(next_value, args.use_gae, args.gamma,
                                  args.gae_lambda,
                                  args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent2.update(rollouts2)
        rollouts2.after_update()

        value_loss2 = value_loss
        action_loss2 = action_loss
        dist_entropy2 = dist_entropy

        ###################### Logs and storage ########################
        value_loss = (value_loss1 + value_loss2) / 2
        action_loss = (action_loss1 + action_loss2) / 2
        dist_entropy = (dist_entropy1 + dist_entropy2) / 2

        # Average the per-domain episode rewards (assumes both deques hold the
        # same number of episodes).
        episode_rewards = []
        for ii in range(len(episode_rewards1)):
            episode_rewards.append(
                (episode_rewards1[ii] + episode_rewards2[ii]) / 2)
        # episode_rewards = episode_rewards1

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        if np.mean(episode_rewards) > best_training_reward:
            best_training_reward = np.mean(episode_rewards)
            current_is_best = True
        else:
            current_is_best = False

        # Save every save_interval updates, on the last update, and whenever
        # the training reward is the best seen so far.
        if (j % args.save_interval == 0 or j == num_updates - 1
                or current_is_best) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                None
            ], os.path.join(
                save_path,
                args.env_name + "_{}.{}.pt".format(args.save_name, j)))

            if current_is_best:
                torch.save([
                    actor_critic,
                    None
                ], os.path.join(
                    save_path,
                    args.env_name + "_{}.best.pt".format(args.save_name)))

            # torch.save([
            #     actor_critic,
            #     getattr(utils.get_vec_normalize(envs1), 'ob_rms', None)
            # ], os.path.join(save_path, args.env_name + "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            raise NotImplementedError
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = False  # True  # Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            raise NotImplementedError
            print("changing world")
            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs_domains(args.env_name, args.seed,
                                         args.num_processes, args.gamma,
                                         args.log_dir, device, False,
                                         env_kwargs1=env_kwargs1,
                                         env_kwargs2=env_kwargs2)

            full_obs = envs.reset()
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
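
# Example (hypothetical): main() can be driven programmatically via raw_args,
# which is forwarded to get_args(). The flag names below are assumptions about
# that parser (argparse-style versions of args.env_name, args.algo,
# args.num_envs1, etc.) and should be adjusted to match the real one:
#
# main(raw_args=['--env-name', 'doorenv-v0',
#                '--algo', 'ipo',
#                '--num-envs1', '4', '--num-envs2', '4',
#                '--save-name', 'ipo_test'])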
def offpolicy_main(variant):
    print("offpolicy main")

    if args.algo == 'sac':
        algo = "SAC"
    elif args.algo == 'td3':
        algo = "TD3"

    setup_logger('{0}_{1}'.format(args.env_name, args.save_name),
                 variant=variant)
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=True)

    expl_env, eval_env, env_obj = prepare_env(args.env_name,
                                              args.visionmodel_path,
                                              **env_kwargs)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    expl_policy, eval_policy, trainer = prepare_trainer(
        algo, expl_env, obs_dim, action_dim, args.pretrained_policy_load,
        variant)

    if args.env_name.find('doorenv') > -1:
        expl_policy.knob_noisy = eval_policy.knob_noisy = args.knob_noisy
        expl_policy.nn = eval_policy.nn = env_obj.nn
        expl_policy.visionnet_input = eval_policy.visionnet_input = env_obj.visionnet_input
        if args.visionnet_input:
            visionmodel = load_visionmodel(expl_env._wrapped_env.xml_path,
                                           args.visionmodel_path,
                                           VisionModelXYZ())
            visionmodel.to(ptu.device)
            expl_policy.visionmodel = visionmodel.eval()
        else:
            expl_policy.visionmodel = None

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )

    if not args.replaybuffer_load:
        replay_buffer = EnvReplayBuffer(
            variant['replay_buffer_size'],
            expl_env,
        )
    else:
        replay_buffer = pickle.load(open(args.replaybuffer_load, "rb"))
        replay_buffer._env_info_keys = replay_buffer.env_info_sizes.keys()
        print("Loaded the replay buffer: {}".format(
            replay_buffer.get_diagnostics()))

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])

    algorithm.save_interval = args.save_interval
    algorithm.save_dir = args.save_dir
    algorithm.algo = args.algo
    algorithm.env_name = args.env_name
    algorithm.save_name = args.save_name
    algorithm.env_kwargs = env_kwargs

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))
    algorithm.writer = writer

    algorithm.to(ptu.device)
    algorithm.train()
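
# Sketch of the `variant` dict offpolicy_main() expects. Only the two keys read
# directly above ('replay_buffer_size' and 'algorithm_kwargs') come from this
# code; the nested values are illustrative rlkit-style settings and are
# assumptions, not values taken from this repo (prepare_trainer may read
# further keys as well):
#
# variant = dict(
#     replay_buffer_size=int(1e6),
#     algorithm_kwargs=dict(
#         num_epochs=3000,
#         batch_size=256,
#         max_path_length=512,
#         # ... remaining TorchBatchRLAlgorithm kwargs ...
#     ),
# )
# offpolicy_main(variant)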
def onpolicy_inference(seed,
                       env_name,
                       det,
                       load_name,
                       evaluation,
                       render,
                       knob_noisy,
                       visionnet_input,
                       env_kwargs,
                       actor_critic=None,
                       verbose=True,
                       pos_control=True,
                       step_skip=4):
    env = make_vec_envs(
        env_name,
        seed + 1000,
        1,
        None,
        None,
        device='cuda:0',
        allow_early_resets=False,
        env_kwargs=env_kwargs,
    )
    env_obj = env.venv.venv.envs[0].env.env
    if env_name.find('door') <= -1:
        env_obj.unity = None

    render_func = get_render_func(env)
    if evaluation and not render:
        render_func = None

    if env_kwargs['visionnet_input']:
        visionmodel = VisionModelXYZ()
        visionmodel = load_visionmodel(load_name, args.visionmodel_path,
                                       VisionModelXYZ())

    if not actor_critic:
        actor_critic, ob_rms = torch.load(load_name)
    actor_critic = actor_critic.eval()
    if env_kwargs['visionnet_input'] and env_name.find('doorenv') > -1:
        actor_critic.visionmodel = visionmodel
        actor_critic.visionnet_input = env_obj.visionnet_input
    actor_critic.to("cuda:0")
    if env_name.find('doorenv') > -1:
        actor_critic.nn = env_obj.nn

    recurrent_hidden_states = torch.zeros(
        1, actor_critic.recurrent_hidden_state_size)
    masks = torch.zeros(1, 1)

    full_obs = env.reset()
    initial_state = full_obs[:, :env.action_space.shape[0]]

    if env_name.find('doorenv') > -1 and env_obj.visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs)
        else:
            obs = full_obs

    if render_func is not None:
        render_func('human')

    # if env_name.find('doorenv') > -1:
    #     if env_obj.xml_path.find("baxter") > -1:
    #         doorhinge_idx = 20
    #     elif env_obj.xml_path.find("float") > -1:
    #         if env_obj.xml_path.find("hook") > -1:
    #             doorhinge_idx = 6
    #         elif env_obj.xml_path.find("gripper") > -1:
    #             doorhinge_idx = 11
    #     else:
    #         if env_obj.xml_path.find("mobile") > -1:
    #             if env_obj.xml_path.find("hook") > -1:
    #                 doorhinge_idx = 9
    #             if env_obj.xml_path.find("gripper") > -1:
    #                 doorhinge_idx = 14
    #         else:
    #             if env_obj.xml_path.find("hook") > -1:
    #                 doorhinge_idx = 7
    #             if env_obj.xml_path.find("gripper") > -1:
    #                 doorhinge_idx = 12

    start_time = int(time.mktime(time.localtime()))

    i = 0
    epi_step = 0
    total_time = 0
    epi_counter = 1
    dooropen_counter = 0
    door_opened = False
    test_num = 100

    while True:
        with torch.no_grad():
            value, action, _, recurrent_hidden_states = actor_critic.act(
                obs, recurrent_hidden_states, masks, deterministic=det)

        next_action = action

        if pos_control:
            # print("enjoy step_skip", step_skip)
            if i % (512 / step_skip - 1) == 0:
                current_state = initial_state
            next_action = current_state + next_action
            for kk in range(step_skip):
                full_obs, reward, done, infos = env.step(next_action)
            current_state = full_obs[:, :env.action_space.shape[0]]
        else:
            for kk in range(step_skip):
                full_obs, reward, done, infos = env.step(next_action)

        if env_name.find('doorenv') > -1 and env_obj.visionnet_input:
            obs = actor_critic.obs2inputs(full_obs, 0)
        else:
            if knob_noisy:
                obs = add_noise(full_obs)
            else:
                obs = full_obs

        masks.fill_(0.0 if done else 1.0)

        if render_func is not None:
            render_func('human')

        i += 1
        epi_step += 1

        if env_name.find('doorenv') > -1:
            # if not door_opened and abs(env_obj.sim.data.qpos[doorhinge_idx]) >= 0.2:
            if not door_opened and abs(env_obj.get_doorangle()) >= 0.2:
                dooropen_counter += 1
                opening_time = epi_step / (1.0 / mujoco_timestep) * step_skip
                if verbose:
                    print("door opened! opening time is {}".format(
                        opening_time))
                total_time += opening_time
                door_opened = True

        if env_name.find('Fetch') > -1:
            if not door_opened and infos[0]['is_success'] == 1:
                dooropen_counter += 1
                opening_time = epi_step / (1.0 / mujoco_timestep) * step_skip
                if verbose:
                    print("Reached destination! Time is {}".format(
                        opening_time))
                total_time += opening_time
                door_opened = True

        if evaluation:
            if i % (512 / step_skip - 1) == 0:
                if env_obj.unity:
                    env_obj.close()
                env = make_vec_envs(
                    env_name,
                    seed + 1000,
                    1,
                    None,
                    None,
                    device='cuda:0',
                    allow_early_resets=False,
                    env_kwargs=env_kwargs,
                )
                if render:
                    render_func = get_render_func(env)
                env_obj = env.venv.venv.envs[0].env.env
                if env_name.find('doorenv') <= -1:
                    env_obj.unity = None
                env.reset()
                if verbose:
                    print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(
                        epi_counter))
                eval_print(dooropen_counter, epi_counter, start_time,
                           total_time)
                epi_counter += 1
                epi_step = 0
                door_opened = False

            if i >= 512 / step_skip * test_num:
                if verbose:
                    print("dooropening counter:", dooropen_counter,
                          " epi counter:", epi_counter)
                eval_print(dooropen_counter, epi_counter - 1, start_time,
                           total_time)
                break

    opening_rate, opening_timeavg = eval_print(dooropen_counter,
                                               epi_counter - 1, start_time,
                                               total_time)
    return opening_rate, opening_timeavg
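
# Example (hypothetical values): run a headless 100-episode evaluation of a
# saved policy and collect the success rate and average opening time. The env
# id, checkpoint path, and env_kwargs values below are placeholders, not
# settings taken from this repo:
#
# opening_rate, opening_timeavg = onpolicy_inference(
#     seed=0,
#     env_name='doorenv-v0',
#     det=True,
#     load_name='trained_models/ppo/doorenv-v0_example.pt',
#     evaluation=True,
#     render=False,
#     knob_noisy=False,
#     visionnet_input=False,
#     env_kwargs=dict(port=1050, visionnet_input=False, unity=False,
#                     world_path='/path/to/worlds'),
#     verbose=True,
#     pos_control=True,
#     step_skip=4,
# )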