def offpolicy_main(variant):
    print("offpolicy main")

    # Select the off-policy trainer requested on the command line.
    if args.algo == 'sac':
        algo = "SAC"
    elif args.algo == 'td3':
        algo = "TD3"
    else:
        raise ValueError("unsupported off-policy algorithm: {}".format(args.algo))

    setup_logger('{0}_{1}'.format(args.env_name, args.save_name), variant=variant)
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=True)

    # Build exploration/evaluation environments and read their dimensions.
    expl_env, eval_env, env_obj = prepare_env(args.env_name,
                                              args.visionmodel_path,
                                              **env_kwargs)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    expl_policy, eval_policy, trainer = prepare_trainer(
        algo, expl_env, obs_dim, action_dim,
        args.pretrained_policy_load, variant)

    # Door-environment-specific policy configuration: knob-position noise,
    # network type, and the optional vision network that estimates the knob position.
    if args.env_name.find('doorenv') > -1:
        expl_policy.knob_noisy = eval_policy.knob_noisy = args.knob_noisy
        expl_policy.nn = eval_policy.nn = env_obj.nn
        expl_policy.visionnet_input = eval_policy.visionnet_input = env_obj.visionnet_input
        if args.visionnet_input:
            visionmodel = load_visionmodel(expl_env._wrapped_env.xml_path,
                                           args.visionmodel_path,
                                           VisionModelXYZ())
            visionmodel.to(ptu.device)
            expl_policy.visionmodel = visionmodel.eval()
        else:
            expl_policy.visionmodel = None

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )

    # Start from an empty replay buffer or resume from a pickled one.
    if not args.replaybuffer_load:
        replay_buffer = EnvReplayBuffer(
            variant['replay_buffer_size'],
            expl_env,
        )
    else:
        with open(args.replaybuffer_load, "rb") as f:
            replay_buffer = pickle.load(f)
        replay_buffer._env_info_keys = replay_buffer.env_info_sizes.keys()
        print("Loaded a replay buffer with diagnostics: {}".format(
            replay_buffer.get_diagnostics()))

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])

    # Attach checkpointing metadata and a TensorBoard writer to the algorithm.
    algorithm.save_interval = args.save_interval
    algorithm.save_dir = args.save_dir
    algorithm.algo = args.algo
    algorithm.env_name = args.env_name
    algorithm.save_name = args.save_name
    algorithm.env_kwargs = env_kwargs

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))
    algorithm.writer = writer

    algorithm.to(ptu.device)
    algorithm.train()
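
# Hedged example (not part of the original training script): a minimal variant
# dict in the style of rlkit's SAC examples, assuming `args` and `env_kwargs`
# are populated by a module-level argument parser as elsewhere in this file.
# Only 'replay_buffer_size' and 'algorithm_kwargs' are read directly by
# offpolicy_main(); the 'trainer_kwargs' values are illustrative and presumed
# to be unpacked inside prepare_trainer().
def _example_offpolicy_variant():
    return dict(
        algorithm="SAC",
        replay_buffer_size=int(1e6),
        algorithm_kwargs=dict(
            num_epochs=3000,
            num_eval_steps_per_epoch=512,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=512,
            batch_size=256,
        ),
        trainer_kwargs=dict(  # assumed SAC hyperparameters, not taken from the repo
            discount=0.99,
            soft_target_tau=5e-3,
            policy_lr=3e-4,
            qf_lr=3e-4,
            reward_scale=1.0,
            use_automatic_entropy_tuning=True,
        ),
    )
# Usage sketch: offpolicy_main(_example_offpolicy_variant())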

def offpolicy_inference():
    import time
    from gym import wrappers

    filename = str(uuid.uuid4())
    gpu = True

    env, _, _ = prepare_env(args.env_name, args.visionmodel_path, **env_kwargs)

    # Load the evaluation policy from the saved snapshot.
    snapshot = torch.load(args.load_name)
    policy = snapshot['evaluation/policy']
    if args.env_name.find('doorenv') > -1:
        policy.knob_noisy = args.knob_noisy
        policy.nn = env._wrapped_env.nn
        policy.visionnet_input = env_kwargs['visionnet_input']

    epi_counter = 1
    dooropen_counter = 0
    total_time = 0
    test_num = 100

    # Render only when not evaluating and not streaming to a Unity viewer.
    if evaluation:
        render = False
    else:
        if not args.unity:
            render = True
        else:
            render = False

    start_time = int(time.mktime(time.localtime()))

    if gpu:
        set_gpu_mode(True)

    while True:
        if args.env_name.find('doorenv') > -1:
            path, door_opened, opening_time = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=True,
                render=render,
                evaluate=True,
            )
            print("done first")
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
            if evaluation:
                # Rebuild the environment so each episode starts from a fresh door.
                env, _, _ = prepare_env(args.env_name, args.visionmodel_path, **env_kwargs)
                if door_opened:
                    dooropen_counter += 1
                    total_time += opening_time
                    eval_print(dooropen_counter, epi_counter, start_time, total_time)
        else:
            path = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=False,
                render=render,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()

        if evaluation:
            print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
        epi_counter += 1

        # Stop after test_num evaluation episodes on door environments.
        if args.env_name.find('door') > -1 and epi_counter > test_num:
            eval_print(dooropen_counter, epi_counter, start_time, total_time)
            break

def offpolicy_inference(seed, env_name, det, load_name, evaluation, render,
                        knob_noisy, visionnet_input, env_kwargs,
                        actor_critic=None, verbose=True, pos_control=True,
                        step_skip=4):
    import time
    from gym import wrappers

    print("evaluation started!")
    filename = str(uuid.uuid4())
    gpu = True

    env, _, _ = prepare_env(env_name, **env_kwargs)

    # Load the evaluation policy from a snapshot unless one is passed in directly.
    if not actor_critic:
        snapshot = torch.load(load_name)
        policy = snapshot['evaluation/policy']
    else:
        policy = actor_critic

    if env_name.find('doorenv') > -1:
        policy.knob_noisy = knob_noisy
        policy.nn = env._wrapped_env.nn
        policy.visionnet_input = env_kwargs['visionnet_input']

    epi_counter = 1
    dooropen_counter = 0
    total_time = 0
    test_num = 100

    start_time = int(time.mktime(time.localtime()))

    if gpu:
        set_gpu_mode(True)

    while True:
        if env_name.find('doorenv') > -1:
            if evaluation:
                path, door_opened, opening_time = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()
                # Rebuild the environment so each evaluation episode starts
                # from a freshly sampled door.
                env, _, _ = prepare_env(env_name, **env_kwargs)
                if door_opened:
                    dooropen_counter += 1
                    total_time += opening_time
                    if verbose:
                        print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
                    eval_print(dooropen_counter, epi_counter, start_time, total_time)
            else:
                path = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()
        else:
            path = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=False,
                render=render,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()

        if evaluation:
            if verbose:
                print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
            eval_print(dooropen_counter, epi_counter, start_time, total_time)
        epi_counter += 1

        # Stop after test_num episodes when evaluating a door environment.
        if env_name.find('door') > -1 and epi_counter > test_num:
            if verbose:
                print("dooropening counter:", dooropen_counter,
                      " epi counter:", epi_counter)
            eval_print(dooropen_counter, epi_counter, start_time, total_time)
            break

    opening_rate, opening_timeavg = eval_print(dooropen_counter, epi_counter - 1,
                                               start_time, total_time)
    return opening_rate, opening_timeavg
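
# Hedged usage sketch for the parameterized inference entry point above; the
# snapshot path, environment id, and env_kwargs values are placeholders, not
# settings taken from the repository.
def _example_offpolicy_inference():
    opening_rate, opening_timeavg = offpolicy_inference(
        seed=0,
        env_name="doorenv-v0",        # placeholder environment id
        det=True,                     # accepted by the signature, unused in the body
        load_name="trained_models/sac_doorenv/params.pkl",  # placeholder snapshot path
        evaluation=True,
        render=False,
        knob_noisy=False,
        visionnet_input=False,
        # 'visionnet_input' is the only key this function reads directly;
        # prepare_env() may require additional keys in practice.
        env_kwargs=dict(visionnet_input=False),
        verbose=True,
        pos_control=True,
        step_skip=4,
    )
    print("opening rate:", opening_rate, "average opening time:", opening_timeavg)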