def _build(self):
    variant = copy.deepcopy(self._variant)

    # Build training and evaluation environments from the variant spec.
    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    set_random_seed(variant['run_params']['seed'])

    save_path = os.path.join(
        os.path.dirname(__file__), "..", "..", "results", "logs", "sac",
        f"HalfCheetahBulletEnv-v0_{variant['run_params']['seed']}")
    print("this is the save path: " + save_path)
    os.makedirs(save_path, exist_ok=True)

    # Create wrapped evaluation environment and the evaluation callback.
    eval_env_wrapped = TimeLimit(evaluation_environment, 1000)
    eval_callback = EvalCallback(
        eval_env_wrapped,
        callback_on_new_best=None,
        best_model_save_path=None,
        n_eval_episodes=10,
        log_path=save_path,
        eval_freq=10000,  # TODO: change hardcoded value
        deterministic=True,
        verbose=1,
    )
    eval_callback.init_callback(policy)
    sampler.set_callback(eval_callback)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
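For context: `set_callback` is not a method on stock softlearning samplers, so the call above implies a custom sampler extension. The snippet below is a minimal sketch of what such a hook could look like, letting the stable-baselines3 `EvalCallback` tick once per sampled environment step. The class name `CallbackSampler`, the `_callback` attribute, and the hook placement are assumptions, not code from this repository; it also presumes the policy passed to `init_callback` duck-types the model interface `EvalCallback` expects (e.g. a `predict` method).

# Sketch only: a hypothetical sampler subclass forwarding each step to an
# SB3 callback. Not part of softlearning; names here are assumptions.
from softlearning.samplers.simple_sampler import SimpleSampler


class CallbackSampler(SimpleSampler):
    _callback = None

    def set_callback(self, callback):
        # The callback is expected to be initialized already via
        # `init_callback(policy)`, as done in `_build` above.
        self._callback = callback

    def sample(self):
        # Draw one transition as usual, then tick the callback.
        result = super().sample()
        if self._callback is not None:
            # EvalCallback counts on_step() calls internally (n_calls) and
            # runs an evaluation every `eval_freq` calls, so one call per
            # sampled transition reproduces the intended schedule.
            self._callback.on_step()
        return result

Because `EvalCallback` keys its schedule off the number of `on_step()` calls rather than wall-clock or learner updates, driving it from the sampler keeps evaluation frequency tied to environment interaction, matching the `eval_freq=10000` setting above.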
kwargs["policy_noise"] = args.policy_noise * max_action kwargs["noise_clip"] = args.noise_clip * max_action kwargs["policy_freq"] = args.policy_freq policy = TD3.TD3(**kwargs) elif args.policy == "OurDDPG": policy = OurDDPG.DDPG(**kwargs) elif args.policy == "DDPG": policy = DDPG.DDPG(**kwargs) if args.load_model != "": policy_file = file_name if args.load_model == "default" else args.load_model policy.load(f"./models/{policy_file}") replay_buffer = utils.ReplayBuffer(state_dim, action_dim, max_size=200000) eval_callback.init_callback(policy) # Evaluate untrained policy # evaluations = [eval_policy(policy, args.env, args.seed)] state, done = env.reset(), False episode_reward = 0 episode_timesteps = 0 episode_num = 0 for t in range(int(args.max_timesteps)): episode_timesteps += 1 # Select action randomly or according to policy if t < args.start_timesteps: