Example #1
    def _build(self):
        variant = copy.deepcopy(self._variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params
            else training_environment)

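        # Configure the Q-functions: their inputs are (observation, action) pairs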
        variant['Q_params']['config'].update({
            'input_shapes': (
                training_environment.observation_shape,
                training_environment.action_shape),
        })
        Qs = self.Qs = value_functions.get(variant['Q_params'])

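        # Configure the policy: map observations to actions within the environment's action range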
        variant['policy_params']['config'].update({
            'action_range': (training_environment.action_space.low,
                             training_environment.action_space.high),
            'input_shapes': training_environment.observation_shape,
            'output_shape': training_environment.action_shape,
        })
        policy = self.policy = policies.get(variant['policy_params'])

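        # Build the replay pool for the training environment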
        variant['replay_pool_params']['config'].update({
            'environment': training_environment,
        })
        replay_pool = self.replay_pool = replay_pools.get(
            variant['replay_pool_params'])

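        # Build the sampler that collects transitions with the policy and stores them in the pool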
        variant['sampler_params']['config'].update({
            'environment': training_environment,
            'policy': policy,
            'pool': replay_pool,
        })
        sampler = self.sampler = samplers.get(variant['sampler_params'])

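        # Seed RNGs and prepare the directory where evaluation logs are written (environment name is hardcoded here)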
        set_random_seed(variant['run_params']['seed'])
        save_path = os.path.join(
            os.path.dirname(__file__), "..", "..", "results", "logs", "sac",
            f"HalfCheetahBulletEnv-v0_{variant['run_params']['seed']}")
        print("Saving evaluation logs to: " + save_path)
        os.makedirs(save_path, exist_ok=True)

        # Wrap the evaluation environment with a 1000-step time limit
        eval_env_wrapped = TimeLimit(evaluation_environment, 1000)

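        # Evaluate the policy every eval_freq steps for n_eval_episodes episodes, logging results to save_path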
        eval_callback = EvalCallback(
            eval_env_wrapped,
            callback_on_new_best=None,
            best_model_save_path=None,
            n_eval_episodes=10,
            log_path=save_path,
            eval_freq=10000,  # TODO change hardcoded value
            deterministic=True,
            verbose=1,
        )
        eval_callback.init_callback(policy)
        sampler.set_callback(eval_callback)
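        # Assemble the algorithm from the environments, policy, Q-functions, pool, and sampler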
        variant['algorithm_params']['config'].update({
            'training_environment': training_environment,
            'evaluation_environment': evaluation_environment,
            'policy': policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler
        })
        self.algorithm = algorithms.get(variant['algorithm_params'])

        self._built = True
Example #2
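        # TD3-specific settings: target policy smoothing noise (scaled by the action range) and delayed policy updates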
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

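    # Optionally load a previously saved policy checkpoint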
    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

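    # Replay buffer capped at 200k transitions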
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, max_size=200000)

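    # Hook up the evaluation callback (constructed earlier, not shown here) to the policy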
    eval_callback.init_callback(policy)

    # Evaluate untrained policy
    # evaluations = [eval_policy(policy, args.env, args.seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

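    # Main interaction loop: collect transitions and train for max_timesteps steps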
    for t in range(int(args.max_timesteps)):

        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args.start_timesteps: